In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [86]:
X_full = pd.read_csv("train.csv")
X_test_full = pd.read_csv("test.csv")
X_all = pd.read_csv("train.csv")

X_full['Age'] = X_full.groupby(['Sex', 'Pclass'])['Age'].apply(lambda x: x.fillna(x.median()))

X_full.dropna(axis=0, subset=['Survived'], inplace=True)
y = X_full.Survived
X_full.drop(['Survived'], axis=1, inplace=True)

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, train_size=0.7, test_size=0.3, random_state=3)

categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 10 and 
                    X_train_full[cname].dtype == "object"]

numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ['int64', 'float64']]

my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [87]:
X_train.head()

Unnamed: 0,Sex,Embarked,PassengerId,Pclass,Age,SibSp,Parch,Fare
233,female,S,234,3,5.0,4,2,31.3875
229,female,S,230,3,21.5,3,1,25.4667
356,female,S,357,1,22.0,0,1,55.0
439,male,S,440,2,31.0,0,0,10.5
637,male,S,638,2,31.0,1,1,26.25


In [88]:
X_train.isna().sum()

Sex            0
Embarked       1
PassengerId    0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
dtype: int64

In [73]:
X_all.groupby(['Sex', 'Pclass']).median()['Age']

Sex     Pclass
female  1         35.0
        2         28.0
        3         21.5
male    1         40.0
        2         30.0
        3         25.0
Name: Age, dtype: float64

In [84]:
X_all_corr = X_all.corr().abs().unstack().sort_values(kind="quicksort", ascending=False).reset_index()
X_all_corr[X_all_corr['level_0'] == 'Age']

Unnamed: 0,level_0,level_1,0
5,Age,Age,1.0
12,Age,Pclass,0.369226
16,Age,SibSp,0.308247
21,Age,Parch,0.189119
26,Age,Fare,0.096067
31,Age,Survived,0.077221
36,Age,PassengerId,0.036847


In [85]:
X_all_corr[X_all_corr['level_0'] == 'Pclass']

Unnamed: 0,level_0,level_1,0
3,Pclass,Pclass,1.0
7,Pclass,Fare,0.5495
11,Pclass,Age,0.369226
13,Pclass,Survived,0.338481
27,Pclass,SibSp,0.083081
39,Pclass,PassengerId,0.035144
41,Pclass,Parch,0.018443


In [51]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier

numerical_transformer = SimpleImputer(strategy='constant') # Maybe remove as it only effects Age column

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [None]:
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBClassifier(n_estimators=500, learning_rate=0.02, random_state=3))
])

scores = cross_val_score(my_pipeline, X_train, y_train,
                              cv=5)

print("Scores:\n", scores.mean())

my_pipeline.fit(X_train, y_train)

preds = my_pipeline.predict(X_valid)

print('MAE:', mean_absolute_error(y_valid, preds))

print(confusion_matrix(y_valid, preds))

In [73]:
scores.mean()

0.8169935483870967

In [74]:
preds_test = my_pipeline.predict(X_test)

In [51]:
output = pd.DataFrame({'PassengerId': X_test.PassengerId, 'Survived': preds_test})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [59]:
my_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'model', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__add_indicator', 'preprocessor__num__copy', 'preprocessor__num__fill_value', 'preprocessor__num__missing_values', 'preprocessor__num__strategy', 'preprocessor__num__verbose', 'preprocessor__cat__memory', 'preprocessor__cat__steps', 'preprocessor__cat__verbose', 'preprocessor__cat__imputer', 'preprocessor__cat__onehot', 'preprocessor__cat__imputer__add_indicator', 'preprocessor__cat__imputer__copy', 'preprocessor__cat__imputer__fill_value', 'preprocessor__cat__imputer__missing_values', 'preprocessor__cat__imputer__strategy', 'preprocessor__cat__imputer__verbose', 'preprocessor__cat__onehot__categories', 'preprocessor__cat__onehot__drop', 'preprocessor__cat__onehot__dtype', 'preprocessor__

In [8]:
from datetime import datetime
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [91]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBClassifier(random_state=3))
])
n_e = np.arange(10)*25+850
#l_r = np.arange(10)*0.01+0.01
params = {
        'model__n_estimators': n_e,
        'model__learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
        'model__min_child_weight': [1, 5, 10],
        'model__gamma': [0.5, 1, 1.5, 2, 5],
        'model__subsample': [0.6, 0.8, 1.0],
        'model__colsample_bytree': [0.6, 0.8, 1.0],
        'model__max_depth': [3, 4, 5]
        }
folds=5
param_comb = 5
cv_method = RepeatedStratifiedKFold(n_repeats=3, n_splits=folds, random_state=3)

random_search = RandomizedSearchCV(estimator=my_pipeline, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=2, cv=cv_method, verbose=3, random_state=3 )

start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable

Fitting 15 folds for each of 5 candidates, totalling 75 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    8.7s
[Parallel(n_jobs=2)]: Done  75 out of  75 | elapsed:   26.7s finished



 Time taken: 0 hours 0 minutes and 27.45 seconds.


In [92]:
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)


 All results:
{'mean_fit_time': array([0.6517426 , 0.55860017, 0.69192821, 0.66158762, 0.89016026]), 'std_fit_time': array([0.01780814, 0.00766758, 0.02428063, 0.00716184, 0.04648117]), 'mean_score_time': array([0.00984031, 0.01176828, 0.01176834, 0.01150252, 0.01043619]), 'std_score_time': array([0.00166397, 0.00116313, 0.00083643, 0.00049761, 0.00158575]), 'param_model__subsample': masked_array(data=[0.6, 1.0, 1.0, 0.6, 0.8],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_model__n_estimators': masked_array(data=[1050, 900, 1025, 1000, 925],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_model__min_child_weight': masked_array(data=[5, 10, 10, 10, 1],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_model__max_depth': masked_array(data=[3, 3, 4, 4, 5],
             mask=[False, False, False, Fal

In [None]:
'0.06': 0.7469888858186733   '0.02':0.7465456234073258

In [20]:
np.arange(10)*25+850

array([ 850,  875,  900,  925,  950,  975, 1000, 1025, 1050, 1075])

In [93]:
preds_test = random_search.predict(X_test)

In [94]:
output = pd.DataFrame({'PassengerId': X_test.PassengerId, 'Survived': preds_test})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [None]:
'PassengerId': X_test.PassengerId, 'Survived': preds_test

In [10]:
from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [43]:
train_data['Age'].replace('', np.nan, inplace=True)
train_data.dropna(subset=['Age'], inplace=True)
train_data['Embarked'].replace('', np.nan, inplace=True)
train_data.dropna(subset=['Embarked'], inplace=True)
print(train_data.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          529
Embarked         0
dtype: int64


In [46]:
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

Data = train_data.drop(columns = ['Survived', 'Name', 'Ticket', 'Cabin'])
Data_train_data = Data.copy()
target = train_data['Survived']
Data=pd.get_dummies(Data, columns=["Sex"], drop_first=True)
Data=pd.get_dummies(Data, columns=["Embarked"], drop_first=True)
Data.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,1,3,22.0,1,0,7.25,1,0,1
1,2,1,38.0,1,0,71.2833,0,0,0
2,3,3,26.0,0,0,7.925,0,0,1
3,4,1,35.0,1,0,53.1,0,0,1
4,5,3,35.0,0,0,8.05,1,0,1


In [36]:
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV

cv_method = RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=333)

In [41]:
from sklearn.base import BaseEstimator, TransformerMixin

# custom function for RFI feature selection
# here we use n_estimators=100
class RFIFeatureSelector(BaseEstimator, TransformerMixin):
    
    # class constructor 
    # make sure class attributes end with a "_"
    # per scikit-learn convention to avoid errors
    def __init__(self, n_features_=10):
        self.n_features_ = n_features_
        self.fs_indices_ = None

    # override the fit function
    def fit(self, X, y):
        from sklearn.ensemble import RandomForestClassifier
        from numpy import argsort
        model_rfi = RandomForestClassifier(n_estimators=100)
        model_rfi.fit(X, y)
        self.fs_indices_ = argsort(model_rfi.feature_importances_)[::-1][0:self.n_features_] 
        return self 
    
    # override the transform function
    def transform(self, X, y=None):
        return X[:, self.fs_indices_]

In [48]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
np.random.seed(333)
pipe_DT = Pipeline([('rfi_fs', RFIFeatureSelector()),
                    ('dt', DecisionTreeClassifier(criterion='gini'))])

params_pipe_DT = {'rfi_fs__n_features_': [1,2,3,4,5,6,7,8,9,10,Data.shape[1]],
                  'dt__max_depth': [1,2,3,4,5,6,7],
                  'dt__min_samples_split': [20,50,70,100]}

gs_pipe_DT = GridSearchCV(estimator=pipe_DT, 
                          param_grid=params_pipe_DT, 
                          cv=cv_method,
                          refit=True,
                          n_jobs=-2,
                          scoring=scoring_metric,
                          verbose=1) 

gs_pipe_DT.fit(D_train.values, t_train);
print(gs_pipe_DT.best_score_)
print(gs_pipe_DT.best_params_)

Fitting 15 folds for each of 308 candidates, totalling 4620 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  82 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-2)]: Done 382 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-2)]: Done 882 tasks      | elapsed:   45.5s
[Parallel(n_jobs=-2)]: Done 1582 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-2)]: Done 2482 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-2)]: Done 3582 tasks      | elapsed:  3.1min


0.8558949347756191
{'dt__max_depth': 3, 'dt__min_samples_split': 100, 'rfi_fs__n_features_': 5}


[Parallel(n_jobs=-2)]: Done 4620 out of 4620 | elapsed:  4.0min finished
