In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import re

from sklearn.preprocessing import StandardScaler

import xgboost as xgb

In [35]:
# Load the training and test sets from CSV files in the current working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [36]:
# Discard training rows with no 'Embarked' value. The name `train_data` is rebound to the
# filtered frame, so the unfiltered frame is only recoverable by re-reading the CSV.
train_data = train_data.dropna(subset=['Embarked'])

In [37]:
# Preview the first rows of the filtered training frame.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [38]:
def impute_missing(train_data, test_data):
    """Impute missing numeric values in both frames with an ``IterativeImputer``.

    ``Sex`` and ``Embarked`` are integer-encoded, the pass-through columns
    (``Cabin``, ``Name``, ``Ticket`` and, for the training frame, ``Survived``)
    are set aside, the imputer is fitted on the remaining training columns only,
    and the pass-through columns are appended back unchanged.

    The input frames are not modified; all work happens on copies.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training frame containing ``Sex``, ``Embarked``, ``Cabin``, ``Survived``,
        ``Name`` and ``Ticket``. Every other column is passed to the imputer.
    test_data : pandas.DataFrame
        Test frame with the same columns except ``Survived``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data_imp, test_data_imp)``. Each has a fresh 0..n-1 index, the
        imputed columns first (as floats) and the pass-through columns last.
    """
    sex_codes = {'female': 0, 'male': 1}
    embarked_codes = {'C': 0, 'Q': 1, 'S': 2}
    train_passthrough = ['Cabin', 'Survived', 'Name', 'Ticket']
    test_passthrough = ['Cabin', 'Name', 'Ticket']

    # Copy first: assigning columns on the caller's frames would silently change
    # them (and triggers SettingWithCopyWarning on a frame that came from dropna).
    train_encoded = train_data.copy()
    test_encoded = test_data.copy()
    for frame in (train_encoded, test_encoded):
        frame['Sex'] = frame['Sex'].replace(sex_codes)
        frame['Embarked'] = frame['Embarked'].replace(embarked_codes)

    train_features = train_encoded.drop(train_passthrough, axis=1)
    test_features = test_encoded.drop(test_passthrough, axis=1)

    # Fit on the training features only; the same fitted imputer transforms both sets.
    imputer = IterativeImputer(random_state=0)
    imputer.fit(train_features)

    def _reassemble(features, passthrough):
        # transform() returns a bare array, so the imputed frame gets a 0..n-1 index.
        imputed = pd.DataFrame(imputer.transform(features), columns=features.columns)
        # drop=True discards the old index whatever it is called, so the rows line up
        # positionally with the imputed frame and no stray index column is left behind.
        return pd.concat((imputed, passthrough.reset_index(drop=True)), axis=1)

    train_data_imp = _reassemble(train_features, train_encoded[train_passthrough])
    test_data_imp = _reassemble(test_features, test_encoded[test_passthrough])

    return train_data_imp, test_data_imp

In [39]:
# Encode, impute and reassemble both frames with impute_missing (defined in the previous cell).
train_data_imp, test_data_imp = impute_missing(train_data, test_data)

In [40]:
# Preview the imputed training frame; the imputed columns come back as floats.
train_data_imp.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin,Survived,Name,Ticket
0,1.0,3.0,1.0,22.0,1.0,0.0,7.25,2.0,,0,"Braund, Mr. Owen Harris",A/5 21171
1,2.0,1.0,0.0,38.0,1.0,0.0,71.2833,0.0,C85,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",PC 17599
2,3.0,3.0,0.0,26.0,0.0,0.0,7.925,2.0,,1,"Heikkinen, Miss. Laina",STON/O2. 3101282
3,4.0,1.0,0.0,35.0,1.0,0.0,53.1,2.0,C123,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",113803
4,5.0,3.0,1.0,35.0,0.0,0.0,8.05,2.0,,0,"Allen, Mr. William Henry",373450


In [41]:
# Map the integer Embarked codes back to their port letters for later one-hot encoding.
# NOTE(review): only values exactly equal to 0, 1 or 2 are replaced; an imputed value that
# is not exactly one of those would pass through unchanged — confirm none exist in the test set.
train_data_imp['Embarked'] = train_data_imp['Embarked'].replace({0:'C', 1:'Q', 2:'S'})
test_data_imp['Embarked'] = test_data_imp['Embarked'].replace({0:'C', 1:'Q', 2:'S'})

In [42]:
def get_family_size(train_data, test_data):
    """Add an int32 ``Family_size`` column (``SibSp + Parch + 1``) to both frames.

    The column is written onto the frames that were passed in, and those same
    frame objects are returned as ``(train_data, test_data)``.
    """
    for frame in (train_data, test_data):
        # Siblings/spouses plus parents/children plus the passenger themselves.
        head_count = frame['SibSp'] + frame['Parch'] + 1
        frame['Family_size'] = head_count.astype('int32')

    return train_data, test_data

In [43]:
# Add the Family_size feature to both imputed frames.
train_data_imp, test_data_imp = get_family_size(train_data_imp, test_data_imp)

In [44]:
def label_alone(row):
    """Return 1 when the row's ``Family_size`` equals 1, otherwise 0."""
    return 1 if row['Family_size'] == 1 else 0

In [45]:
def get_alone_column(train_data, test_data):
    """Add an integer ``Is_alone`` flag to both frames.

    ``Is_alone`` is 1 where ``Family_size`` equals 1 and 0 everywhere else,
    the same rule as ``label_alone``. It is computed as one vectorised
    comparison rather than a Python call per row, which also works on a
    frame with no rows.

    The column is written onto the frames that were passed in, and those same
    frame objects are returned as ``(train_data, test_data)``.
    """
    for frame in (train_data, test_data):
        frame['Is_alone'] = (frame['Family_size'] == 1).astype('int64')

    return train_data, test_data

In [46]:
# Add the Is_alone flag to both frames; this relies on Family_size from the step above.
train_data_imp, test_data_imp = get_alone_column(train_data_imp, test_data_imp)

In [47]:
def get_title(row):
    """Extract the honorific from a passenger row's ``Name``.

    The title is the first run of ASCII letters that is immediately followed by
    a full stop, e.g. ``'Braund, Mr. Owen Harris'`` gives ``'Mr'``.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'Name'`` (a DataFrame row, a dict, ...).

    Returns
    -------
    str
        The matched title, or ``''`` when the name is missing, is not a string,
        or contains no such pattern.
    """
    name = row['Name']
    # A missing name arrives as NaN/None; re.search would raise TypeError on it.
    if not isinstance(name, str):
        return ''

    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    title = re.search(r'([A-Za-z]+)\.', name)
    return title.group(1) if title else ''

In [48]:
def get_title_column(train_data, test_data):
    """Add a ``Title`` column holding the honorific extracted from ``Name``.

    Uses the same pattern as ``get_title`` (first run of ASCII letters directly
    followed by a full stop), applied with the vectorised ``.str.extract``
    instead of a Python call per row. Names that are missing or contain no
    title give ``''``.

    The column is written onto the frames that were passed in, and those same
    frame objects are returned as ``(train_data, test_data)``.
    """
    for frame in (train_data, test_data):
        # expand=False returns a Series of the single capture group; rows that
        # do not match (or are missing) come back as NaN and are blanked out.
        titles = frame['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
        frame['Title'] = titles.fillna('')

    return train_data, test_data

In [49]:
# Add the Title feature extracted from each passenger's Name.
train_data_imp, test_data_imp = get_title_column(train_data_imp, test_data_imp)

In [50]:
def convert_rare_titles(train_data, test_data, n_common=4):
    """Normalise title spellings and collapse infrequent titles into ``'Rare'``.

    First, variant titles are mapped onto their common form (``Mme`` to ``Mrs``,
    ``Mlle``/``Ms`` to ``Miss``, ``Don`` to ``Mr``, ``Dona`` to ``Mrs``) using the
    same mapping for both frames. Then the ``n_common`` most frequent titles
    in the *training* frame are kept, and every other title in either frame
    becomes ``'Rare'``. That includes titles that occur only in the test frame,
    which would otherwise yield one-hot columns the training frame lacks.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames with a ``Title`` column. The column is overwritten on the frames
        that were passed in.
    n_common : int, default 4
        How many of the most frequent training titles to keep as they are.

    Returns
    -------
    tuple of pandas.DataFrame
        The same ``(train_data, test_data)`` objects.
    """
    title_aliases = {'Mme': 'Mrs', 'Mlle': 'Miss', 'Ms': 'Miss', 'Don': 'Mr', 'Dona': 'Mrs'}

    train_data['Title'] = train_data['Title'].replace(title_aliases)
    test_data['Title'] = test_data['Title'].replace(title_aliases)

    # value_counts() orders by descending frequency, so the head of its index
    # holds the most common titles.
    common_titles = train_data['Title'].value_counts().index[:n_common]

    for frame in (train_data, test_data):
        keep = frame['Title'].isin(common_titles)
        frame['Title'] = frame['Title'].where(keep, 'Rare')

    return train_data, test_data

In [51]:
# Normalise title variants and collapse the infrequent ones into 'Rare'.
train_data_imp, test_data_imp = convert_rare_titles(train_data_imp, test_data_imp)

In [52]:
def normalise_fare(train_data, test_data):
    """Add a standardised ``Fare_norm`` column to both frames.

    A ``StandardScaler`` is fitted on the training fares only, and the fitted
    scaler is then applied to the test fares. The column is written onto the
    frames that were passed in, and those same frame objects are returned as
    ``(train_data, test_data)``.
    """
    fare_scaler = StandardScaler()

    # The scaler wants a 2-D (n_samples, 1) array; selecting with a list of one
    # column gives that shape directly.
    train_fares = train_data[['Fare']].to_numpy()
    train_data['Fare_norm'] = fare_scaler.fit_transform(train_fares)

    test_fares = test_data[['Fare']].to_numpy()
    test_data['Fare_norm'] = fare_scaler.transform(test_fares)

    return train_data, test_data

In [53]:
# Add the standardised Fare_norm column (scaler fitted on the training fares).
train_data_imp, test_data_imp = normalise_fare(train_data_imp, test_data_imp)

In [54]:
def normalise_age(train_data, test_data):
    """Add a standardised ``Age_norm`` column to both frames.

    A ``StandardScaler`` is fitted on the training ages only, and the fitted
    scaler is then applied to the test ages. The column is written onto the
    frames that were passed in, and those same frame objects are returned as
    ``(train_data, test_data)``.
    """
    age_scaler = StandardScaler()

    # The scaler wants a 2-D (n_samples, 1) array; selecting with a list of one
    # column gives that shape directly.
    train_ages = train_data[['Age']].to_numpy()
    train_data['Age_norm'] = age_scaler.fit_transform(train_ages)

    test_ages = test_data[['Age']].to_numpy()
    test_data['Age_norm'] = age_scaler.transform(test_ages)

    return train_data, test_data
    

In [55]:
# Add the standardised Age_norm column (scaler fitted on the training ages).
train_data_imp, test_data_imp = normalise_age(train_data_imp, test_data_imp)

In [56]:
def one_hot_encode_embarked(train_data, test_data):
    """Append one indicator column per ``Embarked`` value to both frames.

    The set of indicator columns is defined by the training frame. The test
    indicators are aligned to it: a value absent from the test frame gets an
    all-zero column, and a value seen only in the test frame is dropped. Both
    outputs therefore always carry identical indicator columns in the same order.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames with an ``Embarked`` column. Neither frame is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train_data, test_data)`` frames with the indicator columns joined on.
    """
    train_dummies = pd.get_dummies(train_data['Embarked'])
    test_dummies = pd.get_dummies(test_data['Embarked'])

    # Encoding each frame on its own yields different columns whenever the two
    # frames do not contain exactly the same categories; align test to train.
    test_dummies = test_dummies.reindex(columns=train_dummies.columns, fill_value=0)
    # Columns added by reindex are plain ints; match the dtype get_dummies produced.
    test_dummies = test_dummies.astype(train_dummies.dtypes.to_dict())

    return train_data.join(train_dummies), test_data.join(test_dummies)

In [57]:
# Append one indicator column per Embarked value to both frames.
train_data_imp, test_data_imp = one_hot_encode_embarked(train_data_imp, test_data_imp)

In [58]:
def one_hot_encode_title(train_data, test_data):
    """Append one indicator column per ``Title`` value to both frames.

    The set of indicator columns is defined by the training frame. The test
    indicators are aligned to it: a title absent from the test frame gets an
    all-zero column, and a title seen only in the test frame is dropped. Both
    outputs therefore always carry identical indicator columns in the same order.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames with a ``Title`` column. Neither frame is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train_data, test_data)`` frames with the indicator columns joined on.
    """
    one_hot_train = pd.get_dummies(train_data['Title'])
    one_hot_test = pd.get_dummies(test_data['Title'])

    # Encoding each frame on its own yields different columns whenever the two
    # frames do not contain exactly the same titles; align test to train.
    one_hot_test = one_hot_test.reindex(columns=one_hot_train.columns, fill_value=0)
    # Columns added by reindex are plain ints; match the dtype get_dummies produced.
    one_hot_test = one_hot_test.astype(one_hot_train.dtypes.to_dict())

    return train_data.join(one_hot_train), test_data.join(one_hot_test)

In [59]:
# Append one indicator column per Title value to both frames.
train_data_imp, test_data_imp = one_hot_encode_title(train_data_imp, test_data_imp)

In [60]:
# Drop the identifier, the free-text columns, and the raw columns that now have a derived
# replacement (Age/Fare -> *_norm, Embarked/Title -> indicator columns).
# The same column list is dropped from both frames.
train_data_clean = train_data_imp.drop(['PassengerId', 'Age', 'Fare', 'Embarked', 'Cabin', 'Name', 'Ticket', 'Title'], axis=1)
test_data_clean = test_data_imp.drop(['PassengerId', 'Age', 'Fare', 'Embarked', 'Cabin', 'Name', 'Ticket', 'Title'], axis=1)

In [61]:
# Separate the training features from the 'Survived' target; the test frame has no target column.
X_train = train_data_clean.drop(['Survived'], axis=1)
y_train = train_data_clean['Survived']
# Copy so that later changes to X_test cannot touch test_data_clean.
X_test = test_data_clean.copy()

In [62]:
# Baseline model: 10 boosted trees, every other hyperparameter left at the library default.
classifier = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, seed=123)

classifier.fit(X_train, y_train)

# Class predictions for the test set, in X_test row order.
preds = classifier.predict(X_test)

In [63]:
# Display the fitted baseline estimator and its full parameter set.
classifier

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=10, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=123,
              silent=None, subsample=1, verbosity=1)

In [64]:
# Pair each baseline prediction with its PassengerId, taken from the originally loaded
# test_data because PassengerId was dropped from the modelling frame.
# NOTE(review): this relies on preds being in the same row order as test_data — no visible
# step reorders or drops test rows, but confirm if the pipeline changes.
submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': preds})
# Writing the baseline submission to disk is currently disabled.
# submission.to_csv('submission.csv', index=False)

In [65]:
# Wrap the data in XGBoost's native DMatrix containers.
# NOTE(review): dtrain and dtest are not referenced by any later cell visible here.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

In [66]:
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 4 * 5 * 7 * 3 = 420 candidate combinations.
# NOTE(review): 'eta' is presumably XGBoost's alias for learning_rate — confirm that the
# scikit-learn wrapper in the installed version actually applies it.
gbm_param_grid = {'colsample_bytree': [0.1, 0.2, 0.4, 0.6,],
                  'n_estimators': [10, 20, 50, 100, 200],
                  'max_depth': [6, 7, 8, 9, 10, 11, 12], 
                  'eta': [0.001, 0.01, 0.1]}

# Fold counts to try; best_auc collects the best cross-validated score for each, in the same order.
cv_vals = [3, 4, 5, 6]
best_auc = []

for val in cv_vals:
    # A fresh, unfitted estimator for every fold count.
    gbm = xgb.XGBClassifier(objective='binary:logistic', seed=123)
    
    # Despite the `grid_mse` name, the search is scored with ROC AUC, on all available cores.
    grid_mse = GridSearchCV(estimator=gbm, param_grid=gbm_param_grid, scoring='roc_auc', verbose=1, cv=val, n_jobs=-1)

    grid_mse.fit(X_train, y_train)
    
    best_auc.append(grid_mse.best_score_)
    print('Best paramters_found: ', grid_mse.best_params_)

Fitting 3 folds for each of 420 candidates, totalling 1260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1000 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 1237 out of 1260 | elapsed:    5.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 1260 out of 1260 | elapsed:    5.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  31 tasks      | elapsed:    0.0s


Best paramters_found:  {'colsample_bytree': 0.2, 'eta': 0.001, 'max_depth': 11, 'n_estimators': 50}
Fitting 4 folds for each of 420 candidates, totalling 1680 fits


[Parallel(n_jobs=-1)]: Done 1430 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 1680 out of 1680 | elapsed:    8.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    0.0s


Best paramters_found:  {'colsample_bytree': 0.2, 'eta': 0.001, 'max_depth': 11, 'n_estimators': 100}
Fitting 5 folds for each of 420 candidates, totalling 2100 fits


[Parallel(n_jobs=-1)]: Done 1570 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 2077 out of 2100 | elapsed:   10.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 2100 out of 2100 | elapsed:   10.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    0.0s


Best paramters_found:  {'colsample_bytree': 0.2, 'eta': 0.001, 'max_depth': 7, 'n_estimators': 100}
Fitting 6 folds for each of 420 candidates, totalling 2520 fits


[Parallel(n_jobs=-1)]: Done 1640 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 2364 tasks      | elapsed:   11.2s


Best paramters_found:  {'colsample_bytree': 0.2, 'eta': 0.001, 'max_depth': 9, 'n_estimators': 50}


[Parallel(n_jobs=-1)]: Done 2520 out of 2520 | elapsed:   12.6s finished


In [67]:
# Tabulate the best AUC found for each fold count (cv_vals and best_auc are in matching order).
print(pd.DataFrame(list(zip(cv_vals, best_auc)), columns=['CV_folds', 'AUC']))

   CV_folds       AUC
0         3  0.874276
1         4  0.880115
2         5  0.883024
3         6  0.879974


In [69]:
# Final model: hyperparameters hard-coded from the 5-fold grid search result printed above
# (the fold count with the highest AUC in the table). They are not read from the search
# object, so they must be updated by hand if the search is re-run with different results.
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, eta=0.001, colsample_bytree=0.2, max_depth=7, seed=123)

xgb_classifier.fit(X_train, y_train)

# Class predictions for the test set, in X_test row order.
opt_preds = xgb_classifier.predict(X_test)

# Pair predictions with PassengerId from the originally loaded test_data and write the submission file.
opt_sub = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': opt_preds})
opt_sub.to_csv('opt_submission.csv', index=False)