In [1]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, cross_val_score

from xgboost import XGBClassifier


In [214]:
train_file = 'train.csv'
test_file = 'test.csv'

train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)

In [215]:
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [216]:
train_y = train_data['Survived']
train_X = train_data.drop(['Survived', 'Ticket', 'PassengerId'], axis=1)

test_X = test_data.drop(['Ticket', 'PassengerId'], axis=1)

In [217]:
train_X.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S


In [218]:
names = train_X['Name'].copy()
prefix = []

names_test = test_X['Name'].copy()
prefix_test = []

In [219]:
for i, name in enumerate(names):
    if 'Mr' in name:
        prefix.append('Mr')
    elif 'Mrs' in name:
        prefix.append('Mrs')
    else:
        prefix.append('x')
    last_name = name.split()[-1].replace(")","").replace('\"',"")
    names[i] = last_name
    
        

In [220]:
for i, name in enumerate(names_test):
    if 'Mr' in name:
        prefix_test.append('Mr')
    elif 'Mrs' in name:
        prefix_test.append('Mrs')
    else:
        prefix_test.append('x')
    last_name = name.split()[-1].replace(")","").replace('\"',"")
    names_test[i] = last_name

In [221]:
cabin = train_X['Cabin'].copy()

cabin_test = test_X['Cabin'].copy()

In [222]:
for i, c in enumerate(cabin):
    cabin_string = str(c)
    cabin[i] = cabin_string[0]

In [223]:
for i, c in enumerate(cabin_test):
    cabin_string = str(c)
    cabin[i] = cabin_string[0]

In [224]:
train_X.drop(['Name', 'Cabin'], axis=1, inplace=True)
test_X.drop(['Name', 'Cabin'], axis=1, inplace=True)

train_X['LastNames'] = names
train_X['Prefix'] = prefix
train_X['Cabin'] = cabin

test_X['LastNames'] = names_test
test_X['Prefix'] = prefix_test
test_X['Cabin'] = cabin_test


In [225]:
train_X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,LastNames,Prefix,Cabin
0,3,male,22.0,1,0,7.2500,S,Harris,Mr,n
1,1,female,38.0,1,0,71.2833,C,Thayer,Mr,n
2,3,female,26.0,0,0,7.9250,S,Laina,x,n
3,1,female,35.0,1,0,53.1000,S,Peel,Mr,n
4,3,male,35.0,0,0,8.0500,S,Henry,Mr,n
5,3,male,,0,0,8.4583,Q,James,Mr,n
6,1,male,54.0,0,0,51.8625,S,J,Mr,n
7,3,male,2.0,3,1,21.0750,S,Leonard,x,n
8,3,female,27.0,0,2,11.1333,S,Berg,Mr,n
9,2,female,14.0,1,0,30.0708,C,Achem,Mr,n


In [226]:
train_X = pd.get_dummies(train_X)
test_X = pd.get_dummies(test_X)


In [227]:
cols_with_missing_values = [col for col in train_X.columns 
                            if train_X[col].isnull().any()]

for col in cols_with_missing_values:
    train_X[col + 'was_missing'] = train_X[col].isnull()
    test_X[col + 'was_missing'] = test_X[col].isnull()
    

In [228]:
train_X, test_X = train_X.align(test_X, join='left', axis=1)

In [211]:
imputer = SimpleImputer()

train_X = imputer.fit_transform(train_X)
test_X = imputer.transform(test_X)

In [184]:
#params = {'gamma':[0, 0.1, 0.3, 0.5, 0.9]}

In [185]:
#after testing some hyperparams got n_estimators=1000, learning_rate=0.001, min_child_weight=1, max_depth=8, gamma=0.1 with bests results
gridSearch = GridSearchCV(XGBClassifier(n_estimators=1000, learning_rate=0.001, min_child_weight=1, max_depth=8, gamma=0.1), params, scoring='accuracy', n_jobs=4, cv=5)

In [186]:
gridSearch.fit(train_X, train_y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.001, max_delta_step=0,
       max_depth=8, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'gamma': [0, 0.1, 0.3, 0.5, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [187]:
gridSearch.best_params_ , gridSearch.best_score_ 

({'gamma': 0.1}, 0.8282828282828283)

In [229]:
model = XGBClassifier(n_estimators=1000, learning_rate=0.001, min_child_weight=1, max_depth=8, gamma=0.1, n_jobs=4)

In [93]:
scores = cross_val_score(model, train_X, train_y, scoring='accuracy', cv=5, n_jobs=-1)

In [94]:
print(scores.mean() * 100)

81.70966104886645


In [230]:
pipeline = make_pipeline(SimpleImputer(), model)


In [231]:
pipeline.fit(train_X, train_y)

Pipeline(memory=None,
     steps=[('simpleimputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('xgbclassifier', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.1, learning_rate=0.001,
       max_delta_step=0, max_d... reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1))])

In [232]:
predictions = pipeline.predict(test_X)

In [233]:
predictions

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [234]:
submission = pd.DataFrame({
        "PassengerId": test_data["PassengerId"],
        "Survived": predictions
    })
submission.to_csv('submission.csv', index=False)