In [19]:
import pandas as pd
# Load the two CSV splits; only `train` is read for the 'Survived' label below.
test = pd.read_csv('data/test.csv', sep=',')
train = pd.read_csv('data/train.csv', sep=',')

# Columns used as model inputs.
selected_features = ['Pclass', 'Sex', 'Age', 'Embarked', 'SibSp', 'Parch', 'Fare']

# Take explicit copies: a bare `train[selected_features]` is a slice of the
# parent frame, so filling missing values on it later raises
# SettingWithCopyWarning (or is silently lost, depending on the pandas version).
X_train = train[selected_features].copy()
X_test = test[selected_features].copy()
y_train = train['Survived']

In [20]:
# Show the distribution of embarkation ports in both splits.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(X_train['Embarked'].value_counts())
print(X_test['Embarked'].value_counts())

# Fill missing 'Embarked' values with 'S', the most frequent port in the
# counts printed above. fillna on the whole frame with a per-column dict
# returns a new frame; the previous `X['Embarked'].fillna(..., inplace=True)`
# acted on a temporary column object and is not guaranteed to update X.
X_train = X_train.fillna({'Embarked': 'S'})
X_test = X_test.fillna({'Embarked': 'S'})

S    644
C    168
Q     77
Name: Embarked, dtype: int64
S    270
C    102
Q     46
Name: Embarked, dtype: int64


In [21]:
# Mean values used to impute the missing numeric entries.
age_train_mean = X_train['Age'].mean()
age_test_mean = X_test['Age'].mean()
fare_test_mean = X_test['Fare'].mean()
# NOTE(review): the test split is imputed with its own means; using statistics
# computed on the training split instead would keep preprocessing independent
# of the test data -- confirm which is intended.

# Frame-level fillna with a {column: value} dict returns a new frame, which
# avoids the chained `X['Age'].fillna(..., inplace=True)` form that may operate
# on a temporary copy and leave the frame unchanged.
X_train = X_train.fillna({'Age': age_train_mean})
X_test = X_test.fillna({'Age': age_test_mean, 'Fare': fare_test_mean})

# Print column summaries to confirm no nulls remain in either split.
X_train.info()
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
Embarked    891 non-null object
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 48.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Age         418 non-null float64
Embarked    418 non-null object
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 22.9+ KB


In [22]:
from sklearn.feature_extraction import DictVectorizer
# Turn each row into a {column: value} dict and vectorize it into a dense
# array: string-valued columns are expanded into one indicator column per
# category (e.g. 'Sex=female'), numeric columns are passed through.
dict_vec = DictVectorizer(sparse=False)

# orient='records' is the full spelling; the abbreviated 'record' is deprecated
# and rejected by newer pandas releases.
# The vectorizer is fitted on the training rows only and then re-used for the
# test rows so both arrays share the same column layout.
X_train = dict_vec.fit_transform(X_train.to_dict(orient='records'))
X_test = dict_vec.transform(X_test.to_dict(orient='records'))

In [23]:
# Display the names of the columns produced by the fitted DictVectorizer.
dict_vec.feature_names_

['Age',
 'Embarked=C',
 'Embarked=Q',
 'Embarked=S',
 'Fare',
 'Parch',
 'Pclass',
 'Sex=female',
 'Sex=male',
 'SibSp']

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# Baseline models with default hyper-parameters.
# random_state is fixed so the random forest's cross-validation score is
# reproducible from run to run; without it the displayed number changes on
# every execution.
rfc = RandomForestClassifier(random_state=0)
xgbc = XGBClassifier()

# Mean score of the random forest over 5 cross-validation folds.
cross_val_score(rfc, X_train, y_train, cv=5).mean()


0.8058730677175262

In [25]:
# Score the default-parameter XGBoost classifier on 5 cross-validation folds
# and display the average of the fold scores.
xgbc_fold_scores = cross_val_score(xgbc, X_train, y_train, cv=5)
xgbc_fold_scores.mean()

0.81824559798311003

In [26]:
from sklearn.model_selection import GridSearchCV
# Search grid: 5 tree depths x 5 ensemble sizes x 5 learning rates
# = 125 candidate settings, each evaluated with 5-fold cross-validation.
params = {
    'max_depth': range(2, 7),
    'n_estimators': range(100, 1100, 200),
    'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0],
}
xgbc_best = XGBClassifier()

# n_jobs=-1 runs the fits in parallel; verbose=1 prints progress.
gs = GridSearchCV(
    estimator=xgbc_best,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X_train, y_train)

Fitting 5 folds for each of 125 candidates, totalling 625 fits


[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 482 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done 625 out of 625 | elapsed:   21.5s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [100, 300, 500, 700, 900], 'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0], 'max_depth': [2, 3, 4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [27]:
# Display the best mean cross-validation score found by the grid search
# (the search was built without a `scoring` argument).
gs.best_score_

0.83501683501683499

In [28]:
# Display the hyper-parameter combination that achieved the best score.
gs.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

In [29]:
# Predict labels for the vectorized test rows with the fitted grid search
# object (its repr above shows refit=True, so a refitted best model is used).
xgbc_best_y_predict = gs.predict(X_test)

In [30]:
import os

# Make sure the output directory exists; to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
# (isdir + makedirs is used instead of exist_ok so it also runs on Python 2.)
if not os.path.isdir('result'):
    os.makedirs('result')

# Pair every test-set PassengerId with its predicted 'Survived' label and
# write the two columns to CSV without the DataFrame index.
xgbc_best_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': xgbc_best_y_predict})
xgbc_best_submission.to_csv('result/xgbc_best_submission.csv', index=False)