In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.ensemble import RandomForestClassifier

In [2]:
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

In [3]:
missing_val_df = pd.DataFrame(index=["Total", "Unique Cabin", "Missing Cabin"])
for name, df in zip(("Training data", "Test data"), (train, test)):
    total = df.shape[0]
    unique_cabin = len(df["Cabin"].unique())
    missing_cabin = df["Cabin"].isnull().sum()
    missing_val_df[name] = [total, unique_cabin, missing_cabin]
missing_val_df

Unnamed: 0,Training data,Test data
Total,891,418
Unique Cabin,148,77
Missing Cabin,687,327


In [4]:
train.drop("PassengerId", axis=1, inplace=True)
for df in train, test:
    df.drop("Cabin", axis=1, inplace=True)

In [5]:
for df in train, test:
    df["Embarked"].fillna("S", inplace=True)
    for feature in "Age", "Fare":
        df[feature].fillna(train[feature].mean(), inplace=True)

In [6]:
for df in train, test:
    df.drop("Ticket", axis=1, inplace=True)

In [7]:
for df in train, test:
    df["Embarked"] = df["Embarked"].map(dict(zip(("S", "C", "Q"), (0, 1, 2))))
    df["Sex"] = df["Sex"].map(dict(zip(("female", "male"), (0, 1))))

In [8]:
for df in train, test:
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

In [9]:
for df in train, test:
    titles = list()
    for row in df["Name"]:
        surname, title, name = re.split(r"[,.]", row, maxsplit=2)
        titles.append(title.strip())
    df["Title"] = titles
    df.drop("Name", axis=1, inplace=True)

In [10]:
for df in train, test:
    for key, value in zip(("Mr", "Mrs", "Miss", "Master", "Dr", "Rev"),
                          np.arange(6)):
        df.loc[df["Title"] == key, "Title"] = value
    df.loc[df["Title"] == "Ms", "Title"] = 1
    for title in "Major", "Col", "Capt":
        df.loc[df["Title"] == title, "Title"] = 6
    for title in "Mlle", "Mme":
        df.loc[df["Title"] == title, "Title"] = 7
    for title in "Don", "Sir":
        df.loc[df["Title"] == title, "Title"] = 8
    for title in "Lady", "the Countess", "Jonkheer":
        df.loc[df["Title"] == title, "Title"] = 9
test["Title"][414] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [11]:
train.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Title
0,0,3,1,22.0,1,0,7.25,0,2,0
1,1,1,0,38.0,1,0,71.2833,1,2,1
2,1,3,0,26.0,0,0,7.925,0,1,2


In [12]:
from sklearn.model_selection import train_test_split

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch",
              "Fare", "Embarked", "FamilySize", "Title"]
X_train, X_test, y_train, y_test = train_test_split(train[predictors], train["Survived"])

In [13]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import xgboost as xgb
import lightgbm as lgb

In [14]:
# スコア方法をF1に設定
f1_scoring = make_scorer(f1_score,  pos_label=1)

In [15]:
# RandomForest
print("RandomForest")
forest_param = {
    'n_estimators': [20,100,500],
    'max_depth': [3,5,7,9],
    'min_samples_leaf': [1, 2, 4]
}

RandomForest


In [16]:
from contextlib import contextmanager
import time

@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

In [17]:
with timer('RandomForest Model Trainning'):
    # グリッドサーチで学習
    rf = GridSearchCV(RandomForestClassifier(random_state=0, n_jobs=-1), forest_param, scoring=f1_scoring, cv=5)
    rf.fit(X_train, y_train)
    print('Best parameters: {}'.format(rf.best_params_))

Best parameters: {'max_depth': 9, 'min_samples_leaf': 1, 'n_estimators': 20}
[RandomForest Model Trainning] done in 27 s


In [18]:
# スコア検証
print('Train score: {:.3f}'.format(rf.score(X_train, y_train)))
print('Confusion matrix:\n{}'.format(confusion_matrix(y_test, rf.predict(X_test))))
print('f1 score: {:.3f}'.format(f1_score(y_test, rf.predict(X_test))))
rf_importances = pd.Series(rf.best_estimator_.feature_importances_, index = X_train.columns)
print(rf_importances)

Train score: 0.906
Confusion matrix:
[[126  15]
 [ 23  59]]
f1 score: 0.756
Pclass        0.080899
Sex           0.189316
Age           0.155218
SibSp         0.029664
Parch         0.022349
Fare          0.183555
Embarked      0.033418
FamilySize    0.081554
Title         0.224027
dtype: float64


In [20]:
prediction_rf = rf.predict(test[predictors])
submission_rf = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": prediction_rf})
submission_rf.to_csv("submission_RandomForest3.csv", index=False)

In [21]:
# XGBoosting
print("XGBoosting")
xgb_param = {
    'learning_rate':[0.1,0.2],
    'n_estimators':[20,100,500],
    'max_depth':[3,5,7,9],
    'min_child_weight':[0.5,1,2],
    'max_delta_step':[5],
    'gamma':[1,3,5],
    'subsample':[0.8],
    'colsample_bytree':[0.8],
    'objective':['binary:logistic'],
    'nthread':[4],
    'scale_pos_weight':[1],
    'seed':[0]
}

XGBoosting


In [22]:
with timer('XGboost Model Trainning'):
    # グリッドサーチで学習
    xgb = GridSearchCV(xgb.XGBClassifier(
        silent=True, booster='gbtree', reg_alpha=0, reg_lambda=1, base_score=0.5, random_state=0, missing=None),
        xgb_param, scoring=f1_scoring, cv=4)
    xgb.fit(X_train, y_train)
print('Best parameters: {}'.format(xgb.best_params_))

[XGboost Model Trainning] done in 138 s
Best parameters: {'colsample_bytree': 0.8, 'gamma': 3, 'learning_rate': 0.2, 'max_delta_step': 5, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 500, 'nthread': 4, 'objective': 'binary:logistic', 'scale_pos_weight': 1, 'seed': 0, 'subsample': 0.8}


In [23]:
# スコア検証
print('Train score: {:.3f}'.format(xgb.score(X_train, y_train)))
print('Confusion matrix:\n{}'.format(confusion_matrix(y_test, xgb.predict(X_test))))
print('f1 score: {:.3f}'.format(f1_score(y_test, xgb.predict(X_test))))
xgb_importances = pd.Series(xgb.best_estimator_.feature_importances_, index = X_train.columns)
print(xgb_importances)

Train score: 0.880
Confusion matrix:
[[127  14]
 [ 24  58]]
f1 score: 0.753
Pclass        0.069767
Sex           0.034884
Age           0.340116
SibSp         0.023256
Parch         0.008721
Fare          0.375000
Embarked      0.031977
FamilySize    0.046512
Title         0.069767
dtype: float32


In [25]:
test.Title = test.Title.astype(int)

In [26]:
prediction_xgb = xgb.predict(test[predictors])
submission_xgb = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": prediction_xgb})
submission_xgb.to_csv("submission_XGBoost2.csv", index=False)

In [27]:
# LightGBM
print("LightGBM")
gbm_param = {
    'learning_rate':[0.1,0.2],
    'n_estimators':[20,100,500],
    'max_depth':[3,5,7,9],
    'min_child_weight':[0.5,1,2],
    'min_child_samples':[5,10,20],
    'subsample':[0.8],
    'colsample_bytree':[0.8],
    'verbose':[-1],
    'num_leaves':[80]
}
with timer('LightGBM Model Trainning'):
    # グリッドサーチで学習
    gbm = GridSearchCV(lgb.LGBMClassifier(),gbm_param, scoring=f1_scoring, cv=5)
    gbm.fit(X_train, y_train)
print('Best parameters: {}'.format(gbm.best_params_))

LightGBM
[LightGBM Model Trainning] done in 50 s
Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 3, 'min_child_samples': 5, 'min_child_weight': 0.5, 'n_estimators': 100, 'num_leaves': 80, 'subsample': 0.8, 'verbose': -1}


In [28]:
# スコア検証
print('Train score: {:.3f}'.format(gbm.score(X_train, y_train)))
print('Confusion matrix:\n{}'.format(confusion_matrix(y_test, gbm.predict(X_test))))
print('f1 score: {:.3f}'.format(f1_score(y_test, gbm.predict(X_test))))
gbm_importances = pd.Series(gbm.best_estimator_.feature_importances_, index = X_train.columns)
print(gbm_importances)

Train score: 0.896
Confusion matrix:
[[125  16]
 [ 24  58]]
f1 score: 0.744
Pclass         35
Sex            28
Age           196
SibSp          17
Parch           9
Fare          198
Embarked       20
FamilySize     36
Title          50
dtype: int32


In [30]:
prediction_gbm = gbm.predict(test[predictors])
submission_gbm = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": prediction_gbm})
submission_gbm.to_csv("submission_LightGBM2.csv", index=False)