In [2]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.ensemble import RandomForestClassifier

In [13]:
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

In [14]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [15]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [16]:
missing_val_df = pd.DataFrame(index=["Total", "Unique Cabin", "Missing Cabin"])
for name, df in zip(("Training data", "Test data"), (train, test)):
    total = df.shape[0]
    unique_cabin = len(df["Cabin"].unique())
    missing_cabin = df["Cabin"].isnull().sum()
    missing_val_df[name] = [total, unique_cabin, missing_cabin]
missing_val_df

Unnamed: 0,Training data,Test data
Total,891,418
Unique Cabin,148,77
Missing Cabin,687,327


In [17]:
train.drop("PassengerId", axis=1, inplace=True)
for df in train, test:
    df.drop("Cabin", axis=1, inplace=True)

In [18]:
for df in train, test:
    df["Embarked"].fillna("S", inplace=True)
    for feature in "Age", "Fare":
        df[feature].fillna(train[feature].mean(), inplace=True)

In [20]:
train.head(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [21]:
for df in train, test:
    df.drop("Ticket", axis=1, inplace=True)

In [22]:
for df in train, test:
    df["Embarked"] = df["Embarked"].map(dict(zip(("S", "C", "Q"), (0, 1, 2))))
    df["Sex"] = df["Sex"].map(dict(zip(("female", "male"), (0, 1))))

In [23]:
train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,7.25,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833,1
2,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,7.925,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,53.1,0
4,0,3,"Allen, Mr. William Henry",1,35.0,0,0,8.05,0


In [24]:
for df in train, test:
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

In [25]:
train.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize
0,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,7.25,0,2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833,1,2
2,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,7.925,0,1


In [26]:
for df in train, test:
    titles = list()
    for row in df["Name"]:
        surname, title, name = re.split(r"[,.]", row, maxsplit=2)
        titles.append(title.strip())
    df["Title"] = titles
    df.drop("Name", axis=1, inplace=True)

In [27]:
train.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Title
0,0,3,1,22.0,1,0,7.25,0,2,Mr
1,1,1,0,38.0,1,0,71.2833,1,2,Mrs
2,1,3,0,26.0,0,0,7.925,0,1,Miss


In [28]:
for df in train, test:
    for key, value in zip(("Mr", "Mrs", "Miss", "Master", "Dr", "Rev"),
                          np.arange(6)):
        df.loc[df["Title"] == key, "Title"] = value
    df.loc[df["Title"] == "Ms", "Title"] = 1
    for title in "Major", "Col", "Capt":
        df.loc[df["Title"] == title, "Title"] = 6
    for title in "Mlle", "Mme":
        df.loc[df["Title"] == title, "Title"] = 7
    for title in "Don", "Sir":
        df.loc[df["Title"] == title, "Title"] = 8
    for title in "Lady", "the Countess", "Jonkheer":
        df.loc[df["Title"] == title, "Title"] = 9
test["Title"][414] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [29]:
train.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Title
0,0,3,1,22.0,1,0,7.25,0,2,0
1,1,1,0,38.0,1,0,71.2833,1,2,1
2,1,3,0,26.0,0,0,7.925,0,1,2


In [30]:
from sklearn.model_selection import train_test_split

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch",
              "Fare", "Embarked", "FamilySize", "Title"]
X_train, X_test, y_train, y_test = train_test_split(train[predictors], train["Survived"])

In [31]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import xgboost as xgb
import lightgbm as lgb

In [32]:
# スコア方法をF1に設定
f1_scoring = make_scorer(f1_score,  pos_label=1)

In [33]:
# RandomForest
print("RandomForest")
forest_param = {
    'n_estimators': [20,100,500],
    'max_depth': [3,5,7,9],
    'min_samples_leaf': [1, 2, 4]
}
'''
forest = RandomForestClassifier(n_estimators=100,
                                criterion='gini',
                                max_depth=5,
                                min_samples_split=10,
                                min_samples_leaf=5,
                                random_state=0)
'''

RandomForest


In [34]:
from contextlib import contextmanager
import time

In [35]:
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

In [36]:
with timer('RandomForest Model Trainning'):
    # グリッドサーチで学習
    rf = GridSearchCV(RandomForestClassifier(random_state=0, n_jobs=-1), forest_param, scoring=f1_scoring, cv=5)
    rf.fit(X_train, y_train)
    print('Best parameters: {}'.format(rf.best_params_))

Best parameters: {'max_depth': 9, 'min_samples_leaf': 2, 'n_estimators': 20}
[RandomForest Model Trainning] done in 27 s


In [37]:
# スコア検証
print('Train score: {:.3f}'.format(rf.score(X_train, y_train)))
print('Confusion matrix:\n{}'.format(confusion_matrix(y_test, rf.predict(X_test))))
print('f1 score: {:.3f}'.format(f1_score(y_test, rf.predict(X_test))))
rf_importances = pd.Series(rf.best_estimator_.feature_importances_, index = X_train.columns)
print(rf_importances)

Train score: 0.855
Confusion matrix:
[[123  14]
 [ 25  61]]
f1 score: 0.758
Pclass        0.077512
Sex           0.218300
Age           0.137676
SibSp         0.047361
Parch         0.029156
Fare          0.174237
Embarked      0.026255
FamilySize    0.057820
Title         0.231683
dtype: float64


In [38]:
# XGBoosting
print("XGBoosting")
xgb_param = {
    'learning_rate':[0.1,0.2],
    'n_estimators':[20,100,500],
    'max_depth':[3,5,7,9],
    'min_child_weight':[0.5,1,2],
    'max_delta_step':[5],
    'gamma':[1,3,5],
    'subsample':[0.8],
    'colsample_bytree':[0.8],
    'objective':['binary:logistic'],
    'nthread':[4],
    'scale_pos_weight':[1],
    'seed':[0]
}

XGBoosting


In [39]:
with timer('XGboost Model Trainning'):
    # グリッドサーチで学習
    xgb = GridSearchCV(xgb.XGBClassifier(
        silent=True, booster='gbtree', reg_alpha=0, reg_lambda=1, base_score=0.5, random_state=0, missing=None),
        xgb_param, scoring=f1_scoring, cv=4)
    xgb.fit(X_train, y_train)
print('Best parameters: {}'.format(xgb.best_params_))

[XGboost Model Trainning] done in 136 s
Best parameters: {'colsample_bytree': 0.8, 'gamma': 1, 'learning_rate': 0.2, 'max_delta_step': 5, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 100, 'nthread': 4, 'objective': 'binary:logistic', 'scale_pos_weight': 1, 'seed': 0, 'subsample': 0.8}


In [40]:
# スコア検証
print('Train score: {:.3f}'.format(xgb.score(X_train, y_train)))
print('Confusion matrix:\n{}'.format(confusion_matrix(y_test, xgb.predict(X_test))))
print('f1 score: {:.3f}'.format(f1_score(y_test, xgb.predict(X_test))))
xgb_importances = pd.Series(xgb.best_estimator_.feature_importances_, index = X_train.columns)
print(xgb_importances)

Train score: 0.897
Confusion matrix:
[[125  12]
 [ 26  60]]
f1 score: 0.759
Pclass        0.050691
Sex           0.030722
Age           0.317972
SibSp         0.026114
Parch         0.015361
Fare          0.399386
Embarked      0.033794
FamilySize    0.052227
Title         0.073733
dtype: float32


In [41]:
# LightGBM
print("LightGBM")
gbm_param = {
    'learning_rate':[0.1,0.2],
    'n_estimators':[20,100,500],
    'max_depth':[3,5,7,9],
    'min_child_weight':[0.5,1,2],
    'min_child_samples':[5,10,20],
    'subsample':[0.8],
    'colsample_bytree':[0.8],
    'verbose':[-1],
    'num_leaves':[80]
}

LightGBM


In [42]:
with timer('LightGBM Model Trainning'):
    # グリッドサーチで学習
    gbm = GridSearchCV(lgb.LGBMClassifier(),gbm_param, scoring=f1_scoring, cv=5)
    gbm.fit(X_train, y_train)
print('Best parameters: {}'.format(gbm.best_params_))

[LightGBM Model Trainning] done in 50 s
Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 3, 'min_child_samples': 10, 'min_child_weight': 2, 'n_estimators': 100, 'num_leaves': 80, 'subsample': 0.8, 'verbose': -1}




In [43]:
# スコア検証
print('Train score: {:.3f}'.format(gbm.score(X_train, y_train)))
print('Confusion matrix:\n{}'.format(confusion_matrix(y_test, gbm.predict(X_test))))
print('f1 score: {:.3f}'.format(f1_score(y_test, gbm.predict(X_test))))
gbm_importances = pd.Series(gbm.best_estimator_.feature_importances_, index = X_train.columns)
print(gbm_importances)

Train score: 0.874
Confusion matrix:
[[120  17]
 [ 26  60]]
f1 score: 0.736
Pclass         30
Sex            15
Age           157
SibSp          10
Parch           7
Fare          205
Embarked       12
FamilySize     29
Title          54
dtype: int32


In [59]:
test.Title = test.Title.astype(int)

In [61]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null int64
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           418 non-null float64
Embarked       418 non-null int64
FamilySize     418 non-null int64
Title          418 non-null int32
dtypes: float64(2), int32(1), int64(7)
memory usage: 31.1 KB


In [62]:
prediction = xgb.predict(test[predictors])

In [63]:
submission = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": prediction})
submission.to_csv("submission.csv", index=False)

In [64]:
prediction_rf = rf.predict(test[predictors])
submission_rf = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": prediction_rf})
submission_rf.to_csv("submission_RandomForest.csv", index=False)

In [65]:
prediction_gbm = gbm.predict(test[predictors])
submission_gbm = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": prediction_gbm})
submission_gbm.to_csv("submission_LightGBM.csv", index=False)