In [None]:
# © 2020 지성. all rights reserved.
# <llllllllll@kakao.com>
# MIT License

In [970]:
import numpy as np 
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [971]:
train = pd.read_csv('D:/JISUNG/titanic/data/train.csv')
test = pd.read_csv('D:/JISUNG/titanic/data/test.csv')

In [972]:
passenger_id = test.PassengerId

In [973]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [974]:
train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [975]:
train.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
482,483,0,3,"Rouse, Mr. Richard Henry",male,50.0,0,0,A/5 3594,8.05,,S
63,64,0,3,"Skoog, Master. Harald",male,4.0,3,2,347088,27.9,,S
643,644,1,3,"Foo, Mr. Choong",male,,0,0,1601,56.4958,,S
270,271,0,1,"Cairns, Mr. Alexander",male,,0,0,113798,31.0,,S
298,299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S


In [976]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [977]:
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [978]:
train.Embarked.fillna(train['Embarked'].mode()[0], inplace=True)
test.Fare.fillna(test['Fare'].median(), inplace=True)

In [979]:
# train.Cabin.fillna('N')
# train['Cabin'] = train.Cabin.str[0]
# train['Cabin'] = train.Cabin.replace('G', 'N')
# train['Cabin'] = train.Cabin.replace('T', 'N')

# test.Cabin.fillna('N')
# test['Cabin'] = test.Cabin.str[0]
# test['Cabin'] = test.Cabin.replace('G', 'N')

In [980]:
train['Women_Child'] = ((train.Age < 10) | (train.Sex == 'female'))
test['Women_Child'] = ((test.Age < 10) | (test.Sex == 'female'))

In [981]:
train['Family'] = train.SibSp + train.Parch + 1
test['Family'] = test.SibSp + test.Parch + 1

In [982]:
train['Title'] = train.Name.str.extract('(, \w+.)', expand=False).str.replace('[,. ]', '')
train['Title'] = train.Title.replace('Mlle', 'Miss')
train['Title'] = train.Title.replace('Ms', 'Miss')
train['Title'] = train.Title.replace('Mme', 'Mrs')
train['Title'] = train.Title.str.replace('(Major|Col|the|Capt|Don|Jonkheer|Sir|Lady|Dr|Rev)', 'ETC')

test['Title'] = test.Name.str.extract('(, \w+.)', expand=False).str.replace('[,. ]', '')
test['Title'] = test.Title.replace('Mlle', 'Miss')
test['Title'] = test.Title.replace('Ms', 'Miss')
test['Title'] = test.Title.replace('Mme', 'Mrs')
test['Title'] = train.Title.str.replace('(Rev|Col|Dona|Dr)', 'ETC')

In [983]:
train.Title.value_counts()

Mr        517
Miss      185
Mrs       126
Master     40
ETC        23
Name: Title, dtype: int64

In [984]:
test.Title.value_counts()

Mr        228
Miss      101
Mrs        59
Master     23
ETC         7
Name: Title, dtype: int64

In [985]:
train_title = train.groupby('Title')['Age'].mean().round()
train_df = pd.DataFrame(
    {
        'Title':train_title.index,
        'Age_Mean':train_title.values
    }
)
test_title = test.groupby('Title')['Age'].mean().round()
test_df = pd.DataFrame(
    {
        'Title':test_title.index,
        'Age_Mean':test_title.values
    }
)

train = pd.merge(train, train_df, on='Title')
test = pd.merge(test, test_df, on='Title')

train.Age.fillna(train.Age_Mean, inplace=True)
test.Age.fillna(test.Age_Mean, inplace=True)

In [986]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Women_Child      0
Family           0
Title            0
Age_Mean         0
dtype: int64

In [987]:
train['Age'] = pd.cut(
    train.Age,
    bins=[0, 10, 20, 60, 80],
    labels=['Children', 'Teenager', 'Adult', 'Elder']
)
test['Age'] = pd.cut(
    test.Age,
    bins=[0, 10, 20, 60, 80],
    labels=['Children', 'Teenager', 'Adult', 'Elder']
)

In [988]:
train['Fare'] = pd.cut(
    train.Fare,
    bins=[-1, 7.9104, 14.4542, 31, 512.3292],
    labels=['Q1', 'Q2', 'Q3', 'Max']
)
test['Fare'] = pd.cut(
    test.Fare,
    bins=[-1, 7.9104, 14.4542, 31, 512.3292],
    labels=['Q1', 'Q2', 'Q3', 'Max']
)

In [989]:
train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Age_Mean'], axis=1, inplace=True)
test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Age_Mean'], axis=1, inplace=True)

In [990]:
train.isna().sum()

Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
Women_Child    0
Family         0
Title          0
dtype: int64

In [991]:
test.isna().sum()

Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
Women_Child    0
Family         0
Title          0
dtype: int64

In [992]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Women_Child,Family,Title
0,0,3,male,Adult,1,0,Q1,S,False,2,Mr
1,0,3,male,Adult,0,0,Q2,S,False,1,Mr
2,0,3,male,Adult,0,0,Q2,Q,False,1,Mr
3,0,1,male,Adult,0,0,Max,S,False,1,Mr
4,0,3,male,Teenager,0,0,Q2,S,False,1,Mr


In [993]:
train = pd.get_dummies(
    train,
    columns=[
        'Sex', 'Age', 'Fare', 'Embarked', 'Title'
    ],
    prefix=[
        'Sex', 'Age', 'Fare', 'Embarked', 'Title'
    ]
)
test = pd.get_dummies(
    test,
    columns=[
        'Sex', 'Age', 'Fare', 'Embarked', 'Title'
    ],
    prefix=[
        'Sex', 'Age', 'Fare', 'Embarked', 'Title'
    ]
)

In [994]:
train.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Women_Child,Family,Sex_female,Sex_male,Age_Children,Age_Teenager,...,Fare_Q3,Fare_Max,Embarked_C,Embarked_Q,Embarked_S,Title_ETC,Title_Master,Title_Miss,Title_Mr,Title_Mrs
0,0,3,1,0,False,2,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0,3,0,0,False,1,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,3,0,0,False,1,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
3,0,1,0,0,False,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
4,0,3,0,0,False,1,0,1,0,1,...,0,0,0,0,1,0,0,0,1,0


In [995]:
feature = train.drop('Survived', axis=1)
target = train.Survived
X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.3, random_state=19)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((623, 23), (268, 23), (623,), (268,))

In [998]:
model = RandomForestClassifier(
    criterion='gini',
    n_estimators=300,
    min_samples_split=10,
    min_samples_leaf=1,
    max_features='auto',
    oob_score=True,
    random_state=1,
    n_jobs=-1
)

model.fit(X_train, y_train)
pred = model.predict(X_test)

accuracy_score(pred, y_test)

0.8171641791044776

In [999]:
# X_train = pd.concat([X_train, X_test])
# y_train = pd.concat([y_train, y_test])
# X_test = test

# predict = model.predict(X_test)
# submission = pd.DataFrame(
#     {
#         'PassengerId':passenger_id,
#         'Survived':predict
#     }
# )

# submission.to_csv('D:/JISUNG/titanic/submission.csv', mode='w', index=False)

In [1000]:
k_fold = KFold(n_splits=10, random_state=22)
result = cross_val_score(model, feature, target, cv=10, scoring='accuracy')

result.mean()



0.7734207240948814

In [1001]:
X_train = pd.concat([X_train, X_test])
y_train = pd.concat([y_train, y_test])
X_test = test
X_train.shape, y_train.shape, X_test.shape

((891, 23), (891,), (418, 23))

In [1002]:
model = RandomForestClassifier()
n_estimators = range(100, 1000, 50)

param_grid = {'n_estimators':n_estimators}

model2 = GridSearchCV(
    model,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
    n_jobs=5,
    verbose=1
)

model2.fit(X_train, y_train)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:    5.0s
[Parallel(n_jobs=5)]: Done 180 out of 180 | elapsed:   47.4s finished


GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rand

In [1003]:
model2.best_score_

0.8193508114856429

In [1005]:
model3 = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion='gini',
    max_depth=None,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=None,
    verbose=0,
    warm_start=False
)
model3.fit(X_train, y_train)
model3.score(X_train, y_train)

0.8810325476992144

In [1006]:
predict = model3.predict(X_test)
submission = pd.DataFrame(
    {
        'PassengerId':passenger_id,
        'Survived':predict
    }
)

In [1007]:
submission.to_csv('D:/JISUNG/titanic/submission.csv', mode='w', index=False)