In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the three competition files: the labelled training set, the unlabelled
# test set, and the sample submission that is later reused as the output template.
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')
submission = pd.read_csv('../input/titanic/gender_submission.csv')

PassengerId : 각 승객의 고유 번호

Survived : 생존 여부(종속 변수)

0 = 사망
1 = 생존

Pclass : 객실 등급 - 승객의 사회적, 경제적 지위

1st = Upper
2nd = Middle
3rd = Lower

Name : 이름

Sex : 성별

Age : 나이

SibSp : 동반한 Sibling(형제자매)와 Spouse(배우자)의 수

Parch : 동반한 Parent(부모) Child(자식)의 수

Ticket : 티켓의 고유넘버

Fare : 티켓의 요금

Cabin : 객실 번호

Embarked : 승선한 항

C = Cherbourg
Q = Queenstown
S = Southampton

In [None]:
# Preview the first few rows of the training set.
train.head()

In [None]:
# Preview the first few rows of the test set.
test.head()

In [None]:
# Column dtypes / non-null counts and the (rows, columns) shape of each frame.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after each report.
train.info()
print(train.shape)
test.info()
print(test.shape)

In [None]:
# Count missing values in each column of the training set.
train.isnull().sum()

In [None]:
# Count missing values in each column of the test set.
test.isnull().sum()

# Visualization

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set() # setting seaborn default for plots

In [None]:
def bar_chart(feature, data=None):
    """Plot a stacked bar chart of survivors vs. non-survivors for one feature.

    Parameters
    ----------
    feature : str
        Name of the column whose value counts are compared.
    data : pandas.DataFrame, optional
        Frame containing ``feature`` and a ``Survived`` column (1 = survived,
        0 = dead). Defaults to the notebook-level ``train`` frame, which is
        looked up when the function is called.

    Notes
    -----
    The chart has one bar per outcome ('Survived', 'Dead'); each bar is stacked
    by the distinct values of ``feature``. Nothing is returned, so calling this
    as the last line of a cell renders only the figure.
    """
    if data is None:
        data = train
    survived = data.loc[data['Survived'] == 1, feature].value_counts()
    dead = data.loc[data['Survived'] == 0, feature].value_counts()
    df = pd.DataFrame([survived, dead])
    df.index = ['Survived', 'Dead']
    ax = df.plot(kind='bar', stacked=True, figsize=(10, 5))
    # Label the figure so it can be understood without reading the calling cell.
    ax.set_title(f'Survival by {feature}')
    ax.set_ylabel('Passengers')
    ax.legend(title=feature)

In [None]:
# Survival counts in the training set, stacked by Sex.
bar_chart('Sex')

In [None]:
# Survival counts in the training set, stacked by ticket class (Pclass).
bar_chart('Pclass')

In [None]:
# Survival counts in the training set, stacked by number of siblings/spouses aboard (SibSp).
bar_chart('SibSp')

In [None]:
# Survival counts in the training set, stacked by number of parents/children aboard (Parch).
bar_chart('Parch')

In [None]:
# Survival counts in the training set, stacked by port of embarkation.
bar_chart('Embarked')

# Feature Engineering

## 1️⃣ Name

### Name에서 Mr, Miss 등의 Title -> 중요 ⭐️

In [None]:
# Keep references to both frames so each feature-engineering step can be applied
# to train and test in a single loop (the frames are not concatenated).
train_test_data = [train, test]

# Extract the honorific from each name: the run of letters that follows a space
# and is terminated by a period, e.g. "Smith, Mr. John" -> "Mr".
# A raw string is used so `\.` reaches the regex engine as an escaped dot without
# triggering Python's invalid-escape-sequence warning.
for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [None]:
# Frequency of each extracted title in the training set.
train['Title'].value_counts()

In [None]:
# Frequency of each extracted title in the test set.
test['Title'].value_counts()

### Title이라는 새로운 column 생성

In [None]:
# Master까지 분류해줬더니 성능 향상
title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2, 
                 "Master": 3, "Dr": 4, "Rev": 4, "Col": 4, "Major": 4, "Mlle": 4,"Countess": 4,
                 "Ms": 4, "Lady": 4, "Jonkheer": 4, "Don": 4, "Dona" : 4, "Mme": 4,"Capt": 4,"Sir": 4 }
for dataset in train_test_data:
    dataset['Title'] = dataset['Title'].map(title_mapping)

In [None]:
# Survival counts in the training set, stacked by encoded Title (integer codes from title_mapping).
bar_chart('Title')

In [None]:
# Remove the raw Name column from both frames.
# The drop is done in place (rather than rebinding `train`/`test`) because
# `train_test_data` holds references to these same objects and is looped over by
# later cells. errors='ignore' lets the cell be re-run without raising KeyError.
for frame in (train, test):
    frame.drop(columns='Name', inplace=True, errors='ignore')

## 2️⃣ Sex

In [None]:
# Encode Sex as an integer in both frames: male -> 0, female -> 1.
sex_mapping = dict(male=0, female=1)
for frame in train_test_data:
    frame['Sex'] = frame['Sex'].map(sex_mapping)

In [None]:
# Survival counts stacked by Sex after encoding (0 = male, 1 = female).
bar_chart('Sex')

## 3️⃣ Age
→ Title별 중앙값(median)으로 결측치 채우기

In [None]:
# Impute missing ages with the median age of passengers that share the same
# Title, computed separately within each frame.
# The filled Series is assigned back to the column: fillna(..., inplace=True) on
# `frame["Age"]` acts on an intermediate object, which pandas warns about and
# which leaves the frame unchanged under Copy-on-Write.
for frame in (train, test):
    title_median_age = frame.groupby("Title")["Age"].transform("median")
    frame["Age"] = frame["Age"].fillna(title_median_age)

In [None]:
# Only the last expression of a cell is rendered automatically, so both results
# are passed to IPython's display() explicitly; otherwise the head(30) preview
# would be computed and silently discarded.
display(
    train.head(30),
    train.groupby("Title")["Age"].transform("median"),
)

In [None]:
# train['Age'] = train['Age'].round(0).astype('int64')
# test['Age'] = test['Age'].round(0).astype('int64')

In [None]:
# Discretise Age into 11 ordinal bins stored in a new 'Age_clean' column
# (author's note: binning gave a better score than the raw age).
# Intervals are right-closed, matching the original hand-written conditions:
#   Age <= 10 -> 0, (10, 16] -> 1, (16, 20] -> 2, (20, 26] -> 3, (26, 30] -> 4,
#   (30, 36] -> 5, (36, 40] -> 6, (40, 46] -> 7, (46, 50] -> 8, (50, 60] -> 9,
#   Age > 60 -> 10
AGE_BIN_EDGES = [-np.inf, 10, 16, 20, 26, 30, 36, 40, 46, 50, 60, np.inf]

for frame in (train, test):
    # labels=False returns the integer bin index; the cast keeps the float64
    # dtype the column had before, and rows with a missing Age stay NaN.
    frame['Age_clean'] = pd.cut(frame['Age'], bins=AGE_BIN_EDGES, labels=False).astype('float64')

## 4️⃣ Embarked

In [None]:
# Replace missing embarkation ports with 'S' in both frames.
for frame in train_test_data:
    port_missing = frame['Embarked'].isna()
    frame.loc[port_missing, 'Embarked'] = 'S'

In [None]:
# Encode the embarkation port as an integer in both frames: S -> 0, C -> 1, Q -> 2.
embarked_mapping = dict(S=0, C=1, Q=2)
for frame in train_test_data:
    frame['Embarked'] = frame['Embarked'].map(embarked_mapping)

In [None]:
# Frequency of each encoded embarkation port in the training set (0 = S, 1 = C, 2 = Q).
train['Embarked'].value_counts()

In [None]:
# Re-check the number of missing values per column in the training set.
train.isnull().sum()

## 5️⃣ Fare

In [None]:
# Impute missing fares with the median fare of the passenger's Pclass, computed
# separately within each frame.
# The filled Series is assigned back to the column: fillna(..., inplace=True) on
# `frame["Fare"]` acts on an intermediate object, which pandas warns about and
# which leaves the frame unchanged under Copy-on-Write.
for frame in (train, test):
    class_median_fare = frame.groupby("Pclass")["Fare"].transform("median")
    frame["Fare"] = frame["Fare"].fillna(class_median_fare)

In [None]:
# Round each fare to the nearest whole number and store it as int64 in both frames.
for frame in (train, test):
    rounded_fare = frame['Fare'].round()
    frame['Fare'] = rounded_fare.astype('int64')

## 6️⃣ Cabin

In [None]:
# Frequency of each raw Cabin value in the training set.
train.Cabin.value_counts()

In [None]:
# Keep only the first character of each Cabin value in both frames.
for frame in train_test_data:
    frame['Cabin'] = frame['Cabin'].str.slice(stop=1)

In [None]:
# Stacked bar chart of cabin-letter counts in the training set, one bar per ticket class.
class_labels = {1: '1st class', 2: '2nd class', 3: '3rd class'}
cabin_counts = [
    train.loc[train['Pclass'] == pclass, 'Cabin'].value_counts()
    for pclass in class_labels
]
df = pd.DataFrame(cabin_counts)
df.index = list(class_labels.values())
df.plot(kind='bar', stacked=True, figsize=(10, 5))

In [None]:
# Encode the cabin letter as a number in steps of 0.4 (A -> 0 ... T -> 2.8) in both frames.
cabin_mapping = {
    "A": 0,
    "B": 0.4,
    "C": 0.8,
    "D": 1.2,
    "E": 1.6,
    "F": 2,
    "G": 2.4,
    "T": 2.8,
}
for frame in train_test_data:
    frame['Cabin'] = frame['Cabin'].map(cabin_mapping)

In [None]:
# Fill missing Cabin codes with the median Cabin code of the passenger's Pclass,
# computed separately within each frame.
# The filled Series is assigned back to the column: fillna(..., inplace=True) on
# `frame["Cabin"]` acts on an intermediate object, which pandas warns about and
# which leaves the frame unchanged under Copy-on-Write.
for frame in (train, test):
    class_median_cabin = frame.groupby("Pclass")["Cabin"].transform("median")
    frame["Cabin"] = frame["Cabin"].fillna(class_median_cabin)

## 7️⃣ FamilySize

In [None]:
# FamilySize = siblings/spouses + parents/children + the passenger themselves.
for frame in (train, test):
    relatives = frame["SibSp"].add(frame["Parch"])
    frame["FamilySize"] = relatives.add(1)

In [None]:
# Rescale FamilySize in steps of 0.4 (1 -> 0 ... 11 -> 4) in both frames.
family_mapping = {
    1: 0,
    2: 0.4,
    3: 0.8,
    4: 1.2,
    5: 1.6,
    6: 2,
    7: 2.4,
    8: 2.8,
    9: 3.2,
    10: 3.6,
    11: 4,
}
for frame in train_test_data:
    frame['FamilySize'] = frame['FamilySize'].map(family_mapping)

In [None]:
# Preview the first few rows of the training set at this point in the pipeline.
train.head()

In [None]:
# Columns removed from both frames before modelling.
# NOTE(review): 'FamilySize' is built in the cells above and dropped here together
# with SibSp and Parch, so no family-size information reaches the model — confirm
# this is intentional.
features_drop = ['Ticket', 'SibSp', 'Parch', 'FamilySize']
test = test.drop(columns=features_drop)
# PassengerId is removed from train only; test keeps it until the modelling cells.
train = train.drop(columns=features_drop + ['PassengerId'])

In [None]:
# Separate the label column from the model inputs.
target = train['Survived']
train_data = train.drop(columns='Survived')

# Modelling

In [None]:
# k-nearest-neighbours classifier with k = 13 (score recorded by the author: 0.66028).
from sklearn.neighbors import KNeighborsClassifier

test_data = test.drop(columns="PassengerId")
clf = KNeighborsClassifier(n_neighbors=13).fit(train_data, target)
predict = clf.predict(test_data)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Settings tried so far, with the score the author recorded for each. Only the
# last one is instantiated: the earlier classifiers were overwritten before
# fit() was called, so constructing them had no effect.
#   RandomForestClassifier()                                        -> 0.73923
#   RandomForestClassifier(random_state=0, criterion='entropy')     -> 0.75358
clf = RandomForestClassifier(random_state=0, criterion='entropy', max_features='log2')  # -> 0.76555
clf.fit(train_data, target)
test_data = test.drop("PassengerId", axis=1)
predict = clf.predict(test_data)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Settings tried so far, with the score the author recorded for each. Only the
# last one is instantiated: the earlier classifiers were overwritten before
# fit() was called, so constructing them had no effect.
#   ()                                                                                   -> 0.73923
#   (random_state=0, criterion='entropy')                                                -> 0.75358
#   (random_state=0, criterion='entropy', max_features='log2')                           -> 0.76555
#   (random_state=0, n_estimators=100, min_samples_split=20, oob_score=True,
#    max_depth=15)                                                                       -> 0.76076
#   (random_state=2, n_estimators=100, min_samples_split=20, oob_score=True,
#    max_depth=15)                                                                       -> 0.77272
#   (random_state=2, n_estimators=100, min_samples_split=20, oob_score=True,
#    max_depth=15, max_features='log2')                                                  -> 0.77511
#   (n_estimators=50, max_depth=6, random_state=0)                                       -> 0.79186
clf = RandomForestClassifier(n_estimators=50, max_depth=6, random_state=0, criterion='entropy')  # -> 0.80143

clf.fit(train_data, target)
test_data = test.drop("PassengerId", axis=1)
predict = clf.predict(test_data)

In [None]:
# from sklearn.linear_model import LogisticRegression
# log = LogisticRegression(solver='liblinear')
# log.fit(train_data, target)
# test_data = test.drop("PassengerId", axis=1)
# predict = log.predict(test_data)

In [None]:
# from xgboost import XGBClassifier
# classifier = XGBClassifier(colsample_bylevel= 0.9,
#                     colsample_bytree = 0.8, 
#                     gamma=0.99,
#                     max_depth= 5,
#                     min_child_weight= 1,
#                     n_estimators= 10,
#                     nthread= 4,
#                     random_state= 2,
#                     silent= True)
# classifier.fit(train_data, target)
# test_data = test.drop("PassengerId", axis=1)
# predict = classifier.predict(test_data)

In [None]:
# Overwrite the template's Survived column with the model's predictions and
# write the result, without the index, to submission.csv.
submission = submission.assign(Survived=predict)
submission.to_csv('submission.csv', index=False)