In [1]:
import pandas as pd
import numpy as np
from plotnine import *
import missingno as msno

import sys
print(sys.version)
print(pd.__version__)
print(np.__version__)
print(msno.__version__)

3.6.5 |Anaconda, Inc.| (default, Apr 26 2018, 08:42:37) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
0.23.0
1.14.3
0.4.1


In [2]:
# 판다스를 통해 데이터를 로드해 온다.
# 여기에서는 캐글의 타이타닉 데이터를 사용한다. 
# 데이터 다운로드 : https://www.kaggle.com/c/titanic/data

train = pd.read_csv('/Users/heebunny/Documents/dataitgirls/all/train.csv')
train.shape

(891, 12)

In [3]:
test = pd.read_csv('/Users/heebunny/Documents/dataitgirls/all/test.csv')
test.shape

(418, 11)

In [4]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [6]:
train['Embarked'].fillna(train['Embarked'].mode()[0], inplace=True)

In [7]:
test['Embarked'].fillna(test['Embarked'].mode()[0], inplace=True)

In [8]:
train_cp = train.copy()
test_cp = test.copy()

In [9]:
# 결측치가 있는 지 확인
train['Name'].isnull().sum()

0

In [10]:
train['Name'].str.split(", ", expand=True).head()

Unnamed: 0,0,1
0,Braund,Mr. Owen Harris
1,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,Heikkinen,Miss. Laina
3,Futrelle,Mrs. Jacques Heath (Lily May Peel)
4,Allen,Mr. William Henry


In [11]:
# 이름에서 Title 항목만 떼어낸다.
# 콤마로 분리를 하고 분리된 두 번째 데이터의 첫번째 항목을 가져온다.
train['Title'] = train['Name'].str.split(
    ", ", expand=True)[1].str.split(".", expand=True)[0]
train['Title'].head()

test['Title'] = test['Name'].str.split(
    ", ", expand=True)[1].str.split(".", expand=True)[0]
test['Title'].head()

0     Mr
1    Mrs
2     Mr
3     Mr
4    Mrs
Name: Title, dtype: object

In [12]:
train.query("Title in ('Dona', 'Lady', 'the Countess')")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
556,557,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherl...",female,48.0,1,0,11755,39.6,A16,C,Lady
759,760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dye...",female,33.0,0,0,110152,86.5,B77,S,the Countess


In [13]:
# 결혼한 여성 관련 호칭은 Mrs로 분류
train.loc[train['Title'].isin(['Dona', 'Lady', 'the Countess', 'Mme', 'Mlle']), 'Title'] = 'Ms'
test.loc[test['Title'].isin(['Dona', 'Lady', 'the Countess', 'Mme', 'Mlle']), 'Title'] = 'Ms'
train['Title'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Ms            6
Major         2
Col           2
Capt          1
Jonkheer      1
Don           1
Sir           1
Name: Title, dtype: int64

In [14]:
train.loc[train['Title'].isin(['Don', 'Sir', 'Capt', 'Col', 'Major', 'Master', 'Rev', 'Dr', 'Jonkheer']), 'Title'] = 'Mr'
test.loc[test['Title'].isin(['Don', 'Sir', 'Capt', 'Col', 'Major', 'Master', 'Rev', 'Dr', 'Jonkheer']), 'Title'] = 'Mr'
train['Title'].value_counts()

Mr      578
Miss    182
Mrs     125
Ms        6
Name: Title, dtype: int64

In [15]:
train.Age.isnull().sum()

177

In [16]:
train_groupby = train.copy()
test_groupby = test.copy()

In [17]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Title            0
dtype: int64

In [18]:
train_groupby['Age'].fillna(train.groupby('Title')['Age'].transform('median'), inplace=True)
test_groupby['Age'].fillna(test.groupby('Title')['Age'].transform('median'), inplace=True)

In [19]:
print(train.Age.isnull().sum())
print(train_groupby.Age.isnull().sum())

177
0


In [20]:
train = train_groupby
test = test_groupby

In [21]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title'],
      dtype='object')

In [22]:
test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title'],
      dtype='object')

In [23]:
train['AgeBin'] = pd.cut(train['Age'].astype(int), 5)
test['AgeBin'] = pd.cut(test['Age'].astype(int), 5)

In [24]:
# 기계가 데이터를 이해할 수 있도록 
# 카테고리 데이터를 one-hot-encoding 해준다.
def dummy_data(data, columns):
    for column in columns:
        data = pd.concat([data, pd.get_dummies(data[column], prefix = column)], axis=1)
        data = data.drop(column, axis=1)
    return data


dummy_columns = ["Sex", "Pclass", "Embarked", "AgeBin"]
train_dummy = dummy_data(train, dummy_columns)
test_dummy = dummy_data(test, dummy_columns)

print('원핫인코딩 전 shape')
print(train.shape)
print(test.shape)

print('get_dummies로 원핫인코딩 후 shape')
print(train_dummy.shape)
print(test_dummy.shape)

원핫인코딩 전 shape
(891, 14)
(418, 13)
get_dummies로 원핫인코딩 후 shape
(891, 23)
(418, 22)


In [25]:
train_dummy.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Title,...,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,"AgeBin_(-0.08, 16.0]","AgeBin_(16.0, 32.0]","AgeBin_(32.0, 48.0]","AgeBin_(48.0, 64.0]","AgeBin_(64.0, 80.0]"
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,Mr,...,0,1,0,0,1,0,1,0,0,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,Mrs,...,0,0,1,0,0,0,0,1,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,Miss,...,0,1,0,0,1,0,1,0,0,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,Mrs,...,0,0,0,0,1,0,0,1,0,0
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,Mr,...,0,1,0,0,1,0,0,1,0,0


In [26]:
# 사용하지 않을 컬럼을 제거해 피처로 사용할 컬럼만 남겨둔다.
def drop_not_concerned(data, columns):
    return data.drop(columns, axis=1)

not_concerned_columns = ["PassengerId", "Age", "Name", "Ticket", "Fare", "Title", "Cabin"]
X_train = drop_not_concerned(train_dummy, not_concerned_columns)
X_train = X_train.drop('Survived', axis=1)
X_test = drop_not_concerned(test_dummy, not_concerned_columns)

In [27]:
y_label = train['Survived']

In [28]:
X_test.head()

Unnamed: 0,SibSp,Parch,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,"AgeBin_(-0.076, 15.2]","AgeBin_(15.2, 30.4]","AgeBin_(30.4, 45.6]","AgeBin_(45.6, 60.8]","AgeBin_(60.8, 76.0]"
0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0
1,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0
2,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1
3,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0
4,1,1,1,0,0,0,1,0,0,1,0,1,0,0,0


In [29]:
from sklearn.tree import DecisionTreeClassifier


In [30]:
model = DecisionTreeClassifier(max_depth=10, random_state=2018)
# model.fit(X_train, y_train)
# prediction = model.predict(X_test)

In [31]:
model.fit(X_train, y_label)
prediction = model.predict(X_test)

In [32]:
score = round(model.score(X_train, y_label)*100, 2)
score

86.53

In [33]:
# #결정트리 시각화 하기

# from sklearn.tree import export_graphviz
# import graphviz

In [34]:
# export_graphviz(model,
#                 feature_names=feature_names,
#                 class_names=["Perish", "Survived"],
#                 out_file="decision-tree.dot")

# with open("decision-tree.dot") as f:
#     dot_graph = f.read()
    
# graphviz.Source(dot_graph)

In [36]:
test['Survived'] = prediction
test.columns

submissions = test[['PassengerId', 'Survived']]
submissions.head()

submissions.to_csv('/Users/heebunny/Documents/dataitgirls/all/submission180827_2.csv', index=False)

In [37]:
submissions.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0


In [38]:
submissions.shape

(418, 2)