# 머신러닝 스터디 계획

### 참석자: 오현수, 정헌재, 채무진, 최현수
### 참석일: 매주 화요일 오전 8시 30분 (KST)
### 스터디 자료 후보

- [모두를 위한 머신러닝/딥러닝 by Sung Kim](https://hunkim.github.io/ml/)
    - [ML lec 01 - 기본적인 Machine Learning의 용어와 개념 설명](https://www.youtube.com/watch?v=qPMeuL2LIqY)
    - [ML lec 02 - Linear Regression의 Hypothesis 와 cost 설명](https://www.youtube.com/watch?v=Hax03rCn3UI)
- [Coursera 앤드루 응](https://www.coursera.org/learn/machine-learning/home)

# 타이타닉호의 생존자 예측하기

- 아래 코드는 [해당 Tutorial](http://corpocrat.com/2014/08/29/tutorial-titanic-dataset-machine-learning-for-kaggle/)을 참고하였습니다.
- 직접 실습을 해보려면 [데이터셋을 다운로드](https://www.kaggle.com/c/titanic/data)하고 `train.csv` 파일을 `./data/titanic/` 폴더에 넣어주세요.
- [헌재/원경이 진행했던 Pandas, Numpy 강의 자료](http://nbviewer.jupyter.org/github/LyuGgang/python-data-analytics-lecture/blob/master/%EA%B0%95%EC%9D%98%EC%9E%90%EB%A3%8C/6_pandas.ipynb#Pandas.DataFrame->-Pandas.Series->-numpy.ndarray)를 참고할 수 있습니다.

### 데이터 받아오기

In [4]:
import pandas as pd
import numpy as np
from pandas import DataFrame, Series

# to import sklearn (이것은 정헌재의 로컬 컴퓨터 파이썬 패키지 참조를 위해 추가된 것..)
import sys; sys.path.append('/usr/local/lib/python2.7/site-packages')

In [5]:
# Load the Titanic training set; row 0 of the CSV supplies the column names.
df = pd.read_csv(filepath_or_buffer='./data/titanic/train.csv', header=0)

In [6]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [8]:
# Drop the free-text columns (name, ticket, cabin); they are not used as features.
df = df.drop(labels=['Name', 'Ticket', 'Cabin'], axis=1)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 62.7+ KB


In [10]:
# If you want drop null values 
# df = df.dropna()
# df.info()

### Data munging (데이터 가공)

In [11]:
# What does pd.get_dummies do? Try it on a small array first.
# NumPy stores the mixed int/str input as strings, and get_dummies produces
# one indicator column per distinct value.
d = np.array([1, 3, 3, 4, 'a', 1, 3, 2])
pd.get_dummies(data=d)

Unnamed: 0,1,2,3,4,a
0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0
5,1.0,0.0,0.0,0.0,0.0
6,0.0,0.0,1.0,0.0,0.0
7,0.0,1.0,0.0,0.0,0.0


In [12]:
d

array(['1', '3', '3', '4', 'a', '1', '3', '2'], 
      dtype='|S21')

In [13]:
# 아하
pd.get_dummies(df['Sex']).head()

Unnamed: 0,female,male
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,0.0,1.0


In [14]:
# Start with only the features assumed to matter most for survival:
# passenger class, sex, and port of embarkation.
cols = ['Pclass', 'Sex', 'Embarked']
# One indicator (one-hot) frame per selected column, in the same order as cols.
dummies = [pd.get_dummies(df[name]) for name in cols]

In [15]:
# Join the per-feature indicator frames side by side (column-wise).
titanic_dummies = pd.concat(objs=dummies, axis=1)

In [16]:
titanic_dummies.head()

Unnamed: 0,1,2,3,female,male,C,Q,S
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [17]:
# Append the indicator columns to the main frame as well.
df = pd.concat([df, titanic_dummies], axis=1)

In [18]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,1,2,3,female,male,C,Q,S
0,1,0,3,male,22.0,1,0,7.25,S,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,2,1,1,female,38.0,1,0,71.2833,C,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,3,1,3,female,26.0,0,0,7.925,S,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,4,1,1,female,35.0,1,0,53.1,S,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,5,0,3,male,35.0,0,0,8.05,S,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [19]:
# The original categorical columns are now covered by their indicator
# columns, so remove them.
df = df.drop(labels=['Pclass', 'Sex', 'Embarked'], axis=1)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
1              891 non-null float64
2              891 non-null float64
3              891 non-null float64
female         891 non-null float64
male           891 non-null float64
C              891 non-null float64
Q              891 non-null float64
S              891 non-null float64
dtypes: float64(10), int64(4)
memory usage: 97.5 KB


### Age만 714개.. interpolate를 하자

In [21]:
# Small 3x3 frame with one missing value in column 'b', used to demonstrate interpolate().
sample = DataFrame(
    [
        [1, 2, 3],
        [2, None, 2],
        [3, 3, 3],
    ],
    columns=list('abc'),
)

In [22]:
sample

Unnamed: 0,a,b,c
0,1,2.0,3
1,2,,2
2,3,3.0,3


In [23]:
sample['b'].interpolate()

0    2.0
1    2.5
2    3.0
Name: b, dtype: float64

In [24]:
# Fill the missing ages by linear interpolation between neighbouring rows.
# NOTE(review): rows do not appear to be ordered by age, so this is a rough
# fill; confirm it is acceptable for the study's purposes.
df['Age'] = df['Age'].interpolate(method='linear')

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
1              891 non-null float64
2              891 non-null float64
3              891 non-null float64
female         891 non-null float64
male           891 non-null float64
C              891 non-null float64
Q              891 non-null float64
S              891 non-null float64
dtypes: float64(10), int64(4)
memory usage: 97.5 KB


### 누가 살았고 누가 돌아가셨는지 이제 학습을 해보자

In [26]:
df.tail()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,1,2,3,female,male,C,Q,S
886,887,0,27.0,0,0,13.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
887,888,1,19.0,0,0,30.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
888,889,0,22.5,1,2,23.45,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
889,890,1,26.0,0,0,30.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
890,891,0,32.0,0,0,7.75,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [27]:
# Convert the whole frame to a NumPy array. At this point it still contains
# the 'Survived' label column (removed in a later cell) as well as PassengerId.
X = df.values

In [28]:
X.shape

(891, 14)

In [29]:
# Target vector: the survival label for every passenger.
y = df.Survived.values

In [30]:
y.shape

(891,)

In [31]:
# Rebuild the feature matrix without the label column.
# Dropping 'Survived' by name (rather than deleting positional index 1 from X)
# stays correct if the column order changes, and re-running this cell yields
# the same result instead of deleting a second column.
X = df.drop('Survived', axis=1).values

### Cross validation?
- http://scikit-learn.org/stable/modules/cross_validation.html
- https://en.wikipedia.org/wiki/Cross-validation_(statistics)

In [32]:
from sklearn.model_selection import train_test_split

In [33]:

# Hold out 30% of the rows for testing. random_state seeds the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [34]:
# Function-call form with explicit formatting: runs on both Python 2 and 3
# and prints the same text as the old print statement.
print("{} {}".format(X.shape, y.shape))

(891, 13) (891,)


In [35]:
# Function-call form with explicit formatting: runs on both Python 2 and 3
# and prints the same text as the old print statement.
print("{} {}".format(X_train.shape, y_train.shape))

(623, 13) (623,)


In [36]:
# Function-call form with explicit formatting: runs on both Python 2 and 3
# and prints the same text as the old print statement.
print("{} {}".format(X_test.shape, y_test.shape))

(268, 13) (268,)


### Decision tree에 train 데이터를 넣고, test 데이터로 검증
- http://scikit-learn.org/stable/modules/tree.html

![](https://www.dropbox.com/s/nvxc4r0kka5p5mi/%EC%8A%A4%ED%81%AC%EB%A6%B0%EC%83%B7%202016-10-13%2013.10.02.png?raw=1)

In [37]:
from sklearn import tree
# Decision tree limited to a maximum depth of 5 (presumably to keep the model
# small and reduce overfitting; confirm the intended rationale).
clf = tree.DecisionTreeClassifier(max_depth=5)

In [38]:
# 학습은 train 데이터로
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [39]:
# 테스트는 test 데이터로
clf.score(X_test,y_test)

0.79104477611940294

### 음? 이 데이터로 Decision tree가 어떻게 만들어지는가?

In [40]:
# Toy training set: three samples with six integer features each,
# and one distinct class label per sample.
X = np.array([
    [1, 2, 3, 1, 2, 1],
    [1, 1, 3, 4, 3, 2],
    [1, 2, 3, 4, 5, 1],
])
y = np.array(list('abc'))

In [41]:
# Train a default decision tree on the toy data. fit() returns the estimator
# itself, so construction and fitting can be chained.
clf = tree.DecisionTreeClassifier().fit(X, y)

In [42]:
clf.predict([[1,2,3,4,5,2]])

array(['b'], 
      dtype='|S1')

In [43]:
clf.predict_proba([[1,2,3,0,0,0]])

array([[ 1.,  0.,  0.]])

### 모르겠다..

### Iris 데이터

In [44]:
# https://en.wikipedia.org/wiki/Iris_flower_data_set
from sklearn.datasets import load_iris
# Load scikit-learn's bundled Iris dataset; the following cells use its
# .data (features) and .target (labels) arrays.
iris = load_iris()

![](https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Petal-sepal.jpg/440px-Petal-sepal.jpg)

In [45]:
# Function-call form with explicit formatting: runs on both Python 2 and 3
# and prints the same text as the old print statement.
print("{} {}".format(iris.data.shape, iris.target.shape))

(150, 4) (150,)


In [46]:
# Hold out 40% of the Iris samples for testing, with a fixed shuffle seed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.4,
    random_state=0,
)

In [47]:
# Fresh classifier with all-default settings (no max_depth argument here).
clf = tree.DecisionTreeClassifier()

In [48]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [49]:
clf.score(X_test, y_test)

0.94999999999999996