# Titanic
https://www.kaggle.com/c/titanic/data


**데이터 전처리하기**
#### 범주형 데이터 다루기 - 원핫인코딩

데이터에는 수치형 데이터와 텍스트 데이터나 범주형 데이터가 있다. 머신러닝이나 딥러닝 알고리즘은 수치로 된 데이터만 이해할 수 있다. 그래서 기계가 이해할 수 있는 형태로 데이터를 변환해 주어야 하는데 범주형 데이터는 원핫인코딩 형태로 변환해 준다. 원핫인코딩이란 해당되는 하나의 데이터만 1로 변경해 주고 나머지는 0으로 채워주는 것을 뜻한다.

예를 들어 과일이라는 컬럼에 사과, 배, 감이 들어있다고 하자. 이때 사과, 배, 감 각각에 대한 컬럼을 만들고, 각 행에서 해당하는 과일의 컬럼에만 1을, 나머지 과일의 컬럼에는 0을 표기하는 것이다.


#### 원핫인코딩 전
| 과일 |
|------|
| 사과 |
| 배 |
| 감 |

#### 원핫인코딩 후

| 과일 | 과일_사과 | 과일_배 | 과일_감 |
|------|-----------|---------|---------|
| 사과 | 1 | 0 | 0 |
| 배 | 0 | 1 | 0 |
| 감 | 0 | 0 | 1 |


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Kaggle Titanic training and test splits from CSV.
train = pd.read_csv('../input/kaggle_data/titanic/train.csv')
test = pd.read_csv('../input/kaggle_data/titanic/test.csv')

# Show the (rows, columns) shape of each split, one per line, train first.
print(train.shape, test.shape, sep="\n")

(891, 12)
(418, 11)


In [3]:
print(train.columns)
train.dtypes

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [4]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


* Age는 누락값이 많다. 다른 컬럼의 count는 891인데 Age의 count는 714뿐인 것에서 알 수 있다.

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
test.dtypes
# Survived가 없다.

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [6]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
train.select_dtypes(include=['object']).describe()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Aks, Mrs. Sam (Leah Rosen)",male,347082,G6,S
freq,1,577,7,4,644


In [6]:
obj_df = train.select_dtypes(include=['object']).copy()
obj_df.head()

# 오브젝트 타입의 데이터만 따로 추출해본다. 
# 이 데이터 중 카테고리 형태의 데이터가 무엇인지 보고 인코딩 해준다
# 원핫인코딩 뿐만 아니라 TF , TF-IDF의 인코딩도 해줄 수 있다.

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


In [7]:

# 어느 데이터든 누락 된 데이터가 있으면 출력하도록 했다.
# Cabin이 누락 된 데이터가 가장 많다.
# 결측치 다루는 법은 따로 다룰 것이다.

obj_df[obj_df.isnull().any(axis=1)].head(20)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
4,"Allen, Mr. William Henry",male,373450,,S
5,"Moran, Mr. James",male,330877,,Q
7,"Palsson, Master. Gosta Leonard",male,349909,,S
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,347742,,S
9,"Nasser, Mrs. Nicholas (Adele Achem)",female,237736,,C
12,"Saundercock, Mr. William Henry",male,A/5. 2151,,S
13,"Andersson, Mr. Anders Johan",male,347082,,S
14,"Vestrom, Miss. Hulda Amanda Adolfina",female,350406,,S


In [9]:
train.select_dtypes(include=['object']).columns

Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')

In [14]:
# For every object-dtype column, print its name with the number of distinct
# values (NaN is counted as a value), then the first ten distinct values.
for col in train.select_dtypes(include=['object']).columns:
    distinct = train[col].unique()
    print("\n", col, len(distinct))
    print(distinct[:10])


 Name 891
['Braund, Mr. Owen Harris'
 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'
 'Heikkinen, Miss. Laina' 'Futrelle, Mrs. Jacques Heath (Lily May Peel)'
 'Allen, Mr. William Henry' 'Moran, Mr. James' 'McCarthy, Mr. Timothy J'
 'Palsson, Master. Gosta Leonard'
 'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)'
 'Nasser, Mrs. Nicholas (Adele Achem)']

 Sex 2
['male' 'female']

 Ticket 681
['A/5 21171' 'PC 17599' 'STON/O2. 3101282' '113803' '373450' '330877'
 '17463' '349909' '347742' '237736']

 Cabin 148
[nan 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78']

 Embarked 4
['S' 'C' 'Q' nan]


In [8]:
# 카테고리 데이터로 적합한지 확인
obj_df["Cabin"].value_counts().head(5)

B96 B98        4
G6             4
C23 C25 C27    4
F33            3
F2             3
Name: Cabin, dtype: int64

# preprocessing

In [15]:
# 처리 전과 비교해 보기 위해 데이터를 복사
_train = train.copy()
_test = test.copy()

In [16]:
# 성별
train['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [17]:
# Encoding method 1: overwrite the "Sex" labels in place with integer codes
# (male -> 0, female -> 1), applying the same mapping to train and test.
for frame in (train, test):
    for label, code in (("male", 0), ("female", 1)):
        frame.loc[frame["Sex"] == label, "Sex"] = code

# Encoding method 2 (alternative, left disabled):
# train['Sex'] = train['Sex'].apply(lambda s: 1 if s == 'female' else 0)
# test['Sex'] = test['Sex'].apply(lambda s: 1 if s == 'female' else 0)

# Display the first rows to check the encoded "Sex" column.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


Sex는 값이 두 가지뿐인 이진(binary) 범주형 데이터이기 때문에 0과 1로 인코딩해 준다.

In [19]:
# Sklearn의 레이블 인코딩(LabelEncoder) - 원핫인코딩이 아니라 각 범주를 정수 하나로 바꾼다
from sklearn.preprocessing import LabelEncoder

def gender_to_int(data):
    """Replace the "Sex" column of ``data`` with LabelEncoder integer codes.

    A fresh ``LabelEncoder`` is fitted on the two labels "male" and
    "female", and ``data["Sex"]`` is overwritten in place with the
    transformed values. In this notebook's output that yields 1 for male
    and 0 for female.

    Args:
        data: Frame with a "Sex" column holding only "male"/"female".

    Returns:
        The same ``data`` object that was passed in (mutated).
    """
    encoder = LabelEncoder().fit(["male", "female"])
    data["Sex"] = encoder.transform(data["Sex"])
    return data

_train = gender_to_int(_train)
_test = gender_to_int(_test)

_train['Sex'].value_counts()

1    577
0    314
Name: Sex, dtype: int64

## 승선위치

In [20]:
train['Embarked'].value_counts() # S일 때 승선을 많이했다.

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [21]:
# Hand-rolled one-hot encoding of "Embarked": add one boolean column per
# value, True where the row's "Embarked" equals that value.
for column, value in (("Embarked_C", "C"), ("Embarked_S", "S"), ("Embarked_Q", "Q")):
    _train[column] = _train["Embarked"].eq(value)

# Print the shapes of train and _train; _train has gained three columns.
print(train.shape, _train.shape, sep="\n")

(891, 12)
(891, 15)


In [22]:
_train[["Embarked", "Embarked_C", "Embarked_S", "Embarked_Q"]].head()

Unnamed: 0,Embarked,Embarked_C,Embarked_S,Embarked_Q
0,S,False,True,False
1,C,True,False,False
2,S,False,True,False
3,S,False,True,False
4,S,False,True,False


## pandas 원핫인코딩

#### (예제)================================

In [24]:
s = pd.Series(list('abca'))
s

0    a
1    b
2    c
3    a
dtype: object

In [25]:
pd.get_dummies(s)

Unnamed: 0,a,b,c
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0


In [27]:
import numpy as np

# NaN 값이 있는 경우: 기본 설정에서는 NaN 행의 모든 더미 컬럼이 0이 된다
s1=['a', 'b', np.nan]
pd.get_dummies(s1)

Unnamed: 0,a,b
0,1,0
1,0,1
2,0,0


In [28]:
pd.get_dummies(s1, dummy_na=True)

Unnamed: 0,a,b,nan
0,1,0,0
1,0,1,0
2,0,0,1


In [29]:
# DataFrame 
df_temp = pd.DataFrame({'A':['a', 'b', 'a'], 'B':['b', 'a', 'c'], 'C':[1,2,3]})

In [30]:
df_temp

Unnamed: 0,A,B,C
0,a,b,1
1,b,a,2
2,a,c,3


In [32]:
pd.get_dummies(df_temp, prefix=['col1', 'col2'])

Unnamed: 0,C,col1_a,col1_b,col2_a,col2_b,col2_c
0,1,1,0,0,1,0
1,2,0,1,1,0,0
2,3,1,0,0,0,1


In [33]:
pd.get_dummies(df_temp, prefix=['A', 'B'])
# object 타입(문자열) 컬럼인 A, B만 인코딩되었고, 수치형 컬럼 C는 그대로다.

Unnamed: 0,C,A_a,A_b,B_a,B_b,B_c
0,1,1,0,0,1,0
1,2,0,1,1,0,0
2,3,1,0,0,0,1


#### ================================(예제 끝)

In [34]:
pd.get_dummies(train['Sex'], prefix = 'Sex')

Unnamed: 0,Sex_0,Sex_1
0,1,0
1,0,1
2,0,1
3,0,1
4,1,0
5,1,0
6,1,0
7,1,0
8,0,1
9,0,1


## Pandas의 get_dummies로 원핫인코딩

In [23]:
# 기계가 데이터를 이해할 수 있도록
# 카테고리 데이터를 one-hot-encoding 해준다.
def dummy_data(data, columns):
    """Return ``data`` with each listed column replaced by one-hot columns.

    Columns are handled in the order given. For each one, the indicator
    columns built by ``pd.get_dummies`` (prefixed with the column name) are
    appended on the right and the original column is removed.

    Args:
        data: Source DataFrame; it is not modified.
        columns: Iterable of column labels to encode.

    Returns:
        A DataFrame with the encoded layout, or ``data`` itself when
        ``columns`` is empty.
    """
    encoded = data
    for name in columns:
        indicators = pd.get_dummies(encoded[name], prefix=name)
        # Remove the raw column first, then attach its indicators at the end.
        encoded = pd.concat([encoded.drop(name, axis=1), indicators], axis=1)
    return encoded

In [24]:
# Columns that get one-hot encoded, identically for both data sets.
dummy_columns = ["Sex", "Pclass", "Embarked"]
train_dummy, test_dummy = (
    dummy_data(frame, dummy_columns) for frame in (train, test)
)

In [25]:
# Shapes of the frames that were passed to dummy_data (train, then test).
print('원핫인코딩 전 shape')
print(train.shape, test.shape, sep="\n")

# Shapes of the frames returned by dummy_data (train, then test).
print('get_dummies로 원핫인코딩 후 shape')
print(train_dummy.shape, test_dummy.shape, sep="\n")

원핫인코딩 전 shape
(891, 12)
(418, 11)
get_dummies로 원핫인코딩 후 shape
(891, 17)
(418, 16)


In [26]:
train_dummy.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_0,Sex_1,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,1,0,0,0,1,0,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,1,1,0,0,1,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,1,0,0,1,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,1,1,0,0,0,0,1
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,1,0,0,0,1,0,0,1


* 인코딩된 데이터를 그대로 사용한다면, 사용하지 않는 컬럼을 drop하는 방식으로 피처를 구성한다.

In [27]:
# feature selection
def drop_not_concerned(data, columns):
    """Return ``data`` without the given column label(s).

    Args:
        data: Source DataFrame; it is left unchanged.
        columns: A single column label or a list of labels to remove.

    Returns:
        A new DataFrame lacking the requested columns.
    """
    return data.drop(labels=columns, axis="columns")

# Columns removed from both encoded frames before they are used as features.
not_concerned_columns = ["PassengerId", "Name", "Ticket", "Cabin"]

# The training frame additionally loses 'Survived'; the test data has no
# such column.
X_train = drop_not_concerned(train_dummy, not_concerned_columns).drop('Survived', axis=1)
X_test = drop_not_concerned(test_dummy, not_concerned_columns)