# 作業 : (Kaggle)鐵達尼生存預測
***
https://www.kaggle.com/c/titanic

# 作業1
* 試著使用鐵達尼號的例子，創立兩種以上的群聚編碼特徵( mean、median、mode、max、min、count 均可 )

In [1]:
# 做完特徵工程前的所有準備 (與前範例相同)
import pandas as pd
import numpy as np
import copy
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Read the Titanic training set from the local data directory.
data_path = 'data/'
df = pd.read_csv(data_path + 'titanic_train.csv')

# Pull the target column out of the frame, then discard the passenger id so
# that only candidate feature columns remain.
train_Y = df.pop('Survived')
df = df.drop(columns=['PassengerId'])
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Number of distinct values in each column.
df.nunique()

Pclass        3
Name        891
Sex           2
Age          88
SibSp         7
Parch         7
Ticket      681
Fare        248
Cabin       147
Embarked      3
dtype: int64

In [3]:
# Group-by encoding: pair one categorical column with one numeric column and
# summarise the numeric values inside each category.
# Here Fare is aggregated within each Cabin.
# Passengers without a cabin form their own 'None' group, and any missing
# fare is replaced by the overall mean fare before aggregating.
df['Cabin'] = df['Cabin'].fillna('None')
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

fare_by_cabin = df.groupby(['Cabin'])['Fare']
mean_df = fare_by_cabin.mean().reset_index()
# mode() can return several values on a tie; [0] keeps the first one.
mode_df = fare_by_cabin.apply(lambda x: x.mode()[0]).reset_index()
median_df = fare_by_cabin.median().reset_index()
count_df = fare_by_cabin.count().reset_index()

# Every per-statistic frame has the columns ['Cabin', 'Fare'].  Merging them
# as-is makes pandas fall back on the default '_x' / '_y' suffixes, and the
# third merge would then produce 'Fare_x' / 'Fare_y' a second time; recent
# pandas releases reject that with a MergeError.  Giving each statistic its
# final name before merging avoids any column-name collision.
temp = mean_df.rename(columns={'Fare': 'Fare_mean'})
for stat_name, stat_df in [
    ('Fare_mode', mode_df),
    ('Fare_median', median_df),
    ('Fare_count', count_df),
]:
    temp = pd.merge(
        temp,
        stat_df.rename(columns={'Fare': stat_name}),
        on=['Cabin'],
        how='left',
    )
temp

Unnamed: 0,Cabin,Fare_mean,Fare_mode,Fare_median,Fare_count
0,A10,40.125000,40.1250,40.12500,1
1,A14,52.000000,52.0000,52.00000,1
2,A16,39.600000,39.6000,39.60000,1
3,A19,26.000000,26.0000,26.00000,1
4,A20,56.929200,56.9292,56.92920,1
5,A23,30.000000,30.0000,30.00000,1
6,A24,50.495800,50.4958,50.49580,1
7,A26,35.500000,35.5000,35.50000,1
8,A31,31.000000,31.0000,31.00000,1
9,A32,50.000000,50.0000,50.00000,1


In [4]:
# Attach the per-Cabin Fare statistics to every passenger row (left join keeps
# all passengers), then discard the raw Cabin column.
df = df.merge(temp, on='Cabin', how='left')
df = df.drop(columns=['Cabin'])
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Fare_mean,Fare_mode,Fare_median,Fare_count
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,19.157325,8.05,10.5,687
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,71.2833,71.2833,71.2833,1
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,19.157325,8.05,10.5,687
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,53.1,53.1,53.1,2
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,19.157325,8.05,10.5,687


In [5]:
# Collect the names of the int64 / float64 columns into num_features.
num_features = [
    column
    for column, column_dtype in zip(df.columns, df.dtypes)
    if column_dtype == 'float64' or column_dtype == 'int64'
]
print(f'{len(num_features)} Numeric Features : {num_features}\n')

# Keep only the numeric columns and replace the remaining missing values
# with -1.
df = df[num_features].fillna(-1)
MMEncoder = MinMaxScaler()
df.head()

9 Numeric Features : ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Fare_mean', 'Fare_mode', 'Fare_median', 'Fare_count']



Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Fare_mean,Fare_mode,Fare_median,Fare_count
0,3,22.0,1,0,7.25,19.157325,8.05,10.5,687
1,1,38.0,1,0,71.2833,71.2833,71.2833,71.2833,1
2,3,26.0,0,0,7.925,19.157325,8.05,10.5,687
3,1,35.0,1,0,53.1,53.1,53.1,53.1,2
4,3,35.0,0,0,8.05,19.157325,8.05,10.5,687


# 作業2
* 將上述的新特徵，合併原有的欄位做生存率預估，結果是否有改善?

In [6]:
# Baseline: original numeric features only + logistic regression.
df_ori = df.drop(columns=['Fare_mean', 'Fare_mode', 'Fare_median', 'Fare_count'])
train_X = MMEncoder.fit_transform(df_ori)
LR = LogisticRegression()
baseline_scores = cross_val_score(LR, train_X, train_Y, cv=5)
print(f'Score : {baseline_scores.mean()}')

Score : 0.7038635542329971


  return self.partial_fit(X, y)


In [7]:
# Original numeric features plus the new group-by features + logistic
# regression.
train_X = MMEncoder.fit_transform(df)
LR = LogisticRegression()
encoded_scores = cross_val_score(LR, train_X, train_Y, cv=5)
print(f'Score : {encoded_scores.mean()}')

Score : 0.6892504646627564


  return self.partial_fit(X, y)


### ANS2: 無，反而更差 (正確率由 0.7039 降為 0.6893)

### 老師補充：
#### 取船票票號(Ticket), 對乘客年齡(Age)做群聚編碼
不論是例題的線性迴歸或梯度提升樹, 還是作業的邏輯斯迴歸,
群聚編碼都在正確率上有穩定提升, 這就是我們所說的: 均值編碼容易 overfitting、群聚編碼不容易 overfitting 的效果
不過助教這邊的數值型特徵與類別型特徵, 是有特別用特徵重要性挑選過的, 因此同學自行挑選的特徵可能未必提升
至於特徵重要性如何使用, 請同學參考 Day29 內容