In [1]:
import numpy as np
import pandas as pd
import copy
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.model_selection import  cross_val_score
from sklearn.linear_model import LogisticRegression

df_train=pd.read_csv('C:/Users/User/Documents/titanic_train.csv')
df_test=pd.read_csv('C:/Users/User/Documents/titanic_test.csv')

train_y=df_train['Survived']
ids=df_test['PassengerId']
df_trian=df_train.drop(['PassengerId','Survived'],axis=1)
df_test=df_test.drop(['PassengerId'],axis=1)
df=pd.concat([df_train,df_test])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2.0,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3.0,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4.0,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5.0,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# 只取出 object 欄位資料，存於box_feature中
box_feature=[]
for dtype, feature in zip(df.dtypes, df.columns):
    if dtype=='object':
        box_feature.append(feature)
print(f'{len(box_feature)} Numeric feature : {box_feature}\n')

# 削減文字型欄位，只留數值型
df=df[box_feature]
df=df.fillna('None')
MMEncoder=MinMaxScaler()
train_num=train_y.shape[0]
df.head()

5 Numeric feature : ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']



Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


In [3]:
# 對照組: label encoding + rogistic regression
df_temp=pd.DataFrame()
for i in df.columns:
    df_temp[i]=LabelEncoder().fit_transform(df[i])
train_x=df_temp[:train_num]
estimator=LogisticRegression()
print(cross_val_score(estimator,train_x,train_y,cv=5).mean())
df_temp.head()

0.7800138095537004


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,155,1,720,185,3
1,286,0,816,106,0
2,523,0,914,185,3
3,422,0,65,70,3
4,22,1,649,185,3


In [4]:
# 加上'Cabin' 欄位計數編碼
count_df=df.groupby(['Cabin']).count()['Name'].reset_index() # 轉成dataframe形式
count_df=count_df.rename(columns={'Name':'Cabin_count'})
df=pd.merge(df,count_df,on=['Cabin'],how='left')
count_df.sort_values(by=['Cabin_count'],ascending=False).head(10)

Unnamed: 0,Cabin,Cabin_count
185,,1014
80,C23 C25 C27,6
184,G6,5
47,B57 B59 B63 B66,5
60,B96 B98,4
180,F2,4
117,D,4
79,C22 C26,4
181,F33,4
183,F4,4


In [5]:
# 計數編碼 + rogistic regression
df_temp=pd.DataFrame()
for i in box_feature:
    df_temp[i]=LabelEncoder().fit_transform(df[i])
df_temp['Cabin_count']=df['Cabin_count']
train_x=df_temp[:train_num]
estimator=LogisticRegression()
print(cross_val_score(estimator,train_x,train_y,cv=5).mean())
df_temp.head()

0.7822547234950725


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Cabin_count
0,155,1,720,185,3,1014
1,286,0,816,106,0,2
2,523,0,914,185,3,1014
3,422,0,65,70,3,2
4,22,1,649,185,3,1014


In [6]:
# Hashing + rogistic regression
df_temp=pd.DataFrame()
for i in box_feature:
    df_temp[i]=LabelEncoder().fit_transform(df[i])
df_temp['Cabin_Hash']=df['Cabin'].map(lambda x:hash(x) % 5)
train_x=df_temp[:train_num]
estimator=LogisticRegression(max_iter=1000)
print(cross_val_score(estimator,train_x,train_y,cv=5).mean())
df_temp.head()

0.7710250455087565


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Cabin_Hash
0,155,1,720,185,3,3
1,286,0,816,106,0,4
2,523,0,914,185,3,3
3,422,0,65,70,3,0
4,22,1,649,185,3,3


In [7]:
# 計數編碼 + Hashing + rogistic regression
df_temp=pd.DataFrame()
for i in box_feature:
    df_temp[i]=LabelEncoder().fit_transform(df[i])
df_temp['Cabin_Hash']=df['Cabin'].map(lambda x:hash(x) % 5)
df_temp['Cabin_count']=df['Cabin_count']
train_x=df_temp[:train_num]
estimator=LogisticRegression(max_iter=1000)
print(cross_val_score(estimator,train_x,train_y,cv=5).mean())
df_temp.head()

0.7766430230368464


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Cabin_Hash,Cabin_count
0,155,1,720,185,3,3,1014
1,286,0,816,106,0,4,2
2,523,0,914,185,3,3,1014
3,422,0,65,70,3,0,2
4,22,1,649,185,3,3,1014


In [None]:
# 可看出資料在label encoding下效果最好，但其實某些情況下或許用hash更合適