In [1]:
import pandas as pd
import numpy as np
import seaborn as sns


In [2]:
#read table data
train = pd.read_csv('train.tsv',sep='\t')
test = pd.read_csv('test.tsv',sep='\t')

#label　to 'sex' and 'embarked'
sex_dic = {'male':0,'female':1}
train['sex'] = train['sex'].map(sex_dic)
test['sex'] = test['sex'].map(sex_dic)
embarked_dic = {'S':0,'C':1,'Q':2}
train['embarked'] = train['embarked'].map(embarked_dic)
test['embarked'] = test['embarked'].map(embarked_dic)

train['embarked']  = train['embarked'].fillna(0)
test['embarked']  = test['embarked'].fillna(0)


In [3]:
train

Unnamed: 0,id,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,3,1,1,1,35.0,1,0,53.1000,0.0
1,4,0,3,0,35.0,0,0,8.0500,0.0
2,7,0,3,0,2.0,3,1,21.0750,0.0
3,9,1,2,1,14.0,1,0,30.0708,1.0
4,11,1,1,1,58.0,0,0,26.5500,0.0
...,...,...,...,...,...,...,...,...,...
440,873,0,3,0,47.0,0,0,9.0000,0.0
441,874,1,2,1,28.0,1,0,24.0000,1.0
442,879,1,1,1,56.0,0,1,83.1583,1.0
443,884,0,3,0,25.0,0,0,7.0500,0.0


In [4]:
#family count
train['family_count'] = train['sibsp']+train['parch']
test['family_count'] = test['sibsp']+test['parch']

#fare / person
train['fare_per_person'] = train['fare']/(train['family_count']+1)
test['fare_per_person'] = test['fare']/(test['family_count']+1)


In [5]:
#drop 'id' and 'survived'  to leran
train_x = train.drop(['id','survived','fare'],axis=1)
train_y = train['survived']
test_x = test.copy().drop(['id','fare'],axis=1)

In [6]:
#model with xgboost
from xgboost import XGBClassifier
model_xgb = XGBClassifier(n_estimators=35,random_state=77,max_depth = 3, min_child_weight = 4.0)
model_xgb.fit(train_x,train_y)

#predict
pred_flag_xgb =  model_xgb.predict(test_x)

In [7]:
#train+test
#test_x+test_y/pred
pred_flag_xgb = pd.DataFrame(pred_flag_xgb)
te_tr = test.merge(pred_flag_xgb,on=test_x.index).drop(['key_0'],axis=1)
columns=['id','pclass','sex','age','sibsp','parch','fare','embarked','family_count', 'fare_per_person','survived']
te_tr.columns = columns
tr_concat = pd.concat([train,te_tr])

#float to int at embarked 
tr_concat['embarked']=tr_concat['embarked'].astype(int)

In [8]:
tr_concat = tr_concat.sort_values('id')
tr_concat

Unnamed: 0,id,survived,pclass,sex,age,sibsp,parch,fare,embarked,family_count,fare_per_person
0,0,0,3,0,22.0,1,0,7.2500,0,1,3.62500
1,1,1,1,1,38.0,1,0,71.2833,1,1,35.64165
2,2,1,3,1,26.0,0,0,7.9250,0,0,7.92500
0,3,1,1,1,35.0,1,0,53.1000,0,1,26.55000
1,4,0,3,0,35.0,0,0,8.0500,0,0,8.05000
...,...,...,...,...,...,...,...,...,...,...,...
442,886,0,2,0,27.0,0,0,13.0000,0,0,13.00000
443,887,1,1,1,19.0,0,0,30.0000,0,0,30.00000
444,888,0,3,1,,1,2,23.4500,0,3,5.86250
444,889,1,1,0,26.0,0,0,30.0000,1,0,30.00000


In [9]:
#groupby
#'embarked','sex'
tr_groupby_es = tr_concat.groupby(['embarked','sex']).mean().drop(['id','survived'],axis=1)
tr_groupby_es.columns = ['pclass_es','age_es','sibsp_es','parch_es','fare_es','family_count_es','fare_per_person_es']
#'pclass','sex'
tr_groupby_ps = tr_concat.groupby(['pclass','sex']).mean().drop(['id','survived'],axis=1)
tr_groupby_ps.columns = ['age_ps','sibsp_ps','parch_ps','fare_ps','embarked_ps','family_count_ps','fare_per_person_ps']

#'pclass','embarked'
# tr_groupby_pe = tr_concat.groupby(['pclass','embarked']).mean().drop(['id','survived'],axis=1)
# tr_groupby_pe.columns = ['sex_pe','age_pe','sibsp_pe','parch_pe','fare_pe','family_count_pe','fare_per_person_pe']

tr_groupby_s = tr_concat.groupby(['sex']).mean().drop(['id'],axis=1)

tr_concat = tr_concat.merge(tr_groupby_es, how='left',on= ['embarked','sex'])
tr_concat = tr_concat.merge(tr_groupby_ps, how='left',on= ['pclass','sex'])
# # tr_concat = tr_concat.merge(tr_groupby_pe, how='left',on= ['pclass','embarked'])

tr_groupby_ps


Unnamed: 0_level_0,Unnamed: 1_level_0,age_ps,sibsp_ps,parch_ps,fare_ps,embarked_ps,family_count_ps,fare_per_person_ps
pclass,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,41.281386,0.311475,0.278689,67.226127,0.360656,0.590164,43.502624
1,1,34.611765,0.553191,0.457447,106.125798,0.478723,1.010638,65.181486
2,0,30.740707,0.342593,0.222222,19.741782,0.111111,0.564815,13.36811
2,1,28.722973,0.486842,0.605263,21.970121,0.144737,1.092105,11.747752
3,0,26.507589,0.498559,0.224784,12.661633,0.348703,0.723343,8.602482
3,1,21.75,0.895833,0.798611,16.11881,0.618056,1.694444,6.871278


In [10]:
tr_concat

Unnamed: 0,id,survived,pclass,sex,age,sibsp,parch,fare,embarked,family_count,...,fare_es,family_count_es,fare_per_person_es,age_ps,sibsp_ps,parch_ps,fare_ps,embarked_ps,family_count_ps,fare_per_person_ps
0,0,0,3,0,22.0,1,0,7.2500,0,1,...,21.711996,0.689342,14.144123,26.507589,0.498559,0.224784,12.661633,0.348703,0.723343,8.602482
1,1,1,1,1,38.0,1,0,71.2833,1,1,...,75.169805,1.041096,49.127769,34.611765,0.553191,0.457447,106.125798,0.478723,1.010638,65.181486
2,2,1,3,1,26.0,0,0,7.9250,0,0,...,39.143456,1.609756,20.018357,21.750000,0.895833,0.798611,16.118810,0.618056,1.694444,6.871278
3,3,1,1,1,35.0,1,0,53.1000,0,1,...,39.143456,1.609756,20.018357,34.611765,0.553191,0.457447,106.125798,0.478723,1.010638,65.181486
4,4,0,3,0,35.0,0,0,8.0500,0,0,...,21.711996,0.689342,14.144123,26.507589,0.498559,0.224784,12.661633,0.348703,0.723343,8.602482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,0,27.0,0,0,13.0000,0,0,...,21.711996,0.689342,14.144123,30.740707,0.342593,0.222222,19.741782,0.111111,0.564815,13.368110
887,887,1,1,1,19.0,0,0,30.0000,0,0,...,39.143456,1.609756,20.018357,34.611765,0.553191,0.457447,106.125798,0.478723,1.010638,65.181486
888,888,0,3,1,,1,2,23.4500,0,3,...,39.143456,1.609756,20.018357,21.750000,0.895833,0.798611,16.118810,0.618056,1.694444,6.871278
889,889,1,1,0,26.0,0,0,30.0000,1,0,...,48.262109,0.526316,33.111051,41.281386,0.311475,0.278689,67.226127,0.360656,0.590164,43.502624


In [11]:
#sns.pairplot(tr_concat)

In [12]:
test

Unnamed: 0,id,pclass,sex,age,sibsp,parch,fare,embarked,family_count,fare_per_person
0,0,3,0,22.0,1,0,7.2500,0,1,3.625000
1,1,1,1,38.0,1,0,71.2833,1,1,35.641650
2,2,3,1,26.0,0,0,7.9250,0,0,7.925000
3,5,3,0,,0,0,8.4583,2,0,8.458300
4,6,1,0,54.0,0,0,51.8625,0,0,51.862500
...,...,...,...,...,...,...,...,...,...,...
441,885,3,1,39.0,0,5,29.1250,2,5,4.854167
442,886,2,0,27.0,0,0,13.0000,0,0,13.000000
443,887,1,1,19.0,0,0,30.0000,0,0,30.000000
444,889,1,0,26.0,0,0,30.0000,1,0,30.000000


In [13]:
test_groupby_es = test.groupby(['embarked','sex']).mean().drop(['id'],axis=1)
test_groupby_es.columns = ['pclass_es','age_es','sibsp_es','parch_es','fare_es','family_count_es','fare_per_person_es']

test_groupby_ps = test.groupby(['pclass','sex']).mean().drop(['id'],axis=1)
test_groupby_ps.columns = ['age_ps','sibsp_ps','parch_ps','fare_ps','embarked_ps','family_count_ps','fare_per_person_ps']

# test_groupby_pe = test.groupby(['pclass','embarked']).mean().drop(['id'],axis=1)
# test_groupby_pe.columns = ['sex_pe','age_pe','sibsp_pe','parch_pe','fare_pe','family_count_pe','fare_per_person_pe']

test = test.merge(test_groupby_es, how='left',on= ['embarked','sex'])
test = test.merge(test_groupby_ps, how='left',on= ['pclass','sex'])
# test = test.merge(test_groupby_pe, how='left',on= ['pclass','embarked'])
test

Unnamed: 0,id,pclass,sex,age,sibsp,parch,fare,embarked,family_count,fare_per_person,...,fare_es,family_count_es,fare_per_person_es,age_ps,sibsp_ps,parch_ps,fare_ps,embarked_ps,family_count_ps,fare_per_person_ps
0,0,3,0,22.0,1,0,7.2500,0,1,3.625000,...,19.989800,0.516129,14.099553,26.146190,0.465116,0.162791,11.861213,0.325581,0.627907,8.370807
1,1,1,1,38.0,1,0,71.2833,1,1,35.641650,...,90.905854,1.000000,57.892306,36.171429,0.512195,0.365854,116.328254,0.609756,0.878049,74.484436
2,2,3,1,26.0,0,0,7.9250,0,0,7.925000,...,30.827738,1.539216,16.827002,22.250000,0.886076,0.810127,15.510761,0.582278,1.696203,6.706164
3,5,3,0,,0,0,8.4583,2,0,8.458300,...,19.001095,1.315789,8.996051,26.146190,0.465116,0.162791,11.861213,0.325581,0.627907,8.370807
4,6,1,0,54.0,0,0,51.8625,0,0,51.862500,...,19.989800,0.516129,14.099553,44.064815,0.298507,0.194030,57.331593,0.388060,0.492537,38.915962
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441,885,3,1,39.0,0,5,29.1250,2,5,4.854167,...,15.524784,0.578947,9.883775,22.250000,0.886076,0.810127,15.510761,0.582278,1.696203,6.706164
442,886,2,0,27.0,0,0,13.0000,0,0,13.000000,...,19.989800,0.516129,14.099553,31.302955,0.306122,0.204082,18.316327,0.163265,0.510204,12.203543
443,887,1,1,19.0,0,0,30.0000,0,0,30.000000,...,30.827738,1.539216,16.827002,36.171429,0.512195,0.365854,116.328254,0.609756,0.878049,74.484436
444,889,1,0,26.0,0,0,30.0000,1,0,30.000000,...,40.000727,0.557692,27.203730,44.064815,0.298507,0.194030,57.331593,0.388060,0.492537,38.915962


In [14]:
tr_concat.columns

Index(['id', 'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'family_count', 'fare_per_person', 'pclass_es', 'age_es',
       'sibsp_es', 'parch_es', 'fare_es', 'family_count_es',
       'fare_per_person_es', 'age_ps', 'sibsp_ps', 'parch_ps', 'fare_ps',
       'embarked_ps', 'family_count_ps', 'fare_per_person_ps'],
      dtype='object')

In [15]:
#drop 'id' and 'survived'  to leran
train_x = tr_concat.drop(['id','survived','age_es',
       'sibsp_es', 'parch_es', 'fare_es', 'family_count_es',
       'fare_per_person_es', 'age_ps', 'sibsp_ps','embarked_ps', 'family_count_ps', 'fare_per_person_ps'],axis=1)
train_y = tr_concat['survived']
test_x = test.copy().drop(['id','age_es',
       'sibsp_es', 'parch_es', 'fare_es', 'family_count_es',
       'fare_per_person_es', 'age_ps', 'sibsp_ps','embarked_ps', 'family_count_ps', 'fare_per_person_ps'],axis=1)


In [16]:
#model with xgboost
from xgboost import XGBClassifier
model_xgb = XGBClassifier(n_estimators=35,random_state=77,max_depth = 3, min_child_weight = 4.0)
model_xgb.fit(train_x,train_y)

#predict
pred_xgb =  model_xgb.predict_proba(test_x)[:,1]
pred_flag_xgb =  model_xgb.predict(test_x)


In [17]:
pred_xgb = pd.DataFrame(pred_xgb)
pred_xgb


Unnamed: 0,0
0,0.073707
1,0.936076
2,0.756863
3,0.077572
4,0.158699
...,...
441,0.290103
442,0.085133
443,0.936121
444,0.529842


In [18]:
#feature importance
feature_importance_xgb = model_xgb.feature_importances_
importance_xgb = pd.DataFrame({'features':test_x.columns,'importance':feature_importance_xgb})
importance_xgb


Unnamed: 0,features,importance
0,pclass,0.033275
1,sex,0.527233
2,age,0.047405
3,sibsp,0.063013
4,parch,0.028121
5,fare,0.006358
6,embarked,0.025535
7,family_count,0.100914
8,fare_per_person,0.051258
9,pclass_es,0.012487


In [19]:
#cross validaton
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold

scores_accuracy = []
scores_logloss = []

kf = KFold(n_splits=4, shuffle=True, random_state=77)
for tr_idx, va_idx in kf.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
    
    #model
    model = XGBClassifier(n_estimators=35,random_state=77,max_depth = 3, min_child_weight = 4)
    model.fit(tr_x,tr_y)
    
    va_pred = model.predict_proba(va_x)[:,1]#list of probability
    
    #scores culculation
    logloss = log_loss(va_y,va_pred)
    accuracy = accuracy_score(va_y,va_pred > 0.5)#Transforming the probability_list to binary_list,culculate　accuracies.
    
    #append
    scores_logloss.append(logloss)
    scores_accuracy.append(accuracy)



In [20]:
np.mean(scores_accuracy),np.mean(scores_logloss)

(0.9023502201753323, 0.31813279952186757)

In [21]:
# Transforming the predict_data to submit.
submission = pd.DataFrame(pred_xgb)
test[0] = test['id']
submission.index = test['id']

submission.to_csv('submit.tsv',sep='\t',header=None)

In [22]:
submission

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
0,0.073707
1,0.936076
2,0.756863
5,0.077572
6,0.158699
...,...
885,0.290103
886,0.085133
887,0.936121
889,0.529842
