In [438]:
##### Data-Split, Training, Testing Flow and Considerations

In [439]:
import numpy as np
import pandas as pd

In [440]:
# Load the Kaggle Titanic training set; the path is relative to the working directory.
titanic_csv = 'data/kaggleTitanic/train.csv'
df = pd.read_csv(titanic_csv)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [441]:
# Build 'Deck' by taking the first character of each 'Cabin' value.
# The vectorised .str accessor replaces the row-wise apply/lambda: missing cabins
# stay NaN, and an empty-string cabin becomes NaN instead of raising IndexError.
df['Deck'] = df['Cabin'].str[0]
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,


In [442]:
# Keep only the columns used in this walkthrough: the 'Survived' label plus four predictors.
analysis_cols = ['Survived', 'Pclass', 'Sex', 'Age', 'Deck']
df = df[analysis_cols]
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Deck
0,0,3,male,22.0,
1,1,1,female,38.0,C
2,1,3,female,26.0,
3,1,1,female,35.0,C
4,0,3,male,35.0,


In [443]:
# Separate the predictors from the 'Survived' label, then hold out 20% of the rows for testing.
from sklearn.model_selection import train_test_split
y = df['Survived']
X = df.drop(columns=['Survived'])
splits = train_test_split(X, y, test_size=0.2, random_state=1)

### Take an explicit copy of every split so that later column assignments act on
### independent objects rather than views; this avoids pandas' SettingWithCopyWarning.
#https://github.com/scikit-learn/scikit-learn/issues/8723#issuecomment-416513938
#http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#evaluation-order-matters
Xtrain, Xtest, ytrain, ytest = (part.copy() for part in splits)

In [444]:
##### Fit/Transform Using Training Data #####

In [445]:
# (1) Xtrain: impute Deck
# Missing decks are filled with the constant placeholder 'X'; the imputer is fitted on Xtrain only.
from sklearn.impute import SimpleImputer
sideck = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='X')

### Note: fit_transform returns a numpy array, not a dataframe.
#type(sideck.fit_transform(Xtrain[['Deck']]))
deck_imputed = sideck.fit_transform(Xtrain[['Deck']])

### Either method below stores that array as a new column in the original dataframe.

### method 1 (used here): wrap the array in a dataframe first.
#     pd.DataFrame labels rows 0..n-1 by default, but after the train-test split
#     Xtrain's index is shuffled, and assigning a dataframe to a column aligns on index.
#     So Xtrain.index must be passed explicitly for the rows to line up.
Xtrain['impDeck'] = pd.DataFrame(deck_imputed, index=Xtrain.index)

### method 2: assign the numpy array directly.
#     A bare array carries no index of its own, so its values are placed
#     row by row against whatever index the dataframe already has.
#Xtrain['impDeck'] = sideck.fit_transform(Xtrain[['Deck']])

Xtrain.sample(frac=0.01)

Unnamed: 0,Pclass,Sex,Age,Deck,impDeck
557,1,male,,,X
481,2,male,,,X
721,3,male,17.0,,X
339,1,male,45.0,T,T
192,3,female,19.0,,X
303,2,female,,E,E
745,1,male,70.0,B,B


In [446]:
# (2) Xtrain: impute Age
# Missing ages are replaced by the median age computed from the training rows.
from sklearn.impute import SimpleImputer
siage = SimpleImputer(strategy='median', missing_values=np.nan)
age_imputed = siage.fit_transform(Xtrain[['Age']])
Xtrain['impAge'] = pd.DataFrame(age_imputed, index=Xtrain.index)
Xtrain.sample(frac=0.01)

Unnamed: 0,Pclass,Sex,Age,Deck,impDeck,impAge
781,1,female,17.0,B,B,17.0
82,3,female,,,X,29.0
786,3,female,18.0,,X,18.0
38,3,female,18.0,,X,18.0
150,2,male,51.0,,X,51.0
709,3,male,,,X,29.0
140,3,female,,,X,29.0


In [447]:
# (3) Xtrain: k-bin discretize impAge
# Five quantile-based bins, encoded as ordinal bin numbers; the bin edges are fitted on Xtrain.
from sklearn.preprocessing import KBinsDiscretizer
kbdage = KBinsDiscretizer(strategy='quantile', encode='ordinal', n_bins=5)
age_bins = kbdage.fit_transform(Xtrain[['impAge']])
Xtrain['kbinAge'] = pd.DataFrame(age_bins, index=Xtrain.index)
Xtrain.head()

Unnamed: 0,Pclass,Sex,Age,Deck,impDeck,impAge,kbinAge
301,3,male,,,X,29.0,3.0
309,1,female,30.0,E,E,30.0,3.0
516,2,female,34.0,F,F,34.0,3.0
120,2,male,21.0,,X,21.0,1.0
570,2,male,62.0,,X,62.0,4.0


In [448]:
# (4) Xtrain: OneHotEncode all categorical variables
# note the handle_unknown flag: with 'ignore', a category that was not seen during
# fit is encoded as all zeros at transform time instead of raising an error.
from sklearn.preprocessing import OneHotEncoder
categoricalvars = ['Pclass', 'Sex', 'impDeck', 'kbinAge']

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old keyword,
# so try the current spelling first and fall back for older releases.
try:
    ohe = OneHotEncoder(sparse_output=False, dtype=int, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, dtype=int, handle_unknown='ignore')
encoded = ohe.fit_transform(Xtrain[categoricalvars])

# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out().
# The legacy method is preferred while it still exists so the generated column names
# (x0_..., x1_..., ...) are unchanged on older installs; on newer releases the columns
# are named by get_feature_names_out() instead.
if hasattr(ohe, 'get_feature_names'):
    onehot_columns = ohe.get_feature_names()
else:
    onehot_columns = ohe.get_feature_names_out()

Xcat = pd.DataFrame(encoded, columns=onehot_columns, index=Xtrain.index)
Xtrain = pd.concat([Xtrain, Xcat], axis=1)
Xtrain.head()

Unnamed: 0,Pclass,Sex,Age,Deck,impDeck,impAge,kbinAge,x0_1,x0_2,x0_3,...,x2_D,x2_E,x2_F,x2_G,x2_T,x2_X,x3_0.0,x3_1.0,x3_3.0,x3_4.0
301,3,male,,,X,29.0,3.0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
309,1,female,30.0,E,E,30.0,3.0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
516,2,female,34.0,F,F,34.0,3.0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
120,2,male,21.0,,X,21.0,1.0,0,1,0,...,0,0,0,0,0,1,0,1,0,0
570,2,male,62.0,,X,62.0,4.0,0,1,0,...,0,0,0,0,0,1,0,0,0,1


In [449]:
# (5) drop all unwanted columns
# The raw and intermediate columns are removed from the training frame.
unwanted = ['Pclass', 'Sex', 'Age', 'Deck', 'impAge', 'impDeck', 'kbinAge']
Xtrain = Xtrain.drop(columns=unwanted)
Xtrain.head()

Unnamed: 0,x0_1,x0_2,x0_3,x1_female,x1_male,x2_A,x2_B,x2_C,x2_D,x2_E,x2_F,x2_G,x2_T,x2_X,x3_0.0,x3_1.0,x3_3.0,x3_4.0
301,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0
309,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0
516,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0
120,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0
570,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1


In [450]:
# (6) fit random forest classifier on training data
# 100 trees, with random_state pinned to 1.
from sklearn.ensemble import RandomForestClassifier
forest_settings = dict(n_estimators=100, random_state=1)
rfc = RandomForestClassifier(**forest_settings)
rfc.fit(Xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [451]:
##### Transform/Predict Using Test Data #####

In [452]:
# (1) Xtest: impute Deck - reuse the imputer fitted on the training data; transform only, no refit
test_deck = sideck.transform(Xtest[['Deck']])
Xtest['impDeck'] = pd.DataFrame(test_deck, index=Xtest.index)
Xtest.head()

Unnamed: 0,Pclass,Sex,Age,Deck,impDeck
862,1,female,48.0,D,D
223,3,male,,,X
84,2,female,17.0,,X
680,3,female,,,X
535,2,female,7.0,,X


In [453]:
# (2) Xtest: impute Age - reuse the imputer fitted on the training data; transform only, no refit
test_age = siage.transform(Xtest[['Age']])
Xtest['impAge'] = pd.DataFrame(test_age, index=Xtest.index)
Xtest.head()

Unnamed: 0,Pclass,Sex,Age,Deck,impDeck,impAge
862,1,female,48.0,D,D,48.0
223,3,male,,,X,29.0
84,2,female,17.0,,X,17.0
680,3,female,,,X,29.0
535,2,female,7.0,,X,7.0


In [454]:
# (3) Xtest: k-bin discretize impAge - reuse the discretizer fitted on the training data; transform only
test_age_bins = kbdage.transform(Xtest[['impAge']])
Xtest['kbinAge'] = pd.DataFrame(test_age_bins, index=Xtest.index)
Xtest.head()

Unnamed: 0,Pclass,Sex,Age,Deck,impDeck,impAge,kbinAge
862,1,female,48.0,D,D,48.0,4.0
223,3,male,,,X,29.0,3.0
84,2,female,17.0,,X,17.0,0.0
680,3,female,,,X,29.0,3.0
535,2,female,7.0,,X,7.0,0.0


In [455]:
# (4) Xtest: OneHotEncode all categorical variables - use the encoder fitted on the
# training data and only transform the test data.
# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out().
# The legacy method is preferred while it still exists so the column names are unchanged
# on older installs; newer releases fall through to get_feature_names_out().
if hasattr(ohe, 'get_feature_names'):
    onehot_columns = ohe.get_feature_names()
else:
    onehot_columns = ohe.get_feature_names_out()

Xcat = pd.DataFrame(ohe.transform(Xtest[categoricalvars]), columns=onehot_columns, index=Xtest.index)
Xtest = pd.concat([Xtest, Xcat], axis=1)
Xtest.head()

Unnamed: 0,Pclass,Sex,Age,Deck,impDeck,impAge,kbinAge,x0_1,x0_2,x0_3,...,x2_D,x2_E,x2_F,x2_G,x2_T,x2_X,x3_0.0,x3_1.0,x3_3.0,x3_4.0
862,1,female,48.0,D,D,48.0,4.0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
223,3,male,,,X,29.0,3.0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
84,2,female,17.0,,X,17.0,0.0,0,1,0,...,0,0,0,0,0,1,1,0,0,0
680,3,female,,,X,29.0,3.0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
535,2,female,7.0,,X,7.0,0.0,0,1,0,...,0,0,0,0,0,1,1,0,0,0


In [456]:
# (5) drop all unwanted columns
# The raw and intermediate columns are removed from the test frame.
unwanted_test = ['Pclass', 'Sex', 'Age', 'Deck', 'impAge', 'impDeck', 'kbinAge']
Xtest = Xtest.drop(columns=unwanted_test)
Xtest.head()

Unnamed: 0,x0_1,x0_2,x0_3,x1_female,x1_male,x2_A,x2_B,x2_C,x2_D,x2_E,x2_F,x2_G,x2_T,x2_X,x3_0.0,x3_1.0,x3_3.0,x3_4.0
862,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1
223,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0
84,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0
680,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
535,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0


In [457]:
# (6) use random forest classifier you fit on training data - to evaluate on test data
from sklearn import metrics
ypred = rfc.predict(Xtest)
# Print the accuracy, then the confusion matrix, then the per-class report, in that order.
for score in (metrics.accuracy_score, metrics.confusion_matrix, metrics.classification_report):
    print(score(ytest, ypred))

0.7541899441340782
[[88 18]
 [26 47]]
              precision    recall  f1-score   support

           0       0.77      0.83      0.80       106
           1       0.72      0.64      0.68        73

   micro avg       0.75      0.75      0.75       179
   macro avg       0.75      0.74      0.74       179
weighted avg       0.75      0.75      0.75       179

