In [83]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.tools.plotting import scatter_matrix
%matplotlib inline
# Set font for plotting
sns.set(font_scale=1.5)

In [84]:
train = pd.read_csv('data/titanic_train_cleaned.csv', index_col="PassengerId")
train_X = train.iloc[:, 1:10]
train_y = train.iloc[:,0]
test = pd.read_csv('data/titanic_test_cleaned.csv', index_col="PassengerId")
test_index = test.index

In [85]:
train_X.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


## Different types of features:
    - numerical: scaling
    - unordered categorical features: one hot enconding + scaling
    - ordered categorical features: label encoding + scaling

## One hot encoding (order of categories has no meaning)
![One hot encoding](images/one_hot_encoding.png)

## Label enconding (order of categories adds information)
![label encoding](images/label_enconding.png)

## Feature engineering: create/extract, delete, select, transform/scale

## 1. Feature extraction: Sex + Age => PersonType

In [86]:
def apply_func(age_sex):
    age, sex = age_sex
    if age < 16:
        return 'Child'
    else:
        return 'Woman' if sex == 'female' else 'Man'

In [87]:
train_X['PersonType'] = train_X[['Age', 'Sex']].apply(apply_func, axis=1)
test['PersonType'] = test[['Age', 'Sex']].apply(apply_func, axis=1)

In [64]:
train_X.head(3)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,PersonType
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Man
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Woman
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,Woman


## 2. Feature extraction: Name => Title 

In [88]:
title_dict = {
            "Capt":       "Officer",
            "Col":        "Officer",
            "Major":      "Officer",
            "Jonkheer":   "Royalty",
            "Don":        "Royalty",
            "Sir" :       "Royalty",
            "Dr":         "Officer",
            "Rev":        "Officer",
            "the Countess":"Royalty",
            "Dona":       "Royalty",
            "Mme":        "Mrs",
            "Mlle":       "Miss",
            "Ms":         "Mrs",
            "Mr" :        "Mr",
            "Mrs" :       "Mrs",
            "Miss" :      "Miss",
            "Master" :    "Master",
            "Lady" :      "Royalty"
            }

In [90]:
train_X['Title'] = train_X['Name'].map(lambda name: name.split(',')[1].split('.')[0].strip()).map(title_dict)
test['Title'] = test['Name'].map(lambda name: name.split(',')[1].split('.')[0].strip()).map(title_dict)

In [67]:
train_X.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,PersonType,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Man,Mr
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Woman,Mrs
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,Woman,Miss
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,Woman,Mrs
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,Man,Mr


## 3. Feature reduction: Ticket and Name

In [91]:
train_X.drop(['Ticket', 'Name'], axis=1, inplace=True)
test.drop(['Ticket', 'Name'], axis=1, inplace=True)

In [92]:
train_X.head(3)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PersonType,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,male,22.0,1,0,7.25,S,Man,Mr
2,1,female,38.0,1,0,71.2833,C,Woman,Mrs
3,3,female,26.0,0,0,7.925,S,Woman,Miss


## 4. Label encoding using Pandas factorize function

In [93]:
train_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 9 columns):
Pclass        891 non-null int64
Sex           891 non-null object
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Embarked      891 non-null object
PersonType    891 non-null object
Title         891 non-null object
dtypes: float64(2), int64(3), object(4)
memory usage: 69.6+ KB


In [94]:
cols = ['Sex', 'Embarked', 'PersonType', 'Title']
for col in cols:
    train_X.loc[:,col] = pd.factorize(train_X.loc[:,col])[0]
    test.loc[:,col] = pd.factorize(test.loc[:,col])[0]

In [95]:
train_X.head(10)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PersonType,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,0,22.0,1,0,7.25,0,0,0
2,1,1,38.0,1,0,71.2833,1,1,1
3,3,1,26.0,0,0,7.925,0,1,2
4,1,1,35.0,1,0,53.1,0,1,1
5,3,0,35.0,0,0,8.05,0,0,0
6,3,0,29.699118,0,0,8.4583,2,0,0
7,1,0,54.0,0,0,51.8625,0,0,0
8,3,0,2.0,3,1,21.075,0,2,3
9,3,1,27.0,0,2,11.1333,0,1,1
10,2,1,14.0,1,0,30.0708,1,2,1


In [73]:
test.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PersonType,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
892,3,0,34.5,0,0,7.8292,0,0,0
893,3,1,47.0,1,0,7.0,1,1,1
894,2,0,62.0,0,0,9.6875,0,0,0
895,3,0,27.0,0,0,8.6625,1,0,0
896,3,1,22.0,1,1,12.2875,1,1,1


## 5. Feature scaling => N(0,1)

In [96]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train_X)

train_imputed = scaler.transform(train_X)
train_X = pd.DataFrame(train_imputed, columns = train_X.columns)
test_imputed = scaler.transform(test)
test = pd.DataFrame(test_imputed, columns = test.columns)

In [97]:
len(train_X),len(train_y)

(891, 891)

In [76]:
train_X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PersonType,Title
0,0.827377,-0.737695,-0.592481,0.432793,-0.473674,-0.502445,-0.571933,-0.742596,-0.71219
1,-1.566107,1.355574,0.638789,0.432793,-0.473674,0.786845,1.000883,0.771484,0.163067
2,0.827377,1.355574,-0.284663,-0.474545,-0.473674,-0.488854,-0.571933,0.771484,1.038323
3,-1.566107,1.355574,0.407926,0.432793,-0.473674,0.42073,-0.571933,0.771484,0.163067
4,0.827377,-0.737695,0.407926,-0.474545,-0.473674,-0.486337,-0.571933,-0.742596,-0.71219


In [77]:
#test.head()

In [78]:
test.index = test_index
test.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PersonType,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
892,0.827377,-0.737695,0.369449,-0.474545,-0.473674,-0.490783,-0.571933,-0.742596,-0.71219
893,0.827377,1.355574,1.331378,0.432793,-0.473674,-0.507479,1.000883,0.771484,0.163067
894,-0.369365,-0.737695,2.485693,-0.474545,-0.473674,-0.453367,-0.571933,-0.742596,-0.71219
895,0.827377,-0.737695,-0.207709,-0.474545,-0.473674,-0.474005,1.000883,-0.742596,-0.71219
896,0.827377,1.355574,-0.592481,0.432793,0.76763,-0.401017,1.000883,0.771484,0.163067


In [79]:
train_X.index = range(1,len(train_X)+1)

In [80]:
train = pd.concat([train_X, train_y], axis=1)

## Check if there are any missing values (concatination errors)

In [81]:
train.isnull().sum()

Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked      0
PersonType    0
Title         0
Survived      0
dtype: int64

## Save the data

In [82]:
train.to_csv('data/titanic_train_model.csv', index=True, index_label="PassengerId")
test.to_csv('data/titanic_test_model.csv', index=True, index_label="PassengerId")