In [108]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.tools.plotting import scatter_matrix
%matplotlib inline
# Set font for plotting
sns.set(font_scale=1.5)

In [131]:
train = pd.read_csv('data/titanic_train_cleaned.csv')
train_X = train.iloc[:, [0,2,3,4,5,6,7,8,9,10]]
train_y = train.iloc[:,1]
test = pd.read_csv('data/titanic_test_cleaned.csv')

In [133]:
train_X.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S


## Feature Extraction: Sex + Age => PersonType

In [111]:
def apply_func(age_sex):
    age, sex = age_sex
    if age < 16:
        return 'Child'
    else:
        return 'Woman' if sex == 'female' else 'Man'

In [134]:
train_X['PersonType'] = train_X[['Age', 'Sex']].apply(apply_func, axis=1)
test['PersonType'] = test[['Age', 'Sex']].apply(apply_func, axis=1)

In [135]:
train_X.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,PersonType
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Man
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Woman
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,Woman


## Feature Extraction: Name => Title 

In [114]:
title_dict = {
            "Capt":       "Officer",
            "Col":        "Officer",
            "Major":      "Officer",
            "Jonkheer":   "Royalty",
            "Don":        "Royalty",
            "Sir" :       "Royalty",
            "Dr":         "Officer",
            "Rev":        "Officer",
            "the Countess":"Royalty",
            "Dona":       "Royalty",
            "Mme":        "Mrs",
            "Mlle":       "Miss",
            "Ms":         "Mrs",
            "Mr" :        "Mr",
            "Mrs" :       "Mrs",
            "Miss" :      "Miss",
            "Master" :    "Master",
            "Lady" :      "Royalty"
            }

In [136]:
train_X['Title'] = train_X['Name'].map(lambda name: name.split(',')[1].split('.')[0].strip()).map(title_dict)
test['Title'] = test['Name'].map(lambda name: name.split(',')[1].split('.')[0].strip()).map(title_dict)

## Feature Reduction: Ticket (no idea how to deal with it) and Name

In [137]:
train_X.drop(['Ticket', 'Name', 'PassengerId'], axis=1, inplace=True)
test.drop(['Ticket', 'Name', 'PassengerId' ], axis=1, inplace=True)

In [138]:
train_X.head(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PersonType,Title
0,3,male,22.0,1,0,7.25,S,Man,Mr
1,1,female,38.0,1,0,71.2833,C,Woman,Mrs
2,3,female,26.0,0,0,7.925,S,Woman,Miss


## Label Encoding using Pandas factorize function

In [139]:
train_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Pclass        891 non-null int64
Sex           891 non-null object
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Embarked      891 non-null object
PersonType    891 non-null object
Title         891 non-null object
dtypes: float64(2), int64(3), object(4)
memory usage: 62.7+ KB


In [141]:
cols = ['Pclass', 'Sex', 'Embarked', 'PersonType', 'Title']
for col in cols:
    train_X.loc[:,col] = pd.factorize(train_X.loc[:,col])[0]
    test.loc[:,col] = pd.factorize(test.loc[:,col])[0]

In [142]:
train_X.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PersonType,Title
0,0,0,22.0,1,0,7.25,0,0,0
1,1,1,38.0,1,0,71.2833,1,1,1
2,0,1,26.0,0,0,7.925,0,1,2
3,1,1,35.0,1,0,53.1,0,1,1
4,0,0,35.0,0,0,8.05,0,0,0
5,0,0,29.881138,0,0,8.4583,2,0,0
6,1,0,54.0,0,0,51.8625,0,0,0
7,0,0,2.0,3,1,21.075,0,2,3
8,0,1,27.0,0,2,11.1333,0,1,1
9,2,1,14.0,1,0,30.0708,1,2,1


## Feature scaling categorical features besser one hot encoden

In [143]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train_X)

train_imputed = scaler.transform(train_X)
train_X = pd.DataFrame(train_imputed, columns = train_X.columns)
#test[test.columns] = scaler.transform(test[test.columns])

In [144]:
train_X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PersonType,Title
0,-0.820037,-0.737695,-0.595254,0.432793,-0.473674,-0.502445,-0.571933,-0.742596,-0.71219
1,0.431081,1.355574,0.635996,0.432793,-0.473674,0.786845,1.000883,0.771484,0.163067
2,-0.820037,1.355574,-0.287441,-0.474545,-0.473674,-0.488854,-0.571933,0.771484,1.038323
3,0.431081,1.355574,0.405137,0.432793,-0.473674,0.42073,-0.571933,0.771484,0.163067
4,-0.820037,-0.737695,0.405137,-0.474545,-0.473674,-0.486337,-0.571933,-0.742596,-0.71219


In [75]:
test.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,PersonType,Title
0,-0.820037,-0.737695,0.36666,-0.474545,-0.473674,-0.490783,-0.571933,-0.742596,-0.71219
1,-0.820037,1.355574,1.328575,0.432793,-0.473674,-0.507479,1.000883,0.771484,0.163067
2,0.431081,-0.737695,2.482872,-0.474545,-0.473674,-0.453367,-0.571933,-0.742596,-0.71219
3,-0.820037,-0.737695,-0.210488,-0.474545,-0.473674,-0.474005,1.000883,-0.742596,-0.71219
4,-0.820037,1.355574,-0.595254,0.432793,0.76763,-0.401017,1.000883,0.771484,0.163067
5,-0.820037,-0.737695,-1.210879,-0.474545,-0.473674,-0.462679,1.000883,2.285565,-0.71219
6,-0.820037,1.355574,0.020371,-0.474545,-0.473674,-0.49481,-0.571933,0.771484,1.038323
7,0.431081,-0.737695,-0.287441,0.432793,0.76763,-0.064516,1.000883,-0.742596,-0.71219
8,-0.820037,1.355574,-0.903066,-0.474545,-0.473674,-0.502864,2.573699,0.771484,0.163067
9,-0.820037,-0.737695,-0.672207,1.340132,-0.473674,-0.162169,1.000883,-0.742596,-0.71219


In [145]:
train = pd.concat([train_x,y_train], axis=1)

NameError: name 'X_train' is not defined

In [55]:
train.to_csv('data/titanic_train_model.csv', index=False)
test.to_csv('data/titanic_test_model.csv', index=False)