# Feature Engineering the Model

In this second attempt to improve the performance of the random forest classifier, I add feature-engineered variables to the data and examine how they affect the outcome.

## Import Data

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Read the training and test CSV files straight from their hosted URLs.
TRAIN_URL = "https://www.dropbox.com/s/1xyc3klpx2mtrqf/train.csv?dl=1"
TEST_URL = "https://www.dropbox.com/s/7n7k0f676i6nbng/test.csv?dl=1"

train = pd.read_csv(TRAIN_URL)
test = pd.read_csv(TEST_URL)

In [2]:
# fill missing values
# Every fill value is derived from the training frame only and then applied to
# both frames, so train and test are imputed with the same constants and no
# test-set statistic feeds into preprocessing.
age_fill = train['Age'].mean()
fare_fill = train['Fare'].median()
embarked_fill = 'S'  # fixed port label used for rows with no Embarked value

# The same columns are filled in both frames; fillna leaves a column
# unchanged when it has no missing values.
for frame in (train, test):
    frame['Age'] = frame['Age'].fillna(age_fill)
    frame['Fare'] = frame['Fare'].fillna(fare_fill)
    frame['Embarked'] = frame['Embarked'].fillna(embarked_fill)

## Add Variables

In [24]:
# Look at the first five training rows ahead of the encoding step.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,isAlone,hasCabin,Ticket_Len,Title,Name_Len
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2,0,0,9,Mr.,23
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,0,1,8,Mrs.,51
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,1,0,16,Miss.,22
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,0,1,6,Mrs.,44
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,1,0,6,Mr.,24


In [42]:
# FamilySize = Parch + SibSp + 1, computed the same way for both frames.
for frame in (train, test):
    frame['FamilySize'] = frame['Parch'].add(frame['SibSp']).add(1)

In [43]:
# isAlone is 1 when FamilySize equals 1 and 0 otherwise.
for frame in (train, test):
    frame['isAlone'] = frame['FamilySize'].eq(1).astype('int64')

In [44]:
# hasCabin is 1 where a Cabin value is present and 0 where it is missing.
for frame in (train, test):
    cabin_present = frame['Cabin'].notna()
    frame['hasCabin'] = cabin_present.astype(int)

In [45]:
# Ticket_Len is the character count of each Ticket string.
for frame in (train, test):
    frame['Ticket_Len'] = frame['Ticket'].map(len)

In [46]:
# add Title
# Pull the honorific (e.g. "Mr.", "Mrs.") out of each Name. The pattern captures
# the first alphabetic word that ends in a period, keeping the period. Simply
# taking the first whitespace-delimited token after the comma would return
# "the" for a multi-word title such as "..., the Countess. of ...", and would
# raise IndexError for a name without a comma. With str.extract, a name that
# has no matching token yields NaN instead of raising.
TITLE_PATTERN = r' ([A-Za-z]+\.)'

for frame in (train, test):
    frame['Title'] = frame['Name'].str.extract(TITLE_PATTERN, expand=False)

In [47]:
# Name_Len is the character count of each Name string.
for frame in (train, test):
    frame['Name_Len'] = frame['Name'].map(len)

In [48]:
# Remove the columns judged less useful; both frames are modified in place.
dropped_columns = ['Cabin', 'Name', 'Parch', 'SibSp', 'Ticket']
for frame in (train, test):
    frame.drop(columns=dropped_columns, inplace=True)

In [49]:
# Show the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,FamilySize,isAlone,hasCabin,Ticket_Len,Title,Name_Len
0,1,0,3,0,22.0,7.25,0,2,0,0,9,Mr.,23
1,2,1,1,1,38.0,71.2833,1,2,0,1,8,Mrs.,51
2,3,1,3,1,26.0,7.925,0,1,1,0,16,Miss.,22
3,4,1,1,1,35.0,53.1,0,2,0,1,6,Mrs.,44
4,5,0,3,0,35.0,8.05,0,1,1,0,6,Mr.,24


## Convert Variables

In [38]:
# Encode Sex as an integer code: male -> 0, female -> 1.
# Values matching neither label are left as they are.
sex_codes = {'male': 0, 'female': 1}
for frame in (train, test):
    for label, code in sex_codes.items():
        frame.loc[frame['Sex'] == label, 'Sex'] = code

In [39]:
# Encode Embarked as an integer code: S -> 0, C -> 1, Q -> 2.
# Values matching none of the three labels are left as they are.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
for frame in (train, test):
    for port, code in embarked_codes.items():
        frame.loc[frame['Embarked'] == port, 'Embarked'] = code

In [51]:
# Show the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,FamilySize,isAlone,hasCabin,Ticket_Len,Title,Name_Len
0,1,0,3,0,22.0,7.25,0,2,0,0,9,Mr.,23
1,2,1,1,1,38.0,71.2833,1,2,0,1,8,Mrs.,51
2,3,1,3,1,26.0,7.925,0,1,1,0,16,Miss.,22
3,4,1,1,1,35.0,53.1,0,2,0,1,6,Mrs.,44
4,5,0,3,0,35.0,8.05,0,1,1,0,6,Mr.,24


## Random Forest Classifier