Clean, prepare, and encode the dataset so that it is ready to ingest app data.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import scipy.stats as ss
import math

# Remote CSV sources for the training and test frames.
TRAIN_CSV_URL = "https://www.dropbox.com/s/1xyc3klpx2mtrqf/train.csv?dl=1"
TEST_CSV_URL = "https://www.dropbox.com/s/7n7k0f676i6nbng/test.csv?dl=1"

train = pd.read_csv(TRAIN_CSV_URL)
test = pd.read_csv(TEST_CSV_URL)

In [3]:
# fill missing values
# The fill values are computed once, from the training frame only, and reused
# for the test frame. Both frames are therefore imputed with the same numbers
# and no statistic of the test data enters the preprocessing, which also makes
# the step reproducible for individual records that arrive later.
age_fill = train['Age'].mean()
fare_fill = train['Fare'].median()

train['Age'] = train['Age'].fillna(age_fill)
test['Age'] = test['Age'].fillna(age_fill)
# Only the test frame has its Fare gaps filled here.
test['Fare'] = test['Fare'].fillna(fare_fill)
# Missing Embarked entries in train are replaced by the literal 'S'.
train['Embarked'] = train['Embarked'].fillna('S')

In [4]:
# Discard the columns that are not used in the rest of this preparation.
unused_columns = ["Ticket", "Embarked", "Cabin", "PassengerId"]
train = train.drop(unused_columns, axis=1)
test = test.drop(unused_columns, axis=1)

In [5]:
# Preview the first five rows of train.
train.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05


In [6]:
# Mean of every numeric column for each Pclass value.
# numeric_only=True keeps the string columns still present at this point
# (Name, Sex) out of the aggregation; pandas >= 2.0 raises a TypeError for
# them instead of silently dropping them as older versions did.
train.groupby('Pclass').mean(numeric_only=True)

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.62963,37.048118,0.416667,0.356481,84.154687
2,0.472826,29.866958,0.402174,0.380435,20.662183
3,0.242363,26.403259,0.615071,0.393075,13.67555


In [7]:
# Frequency of each SibSp value in train, most common first.
sibsp_counts = train['SibSp'].value_counts()
sibsp_counts

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [8]:
# add FamilySize: Parch + SibSp + 1 (the extra 1 presumably counts the passenger)
for frame in (train, test):
    frame['FamilySize'] = frame['Parch'] + frame['SibSp'] + 1

In [9]:
# add isAlone: 1 when FamilySize is exactly 1, otherwise 0
for frame in (train, test):
    frame['isAlone'] = frame['FamilySize'].eq(1).astype('int64')

In [10]:
# add Title
def _first_token_after_comma(full_name):
    """Return the first whitespace-separated token that follows the first comma.

    For "Braund, Mr. Owen Harris" this yields "Mr." (the trailing period is
    kept). When the text after the comma starts with a multi-word title, only
    its first word is returned; the later encoding step relies on these exact
    tokens, including the token 'the'.
    """
    after_comma = full_name.split(',')[1]
    return after_comma.split()[0]


train['Title'] = train['Name'].map(_first_token_after_comma)
test['Title'] = test['Name'].map(_first_token_after_comma)

In [11]:
# add Name_Len: number of characters in the full Name string
for frame in (train, test):
    frame['Name_Len'] = frame['Name'].map(len)

In [12]:
# Preview train with the FamilySize, isAlone, Title and Name_Len columns added.
train.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,FamilySize,isAlone,Title,Name_Len
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,2,0,Mr.,23
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,2,0,Mrs.,51
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,1,1,Miss.,22
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,2,0,Mrs.,44
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,1,1,Mr.,24


In [13]:
# drop less useful variables
# Name, Parch and SibSp have already been used to derive Title, Name_Len and
# FamilySize. The frames are reassigned rather than mutated with inplace=True,
# which matches the earlier column drop in this notebook.
engineered_sources = ['Name', 'Parch', 'SibSp']
train = train.drop(columns=engineered_sources)
test = test.drop(columns=engineered_sources)

In [14]:
# Preview train once Name, Parch and SibSp have been removed.
train.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,FamilySize,isAlone,Title,Name_Len
0,0,3,male,22.0,7.25,2,0,Mr.,23
1,1,1,female,38.0,71.2833,2,0,Mrs.,51
2,1,3,female,26.0,7.925,1,1,Miss.,22
3,1,1,female,35.0,53.1,2,0,Mrs.,44
4,0,3,male,35.0,8.05,1,1,Mr.,24


In [15]:
# train.to_csv("analysis.csv", index=False)

In [16]:
# Preview the first five rows of test.
test.head(n=5)

Unnamed: 0,Pclass,Sex,Age,Fare,FamilySize,isAlone,Title,Name_Len
0,3,male,34.5,7.8292,1,1,Mr.,16
1,3,female,47.0,7.0,2,0,Mrs.,32
2,2,male,62.0,9.6875,1,1,Mr.,25
3,3,male,27.0,8.6625,1,1,Mr.,16
4,3,female,22.0,12.2875,3,0,Mrs.,44


In [17]:
# Encode Sex in both frames: 'male' -> 1, 'female' -> 0.
# Rows holding any other value are left untouched.
sex_codes = (('male', 1), ('female', 0))
for frame in (train, test):
    for label, code in sex_codes:
        frame.loc[frame['Sex'] == label, 'Sex'] = code

In [18]:
# Replace each Title token in train with its integer code.
# Tokens that are not listed here are left unchanged.
train_title_codes = {
    'Mr.': 0,
    'Mrs.': 1,
    'Master.': 2,
    'Dr.': 3,
    'Rev.': 4,
    'Mlle.': 5,
    'Col.': 6,
    'Major.': 7,
    'the': 8,
    'Don.': 9,
    'Capt.': 10,
    'Mme.': 11,
    'Lady.': 12,
    'Jonkheer.': 13,
    'Sir.': 14,
    'Ms.': 15,
    'Miss.': 16,
}
for token, code in train_title_codes.items():
    train.loc[train['Title'] == token, 'Title'] = code

In [19]:
# Replace each Title token in test with its integer code.
# Tokens that are not listed here are left unchanged.
test_title_codes = {
    'Mr.': 0,
    'Mrs.': 1,
    'Master.': 2,
    'Dr.': 3,
    'Rev.': 4,
    'Mlle.': 5,
    'Col.': 6,
    'Major.': 7,
    'the': 8,
    'Don.': 9,
    'Capt.': 10,
    'Mme.': 11,
    'Lady.': 12,
    'Jonkheer.': 13,
    'Sir.': 14,
    'Ms.': 15,
    'Miss.': 16,
    'Dona.': 17,
}
for token, code in test_title_codes.items():
    test.loc[test['Title'] == token, 'Title'] = code

In [20]:
# Preview train after Sex and Title were replaced with integer codes.
train.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,FamilySize,isAlone,Title,Name_Len
0,0,3,1,22.0,7.25,2,0,0,23
1,1,1,0,38.0,71.2833,2,0,1,51
2,1,3,0,26.0,7.925,1,1,16,22
3,1,1,0,35.0,53.1,2,0,1,44
4,0,3,1,35.0,8.05,1,1,0,24


In [21]:
# train.to_csv("train.csv", index=False)