In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing as pp
from sklearn.model_selection import KFold

from IPython.display import display, HTML

### Data Exploration

In [3]:
# Load the training data from 'train.csv' (path relative to the notebook's working directory).
df = pd.read_csv('train.csv')

# Preview the first rows, then report the row count and the full shape.
display(df.head())
print('Length of DataFrame: {}'.format(df.shape[0]))
print('Shape of DataFrame: {}'.format(df.shape))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Length of DataFrame: 891
Shape of DataFrame: (891, 12)


In [4]:
# Split the raw frame into the target column (Survived) and the remaining feature columns.
train_label = df['Survived']
train_feature = df.drop(['Survived'], axis='columns')

# Sanity check: the label is one value per row, the features have one column fewer than df.
print('Shape of train_label: {}'.format(train_label.shape))
print('Shape of train_feature: {}'.format(train_feature.shape))

Shape of train_label: (891,)
Shape of train_feature: (891, 11)



- **Survived**: Outcome of survival (0 = No; 1 = Yes)
- **Pclass**: Socio-economic class (1 = Upper class; 2 = Middle class; 3 = Lower class)
- **Name**: Name of passenger
- **Sex**: Sex of the passenger
- **Age**: Age of the passenger (Some entries contain `NaN`)
- **SibSp**: Number of siblings and spouses of the passenger aboard
- **Parch**: Number of parents and children of the passenger aboard
- **Ticket**: Ticket number of the passenger
- **Fare**: Fare paid by the passenger
- **Cabin**: Cabin number of the passenger (Some entries contain `NaN`)
- **Embarked**: Port of embarkation of the passenger (C = Cherbourg; Q = Queenstown; S = Southampton)

In [5]:
# Encode the categorical columns as numbers and add a family-size feature.
#
# Every derived column is computed from the untouched raw frame `df` rather
# than from the already-encoded `train_feature`, so this cell can be re-run
# safely. Reading from `train_feature` would, on a second run, compare the
# encoded 0/1 Sex values against 'male' (yielding all zeros) and push the
# encoded 0/1/2 Embarked values through the letter mapping (yielding all NaN).

# Distinct values of the categorical columns, taken from the raw data.
print('Check how many types of Categorical data: \n')
print('Sex: ', np.unique(df['Sex']))
print('SibSp: ', np.unique(df['SibSp']))
print('Pclass: ', np.unique(df['Pclass']))

# TODO Name: 1) handle NaN  2) use the title (Mr, Miss, ...)

# Sex: 'male' -> 1, anything else -> 0
train_feature['Sex'] = np.where(df['Sex'] == 'male', 1, 0)

# Cabin: 1 when a cabin value is present, 0 when it is missing (NaN)
train_feature['Cabin'] = np.where(df['Cabin'].isnull(), 0, 1)

# Embarked: S, C, Q -> 0, 1, 2. Values outside the mapping (including missing
# ones) come out as NaN, which is why the column ends up as float.
train_feature['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Familysize: siblings/spouses + parents/children + 1
train_feature['Familysize'] = df['SibSp'] + df['Parch'] + 1

Check how many types of Categorical data: 

Sex:  ['female' 'male']
SibSp:  [0 1 2 3 4 5 8]
Pclass:  [1 2 3]


In [6]:
# Show the first five rows of the feature frame.
train_feature.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Familysize
0,1,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,0,0.0,2
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,1,1.0,2
2,3,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,0,0.0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,1,0.0,2
4,5,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,0,0.0,1
