### Including Main Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

## Importing Dataset

In [2]:
dataset = pd.read_csv("train.csv")

In [3]:
dataset = dataset.drop('PassengerId', axis=1)
dataset.head(7)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S


In [4]:
dataset.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


### Removing the missing data

In [5]:
null_columns=dataset.columns[dataset.isnull().any()]
dataset[null_columns].isnull().sum()

Age         177
Cabin       687
Embarked      2
dtype: int64

In [6]:
dataset[dataset["Embarked"].isnull()]


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


Since "Embarked" has only 2 missing values, it wouldn't hurt to replace the values by preceeding/following entries

In [7]:
dataset["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [8]:
dataset["Embarked"] = dataset["Embarked"].fillna(method='bfill')
dataset[null_columns].isnull().sum()

Age         177
Cabin       687
Embarked      0
dtype: int64

Now that that's done, we see that there are too many missing values of "Cabin" to be imputed. What we can do is make it a binary feature, where all instances of a person having a cabin is replaced by 1 and of not having a cabin listed by 0.<br>
0 : Does not have a cabin listed<br>
1 : Has a Cabin listed<br>

In [9]:
cabins = dataset['Cabin'].values
i = 0
while(i < len(cabins)):
    if cabins[i] is np.nan:
        cabins[i] = 0
    else:
        cabins[i] = 1
    i+=1
cabins
#dataset['Cabin']
#dataset['Cabin'] = pd.Series(cabins)

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0,

In [10]:
dataset['Cabin'] = pd.Series(cabins)
dataset.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,0,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,1,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,0,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,1,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,0,S


In [11]:
dataset[null_columns].isnull().sum()

Age         177
Cabin         0
Embarked      0
dtype: int64

When it comes to age, the problem is that there are enough missing values that make regular replacement an issue for the predictive model (Model may get wrongly biased).<br>
Now either we replace the values of age with some other value based on a linear regression model that is based on all other columns.<br>
I'm keen on trying multiple Imputation to solve this issue as multiple articles have referenced is relatively low bias on the model. However, to apply such a imputation model, I would have to first further clean the data

One more step that should ideally come under the Data Preprocessing phase is the seperation of "Title" (Mr, Mrs, Master etc) from name. Since all names seem to have a title, it wouldn't hurt to test the statistical importance of the respective titles

In [12]:
titles = []
for name in dataset['Name']:
    fname = name.split(',')
    title = fname[1].split('.')[0]
    titles.append(title)
dataset['Title'] = pd.Series(titles)
dataset = dataset.drop('Name',axis = 1)


In [25]:
ttype = []
for ticket in dataset['Ticket']:
    typetick = ticket.split()[0]
    try:
        int(typetick)
        ttype.append('Regular')
    except:
        ttype.append(typetick)

dataset['TicketType'] = pd.Series(ttype)
dataset.drop('Ticket',axis = 1)
    

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,TicketType
0,0,3,male,22.0,1,0,7.2500,0,S,Mr,A/5
1,1,1,female,38.0,1,0,71.2833,1,C,Mrs,PC
2,1,3,female,26.0,0,0,7.9250,0,S,Miss,STON/O2.
3,1,1,female,35.0,1,0,53.1000,1,S,Mrs,Regular
4,0,3,male,35.0,0,0,8.0500,0,S,Mr,Regular
5,0,3,male,,0,0,8.4583,0,Q,Mr,Regular
6,0,1,male,54.0,0,0,51.8625,1,S,Mr,Regular
7,0,3,male,2.0,3,1,21.0750,0,S,Master,Regular
8,1,3,female,27.0,0,2,11.1333,0,S,Mrs,Regular
9,1,2,female,14.0,1,0,30.0708,0,C,Mrs,Regular


# Data Analysis

# Data Pre-Processing

In [None]:
dataset.head(10)

First things first.<br>
Converting all the string variables (excluding name) to categorical variables is an obvious starting point.

Lets take Sex and convert Male to 0 and Female to 1<br>
Male : 0<br>
Female : 1<br>

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encoding Sex
labelencoder_sex = LabelEncoder()
dataset.loc[:, 'Sex'] = labelencoder_sex.fit_transform(dataset.loc[:, 'Sex'])

# Encoding Embarked
labelencoder_embarked = LabelEncoder()
dataset.loc[:,'Embarked'] = labelencoder_embarked.fit_transform(dataset.loc[:, 'Embarked'])

# Binarizing Embarked because it makes more sense here
#onehotencode_Embarked = OneHotEncoder(categorical_features=[9])
#onehotencode_Embarked.fit_transform(dataset).toarray()
