### Importing Relevant Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
sns.set()

### Loading the Train and Test data

In [2]:
# Read the train and test CSV files from the working directory
raw_train_data, raw_test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

In [3]:
# Preview the raw training DataFrame loaded from train.csv
raw_train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Preview the raw test DataFrame loaded from test.csv
raw_test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


#### To keep the two data sets consistent, every preprocessing step applied to the train data is also applied to the test data; this avoids confusion at the testing stage.

### Data Preprocessing

In [5]:
# Summary statistics of the training data; a count below the row total reveals missing values
raw_train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Summary statistics of the test data; a count below the row total reveals missing values
raw_test_data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [7]:
# Checkpoint: work on a copy so the raw training data stays untouched
train_data_processing = raw_train_data.copy()

# Replace the missing Age values in the train data with the mean of the known ages
a = train_data_processing['Age'].mean()
train_data_processing = train_data_processing.assign(
    Age=train_data_processing['Age'].fillna(a)
)

In [8]:
# Checkpoint: work on a copy so the raw test data stays untouched
test_data_processing = raw_test_data.copy()

# Replace the missing Age values in the test data with the mean of the known test ages
test_age_mean = test_data_processing['Age'].mean()
test_data_processing['Age'] = test_data_processing['Age'].fillna(test_age_mean)

In [9]:
# Re-check the test data statistics after filling Age (its count should now equal the row total)
test_data_processing.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,12.634534,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,23.0,0.0,0.0,7.8958
50%,1100.5,3.0,30.27259,0.0,0.0,14.4542
75%,1204.75,3.0,35.75,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [10]:
# Columns removed from both data sets: passenger id, name, ticket and cabin
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_data_processing = train_data_processing.drop(columns=unused_columns)
test_data_processing = test_data_processing.drop(columns=unused_columns)
train_data_processing.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [11]:
# Count the survivors in the training data (Survived ranges from 0 to 1, so the sum is the number of 1s)
train_data_processing['Survived'].sum()

342

##### Now, I'll check how the survival rate varies across the values of some features to find out which ones are useful

In [12]:
# Mean survival rate per passenger class, highest rate first
(
    train_data_processing
    .groupby('Pclass', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [13]:
# Mean survival rate per sex, highest rate first
(
    train_data_processing
    .groupby('Sex', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [14]:
# Mean survival rate per SibSp value, highest rate first
(
    train_data_processing
    .groupby('SibSp', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,SibSp,Survived
1,1,0.535885
2,2,0.464286
0,0,0.345395
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


In [15]:
# Mean survival rate per Parch value, highest rate first
(
    train_data_processing
    .groupby('Parch', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Parch,Survived
3,3,0.6
1,1,0.550847
2,2,0.5
0,0,0.343658
5,5,0.2
4,4,0.0
6,6,0.0


In [16]:
# Mean survival rate per Embarked value, highest rate first (rows with a missing Embarked are left out by groupby)
(
    train_data_processing
    .groupby('Embarked', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.336957


#### The features Pclass, Sex and Embarked affect survival. SibSp and Parch show a zero survival rate for some values, so I'll combine them into a single feature (one less feature) and see whether that feature relates to survival.

In [17]:
# List the distinct values of each categorical feature to help define the mappings
for column in ('Sex', 'Pclass', 'Embarked'):
    print(train_data_processing[column].unique())


['male' 'female']
[3 1 2]
['S' 'C' 'Q' nan]


In [18]:
# Show the training rows whose Embarked value is missing
bool_train = train_data_processing['Embarked'].isna()
train_data_processing.loc[bool_train]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
61,1,1,female,38.0,0,0,80.0,
829,1,1,female,62.0,0,0,80.0,


In [19]:
# Only two training rows have a missing Embarked value, so those rows are dropped.
# dropna(subset=...) selects the rows by the missing value itself instead of by
# hard-coded index labels, so the cell still works if the data changes and can be
# re-run without raising a KeyError for labels that are already gone.
train_data_processing = train_data_processing.dropna(subset=['Embarked'])

In [20]:
# Integer codes used to encode the Sex and Embarked columns
sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'C': 0, 'S': 1, 'Q': 2}

# Apply the same encoding to the train data and then to the test data
for frame in (train_data_processing, test_data_processing):
    frame['Sex'] = frame['Sex'].map(sex_codes)
    frame['Embarked'] = frame['Embarked'].map(embarked_codes)

train_data_processing.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,1
1,1,1,1,38.0,1,0,71.2833,0
2,1,3,1,26.0,0,0,7.925,1
3,1,1,1,35.0,1,0,53.1,1
4,0,3,0,35.0,0,0,8.05,1


In [21]:
# Categorise Age into age groups before analysing:
#   0 = toddlers (Age <= 2), 1 = children (2 < Age <= 17),
#   2 = adults (17 < Age <= 60), 3 = elders (Age > 60)
# pd.cut uses right-closed intervals, which matches those boundaries;
# labels=False yields the bin number, cast to float to keep the column dtype.
age_edges = [-np.inf, 2, 17, 60, np.inf]

# Same grouping for the train data and the test data
for frame in (train_data_processing, test_data_processing):
    frame['Age'] = pd.cut(frame['Age'], bins=age_edges, labels=False).astype(float)

train_data_processing.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,2.0,1,0,7.25,1
1,1,1,1,2.0,1,0,71.2833,0
2,1,3,1,2.0,0,0,7.925,1
3,1,1,1,2.0,1,0,53.1,1
4,0,3,0,2.0,0,0,8.05,1


In [22]:
# Mean survival rate per age group, highest rate first
(
    train_data_processing
    .groupby('Age', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Age,Survived
0,0.0,0.625
1,1.0,0.516854
2,2.0,0.364238
3,3.0,0.190476


In [23]:
# Family size combines the parents-children (Parch) and sibling-spouse (SibSp) counts
for frame in (train_data_processing, test_data_processing):
    frame['Famsize'] = frame['Parch'].add(frame['SibSp'])

# Current column order of the train data, used when reordering the columns
order = train_data_processing.columns.values
order

array(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked', 'Famsize'], dtype=object)

In [24]:
# Reorder the columns for readability; the test data has the same features but no Survived column
feature_order = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'SibSp', 'Parch', 'Famsize']
reordered = ['Survived'] + feature_order

train_data_processing = train_data_processing[reordered]
test_data_processing = test_data_processing[feature_order]

train_data_processing

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,SibSp,Parch,Famsize
0,0,3,0,2.0,7.2500,1,1,0,1
1,1,1,1,2.0,71.2833,0,1,0,1
2,1,3,1,2.0,7.9250,1,0,0,0
3,1,1,1,2.0,53.1000,1,1,0,1
4,0,3,0,2.0,8.0500,1,0,0,0
...,...,...,...,...,...,...,...,...,...
886,0,2,0,2.0,13.0000,1,0,0,0
887,1,1,1,2.0,30.0000,1,0,0,0
888,0,3,1,2.0,23.4500,1,1,2,3
889,1,1,0,2.0,30.0000,0,0,0,0


In [25]:
# Mean survival rate per family size, highest rate first
(
    train_data_processing
    .groupby('Famsize', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Famsize,Survived
3,3,0.724138
2,2,0.578431
1,1,0.552795
6,6,0.333333
0,0,0.300935
4,4,0.2
5,5,0.136364
7,7,0.0
8,10,0.0


In [26]:
# Number of passengers per family size; dropna=False would also list missing values as their own row
train_data_processing['Famsize'].value_counts(dropna = False)


0     535
1     161
2     102
3      29
5      22
4      15
6      12
10      7
7       6
Name: Famsize, dtype: int64

###### The value counts show that the majority of passengers (535 of 889) did not have family on board. To make the work easier, we classify passengers into two groups: alone and not alone.

In [27]:
# Alone is 1 by default and 0 for every passenger whose Famsize is at least 1
for frame in (train_data_processing, test_data_processing):
    has_family = frame['Famsize'] >= 1
    frame['Alone'] = (~has_family).astype('int64')

train_data_processing

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,SibSp,Parch,Famsize,Alone
0,0,3,0,2.0,7.2500,1,1,0,1,0
1,1,1,1,2.0,71.2833,0,1,0,1,0
2,1,3,1,2.0,7.9250,1,0,0,0,1
3,1,1,1,2.0,53.1000,1,1,0,1,0
4,0,3,0,2.0,8.0500,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,2.0,13.0000,1,0,0,0,1
887,1,1,1,2.0,30.0000,1,0,0,0,1
888,0,3,1,2.0,23.4500,1,1,2,3,0
889,1,1,0,2.0,30.0000,0,0,0,0,1


#### Handling the Fare column

In [28]:
# Fill the missing values in the Fare column with the median fare of the same data set.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on a column selection: that chained in-place form acts on
# an intermediate object, is deprecated in recent pandas and leaves the DataFrame
# unchanged under Copy-on-Write, which would keep the missing test fare as NaN.
# Series.median() skips NaN by default, so no explicit dropna() is needed.
train_data_processing['Fare'] = train_data_processing['Fare'].fillna(train_data_processing['Fare'].median())
test_data_processing['Fare'] = test_data_processing['Fare'].fillna(test_data_processing['Fare'].median())

# Split the training fares into four quantile bands and compare the survival rate of each band
train_data_processing['FareBand'] = pd.qcut(train_data_processing['Fare'], 4)
train_data_processing[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

Unnamed: 0,FareBand,Survived
0,"(-0.001, 7.896]",0.197309
1,"(7.896, 14.454]",0.303571
2,"(14.454, 31.0]",0.454955
3,"(31.0, 512.329]",0.577273


In [29]:
# Regroup the Fare column into four ordinal codes:
#   0: Fare <= 7.91, 1: 7.91 < Fare <= 14.454, 2: 14.454 < Fare <= 31, 3: Fare > 31
# pd.cut uses right-closed intervals, which matches those boundaries;
# labels=False yields the bin number, cast to float to keep the column dtype.
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]

# Same grouping for the train data and the test data
for frame in (train_data_processing, test_data_processing):
    frame['Fare'] = pd.cut(frame['Fare'], bins=fare_edges, labels=False).astype(float)

# Drop the FareBand, SibSp, Parch and Famsize columns (the test data never had FareBand)
train_data_processing = train_data_processing.drop(columns=['FareBand', 'SibSp', 'Parch', 'Famsize'])
test_data_processing = test_data_processing.drop(columns=['SibSp', 'Parch', 'Famsize'])
train_data_processing.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Alone
0,0,3,0,2.0,0.0,1,0
1,1,1,1,2.0,3.0,0,0
2,1,3,1,2.0,1.0,1,1
3,1,1,1,2.0,3.0,1,0
4,0,3,0,2.0,1.0,1,1


In [30]:
# Preview the first rows of the processed test data
test_data_processing.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Alone
0,3,0,2.0,0.0,2,1
1,3,1,2.0,0.0,1,0
2,2,0,3.0,1.0,2,1
3,3,0,2.0,1.0,1,1
4,3,1,2.0,1.0,1,0


In [31]:
# Column names that remain in the processed training data
train_data_processing.columns.values

array(['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Alone'],
      dtype=object)

In [32]:
# Compare the (rows, columns) shapes of the processed train and test data
print(train_data_processing.shape, test_data_processing.shape)

(889, 7) (418, 6)


Now, we declare our independent and dependent variables

In [33]:
# Feature columns shared by the train and test data; Survived is the target
feature_columns = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Alone']

x_train = train_data_processing[feature_columns]
y_train = train_data_processing['Survived']
x_test = test_data_processing[feature_columns]

In [35]:
# Fit a random forest on the training features and predict survival for the test set.
# random_state fixes the randomness used while building the trees, so the
# predictions (and the submission file built from them) are the same on every run.
classifier = RandomForestClassifier(n_estimators = 100, random_state = 42)
classifier.fit(x_train, y_train)
y_predict = classifier.predict(x_test)

# NOTE: this is the accuracy (in percent) on the data the model was trained on,
# so it is an optimistic figure and not an estimate of accuracy on unseen data.
score = round(classifier.score(x_train, y_train) * 100, 2)
score


84.93

# The End.

In [37]:
# Pair every test PassengerId with its predicted Survived value
submission = raw_test_data[['PassengerId']].assign(Survived=y_predict)

# Visualize the first 5 rows
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0


In [38]:
# Write the submission DataFrame to a CSV file (without the index column) for upload to Kaggle
filename = 'Titanic_Prediction.csv'
submission.to_csv(filename, index=False)

print(f'Saved file: {filename}')

Saved file: Titanic_Prediction.csv
