In [1]:
import pandas as pd
from sklearn.cross_validation import train_test_split

### Data Credits URL: http://web.stanford.edu/class/archive/cs/cs109/cs109.1166/problem12.html

In [2]:
df = pd.read_csv("titanic.csv")

In [3]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05


In [4]:
df.describe()

Unnamed: 0,Survived,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
count,887.0,887.0,887.0,887.0,887.0,887.0
mean,0.385569,2.305524,29.471443,0.525366,0.383315,32.30542
std,0.487004,0.836662,14.121908,1.104669,0.807466,49.78204
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.25,0.0,0.0,7.925
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.1375
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
len(df), len(df[df['Survived'] == 1])

(887, 342)

In [6]:
round((342 / float(887)) * 100, 0)

39.0

### Out of of 887 person travelling, 342 people survived, close to 39% survived

### Will leave out columns Siblings/Spouses Aboard, Parents/Children Aboard, Fare from the selection

In [7]:
df = df[['Survived', 'Sex', 'Pclass', 'Age']]

In [8]:
pd.crosstab(df.Survived, df.Pclass, margins=True)

Pclass,1,2,3,All
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,80,97,368,545
1,136,87,119,342
All,216,184,487,887


In [9]:
136 / 887.0, 87 / 887.0, 119 / 887.0

(0.15332581736189402, 0.09808342728297632, 0.13416009019165728)

### Person in First Class had a better survival percentage

In [10]:
pd.crosstab(df.Survived, df.Sex, margins=True)

Sex,female,male,All
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,81,464,545
1,233,109,342
All,314,573,887


In [11]:
233 / 314.0, 109 / 573.0

(0.7420382165605095, 0.19022687609075042)

### Women survival percentage : 74%, Men Survival percentage 19%

In [12]:
gender_mapping = {'male': 0, 'female': 1}
df['Sex'] = df['Sex'].map(gender_mapping)


In [13]:
df.head()

Unnamed: 0,Survived,Sex,Pclass,Age
0,0,0,3,22
1,1,1,1,38
2,1,1,3,26
3,1,1,1,35
4,0,0,3,35


In [14]:
X = df[['Sex', 'Pclass', 'Age']]
Y = df['Survived']

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split (X, Y, test_size = 0.33, random_state = 42)
print X_train.shape
print X_test.shape
print Y_train.shape
print Y_test.shape

(594L, 3L)
(293L, 3L)
(594L,)
(293L,)


In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
rf = RandomForestClassifier(n_estimators=50)

In [18]:
rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)

In [19]:
accuracy = rf.score(X_test, Y_test)
print"Accuracy = {}%".format(accuracy * 100)

Accuracy = 73.7201365188%


In [20]:
print rf.feature_importances_

[ 0.40343186  0.16039715  0.43617098]


In [21]:
Y_predicted = rf.predict(X_test)

In [22]:
Y_predicted.shape

(293L,)

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
print"Accuracy = {}%".format(accuracy_score(Y_test, Y_predicted) * 100)

Accuracy = 73.7201365188%


### Both score & accuracy_score returns the same accuracy

In [25]:
from sklearn.metrics import confusion_matrix

In [26]:
confusion_matrix(Y_test, Y_predicted)

array([[150,  29],
       [ 48,  66]])

### matrix element [0,0] says 150 are true negatives & [0,1] are false negatives
### [1,0] says 41 are true positives & [1,1] are false positives

In [27]:
len(Y_predicted[Y_test == Y_predicted])

216

In [28]:
len(Y_test[Y_test == 1])  # 41 + 73 = 114

114

In [29]:
len(Y_test[Y_test == 0]) # 150 + 29 = 179

179