# Kaggle machine learning competition: Titanic predictions

In this competition, we are given some information about each passenger and asked to predict whether or not that passenger survived.


Competition link: https://www.kaggle.com/c/titanic/overview

In [3]:
#import standard libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Turn off pandas' chained-assignment warning for the whole session.
pd.options.mode.chained_assignment = None

# Read the Kaggle-provided training and test sets from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))


In [4]:
# Summary statistics for every column, numeric and non-numeric alike.
train.describe(include="all")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Dahlberg, Miss. Gerda Ulrika",male,,,,CA. 2343,,G6,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


### Cleaning data and feature engineering

In [5]:
# Seed NumPy's global random number generator.
np.random.seed(26)

# Survivors, passenger count and survival rate per sex.
# .copy() makes an independent frame, so adding "Count" never writes into a
# slice of `train`.
survival_ratios = train[["Survived", "Sex"]].copy()
survival_ratios["Count"] = 1

# Select the aggregated columns with a list. Tuple-style selection
# (gb["Survived", "Count"]) is rejected by pandas >= 2.0.
survival_ratios = survival_ratios.groupby("Sex")[["Survived", "Count"]].sum().reset_index()

# Share of passengers of each sex who survived, rounded to two decimals.
survival_ratios["Percentage"] = (survival_ratios["Survived"] / survival_ratios["Count"]).round(2)
survival_ratios

Unnamed: 0,Sex,Survived,Count,Percentage
0,female,233,314,0.74
1,male,109,577,0.19


In [6]:
# Separate the target from the predictors.
Y = train["Survived"]
X = train.drop(columns=["Survived"])

# Encode sex twice: as the per-sex survival rate (0.19 for men, 0.74 for
# women) and as a 0/1 flag where male = 1.
is_male = X["Sex"] == "male"
X["Sur_Probability"] = np.where(is_male, 0.19, 0.74)
X["Sex"] = np.where(is_male, 1, 0)

# Rescale Fare into the range 0 to 1.
min_max_scaler = preprocessing.MinMaxScaler()
X["Normalized_Fare"] = min_max_scaler.fit_transform(X[["Fare"]])

# Keep only the modelling columns (this discards Name, Age, Cabin, Ticket and
# Fare) and one-hot encode the remaining text column, Embarked.
feature_columns = [
    "PassengerId",
    "Pclass",
    "Sex",
    "SibSp",
    "Parch",
    "Embarked",
    "Normalized_Fare",
    "Sur_Probability",
]
X = pd.get_dummies(X[feature_columns])

X

Unnamed: 0,PassengerId,Pclass,Sex,SibSp,Parch,Normalized_Fare,Sur_Probability,Embarked_C,Embarked_Q,Embarked_S
0,1,3,1,1,0,0.014151,0.19,0,0,1
1,2,1,0,1,0,0.139136,0.74,1,0,0
2,3,3,0,0,0,0.015469,0.74,0,0,1
3,4,1,0,1,0,0.103644,0.74,0,0,1
4,5,3,1,0,0,0.015713,0.19,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,887,2,1,0,0,0.025374,0.19,0,0,1
887,888,1,0,0,0,0.058556,0.74,0,0,1
888,889,3,0,1,2,0.045771,0.74,0,0,1
889,890,1,1,0,0,0.058556,0.19,1,0,0


### Splitting our training data into a train and test set to validate our models

In [7]:
# Hold out 20% of the labelled data to validate the model.
# random_state pins the split, so it no longer depends on how much of NumPy's
# global random stream earlier cells (or re-runs of them) have consumed.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=26)

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: 100 trees, max_features of 0.8, out-of-bag scoring
# switched on, and a fixed random_state.
clf = RandomForestClassifier(
    n_estimators=100,
    max_features=0.8,
    oob_score=True,
    random_state=50,
)

# Show the full parameter set, including the defaults left untouched.
clf.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 0.8,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': True,
 'random_state': 50,
 'verbose': 0,
 'warm_start': False}

In [9]:
# Train the forest on the training portion of the split.
clf.fit(X=X_train, y=Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=0.8, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=50, verbose=0,
                       warm_start=False)

In [10]:
# Predict survival for the held-out rows.
Y_preds = clf.predict(X=X_test)
Y_preds

array([0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0])

In [11]:
# Score on the same rows the model was fitted on.
clf.score(X=X_train, y=Y_train)

1.0

In [12]:
# Score on the held-out rows.
clf.score(X=X_test, y=Y_test)

0.7877094972067039

In [13]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Per-class precision, recall and F1 for the held-out predictions.
print(classification_report(y_true=Y_test, y_pred=Y_preds))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       116
           1       0.72      0.65      0.68        63

    accuracy                           0.79       179
   macro avg       0.77      0.76      0.76       179
weighted avg       0.78      0.79      0.79       179



In [14]:
# Counts of held-out rows by true label versus predicted label.
confusion_matrix(y_true=Y_test, y_pred=Y_preds)

array([[100,  16],
       [ 22,  41]])

In [15]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=Y_test, y_pred=Y_preds)

0.7877094972067039

### Tuning model inputs to improve quality of predictions

In [20]:
# Sweep the fraction of features considered at each split.
# The last value must be the float 1.0: scikit-learn reads an int max_features
# as an absolute feature count, so the int 1 would mean "one feature per
# split" rather than 100% of them.
# Each trial model is bound to its own name so the sweep does not replace
# `clf`, which the submission cell later uses for prediction.
for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    print(f"Trying model with {i} max features:")
    trial_model = RandomForestClassifier(max_features=i).fit(X_train, Y_train)
    print(f"Model Accuracy on test set: {trial_model.score(X_test,Y_test)*100:.2f}")

Trying model with 0.1 max features:
Model Accuracy on test set: 74.86
Trying model with 0.2 max features:
Model Accuracy on test set: 75.98
Trying model with 0.3 max features:
Model Accuracy on test set: 77.65
Trying model with 0.4 max features:
Model Accuracy on test set: 77.65
Trying model with 0.5 max features:
Model Accuracy on test set: 77.09
Trying model with 0.6 max features:
Model Accuracy on test set: 80.45
Trying model with 0.7 max features:
Model Accuracy on test set: 74.30
Trying model with 0.8 max features:
Model Accuracy on test set: 82.12
Trying model with 0.9 max features:
Model Accuracy on test set: 75.98
Trying model with 1 max features:
Model Accuracy on test set: 77.65




In [21]:
# Sweep the number of trees: 10, 20, ..., 90.
# Each trial model is bound to its own name so the sweep does not replace
# `clf`; otherwise the submission cell would predict with the last trial
# (90 trees, default settings) instead of the configured estimator.
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimatores:")
    trial_model = RandomForestClassifier(n_estimators=i).fit(X_train, Y_train)
    print(f"Model Accuracy on test set: {trial_model.score(X_test,Y_test)*100:.2f}")

Trying model with 10 estimatores:
Model Accuracy on test set: 74.86
Trying model with 20 estimatores:
Model Accuracy on test set: 76.54
Trying model with 30 estimatores:
Model Accuracy on test set: 78.21
Trying model with 40 estimatores:
Model Accuracy on test set: 77.65
Trying model with 50 estimatores:
Model Accuracy on test set: 75.42
Trying model with 60 estimatores:
Model Accuracy on test set: 77.65
Trying model with 70 estimatores:
Model Accuracy on test set: 76.54
Trying model with 80 estimatores:
Model Accuracy on test set: 78.21
Trying model with 90 estimatores:
Model Accuracy on test set: 76.54


### Predicting on the test data from Kaggle

In [22]:
# Apply the training feature pipeline to the Kaggle test set.
test_is_male = test["Sex"] == "male"
test["Sur_Probability"] = np.where(test_is_male, 0.19, 0.74)
test["Sex"] = np.where(test_is_male, 1, 0)

# Reuse the scaler already fitted on the training fares (transform, not
# fit_transform) so a given fare maps to the same value in both sets.
test["Normalized_Fare"] = min_max_scaler.transform(test[["Fare"]])

# Same column selection and one-hot encoding as the training frame; this
# discards Name, Age, Cabin, Ticket and Fare.
test = pd.get_dummies(test[["PassengerId","Pclass","Sex","SibSp","Parch","Embarked","Normalized_Fare","Sur_Probability"]])

# Force the training column set and order. A dummy column that is absent from
# the test set is added as all zeros; existing values are left as they are.
test = test.reindex(columns=X.columns, fill_value=0)
test

Unnamed: 0,PassengerId,Pclass,Sex,SibSp,Parch,Normalized_Fare,Sur_Probability,Embarked_C,Embarked_Q,Embarked_S
0,892,3,1,0,0,0.015282,0.19,0,1,0
1,893,3,0,1,0,0.013663,0.74,0,0,1
2,894,2,1,0,0,0.018909,0.19,0,1,0
3,895,3,1,0,0,0.016908,0.19,0,0,1
4,896,3,0,1,1,0.023984,0.74,0,0,1
...,...,...,...,...,...,...,...,...,...,...
413,1305,3,1,0,0,0.015713,0.19,0,0,1
414,1306,1,0,0,0,0.212559,0.74,1,0,0
415,1307,3,1,0,0,0.014151,0.19,0,0,1
416,1308,3,1,0,0,0.015713,0.19,0,0,1


In [23]:
# Replace a missing scaled fare with 0 before predicting.
test["Normalized_Fare"] = test["Normalized_Fare"].fillna(0)

# NOTE(review): `clf` is whichever estimator was last bound to that name by
# the cells above — confirm it is the model intended for submission.
predicted_labels = clf.predict(test)

# Kept as a one-column frame under the same name as before.
predictions = pd.DataFrame(predicted_labels)

# Build the submission directly from the id column and the predicted labels,
# instead of concatenating two frames and relying on their indexes lining up.
ids = test[["PassengerId"]]
titanic_1 = ids.assign(Survived=predicted_labels)
titanic_1


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
# Uncomment to write the submission file (index=False leaves out the row-index column):
# titanic_1.to_csv("predictions.csv", index=False)