# Predicting Titanic Survivors Using Machine Learning

<img src="https://media.nationalgeographic.org/assets/photos/000/273/27302.jpg" height="300" width="450" align="left">

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np 
import pandas as pd 

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Collect the full path of every file beneath the Kaggle input directory,
# then print them one per line in walk order.
input_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the Titanic training data from the Kaggle input directory.
df = pd.read_csv('../input/titanic/train.csv')

# Show the DataFrame as the cell output.
df

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many rows fall into each 'Pclass' value.
# Series.value_counts() is used because the top-level pd.value_counts()
# helper is deprecated in recent pandas releases.
df['Pclass'].value_counts().plot.bar()
plt.xlabel('Proxy for Socio-Economic Status (SES)')
plt.ylabel('Counts')

In [None]:
# Horizontal bar chart of the number of rows per 'Sex' value.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
df['Sex'].value_counts().plot.barh()

In [None]:
# Histogram of passenger ages.
# 'Age' is only imputed in a later cell, so missing values are dropped
# explicitly here instead of relying on how the plotting backend treats NaN.
plt.hist(df['Age'].dropna(), edgecolor='black', linewidth=1.2)
plt.xlabel('Age')
plt.ylabel('Count')

plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Integer-encode 'Sex' into a new column.
df['Sex_category'] = label_encoder.fit_transform(df['Sex'])

# Treat a missing port of embarkation as its own 'None' category.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['Embarked'] is chained assignment and does not update df when pandas
# Copy-on-Write is active.
df['Embarked'] = df['Embarked'].fillna('None')

# fit_transform refits the same encoder object, so from here on
# label_encoder holds the 'Embarked' classes rather than the 'Sex' classes.
df['Embarked_category'] = label_encoder.fit_transform(df['Embarked'])
df

In [None]:
# Replace missing ages with the mean age.
# Assign the result back instead of using fillna(inplace=True) on the column
# selection, which is chained assignment and leaves df unchanged under pandas
# Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df.describe()

In [None]:
#Using Pearson Correlation
import seaborn as sns

plt.figure(figsize=(12,10))
# Correlate the numeric columns only: df still holds string columns
# (e.g. 'Sex', 'Embarked'), and DataFrame.corr() raises on non-numeric data
# in pandas >= 2.0 instead of silently skipping it.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Print the Pearson correlation matrix of the numeric columns.
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises
# on string data in pandas >= 2.0.
print(df.select_dtypes(include='number').corr())

### PassengerId looks like it has very little impact on any of the other variables within the model. Let's drop it from the dataset.

In [None]:
# Remove the 'PassengerId' column from df in place.
df.drop(columns='PassengerId', inplace=True)
# Show the resulting DataFrame as the cell output.
df

In [None]:
# Print column dtypes and non-null counts for the whole frame.
df.info()
# Summary of the 'Cabin' column, shown as the cell output.
df['Cabin'].describe()

### It is unclear how relevant the cabin on the ship is to survival, and the column is only 22.9% non-null.
### We'll drop it from the dataset, along with Ticket, Fare and Name.

In [None]:
# Remove the 'Cabin', 'Ticket', 'Fare' and 'Name' columns from df in place.
df.drop(columns=['Cabin', 'Ticket', 'Fare', 'Name'], inplace=True)
# Show the resulting DataFrame as the cell output.
df

In [None]:
#let's run this again now that we've narrowed down to our selected features
plt.figure(figsize=(12,10))
# The raw 'Sex' and 'Embarked' string columns are still present, so restrict
# the correlation to numeric columns; DataFrame.corr() raises on string data
# in pandas >= 2.0.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Feature matrix (model inputs) and target vector (survival flag).
feature_columns = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Sex_category',
    'Embarked_category',
]
X = df[feature_columns]
y = df['Survived']

In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed random_state
# makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# NOTE: for this competition the train/test CSVs are already a split, so we
# might want to look into using the entirety of each file for its namesake
# purpose.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Two forests that differ only in the number of trees; every other
# hyper-parameter is left at its default.
RFC = RandomForestClassifier(n_estimators=100)
RFC.fit(X_train, y_train)
RFC2 = RandomForestClassifier(n_estimators=10)
RFC2.fit(X_train, y_train)

# Mean accuracy of each forest on the training split and the held-out split.
trainscore, testscore = RFC.score(X_train, y_train), RFC.score(X_test, y_test)
trainscore2, testscore2 = RFC2.score(X_train, y_train), RFC2.score(X_test, y_test)

# Report: 100-tree forest first, then the 10-tree forest.
print(f'Accuracy of Random Forest Classifier algorithm on training set (100 estimators): {trainscore}')
print(f'Accuracy of Random Forest Classifier algorithm on test set (100 estimators): {testscore}')

print(f'Accuracy of Random Forest Classifier algorithm on training set (10 estimators): {trainscore2}')
print(f'Accuracy of Random Forest Classifier algorithm on test set (10 estimators): {testscore2}')

In [None]:
# Load the Kaggle test file from the input directory.
testdf = pd.read_csv('../input/titanic/test.csv')

# Summary statistics of the test frame, shown as the cell output.
testdf.describe()


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the test columns with encoders fitted on the TRAINING columns.
# Refitting on testdf would number the categories found in the test file,
# which need not match the codes the model was trained on (for example when
# the 'None' placeholder for a missing port occurs in only one of the files).
# transform() raises on a label that was never seen in training, which is
# preferable to silently assigning it a wrong code.
sex_encoder = LabelEncoder().fit(df['Sex'])
testdf['Sex_category'] = sex_encoder.transform(testdf['Sex'])

# Assign back rather than using fillna(inplace=True) on a column selection,
# which does not update the frame under pandas Copy-on-Write.
testdf['Embarked'] = testdf['Embarked'].fillna('None')

embarked_encoder = LabelEncoder().fit(df['Embarked'].fillna('None'))
testdf['Embarked_category'] = embarked_encoder.transform(testdf['Embarked'])

# Impute missing test ages with the training mean so both sets share one value.
testdf['Age'] = testdf['Age'].fillna(df['Age'].mean())

In [None]:
# List the column labels of the test frame after preprocessing.
print(testdf.columns)

In [None]:
# Summary statistics of the test frame, shown as the cell output.
testdf.describe()

In [None]:
# Same feature columns, in the same order, as the training matrix.
prediction_features = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Sex_category',
    'Embarked_category',
]
testX = testdf[prediction_features]

# Store the 100-tree forest's predictions next to the passenger rows.
testdf['Survived'] = RFC.predict(testX)

# Show the id/prediction pairs as the cell output.
testdf[['PassengerId', 'Survived']]

In [None]:
# Reference 'Survived' values that the predictions are compared against.
# NOTE(review): the file name suggests this is a sample submission rather than
# ground-truth labels; confirm before treating the comparison as accuracy.
grading = pd.read_csv("../input/titanic/gender_submission.csv")

In [None]:
# Tallies of matching and non-matching predictions, both starting at zero.
correct, incorrect = 0, 0

In [None]:
# Compare every prediction with the reference value in the same row.
# Iterating over testdf['Survived'] yields the predicted VALUES (0 or 1), not
# row positions; using them as lookups meant only rows 0 and 1 were ever
# compared. A row-wise comparison of the two columns checks all rows once.
matches = testdf['Survived'].values == grading['Survived'].values
correct += int(matches.sum())
incorrect += int((~matches).sum())

In [None]:
# Report the share of matching predictions as a percentage. The value is a
# ratio, so the previous wording ("<fraction> survivors") misreported it as a
# count; the message now follows the format of the tuned-model report.
print("You have correctly predicted " + str(correct/(correct+incorrect)*100) + "% of the survivors in the dataset.")

In [None]:
# Show the number of matching predictions.
correct

In [None]:
# Show the number of non-matching predictions.
incorrect

# Results
### So. We have a big overfitting problem. Our training set has a 94% accuracy which reduces to 81% on the test set. When we introduce the prediction data set without the survival column, we have even lower accuracy at 61%. 

In [None]:
#evaluate accuracy and maybe perform cross validation - gridsearchCV
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate hyper-parameter values; the search tries every combination.
params = dict(
    bootstrap=[True],
    max_depth=[80, 90, 100, 110],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300, 1000],
)

# 3-fold cross-validated search starting from the RFC estimator, with
# progress messages enabled.
griddy = GridSearchCV(estimator=RFC, param_grid=params, cv=3, verbose=2)
griddy.fit(X_train, y_train)

# Show the best-scoring parameter combination as the cell output.
griddy.best_params_

In [None]:
# Take the estimator selected by the grid search and measure its mean
# accuracy on the held-out split.
RFCbest = griddy.best_estimator_
griddy_acc = RFCbest.score(X_test, y_test)

In [None]:
# Side-by-side report: untuned forest vs. the grid-search winner, followed by
# the difference on the held-out split in percentage points.
report_lines = [
    f'Base Accuracy (no parameter tuning) on training set: {trainscore}',
    f'Base Accuracy (no parameter tuning) on test set: {testscore}',
    '',
    f'Accuracy after GridSearchCV on test set: {griddy_acc}',
    '',
    f'Gain of accuracy: {(griddy_acc-testscore)*100}%',
]
for line in report_lines:
    # Blank entries only separate groups in the list; they are not printed.
    if line:
        print(line)

# Let's try this hyper-tuned Random Forest Algorithm on our prediction set.

In [None]:
# Store the tuned forest's predictions for the test rows in a new column.
testdf['SurvivedBest'] = RFCbest.predict(testX)

In [None]:
# Select both prediction columns. This is not the last statement of the cell,
# so in a notebook its value is not displayed.
testdf[['SurvivedBest', 'Survived']]

# Write the whole test frame, including the prediction columns, to CSV.
# NOTE(review): hard-coded, user-specific Windows path, while the inputs are
# read from the Kaggle '../input' directory; confirm the intended location.
testdf.to_csv(r'C:\Users\LIDFS61\test.csv')

In [None]:
# Select the reference column. This is not the last statement of the cell,
# so in a notebook its value is not displayed.
grading[['Survived']]

# Write the reference frame to CSV.
# NOTE(review): hard-coded, user-specific Windows path; confirm the intended
# location.
grading.to_csv(r'C:\Users\LIDFS61\grading.csv')

In [None]:
# Tallies for the tuned model's matching and non-matching predictions.
correctpreds, incorrectpreds = 0, 0

In [None]:
# Compare every tuned-model prediction with the reference value in the same
# row. Iterating over testdf['SurvivedBest'] yields the predicted VALUES
# (0 or 1), not row positions, so the old loop only ever compared rows 0 and 1.
best_matches = testdf['SurvivedBest'].values == grading['Survived'].values
correctpreds += int(best_matches.sum())
incorrectpreds += int((~best_matches).sum())

In [None]:
# Spot-check: tuned-model prediction for the row labelled 33.
testdf['SurvivedBest'][33]

In [None]:
# Spot-check: reference value for the row labelled 33.
grading['Survived'][33]

In [None]:
# Spot-check: does the prediction for row 33 equal the reference value?
testdf['SurvivedBest'][33] == grading['Survived'][33]

In [None]:
# Show the number of matching tuned-model predictions.
correctpreds

In [None]:
# Show the number of non-matching tuned-model predictions.
incorrectpreds

In [None]:
# Share of tuned-model predictions that match the reference, as a percentage.
tuned_ratio = correctpreds/(correctpreds+incorrectpreds)
print("You have correctly predicted " + str(tuned_ratio*100) + "% of the survivors in the dataset.")

# Difference between the tuned and untuned match ratios (as a fraction).
# Computed after the first print so evaluation order is unchanged.
base_ratio = correct/(correct+incorrect)
print("This is a gain of " + str(tuned_ratio - base_ratio) + " accuracy." )