# Import Packages

In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Load Data

In [50]:
# Load seaborn's "titanic" example dataset into a DataFrame.
# NOTE(review): load_dataset presumably fetches the file on first use — confirm
# network access (or a local cache) before a fresh Restart & Run All.
Titanic = sns.load_dataset('titanic')

# Data Wrangling

## Drop the missing values

In [51]:
# Drop every row that has a missing value in any column.
# NOTE(review): this mutates Titanic in place, so the raw frame is no longer
# available to later cells. The info() output further down reports only 182 rows
# remaining; presumably most of the loss comes from missing deck values —
# confirm with Titanic.isna().sum() before relying on this subset.

Titanic.dropna(inplace=True)

## Recoding the string data

In [52]:
# Recoding embark_town to R_embark_town


def town_recode(town):
    """Translate an embarkation town name into its integer code.

    Southampton -> 0, Cherbourg -> 1, Queenstown -> 2.
    Any other value (including a missing value) yields None.
    """
    town_codes = (("Southampton", 0), ("Cherbourg", 1), ("Queenstown", 2))
    for town_name, code in town_codes:
        if town == town_name:
            return code
    return None
    
# Apply the recode element-wise to build the numeric column. Any town name that
# town_recode does not recognize comes back as None, i.e. a missing value.
Titanic['R_embark_town'] = Titanic['embark_town'].apply(town_recode)

In [53]:
# Recoding sex to SexM0F1

def sex_recode(MorF):
    """Encode sex as an integer: "male" -> 0, "female" -> 1.

    Any other value (including a missing value) yields None.
    """
    return 0 if MorF == "male" else (1 if MorF == "female" else None)
    
# Apply the recode element-wise; the column name records the coding (M = 0, F = 1).
Titanic['SexM0F1'] = Titanic['sex'].apply(sex_recode)


In [54]:
# Recoding deck to R_deck

# Count how many passengers fall on each deck letter.
# NOTE(review): this is not the last expression in the cell, so Jupyter does not
# display its result; move it to its own cell (or wrap it in display()) if the
# counts are meant to be shown. Presumably the integer codes assigned below
# follow this frequency ordering — confirm against the counts.
Titanic.deck.value_counts()

def deck_recode(deckval):
    """Translate a deck letter into its integer code.

    The codes follow the fixed order C, B, D, E, A, F, G -> 0..6.
    Any other value (including a missing value) yields None.
    """
    deck_order = ("C", "B", "D", "E", "A", "F", "G")
    for code, letter in enumerate(deck_order):
        if deckval == letter:
            return code
    return None
    
# Apply the recode element-wise. The info() output further down reports R_deck
# as a category dtype, which is why it is cast to int in a later cell.
Titanic['R_deck'] = Titanic['deck'].apply(deck_recode)



## Trimming the data to keep useful variables/ Drop any variables that are redundant.


In [55]:
# Drop the original string columns that were recoded above, plus columns that
# presumably duplicate information held elsewhere (confirm each one). 'alive' is
# kept because it is used as the prediction target later in the notebook.
# NOTE(review): the captured info() output below lists 'survived' and omits
# 'alive', so it appears to predate this drop list — re-run that cell.
TitanicTrimmed = Titanic.drop(['sex','embarked','class','who','adult_male','deck','embark_town','survived','alone'], axis=1)


## Changing float to int

In [45]:
TitanicTrimmed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 182 entries, 1 to 889
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   survived       182 non-null    int64   
 1   pclass         182 non-null    int64   
 2   age            182 non-null    float64 
 3   sibsp          182 non-null    int64   
 4   parch          182 non-null    int64   
 5   fare           182 non-null    float64 
 6   R_embark_town  182 non-null    int64   
 7   SexM0F1        182 non-null    int64   
 8   R_deck         182 non-null    category
dtypes: category(1), float64(2), int64(6)
memory usage: 13.3 KB


In [56]:
# Cast the listed columns to plain integers. R_deck is reported as a category
# dtype by info(), so it needs converting to a numeric type.
# NOTE(review): astype(int) truncates the fractional part of age and fare
# (fares appear as whole numbers in the head() output below). Tree-based models
# presumably accept float inputs, so consider keeping these two as floats.
TitanicTrimmed.R_deck = TitanicTrimmed.R_deck.astype(int)
TitanicTrimmed.age = TitanicTrimmed.age.astype(int)
TitanicTrimmed.fare = TitanicTrimmed.fare.astype(int)
TitanicTrimmed.R_embark_town = TitanicTrimmed.R_embark_town.astype(int)


In [57]:
TitanicTrimmed.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,alive,R_embark_town,SexM0F1,R_deck
1,1,38,1,0,71,yes,1,1,0
3,1,35,1,0,53,yes,0,1,0
6,1,54,0,0,51,no,0,0,3
10,3,4,1,1,16,yes,0,1,6
11,1,58,0,0,26,yes,0,1,0


# PART 1
## Create a decision tree model that predicts survival, using the Titanic dataset from seaborn.

In [58]:
# Question setup

# Specify the x and y variables by subsetting the trimmed frame.
# y is the column being predicted; x holds every column used to predict it.
# 'alive' contains the string labels 'yes' / 'no', so the models below are
# trained on, and predict, those strings.

x = TitanicTrimmed[['pclass', 'age', 'sibsp', 'parch', 'fare', 'R_embark_town', 'SexM0F1', 'R_deck']]
y = TitanicTrimmed['alive']

## Train Test Split

In [59]:
# Hold out 30% of the rows as a test set. random_state fixes the shuffle so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=76)

## Create Initial Decision Tree

In [62]:
# Create a DecisionTreeClassifier and fit() it on the training split.
# random_state is fixed so the fitted tree is reproducible between runs.

decisionTree = DecisionTreeClassifier(random_state=76)
decisionTree.fit(x_train, y_train)

DecisionTreeClassifier(random_state=76)

## Assess the model

In [63]:
# With the model fitted, predict a label for every row of the held-out test set
# and print the raw array of predictions; it is interpreted in the cells below.

treePredictions = decisionTree.predict(x_test)
print(treePredictions)


['yes' 'yes' 'yes' 'no' 'yes' 'no' 'no' 'no' 'no' 'yes' 'yes' 'yes' 'yes'
 'no' 'yes' 'no' 'yes' 'no' 'yes' 'no' 'yes' 'yes' 'yes' 'yes' 'yes' 'no'
 'no' 'no' 'yes' 'no' 'yes' 'yes' 'yes' 'no' 'yes' 'yes' 'no' 'yes' 'no'
 'yes' 'yes' 'yes' 'no' 'yes' 'yes' 'yes' 'yes' 'no' 'yes' 'yes' 'no'
 'yes' 'yes' 'yes' 'yes']


In [64]:
# The raw array of predictions above is hard to interpret, so summarize it with
# confusion_matrix() here and classification_report() in a later cell.
# In scikit-learn's confusion matrix, rows are the actual labels and columns are
# the predicted labels, with labels in sorted order ('no' first, then 'yes').

print(confusion_matrix(y_test, treePredictions))

[[12  7]
 [ 7 29]]


In [None]:
# Alive           No (predicted)   Yes (predicted)
# No (actual)     12               7
# Yes (actual)    7                29

# scikit-learn puts the actual labels on the rows and the predicted labels on the columns.
# So 12 passengers who died were correctly classified as Dead, and 7 passengers who died
# were misclassified as Alive. 29 survivors were correctly classified as Alive, and
# 7 survivors were misclassified as Dead.

## How Well Does your Model Fit?

In [65]:
# Per-class precision, recall, and F1-score for the decision tree on the test set.
print(classification_report(y_test, treePredictions))

              precision    recall  f1-score   support

          no       0.63      0.63      0.63        19
         yes       0.81      0.81      0.81        36

    accuracy                           0.75        55
   macro avg       0.72      0.72      0.72        55
weighted avg       0.75      0.75      0.75        55



In [None]:
# The precision column shows that 63% of the passengers the model predicted as Dead
# actually died, and 81% of the passengers it predicted as Alive actually survived.
# The weighted avg row gives an overall precision of 75%.


# Part 2

## Now create a random forest model of the Titanic dataset that predicts survival.

## Initial Random Forest Model

In [66]:
# Use RandomForestClassifier(), with n_estimators= specifying how many decision
# trees the random forest is built from, and random_state= fixed so the results
# are reproducible and match this write-up.

forest = RandomForestClassifier(n_estimators=500, random_state=76)
forest.fit(x_train, y_train)

RandomForestClassifier(n_estimators=500, random_state=76)

## Evaluate Model Fit

In [67]:
# The final step: predict labels for the test set with the random forest, then
# print its confusion matrix (rows = actual, columns = predicted) and its
# per-class classification report.

forestPredictions = forest.predict(x_test)
print(confusion_matrix(y_test, forestPredictions))
print(classification_report(y_test, forestPredictions))

[[12  7]
 [ 6 30]]
              precision    recall  f1-score   support

          no       0.67      0.63      0.65        19
         yes       0.81      0.83      0.82        36

    accuracy                           0.76        55
   macro avg       0.74      0.73      0.74        55
weighted avg       0.76      0.76      0.76        55



In [None]:
# The output above shows that both methods give almost the same results on the Titanic dataset,
# with the random forest being slightly better: it correctly classifies one more survivor
# than the decision tree (30 instead of 29).

# The confusion matrix (rows = actual, columns = predicted) tells us that
# 12 passengers who died were correctly classified as Dead, and 7 passengers who died
# were misclassified as Alive. 30 survivors were correctly classified as Alive, and
# 6 survivors were misclassified as Dead.

# The precision column shows that 67% of the passengers the model predicted as Dead
# actually died, and 81% of the passengers it predicted as Alive actually survived.
# The weighted avg row gives an overall precision of 76%.
