In [1]:
import pandas as pd
import numpy as np
from scipy import stats

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# import splitting and imputing functions
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# confusion matrix for model evaluation
from sklearn.metrics import confusion_matrix

# **************************************

# import local files
import env
import acquire
import prepare

import os

# turn off pink boxes: suppress every Python warning so warning text does not clutter cell output
# NOTE(review): a blanket 'ignore' also hides warnings that may matter (e.g. deprecations) —
#               consider narrowing the filter to specific categories
import warnings
warnings.filterwarnings('ignore')

# seed NumPy's global random number generator so NumPy-driven randomness repeats between runs
np.random.seed(123)

In [2]:
# acquiring titanic data
#     get_titanic_data() lives in the local acquire module imported above, so the data
#     source itself is not visible in this notebook
#     NOTE(review): presumably returns a pandas DataFrame of passenger records — confirm in acquire

titanic_df = acquire.get_titanic_data()

In [3]:
# preparing titanic data
#     cleaning
#     clean_data() comes from the local prepare module; its result replaces the acquired
#     frame under the same name, so re-running this cell passes already-cleaned data back
#     into clean_data()
#     NOTE(review): confirm clean_data() is safe to apply twice, or re-run the acquire cell first

titanic_df = prepare.clean_data(titanic_df)

In [12]:
# preview the first rows of titanic_df after cleaning
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [13]:
# summarize column dtypes, non-null counts and memory usage of titanic_df
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   passenger_id             891 non-null    int64  
 1   survived                 891 non-null    int64  
 2   pclass                   891 non-null    int64  
 3   sex                      891 non-null    object 
 4   sibsp                    891 non-null    int64  
 5   parch                    891 non-null    int64  
 6   fare                     891 non-null    float64
 7   embark_town              891 non-null    object 
 8   alone                    891 non-null    int64  
 9   sex_male                 891 non-null    uint8  
 10  embark_town_Queenstown   891 non-null    uint8  
 11  embark_town_Southampton  891 non-null    uint8  
dtypes: float64(1), int64(6), object(2), uint8(3)
memory usage: 72.2+ KB


In [4]:
# preparing titanic data
#     splitting
#     split_data() comes from the local prepare module and returns three subsets, unpacked
#     here in the order train, validate, test
#     NOTE(review): split proportions, stratification and seeding are decided inside
#                   split_data() — confirm there

train, validate, test = prepare.split_data(titanic_df)

In [5]:
# report the (rows, columns) shape of each split, one line per subset
for split_name, split_frame in (('Train', train), ('Validate', validate), ('Test', test)):
    print(f'{split_name}: {split_frame.shape}')

Train: (498, 12)
Validate: (214, 12)
Test: (179, 12)


# 1. Baseline Prediction

In [6]:
# calculate baseline using .value_counts for target variable column
#     the baseline prediction is the most frequent class in train.survived; it is read
#     from the counts instead of being typed into the print statements by hand

survived_counts = train.survived.value_counts()
baseline_prediction = survived_counts.idxmax()

# readable label for each target class (class 0 is reported as 'Did not survive')
outcome_labels = {0: 'Did not survive', 1: 'Survived'}

print(f'Baseline Prediction = {baseline_prediction}')
print(outcome_labels[baseline_prediction])
print()
print(survived_counts)

Baseline Prediction = 0
Did not survive

0    307
1    191
Name: survived, dtype: int64


In [7]:
# create the baseline column: every passenger is predicted as class 0, the most frequent
# class in train.survived
#     .assign() returns a new frame instead of writing a column into the split subset in
#     place, so the cell gives the same result every time it is re-run and cannot trigger
#     pandas' chained-assignment warning

train = train.assign(baseline=0)
train.head(2)

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton,baseline
583,583,0,1,male,0,0,40.125,Cherbourg,1,1,0,0,0
165,165,1,3,male,0,2,20.525,Southampton,0,1,0,1,0


## 1a. Baseline Accuracy

In [8]:
# baseline accuracy: share of training rows whose actual outcome equals the baseline prediction
matches_baseline = train.survived.eq(train.baseline)
baseline_accuracy = matches_baseline.mean()

print(f'Baseline Accuracy: {baseline_accuracy.round(4)}')

Baseline Accuracy: 0.6165


# 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [9]:
# creating x and y versions of train
#     x: the numeric feature columns only, without the target
#     y: a series with just the target variable
#
# columns kept out of x:
#     survived         - the target itself
#     baseline         - constant helper column used only for the baseline accuracy check
#     passenger_id     - row identifier rather than a passenger attribute
#     sex, embark_town - object (string) columns; the frame already carries their encoded
#                        versions (sex_male, embark_town_*), and scikit-learn estimators
#                        cannot fit on string values
# errors = 'ignore' keeps the cell working when a listed column is absent
# (for example, if the baseline column has not been created yet)

non_feature_columns = ['survived', 'baseline', 'passenger_id', 'sex', 'embark_town']

X_train = train.drop(columns = non_feature_columns, errors = 'ignore')
y_train = train.survived

In [10]:
# build the (not yet fitted) decision tree classifier with the chosen hyper-parameters
#     max_depth = 3        caps the depth of the tree
#     random_state = 123   fixes the estimator's internal randomness so fits are repeatable

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier, export_graphviz

tree_settings = {'max_depth': 3, 'random_state': 123}
clf = DecisionTreeClassifier(**tree_settings)

In [11]:
# fitting the decision tree classifier to the training data
# NOTE: the fit call below is commented out, so clf is not trained by this cell

# clf = clf.fit(X_train, y_train)