In [102]:
import acquire as a
import prepare as p

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier, \
export_text, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [103]:
def preprocess_titanic(train_df, val_df, test_df):
    '''
    preprocess_titanic will take in three pandas dataframes
    of our titanic data, expected as cleaned versions of this 
    titanic data set (see documentation on acquire.py and prepare.py)
    
    output:
    encoded, ML-ready versions of our clean data, with 
    columns sex and embark_town encoded in the one-hot fashion
    return: (pd.DataFrame, pd.DataFrame, pd.DataFrame)
    '''
    # with a looping structure:
    # for df in [train_df, val_df, test_df]:
    #     df.drop(blah blah blah)
    #     df['pclass'] = df['pclass'].astype(int)
    train_df = train_df.drop(columns='passenger_id')
    train_df['pclass'] = train_df['pclass'].astype(int)
    val_df = val_df.drop(columns='passenger_id')
    val_df['pclass'] = val_df['pclass'].astype(int)
    test_df = test_df.drop(columns='passenger_id')
    test_df['pclass'] = test_df['pclass'].astype(int)
    encoding_var = ['sex', 'embark_town']
    encoded_dfs = []
    for df in [train_df, val_df, test_df]:
        df_encoded_cats = pd.get_dummies(
            df[['embark_town', 'sex']],
              drop_first=True).astype(int)
        encoded_dfs.append(pd.concat(
            [df,
            df_encoded_cats],
            axis=1).drop(columns=['sex', 'embark_town']))
    return encoded_dfs

In [104]:
#grab, clean, and process data

In [105]:
train, val, test = p.split_data(p.prep_titanic(),'survived')

this file exists, reading from csv


In [106]:
train_encoded, val_encoded, test_encoded = preprocess_titanic(train, val, test)

In [107]:
train_encoded.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
776,0,3,0,0,7.75,1,1,0,1
829,1,1,0,0,80.0,1,0,1,0
215,1,1,1,0,113.275,0,0,0,0
258,1,1,0,0,512.3292,1,0,0,0
129,0,3,0,0,6.975,1,0,1,1


**1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.**

In [108]:
X_train, y_train = train_encoded.drop(columns='survived'), train_encoded.survived

In [109]:
X_validate, y_validate = val_encoded.drop(columns='survived'), val_encoded.survived

In [110]:
X_test, y_test = test_encoded.drop(columns='survived'), test_encoded.survived

In [111]:
X_train.columns

Index(['pclass', 'sibsp', 'parch', 'fare', 'alone', 'embark_town_Queenstown',
       'embark_town_Southampton', 'sex_male'],
      dtype='object')

In [112]:
y_train.head()

776    0
829    1
215    1
258    1
129    0
Name: survived, dtype: int64

In [113]:
clf = DecisionTreeClassifier(max_depth=3, min_samples_leaf=10)
clf
#fit the model with the values
clf.fit(X_train, y_train)
#predict the next 10 values
clf.predict(X_train)[:10]

array([0, 1, 1, 1, 0, 0, 1, 0, 1, 0])

In [114]:
#get the mode (the highest value)
train_encoded.survived.value_counts()

survived
0    329
1    205
Name: count, dtype: int64

In [49]:
#make the baseline accuracy
#'y_true' is the values of our target
#'baseline' is the all zero survival prediction
#'dt_one' is a new model to compare with

y_pred = pd.DataFrame({
    'y_true': y_train.values,
    'baseline':0,
    'dt_one': clf.predict(X_train)
}, index=train_encoded.index)
y_pred.head(2)

Unnamed: 0,y_true,baseline,dt_one
776,0,0,0
829,1,0,1


In [50]:
#accuracy of our baseline to our y_true
baseline_acc = accuracy_score(y_pred.y_true, y_pred.baseline)
baseline_acc

0.6161048689138576

**2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)**

In [51]:
clf.fit(X_train, y_train)
clf.predict(X_train)[:10]

array([0, 1, 1, 1, 0, 0, 1, 0, 1, 0])

In [93]:
clf.score(X_train, y_train)

0.8146067415730337

**3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.**

In [120]:
#model score
clf.score(X_train, y_train)

0.8146067415730337

In [117]:
survived_counts = pd.crosstab(y_train, y_pred.dt_one)
survived_counts

dt_one,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,293,36
1,63,142


In [53]:
print(classification_report(y_train, y_pred.dt_one))

              precision    recall  f1-score   support

           0       0.82      0.89      0.86       329
           1       0.80      0.69      0.74       205

    accuracy                           0.81       534
   macro avg       0.81      0.79      0.80       534
weighted avg       0.81      0.81      0.81       534



**4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.**

In [121]:
pd.crosstab(y_train, y_pred.dt_one, normalize='index')

dt_one,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.890578,0.109422
1,0.307317,0.692683


In [54]:
tp = survived_counts.iloc[0,0]
tn = survived_counts.iloc[0,1]
fp = survived_counts.iloc[1,0]
fn = survived_counts.iloc[1,1]
accuracy = (tp + tn) / (tp + tn + fp + fn)
print(accuracy)
print(classification_report(y_train, y_pred.dt_one))

0.6161048689138576
              precision    recall  f1-score   support

           0       0.82      0.89      0.86       329
           1       0.80      0.69      0.74       205

    accuracy                           0.81       534
   macro avg       0.81      0.79      0.80       534
weighted avg       0.81      0.81      0.81       534



**5. Run through steps 2-4 using a different max_depth value.**

In [55]:
clf2 = DecisionTreeClassifier(max_depth=8, min_samples_leaf=10)
clf2
#fit the model with the values
clf2.fit(X_train, y_train)
#predict the next 10 values
clf2.predict(X_train)[:10]

array([0, 1, 1, 1, 0, 0, 1, 0, 1, 0])

In [61]:
clf2.score(X_train, y_train)

0.8445692883895131

In [56]:
survived_counts2 = pd.crosstab(y_train, clf2.predict(X_train))
survived_counts2

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,302,27
1,56,149


In [57]:
print(classification_report(y_train, clf2.predict(X_train)))

              precision    recall  f1-score   support

           0       0.84      0.92      0.88       329
           1       0.85      0.73      0.78       205

    accuracy                           0.84       534
   macro avg       0.85      0.82      0.83       534
weighted avg       0.84      0.84      0.84       534



In [58]:
tp2 = survived_counts2.iloc[0,0]
tn2 = survived_counts2.iloc[0,1]
fp2 = survived_counts2.iloc[1,0]
fn2 = survived_counts2.iloc[1,1]
accuracy2 = (tp2 + tn2) / (tp2 + tn2 + fp2 + fn2)
print(accuracy2)

0.6161048689138576


**6. Which model performs better on your in-sample data?**

In [None]:
#The second model had better accuracy from the classification report

**7. Which model performs best on your out-of-sample data, the validate set?**

In [67]:
clf3 = DecisionTreeClassifier(max_depth=3, min_samples_leaf=10)
#fit the model with the values
clf3.fit(X_validate, y_validate)
#predict the next 10 values
clf3.predict(X_validate)[:10]

array([0, 0, 0, 1, 1, 0, 0, 1, 0, 0])

In [68]:
clf3.score(X_validate, y_validate)

0.797752808988764

In [69]:
survived_counts3 = pd.crosstab(y_validate, clf3.predict(X_validate))
survived_counts3

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,93,17
1,19,49


In [71]:
print(classification_report(y_validate, clf3.predict(X_validate)))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84       110
           1       0.74      0.72      0.73        68

    accuracy                           0.80       178
   macro avg       0.79      0.78      0.78       178
weighted avg       0.80      0.80      0.80       178



In [90]:
clf4 = DecisionTreeClassifier(max_depth=8, min_samples_leaf=10)
#fit the model with the values
clf4.fit(X_validate, y_validate)
#predict the next 10 values
clf4.predict(X_validate)[:10]

array([0, 0, 0, 1, 1, 0, 0, 1, 0, 0])

In [91]:
clf4.score(X_validate, y_validate)

0.797752808988764

In [92]:
survived_counts4 = pd.crosstab(y_validate, clf4.predict(X_validate))
survived_counts4

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,93,17
1,19,49


In [87]:
print(classification_report(y_validate, clf4.predict(X_validate)))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84       110
           1       0.74      0.72      0.73        68

    accuracy                           0.80       178
   macro avg       0.79      0.78      0.78       178
weighted avg       0.80      0.80      0.80       178

