In [None]:
# Self Notes:
# 1) Sec III - Answer number 5!
# 2) Sec III - Tighten up and seperate out the confusion matrix formula - perhaps generalize it
# 3) Sec III - Get a better grip and practice creating dataframes with results

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

from acquire import get_titanic_data
from prepare import prep_titanic
from splitter import splitter

# I. Decision Tree - Titanic Data

In [2]:
df = get_titanic_data()
df.head(2)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0


In [3]:
df = prep_titanic(df)
df.drop(columns='age', inplace=True)
df.head(2)

Unnamed: 0,survived,pclass,Siblings/Spouses,Parents/Children,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.25,0,1,0,1
1,1,1,1,0,71.2833,0,0,0,0


## 1. What is your baseline prediction?  Baseline accuracy?

In [4]:
df_train, df_validate, df_test = splitter(df, 'survived')
df_train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [5]:
307/(307+191)

0.6164658634538153

#### Baseline prediction is died, which is accurate 61.65% of the time.

## 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [6]:
X_train = df_train.drop(columns=['survived'])
y_train = df_train.survived

X_validate = df_validate.drop(columns=['survived'])
y_validate = df_validate.survived

X_test = df_test.drop(columns=['survived'])
y_test = df_test.survived

In [None]:
clf = DecisionTreeClassifier(max_depth=3, random_state=123)
clf = clf.fit(X_train, y_train)
# this is tpyiall where y_pred = clf.predict(X_train) would live as part of the "3 lines of code"

In [None]:
# plotting figure unecessary but good for funsies or stakeholder visuals at times
plt.figure(figsize=(16, 8))
plot_tree(clf, feature_names=X_train.columns, class_names=clf.classes_.astype(str), rounded=True)

## 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
y_pred = clf.predict(X_train)
y_pred[10:15]

In [None]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[10:15]

In [None]:
# unecessary/redundant - found in dataframe you create below already
confusion_matrix(y_train, y_pred)

In [None]:
pd.DataFrame(confusion_matrix(y_train, y_pred), index=['died','survived'], columns=['died','survived'])

## 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

##### In sample:

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

In [None]:
report = classification_report(y_train, y_pred)
print(report)

##### Validate:

In [None]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

In [None]:
y_pred = clf.predict(X_validate)

print(classification_report(y_validate, y_pred))

In [None]:
# Make function that creates a fun and pretty table when you have time.

## 5. Run through steps 2-4 using a different max_depth value.

### Using max_depth = 7:

##### Train:

In [None]:
clf = DecisionTreeClassifier(max_depth=7, random_state=123)
clf = clf.fit(X_train, y_train)

In [None]:
plt.figure(figsize=(16, 8))
plot_tree(clf, feature_names=X_train.columns, class_names=clf.classes_.astype(str), rounded=True)

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

In [None]:
y_pred = clf.predict(X_train)
print(classification_report(y_train, y_pred))

##### Validate:

In [None]:
print('Accuracy of Decision Tree classifier on validation set: {:.2f}'
      .format(clf.score(X_validate, y_validate)))

In [None]:
y_pred = clf.predict(X_validate)

print(classification_report(y_validate, y_pred))

## 6. Which model performs better on your in-sample data?

The max_steps = 7 is better on in-sample data (as expected, more granular).

## 7. Which model performs best on your out-of-sample data, the validate set?

The max_steps = 3 is better on the validate data.

# II. Decision Tree - Telco Data

In [None]:
from acquire import get_telco_data
from prepare import prep_telco

In [None]:
df = get_telco_data()
df.head(2)

In [None]:
df = prep_telco(df)
df.info()

In [None]:
dropcols = [col for col in df.columns if df[col].dtype == 'object']
dropcols.remove('churn')
dropcols.append('churn_encoded')
dropcols

In [None]:
df = df.drop(columns=dropcols)
df.head(2)

In [None]:
df_train, df_validate, df_test = splitter(df, 'churn')
df_train.churn.value_counts()

In [None]:
#Note: Work more on establishing the baseline with code.  Esp since often a categorical!
2891/(2891+1046)

#### Baseline case is 'not churned' and the baseline accuracy is 73.43%.

In [None]:
#Note - you can also drop all the columns in X-data, including churn, in the drop(columns)
# The way you did it here was redundant!
X_train = df_train.drop(columns=['churn'])
y_train = df_train.churn

X_validate = df_validate.drop(columns=['churn'])
y_validate = df_validate.churn

X_test = df_test.drop(columns=['churn'])
y_test = df_test.churn

In [None]:
clf = DecisionTreeClassifier(max_depth = 3, random_state=123)
clf = clf.fit(X_train, y_train)

In [None]:
plt.figure(figsize=(16, 8))
plot_tree(clf, feature_names=X_train.columns, class_names=clf.classes_.astype(str), rounded=True)

In [None]:
y_pred = clf.predict(X_train)
y_pred[0:5]

In [None]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

In [None]:
labels = sorted(y_train.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

##### In training sample:

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

In [None]:
y_pred = clf.predict(X_train)
print(classification_report(y_train, y_pred))

##### In validation sample:

In [None]:
print('Accuracy of Decision Tree classifier on validation set: {:.2f}'
      .format(clf.score(X_validate, y_validate)))

In [None]:
y_pred = clf.predict(X_validate)
print(classification_report(y_validate, y_pred))

### Using max_depth = 7

In [None]:
clf = DecisionTreeClassifier(max_depth = 7, random_state=123)
clf = clf.fit(X_train, y_train)

In [None]:
plt.figure(figsize=(16, 8))
plot_tree(clf, feature_names=X_train.columns, class_names=clf.classes_.astype(str), rounded=True)

In [None]:
y_pred = clf.predict(X_train)
y_pred[0:5]

In [None]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

In [None]:
labels = sorted(y_train.unique())
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

##### In training sample:

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

In [None]:
y_pred = clf.predict(X_train)
print(classification_report(y_train, y_pred))

##### In validation sample:

In [None]:
print('Accuracy of Decision Tree classifier on validation set: {:.2f}'
      .format(clf.score(X_validate, y_validate)))

In [None]:
y_pred = clf.predict(X_validate)
print(classification_report(y_validate, y_pred))

### Comparison at different max levels:

In [None]:
for i in range(1,9):
    clf = DecisionTreeClassifier(max_depth = i, random_state=123)
    clf = clf.fit(X_train, y_train)
    print(f'Depth = {i}')
    print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))
    print('Accuracy of Decision Tree classifier on validation set: {:.2f}'
      .format(clf.score(X_validate, y_validate)))

#### Model performs best on validation set at a Depth of 3.

In [None]:
# Note: Remember you are not necessarily optimizing for accuracy, could be looking at recall or precision (or even F1)
# A good way to assess train v. validation is by minimizing the distance between the outcome of interest

# III. Random Forest - Titanic Data

##### Note: Used acquire, prep and splitting functions found in first exercise above.

## 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [25]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=10, 
                            min_samples_leaf = 1,
                            random_state=123)

In [26]:
rf = rf.fit(X_train, y_train)

In [27]:
print(X_train.columns)
print(rf.feature_importances_)

Index(['pclass', 'Siblings/Spouses', 'Parents/Children', 'fare', 'alone',
       'sex_male', 'embark_town_Queenstown', 'embark_town_Southampton'],
      dtype='object')
[0.10004896 0.06919781 0.05286698 0.38308923 0.02330269 0.32808955
 0.01527994 0.02812485]


## 2. Evaluate your results using the model score, confusion matrix, and classification report.

### Training data:

In [28]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.94


In [None]:
labels = ['died','survived']
y_pred_train = rf.predict(X_train)
dfrftr = pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)
dfrftr

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

### Validation Data:

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

In [None]:
y_pred_validate = rf.predict(X_validate)
dfrfv = pd.DataFrame(confusion_matrix(y_validate, y_pred_validate), index=labels, columns=labels)
dfrfv

In [None]:
print(classification_report(y_validate, y_pred_validate))

## 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [91]:
def matrix_results(df, TP='NW', actual='rows'):
    if TP=='NW':
        if actual=='rows':
            true_positive = df.iloc[0,0]
            false_positive = df.iloc[1,0]
            true_negative = df.iloc[1,1]
            false_negative = df.iloc[0,1]
        elif actual=='columns':
            true_positive = df.iloc[0,0]
            false_positive = df.iloc[0,1]
            true_negative = df.iloc[1,1]
            false_negative = df.iloc[1,0]
        else:
            print('''If actual results (from training data) is in columns, please include "actual = 'columns'" in the function parameters''')
            return
    elif TP=='SE':
        if actual=='rows':
            true_positive = df.iloc[1,1]
            false_positive = df.iloc[0,1]
            true_negative = df.iloc[0,0]
            false_negative = df.iloc[1,0]
        elif actual=='columns':
            true_positive = df.iloc[1,1]
            false_positive = df.iloc[1,0]
            true_negative = df.iloc[0,0]
            false_negative = df.iloc[0,1]
        else:
            print('''If actual results (from training data) is in columns, please include " actual = 'columns' " in the function parameters''')
            return
    else:
        print('''If True Positive is in the bottom right quadrant, please include " TP = 'SE' " in the function parameters''')
        return
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    accuracy = (true_positive + true_negative) / (true_positive + true_negative + false_positive + false_negative)
    f1_score = (2*precision*recall)/(precision + recall)
    support_pos = true_positive + false_negative
    support_neg = true_negative + false_positive
#     print(f'True Positive = {true_positive}')
#     print(f'False Positive = {false_positive}')
#     print(f'True Negative = {true_negative}')
#     print(f'False Negative = {false_negative}')
#     print(f'Precision = {precision:.2f}')
#     print(f'Recall = {recall:.2f}')
#     print(f'Accuracy = {accuracy:.2f}')
#     print(f'F1 Score = {f1_score:.2f}')
#     print(f'Support, Positive = {support_pos}')
#     print(f'Support, Negative = {support_neg}')
    return {'true_positive':true_positive, 'false_positive':false_positive, 'true_negative':true_negative, 'false_negative':false_negative, 'precision':precision, 'recall':recall, 'accuracy':accuracy, 'f1_score':f1_score, 'support_pos':support_pos, 'support_neg':support_neg}

In [None]:
print('Training Data Results:')
matrix_results(dfrftr)
print('\n----------\n')
print('Validation Data Results:')
matrix_results(dfrfv)

In [None]:
pd.DataFrame(matrix_results(dfrftr))

## 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [None]:
for i in range(10,0,-1):
    for j in range(1,10):
        rf = RandomForestClassifier(max_depth=i, min_samples_leaf = j,random_state=123)
        rf = rf.fit(X_train, y_train)
        y_pred_train = rf.predict(X_train)
        y_pred_validate = rf.predict(X_validate)
        print(f'Depth = {i}, Leaves = {j}')
        print('Accuracy of random forest classifier on training set: {:.2f}'.format(rf.score(X_train, y_train)))
        print('Accuracy of random forest classifier on validation set: {:.2f}'.format(rf.score(X_validate, y_validate)))
#         print('Training Report:')
#         print(classification_report(y_train, y_pred_train))
#         print('Validation Report:')
#         print(classification_report(y_validate, y_pred_validate))
#         print('\n----------\n')

## 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [None]:
# ANSWER

# KNN - Titanic Data

#### Acquired, prepped and split above in section I.
-Baseline is 'died' with an accuracy of 61.65%

## 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [72]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

In [73]:
knn = knn.fit(X_train, y_train)

In [74]:
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)

## 2. Evaluate your results using the model score, confusion matrix, and classification report.

Model Score:

In [75]:
knn.score(X_train, y_train)

0.8072289156626506

Confusion Matrix:

In [76]:
labels = ['died','survived']
results = pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)
results

Unnamed: 0,died,survived
died,265,42
survived,54,137


In [77]:
print(classification_report(y_train, y_pred))
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85       307
           1       0.77      0.72      0.74       191

    accuracy                           0.81       498
   macro avg       0.80      0.79      0.79       498
weighted avg       0.81      0.81      0.81       498



Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.830721,0.765363,0.807229,0.798042,0.805654
recall,0.863192,0.717277,0.807229,0.790235,0.807229
f1-score,0.846645,0.740541,0.807229,0.793593,0.805951
support,307.0,191.0,0.807229,498.0,498.0


## 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [79]:
#pd.DataFrame(matrix_results(results).values(), columns=matrix_results(results).keys())
matrix_results(results)

True Positive = 265
False Positive = 54
True Negative = 137
False Negative = 42
Precision = 0.83
Recall = 0.86
Accuracy = 0.81
F1 Score = 0.85
Support, Positive = 307
Support, Negative = 191


{'true_positive': 265,
 'false_positive': 54,
 'true_negative': 137,
 'false_negative': 42,
 'precision': 0.8307210031347962,
 'recall': 0.8631921824104235,
 'accuracy': 0.8072289156626506,
 'f1_score': 0.8466453674121405,
 'support_pos': 307,
 'support_neg': 191}

## 4. Run through steps 2-3 setting k to 10

In [80]:
knn = KNeighborsClassifier(n_neighbors=10)
knn = knn.fit(X_train, y_train)
y_pred = knn.predict(X_train)
knn.score(X_train, y_train)

0.7831325301204819

In [81]:
labels = ['died','survived']
results = pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)
results

Unnamed: 0,died,survived
died,267,40
survived,68,123


In [83]:
print(classification_report(y_train, y_pred))
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83       307
           1       0.75      0.64      0.69       191

    accuracy                           0.78       498
   macro avg       0.78      0.76      0.76       498
weighted avg       0.78      0.78      0.78       498



Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.797015,0.754601,0.783133,0.775808,0.780748
recall,0.869707,0.643979,0.783133,0.756843,0.783133
f1-score,0.831776,0.694915,0.783133,0.763345,0.779285
support,307.0,191.0,0.783133,498.0,498.0


In [84]:
matrix_results(results)

True Positive = 267
False Positive = 68
True Negative = 123
False Negative = 40
Precision = 0.80
Recall = 0.87
Accuracy = 0.78
F1 Score = 0.83
Support, Positive = 307
Support, Negative = 191


{'true_positive': 267,
 'false_positive': 68,
 'true_negative': 123,
 'false_negative': 40,
 'precision': 0.7970149253731343,
 'recall': 0.8697068403908795,
 'accuracy': 0.7831325301204819,
 'f1_score': 0.8317757009345794,
 'support_pos': 307,
 'support_neg': 191}

## 5. Run through steps 2-3 setting k to 20

In [85]:
knn = KNeighborsClassifier(n_neighbors=20)
knn = knn.fit(X_train, y_train)
y_pred = knn.predict(X_train)
knn.score(X_train, y_train)

0.7369477911646586

In [86]:
labels = ['died','survived']
results = pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)
results

Unnamed: 0,died,survived
died,263,44
survived,87,104


In [89]:
print(classification_report(y_train, y_pred))
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).round(2)

              precision    recall  f1-score   support

           0       0.75      0.86      0.80       307
           1       0.70      0.54      0.61       191

    accuracy                           0.74       498
   macro avg       0.73      0.70      0.71       498
weighted avg       0.73      0.74      0.73       498



Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.75,0.7,0.74,0.73,0.73
recall,0.86,0.54,0.74,0.7,0.74
f1-score,0.8,0.61,0.74,0.71,0.73
support,307.0,191.0,0.74,498.0,498.0


In [88]:
matrix_results(results)

True Positive = 263
False Positive = 87
True Negative = 104
False Negative = 44
Precision = 0.75
Recall = 0.86
Accuracy = 0.74
F1 Score = 0.80
Support, Positive = 307
Support, Negative = 191


{'true_positive': 263,
 'false_positive': 87,
 'true_negative': 104,
 'false_negative': 44,
 'precision': 0.7514285714285714,
 'recall': 0.8566775244299675,
 'accuracy': 0.7369477911646586,
 'f1_score': 0.8006088280060883,
 'support_pos': 307,
 'support_neg': 191}

##### Trying to loop through all k for train data:

In [103]:
metrics = []
labels = ['died','survived']

for i in range(1,25):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn = knn.fit(X_train, y_train)
    y_pred = knn.predict(X_train)
    
    result = pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)
    results = matrix_results(result)
    k = {'k':i}
    k.update(results)
    metrics.append(k)

df = pd.DataFrame(metrics).round(2).set_index('k')
df

Unnamed: 0_level_0,true_positive,false_positive,true_negative,false_negative,precision,recall,accuracy,f1_score,support_pos,support_neg
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,297,21,170,10,0.93,0.97,0.94,0.95,307,191
2,305,67,124,2,0.82,0.99,0.86,0.9,307,191
3,279,40,151,28,0.87,0.91,0.86,0.89,307,191
4,286,70,121,21,0.8,0.93,0.82,0.86,307,191
5,265,54,137,42,0.83,0.86,0.81,0.85,307,191
6,276,71,120,31,0.8,0.9,0.8,0.84,307,191
7,255,55,136,52,0.82,0.83,0.79,0.83,307,191
8,267,68,123,40,0.8,0.87,0.78,0.83,307,191
9,262,58,133,45,0.82,0.85,0.79,0.84,307,191
10,267,68,123,40,0.8,0.87,0.78,0.83,307,191


## 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

#### It seems the lower the k, the better the model performs for overall accuracy.  Interestingly, if we are optimizing for recall (minimizing false negatives) k=2 is far superior.

## 7. Which model performs best on our out-of-sample data from validate?

##### Focusing on accuracy:

In [107]:
differences=[]

for i in range(1,25):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn = knn.fit(X_train, y_train)
    
    in_sample_acc = knn.score(X_train, y_train)
    out_sample_acc = knn.score(X_validate, y_validate)
    
    output = {
        'k' : i,
        'train_accuracy' : in_sample_acc,
        'validation_accuracy' : out_sample_acc
    }
    
    differences.append(output)

ddf = pd.DataFrame(differences).set_index('k')
ddf['difference'] = ddf['train_accuracy']-ddf['validation_accuracy']
ddf.sort_values(by='difference')

Unnamed: 0_level_0,train_accuracy,validation_accuracy,difference
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11,0.785141,0.728972,0.056169
13,0.763052,0.705607,0.057445
12,0.769076,0.705607,0.063469
8,0.783133,0.719626,0.063506
20,0.736948,0.672897,0.064051
9,0.793173,0.728972,0.064201
5,0.807229,0.742991,0.064238
18,0.746988,0.682243,0.064745
7,0.785141,0.719626,0.065514
19,0.738956,0.672897,0.066059


#### While the best k for train was very low, when appying it to the validation data, we see a higher k is better.  In this case k=11.