In [1]:
import prepare
import acquire
import exploration
import scipy.stats as stats
import pandas as pd
import numpy as np
import seaborn as sns
import os
import split
import matplotlib.pyplot as plt
import itertools

from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import modeling as md

alpha = 0.05

# I wrote several functions in exploration.py and modeling.py to help 
# with formatting of certain results, and grouping important things
# like classification reports, confusion matrices,

# Decision Tree Exercises

In [2]:
titanic_df, categories, quant_cols = prepare.acquire_prep_titanic()
target_var = 'survived'
train, validate, test = split.train_validate_test_split(titanic_df, target_var)

md.impute_value(train, validate, test, col_names=['age'])


train = exploration.dataset_reduction(train, target_var, categories, quant_cols)
validate = validate[train.columns]
train

['age', 'fare']
Categories related to survived:
pclass
sibsp
parch
alone
embarked_S
sex_male


Unnamed: 0,age,fare,pclass,sibsp,parch,alone,embarked_S,sex_male,survived
583,36.0,40.1250,1,0,0,1,0,1,0
165,9.0,20.5250,3,0,2,0,1,1,1
50,7.0,39.6875,3,4,1,0,1,1,0
259,50.0,26.0000,2,0,1,0,1,0,1
306,28.0,110.8833,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
313,28.0,7.8958,3,0,0,1,1,1,0
636,32.0,7.9250,3,0,0,1,1,1,0
222,51.0,8.0500,3,0,0,1,1,1,0
485,28.0,25.4667,3,3,1,0,1,0,0


## Q1: What is your baseline prediction? What is your baseline accuracy?

In [3]:
baseline = 1 - train.survived.mean()
baseline

0.6164658634538153

## Q2: Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [4]:
results, reports = md.decision_tree(train, validate, test, target_var, depth=3)
dec_tree_df = md.Results(results, reports, target_var)
dec_tree_df.summary.drop(columns='validate_accuracy')

Unnamed: 0,model_type,depth,train_accuracy,difference,percent_diff
0,decision_tree,3,0.825301,0.026236,3.18


## Q3: Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [5]:
print(f'Accuracy: {dec_tree_df.summary.train_accuracy}')
print(dec_tree_df.train_report(0))
print(dec_tree_df.train_confusion_matrix())

Accuracy: 0    0.825301
Name: train_accuracy, dtype: float64
Train report for index 0:
               precision    recall  f1-score     support
0              0.829341  0.902280  0.864275  307.000000
1              0.817073  0.701571  0.754930  191.000000
accuracy       0.825301  0.825301  0.825301    0.825301
macro avg      0.823207  0.801925  0.809602  498.000000
weighted avg   0.824636  0.825301  0.822337  498.000000 

None
Train report for index 0:
      0    1
0  277   30
1   57  134 

None


## Q4 Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



In [6]:
dec_tree_df.train_report(0)

Train report for index 0:
               precision    recall  f1-score     support
0              0.829341  0.902280  0.864275  307.000000
1              0.817073  0.701571  0.754930  191.000000
accuracy       0.825301  0.825301  0.825301    0.825301
macro avg      0.823207  0.801925  0.809602  498.000000
weighted avg   0.824636  0.825301  0.822337  498.000000 



## Run through steps 2-4 using a different max_depth value.



In [7]:
results, reports = md.decision_tree(train, validate, test, target_var, depth = 5, loop=True)
dec_tree_df = md.Results(results, reports, target_var)
dec_tree_df.summary

Unnamed: 0,model_type,depth,train_accuracy,validate_accuracy,difference,percent_diff
0,decision_tree,1,0.799197,0.761682,0.037515,4.69
1,decision_tree,2,0.799197,0.761682,0.037515,4.69
2,decision_tree,3,0.825301,0.799065,0.026236,3.18
3,decision_tree,4,0.835341,0.794393,0.040949,4.9
4,decision_tree,5,0.853414,0.803738,0.049675,5.82


In [8]:
print(f'Accuracy: {dec_tree_df.summary[["depth", "train_accuracy"]]}')
print(dec_tree_df.train_report(0))
print(dec_tree_df.train_confusion_matrix())

Accuracy:    depth  train_accuracy
0      1        0.799197
1      2        0.799197
2      3        0.825301
3      4        0.835341
4      5        0.853414
Train report for index 0:
               precision    recall  f1-score     support
0              0.820433  0.863192  0.841270  307.000000
1              0.760000  0.696335  0.726776  191.000000
accuracy       0.799197  0.799197  0.799197    0.799197
macro avg      0.790217  0.779764  0.784023  498.000000
weighted avg   0.797255  0.799197  0.797358  498.000000 

Train report for index 1:
               precision    recall  f1-score     support
0              0.820433  0.863192  0.841270  307.000000
1              0.760000  0.696335  0.726776  191.000000
accuracy       0.799197  0.799197  0.799197    0.799197
macro avg      0.790217  0.779764  0.784023  498.000000
weighted avg   0.797255  0.799197  0.797358  498.000000 

Train report for index 2:
               precision    recall  f1-score     support
0              0.829341  0.

 ## Which model performs better on your in-sample data?



A decision tree with more levels does better on in-sample data. This is probably due to overfitting.

## Which model performs best on your out-of-sample data, the validate set?

Models with depth 3 and 4 seem to do well without over-fitting to the training data.

# Random Forests Exercises

In [9]:
titanic_df, categories, quant_cols = prepare.acquire_prep_titanic()
target_var = 'survived'
train, validate, test = split.train_validate_test_split(titanic_df, target_var)

md.impute_value(train, validate, test, col_names=['age'])


train = exploration.dataset_reduction(train, target_var, categories, quant_cols)
validate = validate[train.columns]

['age', 'fare']
Categories related to survived:
pclass
sibsp
parch
alone
embarked_S
sex_male


In [10]:
train

Unnamed: 0,age,fare,pclass,sibsp,parch,alone,embarked_S,sex_male,survived
583,36.0,40.1250,1,0,0,1,0,1,0
165,9.0,20.5250,3,0,2,0,1,1,1
50,7.0,39.6875,3,4,1,0,1,1,0
259,50.0,26.0000,2,0,1,0,1,0,1
306,28.0,110.8833,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
313,28.0,7.8958,3,0,0,1,1,1,0
636,32.0,7.9250,3,0,0,1,1,1,0
222,51.0,8.0500,3,0,0,1,1,1,0
485,28.0,25.4667,3,3,1,0,1,0,0


In [11]:
validate

Unnamed: 0,age,fare,pclass,sibsp,parch,alone,embarked_S,sex_male,survived
610,39.0,31.2750,3,1,5,0,1,0,0
424,18.0,20.2125,3,1,1,0,1,1,0
568,28.0,7.2292,3,0,0,1,0,1,0
334,28.0,133.6500,1,1,0,0,1,0,1
101,28.0,7.8958,3,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...
176,28.0,25.4667,3,3,1,0,1,1,0
372,19.0,8.0500,3,0,0,1,1,1,0
737,35.0,512.3292,1,0,0,1,0,1,1
862,48.0,25.9292,1,0,0,1,1,0,1


## Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.



In [12]:
results, reports = md.random_forests(train, validate, test, target_var, min_sample_leaf=1,depth= 10, inverse=False)
rf_df = md.Results(results, reports, target_var)
depth_ten = rf_df.summary.depth==10
summary = rf_df.summary[depth_ten]
summary


Unnamed: 0,model_type,depth,min_samples_leaf,train_accuracy,validate_accuracy,difference,percent_diff
9,random_forests,10,1,0.96988,0.813084,0.156795,16.17


## Evaluate your results using the model score, confusion matrix, and classification report.




In [13]:
print(f'Train Accuracy: {summary.train_accuracy}')
print(f'Validate Accuracy: {summary.validate_accuracy}')
print(rf_df.report(9))
rf_df.train_confusion_matrix(9)

Train Accuracy: 9    0.96988
Name: train_accuracy, dtype: float64
Validate Accuracy: 9    0.813084
Name: validate_accuracy, dtype: float64
Training report for index 9:
               precision    recall  f1-score    support
0              0.953416  1.000000  0.976153  307.00000
1              1.000000  0.921466  0.959128  191.00000
accuracy       0.969880  0.969880  0.969880    0.96988
macro avg      0.976708  0.960733  0.967640  498.00000
weighted avg   0.971283  0.969880  0.969623  498.00000 

Validate report for index 9:
               precision    recall  f1-score     support
0              0.828571  0.878788  0.852941  132.000000
1              0.783784  0.707317  0.743590   82.000000
accuracy       0.813084  0.813084  0.813084    0.813084
macro avg      0.806178  0.793052  0.798265  214.000000
weighted avg   0.811410  0.813084  0.811040  214.000000 

(              precision    recall  f1-score    support
0              0.953416  1.000000  0.976153  307.00000
1              1.000

Unnamed: 0,0,1
0,307,0
1,15,176


## Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



In [14]:
rf_df.report(9)

Training report for index 9:
               precision    recall  f1-score    support
0              0.953416  1.000000  0.976153  307.00000
1              1.000000  0.921466  0.959128  191.00000
accuracy       0.969880  0.969880  0.969880    0.96988
macro avg      0.976708  0.960733  0.967640  498.00000
weighted avg   0.971283  0.969880  0.969623  498.00000 

Validate report for index 9:
               precision    recall  f1-score     support
0              0.828571  0.878788  0.852941  132.000000
1              0.783784  0.707317  0.743590   82.000000
accuracy       0.813084  0.813084  0.813084    0.813084
macro avg      0.806178  0.793052  0.798265  214.000000
weighted avg   0.811410  0.813084  0.811040  214.000000 



(              precision    recall  f1-score    support
 0              0.953416  1.000000  0.976153  307.00000
 1              1.000000  0.921466  0.959128  191.00000
 accuracy       0.969880  0.969880  0.969880    0.96988
 macro avg      0.976708  0.960733  0.967640  498.00000
 weighted avg   0.971283  0.969880  0.969623  498.00000,
               precision    recall  f1-score     support
 0              0.828571  0.878788  0.852941  132.000000
 1              0.783784  0.707317  0.743590   82.000000
 accuracy       0.813084  0.813084  0.813084    0.813084
 macro avg      0.806178  0.793052  0.798265  214.000000
 weighted avg   0.811410  0.813084  0.811040  214.000000)

## Run through steps increasing your min_samples_leaf and decreasing your max_depth.



In [15]:
results, reports = md.random_forests(train, validate, test, target_var, depth=10, inverse=True)
rf_df = md.Results(results, reports, target_var)
rf_df.df

Unnamed: 0,depth,min_samples_leaf,train_accuracy,validate_accuracy,difference,percent_diff,classification_report_validate,classification_report_train,model_type
0,1,10,0.76506,0.747664,0.017397,2.27,precision recall f1-score ...,precision recall f1-score ...,random_forests
1,2,9,0.795181,0.771028,0.024153,3.04,precision recall f1-score ...,precision recall f1-score ...,random_forests
2,3,8,0.823293,0.785047,0.038246,4.65,precision recall f1-score ...,precision recall f1-score ...,random_forests
3,4,7,0.845382,0.803738,0.041643,4.93,precision recall f1-score ...,precision recall f1-score ...,random_forests
4,5,6,0.855422,0.813084,0.042338,4.95,precision recall f1-score ...,precision recall f1-score ...,random_forests
5,6,5,0.859438,0.803738,0.055699,6.48,precision recall f1-score ...,precision recall f1-score ...,random_forests
6,7,4,0.879518,0.808411,0.071107,8.08,precision recall f1-score ...,precision recall f1-score ...,random_forests
7,8,3,0.901606,0.813084,0.088522,9.82,precision recall f1-score ...,precision recall f1-score ...,random_forests
8,9,2,0.915663,0.836449,0.079214,8.65,precision recall f1-score ...,precision recall f1-score ...,random_forests
9,10,1,0.96988,0.813084,0.156795,16.17,precision recall f1-score ...,precision recall f1-score ...,random_forests


## What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

Depth of 10 and min_sample_leaf of 1 performs better on in-sample data
due to overfitting.

## After making a few models, which one has the best performance (or closest metrics) on both train and validate?

Best performance: I'd pick depth 2, min_sample_leaf 9. Even though the metrics aren't as close together as depth 1 min_sample_leaf 10, it is 
several percent higher in training and validate accuracy, while still
maintaining a relative closeness in metrics.

# K Nearest Neighbors Exercises

In [16]:
titanic_df, categories, quant_cols = prepare.acquire_prep_titanic()
target_var = 'survived'
train, validate, test = split.train_validate_test_split(titanic_df, target_var)

md.impute_value(train, validate, test, col_names=['age'])


train = exploration.dataset_reduction(train, target_var, categories, quant_cols)
validate = validate[train.columns]
train

['age', 'fare']
Categories related to survived:
pclass
sibsp
parch
alone
embarked_S
sex_male


Unnamed: 0,age,fare,pclass,sibsp,parch,alone,embarked_S,sex_male,survived
583,36.0,40.1250,1,0,0,1,0,1,0
165,9.0,20.5250,3,0,2,0,1,1,1
50,7.0,39.6875,3,4,1,0,1,1,0
259,50.0,26.0000,2,0,1,0,1,0,1
306,28.0,110.8833,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
313,28.0,7.8958,3,0,0,1,1,1,0
636,32.0,7.9250,3,0,0,1,1,1,0
222,51.0,8.0500,3,0,0,1,1,1,0
485,28.0,25.4667,3,3,1,0,1,0,0


## Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [17]:
n_neighbor = [5]
results, reports = md.knearest_neightbors(train, validate, test, target_var, 
                                  n_neighbors=n_neighbor)

knn_df = md.Results(results, reports, target_var)
knn_df.set_baseline((train.survived==0).mean())
knn_df.summary

Unnamed: 0,model_type,n_nearest_neighbor,train_accuracy,validate_accuracy,difference,percent_diff
0,knn_uniform,5,0.793173,0.700935,0.092238,11.63
1,knn_distance,5,0.993976,0.742991,0.250985,25.25


## Evaluate your results using the model score, confusion matrix, and classification report.

In [18]:
print(f'Training accuracy:\n {knn_df.summary.train_accuracy}')
print(f'Validate accuracy:\n {knn_df.summary.validate_accuracy}')

print(knn_df.report())
print(knn_df.train_confusion_matrix())
print(knn_df.validate_confusion_matrix())
knn_df.summary

Training accuracy:
 0    0.793173
1    0.993976
Name: train_accuracy, dtype: float64
Validate accuracy:
 0    0.700935
1    0.742991
Name: validate_accuracy, dtype: float64
Train report for index 0:
               precision    recall  f1-score     support
0              0.833333  0.830619  0.831974  307.000000
1              0.729167  0.732984  0.731070  191.000000
accuracy       0.793173  0.793173  0.793173    0.793173
macro avg      0.781250  0.781802  0.781522  498.000000
weighted avg   0.793382  0.793173  0.793274  498.000000 

Validate report for index 0:
               precision    recall  f1-score     support
0              0.750000  0.772727  0.761194  132.000000
1              0.615385  0.585366  0.600000   82.000000
accuracy       0.700935  0.700935  0.700935    0.700935
macro avg      0.682692  0.679047  0.680597  214.000000
weighted avg   0.698418  0.700935  0.699428  214.000000 

Train report for index 1:
               precision    recall  f1-score     support
0          

Unnamed: 0,model_type,n_nearest_neighbor,train_accuracy,validate_accuracy,difference,percent_diff
0,knn_uniform,5,0.793173,0.700935,0.092238,11.63
1,knn_distance,5,0.993976,0.742991,0.250985,25.25


## Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [19]:
knn_df.report()

Train report for index 0:
               precision    recall  f1-score     support
0              0.833333  0.830619  0.831974  307.000000
1              0.729167  0.732984  0.731070  191.000000
accuracy       0.793173  0.793173  0.793173    0.793173
macro avg      0.781250  0.781802  0.781522  498.000000
weighted avg   0.793382  0.793173  0.793274  498.000000 

Validate report for index 0:
               precision    recall  f1-score     support
0              0.750000  0.772727  0.761194  132.000000
1              0.615385  0.585366  0.600000   82.000000
accuracy       0.700935  0.700935  0.700935    0.700935
macro avg      0.682692  0.679047  0.680597  214.000000
weighted avg   0.698418  0.700935  0.699428  214.000000 

Train report for index 1:
               precision    recall  f1-score     support
0              0.990323  1.000000  0.995138  307.000000
1              1.000000  0.984293  0.992084  191.000000
accuracy       0.993976  0.993976  0.993976    0.993976
macro avg      0

## Run through steps 2-4 setting k to 10



In [20]:
n_neighbor = [5, 10, 20]
results, reports = md.knearest_neightbors(train, validate, test,
                                                target_var,
                                               n_neighbors=n_neighbor)
knn_df = md.Results(results, reports, target_var)
knn_df.summary

Unnamed: 0,model_type,n_nearest_neighbor,train_accuracy,validate_accuracy,difference,percent_diff
0,knn_uniform,5,0.793173,0.700935,0.092238,11.63
1,knn_uniform,10,0.748996,0.71028,0.038716,5.17
2,knn_uniform,20,0.728916,0.728972,-5.6e-05,-0.01
3,knn_distance,5,0.993976,0.742991,0.250985,25.25
4,knn_distance,10,0.993976,0.742991,0.250985,25.25
5,knn_distance,20,0.993976,0.738318,0.255658,25.72


#### Dropping knn_distance models due to substantial overfitting.

In [21]:
summary = knn_df.summary[knn_df.summary.model_type == 'knn_uniform']
summary

Unnamed: 0,model_type,n_nearest_neighbor,train_accuracy,validate_accuracy,difference,percent_diff
0,knn_uniform,5,0.793173,0.700935,0.092238,11.63
1,knn_uniform,10,0.748996,0.71028,0.038716,5.17
2,knn_uniform,20,0.728916,0.728972,-5.6e-05,-0.01


In [22]:
# need to fix function to accept 0 without printing all
# also would like to pass list for specific indexes
knn_df.report(0)

Train report for index 0:
               precision    recall  f1-score     support
0              0.833333  0.830619  0.831974  307.000000
1              0.729167  0.732984  0.731070  191.000000
accuracy       0.793173  0.793173  0.793173    0.793173
macro avg      0.781250  0.781802  0.781522  498.000000
weighted avg   0.793382  0.793173  0.793274  498.000000 

Validate report for index 0:
               precision    recall  f1-score     support
0              0.750000  0.772727  0.761194  132.000000
1              0.615385  0.585366  0.600000   82.000000
accuracy       0.700935  0.700935  0.700935    0.700935
macro avg      0.682692  0.679047  0.680597  214.000000
weighted avg   0.698418  0.700935  0.699428  214.000000 

Train report for index 1:
               precision    recall  f1-score     support
0              0.745946  0.899023  0.815362  307.000000
1              0.757812  0.507853  0.608150  191.000000
accuracy       0.748996  0.748996  0.748996    0.748996
macro avg      0

## What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?



#### Knn with weight as distance has nearly 100% accuracy due to massive overfitting.

## Which model performs best on our out-of-sample data from validate?



knn_uniform with n == 20 has the best validate performance, its validate acuually performed slightly better than training.

# Logistic Regression

In [23]:
train

Unnamed: 0,age,fare,pclass,sibsp,parch,alone,embarked_S,sex_male,survived
583,36.0,40.1250,1,0,0,1,0,1,0
165,9.0,20.5250,3,0,2,0,1,1,1
50,7.0,39.6875,3,4,1,0,1,1,0
259,50.0,26.0000,2,0,1,0,1,0,1
306,28.0,110.8833,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
313,28.0,7.8958,3,0,0,1,1,1,0
636,32.0,7.9250,3,0,0,1,1,1,0
222,51.0,8.0500,3,0,0,1,1,1,0
485,28.0,25.4667,3,3,1,0,1,0,0


In [24]:
validate

Unnamed: 0,age,fare,pclass,sibsp,parch,alone,embarked_S,sex_male,survived
610,39.0,31.2750,3,1,5,0,1,0,0
424,18.0,20.2125,3,1,1,0,1,1,0
568,28.0,7.2292,3,0,0,1,0,1,0
334,28.0,133.6500,1,1,0,0,1,0,1
101,28.0,7.8958,3,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...
176,28.0,25.4667,3,3,1,0,1,1,0
372,19.0,8.0500,3,0,0,1,1,1,0
737,35.0,512.3292,1,0,0,1,0,1,1
862,48.0,25.9292,1,0,0,1,1,0,1


## Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [25]:
# age_fare_plcass -> afp
afp = ['age', 'fare', 'pclass', target_var]
afp_train = train[afp]
afp_validate = validate[afp]
afp_test = test[afp]

afp_train

Unnamed: 0,age,fare,pclass,survived
583,36.0,40.1250,1,0
165,9.0,20.5250,3,1
50,7.0,39.6875,3,0
259,50.0,26.0000,2,1
306,28.0,110.8833,1,1
...,...,...,...,...
313,28.0,7.8958,3,0
636,32.0,7.9250,3,0
222,51.0,8.0500,3,0
485,28.0,25.4667,3,0


In [26]:
afp_validate

Unnamed: 0,age,fare,pclass,survived
610,39.0,31.2750,3,0
424,18.0,20.2125,3,0
568,28.0,7.2292,3,0
334,28.0,133.6500,1,1
101,28.0,7.8958,3,0
...,...,...,...,...
176,28.0,25.4667,3,0
372,19.0,8.0500,3,0
737,35.0,512.3292,1,1
862,48.0,25.9292,1,1


In [27]:
results, reports = md.logistic_regression(afp_train, afp_validate,
                                         afp_test, target_var, c=2,
                                         solver='lbfgs')
afp_logit = md.Results(results, reports, target_var)
afp_logit.summary

Unnamed: 0,model_type,C,train_accuracy,validate_accuracy,difference,percent_diff
0,logistic_regression,0.1,0.674699,0.696262,-0.021563,-3.2
1,logistic_regression,0.2,0.674699,0.696262,-0.021563,-3.2
2,logistic_regression,0.3,0.674699,0.696262,-0.021563,-3.2
3,logistic_regression,0.4,0.674699,0.696262,-0.021563,-3.2
4,logistic_regression,0.5,0.674699,0.696262,-0.021563,-3.2
5,logistic_regression,0.6,0.674699,0.696262,-0.021563,-3.2
6,logistic_regression,0.7,0.674699,0.696262,-0.021563,-3.2
7,logistic_regression,0.8,0.674699,0.696262,-0.021563,-3.2
8,logistic_regression,0.9,0.674699,0.696262,-0.021563,-3.2
9,logistic_regression,1.0,0.674699,0.696262,-0.021563,-3.2


### Answer

Yes, a model only including age, fare and plcass performs better than baseline.

## Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.



In [28]:
#age, fare, plcass, sex -> afps
afps = ['age', 'fare', 'pclass', 'sex_male', target_var]

afps_train = train[afps]
afps_validate = validate[afps]
afps_test = test[afps]

afps_train

Unnamed: 0,age,fare,pclass,sex_male,survived
583,36.0,40.1250,1,1,0
165,9.0,20.5250,3,1,1
50,7.0,39.6875,3,1,0
259,50.0,26.0000,2,0,1
306,28.0,110.8833,1,0,1
...,...,...,...,...,...
313,28.0,7.8958,3,1,0
636,32.0,7.9250,3,1,0
222,51.0,8.0500,3,1,0
485,28.0,25.4667,3,0,0


In [29]:
results, reports = md.logistic_regression(afps_train, afps_validate,
                                               afps_test, target_var, 
                                               c=2, solver='lbfgs')
afps_logit = md.Results(results, reports, target_var)
afps_logit.summary

Unnamed: 0,model_type,C,train_accuracy,validate_accuracy,difference,percent_diff
0,logistic_regression,0.1,0.791165,0.761682,0.029482,3.73
1,logistic_regression,0.2,0.791165,0.761682,0.029482,3.73
2,logistic_regression,0.3,0.791165,0.761682,0.029482,3.73
3,logistic_regression,0.4,0.791165,0.761682,0.029482,3.73
4,logistic_regression,0.5,0.791165,0.761682,0.029482,3.73
5,logistic_regression,0.6,0.791165,0.761682,0.029482,3.73
6,logistic_regression,0.7,0.791165,0.761682,0.029482,3.73
7,logistic_regression,0.8,0.791165,0.761682,0.029482,3.73
8,logistic_regression,0.9,0.791165,0.761682,0.029482,3.73
9,logistic_regression,1.0,0.791165,0.761682,0.029482,3.73


## Try out other combinations of features and models.

In [30]:
# age, fare, pclass, sex, alone -> afpsa
afpsa = ['age', 'fare', 'pclass', 'sex_male', 'alone', target_var]
afpsa_train = train[afpsa]
afpsa_validate = validate[afpsa]
afpsa_test = test[afpsa]

afpsa_train

Unnamed: 0,age,fare,pclass,sex_male,alone,survived
583,36.0,40.1250,1,1,1,0
165,9.0,20.5250,3,1,0,1
50,7.0,39.6875,3,1,0,0
259,50.0,26.0000,2,0,0,1
306,28.0,110.8833,1,0,1,1
...,...,...,...,...,...,...
313,28.0,7.8958,3,1,1,0
636,32.0,7.9250,3,1,1,0
222,51.0,8.0500,3,1,1,0
485,28.0,25.4667,3,0,0,0


In [31]:
results, reports = md.logistic_regression(afpsa_train, afpsa_validate,
                                               afpsa_test, target_var, c=2)

afpsa_logit = md.Results(results, reports, target_var)
afpsa_logit.summary

Unnamed: 0,model_type,C,train_accuracy,validate_accuracy,difference,percent_diff
0,logistic_regression,0.1,0.789157,0.761682,0.027474,3.48
1,logistic_regression,0.2,0.789157,0.761682,0.027474,3.48
2,logistic_regression,0.3,0.789157,0.761682,0.027474,3.48
3,logistic_regression,0.4,0.789157,0.761682,0.027474,3.48
4,logistic_regression,0.5,0.789157,0.761682,0.027474,3.48
5,logistic_regression,0.6,0.789157,0.761682,0.027474,3.48
6,logistic_regression,0.7,0.789157,0.761682,0.027474,3.48
7,logistic_regression,0.8,0.789157,0.761682,0.027474,3.48
8,logistic_regression,0.9,0.789157,0.761682,0.027474,3.48
9,logistic_regression,1.0,0.789157,0.761682,0.027474,3.48


## Takeaway

 ### Adding 'alone' increased training accuracy but didn't seem to affect validate accuracy.

In [32]:
# taking out 'alone', adding 'embarked_S'
sub = ['age', 'fare', 'pclass', 'sex_male', 'embarked_S', target_var]

results, reports = md.logistic_regression(train[sub], validate[sub],
                                         test[sub], target_var, c=2)
afpse_logit = md.Results(results, reports, target_var)
afpse_logit.summary

Unnamed: 0,model_type,C,train_accuracy,validate_accuracy,difference,percent_diff
0,logistic_regression,0.1,0.789157,0.761682,0.027474,3.48
1,logistic_regression,0.2,0.789157,0.761682,0.027474,3.48
2,logistic_regression,0.3,0.789157,0.761682,0.027474,3.48
3,logistic_regression,0.4,0.789157,0.761682,0.027474,3.48
4,logistic_regression,0.5,0.789157,0.761682,0.027474,3.48
5,logistic_regression,0.6,0.789157,0.761682,0.027474,3.48
6,logistic_regression,0.7,0.789157,0.761682,0.027474,3.48
7,logistic_regression,0.8,0.789157,0.761682,0.027474,3.48
8,logistic_regression,0.9,0.789157,0.761682,0.027474,3.48
9,logistic_regression,1.0,0.789157,0.761682,0.027474,3.48


In [33]:
# adding 'parch'
afpsep = ['age', 'fare', 'pclass', 'sex', 'embarked_S', 'parch']


# Modeling Telco

In [120]:
telco_df, categories, quant_cols = prepare.acquire_prep_telco()
target_var = 'churn'
telco_df['monthly_tenure_ratio'] = telco_df['monthly_charges'] / telco_df['tenure']
#telco_df['total_monthly_product'] = telco_df['total_charges'] * telco_df['monthly_charges']
#telco_df['total_tenure_squared'] = (telco_df['total_charges'] * telco_df['tenure']) **2
telco_df['senior_partner_dependent'] = telco_df[['senior_citizen', 'partner', 'dependents']].apply(lambda x: x.sum(), axis=1)



# removing outliers from training to see if it improves model accuracy
train, validate, test = split.train_validate_test_split(telco_df, target_var)
#train = prepare.remove_outliers(2.5, quant_cols, train)
baseline = (train.churn==0).mean()

train = exploration.dataset_reduction(train, target_var, categories, quant_cols)
validate = validate[train.columns]
telco_df

multiple_lines
online_security
online_backup
device_protection
tech_support
streaming_tv
streaming_movies
Categories related to churn:
senior_citizen
partner
dependents
multiple_lines
online_security
online_backup
device_protection
tech_support
streaming_tv
streaming_movies
paperless_billing
internet_service_type_Fiber optic
internet_service_type_None
payment_type_Credit card (automatic)
payment_type_Electronic check
payment_type_Mailed check
contract_type_One year
contract_type_Two year


Unnamed: 0,is_female,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,...,churn,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check,contract_type_One year,contract_type_Two year,monthly_tenure_ratio,senior_partner_dependent
0,1,0,1,1,9.0,1,0.0,0.0,1.0,0.0,...,0,0,0,0,0,1,1,0,7.288889,2
1,0,0,0,0,9.0,1,1.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,6.655556,0
2,0,0,0,0,4.0,1,0.0,0.0,0.0,1.0,...,1,1,0,0,1,0,0,0,18.475000,0
3,0,1,1,0,13.0,1,0.0,0.0,1.0,1.0,...,1,1,0,0,1,0,0,0,7.538462,2
4,1,1,1,0,3.0,1,0.0,0.0,0.0,0.0,...,1,1,0,0,0,1,0,0,27.966667,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,0,0,13.0,1,0.0,1.0,0.0,0.0,...,0,0,0,0,0,1,1,0,4.242308,0
7039,0,0,1,0,22.0,1,1.0,0.0,0.0,0.0,...,1,1,0,0,1,0,0,0,3.868182,1
7040,0,0,0,0,2.0,1,0.0,0.0,1.0,0.0,...,0,0,0,0,0,1,0,0,25.150000,0
7041,0,0,1,1,67.0,1,0.0,1.0,0.0,1.0,...,0,0,0,0,0,1,0,1,1.012687,2


In [103]:
md.Results.baseline = baseline
baseline

0.7343154686309372

# Telco Decision Tree Modeling

In [104]:
md.Results.all_instances = []
results, reports = md.decision_tree(train, validate, test, target_var, loop=True, depth=10)
dec_tree_df = md.Results(results, reports, target_var)
dec_tree_df.summary


Unnamed: 0,model_type,depth,train_accuracy,validate_accuracy,difference,percent_diff
0,decision_tree,1,0.734315,0.734005,0.000311,0.04
1,decision_tree,2,0.794006,0.776066,0.017939,2.26
2,decision_tree,3,0.794006,0.776066,0.017939,2.26
3,decision_tree,4,0.80061,0.777844,0.022766,2.84
4,decision_tree,5,0.80696,0.776066,0.030893,3.83
5,decision_tree,6,0.816612,0.777251,0.03936,4.82
6,decision_tree,7,0.832106,0.763626,0.06848,8.23
7,decision_tree,8,0.841504,0.763033,0.078471,9.33
8,decision_tree,9,0.857506,0.758294,0.099212,11.57
9,decision_tree,10,0.876302,0.751185,0.125117,14.28


# Telco Random Forests Modeling

In [127]:
results, reports = md.random_forests(train, validate, test, target_var, min_sample_leaf=1, depth=20, inverse=True )
rf_df = md.Results(results, reports, target_var)
rf_df.summary

Unnamed: 0,model_type,depth,min_samples_leaf,train_accuracy,validate_accuracy,difference,percent_diff
0,random_forests,1,20,0.734315,0.734005,0.000311,0.04
1,random_forests,2,19,0.78537,0.773104,0.012265,1.56
2,random_forests,3,18,0.796546,0.777251,0.019294,2.42
3,random_forests,4,17,0.80442,0.787322,0.017097,2.13
4,random_forests,5,16,0.809246,0.785545,0.023701,2.93
5,random_forests,6,15,0.814326,0.792654,0.021672,2.66
6,random_forests,7,14,0.817628,0.789692,0.027936,3.42
7,random_forests,8,13,0.824486,0.793246,0.031239,3.79
8,random_forests,9,12,0.82982,0.792062,0.037758,4.55
9,random_forests,10,11,0.836678,0.7891,0.047578,5.69


# Telco K-Nearest Neighbors Modeling

In [106]:
neighbors = list(range(1,11))


results, reports = md.knearest_neightbors(train, validate, test, target_var, n_neighbors =neighbors)
knn_sub5_df = md.Results(results, reports, target_var)
knn_sub5_df.summary



Unnamed: 0,model_type,n_nearest_neighbor,train_accuracy,validate_accuracy,difference,percent_diff
0,knn_uniform,1,0.997206,0.704384,0.292822,29.36
1,knn_uniform,2,0.860808,0.74763,0.113177,13.15
2,knn_uniform,3,0.858522,0.742299,0.116223,13.54
3,knn_uniform,4,0.833122,0.766588,0.066534,7.99
4,knn_uniform,5,0.831852,0.757109,0.074743,8.99
5,knn_uniform,6,0.81839,0.773104,0.045285,5.53
6,knn_uniform,7,0.818898,0.776659,0.042239,5.16
7,knn_uniform,8,0.812802,0.78436,0.028441,3.5
8,knn_uniform,9,0.815088,0.777251,0.037836,4.64
9,knn_uniform,10,0.808738,0.770735,0.038003,4.7


# Telco Logistic Regression Modeling

### No Scaling of Data

In [107]:
results, reports = md.logistic_regression(train, validate, test, 
                                          target_var, c=3,
                                          solver='liblinear')
logi_1 = md.Results(results, reports, target_var)
logi_1.summary

Unnamed: 0,model_type,C,train_accuracy,validate_accuracy,difference,percent_diff
0,logistic_regression,0.1,0.800356,0.793246,0.007109,0.89
1,logistic_regression,0.2,0.799594,0.795024,0.00457,0.57
2,logistic_regression,0.3,0.79807,0.795616,0.002453,0.31
3,logistic_regression,0.4,0.798578,0.795616,0.002961,0.37
4,logistic_regression,0.5,0.802134,0.795024,0.00711,0.89
5,logistic_regression,0.6,0.802642,0.795024,0.007618,0.95
6,logistic_regression,0.7,0.804166,0.793839,0.010327,1.28
7,logistic_regression,0.8,0.80442,0.794431,0.009988,1.24
8,logistic_regression,0.9,0.803658,0.794431,0.009226,1.15
9,logistic_regression,1.0,0.803404,0.794431,0.008972,1.12


In [108]:
# monthly charges, internet service type fiber, payment type elec. check
sub1 = ['monthly_charges', 'internet_service_type_Fiber optic',
       'payment_type_Electronic check',
        target_var]

results, reports = md.logistic_regression(train[sub1], validate[sub1],
                                          test[sub1], target_var, c=3)
main_drivers = md.Results(results, reports, target_var)

main_drivers.summary

Unnamed: 0,model_type,C,train_accuracy,validate_accuracy,difference,percent_diff
0,logistic_regression,0.1,0.755906,0.739929,0.015977,2.11
1,logistic_regression,0.2,0.755906,0.739929,0.015977,2.11
2,logistic_regression,0.3,0.755906,0.739929,0.015977,2.11
3,logistic_regression,0.4,0.755906,0.739929,0.015977,2.11
4,logistic_regression,0.5,0.755906,0.739929,0.015977,2.11
5,logistic_regression,0.6,0.755906,0.739929,0.015977,2.11
6,logistic_regression,0.7,0.755906,0.739929,0.015977,2.11
7,logistic_regression,0.8,0.755906,0.739929,0.015977,2.11
8,logistic_regression,0.9,0.755906,0.739929,0.015977,2.11
9,logistic_regression,1.0,0.755906,0.739929,0.015977,2.11


In [109]:
sub2 = ['monthly_charges', 'internet_service_type_Fiber optic',
       'payment_type_Electronic check',
       'payment_type_Mailed check',
       'internet_service_type_None',
       target_var]
results, reports = md.logistic_regression(train[sub2], validate[sub2],
                                          test[sub2], target_var, c=3)
main_drivers = md.Results(results, reports, target_var)
main_drivers.summary


Unnamed: 0,model_type,C,train_accuracy,validate_accuracy,difference,percent_diff
0,logistic_regression,0.1,0.766828,0.747038,0.01979,2.58
1,logistic_regression,0.2,0.766828,0.747038,0.01979,2.58
2,logistic_regression,0.3,0.766828,0.747038,0.01979,2.58
3,logistic_regression,0.4,0.766828,0.747038,0.01979,2.58
4,logistic_regression,0.5,0.766828,0.747038,0.01979,2.58
5,logistic_regression,0.6,0.766828,0.747038,0.01979,2.58
6,logistic_regression,0.7,0.766828,0.747038,0.01979,2.58
7,logistic_regression,0.8,0.766828,0.747038,0.01979,2.58
8,logistic_regression,0.9,0.766828,0.747038,0.01979,2.58
9,logistic_regression,1.0,0.766828,0.747038,0.01979,2.58


In [110]:
sub3 = ['monthly_charges', 'internet_service_type_Fiber optic',
       'payment_type_Electronic check',
       'payment_type_Mailed check',
       'internet_service_type_None',
       'payment_type_Credit card (automatic)',
       target_var]
results, reports = md.logistic_regression(train[sub3], validate[sub3],
                                          test[sub3], target_var, c=3,
                                         solver='liblinear')
main_drivers = md.Results(results, reports, target_var)
main_drivers.summary

Unnamed: 0,model_type,C,train_accuracy,validate_accuracy,difference,percent_diff
0,logistic_regression,0.1,0.766066,0.748815,0.01725,2.25
1,logistic_regression,0.2,0.766066,0.748815,0.01725,2.25
2,logistic_regression,0.3,0.766066,0.74763,0.018435,2.41
3,logistic_regression,0.4,0.765812,0.74763,0.018181,2.37
4,logistic_regression,0.5,0.765812,0.74763,0.018181,2.37
5,logistic_regression,0.6,0.765812,0.74763,0.018181,2.37
6,logistic_regression,0.7,0.765812,0.74763,0.018181,2.37
7,logistic_regression,0.8,0.765558,0.74763,0.017927,2.34
8,logistic_regression,0.9,0.765812,0.74763,0.018181,2.37
9,logistic_regression,1.0,0.765558,0.74763,0.017927,2.34


In [111]:
sub4 = ['monthly_charges', 'internet_service_type_Fiber optic',
       'payment_type_Electronic check',
       'payment_type_Mailed check',
       'internet_service_type_None',
       'payment_type_Credit card (automatic)',
       'senior_citizen',
       target_var]
results, reports = md.logistic_regression(train[sub4], validate[sub4],
                                          test[sub4], target_var, c=3)
main_drivers = md.Results(results, reports, target_var)
main_drivers.summary

Unnamed: 0,model_type,C,train_accuracy,validate_accuracy,difference,percent_diff
0,logistic_regression,0.1,0.767336,0.763033,0.004302,0.56
1,logistic_regression,0.2,0.767336,0.763033,0.004302,0.56
2,logistic_regression,0.3,0.767336,0.763033,0.004302,0.56
3,logistic_regression,0.4,0.767336,0.763033,0.004302,0.56
4,logistic_regression,0.5,0.767336,0.763033,0.004302,0.56
5,logistic_regression,0.6,0.767336,0.763033,0.004302,0.56
6,logistic_regression,0.7,0.767336,0.763033,0.004302,0.56
7,logistic_regression,0.8,0.767336,0.763033,0.004302,0.56
8,logistic_regression,0.9,0.767336,0.763033,0.004302,0.56
9,logistic_regression,1.0,0.767336,0.763033,0.004302,0.56


In [112]:
sub5 = ['monthly_charges', 'internet_service_type_Fiber optic',
       'payment_type_Electronic check',
       'internet_service_type_None',
       'payment_type_Credit card (automatic)',
       'senior_citizen',
       'partner',
       target_var]
results, reports = md.logistic_regression(train[sub5], validate[sub5],
                                          test[sub5], target_var, c=3,
                                         solver='liblinear')
main_drivers = md.Results(results, reports, target_var)
main_drivers.summary

Unnamed: 0,model_type,C,train_accuracy,validate_accuracy,difference,percent_diff
0,logistic_regression,0.1,0.767844,0.765403,0.002441,0.32
1,logistic_regression,0.2,0.767844,0.765403,0.002441,0.32
2,logistic_regression,0.3,0.76759,0.765403,0.002187,0.28
3,logistic_regression,0.4,0.767844,0.76481,0.003033,0.4
4,logistic_regression,0.5,0.767844,0.765403,0.002441,0.32
5,logistic_regression,0.6,0.76759,0.76481,0.002779,0.36
6,logistic_regression,0.7,0.767844,0.76481,0.003033,0.4
7,logistic_regression,0.8,0.76759,0.765403,0.002187,0.28
8,logistic_regression,0.9,0.767336,0.76481,0.002525,0.33
9,logistic_regression,1.0,0.767336,0.76481,0.002525,0.33


In [113]:
md.Results.baseline

0.7343154686309372

### With Scaling of Data

In [121]:
col_to_scale = ['tenure', 'monthly_charges', 'total_charges', 'monthly_tenure_ratio', 'senior_partner_dependent'] #'total_monthly_product', 'total_tenure_squared']
scaled_df = telco_df[col_to_scale].apply(lambda x: ((x-x.min())/(x.max()-x.min())))

telco_encoded = telco_df.drop(columns=col_to_scale)
telco_scaled_df = pd.concat([scaled_df, telco_encoded], axis=1)
    
telco_scaled_df
train, validate, test = split.train_validate_test_split(telco_scaled_df, target_var)

train.head()

Unnamed: 0,tenure,monthly_charges,total_charges,monthly_tenure_ratio,senior_partner_dependent,is_female,senior_citizen,partner,dependents,phone_service,...,streaming_movies,paperless_billing,churn,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check,contract_type_One year,contract_type_Two year
5919,0.802817,0.525871,0.49393,0.009374,0.0,1,0,0,0,1,...,0.0,1,0,0,0,1,0,0,1,0
1915,0.985915,0.668657,0.693532,0.009155,0.333333,0,0,0,1,1,...,0.0,1,0,1,0,1,0,0,1,0
5054,0.478873,0.074627,0.099671,0.004577,0.666667,1,0,1,1,1,...,0.0,1,0,0,1,0,1,0,0,0
2355,0.0,0.009453,4.6e-05,0.185277,0.666667,0,0,1,1,1,...,0.0,0,0,0,1,0,0,1,0,0
6279,0.267606,0.355721,0.119675,0.0238,0.333333,0,1,0,0,1,...,0.0,0,0,0,0,0,0,1,1,0


In [125]:
results, reports = md.logistic_regression(train, validate, test, 
                                          target_var, c=100, solver='liblinear')
logi_1 = md.Results(results, reports, target_var)
logi_1.summary

Unnamed: 0,model_type,C,train_accuracy,validate_accuracy,difference,percent_diff
0,logistic_regression,0.1,0.807976,0.805687,0.002288,0.28
1,logistic_regression,0.2,0.807976,0.805095,0.002881,0.36
2,logistic_regression,0.3,0.807976,0.805687,0.002288,0.28
3,logistic_regression,0.4,0.807976,0.805095,0.002881,0.36
4,logistic_regression,0.5,0.807976,0.806280,0.001696,0.21
...,...,...,...,...,...,...
995,logistic_regression,99.6,0.809500,0.805687,0.003812,0.47
996,logistic_regression,99.7,0.809500,0.805687,0.003812,0.47
997,logistic_regression,99.8,0.809500,0.805687,0.003812,0.47
998,logistic_regression,99.9,0.809500,0.805687,0.003812,0.47


In [123]:
summary = md.Results.total_summary
summary

Unnamed: 0,model_type,depth,train_accuracy,validate_accuracy,difference,percent_diff,min_samples_leaf,n_nearest_neighbor,C
0,decision_tree,1.0,0.734315,0.734005,0.000311,0.04,,,
1,decision_tree,2.0,0.794006,0.776066,0.017939,2.26,,,
2,decision_tree,3.0,0.794006,0.776066,0.017939,2.26,,,
3,decision_tree,4.0,0.800610,0.777844,0.022766,2.84,,,
4,decision_tree,5.0,0.806960,0.776066,0.030893,3.83,,,
...,...,...,...,...,...,...,...,...,...
45,logistic_regression,,0.808230,0.806872,0.001358,0.17,,,4.6
46,logistic_regression,,0.808230,0.806872,0.001358,0.17,,,4.7
47,logistic_regression,,0.808230,0.806872,0.001358,0.17,,,4.8
48,logistic_regression,,0.808230,0.806872,0.001358,0.17,,,4.9


In [126]:
summary.validate_accuracy.max()

0.806872037914692