In [197]:
from scipy import stats

import matplotlib.pyplot as plt

import numpy as np

import pandas as pd

import seaborn as sns

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.dummy import DummyClassifier

import os

import graphviz
from graphviz import Graph

import env
import acquire
import prepare

import warnings
warnings.filterwarnings("ignore")

## Decision Tree Notes

In [3]:
df = data("iris")

In [4]:
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [5]:
df.columns = [col.lower().replace(".","_") for col in df]

In [6]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [7]:
from sklearn.model_selection import train_test_split

def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [8]:
# split into train, validate, test
train, validate, test = train_validate_test_split(df, target='species', seed=123)

# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['species'])
y_train = train.species

X_validate = validate.drop(columns=['species'])
y_validate = validate.species

X_test = test.drop(columns=['species'])
y_test = test.species

In [21]:
# for classification you can change the algorithm to gini or entropy (information gain).  
# Default is gini.

clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [22]:
# model.fit(X, y)

clf = clf.fit(X_train, y_train)

In [29]:
dot_data = export_graphviz(clf, feature_names= X_train.columns, class_names=clf.classes_, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('iris_decision_tree', view=True)

'iris_decision_tree.pdf'

In [24]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array(['versicolor', 'setosa', 'virginica', 'versicolor', 'setosa'],
      dtype=object)

In [25]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.03703704, 0.96296296],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ]])

In [26]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.96


In [31]:
confusion_matrix(y_train,y_pred)

array([[28,  0,  0],
       [ 0, 27,  1],
       [ 0,  2, 26]])

In [32]:
y_train.value_counts()

versicolor    28
setosa        28
virginica     28
Name: species, dtype: int64

In [33]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,setosa,versicolor,virginica
setosa,28,0,0
versicolor,0,27,1
virginica,0,2,26


In [34]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        28
  versicolor       0.93      0.96      0.95        28
   virginica       0.96      0.93      0.95        28

    accuracy                           0.96        84
   macro avg       0.96      0.96      0.96        84
weighted avg       0.96      0.96      0.96        84



In [35]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.94


In [36]:
# And since accuracy isn't everything

# Produce y_predictions that come from the X_validate
y_pred = clf.predict(X_validate)

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.86      1.00      0.92        12
   virginica       1.00      0.83      0.91        12

    accuracy                           0.94        36
   macro avg       0.95      0.94      0.94        36
weighted avg       0.95      0.94      0.94        36



## DECISION TREE EXERCISES

### Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

In [72]:
df = acquire.get_titanic_data()

In [73]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [74]:
df = prepare.clean_data(df)

In [75]:
df.drop(columns = ["sex","embark_town"], inplace = True)

In [77]:
train,validate,test= train_validate_test_split(df, target = "survived", seed = 174)

In [78]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [79]:
model = DummyClassifier(strategy='most_frequent')

In [80]:
model.fit(X_train, y_train)

DummyClassifier(strategy='most_frequent')

In [81]:
accuracy = round(model.score(X_train, y_train), 2)
print(f'Training accuracy: {accuracy}')

Training accuracy: 0.62


##### THIS IS OUR BASELINE ACCURACY LETS SEE IF WE CAN BEAT IT (0.62)

### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [108]:
clf = DecisionTreeClassifier(max_depth=8, random_state=123)

In [109]:
clf = clf.fit(X_train, y_train)

In [110]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 0, 0])

In [111]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.88888889, 0.11111111],
       [1.        , 0.        ]])

In [112]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.92


### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [117]:
y_train.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [122]:
pd.DataFrame(confusion_matrix(y_train, y_pred), index=["died","lived"], columns=["died","lived"])

Unnamed: 0,died,lived
died,297,10
lived,32,159


In [123]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.97      0.93       307
           1       0.94      0.83      0.88       191

    accuracy                           0.92       498
   macro avg       0.92      0.90      0.91       498
weighted avg       0.92      0.92      0.91       498



### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [124]:
accuracy = round(clf.score(X_train, y_train), 2)
print(f'Training accuracy: {accuracy}')

Training accuracy: 0.92


In [125]:
train['prediction'] = model.predict(X_train)

In [126]:
print(classification_report(train.survived, train.prediction, zero_division=True))

              precision    recall  f1-score   support

           0       0.62      1.00      0.76       307
           1       1.00      0.00      0.00       191

    accuracy                           0.62       498
   macro avg       0.81      0.50      0.38       498
weighted avg       0.76      0.62      0.47       498



### 5. Run through steps 2-4 using a different max_depth value.

In [175]:
clf_2 = DecisionTreeClassifier(max_depth=5, random_state=123)

In [176]:
clf_2 = clf_2.fit(X_train, y_train)

In [177]:
y_pred = clf_2.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 0, 0])

In [178]:
y_pred_proba = clf_2.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.66666667, 0.33333333],
       [0.63333333, 0.36666667],
       [1.        , 0.        ],
       [0.88888889, 0.11111111],
       [1.        , 0.        ]])

In [179]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf_2.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.84


In [180]:
pd.DataFrame(confusion_matrix(y_train, y_pred), index=["died","lived"], columns=["died","lived"])

Unnamed: 0,died,lived
died,291,16
lived,62,129


In [181]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.95      0.88       307
           1       0.89      0.68      0.77       191

    accuracy                           0.84       498
   macro avg       0.86      0.81      0.82       498
weighted avg       0.85      0.84      0.84       498



In [182]:
accuracy = round(clf_2.score(X_train, y_train), 2)
print(f'Training accuracy: {accuracy}')

Training accuracy: 0.84


In [183]:
train['prediction'] = model.predict(X_train)

In [187]:
pd.DataFrame(classification_report(train.survived, train.prediction, output_dict=True)).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.616466,1.0,0.762733,307.0
1,0.0,0.0,0.0,191.0
accuracy,0.616466,0.616466,0.616466,0.616466
macro avg,0.308233,0.5,0.381366,498.0
weighted avg,0.38003,0.616466,0.470199,498.0


### 6. Which model performs better on your in-sample data?

In [697]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.79


In [189]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf_2.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.83


In [698]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(model.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.73


In [192]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.76


In [191]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf_2.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.79


## PART 2

### 1. Work through these same exercises using the Telco dataset.

In [555]:
def get_telco_data():
    filename = "telco_churn.csv"

    if os.path.isfile("telco_churn.csv"):
        return pd.read_csv("telco_churn.csv")
    else:
        # read the SQL query into a dataframe
        df = pd.read_sql('''
SELECT *
FROM customers
JOIN contract_types ON contract_types.contract_type_id = customers.contract_type_id
JOIN internet_service_types ON internet_service_types.internet_service_type_id = customers.internet_service_type_id
JOIN payment_types ON payment_types.payment_type_id = customers.payment_type_id;
''', acquire.get_connection("telco_churn"))

        # Write that dataframe to disk for later. Called "caching" the data for later.
        df.to_csv("telco_churn.csv")

        # Return the dataframe to the calling code
        return df

In [607]:
df = get_telco_data()

In [608]:
df["partner"] = df.partner.replace(to_replace = ["Yes","No"],value = [1,0])
df["dependents"] = df.dependents.replace(to_replace = ["Yes","No"],value = [1,0])
df["churn"] = df.churn.replace(to_replace = ["Yes","No"],value = [1,0])
df["paperless_billing"] = df.paperless_billing.replace(to_replace = ["Yes","No","No internet service"],value = [1,0,0])
df["tech_support"] = df.tech_support.replace(to_replace = ["Yes","No","No internet service"],value = [1,0,0])
df["device_protection"] = df.device_protection.replace(to_replace = ["Yes","No","No internet service"],value = [1,0,0])
df["online_backup"] = df.online_backup.replace(to_replace = ["Yes","No","No internet service"],value = [1,0,0])
df["online_security"] = df.online_security.replace(to_replace = ["Yes","No","No internet service"],value = [1,0,0])
df["streaming_tv"] = df.streaming_tv.replace(to_replace = ["Yes","No","No internet service"],value = [1,0,0])
df["streaming_movies"] = df.streaming_movies.replace(to_replace = ["Yes","No","No internet service"],value = [1,0,0])
df["phone_service"] = df.phone_service.replace(to_replace = ["Yes","No","No internet service"],value = [1,0,0])
df["multiple_lines"] = df.multiple_lines.replace(to_replace = ["Yes","No","No phone service"],value = [1,0,0])

In [609]:
df.drop(columns = ["contract_type_id.1","internet_service_type_id.1","payment_type_id.1","Unnamed: 0"],inplace = True)

In [610]:
df.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,online_security,...,streaming_movies,contract_type_id,paperless_billing,payment_type_id,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0016-QLJIS,Female,0,1,1,65,1,1,1,1,...,1,3,1,2,90.45,5957.9,0,Two year,DSL,Mailed check
1,0017-DINOC,Male,0,0,0,54,0,0,1,1,...,0,3,0,4,45.2,2460.55,0,Two year,DSL,Credit card (automatic)
2,0019-GFNTW,Female,0,0,0,56,0,0,1,1,...,0,3,0,3,45.05,2560.1,0,Two year,DSL,Bank transfer (automatic)
3,0056-EPFBG,Male,0,1,1,20,0,0,1,1,...,0,3,1,4,39.4,825.4,0,Two year,DSL,Credit card (automatic)
4,0078-XZMHT,Male,0,1,0,72,1,1,1,0,...,1,3,1,3,85.15,6316.2,0,Two year,DSL,Bank transfer (automatic)


In [611]:
df_dummy = pd.get_dummies(df[["gender","multiple_lines","contract_type","internet_service_type","payment_type"]])

In [612]:
df = pd.concat([df, df_dummy], axis=1)

In [613]:
df.drop(columns = ["gender","customer_id","multiple_lines","contract_type","internet_service_type","payment_type","total_charges",'internet_service_type_id',"contract_type_id","payment_type_id"],inplace = True)

In [614]:
df["total_charges_clean"] = df.monthly_charges * df.tenure

In [615]:
len(y_train)

3943

In [616]:
train,validate,test= train_validate_test_split(df, target = "churn", seed = 174)

In [617]:
train.columns

Index(['senior_citizen', 'partner', 'dependents', 'tenure', 'phone_service',
       'online_security', 'online_backup', 'device_protection', 'tech_support',
       'streaming_tv', 'streaming_movies', 'paperless_billing',
       'monthly_charges', 'churn', 'gender_Female', 'gender_Male',
       'contract_type_Month-to-month', 'contract_type_One year',
       'contract_type_Two year', 'internet_service_type_DSL',
       'internet_service_type_Fiber optic', 'internet_service_type_None',
       'payment_type_Bank transfer (automatic)',
       'payment_type_Credit card (automatic)', 'payment_type_Electronic check',
       'payment_type_Mailed check', 'total_charges_clean'],
      dtype='object')

In [618]:
X_train = train.drop(columns=['churn'])
y_train = train.churn

X_validate = validate.drop(columns=['churn'])
y_validate = validate.churn

X_test = test.drop(columns=['churn'])
y_test = test.churn

In [619]:
model = DummyClassifier(strategy='most_frequent')

In [620]:
model.fit(X_train, y_train)

DummyClassifier(strategy='most_frequent')

In [621]:
accuracy = round(model.score(X_train, y_train), 2)
print(f'Training accuracy: {accuracy}')

Training accuracy: 0.73


##### OUR MODEL TO BEAT IS (.73)

In [684]:
clf = DecisionTreeClassifier(max_depth=6, random_state=123)

In [685]:
clf = clf.fit(X_train, y_train)

In [686]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 1, 0])

In [687]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.83333333, 0.16666667],
       [0.82142857, 0.17857143],
       [0.72727273, 0.27272727],
       [0.48430493, 0.51569507],
       [0.57692308, 0.42307692]])

In [688]:
clf

DecisionTreeClassifier(max_depth=6, random_state=123)

In [689]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on training set: 0.79


In [696]:
dot_data = export_graphviz(decision_tree = clf,feature_names= X_train.columns)
graph = graphviz.Source(dot_data) 

graph.render('telco_decision_tree', view=True)

'telco_decision_tree.pdf'

In [691]:
pd.DataFrame(confusion_matrix(y_train, y_pred), index=["didntchurn","churned"], columns=["didntchurn","churned"])

Unnamed: 0,didntchurn,churned
didntchurn,2492,405
churned,354,692


In [694]:
train["predicted"] = y_pred

In [695]:
pd.DataFrame(classification_report(train.churn, train.predicted, output_dict=True)).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.875615,0.8602,0.867839,2897.0
1,0.630811,0.661568,0.645824,1046.0
accuracy,0.807507,0.807507,0.807507,0.807507
macro avg,0.753213,0.760884,0.756831,3943.0
weighted avg,0.810673,0.807507,0.808943,3943.0


### 2. Experiment with this model on other datasets with a higher number of output classes.