# Imports:

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import matplotlib.pyplot as plt
import seaborn as sns
from env import get_connection

import acquire, prepare

pd.options.display.max_columns = None

In [2]:
# 20% test, 80% train_validate
# then of the 80% train_validate: 30% validate, 70% train. 
def train_validate(df, stratify_col = None):
    """
    This function takes in a DataFrame and column name for the stratify argument (defualt is None).
    It will split the data into three parts for training, testing and validating.
    """
    #This is logic to set the stratify argument:
    stratify_arg = ''
    if stratify_col != None:
        stratify_arg = df[stratify_col]
    else:
        stratify_arg = None
    
    #This splits the DataFrame into 'train' and 'test':
    train, test = train_test_split(df, train_size=.7, stratify=stratify_arg, random_state = 321)
    
    #The length of the stratify column changed and needs to be adjusted:
    if stratify_col != None:
        stratify_arg = train[stratify_col]
        
    #This splits the larger 'train' DataFrame into a smaller 'train' and 'validate' DataFrames:
    train, validate = train_test_split(train, test_size=.5, stratify=stratify_arg, random_state = 321)
    return train, validate, test

# Part 1: Titanic

In [3]:
titanic = acquire.get_titanic_data()

In [4]:
titanic = prepare.prep_titanic(titanic)

In [5]:
titanic.drop(columns=['sex', 'embark_town'], inplace=True)

In [6]:
ti_train, ti_val, ti_test = train_validate(titanic, 'survived')

In [7]:
ti_train.survived.value_counts()

0    192
1    119
Name: survived, dtype: int64

### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [8]:
ti_base = pd.DataFrame()
ti_base['baseline_prediction'] = ti_train['survived']
ti_base['baseline_prediction'] = ti_base['baseline_prediction'] * 0

In the telco example, the accuracy_score does not like int inputs. For some reason it works here and all input as similar in type...

In [9]:
#print(ti_base.info())
#print(ti_train.survived.info())
#print(ti_base.value_counts())
#print(ti_train.survived.value_counts())

In [10]:
ti_baseline = accuracy_score(ti_train.survived, ti_base.baseline_prediction)
print(f'The baseline is: {ti_baseline:.3f}')

The baseline is: 0.617


### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [11]:
X_train = ti_train.drop(columns=['survived'])
y_train = ti_train['survived']

X_val = ti_val.drop(columns=['survived'])
y_val = ti_val['survived']

X_test = ti_test.drop(columns=['survived'])
y_test = ti_test['survived']

In [12]:
seed = 42

clf = DecisionTreeClassifier(max_depth=3, random_state=42)

In [13]:
# This is for some error where clf is trying to concat a string to an int.
# It did not occure in my code and I'm unsure why.

#class_names = np.array(clf.classes_).astype('str').tolist()

In [14]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=42)

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [15]:
#Model score:
clf.score(X_train, y_train)

0.8167202572347267

In [16]:
#Confusion matrix:
#Do the reading
# make prediction on train obeservations

y_pred = clf.predict(X_train)

pd.DataFrame(confusion_matrix(y_train, y_pred), index=['Pred_live', 'Pred_death'], columns=['Act_live','Act_death'])

Unnamed: 0,Act_live,Act_death
Pred_live,173,19
Pred_death,38,81


In [17]:
#Classification report:
print(
    classification_report(y_train, y_pred),)

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       192
           1       0.81      0.68      0.74       119

    accuracy                           0.82       311
   macro avg       0.81      0.79      0.80       311
weighted avg       0.82      0.82      0.81       311



### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [18]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       192
           1       0.81      0.68      0.74       119

    accuracy                           0.82       311
   macro avg       0.81      0.79      0.80       311
weighted avg       0.82      0.82      0.81       311



In [19]:
y_val_pred = clf.predict(X_val)

pd.DataFrame(confusion_matrix(y_val, y_val_pred), index=['Pred_live', 'Pred_death'], columns=['Act_live','Act_death'])

Unnamed: 0,Act_live,Act_death
Pred_live,174,18
Pred_death,44,76


In [20]:
accuracy_score(y_train, y_pred)

0.8167202572347267

In [21]:
accuracy_score(y_val, y_val_pred)

0.8012820512820513

### 5. Run through steps 2-4 using a different max_depth value.

In [22]:
seed = 42
clf = DecisionTreeClassifier(max_depth=5, random_state=42)

In [23]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=5, random_state=42)

In [24]:
clf.score(X_train, y_train)

0.8585209003215434

In [25]:
y_pred = clf.predict(X_train)

In [26]:
pd.DataFrame(confusion_matrix(y_train, y_pred), index=['Pred_live', 'Pred_death'], columns=['Act_live','Act_death'])

Unnamed: 0,Act_live,Act_death
Pred_live,177,15
Pred_death,29,90


In [27]:
print(
    classification_report(y_train, y_pred),)

              precision    recall  f1-score   support

           0       0.86      0.92      0.89       192
           1       0.86      0.76      0.80       119

    accuracy                           0.86       311
   macro avg       0.86      0.84      0.85       311
weighted avg       0.86      0.86      0.86       311



In [28]:
y_val_pred = clf.predict(X_val)

pd.DataFrame(confusion_matrix(y_val, y_val_pred), index=['Pred_live', 'Pred_death'], columns=['Act_live','Act_death'])

Unnamed: 0,Act_live,Act_death
Pred_live,178,14
Pred_death,51,69


In [29]:
accuracy_score(y_val, y_val_pred)

0.7916666666666666

### 6. Which model performs better on your in-sample data?

The depth 5 model performs better on the training data. Step 5.

### 7. Which model performs best on your out-of-sample data, the validate set?

The depth 3 model performs better on the training data. Steps <=4.

# Part 2: Telco churn

## 1. Work through these same exercises using the Telco dataset.

In [30]:
telco = acquire.get_telco_data()
telco = prepare.prep_telco(telco)

In [31]:
telco = telco.T.drop_duplicates(keep='first').T

You have to drop sting columns:

In [32]:
"""
for col in telco:
    if telco[col]dtypes() == 'object':
        pass
    else:
        new_df
"""

"\nfor col in telco:\n    if telco[col]dtypes() == 'object':\n        pass\n    else:\n        new_df\n"

In [33]:
tel_train, tel_val, tel_test = train_validate(telco, 'churn')

### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [34]:
tel_train.churn.value_counts()

0    1811
1     654
Name: churn, dtype: int64

In [35]:
tel_base = pd.DataFrame()
tel_base['baseline_prediction'] = tel_train['churn']
tel_base['baseline_prediction'] = tel_base['baseline_prediction'] * 0

The accuracy_score does not like int input.

In [36]:
tel_base = tel_base.astype('string')
tel_train.churn = tel_train.churn.astype('string')

In [37]:
tel_baseline = accuracy_score(tel_train.churn, tel_base.baseline_prediction)
print(f'The baseline is: {tel_baseline:.3f}')

The baseline is: 0.735


### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [38]:
#tel_train.churn = tel_train.churn.astype('int')

In [39]:
X_train = tel_train.drop(columns=['churn'])
y_train = tel_train['churn']

X_val = tel_val.drop(columns=['churn'])
y_val = tel_val['churn']

X_test = tel_test.drop(columns=['churn'])
y_test = tel_test['churn']

In [40]:
seed = 42

clf = DecisionTreeClassifier(max_depth=3, random_state=42)

In [41]:
clf.fit(X_train, y_train)

ValueError: could not convert string to float: 'Male'

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
y_pred = clf.predict(X_train)

In [None]:
pd.DataFrame(confusion_matrix(y_train, y_pred), index=['Pred Churn', 'Pred Stay'], columns=['Act Churn','Act Stay'])

### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

### 5. Run through steps 2-4 using a different max_depth value.

### 6. Which model performs better on your in-sample data?

### 7. Which model performs best on your out-of-sample data, the validate set?

## 2. Experiment with this model on other datasets with a higher number of output classes.