In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV 
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score


# custom imports
import wrangle as w

# Now that i have accomplished exploration i'm going to begin creating functions to put into my prepare.py file for preprocessing before i proceed to model evaluation

In [2]:
train, val, test = w.wrangle_telco()

In [3]:
train.head()

Unnamed: 0,customer_id,senior_citizen,tenure,monthly_charges,total_charges,gender,partner,dependents,phone,multiple_lines,...,churn,dsl,fiber_optic,month_to_month,one_year,two_year,bank_transfer_payment,credit_card_payment,electronic_payment,mailed_payment
4604,6490-FGZAT,0,6,20.65,109.3,1,0,0,1,0,...,0,0,1,1,0,0,0,0,0,1
4051,5707-ORNDZ,1,9,54.55,494.05,1,0,0,1,1,...,1,1,0,1,0,0,0,0,1,0
5159,7252-NTGSS,0,1,45.15,45.15,0,0,0,1,0,...,0,1,0,1,0,0,0,0,0,1
2974,4250-WAROZ,1,60,93.25,5774.55,1,1,1,1,1,...,0,0,1,1,0,0,0,0,1,0
2979,4257-GAESD,0,6,45.0,298.7,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3943 entries, 4604 to 6377
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   customer_id            3943 non-null   object 
 1   senior_citizen         3943 non-null   int64  
 2   tenure                 3943 non-null   int64  
 3   monthly_charges        3943 non-null   float64
 4   total_charges          3943 non-null   float64
 5   gender                 3943 non-null   uint8  
 6   partner                3943 non-null   uint8  
 7   dependents             3943 non-null   uint8  
 8   phone                  3943 non-null   uint8  
 9   multiple_lines         3943 non-null   uint8  
 10  online_security        3943 non-null   uint8  
 11  online_backup          3943 non-null   uint8  
 12  device_protection      3943 non-null   uint8  
 13  tech_support           3943 non-null   uint8  
 14  streaming_tv           3943 non-null   uint8  
 15  s

In [5]:
train.drop(columns='customer_id')
val.drop(columns='customer_id')
test.drop(columns='customer_id')

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,gender,partner,dependents,phone,multiple_lines,online_security,...,churn,dsl,fiber_optic,month_to_month,one_year,two_year,bank_transfer_payment,credit_card_payment,electronic_payment,mailed_payment
455,0,21,28.50,629.35,0,1,0,0,0,1,...,0,1,0,1,0,0,0,0,1,0
2301,1,11,111.40,1183.05,1,0,0,1,1,0,...,0,0,1,1,0,0,0,0,1,0
2255,1,4,69.35,261.65,1,0,0,1,0,0,...,0,0,1,1,0,0,0,0,1,0
4527,0,5,19.95,107.05,1,0,0,1,0,0,...,0,0,1,1,0,0,0,0,0,1
3812,0,72,65.65,4664.50,1,1,1,0,0,1,...,0,1,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,0,1,19.15,19.15,0,0,0,1,0,0,...,0,0,1,1,0,0,0,0,0,1
4510,0,3,80.50,232.35,0,0,0,1,0,0,...,0,0,1,1,0,0,0,0,1,0
3910,0,1,55.25,55.25,1,0,0,1,0,0,...,0,1,0,1,0,0,0,0,1,0
4820,0,24,74.80,1821.20,1,1,1,1,1,1,...,0,1,0,0,1,0,0,0,0,1


In [6]:
train.churn.value_counts(normalize = True)

0    0.73472
1    0.26528
Name: churn, dtype: float64

In [7]:
#################################################MODEL EVALUATION######################################################## 


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

In [9]:
train, val, test = w.wrangle_telco()

train = train.drop(columns='customer_id')
val = val.drop(columns='customer_id')
test = test.drop(columns='customer_id')

train.columns

train.shape, val.shape, test.shape

X_train = train.drop(columns='churn')
y_train = train.churn

X_val = val.drop(columns='churn')
y_val = val.churn

X_test = test.drop(columns='churn')
y_test = test.churn

len(y_train)

y_train.mode()

baseline = (y_train == 0).mean()
baseline

seed = 42

clf = DecisionTreeClassifier(max_depth=3, random_state=seed)

clf = clf.fit(X_train, y_train)

clf

clf.feature_importances_

X_train.columns

# make prediction on train obeservations
y_pred = clf.predict(X_train)
y_pred[:5]

y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[:5]

y_pred_proba = clf.predict_proba(X_val)
y_pred_proba[:5]

seed = 42
train_acc = []
val_acc = []
depth = []

for i in range(2, 3):

    clf = DecisionTreeClassifier(max_depth=i, random_state = seed)

    clf.fit(X_train, y_train)

    depth.append(i)

    train_acc.append(clf.score(X_train, y_train))

    val_acc.append(clf.score(X_val, y_val))

tree = pd.DataFrame({'max_depth' : depth,
                      'train_acc' : train_acc,
                      'val_acc' : val_acc})
tree

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train) * 100))

print('Accuracy of Decision Tree classifier on validation set: {:.2f}'
      .format(clf.score(X_val, y_val) * 100))

# confusion matrix
confusion_matrix(y_train, y_pred)

y_train.value_counts()

print(classification_report(y_train, y_pred))

clf.score(X_train, y_train)

clf.score(X_val, y_val)

y_train = train.churn

labels = sorted(y_train.unique())

model = pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

model.rename(
    columns={0: 'No Churn', 1: 'Churn'}, index={0: 'No Churn', 1: 'Churn'}, inplace=True,)

model

#.ravel returns a contigious array
TN, FP, FN, TP = confusion_matrix(y_train, y_pred).ravel()
TN, FP, FN, TP

def metrics(TP, TN, FP, FN):
    accuracy = round((TP + TN) / (TP + TN + FP + FN) * 100, 2)
    recall = round(TP / (TP + FN) * 100, 2)
    true_positive_rate = round(TP / (TP + FN) * 100, 2)
    false_positive_rate = round(FP / (FP + TN) *100, 2)
    true_negative_rate = round(TN / (TN + FP) * 100, 2)
    false_negative_rate = round(FN / (FN + TP) * 100, 2)
    precision = round(TP / (TP + FP) * 100, 2)
    f1_score = round((2 * (precision * recall) / (precision + recall)), 2)
    support_neg = TP + FN
    support_pos = FP + TN
    data = {
        'Metric': ['Accuracy', 'Recall', 'True Positive Rate', 'False Positive Rate', 'True Negative Rate', 'False Negative Rate', 'Precision', 'F1-Score', 'Support (0)', 'Support (1)'],
        'Value': [accuracy, recall, true_positive_rate, false_positive_rate, true_negative_rate, false_negative_rate, precision, f1_score, support_neg, support_pos]
    }
    metrics = pd.DataFrame(data, index=None)
    return metrics

conf_matrix = confusion_matrix(y_train, y_pred)
TN, FP, FN, TP = conf_matrix.ravel()

metrics = (TP, TN, FP, FN)
print(metrics)

Accuracy of Decision Tree classifier on training set: 75.25
Accuracy of Decision Tree classifier on validation set: 74.45
              precision    recall  f1-score   support

           0       0.80      0.96      0.87      2897
           1       0.73      0.32      0.45      1046

    accuracy                           0.79      3943
   macro avg       0.76      0.64      0.66      3943
weighted avg       0.78      0.79      0.76      3943

(336, 2773, 124, 710)


In [10]:
train.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,gender,partner,dependents,phone,multiple_lines,online_security,...,churn,dsl,fiber_optic,month_to_month,one_year,two_year,bank_transfer_payment,credit_card_payment,electronic_payment,mailed_payment
4604,0,6,20.65,109.3,1,0,0,1,0,0,...,0,0,1,1,0,0,0,0,0,1
4051,1,9,54.55,494.05,1,0,0,1,1,1,...,1,1,0,1,0,0,0,0,1,0
5159,0,1,45.15,45.15,0,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,1
2974,1,60,93.25,5774.55,1,1,1,1,1,1,...,0,0,1,1,0,0,0,0,1,0
2979,0,6,45.0,298.7,0,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0


In [11]:
train.columns.tolist()

['senior_citizen',
 'tenure',
 'monthly_charges',
 'total_charges',
 'gender',
 'partner',
 'dependents',
 'phone',
 'multiple_lines',
 'online_security',
 'online_backup',
 'device_protection',
 'tech_support',
 'streaming_tv',
 'streaming_movies',
 'paperless_billing',
 'churn',
 'dsl',
 'fiber_optic',
 'month_to_month',
 'one_year',
 'two_year',
 'bank_transfer_payment',
 'credit_card_payment',
 'electronic_payment',
 'mailed_payment']

In [12]:
train = train.drop(columns=['bank_transfer_payment', 'credit_card_payment'])


val = val.drop(columns=['bank_transfer_payment', 'credit_card_payment'])


test = test.drop(columns=['bank_transfer_payment', 'credit_card_payment'])


In [13]:
train.shape, val.shape, test.shape

((3943, 24), (1691, 24), (1409, 24))

In [14]:
y_train.mode()

0    0
Name: churn, dtype: uint8

In [15]:
y_train == 0

4604     True
4051    False
5159     True
2974     True
2979     True
        ...  
4993     True
3991     True
5229    False
6473     True
6377     True
Name: churn, Length: 3943, dtype: bool

In [16]:
baseline = (y_train == 0).mean()
baseline

0.7347197565305605

In [17]:
X_train.nunique()

senior_citizen              2
tenure                     73
monthly_charges          1336
total_charges            3735
gender                      2
partner                     2
dependents                  2
phone                       2
multiple_lines              2
online_security             2
online_backup               2
device_protection           2
tech_support                2
streaming_tv                2
streaming_movies            2
paperless_billing           2
dsl                         2
fiber_optic                 2
month_to_month              2
one_year                    2
two_year                    2
bank_transfer_payment       2
credit_card_payment         2
electronic_payment          2
mailed_payment              2
dtype: int64

In [18]:
seed = 42

clf = DecisionTreeClassifier(max_depth=5, random_state=seed)

clf = clf.fit(X_train, y_train)

clf

In [19]:
# make prediction on train obeservations
y_pred = clf.predict(X_train)
y_pred[:5]

array([0, 0, 0, 1, 0], dtype=uint8)

In [20]:
clf.score(X_train, y_train)

0.8092822723814355

In [21]:
clf.score(X_val, y_val)

0.7634535777646363

In [22]:
clf.feature_importances_

array([2.49187593e-04, 2.02647672e-01, 1.60743302e-01, 2.19329246e-02,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.90717822e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.63911866e-02, 0.00000000e+00, 0.00000000e+00, 1.57864228e-02,
       0.00000000e+00, 1.63239060e-02, 5.07175109e-01, 9.27549056e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.45676210e-02,
       0.00000000e+00])

In [23]:
fi = pd.DataFrame({'feature': X_train.columns,
              'importance': clf.feature_importances_})

fi.sort_values(by = 'importance', ascending = False)

Unnamed: 0,feature,importance
18,month_to_month,0.507175
1,tenure,0.202648
2,monthly_charges,0.160743
23,electronic_payment,0.044568
3,total_charges,0.021933
12,tech_support,0.016391
17,fiber_optic,0.016324
15,paperless_billing,0.015786
19,one_year,0.009275
7,phone,0.004907


In [24]:
seed = 42
train_acc = []
val_acc = []
depth = []

for i in range(2, 4):

    clf = DecisionTreeClassifier(max_depth=i, random_state = seed)

    clf.fit(X_train, y_train)

    depth.append(i)

    train_acc.append(clf.score(X_train, y_train))

    val_acc.append(clf.score(X_val, y_val))

In [25]:
tree = pd.DataFrame({'max_depth' : depth,
                      'train_acc' : train_acc,
                      'val_acc' : val_acc})
tree

Unnamed: 0,max_depth,train_acc,val_acc
0,2,0.752473,0.74453
1,3,0.788486,0.769367


In [26]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train) * 100))

print('Accuracy of Decision Tree classifier on validation set: {:.2f}'
      .format(clf.score(X_val, y_val) * 100))

Accuracy of Decision Tree classifier on training set: 78.85
Accuracy of Decision Tree classifier on validation set: 76.94


In [27]:
# confusion matrix
confusion_matrix(y_train, y_pred)

array([[2553,  344],
       [ 408,  638]])

In [28]:
#.ravel returns a contigious array
TN, FP, FN, TP = confusion_matrix(y_train, y_pred).ravel()
TN, FP, FN, TP

def metrics(TP, TN, FP, FN):
    accuracy = round((TP + TN) / (TP + TN + FP + FN) * 100, 2)
    recall = round(TP / (TP + FN) * 100, 2)
    true_positive_rate = round(TP / (TP + FN) * 100, 2)
    false_positive_rate = round(FP / (FP + TN) *100, 2)
    true_negative_rate = round(TN / (TN + FP) * 100, 2)
    false_negative_rate = round(FN / (FN + TP) * 100, 2)
    precision = round(TP / (TP + FP) * 100, 2)
    f1_score = round((2 * (precision * recall) / (precision + recall)), 2)
    support_neg = TP + FN
    support_pos = FP + TN
    data = {
        'Metric': ['Accuracy', 'Recall', 'True Positive Rate', 'False Positive Rate', 'True Negative Rate', 'False Negative Rate', 'Precision', 'F1-Score', 'Support (0)', 'Support (1)'],
        'Value': [accuracy, recall, true_positive_rate, false_positive_rate, true_negative_rate, false_negative_rate, precision, f1_score, support_neg, support_pos]
    }
    metrics = pd.DataFrame(data, index=None)
    return metrics

conf_matrix = confusion_matrix(y_train, y_pred)
TN, FP, FN, TP = conf_matrix.ravel()

metrics = metrics(TP, TN, FP, FN)
print(metrics)

                Metric    Value
0             Accuracy    80.93
1               Recall    60.99
2   True Positive Rate    60.99
3  False Positive Rate    11.87
4   True Negative Rate    88.13
5  False Negative Rate    39.01
6            Precision    64.97
7             F1-Score    62.92
8          Support (0)  1046.00
9          Support (1)  2897.00


In [29]:
y_train = train.churn

import pandas as pd

labels = sorted(y_train.unique())

model = pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

model.rename(
    columns={0: 'Not Churned', 1: 'Churned'}, index={0: 'Not Churned', 1: 'Churned'}, inplace=True,)

model

Unnamed: 0,Not Churned,Churned
Not Churned,2553,344
Churned,408,638


In [30]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87      2897
           1       0.65      0.61      0.63      1046

    accuracy                           0.81      3943
   macro avg       0.76      0.75      0.75      3943
weighted avg       0.81      0.81      0.81      3943



In [None]:
train, val, test = w.wrangle_telco()
train

train = train.drop(columns='customer_id')
val = val.drop(columns='customer_id')
test = test.drop(columns='customer_id')

def xy_split(data):
    X = data.drop(columns=['churn'])
    y = data['churn']
    return X, y

X_train, y_train = xy_split(train)
X_val, y_val = xy_split(val)
X_test, y_test = xy_split(test)

y_train.mode()

y_train == 0

def baseline(target):
    """
    The function calculates and prints the accuracy of a baseline model that always predicts the most
    frequent class in the target variable.
    
   :param target: The "target" parameter is likely a Pandas Series or DataFrame column that contains
    the true labels or values that we are trying to predict or classify. The "baseline" function appears
    to calculate the accuracy of a simple baseline model that always predicts the most common value in
    the "target" column
    """
    print(f'Baseline: {round(((target==target.value_counts().idxmax()).mean())*100,2)}% Accuracy')

baseline(train['churn'])

rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=seed)
rf.fit(X_train, y_train)

rf

# make prediction on train obeservations
y_pred = rf.predict(X_train)
y_pred[:5]

rf.score(X_train, y_train)

rf.score(X_val, y_val)

fi = pd.DataFrame({'feature': X_train.columns,
              'importance': rf.feature_importances_})

fi.sort_values(by = 'importance', ascending = False)

print('Accuracy of Random Forest classifier on training set: {:.2f}%'
      .format(rf.score(X_train, y_train) * 100))

print('Accuracy of Random Forest classifier on validation set: {:.2f}%'
      .format(rf.score(X_val, y_val) * 100))

y_train = train.churn

labels = sorted(y_train.unique())

model = pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

model.rename(
    columns={0: 'No Churn', 1: 'Churn'}, index={0: 'No Churn', 1: 'Churn'}, inplace=True,)

model

print(classification_report(y_train, y_pred))

#.ravel returns a contigious array
TN, FP, FN, TP = confusion_matrix(y_train, y_pred).ravel()
TN, FP, FN, TP

def metrics(TP, TN, FP, FN):
    accuracy = round((TP + TN) / (TP + TN + FP + FN) * 100, 2)
    recall = round(TP / (TP + FN) * 100, 2)
    true_positive_rate = round(TP / (TP + FN) * 100, 2)
    false_positive_rate = round(FP / (FP + TN) *100, 2)
    true_negative_rate = round(TN / (TN + FP) * 100, 2)
    false_negative_rate = round(FN / (FN + TP) * 100, 2)
    precision = round(TP / (TP + FP) * 100, 2)
    f1_score = round((2 * (precision * recall) / (precision + recall)), 2)
    support_neg = TP + FN
    support_pos = FP + TN
    data = {
        'Metric': ['Accuracy', 'Recall', 'True Positive Rate', 'False Positive Rate', 'True Negative Rate', 'False Negative Rate', 'Precision', 'F1-Score', 'Support (0)', 'Support (1)'],
        'Value': [accuracy, recall, true_positive_rate, false_positive_rate, true_negative_rate, false_negative_rate, precision, f1_score, support_neg, support_pos]
    }
    metrics = pd.DataFrame(data, index=None)
    return metrics

conf_matrix = confusion_matrix(y_train, y_pred)
TN, FP, FN, TP = conf_matrix.ravel()

metrics = metrics(TP, TN, FP, FN)
print(metrics)

for i in reversed(range(2, 6)):
    
    for n in (range(1, 6)):
        
        forest = RandomForestClassifier(min_samples_leaf=n, max_depth=i, random_state=42)
        forest = forest.fit(X_train, y_train)
        y_predictions = forest.predict(X_train)
        report = classification_report(y_train, y_predictions, output_dict=True)

        print(f'Tree with max depth of {i}', '\n\n\n')
        print(f'Minimum samples leaf of {n}', '\n')
        print(pd.DataFrame(report).T)
        print('\n\n')

seed = 42
train_accuracy = []
val_accuracy = []
depth = []
samples = []

for i in reversed(range(2, 23)):

    for n in range(1, 23):

        rf = RandomForestClassifier(min_samples_leaf=n, max_depth=i, random_state = seed)
    
        rf.fit(X_train, y_train)
    
        depth.append(i)
    
        samples.append(n)
    
        train_accuracy.append(rf.score(X_train, y_train))
    
        val_accuracy.append(rf.score(X_val, y_val))

r_forest = pd.DataFrame({'min_samples_leaf' : samples,
                        'train_accuracy' : train_accuracy,
                        'val_accuracy' : val_accuracy,
                        'max_depth' : depth,})
r_forest

print('Accuracy of Random Forest classifier on training set: {:.2f}%'
      .format(rf.score(X_train, y_train) * 100))

print('Accuracy of Random Forest classifier on validation set: {:.2f}%'
      .format(rf.score(X_val, y_val) * 100))

param_grid = {
    'max_depth': range(2, 22),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=seed), param_grid, cv=5)
grid_search.fit(X_train, y_train)

rf = grid_search.best_estimator_

print(param_grid, '\n\n')
print(rf)

print('Accuracy of Random Forest classifier on training set: {:.2f}'
      .format(rf.score(X_train, y_train) * 100))

print('Accuracy of Random Forest classifier on validation set: {:.2f}'
      .format(rf.score(X_val, y_val) * 100))

def perform_grid_search(X_train, y_train):
    param_grid = {
        'max_depth': range(2, 22),
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    grid_search = GridSearchCV(RandomForestClassifier(random_state=seed), param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    best_rf = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print("Grid Search Results:")
    print("Best Parameters:", best_params)
    print("Best Score: {:.2f}%".format(best_score * 100))

    return best_rf

best_rf_model = perform_grid_search(X_train, y_train)
print(best_rf_model)

Baseline: 73.47% Accuracy
Accuracy of Random Forest classifier on training set: 82.48%
Accuracy of Random Forest classifier on validation set: 78.36%
              precision    recall  f1-score   support

           0       0.84      0.94      0.89      2897
           1       0.75      0.51      0.61      1046

    accuracy                           0.82      3943
   macro avg       0.80      0.72      0.75      3943
weighted avg       0.82      0.82      0.81      3943

                Metric    Value
0             Accuracy    82.48
1               Recall    50.76
2   True Positive Rate    50.76
3  False Positive Rate     6.08
4   True Negative Rate    93.92
5  False Negative Rate    49.24
6            Precision    75.11
7             F1-Score    60.58
8          Support (0)  1046.00
9          Support (1)  2897.00
Tree with max depth of 5 



Minimum samples leaf of 1 

              precision    recall  f1-score      support
0              0.840853  0.939247  0.887331  2897.000000


In [None]:
X_train, y_train = xy_split(train)
X_val, y_val = xy_split(val)
X_test, y_test = xy_split(test)

baseline(train['churn'])

mms = MinMaxScaler()

X_train[['monthly_charges', 'total_charges']] = mms.fit_transform(X_train[['monthly_charges', 'total_charges']])
X_val[['monthly_charges', 'total_charges']] = mms.transform(X_val[['monthly_charges', 'total_charges']])
X_test[['monthly_charges', 'total_charges']] = mms.transform(X_test[['monthly_charges', 'total_charges']])


X_train_short = X_train.iloc[:, 1:]
X_val_short = X_val.iloc[:, 1:]


knn = KNeighborsClassifier(n_neighbors=30)

knn.fit(X_train_short, y_train)

knn.score(X_train_short, y_train), knn.score(X_val_short, y_val)

y_train = train.churn

labels = sorted(y_train.unique())

model = pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

model.rename(
    columns={0: 'No Churn', 1: 'Churn'}, index={0: 'Not Churn', 1: 'Churn'}, inplace=True,)

model

report = classification_report(y_train, y_pred)
print(report)

#.ravel returns a contigious array
TN, FP, FN, TP = confusion_matrix(y_train, y_pred).ravel()
TN, FP, FN, TP

def metrics(TP, TN, FP, FN):
    accuracy = round((TP + TN) / (TP + TN + FP + FN) * 100, 2)
    recall = round(TP / (TP + FN) * 100, 2)
    true_positive_rate = round(TP / (TP + FN) * 100, 2)
    false_positive_rate = round(FP / (FP + TN) *100, 2)
    true_negative_rate = round(TN / (TN + FP) * 100, 2)
    false_negative_rate = round(FN / (FN + TP) * 100, 2)
    precision = round(TP / (TP + FP) * 100, 2)
    f1_score = round((2 * (precision * recall) / (precision + recall)), 2)
    support_neg = TP + FN
    support_pos = FP + TN
    data = {
        'Metric': ['Accuracy', 'Recall', 'True Positive Rate', 'False Positive Rate', 'True Negative Rate', 'False Negative Rate', 'Precision', 'F1-Score', 'Support (0)', 'Support (1)'],
        'Value': [accuracy, recall, true_positive_rate, false_positive_rate, true_negative_rate, false_negative_rate, precision, f1_score, support_neg, support_pos]
    }
    metrics = pd.DataFrame(data, index=None)
    return metrics

conf_matrix = confusion_matrix(y_train, y_pred)
TN, FP, FN, TP = conf_matrix.ravel()

metrics = metrics(TP, TN, FP, FN)
print(metrics)

## Model 1 logistic reg

In [None]:
train, val, test = w.wrangle_telco()
train.head()

In [None]:
train = train.drop(columns='customer_id')
val = val.drop(columns='customer_id')
test = test.drop(columns='customer_id')

In [None]:
train.columns

In [None]:
X_train, y_train = xy_split(train)
X_val, y_val = xy_split(val)
X_test, y_test = xy_split(test)

In [None]:
def keep_cols(df):
    
    cols_to_keep = ['senior_citizen',
                    'tenure',
                    'month_to_month',
                    'dsl',
                    'bank_transfer_payment',
                    ]
    
    return df[cols_to_keep], df.churn

In [None]:
X_train, y_train = keep_cols(train)
X_val, y_val = keep_cols(val)

X_train.head()

In [None]:
baseline = (y_train == 0).mean()

In [None]:
train.churn.value_counts(normalize=True)

In [None]:
seed = 42

logit = LogisticRegression(random_state=seed, max_iter=400,
                          solver = 'liblinear', penalty ='l2')

logit.fit(X_train, y_train)

y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)


print("Baseline is", baseline * 100)

print("\nLogistic Regression using senior_citizen, tenure, month_to_month, dsl, bank_transfer_payment features.")

print('\n Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

In [None]:
seed = 42

logit = LogisticRegression(random_state=seed, max_iter=400,
                          solver = 'liblinear', penalty ='l2')

logit.fit(X_val, y_val)

y_pred = logit.predict(X_val)
y_pred_proba = logit.predict_proba(X_val)


print("Baseline is", baseline * 100)

print("\nLogistic Regression using senior_citizen, tenure, month_to_month, dsl, bank_transfer_payment, features.")

print('\nAccuracy of Logistic Regression classifier on validation set: {:.2f}'
     .format(logit.score(X_val, y_val)))

## Model 2 logistic reg

In [None]:
train, val, test = w.wrangle_telco()
train.head()

train = train.drop(columns='customer_id')
val = val.drop(columns='customer_id')
test = test.drop(columns='customer_id')

train.columns

X_train, y_tråain = xy_split(train)
X_val, y_val = xy_split(val)
X_test, y_test = xy_split(test)

baseline = (y_train == 0).mean()

train.churn.value_counts(normalize=True)

seed = 42

logit = LogisticRegression(random_state=seed, max_iter=400,
                          solver = 'liblinear', penalty ='l2')

logit.fit(X_train, y_train)

y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)


print("Baseline is", baseline * 100)
print("Logistic Regression using all features.")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

seed = 42

logit = LogisticRegression(random_state=seed, max_iter=400,
                          solver = 'liblinear', penalty ='l2')

logit.fit(X_val, y_val)

y_pred = logit.predict(X_val)
y_pred_proba = logit.predict_proba(X_val)


print("Baseline is", baseline * 100)
print("Logistic Regression using all features.")
print('Accuracy of Logistic Regression classifier on validation set: {:.2f}'
     .format(logit.score(X_val, y_val)))

## Model 3 log reg

In [None]:
train, val, test = w.wrangle_telco()
train.head()

In [None]:
train = train.drop(columns='customer_id')
val = val.drop(columns='customer_id')
test = test.drop(columns='customer_id')

In [None]:
train.columns

In [None]:
X_train, y_train = xy_split(train)
X_val, y_val = xy_split(val)
X_test, y_test = xy_split(test)

def keep_cols(df):
    
    cols_to_keep = ['senior_citizen',
                    'partner',
                    'dependents',
                    'paperless_billing',
                    'monthly_charges',
                    'total_charges',
                    'tenure',
                    'online_security',
                    'online_backup',
                    'device_protection',
                    'tech_support',
                    'streaming_tv',
                    'streaming_movies',
                    'one_year',
                    'two_year',
                    'fiber_optic',
                    'bank_transfer_payment',
                    'credit_card_payment',
                    'electronic_payment',
                    'mailed_payment']
    
    return df[cols_to_keep], df.churn

In [None]:
X_train, y_train = keep_cols(train)
X_val, y_val = keep_cols(val)

X_train.head()

In [None]:
baseline = (y_train == 0).mean()

In [None]:
train.churn.value_counts(normalize=True)

In [None]:
seed = 42

logit = LogisticRegression(random_state=seed, max_iter=400,
                          solver = 'liblinear', penalty ='l2')

logit.fit(X_train, y_train)

y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)


print("Baseline is", baseline * 100)

print("\nLogistic Regression using senior_citizen, partner, dependents, paperless_billing,")
print("monthly_charges,total_charges, tenure, online_security, online_backup, device_protection,")
print("tech_support,streaming_tv, streaming_movies, one_year, two_year, fiber_optic,")
print("bank_transfer_payment, credit_card_payment, electronic_payment, mailed_payment features.")

print('\nAccuracy of Logistic Regression classifier on training set: {:.2f}'

     .format(logit.score(X_train, y_train)))

In [None]:
seed = 42

logit = LogisticRegression(random_state=seed, max_iter=400,
                          solver = 'liblinear', penalty ='l2')

logit.fit(X_val, y_val)

y_pred = logit.predict(X_val)
y_pred_proba = logit.predict_proba(X_val)


print("Baseline is", baseline * 100)

print("\nLogistic Regression using senior_citizen, partner, dependents, paperless_billing,")
print("monthly_charges,total_charges, tenure, online_security, online_backup, device_protection,")
print("tech_support,streaming_tv, streaming_movies, one_year, two_year, fiber_optic,")
print("bank_transfer_payment, credit_card_payment, electronic_payment, mailed_payment features.")

print('\nAccuracy of Logistic Regression classifier on validation set: {:.2f}'
     .format(logit.score(X_val, y_val)))

### Model 4 log reg

In [None]:
train, val, test = w.wrangle_telco()
train.head()

In [None]:
train = train.drop(columns='customer_id')
val = val.drop(columns='customer_id')
test = test.drop(columns='customer_id')

In [None]:
train.columns

In [None]:
X_train, y_train = xy_split(train)
X_val, y_val = xy_split(val)
X_test, y_test = xy_split(test)

In [None]:
def keep_cols(df):
    
    cols_to_keep = ['senior_citizen',
                    'paperless_billing',
                    'monthly_charges',
                    'total_charges',
                    'tenure',
                    'one_year',
                    'two_year',
                    'fiber_optic',
                    'credit_card_payment']
    
    return df[cols_to_keep], df.churn

In [None]:
X_train, y_train = keep_cols(train)
X_val, y_val = keep_cols(val)

X_train.head()

In [None]:
baseline = (y_train == 0).mean()

In [None]:
train.churn.value_counts(normalize=True)

In [None]:
seed = 42

logit = LogisticRegression(random_state=seed, max_iter=400,
                          solver = 'liblinear', penalty ='l2')

logit.fit(X_train, y_train)

y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)


print("Baseline is", baseline * 100)

print("\nLogistic Regression using senior_citizen, paperless_billing, monthly_charges, total_charges,")
print("tenure,one_year, two_year, fiber_optic,credit_card_payment features.")
      
print('\nAccuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

In [None]:
seed = 42

logit = LogisticRegression(random_state=seed, max_iter=400,
                          solver = 'liblinear', penalty ='l2')

logit.fit(X_val, y_val)

y_pred = logit.predict(X_val)
y_pred_proba = logit.predict_proba(X_val)


print("Baseline is", baseline * 100)

print("\nLogistic Regression using senior_citizen, tenure, month_to_month, dsl, bank_transfer_payment, churn.")

print('\nAccuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_val, y_val)))