Goal: identify customers who are likely to churn before they leave, so that we can reach out to them. Err on the side of flagging too many customers: aim to capture as many potential churners as possible, subject to a limit of a 3:1 ratio of false positives to true positives.

The total net value of a customer is ~$200. The estimated revenue lost to churn is ~$80 per customer, and the cost of outreach is ~$20 per customer. A 3:1 ratio of false positives to true positives is approximately net neutral (80*TP - 20*(TP + 3*TP) = 0), and lower ratios lead to more profit. The maximum value lies in maximizing the churn prevented while minimizing the cost of outreach to customers misidentified as churners. The formula for profit/loss on this project is: 80*TP - 20*(TP+FP)

In [None]:
import pandas as pd
import numpy as np
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv('bigml_59c28831336c6604c800002a.csv')
# Print column names, dtypes and non-null counts as a first sanity check.
df.info()

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Share of customers that churned. Taking the mean of the 'churn' column avoids
# both the hard-coded row count (3333) and the positional value_counts()[1]
# lookup, which relies on the ordering of the counts.
# NOTE(review): assumes 'churn' was parsed as a boolean column -- confirm with df.info().
print('Churn rate: ',round(df['churn'].mean(),3))

In [None]:
# One-hot encode the categorical columns (the 'churn' target included).
# drop_first=True drops the first level of every encoded column.
categorical = [
    'state',
    'area code',
    'international plan',
    'voice mail plan',
    'churn',
]
df_encoded = pd.get_dummies(data=df, drop_first=True, columns=categorical)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_curve, auc
from imblearn.over_sampling import SMOTE
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score

In [None]:
# Target: the encoded churn flag. Features: everything else except the
# 'phone number' column, which is dropped together with the target.
y = df_encoded['churn_True']
X = df_encoded.drop(columns=['churn_True', 'phone number'])

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=1
)

In [None]:
def model_eval(model_name,X_test,y_test,revenue_per_save=80,outreach_cost=20):
    """Return the projected profit (or loss) of acting on a model's churn predictions.

    Profit is ``revenue_per_save * TP - outreach_cost * (TP + FP)``: every
    flagged customer costs one outreach, and only correctly flagged churners
    recover revenue.

    Parameters
    ----------
    model_name : fitted estimator
        Any object exposing ``predict(X)`` (despite the name, this is the
        model itself, not a string).
    X_test : array-like
        Features passed straight to ``model_name.predict``.
    y_test : array-like
        True labels. Must be binary and encoded as 0/1 or False/True, with
        the truthy value meaning "churned".
    revenue_per_save : number, default 80
        Revenue retained per correctly identified churner.
    outreach_cost : number, default 20
        Cost of contacting one flagged customer.

    Returns
    -------
    number
        Profit (positive) or loss (negative), rounded to 2 decimals.
    """
    predicted = np.asarray(model_name.predict(X_test)).astype(bool)
    actual = np.asarray(y_test).astype(bool)

    # Count the two cells of the confusion matrix that matter directly. Unlike
    # indexing a confusion matrix, this also works when only one class is
    # present in the data (the matrix would collapse to 1x1).
    true_pos = int(np.count_nonzero(predicted & actual))
    false_pos = int(np.count_nonzero(predicted & ~actual))

    profit_loss = round(revenue_per_save*true_pos - outreach_cost*(true_pos+false_pos),2)
    return profit_loss

In [None]:
# Baseline logistic regression on the raw (unscaled) features, with no intercept
# and a very large C (i.e. very weak regularisation).
logreg_base = LogisticRegression(random_state=42,fit_intercept=False, C=1e10, solver='liblinear')
logreg_base.fit(X_train.values,y_train) # added .values to handle labeling error

# model_eval only returns the profit figure; print it so the cell actually
# reports a result instead of silently discarding it.
print(f'Profit(loss): ${model_eval(logreg_base,X_test.values,y_test)}')

In [None]:
# Scaling everything to a 0-1 range

scaler = MinMaxScaler()
# Fit the scaler on the training split only, then reuse the fitted scaler on
# the test split so no test information influences the scaling.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Same logistic regression as the baseline, trained on the min-max scaled features.
logreg = LogisticRegression(random_state=42,fit_intercept=False, C=1e10, solver='liblinear')
logreg.fit(X_train_scaled,y_train)
# Print the returned profit; calling model_eval with a trailing ';' showed nothing.
print(f'Profit(loss): ${model_eval(logreg,X_test_scaled,y_test)}')

In [None]:
# Now let's compare a few different ratios of minority class to majority class
ratios = [0.25, 0.33, 0.5, 0.7, 1]
names = ['0.25', '0.33','0.5','0.7','even']
colors = sns.color_palette('Set2')

for name, ratio in zip(names, ratios):
    # Oversample the TRAINING data only. The test set must keep its real class
    # balance, so it is never passed through SMOTE (the previous version
    # resampled it too and then never used the result).
    smote = SMOTE(sampling_strategy=ratio, random_state=7)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
    logreg = LogisticRegression(fit_intercept=False, C=1e10, solver ='liblinear')
    model_log = logreg.fit(X_train_resampled, y_train_resampled)
    print(f'Ratio: {name}')
    # Report the profit on the untouched test set for this ratio.
    print(f'Profit(loss): ${model_eval(logreg,X_test_scaled,y_test)}')
    print('\n')

In [None]:
# Finer sweep of the SMOTE minority/majority ratio between 0.4 and 0.6.
ratios = np.arange(.4,.6,.01)

for ratio in ratios:
    # Oversample the training data only; the test set keeps its real class
    # balance and is evaluated as-is.
    smote = SMOTE(sampling_strategy=ratio, random_state=7)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
    logreg = LogisticRegression(fit_intercept=False, C=1e10, solver ='liblinear')
    model_log = logreg.fit(X_train_resampled, y_train_resampled)
    # np.arange accumulates float error, so round the ratio for display.
    print(f'Ratio: {round(ratio,3)}')
    print(f'Profit(loss): ${model_eval(logreg,X_test_scaled,y_test)}')
    print('\n')

In [None]:
# Logistic regression trained on SMOTE-resampled data at a 0.52
# minority/majority ratio, evaluated on the untouched test set.
smote = SMOTE(sampling_strategy=.52, random_state=7)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
logreg_resample = LogisticRegression(fit_intercept=False, C=1e10, solver ='liblinear')
logreg_resample.fit(X_train_resampled, y_train_resampled)

# Print the returned profit; calling model_eval with a trailing ';' showed nothing.
print(f'Profit(loss): ${model_eval(logreg_resample,X_test_scaled,y_test)}')

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
# Baseline decision tree with the entropy split criterion and otherwise default
# settings, trained on the unscaled training features.
clf = DecisionTreeClassifier(criterion='entropy', random_state=42)

clf.fit(X_train, y_train)

In [None]:

# Draw the fitted baseline tree. Feature names are passed so split nodes show
# column names instead of positional x[i] labels, and the axes created here is
# handed to plot_tree explicitly rather than relying on the current axes.
# Names are converted to plain lists, the form accepted across scikit-learn versions.
fig, axes = plt.subplots(nrows = 1,ncols = 1, figsize = (6,6), dpi=900)
tree.plot_tree(clf,
               feature_names=list(X.columns),
               class_names=list(np.unique(y).astype('str')),
               filled = True,
               ax=axes)
plt.show()

In [None]:
# Print the baseline tree's profit; calling model_eval with a trailing ';' showed nothing.
print(f'Profit(loss): ${model_eval(clf,X_test,y_test)}')

In [None]:
# Confusion matrix of the baseline tree's predictions on the test split.
y_hat_test = clf.predict(X_test)
conf_mat = confusion_matrix(y_test, y_hat_test)
ConfusionMatrixDisplay(conf_mat).plot();

In [None]:
# Compare alternative split criteria against the entropy baseline.
clf_g = DecisionTreeClassifier(criterion='gini',random_state=42)
clf_g.fit(X_train, y_train)
print(f"gini Profit(loss): ${model_eval(clf_g,X_test,y_test)}")

# NOTE: scikit-learn documents 'log_loss' and 'entropy' as the same criterion
# (Shannon information gain), so this tree is expected to match the entropy
# baseline rather than provide an independent comparison.
clf_ll = DecisionTreeClassifier(criterion='log_loss',random_state=42)
clf_ll.fit(X_train, y_train)
print(f"log_loss Profit(loss): ${model_eval(clf_ll,X_test,y_test)}")
# No help, stick with 'entropy'

In [None]:
# Sweep the SMOTE minority/majority ratio for an entropy decision tree.
ratios = np.arange(.2,1.1,.1)

for ratio in ratios:
    # np.arange accumulates float error (the last value can land slightly above 1),
    # so round before handing the ratio to SMOTE and before printing it.
    ratio = round(ratio,3)
    # Oversample the training data only; the test set keeps its real class
    # balance and is evaluated as-is.
    smote = SMOTE(sampling_strategy=ratio, random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
    clf_resamp_temp = DecisionTreeClassifier(criterion='entropy',random_state=7)
    clf_resamp_temp.fit(X_train_resampled, y_train_resampled)
    print(f'Ratio: {ratio}')
    print(f'Profit(loss): ${model_eval(clf_resamp_temp,X_test_scaled,y_test)}')
    print('\n')

# Slight improvement in recall, but not better than baseline in terms of profit.

In [None]:
SEED=37

def hypertuning(parameter,X_train,y_train,X_test,y_test,start,stop,increment):
    """Sweep one DecisionTreeClassifier hyperparameter and plot AUC and profit.

    For every value in ``np.arange(start, stop, increment)`` an entropy tree
    (seeded with the module-level ``SEED``) is fitted on the training data.
    Train/test ROC AUC and the test-set profit from ``model_eval`` are
    collected and drawn on a shared x-axis (AUC on the left y-axis, profit on
    the right).

    Parameters
    ----------
    parameter : str
        Name of the DecisionTreeClassifier constructor argument to vary.
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Evaluation features and labels.
    start, stop, increment : number
        Arguments forwarded to ``np.arange``; ``stop`` is exclusive.

    Returns
    -------
    None
        The function only displays a matplotlib figure.
    """
    parameter_range = np.arange(start,stop,increment)
    test_aucs=[]
    train_aucs=[]
    profit=[]
    for value in parameter_range:
        # Build the keyword arguments dynamically so that any constructor
        # parameter can be swept through the same function.
        kwargs = {parameter: value, 'random_state':SEED,'criterion':'entropy'}
        dt_temp = DecisionTreeClassifier(**kwargs)
        dt_temp.fit(X_train,y_train)

        # ROC AUC is a ranking metric: score it with the positive-class
        # probability rather than hard 0/1 predictions, which would collapse
        # the ROC curve to a single operating point.
        train_scores = dt_temp.predict_proba(X_train)[:, 1]
        test_scores = dt_temp.predict_proba(X_test)[:, 1]
        train_aucs.append(round(roc_auc_score(y_train, train_scores),3))
        test_aucs.append(round(roc_auc_score(y_test, test_scores),3))
        profit.append(model_eval(dt_temp,X_test,y_test))

    fig, ax1 = plt.subplots()
    ax1.plot(parameter_range,test_aucs,label='Test')
    ax1.plot(parameter_range,train_aucs,label='Train')
    # Profit uses a different scale, so it gets its own y-axis.
    ax2 = ax1.twinx()
    ax2.plot(parameter_range,profit,label='Profit',color='g')
    ax1.set_xlabel(parameter)
    ax1.set_ylabel('AUC')
    ax2.set_ylabel('Profit ($)')
    # Merge the handles of both axes into a single legend.
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2)
    plt.show();

In [None]:
# Sweep max_depth over 1..32 (the stop value 33 is exclusive).
hypertuning("max_depth",X_train,y_train,X_test,y_test,1,33,1)

In [None]:
# Sweep min_samples_split with float values from 0.0001 up to (excluding) 0.25
# in steps of 0.01.
hypertuning("min_samples_split",X_train,y_train,X_test,y_test,0.0001,.25,.01)

In [None]:
# Sweep min_samples_leaf over 1..24 (the stop value 25 is exclusive).
hypertuning("min_samples_leaf",X_train,y_train,X_test,y_test,1,25,1)

In [None]:
# Sweep max_features from 1 up to the actual number of feature columns. The
# upper bound is taken from the data instead of a hard-coded 75, because an
# integer max_features larger than the number of columns is not meaningful
# (and may be rejected, depending on the scikit-learn version).
hypertuning("max_features",X_train,y_train,X_test,y_test,1,X_train.shape[1]+1,1)

In [None]:
# Final decision tree: entropy criterion with max_depth=7, seeded with SEED.
dt_final = DecisionTreeClassifier(random_state=SEED,criterion='entropy',max_depth=7)
dt_final.fit(X_train,y_train)
# No trailing ';' here, so the notebook displays the returned profit figure.
model_eval(dt_final,X_test,y_test)

In [None]:
# Confusion matrix of the final tree's predictions on the test split.
y_hat_test = dt_final.predict(X_test)
conf_mat = confusion_matrix(y_test, y_hat_test)
ConfusionMatrixDisplay(conf_mat).plot();

In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import get_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# scikit-learn ships a ready-made negative log loss scorer. Fetching it by name
# avoids building one with make_scorer, which previously referenced log_loss
# without importing it (NameError) and relied on the version-dependent
# needs_proba argument.
neg_log_loss = get_scorer('neg_log_loss')

# Instantiate the model (same as previous example)
baseline_model = LogisticRegression(random_state=42)

# Instantiate a splitter object; its number of folds sizes the score array,
# so the two cannot drift apart.
kfold = StratifiedKFold()
kfold_scores = np.empty(kfold.get_n_splits())

for fold, (train_index, val_index) in enumerate(kfold.split(X_train, y_train)):
    # Extract train and validation subsets using the provided indices
    X_t, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_t, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

    # Clone the provided model and fit it on the train subset
    temp_model = clone(baseline_model)
    temp_model.fit(X_t, y_t)

    # Evaluate the fitted clone on the validation subset
    kfold_scores[fold] = neg_log_loss(temp_model, X_val, y_val)

# The scorer returns negated log loss (greater is better); flip the sign to
# report the mean log loss itself.
-(kfold_scores.mean())

In [None]:
# Manual sweep of min_samples_leaf over 1..24 for an entropy decision tree,
# plotting only the test-set profit for each value.
parameter_range = np.arange(1,25,1)
temp_range=[]
test_aucs=[]
train_aucs=[]
profit=[]
for i in parameter_range:
    dt_temp = DecisionTreeClassifier(random_state=SEED,criterion='entropy', min_samples_leaf=i)
    dt_temp.fit(X_train,y_train)

    # ROC AUC is a ranking metric: score it with the positive-class probability
    # rather than hard 0/1 predictions.
    auc_train_temp = round(roc_auc_score(y_train, dt_temp.predict_proba(X_train)[:, 1]),3)
    auc_test_temp = round(roc_auc_score(y_test, dt_temp.predict_proba(X_test)[:, 1]),3)

    temp_range.append(i)
    test_aucs.append(auc_test_temp)
    train_aucs.append(auc_train_temp)
    profit.append(model_eval(dt_temp,X_test,y_test))

# The AUC lists are collected for reference but only profit is plotted here.
plt.plot(temp_range,profit)
plt.xlabel('min_samples_leaf')
plt.ylabel('Profit ($)');