In [2]:
# import the required packages
import pandas as pd 
import numpy as np
import random

import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, recall_score, precision_score, confusion_matrix

from datetime import datetime, timedelta

import warnings
warnings.filterwarnings('ignore')

In [6]:
# import data
basetable = pd.read_csv('basetable_churn.csv').drop(columns=['Unnamed: 0'])

In [26]:
basetable.head()

Unnamed: 0,EXTRELNO,Churn,Recency,Total_Monetary,Average_Monetary,Frequency,used_order,used_initiative,used_bank,used_unknown,medium_cat_electronic,medium_cat_face_to_face,medium_cat_paper,medium_cat_unknown,complaint,Incoming,Outgoing,comu_count,LANGUACODE,province
0,26414,0,15,2786.13,19.900929,140,138,0,0,2,2,0,19,32,0,1,52,53,NL,West Flanders
1,26419,0,29,3108.99,56.527091,55,0,2,48,5,5,0,19,32,0,1,55,56,NL,Antwerp
2,26430,1,15,2828.28,19.917465,142,137,0,3,2,2,0,21,33,1,1,55,56,NL,Antwerp
3,26430,0,15,2828.28,19.917465,142,137,0,3,2,2,0,21,33,1,1,55,56,NL,Antwerp
4,26431,0,190,3294.57,205.910625,16,0,0,13,3,1,0,18,33,0,0,52,52,NL,West Flanders


In [27]:
basetable.dtypes

EXTRELNO                     int64
Churn                        int64
Recency                      int64
Total_Monetary             float64
Average_Monetary           float64
Frequency                    int64
used_order                   int64
used_initiative              int64
used_bank                    int64
used_unknown                 int64
medium_cat_electronic        int64
medium_cat_face_to_face      int64
medium_cat_paper             int64
medium_cat_unknown           int64
complaint                    int64
Incoming                     int64
Outgoing                     int64
comu_count                   int64
LANGUACODE                  object
province                    object
dtype: object

### Data preprocessing
1. split train, val and test data set
2. Missing value
3. Catergorical variables encoder
4. Numerical variables standarization

1. Split train, val and test data

In [3]:
# split data set
train, val_test = train_test_split(basetable, test_size=0.3, random_state=42)
# split val and test 
val,test = train_test_split(val_test, test_size=0.5, random_state=42)

In [29]:
# check shape
print(train.shape)
print(val.shape)
print(test.shape)

(6474, 20)
(1387, 20)
(1388, 20)


2. Check missing values

In [15]:
# check missing value
train.isna().sum()

EXTRELNO                   0
Churn                      0
Recency                    0
Total_Monetary             0
Average_Monetary           0
Frequency                  0
used_order                 0
used_initiative            0
used_bank                  0
used_unknown               0
medium_cat_electronic      0
medium_cat_face_to_face    0
medium_cat_paper           0
medium_cat_unknown         0
complaint                  0
Incoming                   0
Outgoing                   0
comu_count                 0
LANGUACODE                 0
province                   0
dtype: int64

In [16]:
val.isna().sum()

EXTRELNO                   0
Churn                      0
Recency                    0
Total_Monetary             0
Average_Monetary           0
Frequency                  0
used_order                 0
used_initiative            0
used_bank                  0
used_unknown               0
medium_cat_electronic      0
medium_cat_face_to_face    0
medium_cat_paper           0
medium_cat_unknown         0
complaint                  0
Incoming                   0
Outgoing                   0
comu_count                 0
LANGUACODE                 0
province                   0
dtype: int64

In [18]:
test.isnull().sum()

EXTRELNO                   0
Churn                      0
Recency                    0
Total_Monetary             0
Average_Monetary           0
Frequency                  0
used_order                 0
used_initiative            0
used_bank                  0
used_unknown               0
medium_cat_electronic      0
medium_cat_face_to_face    0
medium_cat_paper           0
medium_cat_unknown         0
complaint                  0
Incoming                   0
Outgoing                   0
comu_count                 0
LANGUACODE                 0
province                   0
dtype: int64

3. Encode categorical variables

In [56]:
# encode lanuagecode and province
language_encoder = LabelEncoder()
province_encoder = LabelEncoder()

language_encoder = language_encoder.fit(train['LANGUACODE'])
province_encoder = province_encoder.fit(train['province'])

train['LANGUACODE'] = language_encoder.transform(train['LANGUACODE'])
val['LANGUACODE'] = language_encoder.transform(val['LANGUACODE'])
test['LANGUACODE'] = language_encoder.transform(test['LANGUACODE'])

train['province'] = province_encoder.transform(train['province'])
val['province'] = province_encoder.transform(val['province'])
test['province'] = province_encoder.transform(test['province'])

4. Standarized numerical variabels

In [None]:
train.columns

Index(['EXTRELNO', 'Churn', 'Recency', 'Total_Monetary', 'Average_Monetary',
       'Frequency', 'used_order', 'used_initiative', 'used_bank',
       'used_unknown', 'medium_cat_electronic', 'medium_cat_face_to_face',
       'medium_cat_paper', 'medium_cat_unknown', 'complaint', 'Incoming',
       'Outgoing', 'comu_count', 'LANGUACODE', 'province'],
      dtype='object')

In [57]:
# drop categorical variables
numerical_columns = [col for col in train.columns if col not in ['EXTRELNO', 'Churn', 'LANGUACODE', 'province']]

# standarized 
scaler = MinMaxScaler()
scaler= scaler.fit(train[numerical_columns])

train[numerical_columns] = scaler.transform(train[numerical_columns])
val[numerical_columns] = scaler.transform(val[numerical_columns])
test[numerical_columns] = scaler.transform(test[numerical_columns])

In [82]:
# split independent and dependent variables
train_ind = train.drop(columns=['EXTRELNO','Churn'])
train_dep = train['Churn']

val_ind = val.drop(columns=['EXTRELNO','Churn'])
val_dep = val['Churn']

test_ind = test.drop(columns=['EXTRELNO','Churn'])
test_dep = test['Churn']

### Modeling

Churn task is a binary classificaiton task. Thus we can use a range of classifcation methods like
1. Logistic regression
2. Decison tree
3. Random forest
4. XGboost
5. SVM
6. MLP

In [80]:
# Logistic regression
lr = LogisticRegression()
lr = lr.fit(train_ind,train_dep)
# predict 
lr_preds = lr.predict(val_ind)
lr_probs = lr.predict_proba(val_ind)   # default threshold = 0.5

# evaluate
lr_accuracy = accuracy_score(val_dep, lr_preds)
lr_accuracy

0.9516943042537851

In [None]:
# churn rate
churn_rate = lr_probs[:,1]
# predition based on threshold
lr_prediction = pd.DataFrame(churn_rate).rename(columns={0:'churn_rate'})
lr_prediction['prediction'] = np.where(lr_prediction['churn_rate']>0.6,1,0)
lr_accuracy_2 = accuracy_score(val_dep, lr_prediction['prediction'])
lr_accuracy_2

0.9516943042537851

In [91]:
# Decision tree
dt = DecisionTreeClassifier()
# fit model
dt = dt.fit(train_ind,train_dep)
# prediction
dt_preds = dt.predict(val_ind)
dt_probs = dt.predict_proba(val_ind)
# evaluate
dt_accuracy = accuracy_score(val_dep, dt_preds)
dt_accuracy

0.8918529199711608

In [93]:
# Randomforest 
rf = RandomForestClassifier()
# fit
rf = rf.fit(train_ind,train_dep)
# prediction
rf_preds = rf.predict(val_ind)
rf_probs = rf.predict_proba(val_ind)
# evaluate
rf_accuracy = accuracy_score(val_dep, rf_preds)
rf_accuracy

0.9401586157173756

In [None]:
# XGboost
# Use "hist" for constructing the trees, with early stopping enabled.
XGb = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=2)
# Fit the model, test sets are used for early stopping.
XGb = XGb.fit(train_ind, train_dep, eval_set=[(val_ind, val_dep)])
# prediction
XGb_preds = XGb.predict(val_ind)
XGb_probs = XGb.predict_proba(val_ind)
# evaluate
XGb_accuracy = accuracy_score(val_dep, XGb_preds)
XGb_accuracy

[0]	validation_0-logloss:0.21961
[1]	validation_0-logloss:0.20781
[2]	validation_0-logloss:0.19956
[3]	validation_0-logloss:0.19345
[4]	validation_0-logloss:0.19041
[5]	validation_0-logloss:0.18817
[6]	validation_0-logloss:0.18739
[7]	validation_0-logloss:0.18724
[8]	validation_0-logloss:0.18737


0.9516943042537851

In [101]:
# SVM
SVM = LinearSVC()
SVM = SVM.fit(train_ind, train_dep)
# prediction
SVM_preds = SVM.predict(val_ind)
# evaluate
SVM_accuracy = accuracy_score(val_dep, SVM_preds)
SVM_accuracy

0.9516943042537851

In [102]:
# MLP 
MLP = MLPClassifier()
MLP = MLP.fit(train_ind, train_dep)
# prediction
MLP_preds = MLP.predict(val_ind)
MLP_probs = MLP.predict_proba(val_ind)
# evaluate
MLP_accuracy = accuracy_score(val_dep, MLP_preds)
MLP_accuracy

0.9516943042537851

### Hyperparameters tuning

In [23]:
# split train, test
train, test = train_test_split(basetable,test_size=0.3, random_state=45)

# encode lanuagecode and province
language_encoder = LabelEncoder()
province_encoder = LabelEncoder()

language_encoder = language_encoder.fit(train['LANGUACODE'])
province_encoder = province_encoder.fit(train['province'])

train['LANGUACODE'] = language_encoder.transform(train['LANGUACODE'])
test['LANGUACODE'] = language_encoder.transform(test['LANGUACODE'])

train['province'] = province_encoder.transform(train['province'])
test['province'] = province_encoder.transform(test['province'])


In [24]:
# drop categorical variables
numerical_columns = [col for col in train.columns if col not in ['EXTRELNO', 'Churn', 'LANGUACODE', 'province']]

# standarized 
scaler = MinMaxScaler()
scaler= scaler.fit(train[numerical_columns])

train[numerical_columns] = scaler.transform(train[numerical_columns])
test[numerical_columns] = scaler.transform(test[numerical_columns])

In [25]:
# split independent and dependent variables
train_ind = train.drop(columns=['EXTRELNO','Churn'])
train_dep = train['Churn']

test_ind = test.drop(columns=['EXTRELNO','Churn'])
test_dep = test['Churn']

### Hyperparameters tuning

In [26]:
# Logistic regression model
lr = LogisticRegression()
# hyperparameters
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l1','l2']
c_values = [1.0, 0.1, 0.01]
# model
lr_grid = dict(model = lr, parameters = dict(solver=solvers,penalty=penalty,C=c_values))
lr_par = dict(solver=solvers,penalty=penalty,C=c_values)

# Decison tree
dt = DecisionTreeClassifier()
# hyperparameters
max_depth=[10,15,20]
min_samples_split=[2,3,5]
max_leaf_nodes=[15,20,50]
dt_grid = dict(model = dt, parameters = dict(max_depth=max_depth,min_samples_split=min_samples_split,max_leaf_nodes=max_leaf_nodes))
dt_par = dict(max_depth=max_depth,min_samples_split=min_samples_split,max_leaf_nodes=max_leaf_nodes)

# Random Forest
rf = RandomForestClassifier()
n_estimators=[100,150]
criterion=['gini']
max_depth=[10,20]
min_samples_split=[2,3,5]
rf_grid = dict(model = rf, parameters = dict(n_estimators=n_estimators,criterion=criterion,max_depth=max_depth,min_samples_split=min_samples_split))
rf_par = dict(n_estimators=n_estimators,criterion=criterion,max_depth=max_depth,min_samples_split=min_samples_split)

# SVM
SVM = LinearSVC()
penalty=['l2','l1']
loss=['squared_hinge']
C=[1.0, 0.1, 0.01]
svm_grid = dict(model = SVM, parameters = dict(penalty=penalty,loss=loss,C=C))
svm_par = dict(penalty=penalty,loss=loss,C=C)

# MLP
MLP = MLPClassifier()
hidden_layer_sizes=[50,100,150]
batch_size=[10, 20]
learning_rate=[0.01,0.001,0.05]
MLP_grid = dict(model = MLP, parameters = dict(hidden_layer_sizes=hidden_layer_sizes,batch_size=batch_size,learning_rate_init=learning_rate))
MLP_par = dict(hidden_layer_sizes=hidden_layer_sizes,batch_size=batch_size,learning_rate_init=learning_rate)


In [163]:

grid = dict(model = [lr,dt,rf,SVM,MLP],parameters = [lr_par,dt_par,rf_par,svm_par,MLP_par])
for i in range(0,len(grid['model'])):
    print(grid["model"][i],grid["parameters"][i])

LogisticRegression() {'solver': ['newton-cg', 'lbfgs', 'liblinear'], 'penalty': ['l1', 'l2'], 'C': [1.0, 0.1, 0.01]}
DecisionTreeClassifier() {'max_depth': [10, 15, 20], 'min_samples_split': [2, 3, 5], 'max_leaf_nodes': [15, 20, 50]}
RandomForestClassifier() {'n_estimators': [100, 150], 'criterion': ['gini'], 'max_depth': [10, 20], 'min_samples_split': [2, 3, 5]}
LinearSVC() {'penalty': ['l2', 'l1'], 'loss': ['squared_hinge'], 'C': [1.0, 0.1, 0.01]}
MLPClassifier() {'hidden_layer_sizes': [50, 100, 150], 'batch_size': [10, 20], 'learning_rate_init': [0.01, 0.001, 0.05]}


In [None]:
# set a loop to run and get the test accuracy
grid = dict(model = [lr,dt,rf,SVM,MLP],parameters = [lr_par,dt_par,rf_par,svm_par,MLP_par])
for i in range(0,len(grid['model'])):
    grid_search = GridSearchCV(estimator= grid["model"][i], param_grid=grid["parameters"][i], n_jobs=-1, cv=5, scoring='accuracy',error_score=0)
    grid_result = grid_search.fit(train_ind, train_dep)
    prediction = grid_result.predict(test_ind)
    accuracy = accuracy_score(test_dep, prediction)
    print(grid["model"][i],accuracy)

LogisticRegression() 0.9567567567567568
DecisionTreeClassifier() 0.9553153153153153
RandomForestClassifier() 0.9563963963963964
LinearSVC() 0.9567567567567568
MLPClassifier() 0.9567567567567568


In [None]:
# define a funciton to get the hperparameters
def cv(grid, train_ind,train_dep,test_ind,threshold):
    cv_model = GridSearchCV(estimator= grid['model'], param_grid=grid["parameters"], n_jobs=-1, cv=5, scoring='accuracy',error_score=0)
    cv_model = cv_model.fit(train_ind, train_dep)

    # Get predictions based on probabilities
    probabilities = cv_model.predict_proba(test_ind)[:, 1]  # Get probabilities for the positive class
    predictions = (probabilities >= threshold).astype(int) 

    accuracy = accuracy_score(test_dep, predictions)
    return cv_model.best_estimator_,cv_model.best_params_,accuracy

# run the model
parameters= [lr_grid,dt_grid,rf_grid,svm_grid,MLP_grid]
for i in parameters:
    model, hyperparameters,accuracy = cv(i,train_ind,train_dep,test_ind)
    print(model, hyperparameters,accuracy)

LogisticRegression(penalty='l1', solver='liblinear') {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'} 0.9567567567567568
DecisionTreeClassifier(max_depth=10, max_leaf_nodes=15, min_samples_split=3) {'max_depth': 10, 'max_leaf_nodes': 15, 'min_samples_split': 3} 0.9553153153153153
RandomForestClassifier(max_depth=10, min_samples_split=5, n_estimators=150) {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 150} 0.956036036036036
LinearSVC() {'C': 1.0, 'loss': 'squared_hinge', 'penalty': 'l2'} 0.9567567567567568
MLPClassifier(batch_size=10, hidden_layer_sizes=50) {'batch_size': 10, 'hidden_layer_sizes': 50, 'learning_rate_init': 0.001} 0.9567567567567568


In [None]:
# define a funciton to get the hperparameters and define threshold
def cv(grid, train_ind,train_dep,test_ind, threshold=0.5):
    cv_model = GridSearchCV(estimator= grid['model'], param_grid=grid["parameters"], n_jobs=-1, cv=5, scoring='accuracy',error_score=0)
    cv_model = cv_model.fit(train_ind, train_dep)

    # Check if the model supports predict_proba                    SVM has no predict_proba attribute
    if hasattr(cv_model.best_estimator_, "predict_proba"):
        probabilities = cv_model.predict_proba(test_ind)[:, 1]
        predictions = (probabilities >= threshold).astype(int)
    else:
        # If predict_proba is not available, use predict
        predictions = cv_model.predict(test_ind)

    accuracy = accuracy_score(test_dep, predictions)
    return cv_model.best_estimator_,cv_model.best_params_,accuracy

# run the model
parameters= [lr_grid,dt_grid,rf_grid,svm_grid,MLP_grid]
for i in parameters:
    model, hyperparameters,accuracy = cv(i,train_ind,train_dep,test_ind)
    print(model, hyperparameters,accuracy)

LogisticRegression(penalty='l1', solver='liblinear') {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'} 0.9567567567567568
DecisionTreeClassifier(max_depth=15, max_leaf_nodes=15, min_samples_split=3) {'max_depth': 15, 'max_leaf_nodes': 15, 'min_samples_split': 3} 0.9553153153153153
RandomForestClassifier(max_depth=10, min_samples_split=5, n_estimators=150) {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 150} 0.956036036036036
LinearSVC() {'C': 1.0, 'loss': 'squared_hinge', 'penalty': 'l2'} 0.9567567567567568
MLPClassifier(batch_size=10, hidden_layer_sizes=50) {'batch_size': 10, 'hidden_layer_sizes': 50, 'learning_rate_init': 0.001} 0.9567567567567568


### Final model

In [19]:
# define a function
# final model
final_model = LogisticRegression(penalty='l1', solver='liblinear',C=1.0)

# fit model
final_model = final_model.fit(train_ind,train_dep)

# prediction
prediction = final_model.predict(test_ind)
probs = final_model.predict_proba(test_ind)

# set the threshold by top n %
predictions = pd.DataFrame(probs[:,1]).rename(columns={0:'churn_rate'}).sort_values(by=['churn_rate'], ascending= False)

highest_prob = predictions.nlargest(round((0.2*len(predictions))), ['churn_rate'])['churn_rate']
predictions['prediction'] = np.where(predictions['churn_rate'].isin(highest_prob), 1, 0)
nlargest_accuracy = accuracy_score(test_dep,predictions['prediction'] )

# set the threshold by threshold 
predictions['threshold_pred'] = np.where(predictions['churn_rate']>0.1, 1, 0)
threshold_accuracy = accuracy_score(test_dep,predictions['threshold_pred'] )

print(nlargest_accuracy)
print(threshold_accuracy)

print(predictions[predictions['prediction']==1].count())
print(predictions[predictions['threshold_pred']==1].count())


0.7718918918918919
0.9567567567567568
churn_rate        555
prediction        555
threshold_pred    555
dtype: int64
churn_rate        0
prediction        0
threshold_pred    0
dtype: int64


In [21]:
test_dep = test_dep.reset_index()
test_dep[test_dep['Churn']==1].count()

index    120
Churn    120
dtype: int64

In [216]:
# Get the feature names (column names of train_ind)
feature_names = train_ind.columns

# Get the coefficients (only for the features used)
coefficients = final_model.coef_

# Construct the feature coefficients DataFrame
coef_df = pd.DataFrame({
    'Feature': list(train_ind.columns),  # Feature names from train_ind
    'Coefficient': final_model.coef_.flatten()  # Flatten coefficients if necessary
})

# Create a DataFrame for the intercept
intercept_df = pd.DataFrame({
    'Feature': ['Intercept'],  # Add intercept as a feature
    'Coefficient': [final_model.intercept_[0]]  # Intercept value
})

# Concatenate intercept and feature coefficients DataFrames
df = pd.concat([intercept_df, coef_df], ignore_index=True)
df

Unnamed: 0,Feature,Coefficient
0,Intercept,-2.684039
1,Recency,-2.996572
2,Total_Monetary,0.0
3,Average_Monetary,0.0
4,Frequency,0.0
5,used_order,0.0
6,used_initiative,0.0
7,used_bank,0.0
8,used_unknown,0.0
9,medium_cat_electronic,0.482348


## Oversampling

In [None]:
# check classes balance
print(train['Churn'].value_counts())
print(test['Churn'].value_counts())

Churn
0    6134
1     340
Name: count, dtype: int64
Churn
0    2655
1     120
Name: count, dtype: int64


Non churn class(0) is more than teh churn class(1) a lot. This is class imbalance.

In [27]:
# use oversampling method to resample the data
# SMOTE Oversampling
smote = SMOTE(random_state=42)
train_ind_sample, train_dep_sample = smote.fit_resample(train_ind, train_dep)

In [None]:
# check the balance
train_dep_sample.value_counts()

Churn
0    6134
1    6134
Name: count, dtype: int64

In [28]:
# define a funciton to get the hperparameters and define threshold
def cv(grid, train_ind,train_dep,test_ind, threshold=0.5):
    cv_model = GridSearchCV(estimator= grid['model'], param_grid=grid["parameters"], n_jobs=-1, cv=5, scoring='accuracy',error_score=0)
    cv_model = cv_model.fit(train_ind, train_dep)

    # Check if the model supports predict_proba                    SVM has no predict_proba attribute
    if hasattr(cv_model.best_estimator_, "predict_proba"):
        probabilities = cv_model.predict_proba(test_ind)[:, 1]
        predictions = (probabilities >= threshold).astype(int)
    else:
        # If predict_proba is not available, use predict
        predictions = cv_model.predict(test_ind)

    accuracy = accuracy_score(test_dep, predictions)
    return cv_model.best_estimator_,cv_model.best_params_,accuracy

# run the model
parameters= [lr_grid,dt_grid,rf_grid,svm_grid,MLP_grid]
for i in parameters:
    model, hyperparameters,accuracy = cv(i,train_ind_sample,train_dep_sample,test_ind,threshold=0.5)
    print(model, hyperparameters,accuracy)

LogisticRegression(penalty='l1', solver='liblinear') {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'} 0.44828828828828826
DecisionTreeClassifier(max_depth=10, max_leaf_nodes=50) {'max_depth': 10, 'max_leaf_nodes': 50, 'min_samples_split': 2} 0.8854054054054054
RandomForestClassifier(max_depth=20, min_samples_split=3) {'criterion': 'gini', 'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 100} 0.9405405405405406
LinearSVC() {'C': 1.0, 'loss': 'squared_hinge', 'penalty': 'l2'} 0.461981981981982
MLPClassifier(batch_size=10, hidden_layer_sizes=150) {'batch_size': 10, 'hidden_layer_sizes': 150, 'learning_rate_init': 0.001} 0.8212612612612613


Random forest is the best model

In [None]:
final_model = RandomForestClassifier(criterion='gini', max_depth=20,min_samples_split=2,n_estimators=100)

# fit model
final_model = final_model.fit(train_ind_sample,train_dep_sample)

# prediction
prediction = final_model.predict(test_ind)
probs = final_model.predict_proba(test_ind)

# set the threshold by top n %
predictions = pd.DataFrame(probs[:,1]).rename(columns={0:'churn_rate'}).sort_values(by=['churn_rate'], ascending= False)

highest_prob = predictions.nlargest(round((0.2*len(predictions))), ['churn_rate'])['churn_rate']     #consider this method to set the threshold because the cost of retention non-churner is less expensive than the cost of the loss a churner's life time value
predictions['prediction'] = np.where(predictions['churn_rate'].isin(highest_prob), 1, 0)
nlargest_accuracy = accuracy_score(test_dep,predictions['prediction'] )

# set the threshold by threshold 
predictions['threshold_pred'] = np.where(predictions['churn_rate']>0.6, 1, 0)
threshold_accuracy = accuracy_score(test_dep,predictions['threshold_pred'] )

print(nlargest_accuracy)
print(threshold_accuracy)

0.7711711711711712
0.9398198198198198


In [36]:
print(predictions[predictions['prediction']==1].count())
print(predictions[predictions['threshold_pred']==1].count())

churn_rate        557
prediction        557
threshold_pred    557
dtype: int64
churn_rate        49
prediction        49
threshold_pred    49
dtype: int64
