In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle

warnings.filterwarnings("ignore")

In [46]:
df = pd.read_csv("/home/muhd_bravo/Projects/churnmodel/data/Telco_Customer_Churn.csv")


In [47]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [48]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(0)

In [49]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

string_columns = list(df.dtypes[df.dtypes == 'object'].index)

for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [50]:
df.churn = (df.churn == 'yes').astype(int)

In [51]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [52]:
#Undersample the No Churn data
no_churn = len(df[df['churn'] == 1])
non_churn_indices = df[df.churn == 0].index
random_indices = np.random.choice(non_churn_indices,no_churn, replace=False)
churn_indices = df[df.churn == 1].index
under_sample_indices = np.concatenate([churn_indices,random_indices])
sample_df = df.loc[under_sample_indices]

In [53]:
sample_df.shape

(3738, 21)

In [54]:
sample_df['churn'].value_counts()

0    1869
1    1869
Name: churn, dtype: int64

In [55]:
categorical = ['internetservice', 'onlinebackup', 'onlinesecurity', 'contract',
       'paperlessbilling', 'paymentmethod']

numerical = ['tenure', 'monthlycharges', 'totalcharges', 'churn']

In [56]:
best_sample_df = sample_df[categorical+numerical]

In [57]:
best_sample_df.head()

Unnamed: 0,internetservice,onlinebackup,onlinesecurity,contract,paperlessbilling,paymentmethod,tenure,monthlycharges,totalcharges,churn
2,dsl,yes,yes,month-to-month,yes,mailed_check,2,53.85,108.15,1
4,fiber_optic,no,no,month-to-month,yes,electronic_check,2,70.7,151.65,1
5,fiber_optic,no,no,month-to-month,yes,electronic_check,8,99.65,820.5,1
8,fiber_optic,no,no,month-to-month,yes,electronic_check,28,104.8,3046.05,1
13,fiber_optic,yes,no,month-to-month,yes,bank_transfer_(automatic),49,103.7,5036.3,1


In [128]:
apo = list(best_sample_df['monthlycharges'].unique())
min(apo)

18.25

In [58]:
X = best_sample_df.drop(columns=['churn'])
y = best_sample_df['churn']

In [59]:
from sklearn.feature_extraction import DictVectorizer
X_dict = X.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(X_dict)
X_encoded = dv.transform(X_dict)

In [60]:
dv.get_feature_names()

['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'internetservice=dsl',
 'internetservice=fiber_optic',
 'internetservice=no',
 'monthlycharges',
 'onlinebackup=no',
 'onlinebackup=no_internet_service',
 'onlinebackup=yes',
 'onlinesecurity=no',
 'onlinesecurity=no_internet_service',
 'onlinesecurity=yes',
 'paperlessbilling=no',
 'paperlessbilling=yes',
 'paymentmethod=bank_transfer_(automatic)',
 'paymentmethod=credit_card_(automatic)',
 'paymentmethod=electronic_check',
 'paymentmethod=mailed_check',
 'tenure',
 'totalcharges']

In [61]:
X_dict[0]

{'internetservice': 'dsl',
 'onlinebackup': 'yes',
 'onlinesecurity': 'yes',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'mailed_check',
 'tenure': 2,
 'monthlycharges': 53.85,
 'totalcharges': 108.15}

In [62]:
## Train Test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=101)


In [63]:
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(n_estimators=1000, learning_rate=0.1)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)

In [64]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, classification_report, roc_curve, plot_roc_curve, auc, precision_recall_curve, plot_precision_recall_curve, average_precision_score

accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
f1score = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
class_rep = classification_report(y_test, y_pred)


print ('Accuracy score on Test is ', accuracy)
print ('AUC Score is', auc)
print ('F1 Score is ', f1score)
print ('Confusion Matrix\n\n', cm)
print('Classification Report is\n', class_rep ) 

Accuracy score on Test is  0.7762923351158645
AUC Score is 0.7759189354076108
F1 Score is  0.7917012448132781
Confusion Matrix

 [[394 164]
 [ 87 477]]
Classification Report is
               precision    recall  f1-score   support

           0       0.82      0.71      0.76       558
           1       0.74      0.85      0.79       564

    accuracy                           0.78      1122
   macro avg       0.78      0.78      0.78      1122
weighted avg       0.78      0.78      0.78      1122



In [89]:
with open('/home/muhd_bravo/Projects/churnmodel/model/churn-model.bin', 'wb') as f_out:
    pickle.dump((dv, model), f_out)

In [90]:
with open('/home/muhd_bravo/Projects/churnmodel/model/churn-model.bin', 'rb') as f_in: #A
    dv, model = pickle.load(f_in) #B

In [106]:
def predict_single(customer, dv, model):
    X = dv.transform(customer)
    y_pred = model.predict(X)[0]
    y_pred_prob = round(model.predict_proba(X)[:, 1][0],3)
    if y_pred_prob >= 0.5:
        churn = 'Churn'
        return churn, y_pred_prob
    else:
        not_churn = 'Not churn'
        return not_churn, y_pred_prob
    # return y_pred, y_pred_prob[0]

In [92]:
model_cols = ['internetservice', 'onlinebackup', 'onlinesecurity', 'contract',
       'paperlessbilling', 'paymentmethod', 'tenure', 'monthlycharges', 'totalcharges']
with open('/home/muhd_bravo/Projects/churnmodel/model/model_cols.bin', 'wb') as f_out:
    pickle.dump(model_cols, f_out)

In [107]:
customer = ["dsl", "yes", "yes", "month-to-month","yes"	"mailed_check",	2, 53.85,108.15]

In [94]:
def input_to_df(input_list):
    with open('/home/muhd_bravo/Projects/churnmodel/model/model_cols.bin', 'rb') as f_in: #A
        cols = pickle.load(f_in) #B
    map_df = dict(zip(cols,input_list))
    input_df = pd.DataFrame([map_df])
    return input_df

In [108]:
cust = input_to_df(customer)
gh = cust.to_dict(orient='records')
predict_single(gh, dv, model)

('Churn', 0.5)