In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Two-colour palette (hex codes) passed to the Churn pie/bar charts and installed as the seaborn palette further down.
col_scheme = ["#4374B3", "#FF0B04"]

In [None]:
# Load the customer-churn table straight from GitHub.
# The raw.githubusercontent.com address serves the CSV file itself; the
# github.com/.../blob/... address returns the HTML viewer page, which
# read_csv cannot parse into this table.
data = pd.read_csv('https://raw.githubusercontent.com/dsrscientist/DSData/master/Telecom_customer_churn.csv')
data.head()

In [None]:
print('We have %d rows (observations) of data with %d columns (features).\n' % (data.shape[0], data.shape[1]))
dups = data[data.duplicated()]
print("Number of duplicate rows: %d\n" % len(dups))

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data['Churn'].value_counts()

In [None]:
# Churn class balance: pie chart on the left, bar chart on the right.
# Values AND labels are taken from one value_counts() result so every wedge/bar
# carries its own label. unique() lists values in order of first appearance,
# which need not match the frequency-sorted order of value_counts().
churn_counts = data['Churn'].value_counts()

fig, (ax1, ax2) = plt.subplots(1,2)
fig.set_figheight(6)
fig.set_figwidth(12)
fig.set_facecolor('white')

# explode has two entries, i.e. this chart expects exactly two Churn classes.
ax1.pie(churn_counts, explode=(0,0.1), autopct='%1.1f%%',
        shadow=True, startangle=90, labels=churn_counts.index, colors=col_scheme)

ax2.bar(churn_counts.index, churn_counts.values, color=col_scheme)

# Absolute counts printed under the pie, in axes coordinates.
ax1.text(0.5, 0.0, "Number of customers who stayed = %d" % churn_counts.get('No', 0), size=12, ha="center",
         transform=ax1.transAxes)
ax1.text(0.5,-0.1, "Number of customers who left = %d" % churn_counts.get('Yes', 0), size=12, ha="center",
         transform=ax1.transAxes)

plt.suptitle('Churn')
plt.show()

In [None]:
# Histogram of customer tenure with its mean and median drawn as vertical lines.
tenure_values = data['tenure']
ten_mean = np.mean(tenure_values)
ten_med = np.median(tenure_values)

plt.figure(figsize=(9, 6))
plt.hist(tenure_values, bins=25, alpha=0.65, color='green')
plt.title("tenure", fontsize=15)
plt.xlabel("Months", fontsize=15)
plt.ylabel("Number of customers", fontsize=15)
plt.xticks(np.arange(0, 75, step=6))
# Mean first, median second, so the legend keeps that order.
for label_template, stat_value, line_colour in (("Mean %.2f", ten_mean, 'red'),
                                                ("Median %.2f", ten_med, 'blue')):
    plt.axvline(stat_value, color=line_colour, linestyle='solid', linewidth=2,
                label=label_template % stat_value)
plt.legend(fontsize=12)
plt.show()


In [None]:
print("Percentage of customers who left before 6 months? : %.2f%%" % ((len(data[data['tenure'] < 6])) / len(data['tenure']) * 100))

In [None]:
under_6 = data[data['tenure'] < 6]

In [None]:
# Contract type vs. churn for the customers with under 6 months of tenure.
sns.set_palette(sns.color_palette(col_scheme))
plt.figure(figsize=(9,6))
# The title names the column that is actually plotted (Contract) and the subset shown.
sns.countplot(x=under_6['Contract'], hue=under_6['Churn']).set_title('Churn vs. Contract (tenure < 6 months)')
plt.show()

In [None]:
# Histogram of the monthly bill with its mean and median drawn as vertical lines.
monthly_values = data['MonthlyCharges']
mc_mean = np.mean(monthly_values)
mc_med = np.median(monthly_values)

plt.figure(figsize=(10, 6))
plt.hist(monthly_values, bins=25, alpha=0.65, color='green')
plt.title("MonthlyCharges", fontsize=15)
plt.xlabel("Monthly Bill Amount", fontsize=15)
plt.ylabel("Number of customers", fontsize=15)
plt.xticks(np.arange(20, 125, step=5))
# Mean first, median second, so the legend keeps that order.
for label_template, stat_value, line_colour in (("Mean %.2f", mc_mean, 'red'),
                                                ("Median %.2f", mc_med, 'blue')):
    plt.axvline(stat_value, color=line_colour, linestyle='solid', linewidth=2,
                label=label_template % stat_value)
plt.legend(fontsize=12)
plt.show()


In [None]:
# Share of customers whose monthly bill is below 25.01, i.e. "$25 or less".
# The closing parentheses of the percentage expression and of print() are restored.
print("Percentage of customers with a Monthly Bill of $25 or less? %.2f%%" % ((len(data[data['MonthlyCharges'] < 25.01])) / len(data['MonthlyCharges']) * 100))

In [None]:
# Convert data in 'TotalCharges' from string to float.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'],errors='coerce')

In [None]:
print("How many missing values in 'TotalCharges' after coercion? : %d" % data['TotalCharges'].isnull().sum())
print("What percentage of the data are these missing values? : %0.2f%%" % ((data['TotalCharges'].isnull().sum()) / (len(data['TotalCharges'])) * 100))

In [None]:
data2 = data[data["TotalCharges"].notnull()] 
data2 = data2.reset_index(drop=True)

In [None]:
# Histogram of the total amount charged with its mean and median drawn as vertical lines.
total_values = data2['TotalCharges']
tc_mean = np.mean(total_values)
tc_med = np.median(total_values)

plt.figure(figsize=(18, 6))
plt.hist(total_values, bins=25, alpha=0.65, color='green')
plt.title('TotalCharges', fontsize=15)
plt.xlabel("Total Amount Charged", fontsize=15)
plt.ylabel("Number of customers", fontsize=15)
plt.xticks(np.arange(0, 8800, 300))
# Mean first, median second, so the legend keeps that order.
for label_template, stat_value, line_colour in (("Mean %.2f", tc_mean, 'red'),
                                                ("Median %.2f", tc_med, 'blue')):
    plt.axvline(stat_value, color=line_colour, linestyle='solid', linewidth=2,
                label=label_template % stat_value)
plt.legend(fontsize=12)
plt.show()


In [None]:
# Share of customers whose TotalCharges is below $300, as a percentage.
# The line had been cut off after "* 10"; the factor 100 and both closing parentheses are restored.
print("Percentage of customers with TotalCharges less than $300? %.2f%%" % ((len(data2[data2['TotalCharges'] < 300])) / len(data2['TotalCharges']) * 100))

In [None]:
no_www = data2[data2.InternetService == 'No']

In [None]:
print(no_www['OnlineSecurity'].unique())
print(no_www['OnlineBackup'].unique())
print(no_www['DeviceProtection'].unique())
print(no_www['TechSupport'].unique())
print(no_www['StreamingTV'].unique())
print(no_www['StreamingMovies'].unique())

In [None]:
binaries = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

In [None]:
for i in binaries:
    data2[i] = data2[i].replace({'No internet service' : 'No'})

In [None]:
data2["SeniorCitizen"] = data2["SeniorCitizen"].replace({1:"Yes",0:"No"})

In [None]:
# Churn split for twelve categorical columns, one panel each on a 3x4 grid.
sns.set_palette(sns.color_palette(col_scheme))
fig, ax = plt.subplots(3, 4, figsize=(25, 18))

grid_columns = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
]
# ax.flat walks the grid row by row, so the panels fill left-to-right, top-to-bottom.
for panel, column in zip(ax.flat, grid_columns):
    sns.countplot(x=data2[column], hue=data2['Churn'], ax=panel)
    panel.set_title('Churn vs. ' + column)
plt.show()

In [None]:
# Churn split for three further categorical columns, side by side.
fig, ax = plt.subplots(1, 3, figsize=(18, 6))

for panel, column in zip(ax, ('StreamingMovies', 'Contract', 'PaperlessBilling')):
    sns.countplot(x=data2[column], hue=data2['Churn'], ax=panel)
    panel.set_title('Churn vs. ' + column)
plt.show()

In [None]:
plt.figure(figsize=(15,7))
sns.countplot(x=data2['PaymentMethod'], hue=data2['Churn']).set_title('Churn vs. PaymentMethod')
plt.show()


In [None]:
pay_meth = pd.DataFrame(data2.groupby(['PaymentMethod', 'Churn'])['Churn'].count())
pay_meth

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

In [None]:
# Remove the customerID column (presumably a per-customer identifier, not a feature).
# Reassigning instead of using inplace=True, and ignoring an already-missing
# column, keeps this cell safe to run more than once.
data2 = data2.drop(columns='customerID', errors='ignore')

In [None]:
cont_feats = ['tenure', 'MonthlyCharges', 'TotalCharges']
binary_feats = [i for i in data2.columns[:-1] if len(data2[i].unique()) == 2]
binary_feats.remove('gender')
cat_feats = [i for i in data2.columns[:-1] if i not in cont_feats + binary_feats]

In [None]:
data2[binary_feats] = data2[binary_feats].replace(to_replace=['Yes', 'No'], value=[1, 0])

In [None]:
final_dat = pd.get_dummies(data2, columns=cat_feats)

In [None]:
train_set, test_set = train_test_split(final_dat, test_size=0.15, random_state=2021)

In [None]:
feats = [i for i in final_dat if i != 'Churn']

In [None]:
data_tr = train_set[feats].copy()
train_labels = train_set['Churn'].copy()
data_te = test_set[feats].copy()
test_labels = test_set['Churn'].copy()

In [None]:
col_trans = ColumnTransformer([
    ('num', StandardScaler(),  cont_feats)], remainder='passthrough')

train_prep = col_trans.fit_transform(data_tr)

In [None]:
test_prep = col_trans.transform(data_te)

In [None]:
new_order = [i for i in data_tr.columns if i not in cont_feats]
new_order = cont_feats+new_order

In [None]:
train_prep.shape

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from numpy import random

In [None]:
# Search space for the random forest.
# The n_estimators / max_features candidates are drawn from a seeded generator so
# that a fresh "Restart & Run All" yields the same grid every time; .tolist()
# stores them as plain Python ints.
rng = random.RandomState(2021)
param_grid = {
    'n_estimators': rng.randint(100, 600, 3).tolist(),
    'max_features': rng.randint(10, 29, 3).tolist(),
    'max_depth': [3, 4, 5],
    'bootstrap': [True, False],
}

In [None]:
# Randomised hyper-parameter search for the random forest: 3-fold CV, scored on accuracy.
# random_state on the search fixes which parameter combinations get sampled, so the
# selected model is repeatable across runs.
rf_clf = RandomForestClassifier(random_state=2021)
rf_grid = RandomizedSearchCV(rf_clf, param_grid, cv=3, scoring='accuracy',
                             return_train_score=True, random_state=2021)
rf_grid.fit(train_prep, train_labels)

In [None]:
# Per-candidate results of the random-forest search.
# The search is scored on accuracy, while the columns are labelled as errors, so
# the stored scores are converted to error rates (1 - accuracy).
# "Test_Error" is measured on the held-out CV folds, not on test_set.
cvres = rf_grid.cv_results_
grid_rf_df = pd.DataFrame({
    "Train_Error": 1 - cvres["mean_train_score"],
    "Test_Error": 1 - cvres["mean_test_score"],
    "Parameter_Set": cvres["params"],
})
# Show the full parameter dictionaries instead of truncating them.
pd.set_option('display.max_colwidth', None)
grid_rf_df.tail(10)

In [None]:
final_model = rf_grid.best_estimator_
rf_preds = final_model.predict(test_prep)
rfmod_pred_score = accuracy_score(test_labels, rf_preds)
rfmod_recall = recall_score(test_labels, rf_preds, pos_label='Yes')

In [None]:
mat = confusion_matrix(test_labels, rf_preds)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap='RdBu')
plt.xlabel('Actual Churn', size=15)
plt.ylabel('Predicted Churn', size=15)
plt.show()
print('Total predictions = %d' % len(test_labels))

In [None]:
print('Train error rate = %.2f%%' % ((1 - rf_grid.best_score_) * 100))
print('Test error rate = %.2f%%' % ((1 - rfmod_pred_score) * 100))
print('Test Recall = %.2f%%' % (rfmod_recall * 100))

In [None]:
print(classification_report(test_labels, rf_preds))

In [None]:
df = pd.DataFrame(train_prep, columns = new_order)

In [None]:
rf_importances = pd.DataFrame(rf_grid.best_estimator_.feature_importances_,
                                   index = df.columns,
                                    columns=['Importance']).sort_values('Importance', ascending=False)

rf_importances[:10]

In [None]:
top_5 = rf_importances[rf_importances['Importance'] > .05]

In [None]:
# Bar chart of the random-forest features above the importance threshold.
plt.figure(figsize=(13,8))
# x and y are passed by keyword: seaborn 0.12+ rejects them as positional arguments.
ax = sns.barplot(x=top_5.index, y=top_5['Importance'])
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=12)
plt.show()

In [None]:
# Search space for logistic regression: one dictionary per penalty/solver pairing.
# The max_iter candidates are drawn from a seeded generator so that a fresh
# "Restart & Run All" yields the same grid every time.
# NOTE(review): the string 'none' as a penalty value is version-dependent in
# scikit-learn — confirm against the installed release.
rng = random.RandomState(2021)
param_grid = [
    {'max_iter': rng.randint(100, 600, 3).tolist(), 'C': [.1,.5,1], 'penalty': ['l2'], 'solver': ['lbfgs'], 'tol': [.01, .001, .0001]},
    {'max_iter': rng.randint(100, 600, 3).tolist(), 'penalty': ['none'], 'tol': [.01, .001, .0001]},
    {'max_iter': rng.randint(100, 600, 3).tolist(), 'C': [.1,.5,1], 'penalty': ['l1'], 'solver': ['liblinear'], 'tol': [.01, .001, .0001]},
    {'max_iter': rng.randint(100, 600, 3).tolist(), 'C': [.1,.5,1], 'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio':[.25, .5, .75], 'tol': [.01, .001, .0001]}
]

In [None]:
# Randomised hyper-parameter search for logistic regression: 3-fold CV, scored on accuracy.
# random_state is set on the estimator and on the search so that both the solver
# and the sampled parameter combinations are repeatable.
log_clf = LogisticRegression(random_state=2021)
log_grid = RandomizedSearchCV(log_clf, param_grid, cv=3, scoring='accuracy',
                              return_train_score=True, random_state=2021)
log_grid.fit(train_prep, train_labels)

In [None]:
# Per-candidate results of the logistic-regression search.
# The search is scored on accuracy, while the columns are labelled as errors, so
# the stored scores are converted to error rates (1 - accuracy).
# "Test_Error" is measured on the held-out CV folds, not on test_set.
cvres = log_grid.cv_results_
grid_log_df = pd.DataFrame({
    "Train_Error": 1 - cvres["mean_train_score"],
    "Test_Error": 1 - cvres["mean_test_score"],
    "Parameter_Set": cvres["params"],
})
grid_log_df.tail(10)

In [None]:
final_model = log_grid.best_estimator_
log_preds = final_model.predict(test_prep)
logmod_pred_score = accuracy_score(test_labels, log_preds)
logmod_recall = recall_score(test_labels, log_preds, pos_label='Yes')

In [None]:
mat = confusion_matrix(test_labels, log_preds)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap='RdBu')
plt.xlabel('Actual Churn', size=15)
plt.ylabel('Predicted Churn', size=15)
plt.show()
print('Total predictions = %d' % len(test_labels))

In [None]:
print('Train error rate = %.2f%%' % ((1 - log_grid.best_score_) * 100))
print('Test error rate = %.2f%%' % ((1 - logmod_pred_score) * 100))
print('Test Recall = %.2f%%' % (logmod_recall * 100))

In [None]:
print(classification_report(test_labels, log_preds))

In [None]:
df = pd.DataFrame(log_grid.best_estimator_.coef_, columns = new_order).transpose()

In [None]:
df['square'] = df[0] ** 2
df['sqrt'] = [np.sqrt(i) for i in df['square']]
df = df.sort_values('sqrt', ascending=False)
df[:10]

In [None]:
top5 = df[df['sqrt'] > 1.1]

In [None]:
# Bar chart of the logistic-regression features with the largest coefficient magnitude.
plt.figure(figsize=(13,8))
# x and y are passed by keyword: seaborn 0.12+ rejects them as positional arguments.
ax = sns.barplot(x=top5.index, y=top5['sqrt'])
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=12)
plt.show()

In [None]:
# Search space for the support vector classifier.
# The first two dictionaries leave `kernel` at the estimator default; only the
# third one requests a linear kernel.
# The max_iter candidates are drawn from a seeded generator so that a fresh
# "Restart & Run All" yields the same grid every time.
rng = random.RandomState(2021)
param_grid = [
    {'C': [.1,1,10], 'max_iter': rng.randint(1000, 3000, 3).tolist(), 'tol': [1e-4, 1e-10], 'gamma' : [3,5,7]},
    {'C': [.1,1,10], 'max_iter': [-1], 'tol': [1e-4, 1e-10]},
    {'C': [.1,1,10], 'max_iter': rng.randint(1000, 3000, 3).tolist(), 'kernel': ['linear']}
]

In [None]:
# Randomised hyper-parameter search for the SVM: 15 sampled candidates, 3-fold CV,
# scored on accuracy. random_state on the search fixes which parameter
# combinations get sampled, so the selected model is repeatable across runs.
svm_clf = SVC(random_state=2021)
svm_grid = RandomizedSearchCV(svm_clf, param_grid, cv=3,
                              scoring='accuracy',
                              return_train_score=True, n_iter=15,
                              random_state=2021)
svm_grid.fit(train_prep, train_labels)

In [None]:
# Per-candidate results of the SVM search.
# The search is scored on accuracy, while the columns are labelled as errors, so
# the stored scores are converted to error rates (1 - accuracy).
# "Test_Error" is measured on the held-out CV folds, not on test_set.
cvres = svm_grid.cv_results_
grid_svc_df = pd.DataFrame({
    "Train_Error": 1 - cvres["mean_train_score"],
    "Test_Error": 1 - cvres["mean_test_score"],
    "Parameter_Set": cvres["params"],
})
grid_svc_df

In [None]:
final_model = svm_grid.best_estimator_
svm_preds = final_model.predict(test_prep)
svmmod_pred_score = accuracy_score(test_labels, svm_preds)
svmmod_recall = recall_score(test_labels, svm_preds, pos_label='Yes')

In [None]:
mat = confusion_matrix(test_labels, svm_preds)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap='RdBu')
plt.xlabel('Actual Churn', size=15)
plt.ylabel('Predicted Churn', size=15)
plt.show()
print('Total predictions = %d' % len(test_labels))

In [None]:
print('Train error rate = %.2f%%' % ((1 - svm_grid.best_score_) * 100))
print('Test error rate = %.2f%%' % ((1 - svmmod_pred_score) * 100))
print('Test Recall = %.2f%%' % (svmmod_recall * 100))

In [None]:
print(classification_report(test_labels, svm_preds))

In [None]:
df = pd.DataFrame(svm_grid.best_estimator_.coef_, columns = new_order).transpose()

In [None]:
df['square'] = df[0] ** 2
df['sqrt'] = [np.sqrt(i) for i in df['square']]
df = df.sort_values('sqrt', ascending=False)
df[:10]

In [None]:
top5 = df[df['sqrt'] > 0.25]

In [None]:
# Bar chart of the SVM features with the largest coefficient magnitude.
plt.figure(figsize=(13,8))
# x and y are passed by keyword: seaborn 0.12+ rejects them as positional arguments.
ax = sns.barplot(x=top5.index, y=top5['sqrt'])
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=12)
plt.show()

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Resample the training data with SMOTE; test_prep / test_labels are not touched here.
# fit_resample is the current method name; the older fit_sample alias no longer
# exists in recent imbalanced-learn releases.
smt = SMOTE(random_state=2021)
x_smt, y_smt = smt.fit_resample(train_prep, train_labels)

In [None]:
# Random-forest search space for the run on the SMOTE-resampled data.
# The n_estimators / max_features candidates are drawn from a seeded generator so
# that a fresh "Restart & Run All" yields the same grid every time; .tolist()
# stores them as plain Python ints.
rng = random.RandomState(2021)
param_grid = {
    'n_estimators': rng.randint(100, 600, 3).tolist(),
    'max_features': rng.randint(10, 29, 3).tolist(),
    'max_depth': [3, 4, 5],
    'bootstrap': [True, False],
}

In [None]:
# Random-forest search refitted on the SMOTE-resampled training data.
# random_state on the search fixes which parameter combinations get sampled.
# NOTE(review): the CV folds are cut from data that was already oversampled, so
# best_score_ may be optimistic — consider resampling inside each fold instead.
rf_clf = RandomForestClassifier(random_state=2021)
rf_grid = RandomizedSearchCV(rf_clf, param_grid, cv=3, scoring='accuracy',
                             random_state=2021)
rf_grid.fit(x_smt, y_smt)

In [None]:
# Score the best estimator of rf_grid on the held-out test set.
final_model = rf_grid.best_estimator_
rf_preds = final_model.predict(test_prep)
rfmod_pred_score, rfmod_recall = (
    accuracy_score(test_labels, rf_preds),
    recall_score(test_labels, rf_preds, pos_label='Yes'),
)

In [None]:
print('Train error rate = %.2f%%' % ((1 - rf_grid.best_score_) * 100))
print('Test error rate = %.2f%%' % ((1 - rfmod_pred_score) * 100))
print('Test Recall = %.2f%%' % (rfmod_recall * 100))

In [None]:
print(classification_report(test_labels, rf_preds))

In [None]:
# Logistic-regression search space for the run on the SMOTE-resampled data:
# one dictionary per penalty/solver pairing.
# The max_iter candidates are drawn from a seeded generator so that a fresh
# "Restart & Run All" yields the same grid every time.
# NOTE(review): the string 'none' as a penalty value is version-dependent in
# scikit-learn — confirm against the installed release.
rng = random.RandomState(2021)
param_grid = [
    {'max_iter': rng.randint(100, 600, 3).tolist(), 'C': [.1,.5,1], 'penalty': ['l2'], 'solver': ['lbfgs'], 'tol': [.01, .001, .0001]},
    {'max_iter': rng.randint(100, 600, 3).tolist(), 'penalty': ['none'], 'tol': [.01, .001, .0001]},
    {'max_iter': rng.randint(100, 600, 3).tolist(), 'C': [.1,.5,1], 'penalty': ['l1'], 'solver': ['liblinear'], 'tol': [.01, .001, .0001]},
    {'max_iter': rng.randint(100, 600, 3).tolist(), 'C': [.1,.5,1], 'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio':[.25, .5, .75], 'tol': [.01, .001, .0001]}
]

In [None]:
# Logistic-regression search refitted on the SMOTE-resampled training data.
# random_state is set on the estimator and on the search so that both the solver
# and the sampled parameter combinations are repeatable.
# NOTE(review): the CV folds are cut from data that was already oversampled, so
# best_score_ may be optimistic — consider resampling inside each fold instead.
log_clf = LogisticRegression(random_state=2021)
log_grid = RandomizedSearchCV(log_clf, param_grid, cv=3, scoring='accuracy',
                              random_state=2021)
log_grid.fit(x_smt, y_smt)

In [None]:
# Score the best estimator of log_grid on the held-out test set.
final_model = log_grid.best_estimator_
log_preds = final_model.predict(test_prep)
logmod_pred_score, logmod_recall = (
    accuracy_score(test_labels, log_preds),
    recall_score(test_labels, log_preds, pos_label='Yes'),
)

In [None]:
print('Train error rate = %.2f%%' % ((1 - log_grid.best_score_) * 100))
print('Test error rate = %.2f%%' % ((1 - logmod_pred_score) * 100))
print('Test Recall = %.2f%%' % (logmod_recall * 100))

In [None]:
print(classification_report(test_labels, log_preds))

In [None]:
# SVM search space for the run on the SMOTE-resampled data.
# The first two dictionaries leave `kernel` at the estimator default; only the
# third one requests a linear kernel.
# The max_iter candidates are drawn from a seeded generator so that a fresh
# "Restart & Run All" yields the same grid every time.
rng = random.RandomState(2021)
param_grid = [
    {'C': [.1,1,10], 'max_iter': rng.randint(1000, 3000, 3).tolist(), 'tol': [1e-4, 1e-10], 'gamma' : [3,5,7]},
    {'C': [.1,1,10], 'max_iter': [-1], 'tol': [1e-4, 1e-10]},
    {'C': [.1,1,10], 'max_iter': rng.randint(1000, 3000, 3).tolist(), 'kernel': ['linear']}
]

In [None]:
# SVM search (15 sampled candidates, 3-fold CV) refitted on the SMOTE-resampled
# training data. random_state on the search fixes which parameter combinations
# get sampled.
# NOTE(review): the CV folds are cut from data that was already oversampled, so
# best_score_ may be optimistic — consider resampling inside each fold instead.
svm_clf = SVC(random_state=2021)
svm_grid = RandomizedSearchCV(svm_clf, param_grid, cv=3,
                              scoring='accuracy',
                              n_iter=15, random_state=2021)
svm_grid.fit(x_smt, y_smt)

In [None]:
# Score the best estimator of svm_grid on the held-out test set.
final_model = svm_grid.best_estimator_
svm_preds = final_model.predict(test_prep)
svmmod_pred_score, svmmod_recall = (
    accuracy_score(test_labels, svm_preds),
    recall_score(test_labels, svm_preds, pos_label='Yes'),
)

In [None]:
print('Train error rate = %.2f%%' % ((1 - svm_grid.best_score_) * 100))
print('Test error rate = %.2f%%' % ((1 - svmmod_pred_score) * 100))
print('Test Recall = %.2f%%' % (svmmod_recall * 100))

In [None]:
print(classification_report(test_labels, svm_preds))

In [None]:
all_recall = [rfmod_recall, logmod_recall, svmmod_recall]

In [None]:
all_acc = [rfmod_pred_score, logmod_pred_score, svmmod_pred_score]

In [None]:
df = pd.DataFrame(list(zip(all_recall, all_acc)), columns =['Recall', 'Accuracy'])

In [None]:
df['Model'] = ['Random Forest', 'Logistic Regresssion', 'Support Vector Machine']

In [None]:
df = df[['Model', 'Recall', 'Accuracy']].reset_index(drop=True)

In [None]:
# Grouped bar chart comparing accuracy and recall of the three models.
# Newer matplotlib releases ship the seaborn style sheet as 'seaborn-v0_8' and no
# longer accept the bare 'seaborn' name; use whichever this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
ax = df.plot(x='Model', y=['Accuracy', 'Recall'], kind="bar",
             width=.4, ylim=(0,.99), figsize=(9,7), title='Scores')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=12)
plt.yticks(np.arange(0, 1, step=0.1))
ax.set_xlabel("")
plt.legend(prop={'size': 12}, loc=1)
plt.show()

df