In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
!pip install scikit-learn==1.2.2 --user
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE





In [5]:
# Read the customer churn CSV into a DataFrame and display it.
csv_path = 'files_for_lab/customer_churn.csv'
churn = pd.read_csv(csv_path)
churn

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [6]:
# customerID and gender are considered irrelevant for this analysis,
# so they are removed before anything else happens.
irrelevant_columns = ['customerID', 'gender']
churn = churn.drop(irrelevant_columns, axis=1)

In [7]:
# Print the value distribution of every column. Based on the result, all
# Yes/No values and the 0/1 values of SeniorCitizen will be replaced with
# the appropriate True/False.
for column_name, column_values in churn.items():
    print(column_values.value_counts())

SeniorCitizen
0    5901
1    1142
Name: count, dtype: int64
Partner
No     3641
Yes    3402
Name: count, dtype: int64
Dependents
No     4933
Yes    2110
Name: count, dtype: int64
tenure
1     613
72    362
2     238
3     200
4     176
     ... 
28     57
39     56
44     51
36     50
0      11
Name: count, Length: 73, dtype: int64
PhoneService
Yes    6361
No      682
Name: count, dtype: int64
MultipleLines
No                  3390
Yes                 2971
No phone service     682
Name: count, dtype: int64
InternetService
Fiber optic    3096
DSL            2421
No             1526
Name: count, dtype: int64
OnlineSecurity
No                     3498
Yes                    2019
No internet service    1526
Name: count, dtype: int64
OnlineBackup
No                     3088
Yes                    2429
No internet service    1526
Name: count, dtype: int64
DeviceProtection
No                     3095
Yes                    2422
No internet service    1526
Name: count, dtype: int64
TechSupport

In [8]:
# Replace all appropriate values with True/False.
# "No phone service" / "No internet service" are mapped to False, i.e. they
# are treated the same as "No".
churn = churn.replace({'Yes':True, 'No':False, 'No phone service':False, 'No internet service':False})
# Make the bool dtype explicit instead of relying on replace() implicitly
# downcasting object columns (deprecated in recent pandas releases). Columns
# that still contain strings are left as they are.
churn = churn.infer_objects()
# SeniorCitizen only holds 0/1 (see the value counts above), so a plain cast
# yields the intended False/True flag.
churn['SeniorCitizen'] = churn['SeniorCitizen'].astype(bool)

In [9]:
# Collapse the payment method into a single flag: True for the two automatic
# methods, False for the two check-based ones.
# infer_objects() makes the resulting bool dtype explicit instead of relying
# on replace() implicitly downcasting the object column (deprecated in recent
# pandas releases).
churn['PaymentMethod'] = churn['PaymentMethod'].replace({
    'Electronic check': False,
    'Mailed check': False,
    'Bank transfer (automatic)': True,
    "Credit card (automatic)": True,
}).infer_objects()

In [10]:
# Features used by the model, and the churn flag as the target.
X = churn.loc[:, ['tenure', 'SeniorCitizen', 'MonthlyCharges']]
y = churn.loc[:, 'Churn']


In [11]:
# Hold out a test split; test_size=0.25 is scikit-learn's default, written out
# here for the reader. The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [12]:
# Standardise the numeric features; SeniorCitizen is a flag and is passed
# through unscaled. The scaler is fitted on the training split only and then
# applied to both splits.
X_train_num = X_train[['tenure','MonthlyCharges']]
X_train_cat = X_train['SeniorCitizen']
X_test_num = X_test[['tenure','MonthlyCharges']]
X_test_cat = X_test['SeniorCitizen']
transformer = StandardScaler().fit(X_train_num)

# transform() returns a bare array, so rebuild the frames on the ORIGINAL row
# index. That keeps X_train/X_test label-aligned with y_train/y_test and lets
# concat line the columns up without any reset_index / inplace mutation.
X_train_scaled = pd.DataFrame(
    transformer.transform(X_train_num),
    columns=X_train_num.columns,
    index=X_train_num.index,
)
X_test_scaled = pd.DataFrame(
    transformer.transform(X_test_num),
    columns=X_test_num.columns,
    index=X_test_num.index,
)

# Resulting column order: tenure, MonthlyCharges, SeniorCitizen.
X_train = pd.concat([X_train_scaled, X_train_cat], axis=1)
X_test = pd.concat([X_test_scaled, X_test_cat], axis=1)


In [13]:
# Baseline: logistic regression fitted on the training split as-is.
LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train, y_train)

# Accuracy comes straight from the estimator; the other metrics are computed
# from the test-set predictions.
print("accuracy: ", LR.score(X_test, y_test))

pred = LR.predict(X_test)

for metric_label, metric_fn in (
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(metric_label, metric_fn(y_test, pred))

accuracy:  0.7808063600227144
precision:  0.6115942028985507
recall:  0.4557235421166307
f1:  0.5222772277227723


In [None]:
# As a telecom company, we mainly care about catching as many churning customers as possible, so the goal is a high recall.
# To get there, oversample the churned customers in the training data until they carry the same weight as the loyal customers:

In [25]:
# Oversample the training split with SMOTE, then fit a second logistic
# regression on the resampled data. The test split is left untouched.
# NOTE(review): SMOTE also synthesises values for the binary SeniorCitizen
# column; SMOTENC might be the better fit here - confirm.
sm = SMOTE(random_state=50, k_neighbors=3)
resampled = sm.fit_resample(X_train, y_train)
X_train_SMOTE, y_train_SMOTE = resampled

LR_SMOTE = LogisticRegression(random_state=30, solver='lbfgs')
LR_SMOTE.fit(X_train_SMOTE, y_train_SMOTE)
print("accuracy: ", LR_SMOTE.score(X_test, y_test))

pred_SMOTE = LR_SMOTE.predict(X_test)

smote_scores = {
    "precision: ": precision_score(y_test, pred_SMOTE),
    "recall: ": recall_score(y_test, pred_SMOTE),
    "f1: ": f1_score(y_test, pred_SMOTE),
}
for score_label, score_value in smote_scores.items():
    print(score_label, score_value)

accuracy:  0.717206132879046
precision:  0.474964234620887
recall:  0.7170626349892009
f1:  0.5714285714285715


In [None]:
# Recall rose, but precision dropped. We now catch significantly more churned customers (about 72% of all churned customers), but in the process we also flag more customers who were not going to churn (only about 47% of all flagged customers actually churned).