In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [2]:
churn=pd.read_csv('customer_churn.csv')
churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


#### We will try to predict variable Churn using a logistic regression on variables: tenure, SeniorCitizen,MonthlyCharges.

In [3]:
#create dataframe I gonna continue working with
data=churn[['tenure', 'SeniorCitizen','MonthlyCharges','Churn']].copy()
data.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn
0,1,0,29.85,No
1,34,0,56.95,No
2,2,0,53.85,Yes
3,45,0,42.3,No
4,2,0,70.7,Yes


#### check for imbalance in target variable

In [4]:
data.Churn.value_counts(dropna=False)

No     5174
Yes    1869
Name: Churn, dtype: int64

### we see a large imbalance to No, which menas most people dont cancel their subscription

### Scaling of indepedent variables:

In [5]:
# X/y split
X = data.drop('Churn',axis = 1)
y = data['Churn']
# but we also need to transform out target into a boolean 0 or 1
y=y.map(dict(Yes=1, No=0))
print(y)

#need to do train-test-split before
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64


In [6]:
data.dtypes #all are numerical except the target of course

tenure              int64
SeniorCitizen       int64
MonthlyCharges    float64
Churn              object
dtype: object

In [7]:
data.isna().sum()# no NaNs

tenure            0
SeniorCitizen     0
MonthlyCharges    0
Churn             0
dtype: int64

In [8]:
data['SeniorCitizen'].value_counts() #its a category being sensior or not, therefore doesnt need to be scaled with minmax

0    5901
1    1142
Name: SeniorCitizen, dtype: int64

In [9]:
X_num_train=X_train.drop(['SeniorCitizen'],axis=1)
X_cat_train=X_train[['SeniorCitizen']]


X_num_test=X_test.drop(['SeniorCitizen'],axis=1)
X_cat_test=X_test[['SeniorCitizen']]

In [10]:
#standarize X_num_train and X_num_test


transformer = StandardScaler().fit(X_num_train)
X_num_train_scaled = pd.DataFrame(transformer.transform(X_num_train),columns=X_num_train.columns)
# because this is the only tranformation we apply to the numerics, 
# we can immediately transform the X_test as well
X_num_test_scaled = pd.DataFrame(transformer.transform(X_num_test),columns=X_num_test.columns)
X_num_train_scaled.head()

Unnamed: 0,tenure,MonthlyCharges
0,-0.340191,-0.514314
1,0.88321,0.01784
2,-1.196572,-0.819594
3,1.16867,-1.483535
4,-0.829552,0.658427


#### re-scale index that has changed during transformation

In [11]:
y_train = y_train.reset_index(drop=True) 
# needed because the transformation also reset the index for X_train_scaled
# so we need this for y_test as well...
y_test = y_test.reset_index(drop=True) 

#same goes for our senior category that has now a different indexing
X_cat_train = X_cat_train.reset_index(drop=True) 
X_cat_test = X_cat_test.reset_index(drop=True) 

In [12]:
#concat numerical and categorical again
X_train_scaled=pd.concat([X_num_train_scaled , X_cat_train], axis=1)
X_test_scaled=pd.concat([X_num_test_scaled , X_cat_test], axis=1)

### perform oversampling

In [13]:
#reverse split
train = pd.concat([X_train_scaled, y_train],axis=1) #for resampling I need target and data again together
train.head()

Unnamed: 0,tenure,MonthlyCharges,SeniorCitizen,Churn
0,-0.340191,-0.514314,0,0
1,0.88321,0.01784,1,0
2,-1.196572,-0.819594,0,1
3,1.16867,-1.483535,0,0
4,-0.829552,0.658427,0,1


In [14]:
#check for majority and minority
no_churn = train[train['Churn']==0]
yes_churn = train[train['Churn']==1]

display(no_churn.shape)
display(yes_churn.shape) #we oversample the yes reponses so that no and yes will be equal

(3876, 4)

(1406, 4)

In [15]:
# oversample minority
yes_churn_oversampled = resample(yes_churn, 
                                    replace=True, 
                                    n_samples = len(no_churn),
                                    random_state=0)

In [16]:
#size is the same now
display(no_churn.shape)
display(yes_churn_oversampled.shape)

(3876, 4)

(3876, 4)

In [17]:
#concat both
train_oversampled = pd.concat([no_churn,yes_churn_oversampled],axis=0)
train_oversampled.head()

Unnamed: 0,tenure,MonthlyCharges,SeniorCitizen,Churn
0,-0.340191,-0.514314,0,0
1,0.88321,0.01784,1,0
3,1.16867,-1.483535,0,0
5,1.04633,0.503285,0,0
6,-0.258631,0.7018,0,0


In [18]:
#again x-y split for this oversampled data
y_train_over = train_oversampled['Churn'].copy()
X_train_over = train_oversampled.drop('Churn',axis = 1).copy()

### Logistic Regression

In [23]:
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train_over, y_train_over)
pred = LR.predict(X_test_scaled)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred)) #we detect a lot more people how have diabetes, of course precision decreased, but we wanted a higher recall
print("f1: ",f1_score(y_test,pred))



precision:  0.4776978417266187
recall:  0.7170626349892009
f1:  0.5734024179620035


In [26]:
confusion_matrix(y_test,pred)

array([[935, 363],
       [131, 332]])

##### the model is not so precise: of all customers labled as staying subscribed how many actualy stay subscribed, it does a lot of mistakes in that, so has a lot of false positive: 362

#### the model has a good recall:  less false negative: 131

#### F1 is the balance between accuracy and recall

#### the model should have less of the following error: where clients are sorted false negative (so prediction as NO CHURN) but in real CHURN is YES, which means they would cancel the subscrption, so less false negative (prediction NO churn if its actually yes churn), this means a high recall is good

## undersampling

In [22]:
# undersample minority
no_churn_undersampled = resample(no_churn, 
                                    replace=False, 
                                    n_samples = len(yes_churn),
                                    random_state=0)

In [23]:
display(yes_churn.shape)
display(no_churn_undersampled.shape)

(1406, 4)

(1406, 4)

In [25]:
train_undersampled = pd.concat([yes_churn,no_churn_undersampled])
train_undersampled.head()

Unnamed: 0,tenure,MonthlyCharges,SeniorCitizen,Churn
2,-1.196572,-0.819594,0,1
4,-0.829552,0.658427,0,1
12,-1.115012,0.630068,0,1
13,-1.237352,0.199673,0,1
15,-1.155792,0.698464,0,1


In [26]:
y_train_under = train_undersampled['Churn'].copy()
X_train_under = train_undersampled.drop('Churn',axis = 1).copy()

In [27]:
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train_under, y_train_under)
pred = LR.predict(X_test_scaled)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.47701149425287354
recall:  0.7170626349892009
f1:  0.5729076790336497


### both over and under sampled have a higher recall, but loose precision,compared to not scaling the target and feature data

In [28]:
LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train_scaled, y_train)
LR.score(X_test_scaled, y_test)

0.7808063600227144

In [29]:
pred = LR.predict(X_test_scaled)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred)) #aff all people who had diabtes we found only 58%
print("f1: ",f1_score(y_test,pred))

precision:  0.6115942028985507
recall:  0.4557235421166307
f1:  0.5222772277227723


# Now use SMOTE

In [38]:
sm = SMOTE(random_state=100,k_neighbors=5)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train_scaled,y_train)

In [39]:
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test_scaled)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.4776978417266187
recall:  0.7170626349892009
f1:  0.5734024179620035


In [40]:
confusion_matrix(y_test,pred)

array([[935, 363],
       [131, 332]])