In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('customer_churn.csv', index_col=0)
data.head()

Unnamed: 0_level_0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
#Load the dataset and explore the variables.
#We will try to predict variable Churn using a logistic regression on variables tenure, SeniorCitizen,MonthlyCharges.
#Extract the target variable.

In [5]:
data.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [8]:
target_variable = data[['Churn']]
target_variable

Unnamed: 0_level_0,Churn
customerID,Unnamed: 1_level_1
7590-VHVEG,No
5575-GNVDE,No
3668-QPYBK,Yes
7795-CFOCW,No
9237-HQITU,Yes
...,...
6840-RESVB,No
2234-XADUH,No
4801-JZAZL,No
8361-LTMKD,Yes


In [None]:
#Extract the independent variables and scale them.

In [9]:
independent_variables = data[['tenure', 'SeniorCitizen','MonthlyCharges']]
independent_variables

Unnamed: 0_level_0,tenure,SeniorCitizen,MonthlyCharges
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7590-VHVEG,1,0,29.85
5575-GNVDE,34,0,56.95
3668-QPYBK,2,0,53.85
7795-CFOCW,45,0,42.30
9237-HQITU,2,0,70.70
...,...,...,...
6840-RESVB,24,0,84.80
2234-XADUH,72,0,103.20
4801-JZAZL,11,0,29.60
8361-LTMKD,4,1,74.40


In [10]:
independent_variables.isna().sum()

tenure            0
SeniorCitizen     0
MonthlyCharges    0
dtype: int64

In [11]:
independent_variables['tenure'].value_counts()

1     613
72    362
2     238
3     200
4     176
     ... 
28     57
39     56
44     51
36     50
0      11
Name: tenure, Length: 73, dtype: int64

In [13]:
independent_variables['SeniorCitizen'].value_counts()

0    5901
1    1142
Name: SeniorCitizen, dtype: int64

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_scaled = scaler.fit_transform(independent_variables)

In [None]:
#Build the logistic regression model.

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

y = data['Churn']

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


In [21]:
prediction = LogisticRegression()

In [22]:
prediction.fit(X_train, y_train)

In [26]:
predictions = prediction.predict(X_test)
predictions

array(['No', 'No', 'No', ..., 'No', 'No', 'No'], dtype=object)

In [None]:
#Evaluate the model.

In [27]:
prediction.score(X_test,y_test)

0.7936583057264552

In [28]:
pd.Series(predictions).value_counts()

No     1737
Yes     376
dtype: int64

In [None]:
#Even a simple model will give us more than 70% accuracy. Why?
# Because the data was clean and we used the right variables.

In [None]:
#Synthetic Minority Oversampling TEchnique (SMOTE) is an over sampling technique based on nearest neighbors that adds new 
#points between existing points. Apply imblearn.over_sampling.SMOTE to the dataset. Build and evaluate the logistic 
#regression model. Is it there any improvement?

In [31]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
prediction = LogisticRegression()
prediction.fit(X_train, y_train)
y_pred = prediction.predict(X_test)

In [33]:
prediction.score(X_test,y_test)

0.7468599033816425

In [None]:
# There was no improvement.