In [28]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, f1_score, recall_score

from imblearn.over_sampling import SMOTE

from sklearn.metrics import confusion_matrix, accuracy_score

# Load the dataset and explore the variables.

In [2]:
data = pd.read_csv('customer_churn.csv')

In [3]:
data

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [4]:
data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [5]:
data.info


<bound method DataFrame.info of       customerID  gender  SeniorCitizen Partner Dependents  tenure  \
0     7590-VHVEG  Female              0     Yes         No       1   
1     5575-GNVDE    Male              0      No         No      34   
2     3668-QPYBK    Male              0      No         No       2   
3     7795-CFOCW    Male              0      No         No      45   
4     9237-HQITU  Female              0      No         No       2   
...          ...     ...            ...     ...        ...     ...   
7038  6840-RESVB    Male              0     Yes        Yes      24   
7039  2234-XADUH  Female              0     Yes        Yes      72   
7040  4801-JZAZL  Female              0     Yes        Yes      11   
7041  8361-LTMKD    Male              1     Yes         No       4   
7042  3186-AJIEK    Male              0      No         No      66   

     PhoneService     MultipleLines InternetService OnlineSecurity  ...  \
0              No  No phone service             DSL 

In [6]:
data["Churn"].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

# We will try to predict variable Churn using a logistic regression on variables tenure, SeniorCitizen, MonthlyCharges.

In [7]:



# Check data types
data[['tenure', 'SeniorCitizen', 'MonthlyCharges']].dtypes


tenure              int64
SeniorCitizen       int64
MonthlyCharges    float64
dtype: object

In [8]:

# Summary statistics
data[['tenure', 'SeniorCitizen', 'MonthlyCharges']].describe()



Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,32.371149,0.162147,64.761692
std,24.559481,0.368612,30.090047
min,0.0,0.0,18.25
25%,9.0,0.0,35.5
50%,29.0,0.0,70.35
75%,55.0,0.0,89.85
max,72.0,1.0,118.75


# Split the Dataset into X ('tenure', 'SeniorCitizen', 'MonthlyCharges') and y ('Churn')

In [9]:
# Split the dataset into X (features) and y (target)
X = data[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
y = data['Churn']



# Build the logistic regression model.

In [10]:
# Treat numerical features
num_transformer = MinMaxScaler().fit(X)
X = pd.DataFrame(num_transformer.transform(X), columns=X.columns)


In [11]:
X

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,0.013889,0.0,0.115423
1,0.472222,0.0,0.385075
2,0.027778,0.0,0.354229
3,0.625000,0.0,0.239303
4,0.027778,0.0,0.521891
...,...,...,...
7038,0.333333,0.0,0.662189
7039,1.000000,0.0,0.845274
7040,0.152778,0.0,0.112935
7041,0.055556,1.0,0.558706


In [12]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)

In [13]:
# Build the logistic regression model
model = LogisticRegression(random_state=0, solver='saga', multi_class='auto')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model.

In [14]:
# Calculate the classification accuracy
accuracy = model.score(X_test, y_test)
accuracy

0.7863733144073811

The model is able to correctly predict the outcome for around 78.6% of the total instances in the test dataset. 

In [20]:
# Convert 'Churn' labels to numerical (0 and 1), forgot to do it earlier
y_test_numeric = y_test.map({'No': 0, 'Yes': 1})

# Convert predicted labels to numerical (0 and 1)
y_pred_numeric = [0 if label == 'No' else 1 for label in y_pred]

# Calculate the confusion matrix
confusion_matrix(y_test_numeric, y_pred_numeric)


array([[937,  96],
       [205, 171]], dtype=int64)

In this confusion matrix:

937 are True Negatives

171 are True Positives 

96 are False Positives 

205 are False Negatives 

#  Even a simple model will give us more than 70% accuracy. Why?

This happens due to the Class distribution.  
If the classes in the target variable (in this case, 'Churn') are imbalanced, where one class dominates the other, 
a simple model might tend to predict the majority class most of the time and still achieve a relatively high accuracy. 

#  Apply imblearn.over_sampling.SMOTE to the dataset.

In [23]:
# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [25]:
# Build and evaluate the logistic regression model
model = LogisticRegression(random_state=0, solver='saga', multi_class='auto')
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test)

In [26]:
# Calculate the confusion matrix and accuracy
confusion_matrix(y_test, y_pred)


array([[760, 273],
       [104, 272]], dtype=int64)

In [29]:
accuracy_score(y_test, y_pred)

0.7324343506032647

By apllying smote, we managed to balance the class distribution in the data, wich lead to an increase in the number of correct prediction for the menority
class (churn: yes).
However we can check that the overall accuracy has slightly decreased, wich is expected because using SMOTE,
the model makes more predicitions for the minority class.

