In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import statsmodels.api as sm
import seaborn as sns

from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.preprocessing import StandardScaler


pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

In [2]:
data= pd.read_csv('C:/Users/Ouroboros/lab-cross-validation/files_for_lab/Customer-Churn.csv')
data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [3]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

In [4]:
data['TotalCharges'].dtypes

dtype('float64')

In [5]:
data.isnull().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [6]:
data['TotalCharges'] = data['TotalCharges'].fillna(0)

In [8]:
y = data['Churn']

num = ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']
num = data[num]

### Apply SMOTE for upsampling the data

- Use logistic regression to fit the model and compute the accuracy of the model.
- Use decision tree classifier to fit the model and compute the accuracy of the model.
- Compare the accuracies of the two models.

In [17]:
transformer = StandardScaler().fit(num)
X_stan = transformer.transform(num)
X_stan.shape

(7043, 4)

In [18]:
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()


No     5174
Yes    5174
Name: Churn, dtype: int64

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3, random_state=100)
classification = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)
predictions = classification.predict(X_test)

classification.score(X_test, y_test)

0.7420289855072464

Logistic Regression:

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3, random_state=42)
model = LogisticRegression()
model.fit(X_train,y_train)
preds = model.predict(X_test)

In [22]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

          No       0.76      0.73      0.74      1574
         Yes       0.73      0.76      0.74      1531

    accuracy                           0.74      3105
   macro avg       0.74      0.74      0.74      3105
weighted avg       0.74      0.74      0.74      3105



Decision Tree Classifier:

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3, random_state=100)
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))
score_sm_dec = round(model.score(X_test, y_test),2)

              precision    recall  f1-score   support

          No       0.75      0.79      0.77      1557
         Yes       0.78      0.74      0.76      1548

    accuracy                           0.76      3105
   macro avg       0.76      0.76      0.76      3105
weighted avg       0.76      0.76      0.76      3105



### Apply TomekLinks for downsampling

- It is important to remember that it does not make the two classes equal but only removes the points from the majority class that are close to other points in minority class.
- Use logistic regression to fit the model and compute the accuracy of the model.
- Use decision tree classifier to fit the model and compute the accuracy of the model.
- Compare the accuracies of the two models.

In [24]:
tomek = TomekLinks()
X_tl, y_tl = tomek.fit_resample(X_stan, y)
y_tl.value_counts()

No     4666
Yes    1869
Name: Churn, dtype: int64

Logistic Regression:

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.3, random_state=100)
model = LogisticRegression()
model.fit(X_train,y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

          No       0.84      0.90      0.87      1432
         Yes       0.66      0.53      0.59       529

    accuracy                           0.80      1961
   macro avg       0.75      0.71      0.73      1961
weighted avg       0.79      0.80      0.79      1961



Decision Tree:

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3, random_state=100)
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))
score_sm_dec = round(model.score(X_test, y_test),2)

              precision    recall  f1-score   support

          No       0.75      0.79      0.77      1557
         Yes       0.78      0.73      0.76      1548

    accuracy                           0.76      3105
   macro avg       0.76      0.76      0.76      3105
weighted avg       0.76      0.76      0.76      3105

