## Lab | Cross Validation

- Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

- Importing the dataset

In [2]:
df = pd.read_csv('Customer-Churn.csv')
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [3]:
#checking for nulls just in case
df.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

### Apply SMOTE for upsampling the data
- Use logistic regression to fit the model and compute the accuracy of the model.
- Use decision tree classifier to fit the model and compute the accuracy of the model.
- Compare the accuracies of the two models.

In [4]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [5]:
#splitting the dataset
X = df.drop("Churn", axis=1)
y = df["Churn"]

In [6]:
#Encoding 
X_encoded = pd.get_dummies(X)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [8]:
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [9]:
#logistic model part
logreg = LogisticRegression()

In [10]:
logreg.fit(X_train_resampled, y_train_resampled)

In [11]:
y_pred_logreg = logreg.predict(X_test)

In [12]:
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)

In [13]:
print("Logistic Regression Accuracy:", accuracy_logreg)

Logistic Regression Accuracy: 0.794180269694819


In [14]:
#decision tree part
dt_classifier = DecisionTreeClassifier()

In [15]:
dt_classifier.fit(X_train_resampled, y_train_resampled)

In [16]:
y_pred_dt = dt_classifier.predict(X_test)

In [17]:
accuracy_dt = accuracy_score(y_test, y_pred_dt)

In [18]:
print("Decision Tree Classifier Accuracy:", accuracy_dt)

Decision Tree Classifier Accuracy: 0.7643718949609652


In [19]:
#comparing the accuracies
print("Logistic Regression Accuracy:", accuracy_logreg)
print("Decision Tree Classifier Accuracy:", accuracy_dt)

Logistic Regression Accuracy: 0.794180269694819
Decision Tree Classifier Accuracy: 0.7643718949609652


### Apply TomekLinks for downsampling

- Use logistic regression to fit the model and compute the accuracy of the model.
- Use decision tree classifier to fit the model and compute the accuracy of the model.
- Compare the accuracies of the two models.

In [20]:
from imblearn.under_sampling import TomekLinks

In [21]:
tomek = TomekLinks()
X_train_resampled, y_train_resampled = tomek.fit_resample(X_train, y_train)

In [22]:
#logistic part
logreg = LogisticRegression()

In [23]:
logreg.fit(X_train_resampled, y_train_resampled)

In [24]:
y_pred_logreg = logreg.predict(X_test)

In [25]:
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)

In [26]:
print("Logistic Regression Accuracy:", accuracy_logreg)

Logistic Regression Accuracy: 0.8041163946061036


In [27]:
#decision tree part
dt_classifier = DecisionTreeClassifier()

In [28]:
dt_classifier.fit(X_train_resampled, y_train_resampled)

In [29]:
y_pred_dt = dt_classifier.predict(X_test)

In [30]:
accuracy_dt = accuracy_score(y_test, y_pred_dt)

In [31]:
print("Decision Tree Classifier Accuracy:", accuracy_dt)

Decision Tree Classifier Accuracy: 0.7650816181689141


In [32]:
#comparing
print("Logistic Regression Accuracy:", accuracy_logreg)
print("Decision Tree Classifier Accuracy:", accuracy_dt)

Logistic Regression Accuracy: 0.8041163946061036
Decision Tree Classifier Accuracy: 0.7650816181689141


- You can also apply this algorithm one more time and check the how the imbalance in the two classes changed from the last time.

In [33]:
X_train_resampled2, y_train_resampled2 = tomek.fit_resample(X_train_resampled, y_train_resampled)

In [34]:
original_imbalance_ratio = y_train.value_counts()[1] / y_train.value_counts()[0]
after_tomek_imbalance_ratio = y_train_resampled2.value_counts()[1] / y_train_resampled2.value_counts()[0]

print("Original Class Imbalance Ratio:", original_imbalance_ratio)
print("After TomekLinks Class Imbalance Ratio:", after_tomek_imbalance_ratio)

Original Class Imbalance Ratio: 0.36152730787820203
After TomekLinks Class Imbalance Ratio: 0.40829694323144106


## Lab | Random Forests

 - Apply the Random Forests algorithm but this time only by upscaling the data using SMOTE.

In [37]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [40]:
model = RandomForestClassifier()

In [41]:
model.fit(X_train_smote, y_train_smote)

In [42]:
accuracy = accuracy_score(X_test, y_test)

ValueError: Classification metrics can't handle a mix of continuous-multioutput and binary targets

In [43]:
#Igothelpforthisone
accuracy = model.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.7615330021291696
