# 1./2. Load the dataset and explore the variables; try to predict variable Churn using a logistic regression on variables tenure, SeniorCitizen, MonthlyCharges.

In [63]:
!pip install imblearn
import imblearn
import pymysql
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import getpass
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder
password = getpass.getpass()

········


In [65]:
# make the connection with the database
connection_string = 'mysql+pymysql://root:' + password + '@localhost/customer_churn'
engine = create_engine(connection_string)

# 3. Extract the target variable.

In [66]:
query = '''
        SELECT Churn
        FROM customer_churn
        '''
churn_data = pd.read_sql_query(query, engine) # create query

targ_variable = churn_data['Churn'] # extract and print targ variable
print(targ_variable)

0        No
1        No
2       Yes
3        No
4       Yes
       ... 
7027     No
7028     No
7029     No
7030    Yes
7031     No
Name: Churn, Length: 7032, dtype: object


# 4. Extract the independent variables and scale

>Extract other 3 variables along with Churn

In [67]:
data = pd.read_sql_query('SELECT * FROM customer_churn', engine)
query = '''
        SELECT tenure, SeniorCitizen, MonthlyCharges, Churn
        FROM customer_churn
        '''
data = pd.read_sql_query(query, engine)
data.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn
0,1,0,29.85,No
1,34,0,56.95,No
2,2,0,53.85,Yes
3,45,0,42.3,No
4,2,0,70.7,Yes


In [68]:
data.dtypes # unsure whether to leave Churn as categorical or not... will leave for now.

tenure              int64
SeniorCitizen       int64
MonthlyCharges    float64
Churn              object
dtype: object

>Use scaler

In [69]:
scaler = StandardScaler()
X = pd.get_dummies(data.drop('Churn', axis=1))
y = data['Churn']
X_scaled = scaler.fit_transform(X)


# 5. Build the logistic regression model.

In [70]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

predictions = classification.predict(X_test)
print(classification_report(y_test, predictions))

Accuracy: 0.783226723525231
              precision    recall  f1-score   support

          No       0.82      0.91      0.86      1033
         Yes       0.63      0.44      0.52       374

    accuracy                           0.78      1407
   macro avg       0.72      0.67      0.69      1407
weighted avg       0.77      0.78      0.77      1407



# 6. Evaluate the model.

In [71]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='Yes')
recall = recall_score(y_test, y_pred, pos_label='Yes')
f1 = f1_score(y_test, y_pred, pos_label='Yes')

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.783226723525231
Precision: 0.6292134831460674
Recall: 0.44919786096256686
F1-score: 0.5241809672386896


In [72]:
# Accuracy: model has accuracy of 78.32%; this amount of predictions made by model were correct.
# precision: when the model predicts a customer will churn, it is right only 62.92% of the time.
# recall: proportion of positive cases (churn) that were correctly predicted by model; not so good
# F1: mean of precision/recall; not sure if this is decent or not

# 7. Even a simple model will give us more than 70% accuracy. Why?

In [73]:
# Perhaps data imbalance: when the categories are unbalanced and the number of them arent equal/similar, then a model can still have high accuracy by predicting the majority class most of the time, since it's better represented in the data set.

# 8. Synthetic Minority Oversampling TEchnique (SMOTE) is an over sampling technique based on nearest neighbors that adds new points between existing points. Apply imblearn.over_sampling.SMOTE to the dataset. Build and evaluate the logistic regression model. Is it there any improvement?

>Apply Smote to the data set, see values before and after

In [74]:
from imblearn.over_sampling import SMOTE

smote = SMOTE() 
X_resampled, y_resampled = smote.fit_resample(data.drop('Churn', axis=1), data['Churn'])

# Value counts before
print("BeforeSMOTE:")
print(y.value_counts())

# Value counts after
print("AfterSMOTE:")
print(pd.Series(y_resampled).value_counts())

BeforeSMOTE:
No     5163
Yes    1869
Name: Churn, dtype: int64
AfterSMOTE:
No     5163
Yes    5163
Name: Churn, dtype: int64


>Repeat the logisic regression model and comment

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Build the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='Yes')
recall = recall_score(y_test, y_pred, pos_label='Yes')
f1 = f1_score(y_test, y_pred, pos_label='Yes')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.7270087124878993
Precision: 0.7255092143549952
Recall: 0.7269193391642371
F1-score: 0.7262135922330096


In [76]:
# Accuracy has slightly declined, but all other scores have improved. The recall score has significantly imoproved.

# 9. Tomek links are pairs of very close instances, but of opposite classes. Removing the instances of the majority class of each pair increases the space between the two classes, facilitating the classification process. Apply imblearn.under_sampling.TomekLinks to the dataset. Build and evaluate the logistic regression model. Is it there any improvement?

>Apply Tomeklinks to the dataset

In [77]:
from imblearn.under_sampling import TomekLinks

X = data.drop('Churn', axis=1)
y = data['Churn']

tomek_links = TomekLinks()
X_resampled, y_resampled = tomek_links.fit_resample(X, y)

# Check the class distribution after undersampling
print(y_resampled.value_counts())

No     4701
Yes    1869
Name: Churn, dtype: int64


> Run again the logistic regression model

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='Yes')
recall = recall_score(y_test, y_pred, pos_label='Yes')
f1 = f1_score(y_test, y_pred, pos_label='Yes')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.7732115677321156
Precision: 0.6291390728476821
Recall: 0.5053191489361702
F1-score: 0.56047197640118
