In [1]:
'''
# Lab | Imbalanced data

We will be using the `files_for_lab/customer_churn.csv` dataset to build a churn predictor.

### Instructions

1. Load the dataset and explore the variables.
2. We will try to predict variable `Churn` using a logistic regression on variables `tenure`, `SeniorCitizen`,`MonthlyCharges`.
3. Extract the target variable.
4. Extract the independent variables and scale them.
5. Build the logistic regression model.
6. Evaluate the model.
7. Even a simple model will give us more than 70% accuracy. Why?
8. **Synthetic Minority Oversampling TEchnique (SMOTE)
** is an over sampling technique based on nearest neighbors that adds new points between existing points. 
Apply `imblearn.over_sampling.SMOTE` to the dataset. Build and evaluate the logistic regression model. 
Is it there any improvement?'''

'\n# Lab | Imbalanced data\n\nWe will be using the `files_for_lab/customer_churn.csv` dataset to build a churn predictor.\n\n### Instructions\n\n1. Load the dataset and explore the variables.\n2. We will try to predict variable `Churn` using a logistic regression on variables `tenure`, `SeniorCitizen`,`MonthlyCharges`.\n3. Extract the target variable.\n4. Extract the independent variables and scale them.\n5. Build the logistic regression model.\n6. Evaluate the model.\n7. Even a simple model will give us more than 70% accuracy. Why?\n8. **Synthetic Minority Oversampling TEchnique (SMOTE)\n** is an over sampling technique based on nearest neighbors that adds new points between existing points. \nApply `imblearn.over_sampling.SMOTE` to the dataset. Build and evaluate the logistic regression model. \nIs it there any improvement?'

In [6]:
import imblearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
# To ignore all warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [16]:
df = pd.read_csv(r"C:\Users\filip\OneDrive\Desktop\IRONHACK\Labs\Week5\lab-imbalanced-data\files_for_lab\customer_churn.csv")

# Normalizing column names

cols = []

for i in range(len(df.columns)):
    cols.append(df.columns[i].lower().replace(' ','_'))
    
df.columns = cols

# Checking data types
df.dtypes


# Checking for null values and droping
df.isna().sum() #will drop, not significant


# Converting 'churn'

df.loc[df['churn'] == 'Yes', 'churn'] = 1
df.loc[df['churn'] == 'No', 'churn'] = 0

# Dropping columns
df = df[['churn', 'tenure', 'seniorcitizen', 'monthlycharges']]

#Converting target to numerical

df['churn'] = pd.to_numeric(df['churn'])

# Checking target count
df.groupby('churn').count() #we have imbalanced data



Unnamed: 0_level_0,tenure,seniorcitizen,monthlycharges
churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5174,5174,5174
1,1869,1869,1869


In [17]:
# Building a logistic regression model

X = df.drop('churn',axis = 1)
y = df['churn']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size = 0.2)

LR = LogisticRegression()
LR.fit(X_train, y_train)

LR.score(X_test, y_test)

0.8055358410220014

In [None]:
# Accuracy = (Number of Correct Predictions) / (Total Number of Predictions)
# Our accuracy is high but it only means we predicted 80% of the results but it doesn't provide provide a comprehensive assessment of the model's performance. 
#The evaluation metric depends on the specific goals, we should look at recall

In [19]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report


pred = LR.predict(X_test)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))


print(classification_report(y_test, pred))

precision:  0.6911196911196911
recall:  0.47989276139410186
f1:  0.5664556962025317
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1036
           1       0.69      0.48      0.57       373

    accuracy                           0.81      1409
   macro avg       0.76      0.70      0.72      1409
weighted avg       0.79      0.81      0.79      1409



In [20]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred)

array([[956,  80],
       [194, 179]], dtype=int64)

In [None]:
# Our recall is very low we(48%), missed 194 cases
# Will use SMOTE to try to improve recall

In [22]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state =0,sampling_strategy=1)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train,y_train)

train_smote = pd.concat([X_train_SMOTE,y_train_SMOTE], axis = 1)
train_smote
train_smote.sum()

tenure            231063.000000
seniorcitizen       1179.000000
monthlycharges    562821.518704
churn               4138.000000
dtype: float64

In [23]:
LR = LogisticRegression()
LR.fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

print(classification_report(y_test, pred))

precision:  0.5126811594202898
recall:  0.7587131367292225
f1:  0.6118918918918917
              precision    recall  f1-score   support

           0       0.89      0.74      0.81      1036
           1       0.51      0.76      0.61       373

    accuracy                           0.75      1409
   macro avg       0.70      0.75      0.71      1409
weighted avg       0.79      0.75      0.76      1409



In [24]:
confusion_matrix(y_test,pred)

array([[767, 269],
       [ 90, 283]], dtype=int64)

In [None]:
# The recall improved to 75% and we missed 90 case instead of 194