In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
import warnings

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

In [17]:
churnData = pd.read_csv('files_for_lab/Customer-Churn.csv')
# churnData.columns = [column.lower().replace(" ", "_") for column in churnData.columns]
churnData

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [18]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')

In [19]:

churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')

non_numeric = churnData[churnData['TotalCharges'].isna()]
print(non_numeric)

      gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
488   Female              0     Yes        Yes       0           No   
753     Male              0      No        Yes       0          Yes   
936   Female              0     Yes        Yes       0          Yes   
1082    Male              0     Yes        Yes       0          Yes   
1340  Female              0     Yes        Yes       0           No   
3331    Male              0     Yes        Yes       0          Yes   
3826    Male              0     Yes        Yes       0          Yes   
4380  Female              0     Yes        Yes       0          Yes   
5218    Male              0     Yes        Yes       0          Yes   
6670  Female              0     Yes        Yes       0          Yes   
6754    Male              0      No        Yes       0          Yes   

           OnlineSecurity         OnlineBackup     DeviceProtection  \
488                   Yes                   No                  Yes   
753  

In [21]:
churnData.isnull().sum()
churnData['TotalCharges'].fillna(churnData['TotalCharges'].mean(), inplace=True)

In [22]:
features = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
target = churnData['Churn']

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.3, random_state=0)

In [24]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.7884524372929484


In [25]:
target.value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [26]:

churn_majority = churnData[churnData.Churn=='No']
churn_minority = churnData[churnData.Churn=='Yes']

churn_majority_downsampled = resample(churn_majority, 
                                      replace=False,   
                                      n_samples=len(churn_minority), 
                                      random_state=123) 


churn_downsampled = pd.concat([churn_majority_downsampled, churn_minority])


print(churn_downsampled.Churn.value_counts())


Churn
No     1869
Yes    1869
Name: count, dtype: int64


In [28]:

features_downsampled = churn_downsampled[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
target_downsampled = churn_downsampled['Churn']

features_downsampled_scaled = scaler.fit_transform(features_downsampled)


X_train_down, X_test_down, y_train_down, y_test_down = train_test_split(features_downsampled_scaled, target_downsampled, test_size=0.3, random_state=0)


model_downsampled = LogisticRegression()
model_downsampled.fit(X_train_down, y_train_down)
y_pred_down = model_downsampled.predict(X_test_down)


print("Accuracy on downsampled data:", accuracy_score(y_test_down, y_pred_down))

Accuracy on downsampled data: 0.7263814616755794


In [27]:

churn_minority_upsampled = resample(churn_minority, 
                                    replace=True,    
                                    n_samples=len(churn_majority),  
                                    random_state=123) 

churn_upsampled = pd.concat([churn_majority, churn_minority_upsampled])


print(churn_upsampled.Churn.value_counts())



Churn
No     5174
Yes    5174
Name: count, dtype: int64


In [29]:

features_upsampled = churn_upsampled[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
target_upsampled = churn_upsampled['Churn']


features_upsampled_scaled = scaler.fit_transform(features_upsampled)


X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(features_upsampled_scaled, target_upsampled, test_size=0.3, random_state=0)


model_upsampled = LogisticRegression()
model_upsampled.fit(X_train_up, y_train_up)
y_pred_up = model_upsampled.predict(X_test_up)


print("Accuracy on upsampled data:", accuracy_score(y_test_up, y_pred_up))

Accuracy on upsampled data: 0.7342995169082126
