
# Developing a logistic regression model to predict customer churn based on features such as 'tenure', 'MonthlyCharges', and 'TotalCharges'. The model will aim at accurately classifying customers as churned or not churned, enabling preventive retention strategies to mitigate churn and improve customer retention rates.

# Importing necessary Libraries

In [563]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

# Loading  and Preprocessing Data

In [564]:
# Importing the dataset 
df=pd.read_csv(r"c:\Users\ESTHER\Desktop\exam_cohort3\exam_cohort3\data.csv")
print(df)

      customerID  gender  SeniorCitizen Partner Dependents  tenure  \
0     7590-VHVEG  Female              0     Yes         No       1   
1     5575-GNVDE    Male              0      No         No      34   
2     3668-QPYBK    Male              0      No         No       2   
3     7795-CFOCW    Male              0      No         No      45   
4     9237-HQITU  Female              0      No         No       2   
...          ...     ...            ...     ...        ...     ...   
7038  6840-RESVB    Male              0     Yes        Yes      24   
7039  2234-XADUH  Female              0     Yes        Yes      72   
7040  4801-JZAZL  Female              0     Yes        Yes      11   
7041  8361-LTMKD    Male              1     Yes         No       4   
7042  3186-AJIEK    Male              0      No         No      66   

     PhoneService     MultipleLines InternetService OnlineSecurity  ...  \
0              No  No phone service             DSL             No  ...   
1        

In [565]:
# Checking for non-numeric values in numerical columns
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

for feature in numerical_features:
    non_numeric_count = df[feature].apply(lambda x: not x.isnumeric() if isinstance(x, str) else False).sum()
    print(f"Number of non-numeric values in {feature}: {non_numeric_count}")

Number of non-numeric values in tenure: 0
Number of non-numeric values in MonthlyCharges: 0
Number of non-numeric values in TotalCharges: 6719


In [566]:
# First converting 'TotalCharges' column to numeric, coerce errors to NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Filling missing values with mean
df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace=True)
print(df['TotalCharges'] )

0         29.85
1       1889.50
2        108.15
3       1840.75
4        151.65
         ...   
7038    1990.50
7039    7362.90
7040     346.45
7041     306.60
7042    6844.50
Name: TotalCharges, Length: 7043, dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace=True)


In [567]:
# Standardizing numerical features
scaler = StandardScaler()
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Print the DataFrame 
print(df)

      customerID  gender  SeniorCitizen Partner Dependents    tenure  \
0     7590-VHVEG  Female              0     Yes         No -1.277445   
1     5575-GNVDE    Male              0      No         No  0.066327   
2     3668-QPYBK    Male              0      No         No -1.236724   
3     7795-CFOCW    Male              0      No         No  0.514251   
4     9237-HQITU  Female              0      No         No -1.236724   
...          ...     ...            ...     ...        ...       ...   
7038  6840-RESVB    Male              0     Yes        Yes -0.340876   
7039  2234-XADUH  Female              0     Yes        Yes  1.613701   
7040  4801-JZAZL  Female              0     Yes        Yes -0.870241   
7041  8361-LTMKD    Male              1     Yes         No -1.155283   
7042  3186-AJIEK    Male              0      No         No  1.369379   

     PhoneService     MultipleLines InternetService OnlineSecurity  ...  \
0              No  No phone service             DSL         

In [568]:
# Encoding categorical features
# Performing one-hot encoding for categorical features
categorical_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 
                        'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                        'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 
                        'PaperlessBilling', 'PaymentMethod']

# Use pandas get_dummies function for one-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_features)

# Display the encoded DataFrame
print(df_encoded)

      customerID  SeniorCitizen    tenure  MonthlyCharges  TotalCharges Churn  \
0     7590-VHVEG              0 -1.277445       -1.160323     -0.994971    No   
1     5575-GNVDE              0  0.066327       -0.259629     -0.173876    No   
2     3668-QPYBK              0 -1.236724       -0.362660     -0.960399   Yes   
3     7795-CFOCW              0  0.514251       -0.746535     -0.195400    No   
4     9237-HQITU              0 -1.236724        0.197365     -0.941193   Yes   
...          ...            ...       ...             ...           ...   ...   
7038  6840-RESVB              0 -0.340876        0.665992     -0.129281    No   
7039  2234-XADUH              0  1.613701        1.277533      2.242808    No   
7040  4801-JZAZL              0 -0.870241       -1.168632     -0.855182    No   
7041  8361-LTMKD              1 -1.155283        0.320338     -0.872777   Yes   
7042  3186-AJIEK              0  1.369379        1.358961      2.013917    No   

      gender_Female  gender

In [569]:
# Spliting the dataset into X and y
# Separating input features (X) and target variable (y)
X = df.drop(columns=['Churn'])  
y = df['Churn']  

In [570]:
# Printing first rows of  X and y
print("Input Features (X):")
print(X.head())  
print("\nTarget Variable (y):")
print(y.head())  

Input Features (X):
   customerID  gender  SeniorCitizen Partner Dependents    tenure  \
0  7590-VHVEG  Female              0     Yes         No -1.277445   
1  5575-GNVDE    Male              0      No         No  0.066327   
2  3668-QPYBK    Male              0      No         No -1.236724   
3  7795-CFOCW    Male              0      No         No  0.514251   
4  9237-HQITU  Female              0      No         No -1.236724   

  PhoneService     MultipleLines InternetService OnlineSecurity OnlineBackup  \
0           No  No phone service             DSL             No          Yes   
1          Yes                No             DSL            Yes           No   
2          Yes                No             DSL            Yes          Yes   
3           No  No phone service             DSL            Yes           No   
4          Yes                No     Fiber optic             No           No   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0     

In [571]:
# Spliting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [572]:
# Printing the shapes of the training and testing sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (5634, 20)
Shape of X_test: (1409, 20)
Shape of y_train: (5634,)
Shape of y_test: (1409,)


# Logistic Regression

In [573]:
# Creating OneHotEncoder instance
encoder = OneHotEncoder(handle_unknown='ignore')

In [574]:
# Fitting and transforming the encoder on training data
X_train_encoded = encoder.fit_transform(X_train)

In [575]:
# Transforming test data using the same encoder
X_test_encoded = encoder.transform(X_test)

In [576]:
# Creating and fitting Logistic Regression model
logreg = LogisticRegression()
logreg.fit(X_train_encoded, y_train)

In [577]:
# Predicting using the trained model
y_pred_logreg = logreg.predict(X_test_encoded)

In [578]:
# Evaluating the model
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
conf_matrix_logreg = confusion_matrix(y_test, y_pred_logreg)
report_logreg = classification_report(y_test, y_pred_logreg)

In [579]:
# Print accuracy
print("Logistic Regression Accuracy:", accuracy_logreg)

Logistic Regression Accuracy: 0.8041163946061036


In [580]:
# Print confusion matrix
print("Confusion Matrix:\n", conf_matrix_logreg)

Confusion Matrix:
 [[933 103]
 [173 200]]


In [581]:
# Print classification report
print("Classification Report:\n", report_logreg)

Classification Report:
               precision    recall  f1-score   support

          No       0.84      0.90      0.87      1036
         Yes       0.66      0.54      0.59       373

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.79      0.80      0.80      1409



# K-Nearest Neighbor Cluster algorithm

In [582]:
# Creating and fitting KNN model
knn = KNeighborsClassifier()
knn.fit(X_train_encoded, y_train)

In [583]:
# Predicting using the trained model
y_pred_knn = knn.predict(X_test_encoded)

In [584]:
# Evaluating the model
accuracy_knn = accuracy_score(y_test, y_pred_knn)
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn)
report_knn = classification_report(y_test, y_pred_knn)

In [585]:
# Print accuracy
print("K-Nearest Neighbors Accuracy:", accuracy_knn)

K-Nearest Neighbors Accuracy: 0.7537260468417317


In [586]:
# Print confusion matrix
print("Confusion Matrix:\n", conf_matrix_knn)

Confusion Matrix:
 [[866 170]
 [177 196]]


In [587]:
# Print classification report
print("Classification Report:\n", report_knn)

Classification Report:
               precision    recall  f1-score   support

          No       0.83      0.84      0.83      1036
         Yes       0.54      0.53      0.53       373

    accuracy                           0.75      1409
   macro avg       0.68      0.68      0.68      1409
weighted avg       0.75      0.75      0.75      1409



# b) (a) Evaluate your models (logistic and KNN) using the following metrics [”Algorithm”, ”ROC AUC Mean”, ”ROC AUC STD”, ”Accuracy Mean”, ”Accuracy STD”]


In [588]:
# Defining logistic regression and KNN models
logistic_model = Pipeline([
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression())
])

knn_model = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])


In [589]:
# Defining the metrics to evaluate
metrics = {
    "Algorithm": ["Logistic Regression", "K-Nearest Neighbors"],
    "ROC AUC Mean": [],
    "ROC AUC STD": [],
    "Accuracy Mean": [],
    "Accuracy STD": []
}

In [590]:
# Evaluating models using cross-validation
models = [logistic_model, knn_model]
for model in models:
    # Computing ROC AUC scores
    roc_auc_scores = cross_val_score(model, X_processed, y, cv=5, scoring='roc_auc')
    roc_auc_mean = roc_auc_scores.mean()
    roc_auc_std = roc_auc_scores.std()
    
    # Computing accuracy scores
    accuracy_scores = cross_val_score(model, X_processed, y, cv=5, scoring='accuracy')
    accuracy_mean = accuracy_scores.mean()
    accuracy_std = accuracy_scores.std()
    
    # Appending results to metrics dictionary
    metrics["ROC AUC Mean"].append(roc_auc_mean)
    metrics["ROC AUC STD"].append(roc_auc_std)
    metrics["Accuracy Mean"].append(accuracy_mean)
    metrics["Accuracy STD"].append(accuracy_std)

In [591]:
# Printing the results
results_df = pd.DataFrame(metrics)
print(results_df)


             Algorithm  ROC AUC Mean  ROC AUC STD  Accuracy Mean  Accuracy STD
0  Logistic Regression      0.822538     0.010389       0.786454      0.009141
1  K-Nearest Neighbors      0.770730     0.010621       0.770836      0.002730
