In [267]:
import pandas as pd

In [268]:
df = pd.read_csv('data/preprocessed/preprocessed_telco_customer_churn.csv')

In [269]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 47 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   customerID                               7043 non-null   float64
 1   SeniorCitizen                            7043 non-null   float64
 2   tenure                                   7043 non-null   float64
 3   MonthlyCharges                           7043 non-null   float64
 4   TotalCharges                             7043 non-null   float64
 5   Churn                                    7043 non-null   int64  
 6   gender_Female                            7043 non-null   float64
 7   gender_Male                              7043 non-null   float64
 8   Partner_No                               7043 non-null   float64
 9   Partner_Yes                              7043 non-null   float64
 10  Dependents_No                            7043 no

# Import algorithms

In [270]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Import metrics

In [271]:
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error

# Train/Test split

In [272]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature matrix.
# NOTE(review): df.info() above lists customerID among the columns, so it stays
# in the features here — confirm an identifier column is meant to be a predictor.
y = df['Churn']
x = df.drop(columns='Churn')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Logistic Regression

In [273]:
# Logistic regression with scikit-learn's default settings; fit() hands back
# the fitted estimator, so construction and training are chained.
lr = LogisticRegression().fit(x_train, y_train)

# Predicted labels for the held-out rows.
y_pred = lr.predict(x_test)

In [274]:
# Score this model from its own predictions instead of the shared `y_pred`
# variable, which every model section overwrites; the cell then reports the
# logistic regression metrics regardless of which section ran last.
lr_pred = lr.predict(x_test)

lr_score = accuracy_score(y_test, lr_pred)
lr_report = classification_report(y_test, lr_pred)
# With 0/1 labels the mean absolute error is the misclassification rate (1 - accuracy).
lr_mae = mean_absolute_error(y_test, lr_pred)

In [275]:
lr_score

0.8176011355571328

In [276]:
print(lr_report)

              precision    recall  f1-score   support

           0       0.86      0.91      0.88      1036
           1       0.69      0.57      0.62       373

    accuracy                           0.82      1409
   macro avg       0.77      0.74      0.75      1409
weighted avg       0.81      0.82      0.81      1409



In [277]:
lr_mae

0.1823988644428673

# Decision Tree

In [278]:
# Decision tree seeded with 42; fit() returns the fitted estimator, so the
# constructor and training call are chained.
dt = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

# Predicted labels for the held-out rows.
y_pred = dt.predict(x_test)

In [279]:
# Predict inside this cell rather than reading the shared `y_pred`, which the
# other model sections also assign; this keeps the decision tree metrics tied
# to the decision tree even when cells are run out of order.
dt_pred = dt.predict(x_test)

dt_score = accuracy_score(y_test, dt_pred)
dt_report = classification_report(y_test, dt_pred)
# With 0/1 labels the mean absolute error is the misclassification rate (1 - accuracy).
dt_mae = mean_absolute_error(y_test, dt_pred)

In [280]:
dt_score

0.723207948899929

In [281]:
print(dt_report)

              precision    recall  f1-score   support

           0       0.82      0.80      0.81      1036
           1       0.48      0.52      0.50       373

    accuracy                           0.72      1409
   macro avg       0.65      0.66      0.65      1409
weighted avg       0.73      0.72      0.73      1409



In [282]:
dt_mae

0.276792051100071

# Random Forest

In [283]:
# Random forest seeded with 42; fit() returns the fitted estimator, so the
# constructor and training call are chained.
rf = RandomForestClassifier(random_state=42).fit(x_train, y_train)

# Predicted labels for the held-out rows.
y_pred = rf.predict(x_test)

In [284]:
# Predict inside this cell rather than reading the shared `y_pred`, which the
# other model sections also assign; this keeps the random forest metrics tied
# to the random forest even when cells are run out of order.
rf_pred = rf.predict(x_test)

rf_score = accuracy_score(y_test, rf_pred)
rf_report = classification_report(y_test, rf_pred)
# With 0/1 labels the mean absolute error is the misclassification rate (1 - accuracy).
rf_mae = mean_absolute_error(y_test, rf_pred)

In [285]:
rf_score

0.794180269694819

In [286]:
print(rf_report)

              precision    recall  f1-score   support

           0       0.83      0.90      0.87      1036
           1       0.65      0.49      0.56       373

    accuracy                           0.79      1409
   macro avg       0.74      0.70      0.71      1409
weighted avg       0.78      0.79      0.78      1409



In [287]:
rf_mae

0.20581973030518097

# SVM

In [288]:
# Support vector classifier with scikit-learn's default settings; fit() returns
# the fitted estimator, so construction and training are chained.
svc = SVC().fit(x_train, y_train)

# Predicted labels for the held-out rows.
y_pred = svc.predict(x_test)

In [289]:
# Predict inside this cell rather than reading the shared `y_pred`, which the
# other model sections also assign; this keeps the SVM metrics tied to the SVM
# even when cells are run out of order.
svc_pred = svc.predict(x_test)

svc_score = accuracy_score(y_test, svc_pred)
svc_report = classification_report(y_test, svc_pred)
# With 0/1 labels the mean absolute error is the misclassification rate (1 - accuracy).
svc_mae = mean_absolute_error(y_test, svc_pred)

In [290]:
svc_score

0.8026969481902059

In [291]:
print(svc_report)

              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1036
           1       0.67      0.50      0.57       373

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409



In [292]:
svc_mae

0.19730305180979418

# KNN

In [293]:
# k-nearest neighbours with scikit-learn's default settings; fit() returns the
# fitted estimator, so construction and training are chained.
knn = KNeighborsClassifier().fit(x_train, y_train)

# Predicted labels for the held-out rows.
y_pred = knn.predict(x_test)

In [294]:
# Predict inside this cell rather than reading the shared `y_pred`, which the
# other model sections also assign; this keeps the KNN metrics tied to the KNN
# model even when cells are run out of order.
knn_pred = knn.predict(x_test)

knn_score = accuracy_score(y_test, knn_pred)
knn_report = classification_report(y_test, knn_pred)
# With 0/1 labels the mean absolute error is the misclassification rate (1 - accuracy).
knn_mae = mean_absolute_error(y_test, knn_pred)

In [295]:
knn_score

0.7778566359119943

In [296]:
print(knn_report)

              precision    recall  f1-score   support

           0       0.84      0.87      0.85      1036
           1       0.59      0.53      0.56       373

    accuracy                           0.78      1409
   macro avg       0.71      0.70      0.70      1409
weighted avg       0.77      0.78      0.77      1409



In [297]:
knn_mae

0.22214336408800567

# Tabulate

In [298]:
from tabulate import tabulate

In [299]:
# Column titles for the printed comparison table.
headers = ['Algorithm', 'Accuracy_score', 'mean absolute error']

# One row per model: display name, test accuracy, test mean absolute error.
result = [
    list(row)
    for row in (
        ('Logistic Regression', lr_score, lr_mae),
        ('Decision Tree', dt_score, dt_mae),
        ('Random Forest', rf_score, rf_mae),
        ('SVM', svc_score, svc_mae),
        ('KNN', knn_score, knn_mae),
    )
]

# Render as a grid-style text table with floats shown to two decimals.
table = tabulate(
    result,
    headers=headers,
    tablefmt='grid',
    floatfmt='.2f',
)

In [300]:
print(table)

+---------------------+------------------+-----------------------+
| Algorithm           |   Accuracy_score |   mean absolute error |
| Logistic Regression |             0.82 |                  0.18 |
+---------------------+------------------+-----------------------+
| Decision Tree       |             0.72 |                  0.28 |
+---------------------+------------------+-----------------------+
| Random Forest       |             0.79 |                  0.21 |
+---------------------+------------------+-----------------------+
| SVM                 |             0.80 |                  0.20 |
+---------------------+------------------+-----------------------+
| KNN                 |             0.78 |                  0.22 |
+---------------------+------------------+-----------------------+


# Conclusion
Among all tested algorithms (Logistic Regression, Decision Tree, Random Forest, SVM, KNN), **Logistic Regression** achieved the highest test accuracy (0.82) and the highest F1-score on the churn class (0.62).  
It is therefore selected as the final model for predictions.


# ---------------------------------------------

# Joblib

In [304]:
from joblib import dump
import os

In [None]:
def JoblibSave(algo):
    """Serialize an object to ``Model/<ClassName>_prediction.joblib``.

    The file name is built from the object's class name. Parsing the printed
    representation instead would depend on how the repr happens to be
    formatted: a repr without "(" or one containing path separators would
    produce an unintended file name.

    Args:
        algo: The object to persist (in this notebook, a fitted estimator).

    Returns:
        The value returned by ``dump`` for the written file.
    """
    # Make sure the target folder exists before writing into it.
    os.makedirs("Model", exist_ok=True)
    algorithm = type(algo).__name__
    return dump(algo, f'Model/{algorithm}_prediction.joblib')

In [306]:
JoblibSave(lr)
JoblibSave(dt)
JoblibSave(rf)
JoblibSave(svc)
JoblibSave(knn)

['Model/KNeighborsClassifier_prediction.joblib']

In [309]:
os.makedirs("predictions", exist_ok=True)
def SaveComparison(results):
    """Write the model comparison table to ``predictions/all_models_comparison.txt``.

    Args:
        results: Table rows passed to ``tabulate`` under the headers
            "Model", "Accuracy" and "MAE".
    """
    # Create the output folder here as well, so the function does not depend on
    # the cell-level makedirs having run in the current working directory.
    os.makedirs("predictions", exist_ok=True)
    table = tabulate(results, headers=["Model", "Accuracy", "MAE"], tablefmt="grid", floatfmt=".2f")
    # An explicit encoding keeps the written file independent of the platform default.
    with open("predictions/all_models_comparison.txt", "w", encoding="utf-8") as f:
        f.write(table)

In [310]:
SaveComparison(result)