In [1]:
import pandas as pd

pd.set_option('display.max_columns', None)

In [2]:
from preprocessing import *

In [3]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [4]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df.loc[df["tenure"] == 0, "TotalCharges"] = 0

In [5]:
df['HasInternet'] = df['OnlineBackup'].isin(['No internet service'])
df['automatic_pay'] = df['PaymentMethod'].isin(['Bank transfer (automatic)','Credit card (automatic)'])

In [6]:
df = mapping(df, YES_NO, YES_NO_MAPPING)

In [7]:
df['gender'] = df['gender'].map({'Male' : 1, 'Female' : 0})
df['MultipleLines'] = df['MultipleLines'].map({'No phone service' : 0, 'No' : 1, 'Yes' : 2})
df['Contract'] = df['Contract'].map({'Month-to-month' : 0, 'One year' : 1, 'Two year' : 2})

In [8]:
df = mapping(df, INTERNET_VARS, INTERNET_VARS_MAPPING)

In [9]:
df = dummies(df, DUMMIES_VARS, TO_DROP)

In [10]:
df = mapping(df, TRUE_FALSE, TRUE_FALSE_MAPPING)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customerID                 7043 non-null   object 
 1   gender                     7043 non-null   int64  
 2   SeniorCitizen              7043 non-null   int64  
 3   Partner                    7043 non-null   int64  
 4   Dependents                 7043 non-null   int64  
 5   tenure                     7043 non-null   int64  
 6   PhoneService               7043 non-null   int64  
 7   MultipleLines              7043 non-null   int64  
 8   OnlineSecurity             7043 non-null   int64  
 9   OnlineBackup               7043 non-null   int64  
 10  DeviceProtection           7043 non-null   int64  
 11  TechSupport                7043 non-null   int64  
 12  StreamingTV                7043 non-null   int64  
 13  StreamingMovies            7043 non-null   int64

In [12]:
df.drop('customerID', axis=1, inplace=True)

In [13]:
from sklearn.model_selection import train_test_split

X = df.drop('Churn',axis=1)
Y = df['Churn']

In [14]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score

model = LogisticRegression(max_iter=3500)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
y_pred_proba = model.predict_proba(x_test)[:,1]

print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

0.8076650106458482
              precision    recall  f1-score   support

           0       0.85      0.90      0.87      1035
           1       0.66      0.56      0.61       374

    accuracy                           0.81      1409
   macro avg       0.76      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409

[[928 107]
 [164 210]]
AUC: 0.8425766617582474


In [31]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
y_pred_proba = model.predict_proba(x_test)[:,1]

print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

0.7892122072391767
              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1035
           1       0.64      0.48      0.55       374

    accuracy                           0.79      1409
   macro avg       0.73      0.69      0.71      1409
weighted avg       0.78      0.79      0.78      1409

[[931 104]
 [193 181]]
AUC: 0.8242682580278489


In [32]:
from xgboost import XGBClassifier

imbalance_ratio = (len(y_train) - sum(y_train)) / sum(y_train)

model = XGBClassifier(
    eval_metric="logloss",   # add this instead
    random_state=42)

model.fit(x_train, y_train)

y_pred = model.predict(x_test)
y_pred_proba = model.predict_proba(x_test)[:,1]

print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

0.772888573456352
              precision    recall  f1-score   support

           0       0.83      0.86      0.85      1035
           1       0.58      0.52      0.55       374

    accuracy                           0.77      1409
   macro avg       0.71      0.69      0.70      1409
weighted avg       0.77      0.77      0.77      1409

[[894 141]
 [179 195]]
AUC: 0.8202976052080913


In [34]:
from sklearn.svm import LinearSVC

model = LinearSVC(max_iter=5000, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.7984386089425124
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.65      0.52      0.58       374

    accuracy                           0.80      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409

[[930 105]
 [179 195]]


In [35]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=300,
                                   learning_rate=0.5,
                                   max_depth=5,
                                   random_state=42)

model.fit(x_train, y_train)
y_pred = model.predict(x_test)
y_pred_proba = model.predict_proba(x_test)[:,1]

print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

0.7714691270404542
              precision    recall  f1-score   support

           0       0.83      0.86      0.85      1035
           1       0.58      0.52      0.55       374

    accuracy                           0.77      1409
   macro avg       0.70      0.69      0.70      1409
weighted avg       0.76      0.77      0.77      1409

[[892 143]
 [179 195]]
AUC: 0.797432121728797


In [36]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Simple feedforward NN
model = MLPClassifier(
    hidden_layer_sizes=(32, 64, 128, 64, 32, 16),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42
)

model.fit(x_train, y_train)
y_pred = model.predict(x_test)
y_pred_proba = model.predict_proba(x_test)[:,1]

print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

0.794180269694819
              precision    recall  f1-score   support

           0       0.82      0.92      0.87      1035
           1       0.67      0.45      0.54       374

    accuracy                           0.79      1409
   macro avg       0.74      0.68      0.70      1409
weighted avg       0.78      0.79      0.78      1409

[[951  84]
 [206 168]]
AUC: 0.8343796016430288


In [40]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=5)

model.fit(x_train, y_train)
y_pred = model.predict(x_test)
y_pred_proba = model.predict_proba(x_test)[:,1]

print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

0.7629524485450674
              precision    recall  f1-score   support

           0       0.81      0.88      0.85      1035
           1       0.57      0.43      0.49       374

    accuracy                           0.76      1409
   macro avg       0.69      0.66      0.67      1409
weighted avg       0.75      0.76      0.75      1409

[[914 121]
 [213 161]]
AUC: 0.7537123149655119


In [None]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(random_state=42)

model.fit(x_train, y_train)
y_pred = model.predict(x_test)
y_proba = model.predict_proba(x_test)[:,1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_proba))

Accuracy: 0.7374024130589071
              precision    recall  f1-score   support

           0       0.82      0.82      0.82      1035
           1       0.51      0.51      0.51       374

    accuracy                           0.74      1409
   macro avg       0.66      0.67      0.67      1409
weighted avg       0.74      0.74      0.74      1409

[[847 188]
 [182 192]]
AUC: 0.6656281484925987


In [43]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=200, random_state=42)

model.fit(x_train, y_train)
y_pred = model.predict(x_test)
y_proba = model.predict_proba(x_test)[:,1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_proba))

Accuracy: 0.7849538679914834
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1035
           1       0.62      0.49      0.55       374

    accuracy                           0.78      1409
   macro avg       0.72      0.69      0.70      1409
weighted avg       0.77      0.78      0.78      1409

[[921 114]
 [189 185]]
AUC: 0.8237515823193572


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

param_grid = [
    {
        "penalty": ["l1"],
        "solver": ["liblinear", "saga"],
        "C": [0.01, 0.1, 1, 10, 100]
    },
    {
        "penalty": ["l2"],
        "solver": ["liblinear", "saga"],
        "C": [0.01, 0.1, 1, 10, 100]
    },
    {
        "penalty": ["elasticnet"],
        "solver": ["saga"],
        "C": [0.01, 0.1, 1, 10, 100],
        "l1_ratio": [0, 0.5, 1]
    },
    {
        "penalty": [None],
        "solver": ["saga"],
        "C": [1]
    }
]

grid = GridSearchCV(
    LogisticRegression(max_iter=5000, class_weight="balanced"),
    param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1
)

grid.fit(x_train, y_train)

print("Best params:", grid.best_params_)
print("Best AUC:", grid.best_score_)

Best params: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Best AUC: 0.8458461653337798
