In [10]:
from pathlib import Path

import joblib  # used to save the fitted models
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Load the cleaned dataset (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="../dataset/cleanedData.csv")

In [11]:
# Split: "Churn" is the target, every other column is a feature.
y = df["Churn"]
X = df.drop(columns="Churn")

# Hold out 25% of the rows for evaluation. The split is stratified on the
# target and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# Scale the numeric columns. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics leak into training.
num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
scaler = StandardScaler().fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [12]:
# Try several baseline models: logistic regression, random forest, XGBoost.
# Each model takes its own parameters.
models = {
    # max_iter is raised so the solver does not stop on a convergence warning.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # n_estimators = number of trees in the forest.
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # logloss is the standard evaluation metric for classification.
    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # reports it as an unused parameter and emits a warning on every fit.
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42),
}

# Fit every model on the training split and report its test-split metrics.
for name, model in models.items():
    print(f"\n{name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Probability of the positive class (second column), needed for ROC-AUC.
    y_prob = model.predict_proba(X_test)[:, 1]

    print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
    print("ROC-AUC:", round(roc_auc_score(y_test, y_prob), 3))
    print(classification_report(y_test, y_pred))



Logistic Regression
Accuracy: 0.806
ROC-AUC: 0.84
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1291
           1       0.65      0.57      0.61       467

    accuracy                           0.81      1758
   macro avg       0.75      0.73      0.74      1758
weighted avg       0.80      0.81      0.80      1758


Random Forest
Accuracy: 0.789
ROC-AUC: 0.815
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1291
           1       0.63      0.50      0.56       467

    accuracy                           0.79      1758
   macro avg       0.73      0.70      0.71      1758
weighted avg       0.78      0.79      0.78      1758


XGBoost
Accuracy: 0.77
ROC-AUC: 0.812
              precision    recall  f1-score   support

           0       0.83      0.87      0.85      1291
           1       0.58      0.50      0.54       467

    accuracy                           0.77   

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Logistic Regression hasilnya paling bagus, dengan accuracy 80,6% dan ROC-AUC 0,84 (tertinggi di antara ketiga model).

In [13]:
# Make sure the output directory exists so a fresh checkout can run this cell.
Path("../models").mkdir(parents=True, exist_ok=True)

# The models were trained on standardized numeric columns, so the fitted scaler
# is saved next to them; the same transform must be applied to any new data
# before calling predict on a reloaded model.
joblib.dump(scaler, "../models/scaler.pkl")

# Persist each fitted model. The XGBoost dump stays last so the cell's
# displayed output is unchanged.
joblib.dump(models["Logistic Regression"], "../models/logreg.pkl")
joblib.dump(models["Random Forest"], "../models/randomforest.pkl")
joblib.dump(models["XGBoost"], "../models/xgb.pkl")

['../models/xgb.pkl']