In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [2]:
df = pd.read_excel('./data/Telco_customer_churn_adapted_v2.xlsx')

In [3]:
df.head()

Unnamed: 0,Customer ID,Tenure Months,Location,Device Class,Games Product,Music Product,Education Product,Call Center,Video Product,Use MyApp,Payment Method,Monthly Purchase (Thou. IDR),Churn Label,Longitude,Latitude,CLTV (Predicted Thou. IDR)
0,0,2,Jakarta,Mid End,Yes,Yes,No,No,No,No,Digital Wallet,70.005,Yes,106.816666,-6.2,4210.7
1,1,2,Jakarta,High End,No,No,No,No,No,No,Pulsa,91.91,Yes,106.816666,-6.2,3511.3
2,2,8,Jakarta,High End,No,No,Yes,No,Yes,Yes,Pulsa,129.545,Yes,106.816666,-6.2,6983.6
3,3,28,Jakarta,High End,No,No,Yes,Yes,Yes,Yes,Pulsa,136.24,Yes,106.816666,-6.2,6503.9
4,4,49,Jakarta,High End,No,Yes,Yes,No,Yes,Yes,Debit,134.81,Yes,106.816666,-6.2,6942.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Customer ID                   7043 non-null   int64  
 1   Tenure Months                 7043 non-null   int64  
 2   Location                      7043 non-null   object 
 3   Device Class                  7043 non-null   object 
 4   Games Product                 7043 non-null   object 
 5   Music Product                 7043 non-null   object 
 6   Education Product             7043 non-null   object 
 7   Call Center                   7043 non-null   object 
 8   Video Product                 7043 non-null   object 
 9   Use MyApp                     7043 non-null   object 
 10  Payment Method                7043 non-null   object 
 11  Monthly Purchase (Thou. IDR)  7043 non-null   float64
 12  Churn Label                   7043 non-null   object 
 13  Lon

In [5]:
df.isna().sum()

Customer ID                     0
Tenure Months                   0
Location                        0
Device Class                    0
Games Product                   0
Music Product                   0
Education Product               0
Call Center                     0
Video Product                   0
Use MyApp                       0
Payment Method                  0
Monthly Purchase (Thou. IDR)    0
Churn Label                     0
Longitude                       0
Latitude                        0
CLTV (Predicted Thou. IDR)      0
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
df.describe()

Unnamed: 0,Customer ID,Tenure Months,Monthly Purchase (Thou. IDR),Longitude,Latitude,CLTV (Predicted Thou. IDR)
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,3521.0,32.371149,84.1902,107.043246,-6.404184,5720.384481
std,2033.283305,24.559481,39.117061,0.358316,0.322898,1537.974298
min,0.0,0.0,23.725,106.816666,-6.914744,2603.9
25%,1760.5,9.0,46.15,106.816666,-6.914744,4509.7
50%,3521.0,29.0,91.455,106.816666,-6.2,5885.1
75%,5281.5,55.0,116.805,107.60981,-6.2,6994.65
max,7042.0,72.0,154.375,107.60981,-6.2,8450.0


In [8]:
products = ['Games Product', 'Music Product', 'Education Product', 'Video Product']

# Counting the number of 'Yes' responses for each product for each customer
df['Product_Count'] = df[products].apply(lambda row: sum(row == 'Yes'), axis=1)
df['Serivce_count'] = df[['Games Product', 'Music Product', 'Education Product', 'Call Center', 'Video Product', 'Use MyApp']].apply(lambda x: sum([1 if i == 'Yes' else 0 for i in x]), axis=1)
df['Company_Service'] = df[['Call Center', 'Use MyApp']].apply(lambda x: sum([1 if i == 'Yes' else 0 for i in x]), axis=1)
df['Is_Long_Tenure'] = df['Tenure Months'].apply(lambda x: 1 if x > 6 else 0)

def categorize_tenure(tenure):
    if tenure <= 6:
        return 0
    elif 7 <= tenure <= 24:
        return 1
    else:
        return 2

df['Tenure_Category'] = df['Tenure Months'].apply(categorize_tenure)

df['CLTV_to_MonthlyPurchase_Ratio'] = df['CLTV (Predicted Thou. IDR)'] / df['Monthly Purchase (Thou. IDR)']

# 4. Tenure to CLTV Ratio
df['Tenure to CLTV Ratio'] = df['CLTV (Predicted Thou. IDR)'] / df['Tenure Months']
df['Tenure to Purchase Ratio'] = df['Monthly Purchase (Thou. IDR)'] / df['Tenure Months']
df['MonthlyPurchase_CLTV'] = df['Monthly Purchase (Thou. IDR)'] * df['CLTV (Predicted Thou. IDR)']

average_purchase = df['Monthly Purchase (Thou. IDR)'].mean()
df['Is_High_Purchase_Customer'] = df['Monthly Purchase (Thou. IDR)'].apply(lambda x: 1 if x > average_purchase else 0)

average_cltv = df['CLTV (Predicted Thou. IDR)'].mean()
df['Is_High_Value_Customer'] = df['CLTV (Predicted Thou. IDR)'].apply(lambda x: 1 if x > average_cltv else 0)


In [9]:
# 4. Tenure to CLTV Ratio
df['Tenure to CLTV Ratio'] = df['Tenure to CLTV Ratio'].replace(np.inf, 0)
df['Tenure to Purchase Ratio'] = df['Tenure to CLTV Ratio'].replace(np.inf, 0)

In [10]:
df["Games Product"] = df["Games Product"].replace('No internet service','No')
df["Music Product"] = df["Music Product"].replace('No internet service','No')
df["Education Product"] = df["Education Product"].replace('No internet service','No')
df["Use MyApp"] = df["Use MyApp"].replace('No internet service','No')
df["Games Product"] = df["Games Product"].replace('No internet service','No')
df["Video Product"] = df["Video Product"].replace('No internet service','No')

In [11]:
# one-hot encoding (categorical variables with more than two levels)
one_hot_encoding_columns = ['Games Product', 'Device Class', 'Music Product', 'Payment Method', 'Education Product', 
                            'Use MyApp', 'Video Product',  'Call Center', 'Longitude', 'Latitude', 'Location']

# encode categorical variables with more than two levels using one-hot encoding
df = pd.get_dummies(df, columns = one_hot_encoding_columns)

In [12]:
df = df.drop('Customer ID', axis=1)

In [13]:
df['Churn Label'] = df['Churn Label'].replace({'No': 0, 'Yes': 1})

In [14]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler

# Membuat instance dari StandardScaler
scaler = MinMaxScaler()

# Menggunakan fit_transform untuk kolom Monthly Purchase dan CLTV
df[['Monthly Purchase (Thou. IDR)', 'CLTV (Predicted Thou. IDR)', 'Tenure to CLTV Ratio', 'Tenure to Purchase Ratio', 'MonthlyPurchase_CLTV', 'CLTV_to_MonthlyPurchase_Ratio']] = scaler.fit_transform(df[['Monthly Purchase (Thou. IDR)', 'CLTV (Predicted Thou. IDR)', 'Tenure to CLTV Ratio', 'Tenure to Purchase Ratio', 'MonthlyPurchase_CLTV', 'CLTV_to_MonthlyPurchase_Ratio']])

In [15]:
X = df.drop(columns='Churn Label')
y = df['Churn Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from imblearn.over_sampling import SMOTE

# 1. SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [16]:

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

# Logistic Regression
lr = LogisticRegression(
    max_iter=1000,
    penalty='l2',  # 'l1' for Lasso, 'l2' for Ridge, 'elasticnet' for both
    C=1.0,  # Inverse of regularization strength
    solver='liblinear'  # 'newton-cg', 'lbfgs', 'sag', 'saga' are other solvers
)

# Decision Tree
dt = DecisionTreeClassifier(
    max_depth=None,  # Maximum depth of the tree
    min_samples_split=2,  # Minimum number of samples required to split an internal node
    min_samples_leaf=1  # Minimum number of samples required to be at a leaf node
)

# Random Forest
rf = RandomForestClassifier(
    n_estimators=100,  # Number of trees in the forest
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True  # Whether bootstrap samples are used when building trees
)

# XGBoost
xgb = XGBClassifier(
    objective='binary:logistic', 
    eval_metric='auc',
    use_label_encoder=False,
    n_estimators=1000, 
    learning_rate=0.01,
    max_depth=6,
    min_child_weight=1,  # Minimum sum of instance weight (hessian) needed in a child
    gamma=0,  # Minimum loss reduction required to make a further partition
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=1  # Control the balance of positive and negative weights
)


# LightGBM
lgbm = LGBMClassifier(
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=-1,  # Maximum tree depth, -1 means no limit
    num_leaves=31  # Maximum tree leaves for base learners
)

# CatBoost
catboost = CatBoostClassifier(verbose=0)  # verbose=0 untuk menghindari output banyak saat training

# k-NN
knn = KNeighborsClassifier(
    n_neighbors=5,  # Number of neighbors to use
    weights='uniform',  # 'distance' weights points by the inverse of their distance
    algorithm='auto'  # Algorithm used to compute the nearest neighbors ('ball_tree', 'kd_tree', 'brute')
)

# Support Vector Machine
svm = SVC(
    probability=True,
    kernel='rbf',  # Specifies the kernel type ('linear', 'poly', 'rbf', 'sigmoid', 'precomputed')
    C=1.0,  # Regularization parameter
    gamma='scale'  # Kernel coefficient
)

# Naive Bayes
nb = GaussianNB()

# Gradient Boosting Machine
gbm = GradientBoostingClassifier()

# AdaBoost
adaboost = AdaBoostClassifier(
    n_estimators=50,  # Maximum number of estimators
    learning_rate=1.0  # Weight applied to each classifier
)

from sklearn.ensemble import VotingClassifier

# Membuat ensemble model dengan hard voting
ensemble_model_hard = VotingClassifier(estimators=[
    ('catboost', catboost),
    ('gbm', gbm),
    ('lgbm', lgbm),
    ('xgb', xgb)
], voting='hard')

# Membuat ensemble model dengan soft voting
ensemble_model_soft = VotingClassifier(estimators=[
    ('catboost', catboost),
    ('gbm', gbm),
    ('lgbm', lgbm),
    ('xgb', xgb)
], voting='soft')


# Latih semua model dengan data training
models = {
    'Logistic Regression': lr,
    'Decision Tree': dt,
    'Random Forest': rf,
    'XGBoost': xgb,
    'LightGBM': lgbm,
    'CatBoost': catboost,
    'Gradient Boosting Machine': gbm,
    'AdaBoost': adaboost,
    'k-NN': knn,
    'Support Vector Machine': svm,
    'Naive Bayes': nb,
    'ensemble_model_hard' : ensemble_model_hard,
    'ensemble_model_soft': ensemble_model_soft,
}




In [17]:
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

def train_and_evaluate(models, X, y, X_test, y_test):
    # Membuat list kosong untuk menyimpan hasil
    results = []

    for name, model in models.items():
        model.fit(X, y)
        y_pred = model.predict(X_test)
        
        # Mengambil metrik yang diperlukan
        acc = round(accuracy_score(y_test, y_pred), 4)
        auc = round(roc_auc_score(y_test, y_pred), 4)
        
        # Menggunakan classification report untuk mendapatkan precision, recall, dan F1-score
        report = classification_report(y_test, y_pred, output_dict=True)
        macro_precision = round(report['macro avg']['precision'], 4)
        macro_recall = round(report['macro avg']['recall'], 4)
        macro_f1 = round(report['macro avg']['f1-score'], 4)
        
        # Menyimpan hasil dalam tuple dan menambahkannya ke list
        results.append((name, acc, auc, macro_precision, macro_recall, macro_f1))

    return results

# Menggunakan fungsi untuk data normal dan SMOTE
results_smote = train_and_evaluate(models, X_smote, y_smote, X_test, y_test)

# Membuat DataFrame dari list hasil
df_results = pd.DataFrame(results_smote, columns=['Model', 'Accuracy', 'AUC-ROC', 'Macro Precision', 'Macro Recall', 'Macro F1-score'])




In [18]:
df_results.head(13)

Unnamed: 0,Model,Accuracy,AUC-ROC,Macro Precision,Macro Recall,Macro F1-score
0,Logistic Regression,0.7864,0.7422,0.7376,0.7422,0.7398
1,Decision Tree,0.741,0.6969,0.6862,0.6969,0.6907
2,Random Forest,0.7587,0.707,0.704,0.707,0.7055
3,XGBoost,0.7921,0.7477,0.7444,0.7477,0.746
4,LightGBM,0.7942,0.7477,0.7469,0.7477,0.7473
5,CatBoost,0.7878,0.7356,0.7389,0.7356,0.7372
6,Gradient Boosting Machine,0.7864,0.7558,0.7395,0.7558,0.7463
7,AdaBoost,0.7764,0.7511,0.7299,0.7511,0.7379
8,k-NN,0.7033,0.6903,0.663,0.6903,0.6674
9,Support Vector Machine,0.7331,0.7405,0.7017,0.7405,0.7062


In [19]:
# df_results.to_csv('hasil_churn_prediction.csv', index = False, sep=';')