# 1. Import Library



In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Mount Google Drive so files under /content/drive can be read (requires Google Colab).
from google.colab import drive
drive.mount('/content/drive')

import warnings
# NOTE(review): this suppresses every warning category for the rest of the session,
# so any convergence or deprecation warnings raised later will not be shown — confirm this is intended.
warnings.filterwarnings('ignore')

Mounted at /content/drive


# 2. Memuat Dataset dari Hasil Clustering

In [2]:
# Read the clustering results CSV from the mounted Drive folder and preview the first five rows.
file_path = '/content/drive/MyDrive/Submission/clustering_results.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,age,gender,income,education,region,loyalty_status,purchase_frequency,purchase_amount,product_category,promotion_usage,satisfaction_score,cluster
0,27,0,40682,2,East,2,2,18249,Books,0,6,1
1,29,0,15317,3,West,0,0,4557,Clothing,1,6,0
2,37,0,38849,2,West,1,0,11822,Clothing,0,6,1
3,30,0,11568,0,South,0,2,4098,Food,0,7,0
4,31,1,46952,1,North,0,1,19685,Clothing,1,5,1


# 3. Data Splitting

In [3]:
# Target: the cluster label. Features: every remaining column except the two
# text-valued columns (region, product_category), which are dropped rather than encoded.
y = df['cluster']
X = df.drop(columns=['region', 'product_category', 'cluster'])

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One shape per line: train features, train target, test features, test target.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(80000, 9)
(80000,)
(20000, 9)
(20000,)


# 4. Membangun Model Klasifikasi

## a. Membangun Model Klasifikasi

In [4]:
# Logistic regression model: fit on the training split, then predict the test split.
# max_iter=1000 gives the solver a larger iteration budget than the library default.
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

In [5]:
# Random forest model: 100 trees with a fixed seed so the fitted forest is reproducible.
# Fit on the training split, then predict the test split.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

## b. Evaluasi Model Klasifikasi

In [6]:
# Print the same test-set metrics for each fitted model. The Random Forest
# heading carries a leading newline so a blank line separates the two reports.
model_predictions = (
    ("Logistic Regression:", y_pred_logreg),
    ("\nRandom Forest:", y_pred_rf),
)
for heading, y_pred in model_predictions:
    print(heading)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    # 'weighted' averages the per-class F1 scores, weighting each class by its support.
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
    print(classification_report(y_test, y_pred))

Logistic Regression:
Accuracy: 0.9963
F1 Score: 0.9963000085475069
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9920
           1       1.00      1.00      1.00     10080

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000


Random Forest:
Accuracy: 0.99915
F1 Score: 0.9991499982467558
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9920
           1       1.00      1.00      1.00     10080

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000



## c. Tuning Model Klasifikasi (Optional)

In [7]:
# Model results are already very high, so no tuning is needed

##  d. Evaluasi Model Klasifikasi setelah Tuning (Optional)

In [8]:
# Model results are already very high, so no tuning is needed

## e. Analisis Hasil Evaluasi Model Klasifikasi



* Kedua model mencapai akurasi yang hampir sempurna (Logistic Regression 99,63% dan Random Forest 99,92%), dengan F1 Score yang juga sangat tinggi.

* F1 Score mendekati 1 artinya model sangat baik dalam:

  * Precision (proporsi prediksi positif yang benar)

  * Recall (proporsi data positif sebenarnya yang berhasil ditemukan)

* Performa hampir identik, meskipun Random Forest sedikit lebih baik secara metrik.

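* Kedua model berhasil mengenali kedua kelas dengan hampir sempurna: precision, recall, dan f1-score tiap kelas bernilai 1.00 setelah dibulatkan ke dua desimal, meskipun akurasinya belum mencapai 100%.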

* Tidak ada tanda-tanda bias terhadap salah satu kelas (misalnya kecenderungan memprediksi kelas mayoritas), karena metrik untuk kelas 0 dan 1 seimbang dan jumlah data uji kedua kelas hampir sama (9.920 dan 10.080).