# **1. Import Library**

Pada tahap ini, Anda perlu mengimpor beberapa pustaka (library) Python yang dibutuhkan untuk analisis data dan pembangunan model machine learning.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

# **2. Memuat Dataset dari Hasil Clustering**

Memuat dataset hasil clustering dari file CSV ke dalam variabel DataFrame.

In [28]:
data = pd.read_csv('Dataset_clustering.csv', sep=',')

print(data.head())

     ID  Year_Birth   Education Marital_Status     Income  Kidhome  Teenhome  \
0  5524        1957  Graduation         Single  23.979483        0         0   
1  2174        1954  Graduation         Single  23.703774        1         1   
2  4141        1965  Graduation       Together  24.232975        0         0   
3  6182        1984  Graduation       Together  23.030763        1         0   
4  5324        1981         PhD        Married  23.982721        1         0   

  Dt_Customer      Recency  MntWines  ...  Age     Tenure  TotalChildren  \
0  2012-09-04  1728.556636       635  ...   68  11.814468       0.950446   
1  2014-03-08  1149.436891        11  ...   71  10.123465       2.453716   
2  2013-08-21   801.965044       426  ...   60  11.092727       0.950446   
3  2014-02-10   801.965044        11  ...   41  10.320622       1.702081   
4  2014-01-19  2770.972177       173  ...   44  10.460688       1.702081   

   TotalSpent  TotalPurchases  Education_Encoded  Marital_Stat

# **3. Data Splitting**

Tahap Data Splitting bertujuan untuk memisahkan dataset menjadi dua bagian: data latih (training set) dan data uji (test set).

In [None]:
X = data.drop('Cluster', axis=1)
y = data['Cluster']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set: {X_train.shape}, Testing set: {X_test.shape}")

Training set: (1792, 38), Testing set: (448, 38)


# **4. Membangun Model Klasifikasi**


## **a. Membangun Model Klasifikasi**

In [None]:
X_train_processed = X_train.drop(['Education', 'Marital_Status', 'Dt_Customer'], axis=1)
X_test_processed = X_test.drop(['Education', 'Marital_Status', 'Dt_Customer'], axis=1)

X_train_processed = X_train_processed.fillna(X_train_processed.mean())
X_test_processed = X_test_processed.fillna(X_test_processed.mean())

rf_model = RandomForestClassifier(random_state=42)

rf_model.fit(X_train_processed, y_train)

y_train_pred_rf = rf_model.predict(X_train_processed)
y_test_pred_rf = rf_model.predict(X_test_processed)

In [None]:
lr_model = LogisticRegression(max_iter=1000, random_state=42)

lr_model.fit(X_train_processed, y_train)

y_train_pred_lr = lr_model.predict(X_train_processed)
y_test_pred_lr = lr_model.predict(X_test_processed)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Tulis narasi atau penjelasan algoritma yang Anda gunakan.

## **c. Evaluasi Model Klasifikasi**

In [32]:
print("Random Forest Performance:")
print(f"Training Accuracy: {accuracy_score(y_train, y_train_pred_rf):.2f}")
print(f"Training F1-Score: {f1_score(y_train, y_train_pred_rf, average='weighted'):.2f}")
print(f"Testing Accuracy: {accuracy_score(y_test, y_test_pred_rf):.2f}")
print(f"Testing F1-Score: {f1_score(y_test, y_test_pred_rf, average='weighted'):.2f}")
print("\nClassification Report (Testing):")
print(classification_report(y_test, y_test_pred_rf))

Random Forest Performance:
Training Accuracy: 1.00
Training F1-Score: 1.00
Testing Accuracy: 0.97
Testing F1-Score: 0.97

Classification Report (Testing):
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       214
           1       0.97      0.97      0.97       234

    accuracy                           0.97       448
   macro avg       0.97      0.97      0.97       448
weighted avg       0.97      0.97      0.97       448



In [33]:
# Evaluasi
print("Logistic Regression Performance:")
print(f"Training Accuracy: {accuracy_score(y_train, y_train_pred_lr):.2f}")
print(f"Training F1-Score: {f1_score(y_train, y_train_pred_lr, average='weighted'):.2f}")
print(f"Testing Accuracy: {accuracy_score(y_test, y_test_pred_lr):.2f}")
print(f"Testing F1-Score: {f1_score(y_test, y_test_pred_lr, average='weighted'):.2f}")
print("\nClassification Report (Testing):")
print(classification_report(y_test, y_test_pred_lr))

Logistic Regression Performance:
Training Accuracy: 0.98
Training F1-Score: 0.98
Testing Accuracy: 0.97
Testing F1-Score: 0.97

Classification Report (Testing):
              precision    recall  f1-score   support

           0       0.97      0.96      0.96       214
           1       0.97      0.97      0.97       234

    accuracy                           0.97       448
   macro avg       0.97      0.97      0.97       448
weighted avg       0.97      0.97      0.97       448



Tulis hasil evaluasi algoritma yang digunakan, jika Anda menggunakan 2 algoritma, maka bandingkan hasilnya.

## **b. Tuning Model Klasifikasi (Optional)**

Gunakan GridSearchCV, RandomizedSearchCV, atau metode lainnya untuk mencari kombinasi hyperparameter terbaik

In [35]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_processed, y_train)

best_rf_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}


## **d. Evaluasi Model Klasifikasi setelah Tuning (Optional)**

In [37]:
X_test_processed = X_test.drop(['Education', 'Marital_Status', 'Dt_Customer'], axis=1)

y_test_pred_best_rf = best_rf_model.predict(X_test_processed)
print(f"Testing Accuracy (Tuned): {accuracy_score(y_test, y_test_pred_best_rf):.2f}")
print(f"Testing F1-Score (Tuned): {f1_score(y_test, y_test_pred_best_rf, average='weighted'):.2f}")

Testing Accuracy (Tuned): 0.96
Testing F1-Score (Tuned): 0.96


## **e. Analisis Hasil Evaluasi Model Klasifikasi**

In [38]:
results = {
    'Model': ['Random Forest', 'Logistic Regression'],
    'Training Accuracy': [accuracy_score(y_train, y_train_pred_rf), accuracy_score(y_train, y_train_pred_lr)],
    'Testing Accuracy': [accuracy_score(y_test, y_test_pred_rf), accuracy_score(y_test, y_test_pred_lr)],
    'Training F1-Score': [f1_score(y_train, y_train_pred_rf, average='weighted'), f1_score(y_train, y_train_pred_lr, average='weighted')],
    'Testing F1-Score': [f1_score(y_test, y_test_pred_rf, average='weighted'), f1_score(y_test, y_test_pred_lr, average='weighted')]
}

results_df = pd.DataFrame(results)
print(results_df)

                 Model  Training Accuracy  Testing Accuracy  \
0        Random Forest           1.000000          0.968750   
1  Logistic Regression           0.977679          0.966518   

   Training F1-Score  Testing F1-Score  
0           1.000000          0.968756  
1           0.977676          0.966514  
