In [26]:
# UTS Praktikum Pembelajaran Mesin - Kaggle Bot Account Detection

# Import library yang dibutuhkan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
import pickle
import warnings
warnings.filterwarnings('ignore')

In [27]:
# 1. MEMBACA DAN EKSPLORASI DATA
print("LANGKAH 1: MEMBACA DAN EKSPLORASI DATA")
print("-" * 50)

# Membaca dataset
dataset_path = "../dataset/kaggle_bot_accounts.csv"
data = pd.read_csv(dataset_path)

# Melihat struktur data
print("Informasi Dasar Dataset:")
print(f"Jumlah Baris: {data.shape[0]}")
print(f"Jumlah Kolom: {data.shape[1]}")
print("\nTipe Data dan Missing Values:")
print(data.info())

# Melihat sampel data
print("\nSampel Data (5 baris pertama):")
print(data.head())

LANGKAH 1: MEMBACA DAN EKSPLORASI DATA
--------------------------------------------------
Informasi Dasar Dataset:
Jumlah Baris: 1321188
Jumlah Kolom: 17

Tipe Data dan Missing Values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1321188 entries, 0 to 1321187
Data columns (total 17 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1321188 non-null  int64  
 1   NAME                   1243024 non-null  object 
 2   GENDER                 1243309 non-null  object 
 3   EMAIL_ID               1243374 non-null  object 
 4   IS_GLOGIN              1243272 non-null  object 
 5   FOLLOWER_COUNT         1243476 non-null  float64
 6   FOLLOWING_COUNT        1242743 non-null  float64
 7   DATASET_COUNT          1242621 non-null  float64
 8   CODE_COUNT             1243262 non-null  float64
 9   DISCUSSION_COUNT       1243466 non-null  float64
 10  AVG_NB_READ_TIME_MIN   1242872 non-null  float64


In [28]:
# 2. EXPLORATORY DATA ANALYSIS (EDA)
print("\nLANGKAH 2: EXPLORATORY DATA ANALYSIS (EDA)")
print("-" * 50)

# Menghitung jumlah missing value setiap kolom
missing_values = data.isnull().sum()
print("Jumlah Missing Values per Kolom:")
print(missing_values[missing_values > 0])

# Memeriksa duplikat
duplicates = data.duplicated().sum()
print(f"\nJumlah Duplikat dalam Dataset: {duplicates}")

# Statistik deskriptif untuk kolom numerik
print("\nStatistik Deskriptif untuk Fitur Numerik:")
print(data.describe())

# Menghitung distribusi kelas target (ISBOT)
print("\nDistribusi Target (ISBOT):")
print(data['ISBOT'].value_counts(dropna=False))
print(f"Persentase Missing Values Target: {data['ISBOT'].isnull().mean() * 100:.2f}%")


LANGKAH 2: EXPLORATORY DATA ANALYSIS (EDA)
--------------------------------------------------
Jumlah Missing Values per Kolom:
NAME                     78164
GENDER                   77879
EMAIL_ID                 77814
IS_GLOGIN                77916
FOLLOWER_COUNT           77712
FOLLOWING_COUNT          78445
DATASET_COUNT            78567
CODE_COUNT               77926
DISCUSSION_COUNT         77722
AVG_NB_READ_TIME_MIN     78316
REGISTRATION_IPV4        78329
REGISTRATION_LOCATION    78290
TOTAL_VOTES_GAVE_NB      77705
TOTAL_VOTES_GAVE_DS      77934
TOTAL_VOTES_GAVE_DC      78030
ISBOT                    78500
dtype: int64

Jumlah Duplikat dalam Dataset: 0

Statistik Deskriptif untuk Fitur Numerik:
         Unnamed: 0  FOLLOWER_COUNT  FOLLOWING_COUNT  DATASET_COUNT  \
count  1.321188e+06    1.243476e+06     1.242743e+06   1.242621e+06   
mean   6.605935e+05    2.698273e+01     4.505091e+01   2.562564e+00   
std    3.813943e+05    2.300504e+01     3.947716e+01   2.499882e+00   
mi

In [29]:
# # 3. DATA PREPROCESSING
# print("\nLANGKAH 3: DATA PREPROCESSING")
# print("-" * 50)

# # 3.1 Menangani missing values
# print("3.1 Menangani Missing Values")

# # Menghapus duplikat jika ada
# data = data.drop_duplicates().reset_index(drop=True)
# print(f"Jumlah baris setelah menghapus duplikat: {data.shape[0]}")

# # Menangani missing values pada kolom target (ISBOT)
# # Menghapus baris yang tidak memiliki label target
# if data['ISBOT'].isnull().sum() > 0:
#     data_labeled = data.dropna(subset=['ISBOT']).reset_index(drop=True)
#     print(f"Jumlah baris setelah menghapus baris tanpa label: {data_labeled.shape[0]}")
# else:
#     data_labeled = data.copy()

# # Fitur dan target
# X = data_labeled.drop('ISBOT', axis=1)
# y = data_labeled['ISBOT'].astype(int)  # Mengkonversi ke tipe int (0 untuk False, 1 untuk True)

# # Pembagian fitur kategorikal dan numerik
# categorical_cols = ['NAME', 'GENDER', 'EMAIL_ID', 'IS_GLOGIN', 'REGISTRATION_LOCATION']
# numerical_cols = ['FOLLOWER_COUNT', 'FOLLOWING_COUNT', 'DATASET_COUNT', 'CODE_COUNT', 
#                  'DISCUSSION_COUNT', 'AVG_NB_READ_TIME_MIN', 'TOTAL_VOTES_GAVE_NB', 
#                  'TOTAL_VOTES_GAVE_DS', 'TOTAL_VOTES_GAVE_DC']
# binary_cols = ['IS_GLOGIN']  # Kolom boolean

# # Imputasi nilai yang hilang
# print("\n3.2 Imputasi Missing Values")

# # Untuk fitur kategorikal, imputasi dengan mode (nilai yang paling sering muncul)
# # Untuk fitur numerik, imputasi dengan median
# print(f"Fitur kategorikal: {categorical_cols}")
# print(f"Fitur numerik: {numerical_cols}")

# # Memeriksa korelasi antar fitur numerik
# print("\n3.3 Analisis Korelasi Antar Fitur Numerik")
# numerical_data = X[numerical_cols].copy()

# # Imputasi missing values untuk perhitungan korelasi
# imputer = SimpleImputer(strategy='median')
# numerical_data_imputed = pd.DataFrame(
#     imputer.fit_transform(numerical_data),
#     columns=numerical_data.columns
# )

# # Hitung korelasi
# correlation_matrix = numerical_data_imputed.corr()

# # Plot heatmap korelasi
# plt.figure(figsize=(12, 10))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
# plt.title('Matriks Korelasi Fitur Numerik')
# plt.tight_layout()
# plt.savefig('../dataset/correlation_heatmap.png')  # Simpan gambar korelasi
# plt.close()

# print("Heatmap korelasi telah disimpan sebagai 'correlation_heatmap.png'")

# # 3.4 Pemrosesan Data dan Encoding
# print("\n3.4 Pemrosesan Data dan Feature Engineering")

# # Preprocessing untuk fitur kategorikal dan numerik
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])

# numerical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())
# ])

# # Gabungkan preprocessing untuk semua fitur
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_cols),
#         ('cat', categorical_transformer, [col for col in categorical_cols if col not in binary_cols])
#     ],
#     remainder='drop'  # Kolom lain diabaikan
# )


In [30]:
# 3. DATA PREPROCESSING
print("\nLANGKAH 3: DATA PREPROCESSING")
print("-" * 50)

# 3.1 Menangani missing values
print("3.1 Menangani Missing Values")

# Menghapus duplikat jika ada
data = data.drop_duplicates().reset_index(drop=True)
print(f"Jumlah baris setelah menghapus duplikat: {data.shape[0]}")

# Menangani missing values pada kolom target (ISBOT)
# Menghapus baris yang tidak memiliki label target
if data['ISBOT'].isnull().sum() > 0:
    data_labeled = data.dropna(subset=['ISBOT']).reset_index(drop=True)
    print(f"Jumlah baris setelah menghapus baris tanpa label: {data_labeled.shape[0]}")
else:
    data_labeled = data.copy()

# Fitur dan target
X = data_labeled.drop('ISBOT', axis=1)
y = data_labeled['ISBOT'].astype(int)  # Mengkonversi ke tipe int (0 untuk False, 1 untuk True)

# 3.2 Modifikasi fitur kategorikal untuk mengurangi kardinalitas
print("\n3.2 Modifikasi Fitur untuk Mengurangi Kardinalitas")

# Modifikasi fitur kategorikal - gunakan hanya yang kardinalitas rendah
categorical_cols = ['GENDER', 'IS_GLOGIN']  # Hapus NAME, EMAIL_ID, dan REGISTRATION_LOCATION
numerical_cols = ['FOLLOWER_COUNT', 'FOLLOWING_COUNT', 'DATASET_COUNT', 'CODE_COUNT', 
                 'DISCUSSION_COUNT', 'AVG_NB_READ_TIME_MIN', 'TOTAL_VOTES_GAVE_NB', 
                 'TOTAL_VOTES_GAVE_DS', 'TOTAL_VOTES_GAVE_DC']
binary_cols = ['IS_GLOGIN']  # Kolom boolean

# Tambahkan feature engineering sederhana untuk kolom lain
# Extract domain dari email (jika perlu dan jika kolom ada)
if 'EMAIL_ID' in X.columns and X['EMAIL_ID'].isnull().sum() < len(X['EMAIL_ID']):
    X['EMAIL_DOMAIN'] = X['EMAIL_ID'].fillna('').apply(lambda x: x.split('@')[1] if '@' in x else '')
    categorical_cols.append('EMAIL_DOMAIN')

# Tambahkan feature engineering untuk NAME (panjang nama)
if 'NAME' in X.columns:
    X['NAME_LENGTH'] = X['NAME'].fillna('').apply(len)
    numerical_cols.append('NAME_LENGTH')

# Imputasi nilai yang hilang
print("\n3.3 Imputasi Missing Values")
print(f"Fitur kategorikal: {categorical_cols}")
print(f"Fitur numerik: {numerical_cols}")

# Memeriksa korelasi antar fitur numerik
print("\n3.4 Analisis Korelasi Antar Fitur Numerik")
numerical_data = X[numerical_cols].copy()

# Imputasi missing values untuk perhitungan korelasi
imputer = SimpleImputer(strategy='median')
numerical_data_imputed = pd.DataFrame(
    imputer.fit_transform(numerical_data),
    columns=numerical_data.columns
)

# Hitung korelasi
correlation_matrix = numerical_data_imputed.corr()

# Plot heatmap korelasi
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Matriks Korelasi Fitur Numerik')
plt.tight_layout()
plt.savefig('../dataset/correlation_heatmap.png')  # Simpan gambar korelasi
plt.close()

print("Heatmap korelasi telah disimpan sebagai 'correlation_heatmap.png'")

# 3.5 Pemrosesan Data dan Feature Engineering
print("\n3.5 Pemrosesan Data dan Feature Engineering")

# Preprocessing untuk fitur kategorikal dan numerik
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Gabungkan preprocessing untuk semua fitur
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, [col for col in categorical_cols if col not in binary_cols])
    ],
    remainder='drop'  # Kolom lain diabaikan
)


LANGKAH 3: DATA PREPROCESSING
--------------------------------------------------
3.1 Menangani Missing Values
Jumlah baris setelah menghapus duplikat: 1321188
Jumlah baris setelah menghapus baris tanpa label: 1242688

3.2 Modifikasi Fitur untuk Mengurangi Kardinalitas

3.3 Imputasi Missing Values
Fitur kategorikal: ['GENDER', 'IS_GLOGIN', 'EMAIL_DOMAIN']
Fitur numerik: ['FOLLOWER_COUNT', 'FOLLOWING_COUNT', 'DATASET_COUNT', 'CODE_COUNT', 'DISCUSSION_COUNT', 'AVG_NB_READ_TIME_MIN', 'TOTAL_VOTES_GAVE_NB', 'TOTAL_VOTES_GAVE_DS', 'TOTAL_VOTES_GAVE_DC', 'NAME_LENGTH']

3.4 Analisis Korelasi Antar Fitur Numerik
Heatmap korelasi telah disimpan sebagai 'correlation_heatmap.png'

3.5 Pemrosesan Data dan Feature Engineering


In [31]:
# 4. PEMISAHAN DATA TRAINING DAN TESTING
print("\nLANGKAH 4: PEMISAHAN DATA TRAINING DAN TESTING")
print("-" * 50)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Ukuran data training: {X_train.shape[0]} sampel")
print(f"Ukuran data testing: {X_test.shape[0]} sampel")

# Cek distribusi kelas di data training
print("\nDistribusi kelas pada data training:")
print(pd.Series(y_train).value_counts(normalize=True).apply(lambda x: f"{x:.2%}"))


LANGKAH 4: PEMISAHAN DATA TRAINING DAN TESTING
--------------------------------------------------
Ukuran data training: 994150 sampel
Ukuran data testing: 248538 sampel

Distribusi kelas pada data training:
ISBOT
0    73.21%
1    26.79%
Name: proportion, dtype: object


In [None]:
# # 5. MENGATASI IMBALANCED DATASET
# print("\nLANGKAH 5: MENGATASI IMBALANCED DATASET")
# print("-" * 50)

# # Periksa keseimbangan kelas
# class_counts = pd.Series(y_train).value_counts()
# print("Jumlah sampel per kelas sebelum SMOTE:")
# print(class_counts)

# # Terapkan SMOTE untuk mengatasi ketidakseimbangan
# X_train_preprocessed = preprocessor.fit_transform(X_train)
# smote = SMOTE(random_state=42)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train)

# print(f"\nJumlah sampel sebelum SMOTE: {len(y_train)}")
# print(f"Jumlah sampel setelah SMOTE: {len(y_train_resampled)}")
# print("Distribusi kelas setelah SMOTE:")
# print(pd.Series(y_train_resampled).value_counts())

In [32]:
# 5. MENGATASI IMBALANCED DATASET
print("\nLANGKAH 5: MENGATASI IMBALANCED DATASET")
print("-" * 50)

# Periksa keseimbangan kelas
class_counts = pd.Series(y_train).value_counts()
print("Jumlah sampel per kelas sebelum SMOTE:")
print(class_counts)

# Cek apakah perlu SMOTE
imbalance_ratio = class_counts.min() / class_counts.max()
print(f"Rasio ketidakseimbangan: {imbalance_ratio:.4f}")

# Jika rasio imbalance di bawah 0.2, gunakan SMOTE dengan sampling rate yang lebih rendah
if imbalance_ratio < 0.2:
    # Terapkan preprocessing terlebih dahulu
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    
    # Gunakan SMOTE dengan sampling_strategy yang lebih rendah
    sampling_strategy = min(0.5, imbalance_ratio * 2)  # Jangan buat seimbang sempurna
    smote = SMOTE(random_state=42, sampling_strategy=sampling_strategy)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train)
    
    print(f"\nJumlah sampel sebelum SMOTE: {len(y_train)}")
    print(f"Jumlah sampel setelah SMOTE: {len(y_train_resampled)}")
    print("Distribusi kelas setelah SMOTE:")
    print(pd.Series(y_train_resampled).value_counts())
else:
    # Jika rasio cukup seimbang, gunakan data asli
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_train_resampled, y_train_resampled = X_train_preprocessed, y_train
    print("\nData cukup seimbang, tidak perlu SMOTE")


LANGKAH 5: MENGATASI IMBALANCED DATASET
--------------------------------------------------
Jumlah sampel per kelas sebelum SMOTE:
ISBOT
0    727835
1    266315
Name: count, dtype: int64
Rasio ketidakseimbangan: 0.3659

Data cukup seimbang, tidak perlu SMOTE


In [33]:
# 6. MODEL BASELINE
print("\nLANGKAH 6: MODEL BASELINE")
print("-" * 50)

# Buat pipeline untuk Random Forest sebagai baseline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Latih model baseline
rf_pipeline.fit(X_train, y_train)

# Evaluasi model baseline
y_pred_baseline = rf_pipeline.predict(X_test)
print("Kinerja Model Baseline (Random Forest):")
print(f"Accuracy: {accuracy_score(y_test, y_pred_baseline):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_baseline):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_baseline):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_baseline):.4f}")


LANGKAH 6: MODEL BASELINE
--------------------------------------------------


KeyboardInterrupt: 

In [None]:
# 7. HYPERPARAMETER TUNING DAN CROSS-VALIDATION
print("\nLANGKAH 7: HYPERPARAMETER TUNING DAN CROSS-VALIDATION")
print("-" * 50)

# 7.1 Random Forest Tuning
print("7.1 Random Forest Hyperparameter Tuning")

param_grid_rf = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Gunakan GridSearchCV dengan cross-validation
grid_search_rf = GridSearchCV(
    rf_pipeline,
    param_grid_rf,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

grid_search_rf.fit(X_train, y_train)

print(f"\nParameter terbaik untuk Random Forest: {grid_search_rf.best_params_}")
print(f"F1 Score terbaik untuk Random Forest: {grid_search_rf.best_score_:.4f}")

# 7.2 Gradient Boosting Tuning
print("\n7.2 Gradient Boosting Hyperparameter Tuning")

# Buat pipeline untuk Gradient Boosting
gb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

param_grid_gb = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [3, 5, 7]
}

# Gunakan GridSearchCV dengan cross-validation
grid_search_gb = GridSearchCV(
    gb_pipeline,
    param_grid_gb,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

grid_search_gb.fit(X_train, y_train)

print(f"\nParameter terbaik untuk Gradient Boosting: {grid_search_gb.best_params_}")
print(f"F1 Score terbaik untuk Gradient Boosting: {grid_search_gb.best_score_:.4f}")

# 7.3 Logistic Regression Tuning
print("\n7.3 Logistic Regression Hyperparameter Tuning")

# Buat pipeline untuk Logistic Regression
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

param_grid_lr = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['liblinear', 'lbfgs'],
    'classifier__penalty': ['l1', 'l2']
}

# Gunakan GridSearchCV dengan cross-validation
grid_search_lr = GridSearchCV(
    lr_pipeline,
    param_grid_lr,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

grid_search_lr.fit(X_train, y_train)

print(f"\nParameter terbaik untuk Logistic Regression: {grid_search_lr.best_params_}")
print(f"F1 Score terbaik untuk Logistic Regression: {grid_search_lr.best_score_:.4f}")

In [None]:
# 8. ENSEMBLE LEARNING
print("\nLANGKAH 8: ENSEMBLE LEARNING")
print("-" * 50)

# 8.1 Buat model dengan parameter terbaik
best_rf = grid_search_rf.best_estimator_
best_gb = grid_search_gb.best_estimator_
best_lr = grid_search_lr.best_estimator_

# 8.2 Buat model ensemble dengan Voting Classifier
ensemble_clf = VotingClassifier(
    estimators=[
        ('rf', best_rf),
        ('gb', best_gb),
        ('lr', best_lr)
    ],
    voting='soft'  # Gunakan probabilitas untuk voting
)

# Latih model ensemble
ensemble_clf.fit(X_train, y_train)

# Evaluasi model ensemble
y_pred_ensemble = ensemble_clf.predict(X_test)
print("Kinerja Model Ensemble:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_ensemble):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_ensemble):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_ensemble):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_ensemble):.4f}")

# Confusion Matrix untuk model ensemble
cm = confusion_matrix(y_test, y_pred_ensemble)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Bot', 'Bot'], yticklabels=['Non-Bot', 'Bot'])
plt.xlabel('Prediksi')
plt.ylabel('Aktual')
plt.title('Confusion Matrix - Model Ensemble')
plt.tight_layout()
plt.savefig('../notebook/confusion_matrix.png')
plt.close()

print("\nConfusion Matrix telah disimpan sebagai 'confusion_matrix.png'")

# Classification Report
print("\nClassification Report Model Ensemble:")
print(classification_report(y_test, y_pred_ensemble))

# Cross Validation untuk model ensemble
print("\nCross Validation (5-Fold) untuk Model Ensemble:")
cv_scores = cross_val_score(ensemble_clf, X, y, cv=5, scoring='f1')
print(f"F1 Scores: {cv_scores}")
print(f"Rata-rata F1 Score: {cv_scores.mean():.4f}")
print(f"Standar Deviasi F1 Score: {cv_scores.std():.4f}")

In [None]:
# 9. SIMPAN MODEL
print("\nLANGKAH 9: SIMPAN MODEL")
print("-" * 50)

# Simpan model ensemble
model_path = "model/kaggle_bot_detection_model.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(ensemble_clf, f)

print(f"Model telah disimpan ke: {model_path}")

# Simpan preprocessor secara terpisah (untuk digunakan di aplikasi)
preprocessor_path = "../model/preprocessor.pkl"
with open(preprocessor_path, 'wb') as f:
    pickle.dump(preprocessor, f)

print(f"Preprocessor telah disimpan ke: {preprocessor_path}")

In [None]:
# 10. FEATURE IMPORTANCE ANALYSIS
print("\nLANGKAH 10: ANALISIS FEATURE IMPORTANCE")
print("-" * 50)

# Mendapatkan feature importance dari Random Forest (salah satu model dalam ensemble)
rf_model = best_rf.named_steps['classifier']
feature_names = []

# Dapatkan nama fitur setelah preprocessing
# Untuk fitur numerik
feature_names.extend(numerical_cols)

# Untuk fitur kategorikal yang di-one-hot-encode
ohe = best_rf.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot']
categorical_features = [col for col in categorical_cols if col not in binary_cols]
cat_feature_names = ohe.get_feature_names_out(categorical_features)
feature_names.extend(cat_feature_names)

# Potong ke jumlah fitur sebenarnya jika ada perbedaan
n_features = len(rf_model.feature_importances_)
feature_names = feature_names[:n_features]

# Plot feature importance
importance = rf_model.feature_importances_
indices = np.argsort(importance)[::-1]

plt.figure(figsize=(12, 8))
plt.title('Feature Importance dari Random Forest')
plt.bar(range(len(indices[:15])), importance[indices[:15]], align='center')
plt.xticks(range(len(indices[:15])), [feature_names[i] for i in indices[:15]], rotation=90)
plt.tight_layout()
plt.savefig('../notebook/feature_importance.png')
plt.close()

print("Feature importance telah disimpan sebagai 'feature_importance.png'")

print("\nPROSES PELATIHAN MODEL SELESAI!")
print("-" * 50)