# EduMate - Sistem Rekomendasi Pembelajaran

In [None]:
import pandas as pd             
import numpy as np             
from sklearn.preprocessing import LabelEncoder, StandardScaler 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pickle                  
import warnings                 
warnings.filterwarnings('ignore') 

import tensorflow as tf         
from tensorflow.keras.models import Model 
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam 
from tensorflow.keras.models import load_model
import os  

## 1. LOADING DATA

In [None]:
# Load the three source CSV files (students, learning content, interactions)
# from the local 'dataset' folder and report the shape of each frame.
print("Memuat dataset...")
try:
    data_mahasiswa = pd.read_csv('dataset/data_mahasiswa.csv')
    data_konten = pd.read_csv('dataset/data_konten.csv')
    data_interaksi = pd.read_csv('dataset/interaksi_user_konten.csv')

    print(f"✅ Data mahasiswa dimuat: {data_mahasiswa.shape} baris")
    print(f"✅ Data konten dimuat: {data_konten.shape} baris")
    print(f"✅ Data interaksi dimuat: {data_interaksi.shape} baris")
except FileNotFoundError as e:
    print(f"❌ Error saat memuat data: {e}. Pastikan file CSV ada di folder 'dataset'.")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down, whereas re-raising stops "Run All" at this cell
    # with a visible traceback and leaves the kernel usable.
    raise



Memuat dataset...
✅ Data mahasiswa dimuat: (300, 10) baris
✅ Data konten dimuat: (200, 8) baris
✅ Data interaksi dimuat: (3000, 8) baris


## 2. EXPLORATORY DATA ANALYSIS (EDA)

In [None]:
# Show basic structural information (columns, dtypes, non-null counts) for
# each dataset.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after each report.
print("\n🔍 Gambaran Umum Data (Mahasiswa):")
data_mahasiswa.info()

print("\n🔍 Gambaran Umum Data (Konten):")
data_konten.info()
print(f"Nilai unik 'mata_kuliah' di data_konten: {data_konten['mata_kuliah'].nunique()}")
print(f"Contoh nilai 'mata_kuliah': {data_konten['mata_kuliah'].head().tolist()}")

print("\n🔍 Gambaran Umum Data (Interaksi):")
data_interaksi.info()


🔍 Gambaran Umum Data (Mahasiswa):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id_mahasiswa            300 non-null    object 
 1   nama                    300 non-null    object 
 2   jurusan                 300 non-null    object 
 3   angkatan                300 non-null    int64  
 4   ipk_terakhir            300 non-null    float64
 5   device_preference       300 non-null    object 
 6   learning_style          300 non-null    object 
 7   goal                    300 non-null    object 
 8   waktu_belajar_per_hari  300 non-null    int64  
 9   ketersediaan_belajar    300 non-null    object 
dtypes: float64(1), int64(2), object(7)
memory usage: 23.6+ KB
None

🔍 Gambaran Umum Data (Konten):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column         

## 3. DATA PREPROCESSING

In [None]:
# Convert the textual 'feedback' and 'status' columns of the interaction data
# into numeric scores ('feedback_score' 1-5, 'completion_score' 0/1).
print("\n⚙️ Memulai pra-pemrosesan data...")
if not data_interaksi.empty and not data_konten.empty and not data_mahasiswa.empty:

    feedback_mapping = {
        'Sangat Membantu': 5, 'Bermanfaat': 4, 'Cukup': 3,
        'Kurang': 2, 'Tidak Membantu': 1
    }
    data_interaksi['feedback_score'] = data_interaksi['feedback'].map(feedback_mapping)
    print("✅ Kolom 'feedback' dikonversi ke 'feedback_score'.")

    # Series.map() turns every label missing from the mapping into NaN.
    # Report such rows explicitly so that unknown labels are not silently
    # absorbed by the median imputation performed in a later cell.
    unmapped_feedback = data_interaksi['feedback_score'].isna() & data_interaksi['feedback'].notna()
    if unmapped_feedback.any():
        unknown_labels = sorted(data_interaksi.loc[unmapped_feedback, 'feedback'].astype(str).unique())
        print(f"⚠️ {int(unmapped_feedback.sum())} baris memiliki label 'feedback' yang tidak dikenal: {unknown_labels}")

    data_interaksi['completion_score'] = data_interaksi['status'].map({'Selesai': 1, 'Belum Selesai': 0})
    print("✅ Kolom 'status' dikonversi ke 'completion_score'.")

else:
    print("❌ Salah satu atau semua dataset kosong, pra-pemrosesan awal tidak dapat dilakukan.")
    # Raise instead of exit(): exit() would shut the notebook kernel down,
    # while an exception simply stops execution at this cell.
    raise ValueError("Dataset kosong: pra-pemrosesan awal tidak dapat dilakukan.")


⚙️ Memulai pra-pemrosesan data...
✅ Kolom 'feedback' dikonversi ke 'feedback_score'.
✅ Kolom 'status' dikonversi ke 'completion_score'.


In [None]:
# Feature engineering: normalised watch duration ('watch_ratio').
# watch_ratio = durasi_tonton / durasi, clipped to [0, 1]; rows whose content
# duration is missing or not positive get 0.
if not data_interaksi.empty and not data_konten.empty:

    if 'id_konten' in data_interaksi.columns and 'id_konten' in data_konten.columns and 'durasi' in data_konten.columns:
        # Drop any 'durasi' left over from a previous run of this cell before
        # merging. Without this, re-running the cell would produce
        # 'durasi_x'/'durasi_y' instead of 'durasi' and the ratio would
        # silently fall back to 0.0.
        data_interaksi = data_interaksi.drop(columns=['durasi'], errors='ignore').merge(
            data_konten[['id_konten', 'durasi']],
            on='id_konten',
            how='left'
        )

        if 'durasi_tonton' in data_interaksi.columns and 'durasi' in data_interaksi.columns:
            # Mask non-positive durations to NaN so the division never divides
            # by zero; the resulting NaN ratios are turned into 0 by fillna.
            positive_duration = data_interaksi['durasi'].where(data_interaksi['durasi'] > 0)
            data_interaksi['watch_ratio'] = (
                (data_interaksi['durasi_tonton'] / positive_duration).clip(0, 1).fillna(0)
            )
            print(f"✅ 'Watch ratio' dihitung dan dinormalisasi.")
        else:
            print(f"⚠️ Kolom 'durasi_tonton' atau 'durasi' tidak ditemukan untuk menghitung watch_ratio di data_interaksi setelah merge.")
            data_interaksi['watch_ratio'] = 0.0
    else:
        print("⚠️ Kolom kunci untuk merge (id_konten) atau kolom durasi tidak ditemukan di data_interaksi/data_konten. Watch_ratio tidak dihitung.")
        data_interaksi['watch_ratio'] = 0.0
else:
    print("⚠️ Data interaksi atau konten kosong. Watch_ratio tidak dihitung.")
    if 'watch_ratio' not in data_interaksi.columns:
        data_interaksi['watch_ratio'] = 0.0

✅ 'Watch ratio' dihitung dan dinormalisasi.


In [None]:
### Merge all datasets into a single modelling frame ('full_data')
print("🔗 Menggabungkan dataset (merging)...")

# 'durasi' may already have been copied from data_konten into data_interaksi
# by the watch_ratio cell. Merging data_konten again would then rename both
# copies to 'durasi_interaksi' / 'durasi_konten', leaving no plain 'durasi'
# column, so the later "'durasi' in full_data.columns" check would drop the
# duration feature from the model. Remove the copied column first so the
# authoritative value from data_konten arrives under its original name.
duplicated_content_cols = [
    col for col in ['durasi']
    if col in data_interaksi.columns and col in data_konten.columns
]
full_data = data_interaksi.drop(columns=duplicated_content_cols).merge(
    data_mahasiswa, on='id_mahasiswa', how='left'
)
# The suffixes remain as a safety net for any other overlapping column names.
full_data = full_data.merge(data_konten, on='id_konten', how='left', suffixes=('_interaksi', '_konten'))

print(f"✅ Data gabungan ('full_data') siap dengan {full_data.shape[0]} baris dan {full_data.shape[1]} kolom.")
print("Contoh kolom setelah semua merge:", full_data.columns.tolist()[:20])

🔗 Menggabungkan dataset (merging)...
✅ Data gabungan ('full_data') siap dengan 3000 baris dan 28 kolom.
Contoh kolom setelah semua merge: ['id_interaksi', 'id_mahasiswa', 'id_konten', 'waktu_akses', 'durasi_tonton', 'feedback', 'device', 'status', 'feedback_score', 'completion_score', 'durasi_interaksi', 'watch_ratio', 'nama', 'jurusan', 'angkatan', 'ipk_terakhir', 'device_preference', 'learning_style', 'goal', 'waktu_belajar_per_hari']


In [None]:
### Handle missing values
# Numeric columns are filled with their own median; object (string) columns
# are filled with the placeholder 'Unknown'.
# NOTE(review): the numeric pass covers every numeric column of full_data, so
# it would also impute 'feedback_score' (used later as the training target)
# if that column contains NaN - confirm this is intended.
print("🛠️ Menangani nilai yang hilang...")
if full_data.empty:
    print("⚠️ 'full_data' kosong, tidak ada nilai yang hilang untuk ditangani.")
else:
    numeric_cols = full_data.select_dtypes(include=[np.number]).columns
    # Compute all per-column medians in one pass, then fill column by column.
    column_medians = full_data[numeric_cols].median()
    for column_name in numeric_cols:
        full_data[column_name] = full_data[column_name].fillna(column_medians[column_name])

    categorical_cols = full_data.select_dtypes(include=['object']).columns
    for column_name in categorical_cols:
        full_data[column_name] = full_data[column_name].fillna('Unknown')

    print("✅ Nilai yang hilang telah ditangani.")

🛠️ Menangani nilai yang hilang...
✅ Nilai yang hilang telah ditangani.


In [None]:
### Label-encode the categorical features
# For every listed feature present in full_data, fit a LabelEncoder on the
# string form of the column, store the codes in '<feature>_encoded' and keep
# the fitted encoder in 'label_encoders' (pickled by the save cell).
print("🏷️ Mengkodekan fitur kategorikal (Label Encoding)...")
label_encoders = {}

categorical_features_to_encode = [
    # student attributes
    'jurusan', 'device_preference', 'learning_style', 'goal', 'ketersediaan_belajar',
    # content attributes
    'mata_kuliah', 'platform', 'format', 'kesulitan',
]

if full_data.empty:
    print("⚠️ 'full_data' kosong, tidak ada encoding yang dilakukan.")
else:
    for feature in categorical_features_to_encode:
        if feature not in full_data.columns:
            print(f"⚠️ Peringatan: Fitur '{feature}' tidak ditemukan di 'full_data' untuk di-encode. Akan dilewati.")
            continue

        encoder = LabelEncoder()
        encoded_column = f'{feature}_encoded'
        full_data[encoded_column] = encoder.fit_transform(full_data[feature].astype(str))
        label_encoders[feature] = encoder  # keep the fitted encoder
        print(f"   ✅ '{feature}' di-encode. Kelas yang ditemukan: {list(encoder.classes_)}")

    print(f"✅ Label encoders dibuat untuk {len(label_encoders)} fitur: {list(label_encoders.keys())}")
    print(f"Contoh data setelah preprocessing:\n {full_data.head()}")

🏷️ Mengkodekan fitur kategorikal (Label Encoding)...
   ✅ 'jurusan' di-encode. Kelas yang ditemukan: ['Akuntansi', 'Informatika', 'Manajemen', 'Sistem Informasi', 'Teknik Elektro']
   ✅ 'device_preference' di-encode. Kelas yang ditemukan: ['HP', 'Laptop', 'Tablet']
   ✅ 'learning_style' di-encode. Kelas yang ditemukan: ['Auditory', 'Kinesthetic', 'Visual']
   ✅ 'goal' di-encode. Kelas yang ditemukan: ['Lulus Cepat', 'Pahami Materi', 'Raih IPK Tinggi']
   ✅ 'ketersediaan_belajar' di-encode. Kelas yang ditemukan: ['Malam', 'Pagi', 'Siang']
   ✅ 'mata_kuliah' di-encode. Kelas yang ditemukan: ['Basis Data', 'Matematika Diskrit', 'Pemrograman Dasar', 'Sistem Operasi', 'Statistik']
   ✅ 'platform' di-encode. Kelas yang ditemukan: ['Coursera', 'Dicoding', 'Medium', 'Modul Kampus', 'YouTube']
   ✅ 'format' di-encode. Kelas yang ditemukan: ['Artikel', 'Course', 'PDF', 'Slide', 'Video']
   ✅ 'kesulitan' di-encode. Kelas yang ditemukan: ['Mudah', 'Sedang', 'Sulit']
✅ Label encoders dibuat untuk 9

## 4. FEATURE ENGINEERING & SCALING untuk TensorFlow

In [None]:
### Prepare the feature lists for the TensorFlow model and standardise the
### numeric features (feature engineering & scaling)
# Outputs of this cell, all read by the model and save cells:
#   *_tf_features lists - column names fed to the model, grouped by role
#   scalers             - {'all_numerical': fitted StandardScaler,
#                          'all_numerical_feature_names_in_': column order}
#   target_variable     - name of the regression target, or None
if 'full_data' in locals() and not full_data.empty:
    print("\n⚙️ Menyiapkan Fitur & Normalisasi untuk TensorFlow...")

    user_categorical_tf_features = [f'{col}_encoded' for col in ['jurusan', 'device_preference', 'learning_style', 'goal', 'ketersediaan_belajar'] if f'{col}_encoded' in full_data.columns]
    # Filtered by presence like every other feature list, so the lists saved
    # for deployment only name columns that really exist.
    user_numerical_tf_features = [col for col in ['ipk_terakhir', 'waktu_belajar_per_hari'] if col in full_data.columns]

    content_categorical_tf_features = [f'{col}_encoded' for col in ['mata_kuliah', 'platform', 'format', 'kesulitan'] if f'{col}_encoded' in full_data.columns]
    content_numerical_tf_features = [col for col in ['durasi', 'rating_pengguna'] if col in full_data.columns]

    interaction_numerical_tf_features = [col for col in ['watch_ratio'] if col in full_data.columns]

    print(f"Fitur Kategorikal Pengguna (model): {user_categorical_tf_features}")
    print(f"Fitur Numerik Pengguna (model): {user_numerical_tf_features}")
    print(f"Fitur Kategorikal Konten (model): {content_categorical_tf_features}")
    print(f"Fitur Numerik Konten (model): {content_numerical_tf_features}")
    print(f"Fitur Numerik Interaksi (model): {interaction_numerical_tf_features}")

    scalers = {}

    all_numerical_features_to_scale = user_numerical_tf_features + content_numerical_tf_features + interaction_numerical_tf_features
    unique_numerical_features_to_scale = sorted(set(f for f in all_numerical_features_to_scale if f in full_data.columns))

    if unique_numerical_features_to_scale:
        # The scaled values overwrite the original columns of full_data. To
        # keep this cell safe to re-run, the unscaled values are snapshotted
        # once into '<name>__raw' columns and the scaler is always fitted on
        # that snapshot. Otherwise a second run would fit the scaler on
        # already-standardised data and the pickled scaler would no longer
        # match raw inputs at inference time.
        raw_snapshot_columns = [f'{name}__raw' for name in unique_numerical_features_to_scale]
        for name, raw_name in zip(unique_numerical_features_to_scale, raw_snapshot_columns):
            if raw_name not in full_data.columns:
                full_data[raw_name] = full_data[name]

        # Present the snapshot under the original column names so the fitted
        # scaler records the same feature names as a fit on the raw columns.
        raw_numeric_values = full_data[raw_snapshot_columns].rename(
            columns=dict(zip(raw_snapshot_columns, unique_numerical_features_to_scale))
        )

        # NOTE(review): the scaler is fitted on every row of full_data, i.e.
        # before the train/test split done in the model cell, so test-row
        # statistics leak into the scaling - consider fitting on training
        # rows only.
        scaler = StandardScaler()
        full_data[unique_numerical_features_to_scale] = scaler.fit_transform(raw_numeric_values)
        scalers['all_numerical'] = scaler
        print(f"✅ StandardScaler dibuat dan diterapkan pada fitur: {unique_numerical_features_to_scale}")
        # Column order the scaler was fitted on; needed to rebuild inputs later.
        scalers['all_numerical_feature_names_in_'] = unique_numerical_features_to_scale
    else:
        print(f"⚠️ Tidak ada fitur numerik valid yang ditemukan untuk scaling. Scaler tidak dibuat.")

    target_variable = 'feedback_score'
    if target_variable not in full_data.columns:
        print(f"❌ Variabel target '{target_variable}' tidak ditemukan! Model tidak bisa dilatih.")
        target_variable = None
    else:
        print(f"🎯 Variabel target untuk model: {target_variable}")
else:
    print("❌ 'full_data' kosong. Tidak bisa melanjutkan ke persiapan fitur.")
    # Define every output with an empty value so later cells can still run
    # their own guards without hitting a NameError.
    user_categorical_tf_features, user_numerical_tf_features = [], []
    content_categorical_tf_features, content_numerical_tf_features = [], []
    interaction_numerical_tf_features = []
    scalers = {}
    target_variable = None


⚙️ Menyiapkan Fitur & Normalisasi untuk TensorFlow...
Fitur Kategorikal Pengguna (model): ['jurusan_encoded', 'device_preference_encoded', 'learning_style_encoded', 'goal_encoded', 'ketersediaan_belajar_encoded']
Fitur Numerik Pengguna (model): ['ipk_terakhir', 'waktu_belajar_per_hari']
Fitur Kategorikal Konten (model): ['mata_kuliah_encoded', 'platform_encoded', 'format_encoded', 'kesulitan_encoded']
Fitur Numerik Konten (model): ['rating_pengguna']
Fitur Numerik Interaksi (model): ['watch_ratio']
✅ StandardScaler dibuat dan diterapkan pada fitur: ['ipk_terakhir', 'rating_pengguna', 'waktu_belajar_per_hari', 'watch_ratio']
🎯 Variabel target untuk model: feedback_score


## 5. MODEL DEVELOPMENT (TensorFlow Hybrid Model)

In [None]:
### Build and train the TensorFlow hybrid recommendation model
# Architecture: a "user" tower and a "content" tower. Each tower concatenates
# one embedding per label-encoded categorical feature with its numeric
# features and passes the result through Dense(64) -> Dropout -> Dense(32).
# The two tower outputs are concatenated and regressed onto the target with a
# single linear unit.
# Input order is: user categorical, user numeric, content categorical,
# content numeric. prepare_model_inputs() must build arrays in that order.
if 'full_data' in globals() and not full_data.empty and \
   ('target_variable' in globals() and target_variable is not None and target_variable in full_data.columns) and \
   all(f_list_name in globals() for f_list_name in [
           'user_categorical_tf_features', 'user_numerical_tf_features',
           'interaction_numerical_tf_features', 'content_categorical_tf_features',
           'content_numerical_tf_features'
       ]):
    print("\n🤖 Membangun Model Rekomendasi Hybrid dengan TensorFlow...")

    # Seed Python, NumPy and TensorFlow before any layer is created so that
    # weight initialisation, dropout and batch shuffling are repeatable
    # between runs.
    tf.keras.utils.set_random_seed(42)

    embedding_dim = 16

    # Interaction-level numeric features (e.g. watch_ratio) are fed through
    # the user tower together with the user's own numeric features.
    combined_user_numerical_tf_features = sorted(list(set(user_numerical_tf_features + interaction_numerical_tf_features)))
    combined_user_numerical_tf_features = [f for f in combined_user_numerical_tf_features if f in full_data.columns]

    # ---- User tower inputs -------------------------------------------------
    input_user_list = []
    user_embeddings_list = []

    for feature_name in user_categorical_tf_features:
        if feature_name in full_data.columns:
            # One embedding row per distinct code, plus one spare index.
            vocab_size = full_data[feature_name].nunique() + 1
            user_input = Input(shape=(1,), name=f'user_{feature_name}')
            input_user_list.append(user_input)
            embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name=f'user_embedding_{feature_name}')(user_input)
            embedding = Flatten()(embedding)
            user_embeddings_list.append(embedding)

    if combined_user_numerical_tf_features:
        user_numerical_input_layer = Input(shape=(len(combined_user_numerical_tf_features),), name='user_combined_numerical_features')
        input_user_list.append(user_numerical_input_layer)
        user_embeddings_list.append(user_numerical_input_layer)

    # ---- Content tower inputs ----------------------------------------------
    input_content_list = []
    content_embeddings_list = []

    for feature_name in content_categorical_tf_features:
        if feature_name in full_data.columns:
            vocab_size = full_data[feature_name].nunique() + 1
            content_input = Input(shape=(1,), name=f'content_{feature_name}')
            input_content_list.append(content_input)
            embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name=f'content_embedding_{feature_name}')(content_input)
            embedding = Flatten()(embedding)
            content_embeddings_list.append(embedding)

    if content_numerical_tf_features:
        content_numerical_input_layer = Input(shape=(len(content_numerical_tf_features),), name='content_numerical_features')
        input_content_list.append(content_numerical_input_layer)
        content_embeddings_list.append(content_numerical_input_layer)

    # ---- Towers --------------------------------------------------------------
    user_vector, content_vector = None, None

    if user_embeddings_list:
        # Concatenate needs at least two tensors; a single tensor is used as is.
        merged_user_features = Concatenate(name="concat_user_features")(user_embeddings_list) if len(user_embeddings_list) > 1 else user_embeddings_list[0]
        user_vector = Dense(64, activation='relu', name="user_dense_1")(merged_user_features)
        user_vector = Dropout(0.2, name="user_dropout")(user_vector)
        user_vector = Dense(32, activation='relu', name="user_dense_2")(user_vector)
    else:
        print("⚠️ Tidak ada fitur pengguna yang valid untuk membangun user_vector.")

    if content_embeddings_list:
        merged_content_features = Concatenate(name="concat_content_features")(content_embeddings_list) if len(content_embeddings_list) > 1 else content_embeddings_list[0]
        content_vector = Dense(64, activation='relu', name="content_dense_1")(merged_content_features)
        content_vector = Dropout(0.2, name="content_dropout")(content_vector)
        content_vector = Dense(32, activation='relu', name="content_dense_2")(content_vector)
    else:
        print("⚠️ Tidak ada fitur konten yang valid untuk membangun content_vector.")

    # ---- Combined head -------------------------------------------------------
    tf_model = None
    if user_vector is not None and content_vector is not None:
        combined_vector = Concatenate(name="concat_user_content")([user_vector, content_vector])
        dense_layer = Dense(64, activation='relu', name="combined_dense_1")(combined_vector)
        dense_layer = Dropout(0.3, name="combined_dropout")(dense_layer)
        output_layer = Dense(1, activation='linear', name='feedback_score_output')(dense_layer)

        all_inputs = input_user_list + input_content_list

        if not all_inputs:
            print("❌ Tidak ada input yang valid (dari user atau konten) untuk model TensorFlow. Model tidak dapat dibuat.")
        else:
            tf_model = Model(inputs=all_inputs, outputs=output_layer)
            tf_model.compile(optimizer=Adam(learning_rate=0.001),
                             loss='mean_squared_error',
                             metrics=[tf.keras.metrics.MeanAbsoluteError()])
            tf_model.summary()
    else:
        print("❌ Model tidak dapat dibangun karena user_vector atau content_vector tidak valid.")

    # ---- Training ------------------------------------------------------------
    if tf_model is not None:
        # Hold out 20% of the rows as the final test set, then carve a
        # validation set out of the remaining rows. Early stopping and
        # best-weight restoration look only at the validation set, so the
        # test metrics reported below are not biased by model selection.
        train_indices, test_indices = train_test_split(full_data.index, test_size=0.2, random_state=42)
        train_indices, val_indices = train_test_split(train_indices, test_size=0.2, random_state=42)
        y_train = full_data.loc[train_indices, target_variable].values
        y_val = full_data.loc[val_indices, target_variable].values
        y_test = full_data.loc[test_indices, target_variable].values

        def prepare_model_inputs(df_indices, df_source, user_cat_feats, user_num_feats_combined, content_cat_feats, content_num_feats):
            """Build the list of input arrays for the model from selected rows.

            Arrays are appended in the same order as the model's input layers:
            one (n, 1) array per user categorical feature, one (n, k) array of
            combined user numeric features, one (n, 1) array per content
            categorical feature, then one (n, m) array of content numeric
            features.

            Args:
                df_indices: index labels of the rows to select from df_source.
                df_source: DataFrame holding the encoded and scaled features.
                user_cat_feats: encoded user categorical column names.
                user_num_feats_combined: user + interaction numeric column names.
                content_cat_feats: encoded content categorical column names.
                content_num_feats: content numeric column names.

            Returns:
                list of numpy arrays. Features absent from df_source are
                skipped, so callers should compare the list length with the
                number of model inputs.
            """
            input_list = []
            for feature_name in user_cat_feats:
                if feature_name in df_source.columns:
                    input_list.append(df_source.loc[df_indices, feature_name].values.reshape(-1, 1))
            if user_num_feats_combined and all(f in df_source.columns for f in user_num_feats_combined):
                input_list.append(df_source.loc[df_indices, user_num_feats_combined].values)
            for feature_name in content_cat_feats:
                if feature_name in df_source.columns:
                    input_list.append(df_source.loc[df_indices, feature_name].values.reshape(-1, 1))
            if content_num_feats and all(f in df_source.columns for f in content_num_feats):
                input_list.append(df_source.loc[df_indices, content_num_feats].values)
            return input_list

        X_input_list_train = prepare_model_inputs(
            train_indices,
            full_data,
            user_categorical_tf_features,
            combined_user_numerical_tf_features,
            content_categorical_tf_features,
            content_numerical_tf_features
        )
        X_input_list_val = prepare_model_inputs(
            val_indices,
            full_data,
            user_categorical_tf_features,
            combined_user_numerical_tf_features,
            content_categorical_tf_features,
            content_numerical_tf_features
        )
        X_input_list_test = prepare_model_inputs(
            test_indices,
            full_data,
            user_categorical_tf_features,
            combined_user_numerical_tf_features,
            content_categorical_tf_features,
            content_numerical_tf_features
        )

        # Guard against a mismatch between prepared arrays and input layers.
        if not X_input_list_train or len(X_input_list_train) != len(tf_model.inputs):
            print(f"❌ Gagal menyiapkan data training/testing. Jumlah input ({len(X_input_list_train)}) tidak sesuai dengan layer model ({len(tf_model.inputs)}).")
            print(f"   Nama input layer model: {[inp.name for inp in tf_model.inputs]}")
        else:
            print("💪 Melatih Model TensorFlow Hybrid...")
            history = tf_model.fit(X_input_list_train, y_train,
                                   epochs=25,
                                   batch_size=32,
                                   validation_data=(X_input_list_val, y_val),
                                   callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)],
                                   verbose=1)

            print("\n✅ Pelatihan Selesai.")
            # Final, unbiased evaluation on rows never seen during training
            # or early stopping.
            loss, mae = tf_model.evaluate(X_input_list_test, y_test, verbose=0)
            print(f"✅ Evaluasi Model (Test MSE): {loss:.4f}")
            print(f"✅ Evaluasi Model (Test MAE): {mae:.4f}")
    else:
        print("❌ Model TensorFlow tidak berhasil dibangun, tidak bisa dilatih.")
else:
    print("❌ Kondisi awal tidak terpenuhi untuk pelatihan model (data kosong atau fitur/target tidak valid).")
    # Make sure the save cell finds a defined (empty) model reference.
    if 'tf_model' not in locals(): tf_model = None


🤖 Membangun Model Rekomendasi Hybrid dengan TensorFlow...


💪 Melatih Model TensorFlow Hybrid...
Epoch 1/25
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - loss: 9.7045 - mean_absolute_error: 2.7513 - val_loss: 1.6402 - val_mean_absolute_error: 1.1289
Epoch 2/25
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1.8702 - mean_absolute_error: 1.1688 - val_loss: 1.6326 - val_mean_absolute_error: 1.1276
Epoch 3/25
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.9060 - mean_absolute_error: 1.2054 - val_loss: 1.6247 - val_mean_absolute_error: 1.1461
Epoch 4/25
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.8622 - mean_absolute_error: 1.1836 - val_loss: 1.6453 - val_mean_absolute_error: 1.1259
Epoch 5/25
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 1.8868 - mean_absolute_error: 1.1910 - val_loss: 1.6115 - val_mean_absolute_error: 1.1729
Epoch 6/25
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[

## 6. SAVE MODELS & PREPROCESSORS

In [None]:
# --- SAVE ALL MODEL ARTIFACTS FOR DEPLOYMENT ---
# Writes into 'model_artifacts/': the Keras model (HDF5), the fitted label
# encoders, the scalers dict, the feature-name lists, the unique categories
# per encoded feature and the content table used to display recommendations.
output_dir = 'model_artifacts'
os.makedirs(output_dir, exist_ok=True)

# Tracks whether every artifact was written, so the closing message does not
# claim success when the model or the content table could not be saved.
all_artifacts_saved = True

if tf_model is not None:
    model_path = os.path.join(output_dir, 'recs_model.h5')
    tf_model.save(model_path)
    print(f"✅ Model TensorFlow disimpan ke: {model_path}")
else:
    print("❌ Model TensorFlow tidak tersedia untuk disimpan.")
    all_artifacts_saved = False

le_path = os.path.join(output_dir, 'label_encoders.pkl')
with open(le_path, 'wb') as f:
    pickle.dump(label_encoders, f)
print(f"✅ LabelEncoders disimpan ke: {le_path}")
# IMPORTANT: verify that 'mata_kuliah' is among the saved label encoders
if 'mata_kuliah' in label_encoders:
    print(f"   LabelEncoder untuk 'mata_kuliah' memiliki kelas: {list(label_encoders['mata_kuliah'].classes_)}")
else:
    print("   ❌ Peringatan: LabelEncoder untuk 'mata_kuliah' TIDAK ditemukan. Ini adalah masalah serius untuk app.py!")


scaler_path = os.path.join(output_dir, 'scalers.pkl')
with open(scaler_path, 'wb') as f:
    pickle.dump(scalers, f)
print(f"✅ Scalers disimpan ke: {scaler_path}")
# IMPORTANT: if a scaler was created, show the feature order it was fitted on
if 'all_numerical' in scalers:
    print(f"   Scaler numerik di-fit pada fitur: {scalers['all_numerical_feature_names_in_']}")


features_info = {
    'user_categorical_tf_features': user_categorical_tf_features,
    'user_numerical_tf_features': user_numerical_tf_features,
    'interaction_numerical_tf_features': interaction_numerical_tf_features,
    'content_categorical_tf_features': content_categorical_tf_features,
    'content_numerical_tf_features': content_numerical_tf_features
}
features_path = os.path.join(output_dir, 'features_info.pkl')
with open(features_path, 'wb') as f:
    pickle.dump(features_info, f)
print(f"✅ Informasi fitur yang digunakan model disimpan ke: {features_path}")
print(f"   Fitur kategorikal konten yang disimpan: {features_info['content_categorical_tf_features']}")


unique_categories_path = os.path.join(output_dir, 'unique_categories.pkl')
unique_categories = {
    feature: list(le.classes_)
    for feature, le in label_encoders.items()
}
with open(unique_categories_path, 'wb') as f:
    pickle.dump(unique_categories, f)
print(f"✅ Kategori unik dari LabelEncoders disimpan ke: {unique_categories_path}")
# IMPORTANT: verify that 'mata_kuliah' is present in unique_categories
if 'mata_kuliah' in unique_categories:
    print(f"   Kategori unik 'mata_kuliah': {unique_categories['mata_kuliah']}")
else:
    print("   ❌ Peringatan: 'mata_kuliah' TIDAK ditemukan di unique_categories. Pastikan datanya benar.")

# Save the original (unscaled, unencoded) content columns needed to display
# recommendations.
content_cols_to_save = ['id_konten', 'judul', 'durasi', 'mata_kuliah', 'platform', 'format', 'kesulitan', 'rating_pengguna']
final_content_cols = [col for col in content_cols_to_save if col in data_konten.columns]

if not data_konten.empty and final_content_cols:
    data_konten_for_recs_path = os.path.join(output_dir, 'data_konten_for_recs.pkl')
    data_konten[final_content_cols].to_pickle(data_konten_for_recs_path)
    print(f"✅ Data konten yang relevan untuk rekomendasi disimpan ke: {data_konten_for_recs_path}")
    print(f"   Kolom yang disimpan di data_konten_for_recs: {final_content_cols}")
    if 'mata_kuliah' not in final_content_cols:
        print("   ❌ Peringatan: Kolom 'mata_kuliah' TIDAK termasuk dalam data_konten_for_recs.pkl. Ini akan menyebabkan error di app.py.")
else:
    print("❌ Data konten kosong atau kolom relevan tidak ditemukan, tidak dapat menyimpan untuk rekomendasi.")
    all_artifacts_saved = False

if all_artifacts_saved:
    print("\n🎉 Semua artefak model berhasil disimpan. Sekarang Anda siap untuk membangun aplikasi Streamlit!")
else:
    print("\n⚠️ Sebagian artefak TIDAK tersimpan (lihat pesan ❌ di atas). Perbaiki masalah tersebut sebelum membangun aplikasi Streamlit.")



✅ Model TensorFlow disimpan ke: model_artifacts\recs_model.h5
✅ LabelEncoders disimpan ke: model_artifacts\label_encoders.pkl
   LabelEncoder untuk 'mata_kuliah' memiliki kelas: ['Basis Data', 'Matematika Diskrit', 'Pemrograman Dasar', 'Sistem Operasi', 'Statistik']
✅ Scalers disimpan ke: model_artifacts\scalers.pkl
   Scaler numerik di-fit pada fitur: ['ipk_terakhir', 'rating_pengguna', 'waktu_belajar_per_hari', 'watch_ratio']
✅ Informasi fitur yang digunakan model disimpan ke: model_artifacts\features_info.pkl
   Fitur kategorikal konten yang disimpan: ['mata_kuliah_encoded', 'platform_encoded', 'format_encoded', 'kesulitan_encoded']
✅ Kategori unik dari LabelEncoders disimpan ke: model_artifacts\unique_categories.pkl
   Kategori unik 'mata_kuliah': ['Basis Data', 'Matematika Diskrit', 'Pemrograman Dasar', 'Sistem Operasi', 'Statistik']
✅ Data konten yang relevan untuk rekomendasi disimpan ke: model_artifacts\data_konten_for_recs.pkl
   Kolom yang disimpan di data_konten_for_recs: ['