# **1. Import Library**

In [1]:
# !pip install tensorflow_recommenders

In [2]:
# !pip install tensorflow==2.15.0

In [3]:
# !pip install keras-tuner

In [4]:
import tensorflow as tf
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2.15.0
Num GPUs Available:  0


In [5]:
# Standard Libraries
import os
import time
import random
import pickle
from collections import Counter
import difflib

# Seed Setup
import numpy as np
np.random.seed(0)  # Set seed sebelum pengacakan

# Data Manipulation & Preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score

# Deep Learning & Modeling
import tensorflow as tf
import tensorflow_recommenders as tfrs
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import keras_tuner as kt

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Optional / Custom Libraries
import kagglehub

# Pandas Configuration
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
pd.set_option("display.float_format", lambda x: "%.4f" % x)

# Warnings Configuration
import warnings
warnings.filterwarnings("ignore")

In [6]:
import os
os.environ['TF_USE_LEGACY_KERAS'] = '1'

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **2. Load Dataset**

In [8]:
mainPath =  '/content/drive/MyDrive/Colab Notebooks/Capstone - Course Recommender Systems/' # Path of the main project
dataPath = os.path.join(mainPath, 'Dataset/') # Path of the dataset

In [9]:
# Load the CSV file from the URL
df_courses = pd.read_csv(dataPath + 'udemy_courses_new.csv')

In [10]:
df_courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3672 entries, 0 to 3671
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   course_id            3672 non-null   int64  
 1   course_title         3672 non-null   object 
 2   url                  3672 non-null   object 
 3   is_paid              3672 non-null   bool   
 4   price                3672 non-null   int64  
 5   num_subscribers      3672 non-null   int64  
 6   num_reviews          3672 non-null   int64  
 7   num_lectures         3672 non-null   int64  
 8   level                3672 non-null   object 
 9   content_duration     3672 non-null   float64
 10  published_timestamp  3672 non-null   object 
 11  subject              3672 non-null   object 
 12  total_interactions   3672 non-null   int64  
 13  total_users          3672 non-null   int64  
dtypes: bool(1), float64(1), int64(7), object(5)
memory usage: 376.7+ KB


In [11]:
df_courses.head(2)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,total_interactions,total_users
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance,10,10
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance,15,14


# **3. Processing Data**

In [12]:
# 🔢 Normalisasi Fitur Numerik
numerical_features = ['price', 'num_subscribers', 'num_reviews', 'num_lectures', 'content_duration']
scaler = MinMaxScaler()
df_courses[numerical_features] = scaler.fit_transform(df_courses[numerical_features])

# 📊 Vocabulary unik untuk fitur kategorikal
subject_vocab = df_courses['subject'].unique().tolist()
level_vocab = df_courses['level'].unique().tolist()

## **3.1 Splitting Data (Train & Test)**

In [13]:
# Split data df_courses menjadi train & test (misal 90:10)
df_train_courses, df_test_courses = train_test_split(
    df_courses, test_size=0.1, random_state=42, shuffle=True
)

print(f"Jumlah course train: {len(df_train_courses)}")
print(f"Jumlah course test : {len(df_test_courses)}")

# Function untuk Membuat dataset TensorFlow train & test
def make_tf_dataset(df, shuffle=True, batch_size=64):
    ds = tf.data.Dataset.from_tensor_slices({
        "course_id": df["course_id"].astype(str).values,
        "course_title": df["course_title"].astype(str).values.reshape(-1),
        "subject": df["subject"].astype(str).values,
        "level": df["level"].astype(str).values,
        "price": df["price"].values,
        "num_subscribers": df["num_subscribers"].values,
        "num_reviews": df["num_reviews"].values,
        "num_lectures": df["num_lectures"].values,
        "content_duration": df["content_duration"].values,
    })
    if shuffle:
        ds = ds.shuffle(1000)
    ds = ds.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)
    return ds

# Contoh batch_size default untuk training nanti (bisa diubah oleh tuner)
default_batch_size = 64

tf_train_courses = make_tf_dataset(df_train_courses, batch_size=default_batch_size)
tf_test_courses = make_tf_dataset(df_test_courses, shuffle=False, batch_size=default_batch_size)

# Dataset kandidat untuk index embedding, dari seluruh courses
tf_all_courses = make_tf_dataset(df_courses, batch_size=64, shuffle=False)

Jumlah course train: 3304
Jumlah course test : 368


# **3. Recommenders Modeling - [Content Based Filtering]**

## **3.1 Model embedding fitur course**

In [14]:
class CourseModel(tf.keras.Model):
    def __init__(self, embedding_dim, subject_vocab, level_vocab, course_ids):
        super().__init__()

        # 🔍 Embedding untuk course_id (mengubah ID jadi vektor numerik)
        self.course_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=course_ids, mask_token=None),
            tf.keras.layers.Embedding(len(course_ids) + 1, embedding_dim)
        ])

        # 🎓 Embedding untuk subject course (kategori mata pelajaran)
        self.subject_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=subject_vocab, mask_token=None),
            tf.keras.layers.Embedding(len(subject_vocab) + 1, max(4, embedding_dim // 4))
        ])

        # 🏷️ Embedding untuk level course (pemula, menengah, dsb)
        self.level_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=level_vocab, mask_token=None),
            tf.keras.layers.Embedding(len(level_vocab) + 1, max(2, embedding_dim // 8))
        ])

        # ✍️ Vektorisasi teks judul course dengan TF-IDF dan dense layer
        self.title_vectorizer = tf.keras.layers.TextVectorization(max_tokens=1000, output_mode='tf-idf')
        self.title_vectorizer.adapt(df_courses["course_title"].astype(str).tolist())

        self.title_embedding = tf.keras.Sequential([
            self.title_vectorizer,
            tf.keras.layers.Dense(embedding_dim, activation="relu"),
        ])

        # 📊 Dense layer untuk fitur numerik seperti harga, jumlah subscriber, dll
        self.numerical_dense = tf.keras.Sequential([
            tf.keras.layers.Dense(embedding_dim // 4, activation="relu"),
            tf.keras.layers.Dense(embedding_dim // 8, activation="relu"),
        ])

        # 🔗 Gabungkan semua embedding lalu proses dengan dense layers
        self.final_dense1 = tf.keras.layers.Dense(embedding_dim * 2, activation="relu")
        self.final_dense2 = tf.keras.layers.Dense(embedding_dim)

    def call(self, inputs):
        title = inputs["course_title"]

        # Gabungkan fitur numerik dalam satu tensor
        numerical = tf.stack([
            inputs["price"],
            inputs["num_subscribers"],
            inputs["num_reviews"],
            inputs["num_lectures"],
            inputs["content_duration"],
        ], axis=1)

        # Dapatkan embedding masing-masing fitur
        course_emb = self.course_embedding(inputs["course_id"])
        subject_emb = self.subject_embedding(inputs["subject"])
        level_emb = self.level_embedding(inputs["level"])
        title_emb = self.title_embedding(title)
        numerical_emb = self.numerical_dense(numerical)

        # ⚡ Concatenate semua embedding jadi satu vector feature
        concat = tf.concat([course_emb, subject_emb, level_emb, title_emb, numerical_emb], axis=1)

        x = self.final_dense1(concat)
        return self.final_dense2(x)  # Output embedding final untuk course

## **3.2 Model Retrieval untuk Membangun Sistem Rekomendasi Berbasis Embedding**

In [15]:
class CourseRetrievalModel(tfrs.Model):
    def __init__(self, embedding_dim, subject_vocab, level_vocab, course_ids, candidate_dataset):
        super().__init__()
        # Model embedding course yang digunakan sebagai query dan kandidat
        self.course_model = CourseModel(embedding_dim, subject_vocab, level_vocab, course_ids)
        # Task retrieval dengan metric FactorizedTopK untuk efisiensi pencarian
        self.task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
            candidates=candidate_dataset.map(lambda x: self.course_model(x))  # Indeks kandidat embedding
        ))

    def compute_loss(self, features, training=False):
        # Hitung loss berdasarkan embedding query dan kandidat
        return self.task(
            query_embeddings=self.course_model(features),
            candidate_embeddings=self.course_model(features)
        )

## **3.3 Build Model untuk Hyperparameter Tuning**

In [16]:
# 🔧 Fungsi Build Model untuk Hyperparameter Tuning dengan Keras Tuner
def build_model(hp):
    # Pilihan dimensi embedding yang akan diuji (16, 32, 48, 64)
    embedding_dim = hp.Int('embedding_dim', 16, 64, step=16)

    # Pilihan optimizer: Adam atau Adagrad
    optimizer_choice = hp.Choice('optimizer', ['adam', 'adagrad'])

    # Pilihan learning rate dalam rentang logaritmik
    learning_rate = hp.Float('learning_rate', 1e-4, 1e-1, sampling='log')

    # Pilihan batch size yang akan diuji
    batch_size = hp.Choice('batch_size', [32, 64, 128])

    # Membuat model dengan hyperparameter yang dipilih tuner
    model = CourseRetrievalModel(
        embedding_dim=embedding_dim,
        subject_vocab=subject_vocab,
        level_vocab=level_vocab,
        course_ids=df_courses["course_id"].astype(str).tolist(),
        candidate_dataset=tf_all_courses
    )

    # Pilih optimizer dengan learning rate yang disesuaikan
    if optimizer_choice == 'adam':
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        optimizer = tf.keras.optimizers.Adagrad(learning_rate=learning_rate)

    # Compile model dengan optimizer tersebut
    model.compile(optimizer=optimizer)

    # Simpan batch_size di atribut model untuk penggunaan nanti
    model.batch_size = batch_size
    return model

## **3.4 Setup Callbacks untuk Optimasi & Monitoring Training**

In [17]:
checkpoint_dir = os.path.join(mainPath, 'Model/2. Content-Based Filtering/')
os.makedirs(checkpoint_dir, exist_ok=True)

# Callback standar untuk monitoring selama tuning
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='loss',  # 🚦 Pantau loss untuk hentikan training lebih awal jika stagnan
    patience=3,
    restore_best_weights=True  # 🔄 Kembalikan ke bobot terbaik saat selesai
)

lr_reducer = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='loss',  # 📉 Turunkan learning rate jika loss tidak berkurang
    factor=0.5,
    patience=2,
    verbose=1
)

checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(checkpoint_dir, 'best_model.keras'),  # 💾 Simpan model terbaik di path ini
    monitor='loss',
    save_best_only=True,
    save_weights_only=False,
    verbose=1
)

## **3.5 Custom and Inisialisasi Tuner untuk Hyperband**

In [18]:
# Custom Tuner Class untuk Hyperband dengan Batch Size Dinamis
class MyTuner(kt.Hyperband):
    def run_trial(self, trial, *args, **kwargs):
        batch_size = trial.hyperparameters.get('batch_size')  # 🎯 Ambil batch size dari hyperparameter
        train_ds = tf_train_courses.unbatch().batch(batch_size).shuffle(1000).cache()  # 🔄 Siapkan dataset dengan batch size dinamis
        kwargs['x'] = train_ds  # 🎛️ Override input dataset untuk fit()
        kwargs['epochs'] = 15   # ⏳ Set epoch tetap untuk tiap trial
        return super().run_trial(trial, *args, **kwargs)

# Inisialisasi Tuner dengan MyTuner Custom untuk Grid Search & Tuning Model
tuner = MyTuner(
    build_model,
    objective=kt.Objective('loss', direction='min'),  # 🔍 Minimalkan loss sebagai tujuan tuning
    max_epochs=15,
    factor=3,
    directory=checkpoint_dir + 'keras_tuner_dir_grid_search/',  # 📂 Direktori simpan hasil tuning
    project_name='content_based_filtering_tuning'  # 🏷️ Nama project tuning
)

Reloading Tuner from /content/drive/MyDrive/Colab Notebooks/Capstone - Course Recommender Systems/Model/2. Content-Based Filtering/keras_tuner_dir_grid_search/content_based_filtering_tuning/tuner0.json


## **3.6 Eksekusi Proses Tuning dengan Monitoring Waktu & Callbacks**

In [19]:
start_time = time.time()  # 🏁 Mulai hitung waktu training

# ⚠️ Jangan sertakan data training di sini, sudah di-handle di MyTuner.run_trial()
tuner.search(
    epochs=15,  # 🔄 Maksimal epoch tiap trial
    callbacks=[early_stop, lr_reducer, checkpoint]  # 🎛️ Callback penting selama tuning
)

end_time = time.time()  # 🏁 Akhiri hitung waktu
elapsed_time = end_time - start_time

# ⏳ Tampilkan durasi training dengan format friendly
minutes, seconds = divmod(elapsed_time, 60)
print("\n✅ Pencarian hyperparameter dengan Bayesian Optimization selesai.")
print(f"\n⏱️ Training selesai dalam {int(minutes)} menit {int(seconds)} detik.")


✅ Pencarian hyperparameter dengan Bayesian Optimization selesai.

⏱️ Training selesai dalam 0 menit 0 detik.


## **3.7 Model Terbaik dari Hasil Tuning**

In [20]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]  # 🔍 Ambil hyperparameter terbaik
print(f"\n✨ Hyperparameter terbaik ditemukan:")
print(f"  - Embedding Dimension: {best_hps.get('embedding_dim')}")
print(f"  - Learning Rate: {best_hps.get('learning_rate')}")
print(f"  - Optimizer: {best_hps.get('optimizer')}")

best_model = tuner.get_best_models(num_models=1)[0]  # 🏆 Muat model terbaik hasil tuning
print("\n🚀 Model terbaik berhasil dimuat.")


✨ Hyperparameter terbaik ditemukan:
  - Embedding Dimension: 48
  - Learning Rate: 0.0008901416500660483
  - Optimizer: adam

🚀 Model terbaik berhasil dimuat.


## **3.8 Model Evaluation**

In [21]:
def lookup_course(course_id_str):
    """🔍 Ambil fitur lengkap course berdasarkan course_id sebagai dictionary TensorFlow"""
    row = df_courses[df_courses["course_id"].astype(str) == course_id_str].iloc[0]  # 📄 Cari baris sesuai ID
    return {
        "course_id": tf.constant([str(row["course_id"])]),  # 🆔 ID course
        "course_title": tf.constant([str(row["course_title"])]),  # 📚 Judul course
        "subject": tf.constant([str(row["subject"])]),  # 🏷️ Kategori subject
        "level": tf.constant([str(row["level"])]),  # 🎯 Level course
        "price": tf.constant([row["price"]], dtype=tf.float32),  # 💰 Harga (normalisasi)
        "num_subscribers": tf.constant([row["num_subscribers"]], dtype=tf.float32),  # 👥 Jumlah subscriber
        "num_reviews": tf.constant([row["num_reviews"]], dtype=tf.float32),  # ⭐ Review count
        "num_lectures": tf.constant([row["num_lectures"]], dtype=tf.float32),  # 📖 Jumlah lecture
        "content_duration": tf.constant([row["content_duration"]], dtype=tf.float32),  # ⏳ Durasi konten
    }

### **3.8.1 Evaluate in Test Dataset**

In [22]:
# Evaluasi model di test set
results = best_model.evaluate(tf_test_courses, return_dict=True)

print("\n📈 Evaluation Metrics (Test Data):")
for metric_name, value in results.items():
    print(f"{metric_name}: {value:.4f}")


📈 Evaluation Metrics (Test Data):
factorized_top_k/top_1_categorical_accuracy: 0.3641
factorized_top_k/top_5_categorical_accuracy: 0.8587
factorized_top_k/top_10_categorical_accuracy: 0.9321
factorized_top_k/top_50_categorical_accuracy: 0.9946
factorized_top_k/top_100_categorical_accuracy: 1.0000
loss: 6.6648
regularization_loss: 0.0000
total_loss: 6.6648


### **3.8.2 Metric NDCG@10 Recall@10**

In [23]:
# === Ambil embeddings kandidat batch per batch ===
candidate_embeddings = []
for batch in tf_all_courses:
    emb = best_model.course_model(batch)
    candidate_embeddings.append(emb)
candidate_embeddings = tf.concat(candidate_embeddings, axis=0)
candidate_ids = np.array(df_courses["course_id"].astype(str).tolist())

# === Build index for retrieval ===
index = tfrs.layers.factorized_top_k.BruteForce(best_model.course_model)
index.index_from_dataset(
    tf_all_courses.map(lambda x: (x["course_id"], best_model.course_model(x)))
)

# === Define top-k ===
TOP_K = 20

# === Run inference on test queries ===
all_recall = []
all_ndcg = []

for batch in tf_test_courses:
    # Get ground truth course IDs in this batch
    true_ids = batch["course_id"].numpy().astype(str)

    # Query recommendations from index
    scores, recommended_ids = index(batch, k=TOP_K)

    for i in range(len(true_ids)):
        true_id = true_ids[i]
        recs = recommended_ids[i].numpy().astype(str)

        # Create relevance vector: 1 if recommended course == true course else 0
        relevance = [1 if rec == true_id else 0 for rec in recs]

        # Recall@k is 1 if true_id in top-k recs, else 0
        recall = int(true_id in recs)
        all_recall.append(recall)

        # NDCG@k requires 2D arrays
        ndcg = ndcg_score([relevance], [relevance])
        all_ndcg.append(ndcg)

# === Aggregate metrics ===
avg_recall = np.mean(all_recall)
avg_ndcg = np.mean(all_ndcg)

In [24]:
print(f"📊 Evaluation Results on Test Set:")
print(f" - Recall@{TOP_K}: {avg_recall:.4f}")
print(f" - NDCG@{TOP_K}: {avg_ndcg:.4f}")

📊 Evaluation Results on Test Set:
 - Recall@20: 0.9810
 - NDCG@20: 0.9810


### **3.8.3 Quick Manual Evaluation: Recommendation vs Ground Truth**

In [25]:
# Fungsi untuk dapat rekomendasi top-k course dari suatu course_id
def recommend_courses(course_id, index, k=10):
    query = lookup_course(course_id)
    scores, ids = index(query, k=k)
    recommended_ids = [id.decode('utf-8') for id in ids[0].numpy()]
    return recommended_ids, scores[0].numpy()

# Contoh course_id untuk query
query_course_id = df_courses["course_id"].astype(str).iloc[0]

In [26]:
def recommend_courses(course_id, index, k=10):
    # 🎯 Ambil rekomendasi top-k berdasarkan course_id
    query = lookup_course(course_id)
    scores, ids = index(query, k=k)
    recommended_ids = [id.decode('utf-8') for id in ids[0].numpy()]
    return recommended_ids, scores[0].numpy()

query_course_id = df_courses["course_id"].astype(str).iloc[0]  # 🔎 Course ID untuk uji coba
info = lookup_course(query_course_id)  # ℹ️ Ambil info course

print(f"ℹ️ Info Course ID {query_course_id}:")
for key, val in info.items():
    print(f" - {key}: {val}")

recommended_ids, scores = recommend_courses(query_course_id, index, k=10)  # 🚀 Dapatkan rekomendasi
print(f"\n🎯 Rekomendasi untuk Course ID {query_course_id}:")
for cid, score in zip(recommended_ids, scores):
    title = df_courses.loc[df_courses["course_id"].astype(str) == cid, 'course_title'].values[0]
    print(f" - {cid}: {title} | Skor: {score:.4f}")

query_subject = df_courses.loc[df_courses["course_id"].astype(str) == query_course_id, "subject"].values[0]  # 📚 Subject course query
ground_truth_ids = set(df_courses.loc[df_courses["subject"] == query_subject, "course_id"].astype(str).tolist())  # ✅ Ground truth (subject sama)

found_ids = set(recommended_ids).intersection(ground_truth_ids)  # 🔍 Rekomendasi yang ada di ground truth
percent_found = len(found_ids) / len(recommended_ids) * 100  # 📊 Persentase kecocokan

if percent_found == 100:
    print(f"\n✅ Semua rekomendasi ada di ground truth ({len(found_ids)}/{len(recommended_ids)}).")
else:
    missing = set(recommended_ids) - ground_truth_ids  # ❌ Rekomendasi yang tidak ada di ground truth
    print(f"\n⚠️ {len(found_ids)}/{len(recommended_ids)} rekomendasi ada di ground truth ({percent_found:.2f}%).")
    print("Course ID yang tidak ada di ground truth:")
    for cid in missing:
        print(f" - {cid}")

ℹ️ Info Course ID 1070968:
 - course_id: [b'1070968']
 - course_title: [b'Ultimate Investment Banking Course']
 - subject: [b'Business Finance']
 - level: [b'All Levels']
 - price: [1.]
 - num_subscribers: [0.0079837]
 - num_reviews: [0.00083804]
 - num_lectures: [0.06546855]
 - content_duration: [0.01910828]

🎯 Rekomendasi untuk Course ID 1070968:
 - 1185390: Como Produzir Ebook e Livros com Adobe Indesign CC e CS6 | Skor: 24.7674
 - 1070968: Ultimate Investment Banking Course | Skor: 23.2849
 - 965832: The Complete Investment Banking Course 2017 | Skor: 20.7592
 - 1202746: Curso Online de Adobe Illustrator CC e CS6 | Skor: 20.5425
 - 1209556: Hedge Fund Trading Systems Part Two - Stocks & ETFs | Skor: 19.1941
 - 975414: Contango VXX - ETF Options Trading - Double Your Investment | Skor: 18.2304
 - 944804: Coaching Bundle: Guitar for Music Educators: Part 1 | Skor: 17.5508
 - 1000010: Professional Ruby on Rails Developer with Rails 5 | Skor: 17.3140
 - 474212: High performance Stock T

## **3.9 Save Model, Index, dan Data Pendukung**

### **3.9.1 Simpan Model Terbaik ke Disk**

In [27]:
checkpoint_dir

'/content/drive/MyDrive/Colab Notebooks/Capstone - Course Recommender Systems/Model/2. Content-Based Filtering/'

In [28]:
#  Save model weights
best_model.save_weights(os.path.join(checkpoint_dir + "model weights/", "model_weights"))

In [29]:
os.path.join(checkpoint_dir, "title_vectorizer")

'/content/drive/MyDrive/Colab Notebooks/Capstone - Course Recommender Systems/Model/2. Content-Based Filtering/title_vectorizer'

### **3.9.2 Save Scaler for Numerical Feature**

In [38]:
with open(os.path.join(checkpoint_dir, "scaler.pkl"), "wb") as f:
    pickle.dump(scaler, f)

### **3.9.3 Simpan title_vectorizer dari Best Model**

In [31]:
# Ambil vectorizer dari model
vectorizer_layer = best_model.course_model.title_vectorizer

# Bungkus jadi model dummy
vectorizer_model = tf.keras.Sequential([vectorizer_layer])

# Simpan ke disk
vectorizer_path = os.path.join(checkpoint_dir, "title_vectorizer_model")
vectorizer_model.save(vectorizer_path)

print(f"✅ title_vectorizer berhasil disimpan ke {vectorizer_path}")



✅ title_vectorizer berhasil disimpan ke /content/drive/MyDrive/Colab Notebooks/Capstone - Course Recommender Systems/Model/2. Content-Based Filtering/title_vectorizer_model


### **3.9.4 Simpan Data Course IDs Vocabulary**

In [32]:
# Simpan list course_ids untuk referensi saat inferensi.
with open(os.path.join(checkpoint_dir, "course_ids.pkl"), "wb") as f:
    pickle.dump(df_courses["course_id"].astype(str).tolist(), f)

# Save subject
with open(os.path.join(checkpoint_dir, "subject_vocab.pkl"), "wb") as f:
    pickle.dump(subject_vocab, f)
# Save level vocab
with open(os.path.join(checkpoint_dir, "level_vocab.pkl"), "wb") as f:
    pickle.dump(level_vocab, f)

## **3.10 Load dan Inference**

In [33]:
mainPath =  '/content/drive/MyDrive/Colab Notebooks/Capstone - Course Recommender Systems/' # Path of the main project
checkpoint_dir = os.path.join(mainPath, 'Model/2. Content-Based Filtering/')

In [34]:
# Load necessary components to initialize the model
with open(os.path.join(checkpoint_dir, "course_ids.pkl"), "rb") as f:
    loaded_course_ids = pickle.load(f)

# Load subject and level vocab if needed (if they were saved)
with open(os.path.join(checkpoint_dir, "subject_vocab.pkl"), "rb") as f:
    loaded_subject_vocab = pickle.load(f)

with open(os.path.join(checkpoint_dir, "level_vocab.pkl"), "rb") as f:
    loaded_level_vocab = pickle.load(f)

In [35]:
# Now load the model by passing the correct arguments
loaded_model = CourseModel(
    embedding_dim=64,  # match the embedding_dim used during training
    subject_vocab=loaded_subject_vocab,  # load subject vocab from the saved file
    level_vocab=loaded_level_vocab,  # load level vocab from the saved file
    course_ids=loaded_course_ids  # load course IDs from the saved file
)

# Compile the loaded model as done during training
loaded_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0008901416500660483))

# Load the model weights
loaded_model.load_weights(os.path.join(checkpoint_dir + "model weights/", "model_weights"))

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7886b2f91f10>

In [36]:
loaded_index = tfrs.layers.factorized_top_k.BruteForce(loaded_model)
loaded_index.index_from_dataset(
    tf_all_courses.map(lambda x: (x["course_id"], loaded_model(x)))
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7886b2fdb6d0>

### **3.10.1 Inference with Course ID**

In [37]:
# --------------------------------------------------
# SISTEM REKOMENDASI (BUILD INDEX)

def lookup_course(course_id_str):
    """Ambil dictionary fitur untuk 1 course berdasarkan ID string"""
    row = df_courses[df_courses["course_id"].astype(str) == course_id_str].iloc[0]
    return {
        "course_id": tf.constant([str(row["course_id"])]),
        "course_title": tf.constant([str(row["course_title"])]),
        "subject": tf.constant([str(row["subject"])]),
        "level": tf.constant([str(row["level"])]),
        "price": tf.constant([row["price"]], dtype=tf.float32),
        "num_subscribers": tf.constant([row["num_subscribers"]], dtype=tf.float32),
        "num_reviews": tf.constant([row["num_reviews"]], dtype=tf.float32),
        "num_lectures": tf.constant([row["num_lectures"]], dtype=tf.float32),
        "content_duration": tf.constant([row["content_duration"]], dtype=tf.float32),
    }

print("✅ Sistem rekomendasi telah siap digunakan!")

# --------------------------------------------------
# INTERAKSI DENGAN USER UNTUK MEMASUKKAN COURSE ID

while True:
    print("\n📥 Masukkan `course_id` untuk melihat rekomendasi (atau ketik 'exit' untuk keluar):")
    user_input = input(">> ")

    if user_input.lower() == "exit":
        print("\n👋 Terima kasih! Program selesai.")
        break

    # Validasi input course ID
    if user_input not in df_courses["course_id"].astype(str).values:
        print("⚠️  Course ID tidak ditemukan. Coba lagi dengan ID yang tersedia.\n")
        print("🆔 Course ID yang tersedia:", df_courses['course_id'].tolist())
        continue

    # Tampilkan info course yang dijadikan query
    course_row = df_courses[df_courses["course_id"].astype(str) == user_input].iloc[0]
    print("\n🔍 Course yang dijadikan dasar rekomendasi:")
    print(f"   📘 ID        : {course_row['course_id']}")
    print(f"   📚 Title     : {course_row['course_title']}")
    print(f"   🧩 Subject   : {course_row['subject']}")
    print(f"   🎯 Level     : {course_row['level']}")
    print(f"   💰 Price     : {course_row['price']}")
    print(f"   👥 Subscribers: {course_row['num_subscribers']}\n")

    # Cari rekomendasi
    query_features = lookup_course(user_input)
    scores, ids = loaded_index(query_features, k=len(df_courses))

    print("📎 Rekomendasi berdasarkan course tersebut:")
    for score, course_id in zip(scores[0].numpy(), ids[0].numpy()):
        course_id_str = course_id.decode('utf-8')
        recommended = df_courses[df_courses["course_id"].astype(str) == course_id_str].iloc[0]
        print(f"   🔗 Course ID: {course_id_str} | 📘 Title: {recommended['course_title']} | ⭐ Skor: {score:.4f}")

✅ Sistem rekomendasi telah siap digunakan!

📥 Masukkan `course_id` untuk melihat rekomendasi (atau ketik 'exit' untuk keluar):
>> 1004512

🔍 Course yang dijadikan dasar rekomendasi:
   📘 ID        : 1004512
   📚 Title     : Clipping Masks & Shaped images in Adobe Photoshop CC
   🧩 Subject   : Graphic Design
   🎯 Level     : All Levels
   💰 Price     : 0.45
   👥 Subscribers: 0.0070838120949119265

📎 Rekomendasi berdasarkan course tersebut:
   🔗 Course ID: 801702 | 📘 Title: Geld verdienen mit dem Ticken der Uhr, wie Warren Buffett! | ⭐ Skor: 13.4978
   🔗 Course ID: 1039930 | 📘 Title: Wie Du, Dir auch mit kleinen Beträgen ein Vermögen aufbaust | ⭐ Skor: 13.0085
   🔗 Course ID: 1066002 | 📘 Title: Leve a sua técnica de pedal duplo para um novo patamar | ⭐ Skor: 12.5109
   🔗 Course ID: 286898 | 📘 Title: Aprende a tocar el Acordeón 'de oído y con técnica",https://www.udemy.com/aprende-a-tocar-el-acordeon-de-oido-y-con-tecnica/,true,25,93,21,34,Beginner Level,4 hours,2014-09-16T19:51:45Z
26343

### **3.10.2 Inference with Course Tilte and Matching System**

In [None]:
def get_best_match(title_input, title_list, cutoff=0.5):
    """
    Cari judul course terdekat berdasarkan input, mirip autocomplete/fuzzy.
    """
    matches = difflib.get_close_matches(title_input.lower(), [t.lower() for t in title_list], n=1, cutoff=cutoff)
    if matches:
        matched_title = matches[0]
        for original in title_list:
            if original.lower() == matched_title:
                return original
    else:
        # Kalau nggak nemu fuzzy match, coba keyword search
        for title in title_list:
            if title_input.lower() in title.lower():
                return title
    return None

In [None]:
def lookup_course_by_title(course_title_str):
    """Ambil dictionary fitur untuk 1 course berdasarkan judul"""
    row = df_courses[df_courses["course_title"].str.lower().str.strip() == course_title_str.lower().strip()].iloc[0]
    return {
        "course_id": tf.constant([str(row["course_id"])]),
        "course_title": tf.constant([str(row["course_title"])]),
        "subject": tf.constant([str(row["subject"])]),
        "level": tf.constant([str(row["level"])]),
        "price": tf.constant([row["price"]], dtype=tf.float32),
        "num_subscribers": tf.constant([row["num_subscribers"]], dtype=tf.float32),
        "num_reviews": tf.constant([row["num_reviews"]], dtype=tf.float32),
        "num_lectures": tf.constant([row["num_lectures"]], dtype=tf.float32),
        "content_duration": tf.constant([row["content_duration"]], dtype=tf.float32),
    }

In [None]:
while True:
    print("\n📥 Masukkan *judul course* untuk melihat rekomendasi (atau ketik 'exit'):")
    user_input = input(">> ")

    if user_input.lower() == "exit":
        print("\n👋 Terima kasih! Program selesai.")
        break

    # Validasi: apakah judul ada di data
    matched_rows = df_courses[df_courses["course_title"].str.lower().str.strip() == user_input.lower().strip()]
    if matched_rows.empty:
        print("⚠️  Judul course tidak ditemukan. Coba lagi.\n")
        print("📝 Judul yang tersedia:")
        for title in df_courses["course_title"]:
            print(f" - {title}")
        continue

    # Ambil course row & fitur
    row = matched_rows.iloc[0]
    query_features = lookup_course_by_title(row["course_title"])

    # Tampilkan info course
    print("\n🔍 Course yang dijadikan dasar rekomendasi:")
    print(f"   📘 ID        : {row['course_id']}")
    print(f"   📚 Title     : {row['course_title']}")
    print(f"   🧩 Subject   : {row['subject']}")
    print(f"   🎯 Level     : {row['level']}")
    print(f"   💰 Price     : {row['price']}")
    print(f"   👥 Subscribers: {row['num_subscribers']}\n")

    # Dapatkan skor dan ID untuk rekomendasi
    scores, ids = loaded_index(query_features, k=len(df_courses))


    # Menghitung jumlah rekomendasi yang diinginkan oleh user
    print(f"\nKami memiliki sebanyak {len(ids[0])} rekomendasi untukmu!")
    print(f"Berapa banyak rekomendasi yang ingin kamu lihat? (Masukkan angka antara 1 sampai {len(ids[0])}):")
    num_recommendations = int(input(">> "))
    if num_recommendations > len(ids[0]):
        num_recommendations = len(ids[0])  # Jika input lebih banyak dari yang tersedia, sesuaikan

    print(f"\n🔎 Rekomendasi berdasarkan course tersebut (Menampilkan {num_recommendations} rekomendasi terbaik):")

    # Filter hanya yang memiliki skor positif
    recommended_ids = []
    recommended_scores = []
    for score, course_id in zip(scores[0].numpy(), ids[0].numpy()):
        if score > 0:  # Hanya menampilkan skor positif
            recommended_ids.append(course_id.decode('utf-8'))
            recommended_scores.append(score)

    # Jika rekomendasi lebih sedikit dari permintaan, sesuaikan
    for i in range(min(num_recommendations, len(recommended_ids))):
        course_id_str = recommended_ids[i]
        recommended = df_courses[df_courses["course_id"].astype(str) == course_id_str].iloc[0]
        print(f"   🔗 Course ID: {course_id_str} | 📘 Title: {recommended['course_title']} | ⭐ Skor: {recommended_scores[i]:.4f}")

    if len(recommended_ids) == 0:
        print("⚠️ Tidak ada rekomendasi dengan skor positif.")


📥 Masukkan *judul course* untuk melihat rekomendasi (atau ketik 'exit'):
>> Clipping Masks & Shaped images in Adobe Photoshop CC

🔍 Course yang dijadikan dasar rekomendasi:
   📘 ID        : 1004512
   📚 Title     : Clipping Masks & Shaped images in Adobe Photoshop CC
   🧩 Subject   : Graphic Design
   🎯 Level     : All Levels
   💰 Price     : 0.45
   👥 Subscribers: 0.0070838120949119265


Kami memiliki sebanyak 3672 rekomendasi untukmu!
Berapa banyak rekomendasi yang ingin kamu lihat? (Masukkan angka antara 1 sampai 3672):
>> 50

🔎 Rekomendasi berdasarkan course tersebut (Menampilkan 50 rekomendasi terbaik):
   🔗 Course ID: 149716 | 📘 Title: Learn to Play Fernando Sor's 'Study in B minor",https://www.udemy.com/study-in-b-minor/,true,115,1403,5,9,Intermediate Level,43 mins,2014-01-27T20:58:16Z
398746,Piano: The Chord Based System - Learn To Play As The Pros Do" | ⭐ Skor: 28.5033
   🔗 Course ID: 801702 | 📘 Title: Geld verdienen mit dem Ticken der Uhr, wie Warren Buffett! | ⭐ Skor: 24.46