File ini di luar keperluan submisi dan hanya digunakan untuk eksperimen.

# Modelling

In [2]:
import os
import pandas as pd

# Resolve the preprocessed CSV against the notebook's current working directory
# and echo the resulting path so the data source is visible in the output.
base_dir = os.getcwd()
file_path = os.path.join(base_dir, "nilai_mahasiswa-preprocessed.csv")
print(f"✅ File CSV: {file_path}")

# Read the dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

✅ File CSV: c:\Users\ADVAN\Documents\Dicoding\msml\SMSML_LeoPrangsT\Membangun_model\nilai_mahasiswa-preprocessed.csv


## Tensor CF

In [3]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scikeras.wrappers import KerasRegressor

# --- Split data
# Inputs are the (user, item) id pairs and the target is the rating; 20% of the
# rows are held out for testing, with a fixed seed so the split is repeatable.
X, y = df[["user", "item"]].values, df["rating"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# --- Model builder (takes a single 2-column array of [user, item] ids)
def build_ncf_model(n_users, n_items, embed_dim=16, hidden=(32, 16, 8), lr=0.001):
    """Build and compile a neural collaborative-filtering regressor.

    Args:
        n_users: Number of rows in the user embedding table. Every user id fed
            to the model must lie in ``[0, n_users)``.
        n_items: Number of rows in the item embedding table. Every item id fed
            to the model must lie in ``[0, n_items)``.
        embed_dim: Width of each embedding vector.
        hidden: Iterable of unit counts, one ReLU ``Dense`` layer per entry,
            applied in order.
        lr: Learning rate for the Adam optimizer. The default of 0.001 is
            Keras' own Adam default, so omitting it is equivalent to compiling
            with ``optimizer="adam"``.

    Returns:
        A compiled ``keras.Model`` that maps a ``(batch, 2)`` array of
        ``[user, item]`` ids to a ``(batch, 1)`` prediction. The output layer
        is a sigmoid, so predictions are confined to (0, 1); this presumably
        relies on ratings having been scaled to that range upstream.
    """
    # Declare the ids as integers. With the default float32 input dtype the
    # ids would travel through the graph as floats, which are not exact
    # above 2**24.
    inputs = keras.Input(shape=(2,), dtype="int32", name="user_item_input")

    # Column 0 holds the user id and column 1 the item id; each slice has
    # shape (batch,).
    user_ids = layers.Lambda(lambda t: t[:, 0])(inputs)
    item_ids = layers.Lambda(lambda t: t[:, 1])(inputs)

    # Embedding a rank-1 id tensor yields (batch, embed_dim) directly, so no
    # Reshape/Flatten round trip is needed.
    user_vec = layers.Embedding(n_users, embed_dim)(user_ids)
    item_vec = layers.Embedding(n_items, embed_dim)(item_ids)

    # Concatenate both embeddings and feed them through the MLP tower.
    x = layers.Concatenate()([user_vec, item_vec])
    for units in hidden:
        x = layers.Dense(units, activation="relu")(x)

    output = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs=inputs, outputs=output)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr), loss="mse")
    return model

# --- Model setup
# Seed the Python, NumPy and Keras backend RNGs so that weight initialisation
# and training are repeatable from one run to the next.
keras.utils.set_random_seed(42)

# Size each embedding table from the largest id rather than from the number of
# distinct ids: an Embedding layer requires every index to be < input_dim,
# which nunique() only guarantees when the ids are contiguous and 0-based.
# For such ids both expressions give the same value.
# NOTE(review): assumes ids are non-negative integers — confirm against the
# preprocessing step.
n_users = int(df["user"].max()) + 1
n_items = int(df["item"].max()) + 1

model = KerasRegressor(
    model=lambda: build_ncf_model(n_users, n_items),
    epochs=10,
    batch_size=32,
    verbose=1
)

# --- Train
model.fit(X_train, y_train)

# --- Predict
y_pred = model.predict(X_test)

# --- Evaluate: root mean squared error on the held-out split
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"✅ RMSE: {rmse:.4f}")


Epoch 1/10
[1m628/628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.0589
Epoch 2/10
[1m628/628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.0317
Epoch 3/10
[1m628/628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.0284
Epoch 4/10
[1m628/628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0267
Epoch 5/10
[1m628/628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.0251
Epoch 6/10
[1m628/628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.0242
Epoch 7/10
[1m628/628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.0235
Epoch 8/10
[1m628/628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.0229
Epoch 9/10
[1m628/628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.0225
Epoch 10/10
[1m628/628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - l

# Modelling Tuning

## Tensor CF

In [3]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from scikeras.wrappers import KerasRegressor

# --- Split data
# Inputs are the (user, item) id pairs and the target is the rating; 20% of the
# rows are held out for testing, with a fixed seed so the split is repeatable.
X, y = df[["user", "item"]].values, df["rating"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# --- Model builder (takes a single 2-column array of [user, item] ids)
def build_ncf_model(n_users, n_items, embed_dim=16, hidden=(32, 16, 8), lr=0.001):
    """Build and compile a neural collaborative-filtering regressor.

    Args:
        n_users: Number of rows in the user embedding table. Every user id fed
            to the model must lie in ``[0, n_users)``.
        n_items: Number of rows in the item embedding table. Every item id fed
            to the model must lie in ``[0, n_items)``.
        embed_dim: Width of each embedding vector.
        hidden: Iterable of unit counts, one ReLU ``Dense`` layer per entry,
            applied in order.
        lr: Learning rate for the Adam optimizer.

    Returns:
        A compiled ``keras.Model`` that maps a ``(batch, 2)`` array of
        ``[user, item]`` ids to a ``(batch, 1)`` prediction. The output layer
        is a sigmoid, so predictions are confined to (0, 1); this presumably
        relies on ratings having been scaled to that range upstream.
    """
    # Declare the ids as integers. With the default float32 input dtype the
    # ids would travel through the graph as floats, which are not exact
    # above 2**24.
    inputs = keras.Input(shape=(2,), dtype="int32", name="user_item_input")

    # Column 0 holds the user id and column 1 the item id; each slice has
    # shape (batch,).
    user_ids = layers.Lambda(lambda t: t[:, 0])(inputs)
    item_ids = layers.Lambda(lambda t: t[:, 1])(inputs)

    # Embedding a rank-1 id tensor yields (batch, embed_dim) directly, so no
    # Reshape/Flatten round trip is needed.
    user_vec = layers.Embedding(n_users, embed_dim)(user_ids)
    item_vec = layers.Embedding(n_items, embed_dim)(item_ids)

    # Concatenate both embeddings and feed them through the MLP tower.
    x = layers.Concatenate()([user_vec, item_vec])
    for units in hidden:
        x = layers.Dense(units, activation="relu")(x)

    output = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs=inputs, outputs=output)

    optimizer = keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss="mse")
    return model


# --- Reproducibility
# Seed the Python, NumPy and Keras backend RNGs so that the sequence of fits
# performed by the grid search is repeatable from one run to the next.
keras.utils.set_random_seed(42)

# --- Embedding table sizes
# Use the largest id + 1 rather than the number of distinct ids: an Embedding
# layer requires every index to be < input_dim, which nunique() only
# guarantees when the ids are contiguous and 0-based. For such ids both
# expressions give the same value.
# NOTE(review): assumes ids are non-negative integers — confirm against the
# preprocessing step.
n_users = int(df["user"].max()) + 1
n_items = int(df["item"].max()) + 1

# --- Wrap the builder in a KerasRegressor
# The lambda exposes only the tunable arguments; the grid supplies them through
# the "model__" prefixed keys below.
regressor = KerasRegressor(
    model=lambda embed_dim, hidden, lr: build_ncf_model(
        n_users=n_users, n_items=n_items, embed_dim=embed_dim, hidden=hidden, lr=lr
    ),
    epochs=10,
    batch_size=32,
    verbose=0
)

# --- Hyperparameter grid
# Full grid, kept for reference (took about 24 minutes):
# param_grid = {
#     "model__embed_dim": [8, 16, 32],
#     "model__hidden": [[64, 32, 16], [32, 16, 8]],
#     "model__lr": [0.001, 0.005],
#     "batch_size": [32, 64],
#     "epochs": [5, 10]
# }
# --- Hyperparameter grid (fast version, roughly 3 minutes)
param_grid = {
    "model__embed_dim": [16],              # single mid-range value
    "model__hidden": [[64, 32, 16], [32, 16, 8]],  # two candidate architectures
    "model__lr": [0.001],                  # single learning rate
    "batch_size": [32],                    # single batch size
    "epochs": [5, 8]                       # two short training lengths
}

# --- Grid search
# 3-fold cross-validation; sklearn maximises the score, hence negated MSE.
grid = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=3,
    verbose=2
)

grid_result = grid.fit(X_train, y_train)

# --- Best configuration
print(f"Best Params: {grid_result.best_params_}")
print(f"Best Score (neg MSE): {grid_result.best_score_:.4f}")

# --- Evaluate the refitted best estimator on the held-out test set
best_model = grid_result.best_estimator_
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"✅ Test RMSE: {rmse:.4f}")

Fitting 3 folds for each of 4 candidates, totalling 12 fits

[CV] END batch_size=32, epochs=5, model__embed_dim=16, model__hidden=[64, 32, 16], model__lr=0.001; total time=  13.8s
[CV] END batch_size=32, epochs=5, model__embed_dim=16, model__hidden=[64, 32, 16], model__lr=0.001; total time=  11.1s
[CV] END batch_size=32, epochs=5, model__embed_dim=16, model__hidden=[64, 32, 16], model__lr=0.001; total time=   8.6s
[CV] END batch_size=32, epochs=5, model__embed_dim=16, model__hidden=[32, 16, 8], model__lr=0.001; total time=   7.5s
[CV] END batch_size=32, epochs=5, model__embed_dim=16, model__hidden=[32, 16, 8], model__lr=0.001; total time=   7.5s
[CV] END batch_size=32, epochs=5, model__embed_dim=16, model__hidden=[32, 16, 8], model__lr=0.001; total time=   8.4s
[CV] END batch_size=32, epochs=8, model__embed_dim=16, model__hidden=[64, 32, 16], model__lr=0.001; total time=  12.8s
[CV] END batch_size=32, epochs=8, model__embed_dim=16, model__hidden=[64, 32, 16], model__lr=0.001; total tim