Library yang perlu diinstal

In [None]:
!pip install lightfm
!pip install streamlit
!pip install sentence-transformers
!pip install lightfm sentence-transformers scikit-learn


Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m307.2/316.4 kB[0m [31m10.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp311-cp311-linux_x86_64.whl size=829309 sha256=48fbcbe06cfc146f62544af6f8488c2d05b05e7a0281ed7e9b4b08b42feb316b
  Stored in directory: /root/.cache/pip/wheels/b9/0d/8a/0729d2e6e3ca2a898ba55201f905da7db3f838a33df5b3fcdd
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17
Collecting streamlit
 

In [None]:
# Mount Google Drive at /content/drive so the cells below can read the CSV
# and write their pickle/JSON artefacts under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Menyimpan Mapping User dan Item dari Dataset LightFM.

In [None]:
import pandas as pd
import pickle
from lightfm.data import Dataset

# 1. Read the Webtoon catalogue and keep only rows where every needed column is present
file_path = '/content/drive/MyDrive/Colab Notebooks/AOL ML/webtoon_originals_en.csv'
df = pd.read_csv(file_path)[
    ['title', 'genre', 'authors', 'subscribers', 'views', 'likes', 'rating', 'synopsis']
].dropna()

# 2. Simulated user ids: user_0 ... user_99
num_users = 100
user_ids = [f"user_{i}" for i in range(num_users)]

# 3. Register the users and the item titles with a fresh LightFM Dataset
dataset = Dataset()
dataset.fit(users=user_ids, items=df['title'])

# 4. dataset.mapping() yields four objects; the first and third are kept as the
#    user and item mappings and pickled to Google Drive.
user_mapping, _, item_mapping, _ = dataset.mapping()

for pickle_path, mapping in (
    ("/content/drive/MyDrive/Colab Notebooks/AOL ML/user_mapping.pkl", user_mapping),
    ("/content/drive/MyDrive/Colab Notebooks/AOL ML/item_mapping.pkl", item_mapping),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(mapping, f)

print("✅ user_mapping.pkl dan item_mapping.pkl berhasil disimpan ke Google Drive.")


✅ user_mapping.pkl dan item_mapping.pkl berhasil disimpan ke Google Drive.


Generate Embedding & Feature Files

In [None]:
# generate_pickle_files.py
import pandas as pd
import numpy as np
import pickle
import re
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler

# Read the catalogue, select the columns used below, and drop incomplete rows
file_path = '/content/drive/MyDrive/Colab Notebooks/AOL ML/webtoon_originals_en.csv'
df = pd.read_csv(file_path)[
    ['title', 'genre', 'authors', 'subscribers', 'views', 'likes', 'rating', 'synopsis']
].dropna()

# Clean text
def clean_text(text):
    """Return ``text`` as a lower-cased string containing only ASCII letters,
    digits and single spaces.

    The value is converted with ``str()`` first, every character that is not
    an ASCII letter, digit or whitespace is removed (not replaced), runs of
    whitespace collapse to one space, and the ends are trimmed.
    """
    lowered = str(text).lower()
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    return single_spaced.strip()

# Normalise the free-text columns. The fillna() calls are a safety net only:
# rows with missing values were already dropped when the CSV was loaded.
df['genre'] = df['genre'].apply(clean_text)
df['authors'] = df['authors'].fillna('unknown').apply(clean_text)
df['synopsis'] = df['synopsis'].fillna('').apply(clean_text)

# Min-max scale views/likes/subscribers and blend them (plus rating / 10)
# into a single implicit score (optional but recommended).
scaler = MinMaxScaler()
df[['views', 'likes', 'subscribers']] = scaler.fit_transform(df[['views', 'likes', 'subscribers']])
df['implicit_score'] = df['views'] * 0.5 + df['likes'] * 0.3 + df['subscribers'] * 0.2 + 0.1 * (df['rating'] / 10)

# Embed the synopses. No device is forced so the cell also runs on a CPU-only
# runtime; SentenceTransformer selects a GPU on its own when one is available.
model_bert = SentenceTransformer('all-MiniLM-L6-v2')
synopsis_embeddings = model_bert.encode(df['synopsis'].tolist(), batch_size=128, show_progress_bar=True)

# Persist the embeddings. This write was missing even though the summary below
# reports the file as saved, so later cells had to recompute the embeddings.
np.save('/content/drive/MyDrive/Colab Notebooks/AOL ML/synopsis_embeddings.npy', synopsis_embeddings)

# Save top_items (titles only, with a fresh 0..n-1 index)
top_items = df[['title']].reset_index(drop=True)
with open('/content/drive/MyDrive/Colab Notebooks/AOL ML/top_items.pkl', 'wb') as f:
    pickle.dump(top_items, f)

# Save item_features (genre + author + embedding dimension names)
df['item_features'] = df['genre'] + ',' + df['authors']
embedding_features = [f'synopsis_dim_{i}' for i in range(synopsis_embeddings.shape[1])]
embedding_df = pd.DataFrame(synopsis_embeddings, columns=embedding_features)
df = pd.concat([df.reset_index(drop=True), embedding_df.reset_index(drop=True)], axis=1)
# NOTE(review): the same synopsis_dim_* names are appended to every row, so these
# tokens do not carry the per-item embedding values - confirm this is intended.
if embedding_features:
    # One vectorised concatenation instead of one pass over the column per dimension.
    df['item_features'] = df['item_features'] + ',' + ','.join(embedding_features)

item_features = df[['title', 'item_features']]
with open('/content/drive/MyDrive/Colab Notebooks/AOL ML/item_features.pkl', 'wb') as f:
    pickle.dump(item_features, f)

print("✅ Files generated and saved to Google Drive:")
print("- top_items.pkl")
print("- item_features.pkl")
print("- synopsis_embeddings.npy")


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

✅ Files generated and saved to Google Drive:
- top_items.pkl
- item_features.pkl
- synopsis_embeddings.npy


Training & Hyperparameter Tuning Untuk LightFM

In [None]:
# grid_search_optimized.py
import pandas as pd
import numpy as np
import random
import re
import pickle
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, auc_score
from lightfm.cross_validation import random_train_test_split
from sklearn.preprocessing import MinMaxScaler
from sentence_transformers import SentenceTransformer
from itertools import product
import os
import json

# 1. Seed NumPy's and Python's global random number generators
np.random.seed(42)
random.seed(42)

# 2. Read the catalogue, select the columns used below, and drop incomplete rows
file_path = '/content/drive/MyDrive/Colab Notebooks/AOL ML/webtoon_originals_en.csv'
df = pd.read_csv(file_path)[
    ['title', 'genre', 'authors', 'subscribers', 'views', 'likes', 'rating', 'synopsis']
].dropna()

# 3. Clean text fields
def clean_text(text):
    """Normalise a value to lower-case ASCII letters, digits and single spaces.

    ``text`` is passed through ``str()``, lower-cased, stripped of every
    character outside ``[a-zA-Z0-9\\s]``, and its whitespace runs are
    collapsed to one space with leading/trailing whitespace removed.
    """
    lowered = str(text).lower()
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    return single_spaced.strip()

# Normalise the free-text columns. The fillna() calls are a safety net only:
# rows with missing values were already dropped when the CSV was loaded.
df['genre'] = df['genre'].apply(clean_text)
df['authors'] = df['authors'].fillna('unknown').apply(clean_text)
df['synopsis'] = df['synopsis'].fillna('').apply(clean_text)

# 4. Implicit score: min-max scaled popularity columns blended with rating / 10
scaler = MinMaxScaler()
df[['views', 'likes', 'subscribers']] = scaler.fit_transform(df[['views', 'likes', 'subscribers']])
df['implicit_score'] = df['views'] * 0.5 + df['likes'] * 0.3 + df['subscribers'] * 0.2 + 0.1 * (df['rating'] / 10)

# Seed passed explicitly to the train/test split and to every model below, so
# results do not depend on how LightFM treats random_state=None.
SEED = 42

# 5. Simulate user interactions based on genre preference
num_users = 100
user_ids = [f"user_{i}" for i in range(num_users)]
webtoons = df['title'].tolist()
interaction_data = []

# Score of the first row for each title, built once instead of filtering the
# whole DataFrame for every simulated interaction.
score_by_title = {}
for known_title, known_score in zip(df['title'], df['implicit_score']):
    score_by_title.setdefault(known_title, known_score)

preferred_genres = ['romance', 'action', 'comedy', 'fantasy']

# Candidate titles per genre are the same for every user with that genre, so
# compute them once. Genres with fewer than 10 titles fall back to the full list.
titles_by_genre = {}
for genre in preferred_genres:
    genre_titles = df[df['genre'].str.contains(genre)]['title'].tolist()
    titles_by_genre[genre] = genre_titles if len(genre_titles) >= 10 else webtoons

for i, user in enumerate(user_ids):
    genre = preferred_genres[i % len(preferred_genres)]
    preferred_titles = titles_by_genre[genre]
    liked = random.sample(preferred_titles, min(80, len(preferred_titles)))
    for title in liked:
        interaction_data.append((user, title, score_by_title[title]))

# 6. Load or create synopsis embeddings
embed_path = '/content/drive/MyDrive/Colab Notebooks/AOL ML/synopsis_embeddings.npy'
synopsis_embeddings = np.load(embed_path) if os.path.exists(embed_path) else None
# Recompute when there is no cache or when the cached array has a different
# number of rows than the current DataFrame (it would otherwise be misaligned).
if synopsis_embeddings is None or synopsis_embeddings.shape[0] != len(df):
    # No device is forced: SentenceTransformer picks a GPU when one is available.
    model_bert = SentenceTransformer('all-MiniLM-L6-v2')
    synopsis_embeddings = model_bert.encode(df['synopsis'].tolist(), batch_size=128, show_progress_bar=True)
    np.save(embed_path, synopsis_embeddings)

# 7. Dataset setup
dataset = Dataset()
dataset.fit(users=user_ids, items=df['title'])

df['item_features'] = df['genre'] + ',' + df['authors']
embedding_features = [f'synopsis_dim_{i}' for i in range(synopsis_embeddings.shape[1])]
embedding_df = pd.DataFrame(synopsis_embeddings, columns=embedding_features)
df = pd.concat([df.reset_index(drop=True), embedding_df.reset_index(drop=True)], axis=1)

# NOTE(review): the same synopsis_dim_* names are appended to every row, so these
# tokens do not carry the per-item embedding values - confirm this is intended.
if embedding_features:
    df['item_features'] = df['item_features'] + ',' + ','.join(embedding_features)

all_item_features = set()
for feats in df['item_features']:
    all_item_features.update(feats.split(','))

# Build interactions & item features
dataset.fit_partial(items=df['title'], item_features=all_item_features)
# NOTE(review): the second return value (built from the implicit scores) is not
# passed to model.fit below, so the scores do not influence training - confirm.
interactions, interaction_weights = dataset.build_interactions(interaction_data)
item_features = dataset.build_item_features(((title, feats.split(',')) for title, feats in zip(df['title'], df['item_features'])))

# 8. Train/test split
train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=np.random.RandomState(SEED))

# 9. Grid Search
params = {
    'no_components': [64, 128],
    'learning_rate': [0.03, 0.05],
    'loss': ['bpr', 'warp']
}

results = []
best_auc = float('-inf')
best_precision = float('-inf')
best_model = None
best_config = None

print("\n[Running Grid Search]")
for nc, lr, loss_fn in product(params['no_components'], params['learning_rate'], params['loss']):
    model = LightFM(loss=loss_fn, learning_rate=lr, no_components=nc, random_state=SEED)
    model.fit(train, item_features=item_features, epochs=50, num_threads=2)

    precision = float(precision_at_k(model, test, train_interactions=train, item_features=item_features, k=10).mean())
    auc = float(auc_score(model, test, train_interactions=train, item_features=item_features).mean())

    config = {'components': nc, 'lr': lr, 'loss': loss_fn, 'precision': precision, 'auc': auc}
    results.append(config)

    print(f"Config: {config}")

    # Rank by precision@10, using AUC to break ties. Requiring both metrics to
    # improve at once made the winner depend on the order of the configurations
    # and could discard a model that was better on one metric.
    if (precision, auc) > (best_precision, best_auc):
        best_auc = auc
        best_precision = precision
        best_model = model
        best_config = config

# 10. Save best model & results
if best_model is not None:
    with open("/content/drive/MyDrive/Colab Notebooks/AOL ML/best_webtoon_model.pkl", "wb") as f:
        pickle.dump(best_model, f)

    with open("/content/drive/MyDrive/Colab Notebooks/AOL ML/grid_results.json", "w") as f:
        json.dump(results, f, indent=2)

    print("\n✅ [Best Config]")
    print(f"Precision@10: {best_precision:.4f}, AUC: {best_auc:.4f}")
    print(f"Config: {best_config}")
else:
    print("\n❌ No suitable model found.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]


[Running Grid Search]
Config: {'components': 64, 'lr': 0.03, 'loss': 'bpr', 'precision': 0.1420000046491623, 'auc': 0.5368591547012329}
Config: {'components': 64, 'lr': 0.03, 'loss': 'warp', 'precision': 0.5019999742507935, 'auc': 0.9780001044273376}
Config: {'components': 64, 'lr': 0.05, 'loss': 'bpr', 'precision': 0.14499999582767487, 'auc': 0.5372779965400696}
Config: {'components': 64, 'lr': 0.05, 'loss': 'warp', 'precision': 0.5040000081062317, 'auc': 0.978089451789856}
Config: {'components': 128, 'lr': 0.03, 'loss': 'bpr', 'precision': 0.1459999829530716, 'auc': 0.5368958115577698}
Config: {'components': 128, 'lr': 0.03, 'loss': 'warp', 'precision': 0.4950000047683716, 'auc': 0.977794885635376}
Config: {'components': 128, 'lr': 0.05, 'loss': 'bpr', 'precision': 0.13899999856948853, 'auc': 0.5368326306343079}
Config: {'components': 128, 'lr': 0.05, 'loss': 'warp', 'precision': 0.49800002574920654, 'auc': 0.9777548313140869}

✅ [Best Config]
Precision@10: 0.5040, AUC: 0.9781
Confi

Pelatihan Model LightFM dengan Tuning Parameter (Termasuk Regularisasi) dan Fitur Tambahan rating_bucket (Versi Boosted)

In [None]:
# grid_search_boosted.py
import pandas as pd
import numpy as np
import random
import re
import pickle
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, auc_score
from lightfm.cross_validation import random_train_test_split
from sklearn.preprocessing import MinMaxScaler
from sentence_transformers import SentenceTransformer
from itertools import product
import os
import json

# Seed NumPy's and Python's global random number generators
np.random.seed(42)
random.seed(42)

# Read the catalogue, select the columns used below, and drop incomplete rows
file_path = '/content/drive/MyDrive/Colab Notebooks/AOL ML/webtoon_originals_en.csv'
df = pd.read_csv(file_path)[
    ['title', 'genre', 'authors', 'subscribers', 'views', 'likes', 'rating', 'synopsis']
].dropna()

def clean_text(text):
    """Lower-case ``str(text)``, delete every character that is not an ASCII
    letter, digit or whitespace, collapse whitespace runs to a single space,
    and trim both ends.
    """
    lowered = str(text).lower()
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    return single_spaced.strip()

# Normalise the free-text columns. The fillna() calls are a safety net only:
# rows with missing values were already dropped when the CSV was loaded.
df['genre'] = df['genre'].apply(clean_text)
df['authors'] = df['authors'].fillna('unknown').apply(clean_text)
df['synopsis'] = df['synopsis'].fillna('').apply(clean_text)

# Implicit score: min-max scaled popularity columns blended with rating / 10
scaler = MinMaxScaler()
df[['views', 'likes', 'subscribers']] = scaler.fit_transform(df[['views', 'likes', 'subscribers']])
df['implicit_score'] = df['views'] * 0.5 + df['likes'] * 0.3 + df['subscribers'] * 0.2 + 0.1 * (df['rating'] / 10)

# Add the rating_bucket feature: (0, 7] -> low, (7, 8.5] -> mid, (8.5, 10] -> high;
# ratings outside those bins become 'unknown'.
df['rating_bucket'] = pd.cut(df['rating'], bins=[0, 7, 8.5, 10], labels=['low', 'mid', 'high'])
df['rating_bucket'] = df['rating_bucket'].cat.add_categories('unknown').fillna('unknown')

# Seed passed explicitly to the train/test split and to every model below, so
# results do not depend on how LightFM treats random_state=None.
SEED = 42

num_users = 100
user_ids = [f"user_{i}" for i in range(num_users)]
webtoons = df['title'].tolist()
interaction_data = []
preferred_genres = ['romance', 'action', 'comedy', 'fantasy']

# Score of the first row for each title, built once instead of filtering the
# whole DataFrame for every simulated interaction.
score_by_title = {}
for known_title, known_score in zip(df['title'], df['implicit_score']):
    score_by_title.setdefault(known_title, known_score)

# Candidate titles per genre are the same for every user with that genre, so
# compute them once. Genres with fewer than 10 titles fall back to the full list.
titles_by_genre = {}
for genre in preferred_genres:
    genre_titles = df[df['genre'].str.contains(genre)]['title'].tolist()
    titles_by_genre[genre] = genre_titles if len(genre_titles) >= 10 else webtoons

for i, user in enumerate(user_ids):
    genre = preferred_genres[i % len(preferred_genres)]
    preferred_titles = titles_by_genre[genre]
    liked = random.sample(preferred_titles, min(80, len(preferred_titles)))
    for title in liked:
        interaction_data.append((user, title, score_by_title[title]))

embed_path = '/content/drive/MyDrive/Colab Notebooks/AOL ML/synopsis_embeddings.npy'
synopsis_embeddings = np.load(embed_path) if os.path.exists(embed_path) else None
# Recompute when there is no cache or when the cached array has a different
# number of rows than the current DataFrame (it would otherwise be misaligned).
if synopsis_embeddings is None or synopsis_embeddings.shape[0] != len(df):
    # No device is forced: SentenceTransformer picks a GPU when one is available.
    model_bert = SentenceTransformer('all-MiniLM-L6-v2')
    synopsis_embeddings = model_bert.encode(df['synopsis'].tolist(), batch_size=128, show_progress_bar=True)
    np.save(embed_path, synopsis_embeddings)

# Dataset setup
dataset = Dataset()
dataset.fit(users=user_ids, items=df['title'])

df['item_features'] = df['genre'] + ',' + df['authors'] + ',' + df['rating_bucket'].astype(str)
embedding_features = [f'synopsis_dim_{i}' for i in range(synopsis_embeddings.shape[1])]
embedding_df = pd.DataFrame(synopsis_embeddings, columns=embedding_features)
df = pd.concat([df.reset_index(drop=True), embedding_df.reset_index(drop=True)], axis=1)
# NOTE(review): the same synopsis_dim_* names are appended to every row, so these
# tokens do not carry the per-item embedding values - confirm this is intended.
if embedding_features:
    df['item_features'] = df['item_features'] + ',' + ','.join(embedding_features)

df['item_features'] = df['item_features'].fillna('unknown').astype(str)
all_item_features = set()
for feats in df['item_features']:
    all_item_features.update(feats.split(','))

dataset.fit_partial(items=df['title'], item_features=all_item_features)
# NOTE(review): the second return value (built from the implicit scores) is not
# passed to model.fit below, so the scores do not influence training - confirm.
interactions, interaction_weights = dataset.build_interactions(interaction_data)
item_features = dataset.build_item_features(((title, feats.split(',')) for title, feats in zip(df['title'], df['item_features'])))

train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=np.random.RandomState(SEED))

# Grid search tuning (controlled grid)
params = {
    'no_components': [64, 128],
    'learning_rate': [0.03, 0.05],
    'loss': ['warp', 'bpr'],
    'item_alpha': [0.0, 1e-6],
    'user_alpha': [0.0, 1e-6],
    'epochs': [30]
}

results = []
best_auc = float('-inf')
best_precision = float('-inf')
best_model = None
best_config = None

print("\n[Running Boosted Grid Search]")
for nc, lr, loss_fn, ia, ua, ep in product(params['no_components'], params['learning_rate'], params['loss'], params['item_alpha'], params['user_alpha'], params['epochs']):
    model = LightFM(loss=loss_fn, learning_rate=lr, no_components=nc, item_alpha=ia, user_alpha=ua, random_state=SEED)
    model.fit(train, item_features=item_features, epochs=ep, num_threads=2)

    precision = float(precision_at_k(model, test, train_interactions=train, item_features=item_features, k=10).mean())
    auc = float(auc_score(model, test, train_interactions=train, item_features=item_features).mean())

    config = {'components': nc, 'lr': lr, 'loss': loss_fn, 'item_alpha': ia, 'user_alpha': ua, 'epochs': ep, 'precision': precision, 'auc': auc}
    results.append(config)

    print(f"Config: {config}")

    # Rank by precision@10, using AUC to break ties. Requiring both metrics to
    # improve at once made the winner depend on the order of the configurations
    # and could discard a model that was better on one metric.
    if (precision, auc) > (best_precision, best_auc):
        best_auc = auc
        best_precision = precision
        best_model = model
        best_config = config

# Save best model and results
if best_model is not None:
    with open("/content/drive/MyDrive/Colab Notebooks/AOL ML/best_webtoon_model_boosted.pkl", "wb") as f:
        pickle.dump(best_model, f)

    with open("/content/drive/MyDrive/Colab Notebooks/AOL ML/grid_results_boosted.json", "w") as f:
        json.dump(results, f, indent=2)

    print("\n✅ [Best Boosted Config]")
    print(f"Precision@10: {best_precision:.4f}, AUC: {best_auc:.4f}")
    print(f"Config: {best_config}")
else:
    print("\n❌ No suitable model found.")


Batches:   0%|          | 0/7 [00:00<?, ?it/s]


[Running Boosted Grid Search]
Config: {'components': 64, 'lr': 0.03, 'loss': 'warp', 'item_alpha': 0.0, 'user_alpha': 0.0, 'epochs': 30, 'precision': 0.3330000042915344, 'auc': 0.9690146446228027}
Config: {'components': 64, 'lr': 0.03, 'loss': 'warp', 'item_alpha': 0.0, 'user_alpha': 1e-06, 'epochs': 30, 'precision': 0.29500001668930054, 'auc': 0.9651132822036743}
Config: {'components': 64, 'lr': 0.03, 'loss': 'warp', 'item_alpha': 1e-06, 'user_alpha': 0.0, 'epochs': 30, 'precision': 0.48899996280670166, 'auc': 0.9764052033424377}
Config: {'components': 64, 'lr': 0.03, 'loss': 'warp', 'item_alpha': 1e-06, 'user_alpha': 1e-06, 'epochs': 30, 'precision': 0.48500004410743713, 'auc': 0.9763644337654114}
Config: {'components': 64, 'lr': 0.03, 'loss': 'bpr', 'item_alpha': 0.0, 'user_alpha': 0.0, 'epochs': 30, 'precision': 0.13500000536441803, 'auc': 0.5357367992401123}
Config: {'components': 64, 'lr': 0.03, 'loss': 'bpr', 'item_alpha': 0.0, 'user_alpha': 1e-06, 'epochs': 30, 'precision': 0.

Versi Final: Pelatihan Model dan Sistem Rekomendasi Interaktif

In [None]:
# grid_search_optimized_with_recommendation.py
import pandas as pd
import numpy as np
import random
import re
import pickle
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, auc_score
from lightfm.cross_validation import random_train_test_split
from sklearn.preprocessing import MinMaxScaler
from sentence_transformers import SentenceTransformer
import os

# Seed Python's and NumPy's global random number generators
random.seed(42)
np.random.seed(42)

# Use a CSV in the working directory when present; otherwise fall back to the
# Google Drive copy that the other cells of this notebook read, so this cell
# does not fail with FileNotFoundError when only the Drive copy exists.
file_path = 'webtoon_originals_en.csv'
if not os.path.exists(file_path):
    file_path = '/content/drive/MyDrive/Colab Notebooks/AOL ML/webtoon_originals_en.csv'
df = pd.read_csv(file_path)
df = df[['title', 'genre', 'authors', 'subscribers', 'views', 'likes', 'rating', 'synopsis']].dropna()

def clean_text(text):
    """Convert ``text`` to a lower-case string of ASCII letters, digits and
    single spaces.

    Characters outside ``[a-zA-Z0-9\\s]`` are removed, whitespace runs become
    one space, and surrounding whitespace is stripped.
    """
    lowered = str(text).lower()
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    return single_spaced.strip()

# Normalise the free-text columns. The fillna() calls are a safety net only:
# rows with missing values were already dropped when the CSV was loaded.
df['genre'] = df['genre'].apply(clean_text)
df['authors'] = df['authors'].fillna('unknown').apply(clean_text)
df['synopsis'] = df['synopsis'].fillna('').apply(clean_text)

# Implicit score: min-max scaled popularity columns blended with rating / 10
scaler = MinMaxScaler()
df[['views', 'likes', 'subscribers']] = scaler.fit_transform(df[['views', 'likes', 'subscribers']])
df['implicit_score'] = df['views'] * 0.5 + df['likes'] * 0.3 + df['subscribers'] * 0.2 + 0.1 * (df['rating'] / 10)

# Seed passed explicitly to the train/test split and to the model below, so
# results do not depend on how LightFM treats random_state=None.
SEED = 42

num_users = 100
user_ids = [f"user_{i}" for i in range(num_users)]
webtoons = df['title'].tolist()
interaction_data = []
preferred_genres = ['romance', 'action', 'comedy', 'fantasy']

# Score of the first row for each title, built once instead of filtering the
# whole DataFrame for every simulated interaction.
score_by_title = {}
for known_title, known_score in zip(df['title'], df['implicit_score']):
    score_by_title.setdefault(known_title, known_score)

# Candidate titles per genre are the same for every user with that genre, so
# compute them once. Genres with fewer than 10 titles fall back to the full list.
titles_by_genre = {}
for genre in preferred_genres:
    genre_titles = df[df['genre'].str.contains(genre)]['title'].tolist()
    titles_by_genre[genre] = genre_titles if len(genre_titles) >= 10 else webtoons

for i, user in enumerate(user_ids):
    genre = preferred_genres[i % len(preferred_genres)]
    preferred_titles = titles_by_genre[genre]
    liked = random.sample(preferred_titles, min(80, len(preferred_titles)))
    for title in liked:
        interaction_data.append((user, title, score_by_title[title]))

# Generate the embeddings directly (no cache). No device is forced so the cell
# also runs on a CPU-only machine; SentenceTransformer picks a GPU when available.
model_bert = SentenceTransformer('all-MiniLM-L6-v2')
synopsis_embeddings = model_bert.encode(df['synopsis'].tolist(), batch_size=128, show_progress_bar=True)

# Dataset setup
dataset = Dataset()
dataset.fit(users=user_ids, items=df['title'])

df['item_features'] = df['genre'] + ',' + df['authors']
embedding_features = [f'synopsis_dim_{i}' for i in range(synopsis_embeddings.shape[1])]
embedding_df = pd.DataFrame(synopsis_embeddings, columns=embedding_features)
df = pd.concat([df.reset_index(drop=True), embedding_df.reset_index(drop=True)], axis=1)
# NOTE(review): the same synopsis_dim_* names are appended to every row, so these
# tokens do not carry the per-item embedding values - confirm this is intended.
if embedding_features:
    df['item_features'] = df['item_features'] + ',' + ','.join(embedding_features)

df['item_features'] = df['item_features'].fillna('unknown').astype(str)
all_item_features = set()
for feats in df['item_features']:
    all_item_features.update(feats.split(','))

dataset.fit_partial(items=df['title'], item_features=all_item_features)
# NOTE(review): the second return value (built from the implicit scores) is not
# passed to model.fit below, so the scores do not influence training - confirm.
interactions, interaction_weights = dataset.build_interactions(interaction_data)
item_features = dataset.build_item_features(((title, feats.split(',')) for title, feats in zip(df['title'], df['item_features'])))

train, test = random_train_test_split(interactions, test_percentage=0.2, random_state=np.random.RandomState(SEED))

# Train final model (fitted on the train split only; `test` is not used in this cell)
model = LightFM(loss='warp', learning_rate=0.05, no_components=64, random_state=SEED)
model.fit(train, item_features=item_features, epochs=30, num_threads=2)

# Top items DataFrame used by the recommendation functions (row i <-> embedding row i)
top_items = df[['title']].reset_index(drop=True)

# Cold-start: Content-based recommendation
def content_based_recommendation(input_titles, top_n=10):
    """Recommend titles whose synopsis embedding is closest to the inputs.

    Reads the module-level ``top_items`` (DataFrame with a ``title`` column)
    and ``synopsis_embeddings`` (one row per row of ``top_items``).

    Args:
        input_titles: titles the user likes; titles not present in
            ``top_items`` are ignored.
        top_n: maximum number of recommendations to return.

    Returns:
        A list of at most ``top_n`` titles ordered by descending dot product
        with the mean embedding of the known input titles. Input titles are
        never part of the result. If none of the input titles is known, the
        string ``"Judul tidak ditemukan."`` is returned instead of a list.
    """
    title_values = top_items['title'].to_numpy()

    # Resolve each known title to its row position (first match wins). Row
    # positions, not index labels, are what index into synopsis_embeddings.
    idx_inputs = []
    for t in input_titles:
        hits = np.flatnonzero(title_values == t)
        if hits.size:
            idx_inputs.append(int(hits[0]))
    if not idx_inputs:
        return "Judul tidak ditemukan."

    avg_vector = np.mean(synopsis_embeddings[idx_inputs], axis=0).reshape(1, -1)
    similarities = np.dot(synopsis_embeddings, avg_vector.T).flatten()

    # Drop the inputs from the ranking instead of overwriting their score with
    # -1: a sentinel score can still be returned when other scores are lower
    # or when top_n exceeds the number of remaining items.
    excluded = set(idx_inputs)
    ranked = [int(i) for i in np.argsort(-similarities) if int(i) not in excluded]
    return top_items['title'].iloc[ranked[:top_n]].tolist()

# Hybrid recommendation
def hybrid_recommendation(user_id, input_titles, top_n=10):
    """Recommend titles ranked by the trained LightFM model for ``user_id``.

    Reads the module-level ``dataset``, ``model`` and ``item_features``.

    Args:
        user_id: external user id (e.g. ``'user_0'``). Ids unknown to
            ``dataset`` fall back to internal index 0, which is the index this
            function previously used for every caller.
        input_titles: titles the user already entered; they are excluded from
            the result. Titles unknown to ``dataset`` are ignored.
        top_n: maximum number of recommendations to return.

    Returns:
        A list of at most ``top_n`` titles ordered by descending predicted
        score, or the string ``"Judul tidak ditemukan."`` when none of the
        input titles is known.

    NOTE(review): ``input_titles`` only filter the result; they do not change
    the scores. Confirm whether blending with content similarity was intended.
    """
    user_id_map, _, item_id_map, _ = dataset.mapping()
    user_idx = user_id_map.get(user_id, 0)

    # Use the Dataset's own title -> internal index mapping rather than assuming
    # that DataFrame row positions match the model's internal item indices.
    n_items = len(item_id_map)
    item_labels = [None] * n_items
    for label, internal_idx in item_id_map.items():
        item_labels[internal_idx] = label

    seen = {item_id_map[t] for t in input_titles if t in item_id_map}
    if not seen:
        return "Judul tidak ditemukan."

    scores = model.predict(user_ids=user_idx, item_ids=np.arange(n_items), item_features=item_features)

    # Filter already-entered titles out of the ranking so they cannot reappear,
    # even when top_n is larger than the number of remaining items.
    ranked = [int(i) for i in np.argsort(-scores) if int(i) not in seen]
    return [item_labels[i] for i in ranked[:top_n]]

# Adaptive input menu: content-based recommendations for the first two titles
# entered, model-based ("hybrid") recommendations from the third title onward.
# Every accepted title is appended to a plain-text history file.
user_history_file = 'user_input_history.txt'
user_id = 'user_0'
input_titles = []

# Lookup tables for title validation: exact match first, then a
# case-insensitive match that resolves to the catalogue spelling.
exact_titles = set(top_items['title'])
titles_by_lower = {}
for known_title in top_items['title']:
    titles_by_lower.setdefault(str(known_title).lower(), known_title)

print("\n🎯 Sistem Rekomendasi Webtoon")
# NOTE(review): genre_input is collected but not used by the code in this cell.
genre_input = input("Masukkan genre favoritmu (e.g. romance, action): ").lower()

while True:
    # Strip surrounding whitespace so "exit " or a trailing space after a title
    # is not treated as an unknown title.
    title = input("Masukkan judul Webtoon favoritmu (atau 'exit'): ").strip()
    if title.lower() == 'exit':
        break
    if title not in exact_titles:
        title = titles_by_lower.get(title.lower(), title)
    if title not in exact_titles:
        print(f"Judul '{title}' tidak ditemukan.")
        continue
    input_titles.append(title)
    # Explicit encoding: titles may contain non-ASCII characters.
    with open(user_history_file, 'a', encoding='utf-8') as f:
        f.write(f"{user_id},{title}\n")

    if len(input_titles) < 3:
        print("\n🔍 Rekomendasi awal (cold start):")
        rekomendasi = content_based_recommendation(input_titles)
    else:
        print("\n🤖 Rekomendasi campuran (hybrid):")
        rekomendasi = hybrid_recommendation(user_id, input_titles)

    # Both recommenders return a message string instead of a list when no input
    # title is known; print it as-is rather than enumerating its characters.
    if isinstance(rekomendasi, str):
        print(rekomendasi)
        continue

    for i, rec in enumerate(rekomendasi, 1):
        print(f"{i}. {rec}")
