# Imports

In [1]:
import os
import gc
import pickle
from datetime import datetime
from typing import Tuple, Dict, Any, List

import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import csr_matrix, lil_matrix, save_npz, load_npz
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
from math import sqrt
from IPython.display import display, Markdown, clear_output
from pathlib import Path
import ipywidgets as widgets

try:
    from tqdm import tqdm
except Exception:
    tqdm = lambda x, **k: x

print("Imports complete.")

Imports complete.


# Path Definitions (Portable)

In [2]:
# Project root is the kernel's current working directory
# (normally the folder that contains this notebook).
PROJECT_ROOT = Path.cwd()

# Every derived artifact lives somewhere under <root>/processed.
PROCESSED_PATH = PROJECT_ROOT.joinpath("processed")
CLEANED_PATH = PROCESSED_PATH.joinpath("cleaned")
PREPROCESS_PATH = PROCESSED_PATH.joinpath("preprocess")
MODEL_PATH = PROCESSED_PATH.joinpath("models")

# Utility function ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Log
def log(msg: str, level: str = "INFO") -> None:
    """Print ``msg`` prefixed with a severity tag and the current local time.

    Args:
        msg: Text to emit.
        level: Free-form severity label shown in square brackets
            (the notebook uses "INFO", "WARN" and "ERROR").

    Output format: ``[LEVEL] YYYY-MM-DD HH:MM:SS | msg``.
    """
    # Month and day need the '%' directive prefix; without it strftime emits
    # the literal characters "m" and "d" (e.g. "2025-m-d 16:28:30").
    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{level}] {ts} | {msg}")

log(f"Project Root (This Notebook's location): {PROJECT_ROOT}")
log(f"Model Path set to: {MODEL_PATH}")
log(f"Cleaned Data Path set to: {CLEANED_PATH}")

[INFO] 2025-m-d 16:28:30 | Project Root (This Notebook's location): C:\Users\nonth\Documents\movie_predict_move
[INFO] 2025-m-d 16:28:30 | Model Path set to: C:\Users\nonth\Documents\movie_predict_move\processed\models
[INFO] 2025-m-d 16:28:30 | Cleaned Data Path set to: C:\Users\nonth\Documents\movie_predict_move\processed\cleaned


# Helper Functions

In [3]:
# --- ‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡πÇ‡∏´‡∏•‡∏î SVD Artifacts ---
def load_svd_artifacts(model_dir: Path) -> Dict[str, Any]:
    """Read the persisted SVD factors and id mappings back from ``model_dir``.

    Args:
        model_dir: Directory holding ``svd_<name>.npy`` and ``svd_<name>.pkl`` files.

    Returns:
        Dict with keys ``U``, ``Sigma``, ``Vt``, ``user_mean`` (loaded with
        ``np.load``) and ``user_index``, ``movie_index``, ``reverse_user_index``,
        ``reverse_movie_index`` (loaded with ``pickle.load``).
    """
    log("Loading SVD artifacts from disk...")

    loaded: Dict[str, Any] = {}

    # Arrays saved as .npy files.
    for array_name in ("U", "Sigma", "Vt", "user_mean"):
        loaded[array_name] = np.load(model_dir / f"svd_{array_name}.npy")

    # Lookup objects saved with pickle.
    # NOTE: unpickling can execute arbitrary code; only point this at trusted files.
    for mapping_name in ("user_index", "movie_index", "reverse_user_index", "reverse_movie_index"):
        with open(model_dir / f"svd_{mapping_name}.pkl", "rb") as handle:
            loaded[mapping_name] = pickle.load(handle)

    log("Loaded SVD artifacts")
    return loaded

# --- ‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Test 1: Content-Based ---
def get_content_based_recs(movie_title: str, top_n: int = 10) -> pd.DataFrame:
    """Return the movies most similar (by content) to the movie matching ``movie_title``.

    Reads the notebook globals ``movies_global`` (columns ``movieId``, ``title``),
    ``movie_ids_global`` (movieId per row of the similarity matrix) and
    ``sim_sparse`` (accessed through its per-row ``.rows`` / ``.data`` lists).

    Args:
        movie_title: Full title, or any case-insensitive fragment of one.
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame with columns ``movieId``, ``title``, ``similarity_score`` sorted
        by descending score, or an empty DataFrame when the movie cannot be
        resolved or has no stored neighbours.
    """
    log(f"Finding Content-Based recommendations for: '{movie_title}'")

    # 1. Resolve the title to a movieId.
    #    An exact title wins, so picking "Alien (1979)" never resolves to another
    #    title that merely contains it. Otherwise fall back to a literal,
    #    case-insensitive substring search. regex=False matters: titles contain
    #    "(", ")", "*", "?" etc., which as regex would either fail to match the
    #    title itself or raise on unbalanced input.
    titles = movies_global['title']
    movie_row = movies_global[titles == movie_title]
    if movie_row.empty:
        movie_row = movies_global[titles.str.contains(movie_title, case=False, na=False, regex=False)]
    if movie_row.empty:
        log(f"Movie not found: {movie_title}", "WARN")
        return pd.DataFrame()

    movie_id = movie_row.iloc[0]['movieId']
    log(f"Found movieId: {movie_id}")

    # 2. Locate that movie's row in the similarity matrix.
    idx_arr = np.where(movie_ids_global == movie_id)[0]
    if idx_arr.size == 0:
        log(f"MovieId {movie_id} not found in TF-IDF matrix (no content data).", "WARN")
        return pd.DataFrame()

    idx = int(idx_arr[0])

    # 3. Stored neighbours of that row: column indices and their similarity values.
    row_indices = sim_sparse.rows[idx]
    row_data = sim_sparse.data[idx]

    if len(row_data) == 0:
        log(f"No similar movies found for index {idx}.", "WARN")
        return pd.DataFrame()

    # 4. Map matrix columns back to movieId / title and attach the scores.
    similar_movie_ids = [movie_ids_global[i] for i in row_indices]

    result = movies_global[movies_global.movieId.isin(similar_movie_ids)][['movieId', 'title']].copy()
    score_map = dict(zip(similar_movie_ids, row_data))
    result['similarity_score'] = result['movieId'].map(score_map)

    # Drop the query movie itself, then rank by similarity.
    result = result[result.movieId != movie_id].sort_values('similarity_score', ascending=False)

    return result.head(top_n)

# --- ‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Test 2.1: CF (SVD) (Input: User) ---
def get_cf_recs_for_user(user_id: int, top_n: int = 10) -> pd.DataFrame:
    """Rank movies the user has not rated by their SVD-predicted rating.

    Reads the notebook globals ``U``, ``Sigma``, ``Vt``, ``svd_user_mean``,
    ``svd_user_index``, ``svd_reverse_movie_index``, ``ratings_global`` and
    ``movies_global``.

    Args:
        user_id: Id to look up in ``svd_user_index``.
        top_n: Number of highest-scoring movies to keep.

    Returns:
        DataFrame with columns ``movieId``, ``title``, ``predicted_rating`` in
        descending score order; an empty DataFrame when the user is unknown.
    """
    log(f"Finding CF (SVD) recommendations for User {user_id}")

    if user_id not in svd_user_index:
        log(f"User ID {user_id} not found in SVD training set.", "WARN")
        return pd.DataFrame()

    user_row = svd_user_index[user_id]

    # Predicted score per movie column: (U[row] . Sigma) . Vt, plus the user's mean.
    weighted_factors = np.dot(U[user_row, :], Sigma)
    preds = np.dot(weighted_factors, Vt) + svd_user_mean[user_row]

    # Movies this user already has a rating for are excluded.
    already_rated = set(ratings_global[ratings_global.userId == user_id]['movieId'])

    # (movieId, score) for every column that has a mapping and is unseen.
    candidates = [
        (svd_reverse_movie_index[col], score)
        for col, score in enumerate(preds)
        if col in svd_reverse_movie_index
        and svd_reverse_movie_index[col] not in already_rated
    ]

    # Stable descending sort, then keep the best top_n.
    best = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]
    wanted_ids = [mid for mid, _ in best]

    # Attach titles and scores.
    table = movies_global[movies_global.movieId.isin(wanted_ids)][['movieId', 'title']].copy()
    table['predicted_rating'] = table['movieId'].map(dict(best))

    return table.sort_values('predicted_rating', ascending=False)

# --- ‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Test 2.2: CF (SVD) (Input: Movie) ---
def get_cf_recs_for_movie(movie_title: str, top_n: int = 10) -> pd.DataFrame:
    log(f"Finding best Users for Movie '{movie_title}' using CF (SVD)")
    
    # 1. ‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤ movieId ‡∏à‡∏≤‡∏Å title
    movie_row = movies_global[movies_global['title'].str.contains(movie_title, case=False, na=False)]
    if movie_row.empty:
        log(f"Movie not found: {movie_title}", "WARN")
        return pd.DataFrame()
    
    movie_id = movie_row.iloc[0]['movieId']
    
    if movie_id not in svd_movie_index:
        log(f"MovieId {movie_id} not in SVD training set.", "WARN")
        return pd.DataFrame()
        
    m_idx = svd_movie_index[movie_id]

    # 2. ‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô (U @ movie_vector) + user_mean
    movie_factors = Vt[:, m_idx]                 # ‡πÑ‡∏î‡πâ vector ‡∏Ñ‡∏∏‡∏ì‡∏•‡∏±‡∏Å‡∏©‡∏ì‡∏∞‡πÅ‡∏ù‡∏á‡∏Ç‡∏≠‡∏á‡∏´‡∏ô‡∏±‡∏á (Shape k,)
    movie_vector = np.dot(Sigma, movie_factors)  # ‡∏ô‡∏≥‡πÑ‡∏õ‡∏ñ‡πà‡∏ß‡∏á‡∏ô‡πâ‡∏≥‡∏´‡∏ô‡∏±‡∏Å‡∏î‡πâ‡∏ß‡∏¢ Sigma (Shape k,)
    preds_all_users = np.dot(U, movie_vector) + svd_user_mean # (n_users,)
    
    # 3. Map ‡∏Å‡∏•‡∏±‡∏ö‡πÄ‡∏õ‡πá‡∏ô userId
    recs = []
    for i in range(len(preds_all_users)):
        if i in svd_reverse_user_index:
            user_id = svd_reverse_user_index[i]
            recs.append((user_id, preds_all_users[i]))
            
    # 4. ‡∏à‡∏±‡∏î‡∏≠‡∏±‡∏ô‡∏î‡∏±‡∏ö
    recs.sort(key=lambda x: x[1], reverse=True)
    
    # 5. ‡∏™‡∏£‡πâ‡∏≤‡∏á DataFrame
    top_users = recs[:top_n]
    result = pd.DataFrame(top_users, columns=['userId', 'predicted_rating'])
    
    return result

# --- ‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Test 3: Hybrid (‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡πÄ‡∏î‡∏¥‡∏°) ---
# (‡∏´‡∏°‡∏≤‡∏¢‡πÄ‡∏´‡∏ï‡∏∏: 2 ‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô‡∏ô‡∏µ‡πâ‡∏ñ‡∏π‡∏Å‡∏Ñ‡∏±‡∏î‡∏•‡∏≠‡∏Å‡∏°‡∏≤‡∏à‡∏≤‡∏Å Cell 3 ‡∏Ç‡∏≠‡∏á‡∏Ñ‡∏≥‡∏ï‡∏≠‡∏ö‡∏Å‡πà‡∏≠‡∏ô‡∏´‡∏ô‡πâ‡∏≤)
def hybrid_score(userId: int, movieId: int, alpha: float = 0.7, top_k: int = 50) -> float:
    """Blend the cached SVD prediction for ``movieId`` with a content score.

    Reads the notebook globals ``svd_preds_df_global`` (expected columns
    ``movieId`` and ``pred_rating``; filled by ``recommend_movies`` for the
    current user), ``sim_sparse`` and ``movie_ids_global``.

    Args:
        userId: Not read here; the user is implied by ``svd_preds_df_global``.
        movieId: Movie to score.
        alpha: Weight of the SVD score; ``1 - alpha`` goes to the content score.
        top_k: Number of strongest stored similarities averaged into the
            content score.

    Returns:
        ``alpha * svd + (1 - alpha) * content`` when both parts exist, whichever
        part exists when only one does, and ``nan`` when neither does.
    """
    # SVD part: best-effort lookup. The cache may be an empty frame without the
    # expected columns, in which case the score is simply unavailable.
    try:
        svd_row = svd_preds_df_global[svd_preds_df_global.movieId == movieId]
        svd_score = float(svd_row.pred_rating.values[0]) if not svd_row.empty else np.nan
    except Exception:
        svd_score = np.nan

    # Content part: mean of the top_k largest stored similarities for this movie.
    content_score = np.nan
    if sim_sparse is not None and len(movie_ids_global) > 0:
        idx_arr = np.where(movie_ids_global == movieId)[0]
        if idx_arr.size > 0:
            sims = np.asarray(sim_sparse.data[int(idx_arr[0])], dtype=float)
            sims = sims[~np.isnan(sims)]
            # Row entries are stored in column order, so slicing the first
            # top_k would pick arbitrary neighbours; sort by value first.
            strongest = np.sort(sims)[::-1][:top_k]
            if strongest.size > 0:
                content_score = float(strongest.mean())

    if np.isnan(svd_score) and np.isnan(content_score):
        return np.nan
    if np.isnan(svd_score):
        return content_score
    if np.isnan(content_score):
        return svd_score
    return alpha * svd_score + (1.0 - alpha) * content_score

def recommend_movies(userId: int, top_n: int = 10, alpha: float = 0.7, top_k_content: int = 50) -> pd.DataFrame:
    """Produce the ``top_n`` hybrid recommendations for one user.

    Side effect: overwrites the notebook global ``svd_preds_df_global`` with this
    user's SVD predictions (column renamed to ``pred_rating``), which
    ``hybrid_score`` then reads.

    Args:
        userId: User to recommend for.
        top_n: Number of rows to return.
        alpha: Forwarded to ``hybrid_score``.
        top_k_content: Forwarded to ``hybrid_score`` as ``top_k``.

    Returns:
        DataFrame with columns ``movieId``, ``title``, ``hybrid_score`` sorted by
        descending score with a fresh index, or an empty frame when nothing
        could be scored.
    """
    global svd_preds_df_global
    try:
        # Compute the SVD predictions for every movie once for this user.
        svd_preds_df_global = get_cf_recs_for_user(userId, top_n=len(svd_movie_index))
        # hybrid_score looks the score up under the name 'pred_rating'.
        svd_preds_df_global = svd_preds_df_global.rename(columns={'predicted_rating': 'pred_rating'})
    except ValueError:
        return pd.DataFrame(columns=['movieId', 'title', 'hybrid_score', 'reason'])

    # Candidates: every movie with content data that the user has not rated.
    user_rows = ratings_global.userId == userId
    seen = set(ratings_global.loc[user_rows, 'movieId'].unique())
    candidates = [mid for mid in movie_ids_global if mid not in seen]

    log(f"Scoring {len(candidates)} candidate movies for user {userId} (Hybrid)...")

    # Score lazily and keep only candidates with a usable (non-NaN) score.
    scored_pairs = (
        (mid, hybrid_score(userId, mid, alpha=alpha, top_k=top_k_content))
        for mid in tqdm(candidates, desc=f"Scoring user {userId} (Hybrid)")
    )
    scores = [(mid, value) for mid, value in scored_pairs if not np.isnan(value)]

    if not scores:
        log(f"No candidate scores for user {userId}", "WARN")
        return pd.DataFrame(columns=['movieId', 'title', 'hybrid_score'])

    # Stable descending sort, then cut to top_n.
    top_scores = sorted(scores, key=lambda pair: pair[1], reverse=True)[:top_n]
    top_movie_ids = [mid for mid, _ in top_scores]

    table = movies_global[movies_global.movieId.isin(top_movie_ids)][['movieId', 'title']].copy()
    table['hybrid_score'] = table['movieId'].map(dict(top_scores))

    return table.sort_values('hybrid_score', ascending=False).reset_index(drop=True)

log("Adding Training Functions...")
def train_svd_model(user_movie_matrix: csr_matrix, 
                    user_mean: np.ndarray, 
                    user_index: Dict[int, int], 
                    movie_index: Dict[int, int], 
                    k: int = 50) -> Dict[str, Any]:
    """Factorise the mean-centred user x movie rating matrix with truncated SVD.

    Args:
        user_movie_matrix: CSR matrix of ratings (rows are users). Not modified.
        user_mean: One mean per matrix row, subtracted from that row's stored
            entries only (absent entries stay absent).
        user_index: Mapping userId -> row position.
        movie_index: Mapping movieId -> column position.
        k: Number of singular values/vectors requested from ``svds``.

    Returns:
        Dict with ``U``, ``Sigma`` (k x k diagonal matrix), ``Vt``, ``user_mean``,
        the two index mappings and their inverses. On any failure the error is
        logged and an empty dict is returned.
    """
    log(f"Running SVD with k={k}...")
    try:
        # Work on a floating-point copy: subtracting float means in place from
        # an integer-typed data array raises a casting error.
        if np.issubdtype(user_movie_matrix.dtype, np.floating):
            user_movie_matrix_demeaned = user_movie_matrix.copy()
        else:
            user_movie_matrix_demeaned = user_movie_matrix.astype(np.float64)

        # np.diff(indptr) is the number of stored ratings per row, so repeating
        # each row's mean that many times lines up with .data element-wise.
        n_rows = user_movie_matrix_demeaned.shape[0]
        ratings_per_row = np.diff(user_movie_matrix_demeaned.indptr)
        row_means = np.asarray(user_mean).ravel()[:n_rows]
        user_movie_matrix_demeaned.data -= np.repeat(row_means, ratings_per_row)

        U, s, Vt = svds(user_movie_matrix_demeaned, k=k)

        # svds returns the singular values as a vector; callers expect a matrix.
        Sigma = np.diag(s)

        log("SVD training complete.")

        # Inverse lookups: position -> id.
        reverse_user_index = {pos: uid for uid, pos in user_index.items()}
        reverse_movie_index = {pos: mid for mid, pos in movie_index.items()}

        return {
            "U": U, "Sigma": Sigma, "Vt": Vt, "user_mean": user_mean,
            "user_index": user_index, "movie_index": movie_index,
            "reverse_user_index": reverse_user_index, "reverse_movie_index": reverse_movie_index
        }
    except Exception as e:
        log(f"SVD training failed: {e}", "ERROR")
        return {}
def save_svd_artifacts(artifacts: Dict[str, Any], model_dir: Path) -> None:
    """Write the SVD arrays and id mappings into ``model_dir``.

    Args:
        artifacts: Dict holding ``U``, ``Sigma``, ``Vt``, ``user_mean`` (saved with
            ``np.save`` as ``svd_<name>.npy``) and ``user_index``, ``movie_index``,
            ``reverse_user_index``, ``reverse_movie_index`` (pickled as
            ``svd_<name>.pkl``).
        model_dir: Target directory; created together with missing parents.
    """
    log(f"Saving SVD artifacts to {model_dir}")
    model_dir.mkdir(parents=True, exist_ok=True)

    # Arrays -> .npy
    for array_key in ("U", "Sigma", "Vt", "user_mean"):
        np.save(model_dir / f"svd_{array_key}.npy", artifacts[array_key])

    # Mappings -> .pkl
    for mapping_key in ("user_index", "movie_index", "reverse_user_index", "reverse_movie_index"):
        target = model_dir / f"svd_{mapping_key}.pkl"
        with open(target, "wb") as sink:
            pickle.dump(artifacts[mapping_key], sink)

    log("Artifacts saved.")

log("Training functions added.")

log("All helper functions defined.")

[INFO] 2025-m-d 16:28:31 | Adding Training Functions...
[INFO] 2025-m-d 16:28:31 | Training functions added.
[INFO] 2025-m-d 16:28:31 | All helper functions defined.


# LOAD DATA & MODELS

In [4]:
# One-time load of every DataFrame and model artifact the helper functions
# read as module-level globals. Run this cell before using any recommender.
log("--- STARTING ONE-TIME LOAD ---")

# 1. Load the cleaned movies / ratings tables (kept as globals).
log("Loading DataFrames (movies, ratings)...")
movies_global = pd.read_csv(CLEANED_PATH / "movies_cleaned_f.csv")
ratings_global = pd.read_csv(CLEANED_PATH / "ratings_cleaned_f.csv")

# 2. Load the movieId column of the reduced TF-IDF table (used by the
#    content-based lookups to find a movie's position).
log("Loading TF-IDF reduced data (for movie_ids)...")
tfidf_reduced_df = pd.read_csv(PREPROCESS_PATH / "movies_tfidf_reduced.csv")
movie_ids_global = tfidf_reduced_df["movieId"].to_numpy()

# 3. Load the content similarity matrix (global).
log("Loading Content Similarity matrix...")
sim_sparse = load_npz(MODEL_PATH / "content_similarity_sparse.npz").tolil()  # LIL exposes the per-row .rows/.data lists the helpers index into

# 4. Load the SVD artifacts and bind each one to its own global name.
artifacts = load_svd_artifacts(MODEL_PATH)
U = artifacts["U"]
Sigma = artifacts["Sigma"]
Vt = artifacts["Vt"]
svd_user_mean = artifacts["user_mean"]
svd_user_index = artifacts["user_index"]
svd_movie_index = artifacts["movie_index"]
svd_reverse_user_index = artifacts["reverse_user_index"]
svd_reverse_movie_index = artifacts["reverse_movie_index"]

# 5. Empty placeholder; recommend_movies() overwrites it with the current
#    user's SVD predictions.
svd_preds_df_global = pd.DataFrame() 

log("--- ALL ARTIFACTS LOADED AND READY ---")

[INFO] 2025-m-d 16:28:32 | --- STARTING ONE-TIME LOAD ---
[INFO] 2025-m-d 16:28:32 | Loading DataFrames (movies, ratings)...
[INFO] 2025-m-d 16:29:06 | Loading TF-IDF reduced data (for movie_ids)...
[INFO] 2025-m-d 16:29:07 | Loading Content Similarity matrix...
[INFO] 2025-m-d 16:29:08 | Loading SVD artifacts from disk...
[INFO] 2025-m-d 16:29:09 | Loaded SVD artifacts
[INFO] 2025-m-d 16:29:09 | --- ALL ARTIFACTS LOADED AND READY ---


# 🚀 Interactive Test Dashboard (4-Tab Version)

In [5]:
# Cell 5 (NEW): üöÄ Interactive Test Dashboard (4-Tab Version)

log("Building 4-Tab Interactive Test Dashboard...")

# --- Tab 1: Test 1 (Content-Based) ---

# Widgets for the content-based tab: a search box, an output area for search
# feedback, a dropdown filled from the search results, a Top-N slider, the run
# button and the output area for the recommendations.
movie_search_box_1 = widgets.Text(placeholder='‡∏û‡∏¥‡∏°‡∏û‡πå‡∏ä‡∏∑‡πà‡∏≠‡∏´‡∏ô‡∏±‡∏á (‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏ô‡πâ‡∏≠‡∏¢ 3 ‡∏ï‡∏±‡∏ß‡∏≠‡∏±‡∏Å‡∏©‡∏£)...', description='Search Movie:', layout=widgets.Layout(width='95%'))
movie_search_output_1 = widgets.Output() # dedicated output area for search messages
movie_select_dropdown_1 = widgets.Dropdown(options=[], description='Select Movie:', layout=widgets.Layout(width='95%'))
top_n_slider_1 = widgets.IntSlider(value=10, min=5, max=20, step=1, description='Top N:')
button_1 = widgets.Button(description='Test 1: Find Similar Movies', button_style='info', icon='film', layout=widgets.Layout(width='20%')) # button width fixed at 20%
output_1 = widgets.Output()

# Logic
def on_movie_search_change_1(change):
    """Refresh the Tab 1 movie dropdown whenever the search text changes.

    Args:
        change: Change dict delivered by ``observe``; ``change['new']`` is the
            current search text.

    Queries shorter than 3 characters only clear the dropdown. Otherwise the
    dropdown receives ``(title, title)`` options because the Test 1 function
    takes a title, and a short preview is shown in ``movie_search_output_1``.
    """
    with movie_search_output_1:
        clear_output()
        query = change['new']
        if len(query) < 3:
            movie_select_dropdown_1.options = []
            return
        # regex=False: the text is typed by a user, so "(", "[", "*" and friends
        # must be matched literally instead of raising or matching unexpectedly.
        results = movies_global[movies_global['title'].str.contains(query, case=False, na=False, regex=False)]

        if results.empty:
            movie_select_dropdown_1.options = []
            print(f"‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏´‡∏ô‡∏±‡∏á‡∏ó‡∏µ‡πà‡∏ï‡∏£‡∏á‡∏Å‡∏±‡∏ö '{query}'")
        else:
            movie_options = list(zip(results['title'], results['title']))
            movie_select_dropdown_1.options = movie_options
            print(f"‡∏û‡∏ö {len(movie_options)} ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£:")
            display(results[['title']].head())
movie_search_box_1.observe(on_movie_search_change_1, names='value')

def on_button_clicked_1(b):
    """Tab 1 button handler: show content-based recommendations for the chosen movie.

    Args:
        b: The clicked button (unused).
    """
    with output_1:
        clear_output()
        chosen_title = movie_select_dropdown_1.value
        how_many = top_n_slider_1.value
        if chosen_title:
            heading = Markdown(f"### 1. Content-Based Recommendations (Similar to '{chosen_title}')")
            display(heading)
            display(get_content_based_recs(chosen_title, top_n=how_many))
        else:
            # Nothing selected yet: ask the user to search and pick a movie first.
            log("‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤‡πÅ‡∏•‡∏∞‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏´‡∏ô‡∏±‡∏á‡∏Å‡πà‡∏≠‡∏ô", "WARN")
button_1.on_click(on_button_clicked_1)

# Layout
tab1_content = widgets.VBox([
    movie_search_box_1, 
    movie_search_output_1, # <-- FIX: ‡πÄ‡∏û‡∏¥‡πà‡∏°
    movie_select_dropdown_1, 
    top_n_slider_1, 
    button_1, 
    output_1
])

# --- Tab 2: Test 2.1 (CF - User) ---
# (Tab ‡∏ô‡∏µ‡πâ‡πÑ‡∏°‡πà‡∏°‡∏µ Search Box ‡πÑ‡∏°‡πà‡∏ï‡πâ‡∏≠‡∏á‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç)
# Widgets for the CF-by-user tab: user id input, Top-N slider, run button and
# the output area for the results.
user_id_input_21 = widgets.IntText(value=42, description='Test User ID:')
top_n_slider_21 = widgets.IntSlider(value=10, min=5, max=20, step=1, description='Top N:')
button_21 = widgets.Button(description='Test 2.1: Recs for User (CF)', button_style='primary', icon='user', layout=widgets.Layout(width='20%')) # button width fixed at 20%
output_21 = widgets.Output()
def on_button_clicked_21(b):
    """Tab 2.1 button handler: show SVD recommendations for the entered user id.

    Args:
        b: The clicked button (unused).
    """
    with output_21:
        clear_output()
        requested_user = user_id_input_21.value
        how_many = top_n_slider_21.value
        heading = Markdown(f"### 2.1 CF (SVD) Recommendations (For User {requested_user})")
        display(heading)
        display(get_cf_recs_for_user(requested_user, top_n=how_many))
button_21.on_click(on_button_clicked_21)
tab2_1_content = widgets.VBox([user_id_input_21, top_n_slider_21, button_21, output_21])

# --- Tab 3: Test 2.2 (CF - Movie) ---

# Widgets
# Widgets for the CF-by-movie tab; same layout as the content-based tab.
movie_search_box_22 = widgets.Text(placeholder='‡∏û‡∏¥‡∏°‡∏û‡πå‡∏ä‡∏∑‡πà‡∏≠‡∏´‡∏ô‡∏±‡∏á...', description='Search Movie:', layout=widgets.Layout(width='95%'))
movie_search_output_22 = widgets.Output() # dedicated output area for search messages
movie_select_dropdown_22 = widgets.Dropdown(options=[], description='Select Movie:', layout=widgets.Layout(width='95%'))
top_n_slider_22 = widgets.IntSlider(value=10, min=5, max=20, step=1, description='Top N:')
button_22 = widgets.Button(description='Test 2.2: Find Users (CF)', button_style='success', icon='users', layout=widgets.Layout(width='20%')) # button width fixed at 20%
output_22 = widgets.Output()

# Logic
def on_movie_search_change_22(change):
    """Refresh the Tab 2.2 movie dropdown whenever the search text changes.

    Args:
        change: Change dict delivered by ``observe``; ``change['new']`` is the
            current search text.

    Queries shorter than 3 characters only clear the dropdown. Otherwise the
    dropdown receives ``(title, title)`` options because the Test 2.2 function
    takes a title, and a short preview is shown in ``movie_search_output_22``.
    """
    with movie_search_output_22:
        clear_output()
        query = change['new']
        if len(query) < 3:
            movie_select_dropdown_22.options = []
            return
        # regex=False: the text is typed by a user, so "(", "[", "*" and friends
        # must be matched literally instead of raising or matching unexpectedly.
        results = movies_global[movies_global['title'].str.contains(query, case=False, na=False, regex=False)]

        if results.empty:
            movie_select_dropdown_22.options = []
            print(f"‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏´‡∏ô‡∏±‡∏á‡∏ó‡∏µ‡πà‡∏ï‡∏£‡∏á‡∏Å‡∏±‡∏ö '{query}'")
        else:
            movie_options = list(zip(results['title'], results['title']))
            movie_select_dropdown_22.options = movie_options
            print(f"‡∏û‡∏ö {len(movie_options)} ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£:")
            display(results[['title']].head())
movie_search_box_22.observe(on_movie_search_change_22, names='value')

def on_button_clicked_22(b):
    """Tab 2.2 button handler: show the users predicted to rate the chosen movie highest.

    Args:
        b: The clicked button (unused).
    """
    with output_22:
        clear_output()
        chosen_title = movie_select_dropdown_22.value
        how_many = top_n_slider_22.value
        if chosen_title:
            heading = Markdown(f"### 2.2 CF (SVD) Recommendations (Users for '{chosen_title}')")
            display(heading)
            display(get_cf_recs_for_movie(chosen_title, top_n=how_many))
        else:
            # Nothing selected yet: ask the user to search and pick a movie first.
            log("‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤‡πÅ‡∏•‡∏∞‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏´‡∏ô‡∏±‡∏á‡∏Å‡πà‡∏≠‡∏ô", "WARN")
button_22.on_click(on_button_clicked_22)

# Layout
tab2_2_content = widgets.VBox([
    movie_search_box_22, 
    movie_search_output_22, # <-- FIX: ‡πÄ‡∏û‡∏¥‡πà‡∏°
    movie_select_dropdown_22, 
    top_n_slider_22, 
    button_22, 
    output_22
])

# --- Tab 4: Test 3 (Hybrid) ---
# (Tab ‡∏ô‡∏µ‡πâ‡πÑ‡∏°‡πà‡∏°‡∏µ Search Box ‡πÑ‡∏°‡πà‡∏ï‡πâ‡∏≠‡∏á‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç)
# Widgets for the hybrid tab: user id input, Top-N slider, run button and the
# output area for the results.
user_id_input_3 = widgets.IntText(value=42, description='Test User ID:')
top_n_slider_3 = widgets.IntSlider(value=10, min=5, max=20, step=1, description='Top N:')
button_3 = widgets.Button(description='Test 3: Recs for User (Hybrid)', button_style='danger', icon='star', layout=widgets.Layout(width='20%')) # button width fixed at 20%
output_3 = widgets.Output()
def on_button_clicked_3(b):
    """Tab 4 button handler: show hybrid recommendations for the entered user id.

    Args:
        b: The clicked button (unused).
    """
    with output_3:
        clear_output()
        requested_user = user_id_input_3.value
        how_many = top_n_slider_3.value
        heading = Markdown(f"### 3. Hybrid Recommendations (For User {requested_user})")
        display(heading)
        display(recommend_movies(userId=requested_user, top_n=how_many))
button_3.on_click(on_button_clicked_3)

# Layout
tab3_content = widgets.VBox([
    user_id_input_3, 
    top_n_slider_3, 
    button_3, 
    output_3
])

# --- Assemble the four test panels into one tabbed container ---
app_tabs = widgets.Tab()
app_tabs.children = [tab1_content, tab2_1_content, tab2_2_content, tab3_content]

# --- Tab captions, in the same order as the children above ---
app_tabs.set_title(0, 'Test 1: Content-Based')
app_tabs.set_title(1, 'Test 2.1: CF (User)')
app_tabs.set_title(2, 'Test 2.2: CF (Movie)')
app_tabs.set_title(3, 'Test 3: Hybrid')

# --- Render the dashboard ---
display(app_tabs)

[INFO] 2025-m-d 16:29:44 | Building 4-Tab Interactive Test Dashboard...


Tab(children=(VBox(children=(Text(value='', description='Search Movie:', layout=Layout(width='95%'), placehold‚Ä¶

## ตัวอย่าง Interactive Test
<table>
  <tr>
    <td align="center">
      <img src="./images/test1.png" alt="‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå Test 1: Content-Based" width="400">
      <br>
      <b>Test 1: Content-Based (Input: Movie)</b>
    </td>
    <td align="center">
      <img src="./images/test2.1.png" alt="‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå Test 2.1: CF (User)" width="400">
      <br>
      <b>Test 2.1: Collaborative Filtering (Input: User)</b>
    </td>
  </tr>
  <tr>
    <td align="center">
      <img src="./images/test2.2.png" alt="ผลลัพธ์ Test 2.2: CF (Movie)" width="400">
      <br>
      <b>Test 2.2: Collaborative Filtering (Input: Movie)</b>
    </td>
    <td align="center">
      <img src="./images/test3.png" alt="‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå Test 3: Hybrid" width="400">
      <br>
      <b>Test 3: Hybrid Model (Input: User)</b>
    </td>
  </tr>
</table>

# Create Feedback File (Run once)

In [6]:
# Location and column layout of the feedback CSV. Rows from this file are later
# appended to ratings_cleaned_f.csv without a header, so the column order must
# match that file.
FEEDBACK_FILE_PATH = PROCESSED_PATH.joinpath("user_feedback.csv")
FEEDBACK_COLUMNS = ['userId', 'movieId', 'rating', 'timestamp']

# Create a header-only file on first use; leave an existing file alone.
if FEEDBACK_FILE_PATH.exists():
    log(f"Feedback file already exists: {FEEDBACK_FILE_PATH}")
else:
    log(f"Creating new feedback file at: {FEEDBACK_FILE_PATH}")
    header_only = pd.DataFrame(columns=FEEDBACK_COLUMNS)
    header_only.to_csv(FEEDBACK_FILE_PATH, index=False)

[INFO] 2025-m-d 16:29:49 | Feedback file already exists: C:\Users\nonth\Documents\movie_predict_move\processed\user_feedback.csv


# ⭐️ Feedback Widget

In [7]:
log("Building feedback widgets...")

# 1. ‡∏™‡∏£‡πâ‡∏≤‡∏á Widgets

# User id field for the feedback form.
# NOTE(review): the rendered widget in this notebook's output is
# IntText(value=0, ...) with no placeholder, so value=None / placeholder do not
# appear to take effect here; an untouched field reads as 0. Confirm against the
# installed ipywidgets version.
user_id_input = widgets.IntText(
    value=None,  # intended to leave the field empty so the placeholder shows
    placeholder='‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏Å‡∏£‡∏≠‡∏Å User ID', # intended hint text asking for a User ID
    description='User ID:',
    layout=widgets.Layout(width='30%')
)

search_box = widgets.Text(
    value='',
    placeholder='‡∏û‡∏¥‡∏°‡∏û‡πå‡∏ä‡∏∑‡πà‡∏≠‡∏´‡∏ô‡∏±‡∏á‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤...',
    description='‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤‡∏´‡∏ô‡∏±‡∏á:',
    layout=widgets.Layout(width='95%')
)

# ‡∏Å‡∏•‡πà‡∏≠‡∏á‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡πÅ‡∏™‡∏î‡∏á‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤
search_output = widgets.Output()

# Dropdown ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏´‡∏ô‡∏±‡∏á (‡∏à‡∏∞‡∏ñ‡∏π‡∏Å‡πÄ‡∏ï‡∏¥‡∏°‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤)
movie_dropdown = widgets.Dropdown(
    options=[],
    description='‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏´‡∏ô‡∏±‡∏á:',
    layout=widgets.Layout(width='95%')
)

# Slider ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡πÉ‡∏´‡πâ‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô
rating_slider = widgets.FloatSlider(
    value=2.5,
    min=0.5,
    max=5.0,
    step=0.5,
    description='‡πÉ‡∏´‡πâ‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô:',
    readout_format='.1f'
)

submit_button = widgets.Button(
    description='‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å Feedback',
    button_style='success',
    icon='save'
)

# ‡∏Å‡∏•‡πà‡∏≠‡∏á‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡πÅ‡∏™‡∏î‡∏á‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å
submit_output = widgets.Output()

# --- Logic ---

# 2. Logic ‡∏Å‡∏≤‡∏£‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤
def on_search_change(change):
    """Refresh the feedback form's movie dropdown whenever the search text changes.

    Args:
        change: Change dict delivered by ``observe``; ``change['new']`` is the
            current search text.

    Queries shorter than 3 characters only clear the dropdown. Otherwise the
    dropdown receives ``(title, movieId)`` options, so the selected value is the
    movieId, and a short preview is shown in ``search_output``.
    """
    with search_output:
        clear_output()
        query = change['new']
        if len(query) < 3:  # search only once at least 3 characters are typed
            movie_dropdown.options = []
            return

        # regex=False: the text is typed by a user, so "(", "[", "*" and friends
        # must be matched literally instead of raising or matching unexpectedly.
        results = movies_global[movies_global['title'].str.contains(query, case=False, na=False, regex=False)]

        if results.empty:
            print(f"‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏´‡∏ô‡∏±‡∏á‡∏ó‡∏µ‡πà‡∏ï‡∏£‡∏á‡∏Å‡∏±‡∏ö '{query}'")
            movie_dropdown.options = []
        else:
            # Options are (label, value) pairs: title shown, movieId returned.
            movie_options = list(zip(results['title'], results['movieId']))
            movie_dropdown.options = movie_options
            print(f"‡∏û‡∏ö {len(movie_options)} ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£:")
            display(results[['title']].head())

# 3. Logic ‡∏Å‡∏≤‡∏£‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å
def on_submit_clicked(b):
    """Append one feedback row (userId, movieId, rating, timestamp) to the feedback CSV.

    Args:
        b: The clicked button (unused).

    Reads ``user_id_input``, ``movie_dropdown`` and ``rating_slider``; writes to
    ``FEEDBACK_FILE_PATH``. Nothing is written when no movie is selected or no
    usable user id has been entered.
    """
    with submit_output:
        clear_output()

        # Collect the form values.
        user_id = user_id_input.value
        movie_id = movie_dropdown.value
        raw_rating = rating_slider.value
        rating = round(raw_rating, 1)

        if movie_id is None:
            log("‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏´‡∏ô‡∏±‡∏á‡∏Å‡πà‡∏≠‡∏ô‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å", "WARN")
            return

        # The user id field renders with value 0 when nothing was entered, so 0
        # (and None / negatives) means "no user given" rather than a real id.
        # NOTE(review): assumes real user ids are positive - confirm against the
        # ratings data.
        if user_id is None or user_id <= 0:
            log("Please enter a valid (positive) User ID before saving.", "WARN")
            return

        log(f"Submitting: User {user_id}, Movie {movie_id}, Rating {rating}")

        # Key order matches the feedback file's column order.
        new_feedback = {
            'userId': user_id,
            'movieId': movie_id,
            'rating': rating,
            'timestamp': int(datetime.now().timestamp())
        }

        df_new = pd.DataFrame([new_feedback])

        # Append; write the header only when the file is missing or empty, so a
        # later read_csv never mistakes the first rating for the header row.
        needs_header = (not FEEDBACK_FILE_PATH.exists()) or FEEDBACK_FILE_PATH.stat().st_size == 0
        df_new.to_csv(FEEDBACK_FILE_PATH, mode='a', header=needs_header, index=False)

        log(f"Feedback for User {user_id} saved successfully!", "INFO")
        # Show the selected title (the dropdown label); fall back to the id.
        saved_label = movie_dropdown.label or movie_id
        print(f"‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•: {saved_label} | {rating} ‡∏î‡∏≤‡∏ß")

# 4. ‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏°‡πÇ‡∏¢‡∏á Events
search_box.observe(on_search_change, names='value')
submit_button.on_click(on_submit_clicked)

# 5. ‡πÅ‡∏™‡∏î‡∏á‡∏ú‡∏• Widgets
display(Markdown("### 4. ‡πÉ‡∏´‡πâ‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô Feedback (‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÉ‡∏ä‡πâ Retrain)"))
display(user_id_input)
display(search_box, search_output, movie_dropdown, rating_slider, submit_button, submit_output)

[INFO] 2025-m-d 16:29:57 | Building feedback widgets...


### 4. ‡πÉ‡∏´‡πâ‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô Feedback (‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÉ‡∏ä‡πâ Retrain)

IntText(value=0, description='User ID:', layout=Layout(width='30%'))

Text(value='', description='‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤‡∏´‡∏ô‡∏±‡∏á:', layout=Layout(width='95%'), placeholder='‡∏û‡∏¥‡∏°‡∏û‡πå‡∏ä‡∏∑‡πà‡∏≠‡∏´‡∏ô‡∏±‡∏á‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤...')

Output()

Dropdown(description='‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏´‡∏ô‡∏±‡∏á:', layout=Layout(width='95%'), options=(), value=None)

FloatSlider(value=2.5, description='‡πÉ‡∏´‡πâ‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô:', max=5.0, min=0.5, readout_format='.1f', step=0.5)

Button(button_style='success', description='‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å Feedback', icon='save', style=ButtonStyle())

Output()

## ตัวอย่าง Feedback Widget
<img src="./images/feedback_ui.png" alt="Feedback widget UI" width="40%">

## ตัวอย่างข้อมูล user_feedback.csv
<img src="./images/feedback_csv_example.png" alt="user_feedback.csv example" width="60%">

# ⚙️ Retrain Master Function

In [10]:
# Cell 11: ‚öôÔ∏è Retrain Master Function (‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç Memory Error)

def run_retrain_pipeline(k: int = 50):
    """
    Fold pending user feedback into the ratings file and retrain the SVD model.

    (Optimized for memory)
    1. Load the feedback file; abort when it holds no rows.
    2. Append the feedback rows to ratings_cleaned_f.csv on disk
       (append-then-read instead of pd.concat on two in-memory frames).
    3. Reset the feedback file to a header-only CSV.
    4. Load the combined ratings file once, with compact dtypes.
    5. Deduplicate in memory, keeping the latest row per (userId, movieId).
    6. Write the deduplicated ratings back to the same file (via a temp file).
    7. Build the id->index mappings, per-user means and the sparse matrix.
    8. Train the SVD model and save its artifacts to MODEL_PATH.

    Args:
        k: value forwarded to ``train_svd_model`` as its ``k`` argument.
           Defaults to 50, the value that used to be hard-coded.

    Returns:
        True when the model was retrained and saved. False when there was no
        feedback, when ``train_svd_model`` returned a falsy value, or when any
        exception occurred (the exception is logged, not re-raised).
    """
    log("--- STARTING RETRAIN PIPELINE ---")

    ratings_path = CLEANED_PATH / "ratings_cleaned_f.csv"

    try:
        # 1. Load the (new) feedback first.
        log("Checking for new feedback...")
        df_new = pd.DataFrame(columns=FEEDBACK_COLUMNS)
        if FEEDBACK_FILE_PATH.exists():
            df_new = pd.read_csv(FEEDBACK_FILE_PATH)

        # 2. Nothing to do without new rows.
        if df_new.empty:
            log("No new feedback to process. Retrain aborted.", "INFO")
            return False

        # --- Append-then-read (avoids pd.concat) ---
        log("New feedback found. Appending to main ratings file...")

        # The append below writes no header, so the feedback columns must be in
        # the same order as the ratings file. When the ratings header is fully
        # covered by the feedback columns, select/reorder them to match;
        # otherwise keep the original positional behaviour and say so.
        ratings_exists = ratings_path.exists()
        if ratings_exists:
            ratings_columns = list(pd.read_csv(ratings_path, nrows=0).columns)
            if set(ratings_columns).issubset(df_new.columns):
                df_new = df_new[ratings_columns]
            else:
                log("Feedback columns do not cover the ratings file header; "
                    "appending positionally.", "WARN")

        # 3. Append the feedback to the ratings file on disk. A header is only
        #    written when the file is being created by this call.
        df_new.to_csv(ratings_path, mode='a', header=not ratings_exists, index=False)

        # 4. Clear the feedback file (its rows now live in the ratings file).
        log(f"Clearing feedback file: {FEEDBACK_FILE_PATH}...")
        pd.DataFrame(columns=FEEDBACK_COLUMNS).to_csv(FEEDBACK_FILE_PATH, index=False)

        # 5. Load the combined file with compact dtypes to save RAM.
        log("Loading combined ratings file (with optimized types)...")
        DTYPE_MAP = {
            'userId': 'int32',    # instead of int64
            'movieId': 'int32',   # instead of int64
            'rating': 'float32',  # instead of float64
            'timestamp': 'int64'  # kept wide
        }

        # This is the only place the large file is loaded.
        combined_ratings = pd.read_csv(ratings_path, dtype=DTYPE_MAP)

        log("[DEBUG-1] Load complete. Starting Sort/Deduplicate...")

        # 6. Deduplicate in memory. A stable sort keeps file order among rows
        #    with equal timestamps, so keep='last' deterministically prefers
        #    the most recently appended row.
        combined_ratings = combined_ratings.sort_values(
            by='timestamp', ascending=True, kind='mergesort'
        )
        combined_ratings = combined_ratings.drop_duplicates(subset=['userId', 'movieId'], keep='last')
        log(f"Total ratings for retrain (deduplicated): {len(combined_ratings)}")

        # 7. Save the deduplicated data back. Writing to a sibling temp file and
        #    renaming it over the target means a failure mid-write cannot
        #    truncate the only copy of the ratings. A leftover temp file from a
        #    failed run is simply overwritten next time.
        log("Saving cleaned, deduplicated data back to file...")
        tmp_path = ratings_path.with_name(ratings_path.name + ".tmp")
        combined_ratings.to_csv(tmp_path, index=False)
        tmp_path.replace(ratings_path)

        # 8. Build mappings and the matrix from the frame that is still in RAM.
        log("[DEBUG-3] Building Mappings and User Mean...")
        movie_ids = np.sort(combined_ratings['movieId'].unique())
        movie_index = {mid: i for i, mid in enumerate(movie_ids)}
        user_ids = np.sort(combined_ratings['userId'].unique())
        user_index = {uid: i for i, uid in enumerate(user_ids)}
        mean_ratings_series = combined_ratings.groupby('userId')['rating'].mean()
        # Reorder the means so position i corresponds to user_ids[i].
        user_mean = mean_ratings_series.loc[user_ids].to_numpy()

        log("[DEBUG-4] Building Sparse Matrix rows/cols...")
        rows = combined_ratings['userId'].map(user_index).to_numpy()
        cols = combined_ratings['movieId'].map(movie_index).to_numpy()
        data = combined_ratings['rating'].to_numpy(dtype='float32')

        user_movie_matrix = csr_matrix((data, (rows, cols)),
                                       shape=(len(user_index), len(movie_index)))
        log(f"New matrix shape: {user_movie_matrix.shape}")

        log("[DEBUG-5] Deleting DataFrame to free memory before SVD...")
        del combined_ratings, rows, cols, data  # drop the large intermediates
        gc.collect()

        # 9. Train SVD.
        svd_artifacts = train_svd_model(user_movie_matrix, user_mean, user_index, movie_index, k=k)

        if not svd_artifacts:
            log("Retrain failed. Aborting.", "ERROR")
            return False

        # 10. Save the artifacts (the CSV has already been written above).
        save_svd_artifacts(svd_artifacts, MODEL_PATH)

        log("--- RETRAIN PIPELINE COMPLETE ---")
        return True

    except Exception as e:
        log(f"An error occurred during retraining: {e}", "ERROR")
        return False

# 🤖 Retrain Button

In [11]:
# Button that starts retraining; the red "danger" style warns that it takes a long time.
retrain_button = widgets.Button(description='Retrain Model', button_style='danger', icon='cogs')

# Output area used as the context for everything the click handler logs.
retrain_output = widgets.Output()

def on_retrain_clicked(b):
    """
    Click handler for the retrain button.

    Runs ``run_retrain_pipeline()`` and, when it reports success, reloads the
    saved SVD artifacts from MODEL_PATH into the notebook-level globals
    (U, Sigma, Vt, svd_user_mean, svd_user_index, svd_movie_index,
    svd_reverse_user_index, svd_reverse_movie_index).

    The globals are only reassigned after every artifact has been read
    successfully, so a failed reload leaves the previously loaded model fully
    intact instead of a mix of old and new pieces.

    Args:
        b: the object passed by ``on_click``; not used.
    """
    global U, Sigma, Vt, svd_user_mean, svd_user_index, svd_movie_index, svd_reverse_user_index, svd_reverse_movie_index

    with retrain_output:
        clear_output()
        log("--- RETRAINING STARTED ---")
        log("‡∏ô‡∏µ‡πà‡∏≠‡∏≤‡∏à‡πÉ‡∏ä‡πâ‡πÄ‡∏ß‡∏•‡∏≤ 1-5 ‡∏ô‡∏≤‡∏ó‡∏µ ‡∏Ç‡∏∂‡πâ‡∏ô‡∏≠‡∏¢‡∏π‡πà‡∏Å‡∏±‡∏ö‡∏Ñ‡∏≠‡∏°‡∏Ç‡∏≠‡∏á‡∏Ñ‡∏∏‡∏ì...")

        # 1. Run the training (which writes the new artifact files to disk).
        success = run_retrain_pipeline()

        if not success:
            log("Retrain failed. Please check errors.", "ERROR")
            return

        log("Retrain successful. Reloading new model into memory...")

        # 2. (Important) Load the new model into the globals right away by
        #    calling load_svd_artifacts here rather than re-running the
        #    loading cell by hand.
        try:
            artifacts = load_svd_artifacts(MODEL_PATH)
            # Pull out every required entry before touching any global: a
            # missing key raises here, while the old model is still in place.
            new_state = (
                artifacts["U"],
                artifacts["Sigma"],
                artifacts["Vt"],
                artifacts["user_mean"],
                artifacts["user_index"],
                artifacts["movie_index"],
                artifacts["reverse_user_index"],
                artifacts["reverse_movie_index"],
            )
        except Exception as e:
            log(f"Failed to reload new model: {e}", "ERROR")
            return

        # Swap the whole model in one statement.
        (U, Sigma, Vt, svd_user_mean, svd_user_index, svd_movie_index,
         svd_reverse_user_index, svd_reverse_movie_index) = new_state

        log("--- üöÄ NEW MODEL LOADED ---")
        log("‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÉ‡∏´‡∏°‡πà‡∏û‡∏£‡πâ‡∏≠‡∏°‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô‡πÅ‡∏•‡πâ‡∏ß! ‡∏Ñ‡∏∏‡∏ì‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ Test ‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢")

# Attach the click handler, then render the section heading followed by the
# button and the output area it logs into.
retrain_button.on_click(on_retrain_clicked)

display(
    Markdown("### 5. Retrain Model (‡πÉ‡∏ä‡πâ Feedback ‡πÉ‡∏´‡∏°‡πà)"),
    retrain_button,
    retrain_output,
)

### 5. Retrain Model (‡πÉ‡∏ä‡πâ Feedback ‡πÉ‡∏´‡∏°‡πà)

Button(button_style='danger', description='Retrain Model', icon='cogs', style=ButtonStyle())

Output()

## ตัวอย่างการแสดงผลเมื่อ Retrain สำเร็จ
<img src="./images/retrain_modelling_example.png" alt="retrain_modelling" width="80%">

# for ETL

In [None]:
import sqlite3

def export_to_sqlite(csv_path="processed/cleaned/ratings_cleaned_f.csv",
                     db_path="processed/cleaned/ratings.db",
                     chunk_size=1000000):
    """
    Export the userId/movieId columns of the ratings CSV into a SQLite file.

    The database is rebuilt from scratch on every call. It is assembled in a
    temporary file next to ``db_path`` and only moved into place once the
    table and its index are complete, so a missing CSV or a failure halfway
    through never destroys or truncates an existing database.

    Args:
        csv_path: ratings CSV to read (str or path-like). Must contain the
            columns ``userId`` and ``movieId``. The default is relative to the
            current working directory.
        db_path: SQLite file to (re)create (str or path-like).
        chunk_size: number of CSV rows read and inserted per batch, to keep
            memory usage bounded.

    Raises:
        FileNotFoundError: if ``csv_path`` does not exist.
        Any error raised by pandas or sqlite3 while reading or writing is
        propagated after the temporary database has been removed.
    """
    print("üì¶ Starting Export Pipeline: CSV -> SQLite...")

    # 1. Resolve the file paths (accept pathlib.Path as well as str).
    csv_path = os.fspath(csv_path)
    db_path = os.fspath(db_path)

    # Fail before touching any existing database when there is nothing to export.
    if not os.path.isfile(csv_path):
        raise FileNotFoundError(f"Ratings CSV not found: {csv_path}")

    # 2. Build the new database in a scratch file; drop any stale leftover
    #    so the export always starts from an empty database.
    tmp_db_path = db_path + ".tmp"
    if os.path.exists(tmp_db_path):
        os.remove(tmp_db_path)

    total = 0
    conn = sqlite3.connect(tmp_db_path)
    try:
        # 3. Stream the CSV into the DB chunk by chunk so RAM does not fill up.
        #    Only the columns that are actually used are read, to save space.
        for chunk in pd.read_csv(csv_path, chunksize=chunk_size, usecols=['userId', 'movieId']):
            chunk.to_sql('ratings', conn, if_exists='append', index=False)
            total += len(chunk)
            print(f"   ...exported {total:,} rows")

        # 4. Index userId so per-user lookups do not scan the whole table.
        print("   Creating Index...")
        conn.execute("CREATE INDEX idx_user ON ratings(userId)")
        conn.commit()
    except BaseException:
        # Release the file handle and discard the partial database.
        conn.close()
        if os.path.exists(tmp_db_path):
            os.remove(tmp_db_path)
        raise
    else:
        conn.close()

    # Replace the previous database only now that the new one is complete.
    os.replace(tmp_db_path, db_path)
    print(f"‚úÖ Export Complete! Ready to upload: {db_path}")

# Run the export.
export_to_sqlite()