# Đánh giá Mô hình Hệ Gợi ý Hybrid

## Mục tiêu
Đánh giá hiệu suất của các mô hình:
- Collaborative Filtering (CF)
- Content-Based Filtering (CB)
- Hybrid System

## Metrics
- RMSE (Root Mean Squared Error)
- MAE (Mean Absolute Error)


In [1]:
# Import the required libraries
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import warnings
# Silence every warning for the rest of the session. This keeps the notebook
# output short, but it also hides deprecation and numerical warnings.
warnings.filterwarnings('ignore')

# Display settings: show every DataFrame column and up to 100 rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("ƒê√£ import c√°c th∆∞ vi·ªán th√†nh c√¥ng!")


ƒê√£ import c√°c th∆∞ vi·ªán th√†nh c√¥ng!


## 1. Load Models và Dữ liệu


In [2]:
# Load the trained models and their supporting data from ../models
print("=" * 60)
print("ƒêANG LOAD MODELS V√Ä D·ªÆ LI·ªÜU...")
print("=" * 60)


def _load_pickle(path):
    """Return the object unpickled from the file at *path*.

    NOTE: unpickling can execute arbitrary code, so only point this at
    artifacts produced by this project.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Fitted estimators (SVD model, then the TF-IDF vectorizer)
svd = _load_pickle('../models/svd_model.pkl')
tfidf = _load_pickle('../models/tfidf_vectorizer.pkl')

# Latent factor matrices
user_factors = np.load('../models/user_factors.npy')
item_factors = np.load('../models/item_factors.npy')

# Lookups from raw ids to matrix positions
movie_id_to_idx = _load_pickle('../models/movie_id_to_idx.pkl')
user_id_to_idx = _load_pickle('../models/user_id_to_idx.pkl')
tfidf_movie_id_to_row = _load_pickle('../models/tfidf_movie_id_to_row.pkl')

# Pickled DataFrames
movies_df_clean = pd.read_pickle('../models/movies_df_clean.pkl')
train_df = pd.read_pickle('../models/train_df.pkl')
tfidf_df = pd.read_pickle('../models/tfidf_df.pkl')

# Sparse TF-IDF matrix built from the whole of tfidf_df.
# NOTE(review): every column of tfidf_df is treated as a feature here;
# confirm the frame carries no id column.
tfidf_matrix = csr_matrix(tfidf_df.values)

# Summary of what was loaded
print(f"\n‚úì ƒê√£ load th√†nh c√¥ng!")
print(f"  - User factors shape: {user_factors.shape}")
print(f"  - Item factors shape: {item_factors.shape}")
print(f"  - TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"  - Train ratings: {len(train_df):,}")


ƒêANG LOAD MODELS V√Ä D·ªÆ LI·ªÜU...

‚úì ƒê√£ load th√†nh c√¥ng!
  - User factors shape: (668, 50)
  - Item factors shape: (3854, 50)
  - TF-IDF matrix shape: (3855, 21)
  - Train ratings: 75,296


## 2. Tạo Test Set


In [3]:
# Rebuild the held-out test set from the raw ratings file. The cleaning steps
# and the seeded split are meant to mirror train_models.py.
print("=" * 60)
print("T·∫†O TEST SET...")
print("=" * 60)

# Raw ratings
ratings_df = pd.read_csv('../data/ratings.csv')

# Cleaning, in order: drop incomplete rows, then keep only the last rating of
# each (userId, movieId) pair.
pair_cols = ['userId', 'movieId']
ratings_df_clean = (
    ratings_df
    .dropna(subset=pair_cols + ['rating'])
    .drop_duplicates(subset=pair_cols, keep='last')
)

# Keep ratings inside [0.5, 5.0] ...
ratings_df_clean = ratings_df_clean[ratings_df_clean['rating'].between(0.5, 5.0)]

# ... and only for movies present in movies_df_clean.
known_movie = ratings_df_clean['movieId'].isin(movies_df_clean['movieId'])
ratings_df_clean = ratings_df_clean[known_movie]

# 80/20 split with the fixed seed random_state=42
train_df_new, test_df = train_test_split(
    ratings_df_clean, test_size=0.2, random_state=42
)

print(f"Train set: {len(train_df_new):,} ratings")
print(f"Test set: {len(test_df):,} ratings")
print(f"\n‚úì ƒê√£ t·∫°o test set th√†nh c√¥ng!")


T·∫†O TEST SET...
Train set: 75,296 ratings
Test set: 18,825 ratings

‚úì ƒê√£ t·∫°o test set th√†nh c√¥ng!


## 3. Định nghĩa các hàm dự đoán


In [4]:
# Collaborative Filtering prediction
def predict_rating_cf(user_id, movie_id, user_factors, item_factors, user_id_to_idx, movie_id_to_idx, train_df):
    """Predict a rating from the latent user and item factors.

    The score is the dot product of the user's factor row and the movie's
    factor row, clipped to the rating scale [0.5, 5.0].

    Args:
        user_id: Raw user id, looked up in ``user_id_to_idx``.
        movie_id: Raw movie id, looked up in ``movie_id_to_idx``.
        user_factors: Array indexed by user position.
        item_factors: Array indexed by movie position.
        user_id_to_idx: Mapping from user id to row of ``user_factors``.
        movie_id_to_idx: Mapping from movie id to row of ``item_factors``.
        train_df: Training ratings; only its ``rating`` column is read.

    Returns:
        The clipped prediction, or the mean training rating when either id
        is missing from its mapping.
    """
    is_known = user_id in user_id_to_idx and movie_id in movie_id_to_idx
    if not is_known:
        # Cold start: fall back to the global mean of the training ratings.
        return train_df['rating'].mean()

    user_vec = user_factors[user_id_to_idx[user_id]]
    item_vec = item_factors[movie_id_to_idx[movie_id]]
    return np.clip(np.dot(user_vec, item_vec), 0.5, 5.0)

# Content-Based Filtering prediction
def predict_rating_cb(user_id, movie_id, train_df, tfidf_matrix, tfidf_movie_id_to_row):
    """Predict a rating as a similarity-weighted mean of the user's ratings.

    Each movie the user rated in ``train_df`` is weighted by the dot product
    of its TF-IDF row with the target movie's TF-IDF row. The weighted mean
    is clipped to [0.5, 5.0].

    Args:
        user_id: Id matched against ``train_df['userId']``.
        movie_id: Target movie id; converted with ``int()`` before lookup.
        train_df: Training ratings with ``userId``, ``movieId`` and
            ``rating`` columns.
        tfidf_matrix: Sparse matrix with one row per movie.
        tfidf_movie_id_to_row: Mapping from movie id to ``tfidf_matrix`` row.

    Returns:
        float: The prediction. The mean training rating is returned when the
        user has no ratings, the target movie has no TF-IDF row, none of the
        rated movies has one, or all similarities are zero.
    """
    def _global_mean():
        # Shared fallback for every "cannot predict" situation.
        return float(train_df['rating'].mean())

    history = train_df.loc[train_df['userId'] == user_id, ['movieId', 'rating']]
    if history.empty:
        return _global_mean()

    target_row = tfidf_movie_id_to_row.get(int(movie_id))
    if target_row is None:
        return _global_mean()

    # Rated movies without a TF-IDF row map to NaN and are ignored.
    mapped_rows = history['movieId'].map(tfidf_movie_id_to_row)
    has_row = mapped_rows.notna()
    if not has_row.any():
        return _global_mean()

    profile_rows = mapped_rows[has_row].astype(int).to_numpy()
    profile_ratings = history.loc[has_row, 'rating'].to_numpy(dtype=float)

    # One similarity per rated movie: (k x d) @ (d x 1) -> k values
    target_vec = tfidf_matrix[target_row]
    weights = (tfidf_matrix[profile_rows] @ target_vec.T).toarray().ravel()

    total_weight = np.abs(weights).sum()
    if total_weight == 0:
        return _global_mean()

    estimate = float((weights * profile_ratings).sum() / total_weight)
    return float(np.clip(estimate, 0.5, 5.0))

# Hybrid prediction
def predict_rating_hybrid(user_id, movie_id,
                         user_factors, item_factors, user_id_to_idx, movie_id_to_idx,
                         train_df, tfidf_matrix, tfidf_movie_id_to_row,
                         cf_weight=0.6, cb_weight=0.4):
    """Blend the collaborative and content-based predictions linearly.

    Args:
        user_id, movie_id: Ids of the (user, movie) pair to score.
        user_factors, item_factors, user_id_to_idx, movie_id_to_idx:
            Passed through to ``predict_rating_cf``.
        train_df: Passed to both component predictors.
        tfidf_matrix, tfidf_movie_id_to_row: Passed to ``predict_rating_cb``.
        cf_weight: Weight of the collaborative prediction.
        cb_weight: Weight of the content-based prediction.

    Returns:
        Tuple ``(hybrid, cf, cb)``, where
        ``hybrid = cf_weight * cf + cb_weight * cb``. The blend itself is
        not clipped.
    """
    collaborative = predict_rating_cf(
        user_id, movie_id, user_factors, item_factors,
        user_id_to_idx, movie_id_to_idx, train_df,
    )
    content = predict_rating_cb(
        user_id, movie_id, train_df, tfidf_matrix, tfidf_movie_id_to_row,
    )

    blended = cf_weight * collaborative + cb_weight * content
    return blended, collaborative, content

print("‚úì ƒê√£ ƒë·ªãnh nghƒ©a c√°c h√†m d·ª± ƒëo√°n!")


‚úì ƒê√£ ƒë·ªãnh nghƒ©a c√°c h√†m d·ª± ƒëo√°n!


## 4. Đánh giá trên Test Set


In [None]:
# Evaluate the three predictors on the test set
print("=" * 60)
print("ƒêANG ƒê√ÅNH GI√Å TR√äN TEST SET...")
print("=" * 60)
print(f"S·ªë l∆∞·ª£ng samples trong test set: {len(test_df):,}")
print("ƒêang t√≠nh to√°n predictions...")

# For a quick run, evaluate a random sample instead of the full test set:
# test_df_sample = test_df.sample(n=min(1000, len(test_df)), random_state=42)
test_df_sample = test_df  # full test set

# One entry per test sample, in test-set order
predictions_cf = []
predictions_cb = []
predictions_hybrid = []
actual_ratings = []

# Iterate the three needed columns directly; this avoids building a Series
# per row as iterrows() does.
for user_id, movie_id, actual_rating in zip(
    test_df_sample['userId'],
    test_df_sample['movieId'],
    test_df_sample['rating'],
):
    # predict_rating_hybrid already computes the CF and CB predictions and
    # returns them next to the blend. Calling it once per row avoids running
    # each component predictor twice (the CB predictor filters all of
    # train_df on every call).
    hybrid_pred, cf_pred, cb_pred = predict_rating_hybrid(
        user_id, movie_id,
        user_factors, item_factors, user_id_to_idx, movie_id_to_idx,
        train_df, tfidf_matrix, tfidf_movie_id_to_row
    )

    predictions_cf.append(cf_pred)
    predictions_cb.append(cb_pred)
    predictions_hybrid.append(hybrid_pred)
    actual_ratings.append(actual_rating)

    # Progress indicator every 1000 samples
    if (len(predictions_cf) % 1000 == 0):
        print(f"  ƒê√£ x·ª≠ l√Ω: {len(predictions_cf):,} / {len(test_df_sample):,} samples")

print(f"\n‚úì ƒê√£ ho√†n th√†nh d·ª± ƒëo√°n cho {len(predictions_cf):,} samples!")


ƒêANG ƒê√ÅNH GI√Å TR√äN TEST SET...
S·ªë l∆∞·ª£ng samples trong test set: 18,825
ƒêang t√≠nh to√°n predictions...
  ƒê√£ x·ª≠ l√Ω: 1,000 / 18,825 samples
  ƒê√£ x·ª≠ l√Ω: 2,000 / 18,825 samples
  ƒê√£ x·ª≠ l√Ω: 3,000 / 18,825 samples
  ƒê√£ x·ª≠ l√Ω: 4,000 / 18,825 samples
  ƒê√£ x·ª≠ l√Ω: 5,000 / 18,825 samples
  ƒê√£ x·ª≠ l√Ω: 6,000 / 18,825 samples
  ƒê√£ x·ª≠ l√Ω: 7,000 / 18,825 samples
  ƒê√£ x·ª≠ l√Ω: 8,000 / 18,825 samples
  ƒê√£ x·ª≠ l√Ω: 9,000 / 18,825 samples
  ƒê√£ x·ª≠ l√Ω: 10,000 / 18,825 samples
  ƒê√£ x·ª≠ l√Ω: 11,000 / 18,825 samples
  ƒê√£ x·ª≠ l√Ω: 12,000 / 18,825 samples
  ƒê√£ x·ª≠ l√Ω: 13,000 / 18,825 samples


## 5. Tính RMSE và MAE


In [None]:
# Turn the collected lists into NumPy arrays
actual_ratings, predictions_cf, predictions_cb, predictions_hybrid = (
    np.array(values)
    for values in (actual_ratings, predictions_cf, predictions_cb, predictions_hybrid)
)

# RMSE and MAE for each method
print("=" * 60)
print("K·∫æT QU·∫¢ ƒê√ÅNH GI√Å")
print("=" * 60)


def _rmse_mae(y_true, y_pred):
    """Return the pair (RMSE, MAE) of *y_pred* measured against *y_true*."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    return rmse, mae


rmse_cf, mae_cf = _rmse_mae(actual_ratings, predictions_cf)              # Collaborative Filtering
rmse_cb, mae_cb = _rmse_mae(actual_ratings, predictions_cb)              # Content-Based Filtering
rmse_hybrid, mae_hybrid = _rmse_mae(actual_ratings, predictions_hybrid)  # Hybrid System

# Summary table, one row per method
results_df = pd.DataFrame({
    'Ph∆∞∆°ng ph√°p': ['Collaborative Filtering', 'Content-Based Filtering', 'Hybrid System'],
    'RMSE': [rmse_cf, rmse_cb, rmse_hybrid],
    'MAE': [mae_cf, mae_cb, mae_hybrid]
})

print("\n" + results_df.to_string(index=False))
print("\n" + "=" * 60)


## 6. So sánh chi tiết


In [None]:
# Detailed comparison of the three methods
print("\n" + "=" * 60)
print("SO S√ÅNH CHI TI·∫æT")
print("=" * 60)

# Baseline: predict the mean training rating for every test sample, then
# measure its RMSE. Improvements are reported as RMSE reduction relative to
# this value, so both sides of the comparison are errors. (Comparing an RMSE
# with the mean rating itself mixes two unrelated quantities.)
baseline_prediction = float(train_df['rating'].mean())
rmse_baseline = float(np.sqrt(np.mean((actual_ratings - baseline_prediction) ** 2)))


def _pct_improvement(reference_rmse, rmse):
    """Return the reduction of *rmse* relative to *reference_rmse*, in percent."""
    if reference_rmse == 0:
        return float('nan')  # no meaningful ratio against a zero reference
    return (reference_rmse - rmse) / reference_rmse * 100


print(f"\nBaseline (train mean) RMSE: {rmse_baseline:.4f}")

print(f"\nüìä Collaborative Filtering:")
print(f"   RMSE: {rmse_cf:.4f}")
print(f"   MAE:  {mae_cf:.4f}")
print(f"   Improvement vs Baseline (mean): {_pct_improvement(rmse_baseline, rmse_cf):.2f}%")

print(f"\nüìä Content-Based Filtering:")
print(f"   RMSE: {rmse_cb:.4f}")
print(f"   MAE:  {mae_cb:.4f}")
print(f"   Improvement vs Baseline (mean): {_pct_improvement(rmse_baseline, rmse_cb):.2f}%")

print(f"\nüìä Hybrid System (CF: 0.6, CB: 0.4):")
print(f"   RMSE: {rmse_hybrid:.4f}")
print(f"   MAE:  {mae_hybrid:.4f}")
print(f"   Improvement vs CF: {_pct_improvement(rmse_cf, rmse_hybrid):.2f}%")
print(f"   Improvement vs CB: {_pct_improvement(rmse_cb, rmse_hybrid):.2f}%")

# Best method = lowest RMSE
best_method = results_df.loc[results_df['RMSE'].idxmin(), 'Ph∆∞∆°ng ph√°p']
print(f"\nüèÜ Ph∆∞∆°ng ph√°p t·ªët nh·∫•t (RMSE th·∫•p nh·∫•t): {best_method}")

# Save the results; create the output directory first so to_csv cannot fail
# on a fresh checkout where ../results does not exist yet.
os.makedirs('../results', exist_ok=True)
results_df.to_csv('../results/rmse_mae_results.csv', index=False)
print(f"\n‚úì ƒê√£ l∆∞u k·∫øt qu·∫£ v√†o: ../results/rmse_mae_results.csv")


## 7. Visualization


In [None]:
# Visualization: error comparison bar chart and actual-vs-predicted scatter
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): the 'seaborn-v0_8' style name is only available in newer
# Matplotlib releases - confirm it against the installed version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Create a figure with two side-by-side subplots
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Subplot 1: grouped bar chart comparing RMSE and MAE per method
x = np.arange(len(results_df))
width = 0.35

# RMSE bars sit left of each tick, MAE bars right of it
axes[0].bar(x - width/2, results_df['RMSE'], width, label='RMSE', alpha=0.8)
axes[0].bar(x + width/2, results_df['MAE'], width, label='MAE', alpha=0.8)
axes[0].set_xlabel('Ph∆∞∆°ng ph√°p')
axes[0].set_ylabel('Error')
axes[0].set_title('So s√°nh RMSE v√† MAE')
axes[0].set_xticks(x)
axes[0].set_xticklabels(results_df['Ph∆∞∆°ng ph√°p'], rotation=15, ha='right')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Subplot 2: scatter plot of actual vs predicted ratings (hybrid model)
axes[1].scatter(actual_ratings, predictions_hybrid, alpha=0.3, s=10)
# Dashed red diagonal = perfect prediction, spanning the actual rating range
axes[1].plot([actual_ratings.min(), actual_ratings.max()], 
             [actual_ratings.min(), actual_ratings.max()], 
             'r--', lw=2, label='Perfect Prediction')
axes[1].set_xlabel('Actual Rating')
axes[1].set_ylabel('Predicted Rating (Hybrid)')
axes[1].set_title('Actual vs Predicted Ratings (Hybrid System)')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("‚úì ƒê√£ t·∫°o visualization!")


## 8. Phân tích lỗi


In [None]:
# Detailed error analysis
print("=" * 60)
print("PH√ÇN T√çCH L·ªñI CHI TI·∫æT")
print("=" * 60)

# Signed errors (actual - predicted) for each method
errors_cf = actual_ratings - predictions_cf
errors_cb = actual_ratings - predictions_cb
errors_hybrid = actual_ratings - predictions_hybrid

# Error statistics, one row per method
error_stats = pd.DataFrame({
    'Ph∆∞∆°ng ph√°p': ['Collaborative Filtering', 'Content-Based Filtering', 'Hybrid System'],
    'Mean Error': [errors_cf.mean(), errors_cb.mean(), errors_hybrid.mean()],
    'Std Error': [errors_cf.std(), errors_cb.std(), errors_hybrid.std()],
    'Min Error': [errors_cf.min(), errors_cb.min(), errors_hybrid.min()],
    'Max Error': [errors_cf.max(), errors_cb.max(), errors_hybrid.max()],
    'RMSE': [rmse_cf, rmse_cb, rmse_hybrid],
    'MAE': [mae_cf, mae_cb, mae_hybrid]
})

print("\nTh·ªëng k√™ l·ªói:")
print(error_stats.to_string(index=False))

# Error breakdown by actual-rating range
print("\n" + "=" * 60)
print("PH√ÇN T√çCH L·ªñI THEO KHO·∫¢NG RATING")
print("=" * 60)

# Split the actual ratings into three ranges.
# include_lowest=True closes the first interval on the left, so ratings of
# exactly 0.5 land in the "Low" bin. With the pd.cut defaults the bins are
# (0.5, 2.0], (2.0, 3.5], (3.5, 5.0], which leaves 0.5 unbinned.
bins = [0.5, 2.0, 3.5, 5.0]
labels = ['Low (0.5-2.0)', 'Medium (2.0-3.5)', 'High (3.5-5.0)']
rating_bins = pd.cut(actual_ratings, bins=bins, labels=labels, include_lowest=True)

# RMSE and MAE of every method inside each range
for label in labels:
    mask = rating_bins == label
    if mask.sum() > 0:
        actual_bin = actual_ratings[mask]
        pred_cf_bin = predictions_cf[mask]
        pred_cb_bin = predictions_cb[mask]
        pred_hybrid_bin = predictions_hybrid[mask]

        rmse_cf_bin = np.sqrt(mean_squared_error(actual_bin, pred_cf_bin))
        mae_cf_bin = mean_absolute_error(actual_bin, pred_cf_bin)
        rmse_cb_bin = np.sqrt(mean_squared_error(actual_bin, pred_cb_bin))
        mae_cb_bin = mean_absolute_error(actual_bin, pred_cb_bin)
        rmse_hybrid_bin = np.sqrt(mean_squared_error(actual_bin, pred_hybrid_bin))
        mae_hybrid_bin = mean_absolute_error(actual_bin, pred_hybrid_bin)

        print(f"\n{label} (n={mask.sum():,}):")
        print(f"  CF:     RMSE={rmse_cf_bin:.4f}, MAE={mae_cf_bin:.4f}")
        print(f"  CB:     RMSE={rmse_cb_bin:.4f}, MAE={mae_cb_bin:.4f}")
        print(f"  Hybrid: RMSE={rmse_hybrid_bin:.4f}, MAE={mae_hybrid_bin:.4f}")

print("\n" + "=" * 60)
print("HO√ÄN TH√ÄNH ƒê√ÅNH GI√Å!")
print("=" * 60)
