## **Final Project - Machine Learning**
### **Movie Recommendation System**

### **I. Importing Libraries**

In [4]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import accuracy
from collections import defaultdict
import os
import sys 

### **II. Loading Data**

In [None]:
def load_data(data_dir="data"):
    """Load the MovieLens-style CSV files and wrap the ratings for Surprise.

    Args:
        data_dir: Directory that contains ``ratings.csv``, ``movies.csv`` and,
            optionally, ``tags.csv``. Defaults to ``"data"`` (relative to the
            current working directory).

    Returns:
        A 6-tuple ``(data, ratings_df, movies_df, tags_df, movie_id_to_title,
        movie_id_to_genres)``:

        * ``data`` - Surprise ``Dataset`` built from the ``userId``,
          ``movieId`` and ``rating`` columns with a 0.5-5.0 rating scale.
        * ``ratings_df`` / ``movies_df`` - the raw DataFrames as read from disk.
        * ``tags_df`` - the tags DataFrame, or an empty DataFrame with the
          columns ``userId, movieId, tag, timestamp`` when ``tags.csv`` is
          missing.
        * ``movie_id_to_title`` / ``movie_id_to_genres`` - dicts keyed by
          ``movieId``.
    """
    print("Loading data...")
    # Paths are assembled with os.path.join rather than hard-coded backslashes,
    # so the same notebook runs on Windows, Linux and macOS.
    ratings_df = pd.read_csv(os.path.join(data_dir, "ratings.csv"))
    movies_df = pd.read_csv(os.path.join(data_dir, "movies.csv"))

    # Tags are optional: fall back to an empty frame with the expected columns
    # so downstream code can rely on tags_df always being a DataFrame.
    tags_path = os.path.join(data_dir, "tags.csv")
    if os.path.exists(tags_path):
        tags_df = pd.read_csv(tags_path)
        print("Tags data loaded.")
    else:
        print("Tags file not found, proceeding without tags data.")
        tags_df = pd.DataFrame(columns=['userId', 'movieId', 'tag', 'timestamp'])

    reader = Reader(rating_scale=(0.5, 5.0))
    data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

    # Index the movies once; both lookup dicts share the same movieId keys.
    movies_by_id = movies_df.set_index('movieId')
    movie_id_to_title = movies_by_id['title'].to_dict()
    movie_id_to_genres = movies_by_id['genres'].to_dict()

    print("Data loaded and preprocessed for Surprise.")
    return data, ratings_df, movies_df, tags_df, movie_id_to_title, movie_id_to_genres


### **III. Training Model**

In [None]:
def train_svd_model(data, enable_hyperparameter_tuning=False):
    """Split the data 80/20 and fit an SVD model on the training portion.

    Args:
        data: Surprise ``Dataset`` holding the ratings.
        enable_hyperparameter_tuning: When True, run a 3-fold ``GridSearchCV``
            over ``n_epochs``, ``lr_all`` and ``reg_all`` and train the final
            model with the best-RMSE combination. When False, train with the
            SVD defaults.

    Returns:
        ``(algo, testset)`` - the model fitted on the training split and the
        held-out test split produced by ``train_test_split``.
    """
    print("Splitting data into training and testing sets...")
    trainset, testset = train_test_split(data, test_size=0.20, random_state=42)

    if enable_hyperparameter_tuning:
        print("Performing hyperparameter tuning for SVD (this may take a while)...")
        param_grid = {
            'n_epochs': [20, 30],
            'lr_all': [0.002, 0.005],
            'reg_all': [0.02, 0.1]
        }
        # NOTE(review): the grid search is fitted on the full dataset, which
        # includes the ratings that end up in `testset`, so the held-out
        # metrics reported later may be optimistic for the tuned model.
        gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
        gs.fit(data)
        best_params = gs.best_params['rmse']
        print(f"Best RMSE score: {gs.best_score['rmse']}")
        print(f"Best parameters: {best_params}")

        print("Training SVD model with best parameters on the designated trainset...")
        # Unpack every tuned parameter so keys added to param_grid later are
        # not silently dropped; random_state is pinned for reproducibility.
        algo = SVD(**{**best_params, 'random_state': 42})
    else:
        print("Training SVD model with default parameters...")
        algo = SVD(random_state=42)

    # Both branches train on the same training split.
    algo.fit(trainset)

    print("Model training complete.")
    return algo, testset

### **IV. Model Evaluation**

In [7]:
def _precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute user-averaged precision@k, recall@k and the F1 of those averages.

    An item counts as relevant when its true rating is >= ``threshold`` and as
    recommended when it is among the user's top-``k`` estimates and its
    estimated rating is >= ``threshold``. Users with nothing recommended (or
    nothing relevant) contribute 0 to the respective average.

    Returns:
        ``(avg_precision, avg_recall, f1_score)``; all 0 when ``predictions``
        is empty.
    """
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, user_ratings in user_est_true.items():
        # Highest estimated rating first; only the first k are "recommended".
        user_ratings.sort(key=lambda pair: pair[0], reverse=True)
        top_k = user_ratings[:k]

        n_rel = sum(true_r >= threshold for _, true_r in user_ratings)
        n_rec_k = sum(est >= threshold for est, _ in top_k)
        n_rel_and_rec_k = sum(
            (true_r >= threshold) and (est >= threshold) for est, true_r in top_k
        )

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    avg_precision = sum(precisions.values()) / len(precisions) if precisions else 0
    avg_recall = sum(recalls.values()) / len(recalls) if recalls else 0
    denom = avg_precision + avg_recall
    f1_score = (2 * avg_precision * avg_recall) / denom if denom > 0 else 0

    return avg_precision, avg_recall, f1_score


def evaluate_model(algo, testset, k=10, threshold=3.5):
    """Print and return RMSE plus precision/recall/F1@k on the test set.

    Args:
        algo: Fitted model exposing ``test(testset)``.
        testset: Held-out ratings to predict.
        k: Number of top-ranked predictions per user treated as recommended.
        threshold: Rating at or above which an item is relevant/recommended.

    Returns:
        Dict with the keys ``'rmse'``, ``'precision'``, ``'recall'`` and
        ``'f1'`` so callers can log or compare runs programmatically.
    """
    print("Evaluating model...")
    predictions = algo.test(testset)

    # verbose=False: the value is printed once below instead of twice.
    rmse = accuracy.rmse(predictions, verbose=False)
    print(f"RMSE on test set: {rmse}")

    precision, recall, f1 = _precision_recall_at_k(predictions, k=k, threshold=threshold)
    print(f"Precision@{k} (threshold={threshold}): {precision:.4f}")
    print(f"Recall@{k} (threshold={threshold}): {recall:.4f}")
    print(f"F1-score@{k} (threshold={threshold}): {f1:.4f}")
    print("\nNote: Qualitative evaluation (user satisfaction) would require user studies.")

    return {'rmse': rmse, 'precision': precision, 'recall': recall, 'f1': f1}


### **V. Usage Function (Get Top Recommendations)**

In [8]:
def get_top_n_recommendations(algo, user_id, ratings_df, movies_df, movie_id_to_title, movie_id_to_genres, n=10, genre_filter=None):
    """Return up to ``n`` unseen movies for ``user_id``, best prediction first.

    Args:
        algo: Fitted model exposing ``test(testset)`` and returning a list of
            predictions with ``iid`` and ``est`` attributes.
        user_id: User to recommend for.
        ratings_df: Ratings DataFrame with ``userId`` and ``movieId`` columns;
            used to exclude movies the user has already rated.
        movies_df: Movies DataFrame with a ``movieId`` column (the candidates).
        movie_id_to_title: Dict mapping movieId to title.
        movie_id_to_genres: Dict mapping movieId to its genres string.
        n: Maximum number of recommendations; values <= 0 yield ``[]``.
        genre_filter: Optional genre name or iterable of genre names. A movie
            passes if any name is a case-insensitive substring of its genres.

    Returns:
        List of dicts with the keys ``movieId``, ``title``,
        ``predicted_rating`` and ``genres``.
    """
    # Nothing requested: avoid scoring the whole catalogue and, unlike a
    # post-append length check, never return an item for n <= 0.
    if n <= 0:
        return []

    all_movie_ids = movies_df['movieId'].unique()
    rated_movie_ids = ratings_df.loc[ratings_df['userId'] == user_id, 'movieId'].unique()
    movies_to_predict_ids = np.setdiff1d(all_movie_ids, rated_movie_ids)

    # algo.test needs a "true" rating per row; 4. is a placeholder that is
    # never read back here, only the estimates are used.
    testset_for_user = [[user_id, movie_id, 4.] for movie_id in movies_to_predict_ids]
    predictions = algo.test(testset_for_user)
    predictions.sort(key=lambda pred: pred.est, reverse=True)

    # Accept a bare string as a single genre instead of iterating its characters.
    if isinstance(genre_filter, str):
        genre_filter = [genre_filter]
    wanted_genres = [g.lower() for g in genre_filter] if genre_filter else []

    top_n_recs = []
    for pred in predictions:
        movie_id = pred.iid
        genres = movie_id_to_genres.get(movie_id, "N/A")

        if wanted_genres:
            # A missing genres value (e.g. NaN from the CSV) matches no genre.
            genres_lc = genres.lower() if isinstance(genres, str) else ""
            if not any(g in genres_lc for g in wanted_genres):
                continue

        top_n_recs.append({
            'movieId': movie_id,
            'title': movie_id_to_title.get(movie_id, "Unknown Title"),
            'predicted_rating': pred.est,
            'genres': genres
        })
        if len(top_n_recs) >= n:
            break

    return top_n_recs

### **VI. User Input Interface**

In [9]:
def run_user_interface(algo, ratings_df, movies_df, movie_id_to_title, movie_id_to_genres):
    """Interactive console loop that prints recommendations for a chosen user.

    Repeatedly prompts for a user ID, a number of recommendations and an
    optional comma-separated genre list, prints the user's five highest-rated
    movies for context, then prints the recommendations returned by
    ``get_top_n_recommendations``. The loop ends when the user types ``exit``
    or when the input stream is closed.

    Args:
        algo: Fitted model passed through to ``get_top_n_recommendations``.
        ratings_df: Ratings DataFrame with ``userId``, ``movieId``, ``rating``.
        movies_df: Movies DataFrame passed through to the recommender.
        movie_id_to_title: Dict mapping movieId to title.
        movie_id_to_genres: Dict mapping movieId to genres string.

    Returns:
        None.
    """
    print("\n--- Movie Recommendation System ---")
    all_user_ids = ratings_df['userId'].unique()
    if len(all_user_ids) == 0:
        print("No users found in the ratings data.")
        return

    # Computed once; neither value changes while the loop is running.
    max_user_id = max(all_user_ids)
    known_user_ids = set(all_user_ids.tolist())

    while True:
        try:
            user_id_input = input(f"Enter User ID (e.g., 1 to {max_user_id}, or 'exit' to quit): ")
            if user_id_input.strip().lower() == 'exit':
                break
            user_id = int(user_id_input)
            if user_id not in known_user_ids:
                print("Invalid User ID. Please choose from the available IDs.")
                continue

            num_recs_input = input("Enter number of recommendations (e.g., 10): ")
            num_recs = int(num_recs_input)
            if num_recs <= 0:
                print("Number of recommendations must be a positive integer.")
                continue

            genre_pref_input = input("Enter preferred genre(s) separated by comma (e.g., Action,Comedy or leave blank for any): ").strip()
            genre_filters = [g.strip() for g in genre_pref_input.split(',') if g.strip()] if genre_pref_input else None

            print(f"\nGetting recommendations for User ID: {user_id} (Preferred Genres: {genre_filters or 'Any'})...")

            user_ratings = ratings_df[ratings_df['userId'] == user_id].sort_values(by='rating', ascending=False)
            print("\nUser's Top Rated Movies (for context):")
            # itertuples keeps movieId as an integer; iterrows would upcast the
            # mixed int/float row to float before the dict lookup.
            for row in user_ratings.head(5).itertuples(index=False):
                title = movie_id_to_title.get(row.movieId, "Unknown")
                print(f"- {title} (Rated: {row.rating})")

            recommendations = get_top_n_recommendations(
                algo, user_id, ratings_df, movies_df, movie_id_to_title, movie_id_to_genres, n=num_recs, genre_filter=genre_filters
            )

            print(f"\nTop {len(recommendations)} Recommendations for User {user_id}:")
            if recommendations:
                for i, rec in enumerate(recommendations, start=1):
                    print(f"{i}. {rec['title']} (Predicted Rating: {rec['predicted_rating']:.2f}, Genres: {rec['genres']})")
            else:
                print("No recommendations found matching your criteria.")
            print("-" * 30)

        except EOFError:
            # stdin is closed (e.g. non-interactive run): every further input()
            # would fail the same way, so leave the loop instead of spinning.
            print("\nInput stream closed. Exiting.")
            break
        except ValueError:
            print("Invalid input. Please enter numbers where expected.")
        except Exception as e:
            print(f"An error occurred: {e}")


### **VII. Usage - Results**

In [None]:
# End-to-end driver: load the data, train, evaluate, then start the interactive
# recommender. The guard lets the same code be exported and run as a script;
# the recorded output of this cell indicates it also executed in the notebook.
# The names assigned here (ratings_df, movies_df, svd_model, ...) are
# module-level, so later cells can reuse them after this cell has run.
if __name__ == "__main__":
    # Load and preprocess data (tags_df is returned but not used further below)
    surprise_data, ratings_df, movies_df, tags_df, movie_id_to_title, movie_id_to_genres = load_data()
    
    # Train the SVD model
    svd_model, testset = train_svd_model(surprise_data, enable_hyperparameter_tuning=False) # Set to True for tuning
    
    # Evaluate the model on the held-out split returned by train_svd_model
    evaluate_model(svd_model, testset)
    
    # Run the user interface (blocks on input() until the user types 'exit')
    run_user_interface(svd_model, ratings_df, movies_df, movie_id_to_title, movie_id_to_genres)

    print("\nMovie Recommendation Program Finished.")

Loading data...
Tags data loaded.
Data loaded and preprocessed for Surprise.
Splitting data into training and testing sets...
Training SVD model with default parameters...
Model training complete.
Evaluating model...
RMSE: 0.7717
RMSE on test set: 0.7717007704524537
Precision@10 (threshold=3.5): 0.8120
Recall@10 (threshold=3.5): 0.5714
F1-score@10 (threshold=3.5): 0.6708

Note: Qualitative evaluation (user satisfaction) would require user studies.

--- Movie Recommendation System ---

Getting recommendations for User ID: 18 (Preferred Genres: Any)...

User's Top Rated Movies (for context):
- American Beauty (1999) (Rated: 5.0)
- Big Lebowski, The (1998) (Rated: 5.0)
- One Flew Over the Cuckoo's Nest (1975) (Rated: 5.0)
- Princess Bride, The (1987) (Rated: 5.0)
- Psycho (1960) (Rated: 5.0)

Top 8 Recommendations for User 18:
1. Decalogue, The (Dekalog) (1989) (Predicted Rating: 4.90, Genres: Crime|Drama|Romance)
2. Wizard of Oz, The (1939) (Predicted Rating: 4.88, Genres: Adventure|Chil