In [1]:
import pandas as pd
import numpy as np
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import random
import sys

# Filtering thresholds used by main() and cap_movie_ratings().
# A movie is kept only if it has at least this many ratings in the loaded sample.
MIN_MOVIE_RATINGS = 50
# Upper bound on ratings retained per movie; larger groups are randomly down-sampled.
MAX_MOVIE_RATINGS = 70
# A user is kept only if they have at least this many ratings after the movie filter.
MIN_USER_RATINGS = 20
# Maximum number of rows read from the ratings CSV (passed as `nrows` to read_csv).
RATINGS_SAMPLE_SIZE = 5000000

def cap_movie_ratings(group):
    """Limit a single movie's ratings to at most MAX_MOVIE_RATINGS rows.

    Args:
        group: DataFrame holding the ratings of one movie.

    Returns:
        ``group`` itself when it is already within the cap, otherwise a
        random sample of exactly MAX_MOVIE_RATINGS rows. The sample uses a
        fixed seed (``random_state=1``) so repeated runs pick the same rows.
    """
    # Guard clause: nothing to trim when the group already fits.
    if len(group) <= MAX_MOVIE_RATINGS:
        return group
    return group.sample(n=MAX_MOVIE_RATINGS, random_state=1)

def get_content_filter(movies_df):
    """Build a genre-based content similarity model.

    The pipe-separated ``genres`` column is turned into a whitespace-separated
    string, vectorised with TF-IDF, and compared pairwise with cosine
    similarity.

    Args:
        movies_df: DataFrame with at least ``title`` and ``genres`` columns.
            It is not modified; the function works on a copy.

    Returns:
        A tuple ``(cosine_sim, indices, movies_df_copy)`` where

        * ``cosine_sim`` is the square movie-by-movie similarity matrix, with
          rows/columns in the positional order of ``movies_df_copy``;
        * ``indices`` is a Series mapping each title to its row position in
          ``movies_df_copy`` (first occurrence wins for duplicated titles);
        * ``movies_df_copy`` is the copy of the input with a fresh 0..n-1
          index, the previous index kept as a column, and an extra
          ``genres_processed`` column.
    """
    movies_df_copy = movies_df.copy()
    # regex=False: '|' is a regex metacharacter, so request a literal
    # replacement explicitly instead of depending on the pandas-version
    # default for `regex`.
    movies_df_copy['genres_processed'] = (
        movies_df_copy['genres'].fillna("").str.replace('|', ' ', regex=False)
    )

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movies_df_copy['genres_processed'])

    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    movies_df_copy = movies_df_copy.reset_index()
    indices = pd.Series(movies_df_copy.index, index=movies_df_copy['title'])
    # De-duplicate on the *title index*. Series.drop_duplicates() compares the
    # values (row positions, which are always unique), so it never removed
    # anything and a repeated title made `indices[title]` return a Series
    # instead of a single position.
    indices = indices[~indices.index.duplicated(keep='first')]

    return cosine_sim, indices, movies_df_copy

def get_collaborative_filter(ratings_df):
    """Factorise the user-item rating matrix with TruncatedSVD.

    Args:
        ratings_df: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns. It is not modified.

    Returns:
        A tuple ``(svd_model, user_factors, item_factors, user_id_to_index,
        movie_id_to_index, user_ids)`` where

        * ``svd_model`` is the fitted TruncatedSVD instance;
        * ``user_factors`` is the ``fit_transform`` output, one row per user;
        * ``item_factors`` is ``svd_model.components_`` transposed, one row
          per movie;
        * ``user_id_to_index`` / ``movie_id_to_index`` map raw ids to the row
          of the corresponding factor matrix (order of first appearance);
        * ``user_ids`` is the list of distinct user ids in that same order.

    Raises:
        ValueError: if ``ratings_df`` has no rows.
    """
    if ratings_df.empty:
        raise ValueError("ratings_df is empty; cannot build a collaborative filter.")

    user_ids = ratings_df['userId'].unique()
    movie_ids = ratings_df['movieId'].unique()

    user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
    movie_id_to_index = {movie_id: index for index, movie_id in enumerate(movie_ids)}

    # Map ids straight to positional arrays; no need to copy the whole
    # DataFrame just to hold two temporary columns.
    user_index = ratings_df['userId'].map(user_id_to_index).to_numpy()
    movie_index = ratings_df['movieId'].map(movie_id_to_index).to_numpy()

    n_users = len(user_ids)
    n_movies = len(movie_ids)

    # NOTE: the sparse constructor sums entries that share the same
    # (user, movie) coordinates, so duplicate rating rows would be added up.
    user_item_matrix = csr_matrix(
        (ratings_df['rating'].to_numpy(), (user_index, movie_index)),
        shape=(n_users, n_movies)
    )

    # TruncatedSVD rejects n_components that are not smaller than the number
    # of features (movies); clamp so small filtered datasets still train.
    n_components = max(1, min(50, n_movies - 1))

    print("Training TruncatedSVD model...")
    svd_model = TruncatedSVD(n_components=n_components, random_state=42)
    user_factors = svd_model.fit_transform(user_item_matrix)
    item_factors = svd_model.components_.T
    print("SVD model trained.")

    return svd_model, user_factors, item_factors, user_id_to_index, movie_id_to_index, user_ids.tolist()

def get_hybrid_recommendations(user_id, movie_title, movies_df, content_indices, content_cosine_sim, cf_model, user_factors, item_factors, user_map, movie_map, *, n_candidates=50, top_n=10):
    """Recommend movies similar to ``movie_title``, ranked for ``user_id``.

    Stage 1 (content): take the ``n_candidates`` movies most similar to the
    query movie according to ``content_cosine_sim``.
    Stage 2 (collaborative): score each candidate with the dot product of the
    user's and the movie's latent factors and keep the ``top_n`` best.

    Args:
        user_id: Raw user id; must be a key of ``user_map``.
        movie_title: Title of the query movie; must be in ``content_indices``.
        movies_df: DataFrame with ``title`` and ``movieId`` columns whose
            positional row order matches the rows of ``content_cosine_sim``.
        content_indices: Series mapping title -> row position.
        content_cosine_sim: Square movie-by-movie similarity matrix.
        cf_model: Unused; kept so existing callers keep working.
        user_factors: Array with one latent-factor row per user.
        item_factors: Array with one latent-factor row per movie.
        user_map: Dict mapping raw user id -> row of ``user_factors``.
        movie_map: Dict mapping raw movie id -> row of ``item_factors``.
        n_candidates: Number of content-similar movies to re-rank.
        top_n: Maximum number of titles to return.

    Returns:
        List of at most ``top_n`` titles, best predicted score first.
        Candidates whose movie id is missing from ``movie_map`` are skipped.

    Raises:
        KeyError: if ``movie_title`` or ``user_id`` is unknown.
    """
    try:
        content_idx = content_indices[movie_title]
    except KeyError:
        raise KeyError(f"Movie title not found in content index: {movie_title!r}") from None
    # A title that occurs more than once yields a Series; use its first match.
    if isinstance(content_idx, pd.Series):
        content_idx = content_idx.iloc[0]
    content_idx = int(content_idx)

    try:
        user_vec = user_factors[user_map[user_id]]
    except KeyError:
        raise KeyError(f"User id not found in collaborative model: {user_id!r}") from None

    sim_row = np.asarray(content_cosine_sim[content_idx]).ravel()
    # Stable descending sort so ties keep their original row order.
    ranked = np.argsort(-sim_row, kind='stable')
    # Drop the query movie by position rather than assuming it sorts first:
    # other movies can tie with it at similarity 1.0 (identical genres), in
    # which case slicing off element 0 would discard the wrong movie and
    # recommend the query movie back to the user.
    candidate_positions = ranked[ranked != content_idx][:n_candidates]
    candidates_df = movies_df.iloc[candidate_positions][['title', 'movieId']]

    predictions = []
    for title, candidate_movie_id in zip(candidates_df['title'], candidates_df['movieId']):
        movie_matrix_index = movie_map.get(candidate_movie_id)
        if movie_matrix_index is None:
            # No collaborative signal for this movie; it cannot be scored.
            continue
        pred_rating = np.dot(user_vec, item_factors[movie_matrix_index])
        predictions.append((title, pred_rating))

    predictions.sort(key=lambda x: x[1], reverse=True)

    return [title for title, _ in predictions[:top_n]]

def _filter_ratings(ratings_df):
    """Apply the movie/user/cap filters, printing the row count after each step."""
    movie_counts = ratings_df['movieId'].value_counts()
    movies_to_keep = movie_counts[movie_counts >= MIN_MOVIE_RATINGS].index
    ratings_f1 = ratings_df[ratings_df['movieId'].isin(movies_to_keep)]
    print(f"Ratings after (movie >= {MIN_MOVIE_RATINGS}): {len(ratings_f1)}")

    user_counts = ratings_f1['userId'].value_counts()
    users_to_keep = user_counts[user_counts >= MIN_USER_RATINGS].index
    ratings_f2 = ratings_f1[ratings_f1['userId'].isin(users_to_keep)]
    print(f"Ratings after (user >= {MIN_USER_RATINGS}): {len(ratings_f2)}")

    print("Applying rating cap (this may take a moment)...")
    # Iterate the groups explicitly instead of groupby(...).apply(...):
    # apply() operating on the grouping column is deprecated in pandas, and
    # once the column is excluded 'movieId' would be lost after the index is
    # dropped. Iterated groups always carry every column.
    capped_groups = [cap_movie_ratings(group) for _, group in ratings_f2.groupby('movieId')]
    if capped_groups:
        ratings_f3 = pd.concat(capped_groups, ignore_index=True)
    else:
        # pd.concat rejects an empty list; nothing survived the filters.
        ratings_f3 = ratings_f2.reset_index(drop=True)
    print(f"Ratings after (movie <= {MAX_MOVIE_RATINGS}): {len(ratings_f3)}")

    return ratings_f3


def main(movies_path=None, ratings_path=None):
    """Run the end-to-end hybrid recommender demo.

    Loads the movie and rating CSVs, filters the ratings, builds the content
    and collaborative models, and prints the top recommendations for the
    first user and first movie remaining after filtering.

    Args:
        movies_path: Path to the movies CSV. Defaults to the original
            hard-coded location.
        ratings_path: Path to the ratings CSV. Defaults to the original
            hard-coded location. At most RATINGS_SAMPLE_SIZE rows are read.
    """
    if movies_path is None:
        movies_path = "/mnt/10EE4B76EE4B5360/College/pccoe/7th Sem/RS/RS-A2_A3_movie.csv"
    if ratings_path is None:
        ratings_path = "/mnt/10EE4B76EE4B5360/College/pccoe/7th Sem/RS/RS-A2_A3_Filtered_Ratings.csv"

    print("--- 1. Loading Real Data ---")
    movies_df = pd.read_csv(movies_path)
    ratings_df = pd.read_csv(ratings_path, nrows=RATINGS_SAMPLE_SIZE)

    print(f"Loaded {len(movies_df)} movies.")
    print(f"Loaded a sample of {len(ratings_df)} ratings (up to {RATINGS_SAMPLE_SIZE}).")

    print("\n--- 2. Applying Filtering Logic ---")

    ratings_final_df = _filter_ratings(ratings_df)

    movies_filtered_df = movies_df[movies_df['movieId'].isin(ratings_final_df['movieId'].unique())]

    print(f"\nFinal movies in system: {len(movies_filtered_df)}")
    print(f"Final ratings in system: {len(ratings_final_df)}")
    print(f"Final users in system: {ratings_final_df['userId'].nunique()}")

    # Without any surviving data the test user/movie below cannot be picked.
    if ratings_final_df.empty or movies_filtered_df.empty:
        print("No data left after filtering; cannot build recommendations.")
        return

    print("\n--- 3. Building Content-Based Filter (TF-IDF) ---")
    cosine_sim, content_indices, movies_indexed_df = get_content_filter(movies_filtered_df)
    print("Content filter built.")

    print("\n--- 4. Building Collaborative Ranker (sklearn TruncatedSVD) ---")
    svd, u_factors, i_factors, user_map, movie_map, final_user_list = get_collaborative_filter(ratings_final_df)
    print("Collaborative ranker built.")

    print("\n--- 5. Getting Hybrid Recommendations ---")

    # Demo inputs: simply the first user and first movie left after filtering.
    TEST_USER_ID = final_user_list[0]
    TEST_MOVIE = movies_indexed_df.iloc[0]['title']

    print(f"\nTarget User: {TEST_USER_ID}, Target Movie: {TEST_MOVIE}")

    recommendations = get_hybrid_recommendations(
        TEST_USER_ID, TEST_MOVIE,
        movies_indexed_df, content_indices, cosine_sim,
        svd, u_factors, i_factors, user_map, movie_map
    )

    print("\nTop 10 Hybrid Recommendations:")
    for i, title in enumerate(recommendations):
        print(f"{i+1}. {title}")

# Run the demo pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

--- 1. Loading Real Data ---
Loaded 27278 movies.
Loaded a sample of 10000 ratings (up to 5000000).

--- 2. Applying Filtering Logic ---
Ratings after (movie >= 50): 5379
Ratings after (user >= 20): 4136
Applying rating cap (this may take a moment)...
Ratings after (movie <= 70): 3724

Final movies in system: 68
Final ratings in system: 3724
Final users in system: 143

--- 3. Building Content-Based Filter (TF-IDF) ---
Content filter built.

--- 4. Building Collaborative Ranker (sklearn TruncatedSVD) ---
Training TruncatedSVD model...
SVD model trained.
Collaborative ranker built.

--- 5. Getting Hybrid Recommendations ---

Target User: 99851, Target Movie: Toy Story (1995)

Top 10 Hybrid Recommendations:
1. Englishman Who Went Up a Hill But Came Down a Mountain, The (1995)
2. Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
3. Seven (a.k.a. Se7en) (1995)
4. Dangerous Minds (1995)
5. Madness of King George, The (1994)
6. Clueless (1995)
7. Ed Wood (1994)
8. French Kiss (1995)
9. Rob Roy (1995)

  ratings_f3 = ratings_f2.groupby('movieId').apply(cap_movie_ratings).reset_index(drop=True)
