In [1]:
import pandas as pd
import numpy as np

# --- STEP 1: LOAD DATA ---
# Ratings file: parsed as tab-separated, with the column names supplied here.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', names=r_cols, sep='\t', encoding='latin-1')

# Item file: five descriptive columns followed by one flag column per genre.
i_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
] + [
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
items = pd.read_csv('ml-100k/u.item', names=i_cols, sep='|', encoding='latin-1')

# --- STEP 2: CALCULATE GLOBAL POPULARITY ---

# One row per movie_id: how many ratings it received and their average.
movie_stats = (
    ratings.groupby('movie_id')['rating']
    .agg(['size', 'mean'])
    .rename(columns={'size': 'rating_count', 'mean': 'rating_mean'})
)

# Attach titles by matching the movie_id index against the 'movie id' column.
popularity_df = pd.merge(
    movie_stats,
    items.loc[:, ['movie id', 'movie title']],
    left_index=True,
    right_on='movie id',
)

# --- STEP 3: THE NEW USER FUNCTION ---

def recommend_for_new_user(top_n=10, min_ratings=100, popularity=None):
    """
    Return the best-rated movies for a user with no rating history.

    Parameters
    ----------
    top_n : int
        Maximum number of rows to return. Must be >= 0.
    min_ratings : int
        Minimum `rating_count` a movie needs to qualify, so that averages
        built on only a few ratings are not shown.
    popularity : pandas.DataFrame, optional
        Per-movie statistics containing at least the `rating_count` and
        `rating_mean` columns. Defaults to the notebook-level `popularity_df`.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows of the statistics table, ordered by `rating_mean`
        and then `rating_count`, both descending.

    Raises
    ------
    ValueError
        If `top_n` is negative.
    """
    if top_n < 0:
        # DataFrame.head(-k) means "every row except the last k", which would
        # silently return almost the whole table instead of a short list.
        raise ValueError("top_n must be non-negative, got {}".format(top_n))

    stats = popularity_df if popularity is None else popularity

    # 1. Keep only movies that meet the minimum number of ratings.
    #    Boolean indexing already yields a new frame, so no explicit copy is needed.
    qualified = stats[stats['rating_count'] >= min_ratings]

    # 2. Highest average first; when averages tie, the movie with more
    #    ratings comes first.
    ranked = qualified.sort_values(by=['rating_mean', 'rating_count'], ascending=False)

    return ranked.head(top_n)



In [2]:
# --- STEP 4: USAGE ---
# Example call for a first-time sign-up, where there is no history to use.
new_user_suggestions = recommend_for_new_user(min_ratings=150, top_n=5)

print("Welcome! Here are some trending movies to get you started:")
print(new_user_suggestions.loc[:, ['movie title', 'rating_mean', 'rating_count']])

Welcome! Here are some trending movies to get you started:
                          movie title  rating_mean  rating_count
317           Schindler's List (1993)     4.466443           298
482                 Casablanca (1942)     4.456790           243
63   Shawshank Redemption, The (1994)     4.445230           283
602                Rear Window (1954)     4.387560           209
11         Usual Suspects, The (1995)     4.385768           267


In [None]:
# We can't compare a new user to others or check their genre history,
# so we simply show them the overall best-performing content on our platform.