#### Movie Data Analysis and Recommendation

##### Import Libraries & Load Data

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Load movie metadata
movies_df = pd.read_csv('data/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


##### Clean Movie Titles and Extract Year

In [38]:
#Extract the year from title using Regex
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)

#Remove the year from the title
movies_df['title']=movies_df['title'].str.replace(r'\(\d{4}\)', '', regex=True)
movies_df['title'] = movies_df['title'].str.strip()

movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


##### One-Hot Encode Genres

In [40]:
genre_dummies = movies_df['genres'].str.get_dummies(sep='|')
movies_df = pd.concat([movies_df, genre_dummies], axis=1)
movies_df

##### Simulate a New User's Ratings

In [41]:
user_input_df = pd.DataFrame([
    {'title': 'Toy Story', 'rating': 4},
    {'title': 'Jumanji', 'rating': 5},
    {'title': 'Father of the Bride Part II', 'rating': 5},
    {'title': 'Heat', 'rating': 1},
    {'title': 'Space Jam', 'rating': 5},
])

user_input_df

Unnamed: 0,title,rating
0,Toy Story,4
1,Jumanji,5
2,Father of the Bride Part II,5
3,Heat,1
4,Space Jam,5


##### Merge with Main Movies Data

In [53]:
import re

# Title cleaning
def clean_title(title):
    return re.sub(r"\s*\(\d{4}\)", "", title).lower().strip()

user_input_df['clean_title'] = user_input_df['title'].apply(clean_title)
movies_df['clean_title'] = movies_df['title'].apply(clean_title)

# Merge based on clean title
merged_df = pd.merge(user_input_df, movies_df, on='clean_title')
merged_df = merged_df.drop_duplicates(subset=['movieId'])

# Drop unnecessary columns
merged_df.drop(columns=['genres', 'year'], inplace=True, errors='ignore')

# Merge again to get full genre matrix
user_favorite_genres = pd.merge(merged_df, movies_df, on='movieId')


##### Analyze Favorite Genres of the User

In [31]:
print(user_favorite_genres.columns)

Index(['title_x', 'rating', 'clean_title_x', 'movieId', 'title_y',
       'Adventure_x', 'Animation_x', 'Children_x', 'Comedy_x', 'Fantasy_x',
       ...
       'Film-Noir_y', 'Horror_y', 'IMAX_y', 'Musical_y', 'Mystery_y',
       'Romance_y', 'Sci-Fi_y', 'Thriller_y', 'War_y', 'Western_y'],
      dtype='object', length=206)


In [56]:
# Drop unnecessary columns
cols_to_drop = ['movieId', 'title_x', 'title_y', 'title', 'rating', 'genres', 'year', 'clean_title_x', 'clean_title_y']
user_favorite_genres.drop(
    columns=[col for col in cols_to_drop if col in user_favorite_genres.columns],
    inplace=True
)

##### Create Movie Feature Matrix

In [58]:
movie_matrix = movies_df.drop(columns=['title', 'genres', 'year'])
movie_matrix.set_index('movieId', inplace=True)
movie_matrix.head()

##### Collaborative Filtering: Pearson Correlation

In [59]:
# Ratings dataset
ratings_df = pd.read_csv('data/ratings.csv')

# Merge with user-rated movies
user_subset_df = pd.merge(ratings_df, merged_df,  on='movieId')
user_subset_df.drop(columns=['title', 'genres', 'year', 'rating_y'], inplace=True, errors='ignore')
user_subset_df.rename(columns={'rating_x': 'rating'}, inplace=True)

##### Pearson Correlation Calculation

In [None]:
from math import sqrt

# Pearson correlation dictionary
pearson_correlation_dict = {}

# Group ratings by user
sorted_user_subset_group = user_subset_df.groupby('userId')

# First, get the movieId-rating info for the target user
target_user_ratings = merged_df[['movieId', 'rating']].sort_values('movieId')

for userId, group in sorted_user_subset_group:
    group = group[['movieId', 'rating']].sort_values('movieId')

    # Get the common movies
    common_movies = set(target_user_ratings['movieId']).intersection(set(group['movieId']))
    
    if len(common_movies) == 0:
        pearson_correlation_dict[userId] = 0
        continue

    # Filter only common movies
    target_common = target_user_ratings[target_user_ratings['movieId'].isin(common_movies)].sort_values('movieId')
    group_common = group[group['movieId'].isin(common_movies)].sort_values('movieId')

    # Get the ratings as lists
    target_ratings = target_common['rating'].tolist()
    group_ratings = group_common['rating'].tolist()

    n = len(target_ratings)

    # Calculate Pearson correlation
    Sxx = sum([i ** 2 for i in target_ratings]) - pow(sum(target_ratings), 2) / float(n)
    Syy = sum([i ** 2 for i in group_ratings]) - pow(sum(group_ratings), 2) / float(n)
    Sxy = sum(i * j for i, j in zip(target_ratings, group_ratings)) - sum(target_ratings) * sum(group_ratings) / float(n)

    if Sxx != 0 and Syy != 0:
        pearson_correlation_dict[userId] = Sxy / sqrt(Sxx * Syy)
    else:
        pearson_correlation_dict[userId] = 0

# View the top 50 users
dict(list(pearson_correlation_dict.items())[:50])