In [9]:
import pandas as pd
import numpy as np
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from scipy import sparse
import re

In [10]:
# Load the raw MovieLens tables from the local data directory.
# Paths are relative to the notebook's working directory.
ratings = pd.read_csv('data/ml-32m/ratings.csv')
movies = pd.read_csv('data/ml-32m/movies.csv')
tags = pd.read_csv('data/ml-32m/tags.csv')

In [4]:
# Per-movie feature table read from disk. Later cells read its 'movieId',
# 'religion', 'nsfw' and 'oscars' columns.
# NOTE(review): presumably written by a separate preprocessing notebook — confirm it exists before running.
preprocessed_movies = pd.read_csv('pre_processed_movies.csv')

In [5]:
# Groups of lower-cased tag strings, keyed by the meta-category they belong to.
# NOTE(review): not referenced elsewhere in the visible part of this notebook;
# presumably consumed when building pre_processed_movies.csv — confirm.
META_TAGS = {
    "religious": [
        "pope",
        "god",
        "jesus",
        "bible",
        "church",
        "devil",
        "angel",
        "heaven",
        "hell",
        "satan",
        "jesus christ",
        "spiritual",
        "faith",
        "demon",
        "religion",
    ],
    "nsfw": [
        "gore",
        "nudity",
        "sex",
        "group sex",
        "violence",
        "explicit sex",
        "gruesome",
        "erection",
        "nudity (full frontal - notable)",
        "breasts",
        "nudity (topless - notable)",
        "nudity (topless)",
    ],
    "childrens": [
        "disney",
        "disney animated feature",
        "animated",
    ],
    "oscar_winner": [
        "oscar (best supporting actor)",
        "oscar (best actor)",
        "oscar (best directing)",
        "oscar winner: best picture",
        "oscar (best picture)",
        "oscar (best supporting actress)",
        "oscar (best actress)",
    ],
    "oscar_nominee": [
        "oscar nominee: best picture",
        "oscar nominee: best actor",
        "oscar nominee: best actress",
        "oscar nominee: best supporting actor",
        "oscar nominee: best supporting actress",
        "oscar nominee: best director",
    ],
    "notable": [
        "afi 100",
        "imdb top 250",
        "national film registry",
    ],
    # NOTE(review): the genre columns printed later in this notebook are named
    # "Children" and "IMAX"; this list has "children's" and no "imax" entry.
    # Confirm whether that difference is intentional.
    "genres": [
        "action",
        "adventure",
        "animation",
        "children's",
        "comedy",
        "crime",
        "documentary",
        "drama",
        "fantasy",
        "film-noir",
        "horror",
        "musical",
        "mystery",
        "romance",
        "sci-fi",
        "thriller",
        "war",
        "western",
        "(no genres listed)",
    ],
}

In [6]:
# Mean rating each user has given across all of their rated movies
user_avg_ratings = ratings.groupby('userId')['rating'].mean()
print(user_avg_ratings.head(n=5))

# For each user, the list of movieIds they have rated (Series indexed by userId)
movies_usr_watched = ratings.groupby('userId')['movieId'].apply(list)
print(movies_usr_watched.head(n=5))

# One row per rating: the userId and the genre list of the rated movie.
# Only the join key and the needed columns are merged, so unrelated columns
# (e.g. the movie title) are not copied across every rating row.
genres_usr_watched = (
    ratings[['userId', 'movieId']]
    .merge(movies[['movieId', 'genres']], on='movieId')
    [['userId', 'genres']]
)
# Genres are stored pipe-separated ("Action|Comedy"); split into Python lists
genres_usr_watched['genres'] = genres_usr_watched['genres'].str.split('|')

userId
1    3.531915
2    4.269231
3    3.588435
4    2.629630
5    3.272727
Name: rating, dtype: float64
userId
1    [17, 25, 29, 30, 32, 34, 36, 80, 110, 111, 161...
2    [31, 34, 39, 48, 153, 185, 186, 193, 207, 216,...
3    [2, 10, 11, 17, 26, 48, 62, 110, 141, 150, 151...
4    [223, 1210, 1272, 1327, 1513, 1833, 2115, 2428...
5    [10, 47, 110, 150, 153, 161, 165, 185, 208, 23...
Name: movieId, dtype: object


In [7]:
# Get unique genres watched by each user.
# Exploding to one (userId, genre) row and taking the per-user unique values
# avoids summing Python lists inside groupby, which concatenates lists
# repeatedly and fails if a genre cell is missing.
# The order of genres within each list is not meaningful (later cells only
# use the list length).
genres_usr_unique = (
    genres_usr_watched
    .explode('genres')
    .dropna(subset=['genres'])
    .groupby('userId')['genres']
    .unique()
    .apply(list)
)
print(genres_usr_unique.head(n=5))

userId
1    [Sci-Fi, Thriller, Film-Noir, Horror, Mystery,...
2    [Thriller, Horror, Animation, Mystery, IMAX, D...
3    [Sci-Fi, Thriller, Romance, Horror, Animation,...
4    [Sci-Fi, Thriller, Horror, Mystery, Drama, Act...
5    [Sci-Fi, Thriller, Horror, Animation, Mystery,...
Name: genres, dtype: object


In [None]:
# WARNING: This block works on one row per (rating, genre) pair, so it may take
# some time and memory to run on the full ratings table.
#
# The ratings are merged with the movie genres and exploded ONCE; both the
# per-genre watch counts and the per-genre average ratings are derived from
# that single exploded frame.

# Merge ratings with movies to get the genre list for each rating
ratings_with_genres = ratings.merge(movies[['movieId', 'genres']], on='movieId')
ratings_with_genres['genres'] = ratings_with_genres['genres'].str.split('|')

# Explode genres so each row is (rating row, single genre); drop missing genres
ratings_exploded = ratings_with_genres.explode('genres')
ratings_exploded = ratings_exploded[ratings_exploded['genres'].notnull()]

# (userId, genre) view of the same rows, kept under its original name
exploded = ratings_exploded[['userId', 'genres']]

# One grouping shared by both statistics
ratings_by_user_genre = ratings_exploded.groupby(['userId', 'genres'])['rating']

# 1) User watch count by genre (number of rated movies per genre);
#    user/genre pairs that never occur are filled with 0
user_genre_watch_count = ratings_by_user_genre.size().unstack(fill_value=0)
print(user_genre_watch_count.head())

# 2) User average rating by genre; pairs that never occur stay NaN
user_genre_avg_rating = ratings_by_user_genre.mean().unstack()
print(user_genre_avg_rating.head())

genres  (no genres listed)  Action  Adventure  Animation  Children  Comedy  \
userId                                                                       
1                        0      19         17          0         2      51   
2                        0       9          7          8        11      26   
3                        0      58         58         12        20      54   
4                        0      11          5          0         0       7   
5                        0      14         13          3         4       9   

genres  Crime  Documentary  Drama  Fantasy  Film-Noir  Horror  IMAX  Musical  \
userId                                                                         
1          16            1    102        8          1       3     0        0   
2           6            0     24        4          0       1     2        7   
3          16            0     53       17          0       4     4       10   
4           3            0      8        3          0

In [51]:
# Assemble the per-user feature table, indexed by userId.

# Start from the average rating and attach the list of distinct genres
user_features = (
    user_avg_ratings
    .to_frame(name='avg_rating')
    .join(genres_usr_unique.rename('unique_genres_watched'))
)

# Collapse each genre list to its length; anything that is not a list counts as 0
user_features['unique_genres_watched'] = user_features['unique_genres_watched'].map(
    lambda genre_list: len(genre_list) if isinstance(genre_list, list) else 0
)

# Attach per-genre watch counts, then per-genre average ratings.
# rsuffix is only applied to column names that already exist on the left side,
# so the watch-count columns keep the bare genre names while the average-rating
# columns receive the '_avg_rating' suffix.
user_features = user_features.join(user_genre_watch_count, rsuffix='_watch_count')
user_features = user_features.join(user_genre_avg_rating, rsuffix='_avg_rating')

print(user_features.head())

        avg_rating  unique_genres_watched  (no genres listed)  Action  \
userId                                                                  
1         3.531915                     16                   0      19   
2         4.269231                     15                   0       9   
3         3.588435                     17                   0      58   
4         2.629630                     11                   0      11   
5         3.272727                     17                   0      14   

        Adventure  Animation  Children  Comedy  Crime  Documentary  ...  \
userId                                                              ...   
1              17          0         2      51     16            1  ...   
2               7          8        11      26      6            0  ...   
3              58         12        20      54     16            0  ...   
4               5          0         0       7      3            0  ...   
5              13          3         4

In [10]:
# (number of users, number of feature columns) of the assembled table
user_features.shape

(200948, 42)

In [11]:
# One indicator column per genre (genres are pipe-separated in movies['genres'])
genre_indicators = movies['genres'].str.get_dummies(sep='|')
# Number of movies carrying each genre, most common first
count_movies_by_genre = genre_indicators.sum().sort_values(ascending=False)
print(count_movies_by_genre)

Drama                 34175
Comedy                23124
Thriller              11823
Romance               10369
Action                 9668
Documentary            9363
Horror                 8654
(no genres listed)     7080
Crime                  6976
Adventure              5402
Sci-Fi                 4907
Animation              4617
Children               4520
Mystery                4013
Fantasy                3851
War                    2325
Western                1696
Musical                1059
Film-Noir               353
IMAX                    195
dtype: int64


In [12]:
### Probability a user will interact with a movie given it has a specific genre (with smoothing)
#
# For every (user, genre):
#     (movies the user rated in the genre + alpha) / (movies in the genre + alpha * K)

alpha = 1                          # smoothing parameter
K = len(count_movies_by_genre)     # number of genre categories

# Smoothed per-user counts (users x genres)
numerator = user_genre_watch_count + alpha
# Smoothed catalogue size per genre (one value per genre)
denominator = count_movies_by_genre + alpha * K

# The Series index (genre) is aligned with the DataFrame columns (genre)
p_user_will_interact = numerator / denominator

print(p_user_will_interact.head())

        (no genres listed)    Action  Adventure  Animation  Children  \
userId                                                                 
1                 0.000141  0.002064   0.003320   0.000216  0.000661   
2                 0.000141  0.001032   0.001475   0.001941  0.002643   
3                 0.000141  0.006090   0.010882   0.002804  0.004626   
4                 0.000141  0.001239   0.001107   0.000216  0.000220   
5                 0.000141  0.001548   0.002582   0.000863  0.001101   

          Comedy     Crime  Documentary     Drama   Fantasy  Film-Noir  \
userId                                                                   
1       0.002247  0.002430     0.000213  0.003012  0.002325   0.005362   
2       0.001167  0.001001     0.000107  0.000731  0.001292   0.002681   
3       0.002376  0.002430     0.000107  0.001579  0.004650   0.002681   
4       0.000346  0.000572     0.000107  0.000263  0.001033   0.002681   
5       0.000432  0.001429     0.000107  0.000409  

In [13]:
def extract_year(title):
    """Extract the release year from a title such as 'Movie Title (1999)'.

    Parameters
    ----------
    title : str or any
        Movie title. Non-string values (e.g. NaN or None for a missing title)
        are treated as "no year" instead of raising.

    Returns
    -------
    int or float
        The four-digit year as an int, or np.nan when no '(dddd)' group is
        present. When several parenthesised four-digit groups occur, the last
        one is used, because the release year trails the title.
    """
    if not isinstance(title, str):
        return np.nan
    year_groups = re.findall(r'\((\d{4})\)', title)
    if year_groups:
        return int(year_groups[-1])
    return np.nan

# Derive the release year from each title and store it as movies['year']
# (NaN where the title carries no parenthesised year)
movies['year'] = movies['title'].map(extract_year)

# Summary statistics of the release years of the movies a user has rated
def year_stats_watched(movie_ids):
    """Return release-year statistics for a collection of movie ids.

    Looks the ids up in the module-level ``movies`` frame (which must already
    have the 'year' column) and ignores movies whose year is missing.

    Parameters
    ----------
    movie_ids : list-like
        movieId values to include.

    Returns
    -------
    pd.Series
        Indexed by 'year_mean', 'year_median', 'year_mode', 'year_stdDev'.
        All four are NaN when none of the movies has a known year. When
        several years tie for the mode, the first value returned by
        ``Series.mode()`` is used.
    """
    stat_names = ['year_mean', 'year_median', 'year_mode', 'year_stdDev']
    years = movies.loc[movies['movieId'].isin(movie_ids), 'year'].dropna()
    if years.empty:
        return pd.Series([np.nan, np.nan, np.nan, np.nan], index=stat_names)
    # mode() on a non-empty, NaN-free Series always has at least one entry,
    # so it is computed once and indexed directly.
    modes = years.mode()
    return pd.Series(
        [years.mean(), years.median(), modes.iloc[0], years.std()],
        index=stat_names,
    )


# Release-year statistics per user: one row per user, one column per statistic.
# The index is then moved into a regular column.
user_year_stats = (
    movies_usr_watched
    .apply(year_stats_watched)
    .reset_index()
    .rename(columns={"index": "userId"})
)
print(user_year_stats.head())

   userId    year_mean  year_median  year_mode  year_stdDev
0       1  1984.673759       1989.0     1995.0    14.897026
1       2  1992.192308       1994.0     1994.0     8.597853
2       3  1991.408163       1994.0     1995.0    10.832318
3       4  1994.518519       1999.0     1999.0     7.747805
4       5  1993.666667       1994.0     1994.0     1.534329


In [39]:
def user_meta_stats(movie_ids, preprocessed_movies):
    """
    Count and average the religion, NSFW and Oscar scores of a user's movies.

    For each of the 'religion', 'nsfw' and 'oscars' columns of
    ``preprocessed_movies``, only the watched movies whose score is greater
    than zero are considered: their number is the count, and the mean of
    their scores is the average (0 when there are none).

    Output: (religious_count, religious_avg, nsfw_count, nsfw_avg, oscar_count, oscar_avg)
    """
    seen = preprocessed_movies.loc[preprocessed_movies['movieId'].isin(movie_ids)]

    stats = []
    # Column order fixes the order of the returned tuple
    for score_col in ('religion', 'nsfw', 'oscars'):
        positive_scores = seen.loc[seen[score_col] > 0, score_col]
        n_positive = len(positive_scores)
        stats.append(n_positive)
        stats.append(positive_scores.mean() if n_positive > 0 else 0)
    return tuple(stats)

In [40]:
# movies_usr_watched is created as a Series (index: userId, values: lists of
# movieIds). The following cells index it by column name and call
# set_index('userId') on it, so it must be a DataFrame with explicit 'userId'
# and 'movieId' columns. Promote it here; the isinstance guard keeps the cell
# safe to re-run.
# NOTE: after this cell movies_usr_watched is a DataFrame, so the earlier
# year-statistics cell (which applies a function to the Series) must be run
# before it.
if isinstance(movies_usr_watched, pd.Series):
    movies_usr_watched = movies_usr_watched.reset_index()

# Snapshot under the original variable name; not read elsewhere in the visible notebook
usrmeta_stats = movies_usr_watched.copy()

# One (religious_count, religious_avg, nsfw_count, nsfw_avg, oscar_count, oscar_avg)
# tuple per user, in the same row order as movies_usr_watched
meta_stats = movies_usr_watched['movieId'].apply(lambda x: user_meta_stats(x, preprocessed_movies))


In [45]:
# Split the per-user tuples into separate columns.
# Column names are listed in the same order as the tuple positions.
meta_stat_columns = [
    'religious_count',
    'religious_avg_score',
    'nsfw_count',
    'nsfw_avg_score',
    'oscar_count',
    'oscar_avg_score',
]
for position, column_name in enumerate(meta_stat_columns):
    # Bind the position as a default argument so each lambda reads its own slot
    movies_usr_watched[column_name] = meta_stats.apply(lambda stats, i=position: stats[i])

# Display the first few rows to verify
print(movies_usr_watched[['religious_count', 'religious_avg_score', 'nsfw_count', 'nsfw_avg_score', 'oscar_count', 'oscar_avg_score']].head(10))

   religious_count  religious_avg_score  nsfw_count  nsfw_avg_score  \
0               27             0.108642          62        1.919355   
1                8             0.133333          13        1.769231   
2               29             0.101149          52        1.769231   
3                4             0.116667          12        1.583333   
4                9             0.125926          20        1.750000   
5                4             0.233333          14        1.857143   
6                9             0.103704          18        1.444444   
7                8             0.100000          26        2.538462   
8               11             0.145455          38        2.105263   
9              119             0.112045         349        1.808023   

   oscar_count  oscar_avg_score  
0           64         5.398438  
1           12         4.041667  
2           38         6.000000  
3            4         3.750000  
4           12         7.250000  
5            4

In [13]:
# Create a sparse user-item interaction matrix (users x movies) whose stored
# values are confidence-style weights derived from the explicit ratings.
#
# Weighting loosely follows "Collaborative Filtering for Implicit Feedback Datasets"
# http://yifanhu.net/PUB/cf.pdf
#
# Mapping implemented below:
#   rating >  2.5 -> 1 + alpha * (rating / 5)   (25 .. 41 with alpha = 40)
#   rating <= 2.5 -> the raw rating, except a rating of exactly 1.0, which becomes 0
# NOTE(review): only the 1.0 rating is zeroed while 0.5 / 1.5 / 2.0 / 2.5 keep
# their raw values — this looks unintentional; confirm the intended mapping.
# (user, movie) pairs without a rating are simply not stored in the matrix.
alpha = 40  # You can tune this value
ratings['interaction'] = np.where(ratings['rating'] > 2.5, 1 + alpha * (ratings['rating'] / 5), ratings['rating'])
ratings['interaction'] = np.where(ratings['interaction'] == 1, 0, ratings['interaction'])

# Use all users that appear in ratings and all movies in the catalogue for the
# matrix shape; positions follow order of first appearance.
user_ids = ratings['userId'].unique()
movie_ids = movies['movieId'].unique()
user_idx = {user: i for i, user in enumerate(user_ids)}
movie_idx = {movie: i for i, movie in enumerate(movie_ids)}

# For each rating, look up the user and movie position. get_indexer performs
# the lookup for the whole column at once instead of a Python-level loop over
# every rating; it returns -1 for ids it cannot find.
row = pd.Index(user_ids).get_indexer(ratings['userId'])
col = pd.Index(movie_ids).get_indexer(ratings['movieId'])
unknown_movies = col < 0
if unknown_movies.any():
    # A dict lookup would raise KeyError here; keep that failure mode
    raise KeyError(
        f"movieId(s) present in ratings but missing from movies: "
        f"{ratings.loc[unknown_movies, 'movieId'].unique()[:10].tolist()}"
    )
data = ratings['interaction'].values

# Create sparse matrix with all users and all movies
interactions = sparse.coo_matrix((data, (row, col)),
                                 shape=(len(user_ids), len(movie_ids)))

# Convert to CSR format for efficient row slicing, and persist to disk
interactions_csr = interactions.tocsr()
sparse.save_npz('user_movie_interactions.npz', interactions_csr)

print(f"Sparse user-movie interaction matrix shape: {interactions_csr.shape}")

Sparse user-movie interaction matrix shape: (200948, 87585)


In [14]:
# Stored entries for the first user (row 0): (row, column) positions and their interaction values
print(interactions_csr[0])

  (0, 16)	33.0
  (0, 24)	0.0
  (0, 28)	2.0
  (0, 29)	41.0
  (0, 31)	41.0
  (0, 33)	2.0
  (0, 35)	0.0
  (0, 79)	41.0
  (0, 108)	25.0
  (0, 109)	41.0
  (0, 159)	0.0
  (0, 164)	41.0
  (0, 174)	33.0
  (0, 220)	25.0
  (0, 229)	41.0
  (0, 257)	41.0
  (0, 298)	33.0
  (0, 302)	41.0
  (0, 303)	41.0
  (0, 318)	33.0
  (0, 340)	0.0
  (0, 351)	2.0
  (0, 522)	25.0
  (0, 536)	41.0
  (0, 555)	41.0
  :	:
  (0, 2245)	41.0
  (0, 2261)	25.0
  (0, 2305)	41.0
  (0, 2316)	0.0
  (0, 2333)	0.0
  (0, 2411)	25.0
  (0, 2429)	41.0
  (0, 2438)	41.0
  (0, 2508)	41.0
  (0, 2549)	41.0
  (0, 2598)	25.0
  (0, 2620)	0.0
  (0, 2632)	0.0
  (0, 2705)	41.0
  (0, 2790)	0.0
  (0, 2798)	33.0
  (0, 2826)	33.0
  (0, 2852)	33.0
  (0, 2874)	0.0
  (0, 2881)	41.0
  (0, 2893)	41.0
  (0, 2905)	33.0
  (0, 2937)	33.0
  (0, 2985)	2.0
  (0, 2995)	25.0


In [52]:
# Extend the per-user feature table with the remaining feature groups and save it.
# NOTE: this cell reassigns user_features from itself, so it is meant to be run
# once after the cell that builds the base table.

# Release-year statistics, keyed by userId
year_stats_by_user = user_year_stats.set_index('userId')
user_features = user_features.join(year_stats_by_user)

# Religious, NSFW and Oscar counts / average scores, keyed by userId
meta_feature_columns = ['religious_count', 'religious_avg_score', 'nsfw_count', 'nsfw_avg_score',
                        'oscar_count', 'oscar_avg_score']
meta_stats_by_user = movies_usr_watched.set_index('userId')[meta_feature_columns]
user_features = user_features.join(meta_stats_by_user)

# Join probability of user interacting with each genre (optional: can flatten columns if needed)
# user_features = user_features.join(p_user_will_interact, rsuffix='_p_interact')

# Save to CSV (userId is written as the index column)
user_features.to_csv('pre_processed_users.csv')
print('Saved all user features to pre_processed_users.csv')

Saved all user features to pre_processed_users.csv


In [55]:
# Preview the first ten users and report the final (rows, columns) of the feature table
print(user_features.head(n=10))
print(user_features.shape)

        avg_rating  unique_genres_watched  (no genres listed)  Action  \
userId                                                                  
1         3.531915                     16                   0      19   
2         4.269231                     15                   0       9   
3         3.588435                     17                   0      58   
4         2.629630                     11                   0      11   
5         3.272727                     17                   0      14   
6         4.173077                     15                   0      22   
7         3.636364                     16                   0      13   
8         4.322581                     12                   0       6   
9         4.241379                     16                   0      18   
10        2.787121                     19                   0     341   

        Adventure  Animation  Children  Comedy  Crime  Documentary  ...  \
userId                                          