In [1]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from gensim.models import Word2Vec

In [3]:
ratings = pd.read_csv('data/ml-32m/ratings.csv')
movies= pd.read_csv('data/ml-32m/movies.csv')
tags = pd.read_csv('data/ml-32m/tags.csv')

In [6]:
META_TAGS = {
    "religious" : ["pope", "god", "jesus", "bible", "church", "devil", "angel", "heaven", "hell", "satan", "jesus christ", "spiritual", "faith",
                   "demon", "religion"],
    "nsfw" : ["gore", "nudity", "sex", "group sex", "violence", "explicit sex", "gruesome", "erection", "nudity (full frontal - notable)",
              "breasts", "nudity (topless - notable)", "nudity (topless)"],
    "childrens" : ["disney", "disney animated feature", "animated"],
    "oscar_winner" : ["oscar (best supporting actor)" ,"oscar (best actor)","oscar (best directing)","oscar winner: best picture","oscar (best picture)",
                      "oscar (best supporting actress)","oscar (best actress)"],
    "oscar_nominee" : ["oscar nominee: best picture","oscar nominee: best actor","oscar nominee: best actress","oscar nominee: best supporting actor",
                      "oscar nominee: best supporting actress","oscar nominee: best director"],
    "notable" : ["afi 100", "imdb top 250", "national film registry"],
    "genres" : [
        "action", "adventure", "animation", "children's", "comedy", "crime", "documentary", "drama", "fantasy", "film-noir", "horror", "musical", "mystery", "romance", "sci-fi", "thriller", "war", "western", "(no genres listed)"
    ]
}

In [None]:
# Average rating of all movies the user has watched
user_avg_ratings = ratings.groupby('userId')['rating'].mean()
print(user_avg_ratings.head(n=5))
# List of all movies the user has watched
movies_usr_watched = ratings.groupby('userId')['movieId'].apply(list)
print(movies_usr_watched.head(n=5))
# List of all genres the user has watched
genres_usr_watched = ratings.merge(movies, on='movieId')[['userId', 'genres']]
genres_usr_watched['genres'] = genres_usr_watched['genres'].str.split('|')

userId
1    3.531915
2    4.269231
3    3.588435
4    2.629630
5    3.272727
Name: rating, dtype: float64
userId
1    [17, 25, 29, 30, 32, 34, 36, 80, 110, 111, 161...
2    [31, 34, 39, 48, 153, 185, 186, 193, 207, 216,...
3    [2, 10, 11, 17, 26, 48, 62, 110, 141, 150, 151...
4    [223, 1210, 1272, 1327, 1513, 1833, 2115, 2428...
5    [10, 47, 110, 150, 153, 161, 165, 185, 208, 23...
Name: movieId, dtype: object


In [9]:
# Get unique genres watched by each user
genres_usr_unique = genres_usr_watched.groupby('userId')['genres'].sum().apply(lambda x: list(set(x)))
print(genres_usr_unique.head(n=5))

userId
1    [Mystery, Romance, Horror, Children, Fantasy, ...
2    [Mystery, Romance, Horror, Children, Fantasy, ...
3    [Mystery, Romance, Horror, Children, Fantasy, ...
4    [Mystery, Horror, Fantasy, Comedy, Thriller, W...
5    [Mystery, Horror, Romance, Children, Fantasy, ...
Name: genres, dtype: object


In [10]:
# 1) User watch count by genre
# Explode genres so each row is (userId, genre)
exploded = genres_usr_watched.explode('genres')
# Remove any missing genres
exploded = exploded[exploded['genres'].notnull()]
# User watch count by genre (number of movies watched per genre)
user_genre_watch_count = exploded.groupby(['userId', 'genres']).size().unstack(fill_value=0)
print(user_genre_watch_count.head())

# 2) User average rating by genre
# Merge ratings with movies to get genres for each rating
ratings_with_genres = ratings.merge(movies[['movieId', 'genres']], on='movieId')
ratings_with_genres['genres'] = ratings_with_genres['genres'].str.split('|')
ratings_exploded = ratings_with_genres.explode('genres')
ratings_exploded = ratings_exploded[ratings_exploded['genres'].notnull()]
# Group by user and genre, then average the ratings
user_genre_avg_rating = ratings_exploded.groupby(['userId', 'genres'])['rating'].mean().unstack()
print(user_genre_avg_rating.head())

genres  (no genres listed)  Action  Adventure  Animation  Children  Comedy  \
userId                                                                       
1                        0      19         17          0         2      51   
2                        0       9          7          8        11      26   
3                        0      58         58         12        20      54   
4                        0      11          5          0         0       7   
5                        0      14         13          3         4       9   

genres  Crime  Documentary  Drama  Fantasy  Film-Noir  Horror  IMAX  Musical  \
userId                                                                         
1          16            1    102        8          1       3     0        0   
2           6            0     24        4          0       1     2        7   
3          16            0     53       17          0       4     4       10   
4           3            0      8        3          0

In [None]:
# Join user-level features into a single DataFrame
user_features = pd.DataFrame(user_avg_ratings)
user_features.columns = ['avg_rating']

# Join unique genres watched
user_features = user_features.join(genres_usr_unique.rename('unique_genres_watched'))

# Join user watch count by genre
user_features = user_features.join(user_genre_watch_count, rsuffix='_watch_count')

# Join user average rating by genre
user_features = user_features.join(user_genre_avg_rating, rsuffix='_avg_rating')

print(user_features.head())