# Movie prefs 2 vec
Goal: build a taste vector for each user, derived from their movie ratings and the genres of the movies they rated, so that content can be recommended to them.

In [2]:
# import kagglehub
# import shutil
# import os
# 
# # Download latest version
# path = kagglehub.dataset_download("aigamer/movie-lens-dataset")
# 
# # Ensure the destination folder exists
# os.makedirs("data", exist_ok=True)
# 
# # Move the downloaded files to the destination folder
# for filename in os.listdir(path):
#     source_file = os.path.join(path, filename)
#     destination_file = os.path.join("data", filename)
#     
#     # Move the file
#     shutil.move(source_file, destination_file)

In [3]:
import pandas as pd

# Load the movies data into a DataFrame
# (per the preview below: movieId, title, and a pipe-separated genres string).
# The path is relative, so it resolves against the notebook's working directory.
movies_df = pd.read_csv("data/movies.csv")
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Load the external-ID lookup: maps each movieId to an imdbId and a tmdbId.
# NOTE(review): tmdbId is parsed as float (862.0 in the preview) — presumably
# because some rows have no tmdbId; verify before using it as a key.
links_df = pd.read_csv("data/links.csv")
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
# Load the ratings: each row holds a userId, a movieId, the rating given, and a
# timestamp (presumably Unix epoch seconds — TODO confirm).
ratings_df = pd.read_csv("data/ratings.csv")
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Load the free-text tags: each row holds a userId, a movieId, the tag text, and
# a timestamp (presumably Unix epoch seconds — TODO confirm).
tags_df = pd.read_csv("data/tags.csv")
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
# Look up movie 60756 (the first movieId in the tags preview) and every tag row for it.
movie_60756 = movies_df.loc[movies_df['movieId'] == 60756]
tags_60756 = tags_df.loc[tags_df['movieId'] == 60756]

# Show the catalogue row first, then the (userId, tag) pairs.
print("Movie info:", movie_60756, sep="\n")
print("\nUser tags for this movie:", tags_60756.loc[:, ['userId', 'tag']], sep="\n")

Movie info:
      movieId                 title  genres
6801    60756  Step Brothers (2008)  Comedy

User tags for this movie:
     userId              tag
0         2            funny
1         2  Highly quotable
2         2     will ferrell
205      62           comedy
206      62            funny
207      62     will ferrell
909     424            funny
910     424     will ferrell


In [8]:
# Headline sizes: distinct raters and rated movies, then raw row counts.
print(
    f"Number of unique users: {ratings_df['userId'].nunique()}",
    f"Number of unique movies: {ratings_df['movieId'].nunique()}",
    f"Number of ratings: {len(ratings_df)}",
    f"Number of tags: {len(tags_df)}",
    sep="\n",
)



Number of unique users: 610
Number of unique movies: 9724
Number of ratings: 100836
Number of tags: 3683


In [9]:
# Summary statistics of the rating column (count, mean, std, min/max, quartiles).
print("\nRating statistics:", ratings_df['rating'].describe(), sep="\n")


Rating statistics:
count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64


In [10]:
# Collect every distinct genre label, skipping the "(no genres listed)" placeholder.
all_genres = {
    label
    for genre_string in movies_df['genres']
    if genre_string != '(no genres listed)'
    for label in genre_string.split('|')
}

print(f"Number of unique genres: {len(all_genres)}")
print(f"Unique genres: {sorted(all_genres)}")

Number of unique genres: 19
Unique genres: ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


In [None]:
# For each movie, split the genres string into a list (or empty list if no genres)
def split_genres(genres):
    """Turn a pipe-separated genre string into a list of genre labels.

    Parameters
    ----------
    genres
        A string such as ``'Comedy|Romance'``, or the ``'(no genres listed)'``
        placeholder.

    Returns
    -------
    list of str
        The individual labels. An empty list is returned for the placeholder,
        for an empty string (which would otherwise produce the bogus label
        ``['']``), and for non-string values such as the NaN pandas uses for a
        blank CSV cell (which would otherwise raise ``AttributeError``).
    """
    if not isinstance(genres, str) or genres in ('', '(no genres listed)'):
        return []
    return genres.split('|')

# Per-movie genre lists: one list of labels for each row of movies_df.
# NOTE(review): the name `genre_list` is reassigned further down to the sorted
# list of genre labels, so it only refers to this Series until that cell runs.
genre_list = movies_df['genres'].apply(split_genres) 


In [None]:
# Create genre indicators for each movie
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer # because we can onehot encode this one 


# Build the per-movie genre lists here instead of reading the notebook-level
# `genre_list`: a later cell rebinds that name to the sorted list of genre
# labels, so re-running this cell after it would binarize the wrong data.
movie_genre_lists = movies_df['genres'].apply(split_genres)

# Fixing `classes` pins the column order to the sorted genre labels.
mlb = MultiLabelBinarizer(classes=sorted(all_genres))
genre_indicators = mlb.fit_transform(movie_genre_lists)

# Reuse movies_df's index so the movieId assignment below lines up row for row
# even if movies_df does not carry the default 0..n-1 index.
genre_df = pd.DataFrame(genre_indicators, columns=mlb.classes_, index=movies_df.index)
genre_df['movieId'] = movies_df['movieId']

genre_df.head()

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movieId
0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,4
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5


In [27]:
# Method 1: User-Genre preference vectors
# Give every rating row the genre string of the movie it refers to
# (inner join on movieId, so ratings with no matching movie row are dropped).
user_ratings = pd.merge(
    ratings_df,
    movies_df.loc[:, ['movieId', 'genres']],
    how='inner',
    on='movieId',
)
user_ratings

Unnamed: 0,userId,movieId,rating,timestamp,genres
0,1,1,4.0,964982703,Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Comedy|Romance
2,1,6,4.0,964982224,Action|Crime|Thriller
3,1,47,5.0,964983815,Mystery|Thriller
4,1,50,5.0,964982931,Crime|Mystery|Thriller
...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Drama|Horror|Thriller
100832,610,168248,5.0,1493850091,Action|Crime|Thriller
100833,610,168250,5.0,1494273047,Horror
100834,610,168252,5.0,1493846352,Action|Sci-Fi


## Building User Taste Vectors

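Now we'll create a taste vector for each user: one entry per genre, where every movie the user rated adds its rating (divided by the sum of that user's ratings) to each genre the movie belongs to.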

In [None]:
# Fix a stable, sorted ordering of the genre labels; a label's position in this
# ordering is its slot in every taste vector.
genre_list = list(all_genres)
genre_list.sort()
genre_to_idx = dict(zip(genre_list, range(len(genre_list))))

print(genre_to_idx)

# Filled in by a later cell: userId -> taste vector
user_taste_vectors = dict()

{'Action': 0, 'Adventure': 1, 'Animation': 2, 'Children': 3, 'Comedy': 4, 'Crime': 5, 'Documentary': 6, 'Drama': 7, 'Fantasy': 8, 'Film-Noir': 9, 'Horror': 10, 'IMAX': 11, 'Musical': 12, 'Mystery': 13, 'Romance': 14, 'Sci-Fi': 15, 'Thriller': 16, 'War': 17, 'Western': 18}


In [38]:

def create_user_taste_vector(user_id, ratings=None, genre_index=None):
    """Build a rating-weighted genre preference vector for one user.

    Every movie the user rated adds ``rating / rating_sum`` to the slot of each
    genre it carries, where ``rating_sum`` is the sum of all of that user's
    ratings (including movies without genres). A movie with several genres
    contributes to several slots, so the entries need not sum to 1.

    Parameters
    ----------
    user_id
        Value matched against the ``userId`` column.
    ratings : pandas.DataFrame, optional
        Frame with ``userId``, ``rating`` and ``genres`` (pipe-separated string)
        columns. Defaults to the notebook-level ``user_ratings``.
    genre_index : dict, optional
        Maps a genre label to its position in the vector. Defaults to the
        notebook-level ``genre_to_idx``.

    Returns
    -------
    numpy.ndarray or None
        Vector of length ``len(genre_index)``; ``None`` when the user has no
        rows in ``ratings``.

    Raises
    ------
    KeyError
        If a movie carries a genre label that is missing from ``genre_index``.
    """
    if ratings is None:
        ratings = user_ratings
    if genre_index is None:
        genre_index = genre_to_idx

    user_data = ratings[ratings['userId'] == user_id]
    if len(user_data) == 0:
        return None

    # Size the vector from the same mapping that indexes it, rather than from a
    # separately maintained list that could drift out of sync with the mapping.
    genre_vector = np.zeros(len(genre_index))

    rating_sum = user_data['rating'].sum()
    if rating_sum == 0:
        # Nothing to normalise by; return the all-zero vector instead of NaNs.
        return genre_vector

    # Walk the two needed columns directly; iterrows() would build a full
    # Series object for every row.
    for rating, genres in zip(user_data['rating'], user_data['genres']):
        # Skip the placeholder and non-string values (e.g. NaN for a blank cell).
        if not isinstance(genres, str) or genres == '(no genres listed)':
            continue
        # Weight by rating (normalized)
        weight = rating / rating_sum
        for genre in genres.split('|'):
            genre_vector[genre_index[genre]] += weight

    return genre_vector

# Demo on a small sample: the first 10 distinct userIds, in order of appearance.
sample_users = ratings_df['userId'].unique()[:10]

user_taste_vectors.update(
    (user_id, create_user_taste_vector(user_id)) for user_id in sample_users
)

# Inspect one user: pair each genre label with its weight, strongest first.
sample_user = sample_users[0]
taste_vector = user_taste_vectors[sample_user]
user_taste_df = (
    pd.DataFrame({'Genre': genre_list, 'Preference': taste_vector})
    .sort_values('Preference', ascending=False)
)

print(f"User {sample_user}'s top genre preferences:")
print(user_taste_df.head(10))

User 1's top genre preferences:
        Genre  Preference
0      Action    0.384008
1   Adventure    0.368213
4      Comedy    0.350444
7       Drama    0.304047
16   Thriller    0.225074
8     Fantasy    0.199408
5       Crime    0.193485
3    Children    0.188549
15     Sci-Fi    0.166831
2   Animation    0.134255
