In [4]:
import pandas as pd

movies_df = pd.read_csv('ml-latest-small/movies.csv')

# Extract unique genres
# Split genres by '|' and get all unique genres
unique_genres = set()
for genres in movies_df['genres']:
    unique_genres.update(genres.split('|'))
unique_genres = sorted(unique_genres)  # Sort in alphabetic order

# Genre vector
for genre in unique_genres:
    movies_df[genre] = movies_df['genres'].apply(lambda x: 1 if genre in x.split('|') else 0)

# Drop the original 'genres' column (optional)
movies_df = movies_df.drop(columns=['genres'])

print(movies_df.head())


   movieId                               title  (no genres listed)  Action  \
0        1                    Toy Story (1995)                   0       0   
1        2                      Jumanji (1995)                   0       0   
2        3             Grumpier Old Men (1995)                   0       0   
3        4            Waiting to Exhale (1995)                   0       0   
4        5  Father of the Bride Part II (1995)                   0       0   

   Adventure  Animation  Children  Comedy  Crime  Documentary  ...  Film-Noir  \
0          1          1         1       1      0            0  ...          0   
1          1          0         1       0      0            0  ...          0   
2          0          0         0       1      0            0  ...          0   
3          0          0         0       1      0            0  ...          0   
4          0          0         0       1      0            0  ...          0   

   Horror  IMAX  Musical  Mystery  Romance  

In [5]:
from collections import Counter
tags_df = pd.read_csv('ml-latest-small/tags.csv')

unique_tag = set() # set use to get number of unique tags
count_tag = [] # working with arrays is easier, also all tags in the csv

for tags in tags_df['tag']:
    # This is the most common tag, but its not relevent to the movie so I skip it
    if (tags == 'In Netflix queue'):
        continue
    count_tag.append(tags)
    unique_tag.add(tags)

# get most common
count = Counter(count_tag)
max_tag = count.most_common(60)

name_of_tag = []
for x in max_tag:
    name_of_tag.append(x[0])

# number of unique tags
print(len(unique_tag))
# All 60 tags with their num uses 
print(max_tag)
# we get the top 60 tags
print(name_of_tag)

1588
[('atmospheric', 36), ('superhero', 24), ('thought-provoking', 24), ('funny', 23), ('Disney', 23), ('surreal', 23), ('religion', 22), ('sci-fi', 21), ('dark comedy', 21), ('quirky', 21), ('psychology', 21), ('suspense', 20), ('twist ending', 19), ('visually appealing', 19), ('crime', 19), ('politics', 18), ('music', 16), ('time travel', 16), ('mental illness', 16), ('dark', 15), ('comedy', 15), ('aliens', 15), ('space', 14), ('mindfuck', 14), ('dreamlike', 14), ('black comedy', 13), ('emotional', 13), ('heist', 13), ('anime', 12), ('action', 12), ('satire', 12), ('high school', 12), ('disturbing', 12), ('Shakespeare', 12), ('journalism', 12), ('Stephen King', 12), ('court', 12), ('imdb top 250', 11), ('comic book', 11), ('classic', 11), ('psychological', 11), ('Holocaust', 11), ('adolescence', 11), ('adultery', 11), ('boxing', 11), ('drugs', 10), ('Leonardo DiCaprio', 10), ('Mafia', 10), ('animation', 10), ('robots', 10), ('cinematography', 10), ('India', 10), ('ghosts', 10), ('re

In [6]:
movies_df = pd.read_csv('ml-latest-small/movies.csv')
tags_set = tags_df.groupby(['movieId']).agg({'tag':set}).reset_index()
print(tags_set.head(10))

# Goes over the most common tags
# adds column tags and goes through movie tags using the tags_set,
# applies 1 if tag in tag_set  
for tag in name_of_tag:
    movies_df[tag] = tags_set['tag'].apply(lambda x: 1 if tag in x else 0)

    
print(movies_df.head(10))
# can't see the 1s being added using movies_df.head() so i made a file
movies_df.to_csv('movie_vct.csv',index=False)

   movieId                                                tag
0        1                                       {fun, pixar}
1        2  {game, magic board game, Robin Williams, fantasy}
2        3                                       {moldy, old}
3        5                                {remake, pregnancy}
4        7                                           {remake}
5       11                              {politics, president}
6       14                              {politics, president}
7       16                                            {Mafia}
8       17                                      {Jane Austen}
9       21                                        {Hollywood}
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   
5        6        

In [12]:
movies_df['tag_list'] = tags_set['tag'].apply(lambda x: [tag.strip().lower() for tag in x])
movies_df['tag_list'].head()

0                                         [fun, pixar]
1    [game, magic board game, robin williams, fantasy]
2                                         [moldy, old]
3                                  [remake, pregnancy]
4                                             [remake]
Name: tag_list, dtype: object

In [22]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

movies_df = pd.read_csv('ml-latest-small/movies.csv')
tags_df = pd.read_csv('ml-latest-small/tags.csv')
tags_set = tags_df.groupby(['movieId']).agg({'tag':set}).reset_index()

# 3. Merge tags with movies, ensuring all movies are included
movies_df = movies_df.merge(tags_set, how='left', left_on='movieId', right_on='movieId')

# 4. Replace NaN tags with an empty set
movies_df['tag'] = movies_df['tag'].apply(lambda x: x if isinstance(x, set) else set())

# 5. Preprocess tags: Split and clean
movies_df['tag_list'] = movies_df['tag'].apply(lambda x: [tag.strip().lower() for tag in x])

# 3. Train word2vec model
# Combine all tag lists into one list of lists
tag_corpus = movies_df['tag_list'].tolist()

# Train a word2vec model
word2vec_model = Word2Vec(tag_corpus, vector_size=50, window=5, min_count=1, workers=4)

# 4. Create movie vectors by averaging tag vectors
def get_movie_vector(tags):
    tag_vectors = [word2vec_model.wv[tag] for tag in tags if tag in word2vec_model.wv]
    return np.mean(tag_vectors, axis=0) if tag_vectors else np.zeros(word2vec_model.vector_size)

movies_df['tag_vector'] = movies_df['tag_list'].apply(get_movie_vector)

# 5. Drop unnecessary columns if needed
# movies_df = movies_df.drop(columns=['tags', 'tag_list'])

print(movies_df.head())


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  \
0  Adventure|Animation|Children|Comedy|Fantasy   
1                   Adventure|Children|Fantasy   
2                               Comedy|Romance   
3                         Comedy|Drama|Romance   
4                                       Comedy   

                                                 tag  \
0                                       {fun, pixar}   
1  {game, magic board game, Robin Williams, fantasy}   
2                                       {moldy, old}   
3                                                 {}   
4                                {remake, pregnancy}   

                                            tag_list  \
0    

In [31]:
# Each vector is the same length 50
print(len(movies_df['tag_vector'][0]))
print(len(movies_df['tag_vector'][1]))
print('--------')

print(movies_df['tag_vector'][0])
print(movies_df['tag_vector'][1])
print(movies_df['tag_vector'][3])

50
50
--------
[ 0.0033657  -0.00672364 -0.00700654  0.00592889  0.00425919 -0.0029133
 -0.00420047  0.00707759 -0.00436095 -0.00597242  0.00664629 -0.00462665
 -0.00280841  0.00865653  0.01090183  0.00371634  0.00041324 -0.00321005
 -0.0109264   0.00674776 -0.00306672  0.00737879 -0.00655809  0.00533048
  0.00691196  0.00359092 -0.00866677 -0.0122306  -0.00462123  0.0155831
 -0.00111824 -0.00433347  0.00298679 -0.01379059 -0.00629304 -0.00450092
  0.00020178 -0.00293169 -0.00704303 -0.00485109 -0.00698893  0.01014541
  0.01220491  0.00115883 -0.00621614  0.00290144  0.00782992 -0.01500152
 -0.01357348  0.01416418]
[ 1.65762403e-03  7.28934538e-05 -1.19641609e-03  2.12769024e-04
 -1.09860115e-02 -4.68750857e-03  1.10640340e-02 -1.31127541e-03
 -3.02379159e-03 -9.59620113e-04  1.61726656e-03 -5.50090522e-03
  1.04773156e-02 -1.04739144e-02 -7.34326243e-03  3.61172063e-03
  4.73072752e-04 -2.03676685e-03  4.77112783e-03 -9.09759104e-03
  2.13456806e-05 -6.02829829e-03  9.51240025e-03  1.