In [1]:
import pandas as pd
import re

In [2]:
# Read the five source tables from the dataset/ folder, in this order.
movies, ratings, tags, genome_scores, genome_tags = map(
    pd.read_csv,
    (
        "dataset/movies.csv",
        "dataset/ratings.csv",
        "dataset/tags.csv",
        "dataset/genome-scores.csv",
        "dataset/genome-tags.csv",
    ),
)

In [8]:
# Per-column count of null cells in `tags`, largest count first.
missing = (
    tags.isna()
    .sum()
    .sort_values(ascending=False)
)
# Only report the columns that actually contain nulls.
print(missing.loc[missing.gt(0)])

tag    17
dtype: int64


In [9]:
# Remove every row of `tags` that has a null in any column.
tags.dropna(inplace=True)

# Then strip fully identical rows from each table, modifying the frames in place.
for _frame in (movies, ratings, tags, genome_scores, genome_tags):
    _frame.drop_duplicates(inplace=True)

In [10]:
def extract_year(title):
    """Return the year written in parentheses in a movie title, or None.

    A year is four digits wrapped in parentheses, e.g. ``"Toy Story (1995)"``
    gives ``1995``. If the title contains several such groups, the last one is
    returned: the titles in this dataset put the release year at the end, so an
    earlier group is part of the title text itself.

    Args:
        title: The movie title. Anything that is not a ``str`` (for instance
            the NaN pandas stores for a missing value) is treated as a title
            without a year instead of raising ``TypeError``.

    Returns:
        The year as an ``int``, or ``None`` when the title has no
        parenthesised four-digit group.
    """
    if not isinstance(title, str):
        return None
    years = re.findall(r'\((\d{4})\)', title)
    # Take the right-most match so a "(dddd)" inside the title cannot shadow the release year.
    return int(years[-1]) if years else None

# Derive a `year` column by running extract_year over every title.
movies['year'] = movies['title'].map(extract_year)

In [11]:
# Attach each genome tag's row from genome_tags to its score rows (inner join on tagId).
genome = genome_scores.merge(genome_tags, how='inner', on='tagId')

In [12]:
# Keep only the (movie, tag) rows whose relevance is strictly above 0.7.
important_tags = genome.loc[genome['relevance'].gt(0.7)]

In [13]:
# One row per movieId: its retained genome tags joined into a single
# space-separated string, stored in a column named `genome_tags`.
movie_tags = (
    important_tags
    .groupby('movieId')['tag']
    .apply(' '.join)
    .reset_index()
    .rename(columns={'tag': 'genome_tags'})
)

In [14]:
# One row per movieId: every tag in `tags` for that movie joined into a single
# space-separated string, stored in a column named `user_tags`.
user_tags = (
    tags
    .groupby('movieId')['tag']
    .apply(' '.join)
    .reset_index()
    .rename(columns={'tag': 'user_tags'})
)

In [15]:
# Drop tag columns left over from an earlier run of this cell. Without this,
# re-running the merges on an already-merged frame produces suffixed columns
# (genome_tags_x / genome_tags_y, user_tags_x / user_tags_y) and the fillna
# lookups below raise KeyError. On a first run neither column exists, so this
# changes nothing.
movies = movies.drop(columns=['genome_tags', 'user_tags'], errors='ignore')

# Merge movies with genome tags (left join: movies without genome tags are kept)
movies = pd.merge(movies, movie_tags, on='movieId', how='left')

# Merge with user tags (left join: movies without user tags are kept)
movies = pd.merge(movies, user_tags, on='movieId', how='left')

# Replace NaNs with empty strings so the string concatenation below never yields NaN
movies['genome_tags'] = movies['genome_tags'].fillna('')
movies['user_tags'] = movies['user_tags'].fillna('')

# Replace genre separators ('|' is matched literally, not as a regex alternation)
movies['genres'] = movies['genres'].str.replace('|', ' ', regex=False)

# Combine all metadata into one column.
# user_tags is merged above but is currently not part of `metadata`
# (its term is commented out on the line below).
movies['metadata'] = movies['genres'] + ' ' + movies['genome_tags']  # + ' ' + movies['user_tags']

In [16]:
# Show the merged frame as plain text; pandas abbreviates the printed form of a
# frame this large (the "..." row in the output).
print(movies)

       movieId                                              title  \
0            1                                   Toy Story (1995)   
1            2                                     Jumanji (1995)   
2            3                            Grumpier Old Men (1995)   
3            4                           Waiting to Exhale (1995)   
4            5                 Father of the Bride Part II (1995)   
...        ...                                                ...   
86532   288967               State of Siege: Temple Attack (2021)   
86533   288971                                 Ouija Japan (2021)   
86534   288975   The Men Who Made the Movies: Howard Hawks (1973)   
86535   288977                    Skinford: Death Sentence (2023)   
86536   288983  UNZIPPED: An Autopsy of American Inequality (2...   

                                            genres    year  \
0      Adventure Animation Children Comedy Fantasy  1995.0   
1                       Adventure Children Fant

In [17]:
# Save the enriched movie table as CSV; the row index is not written.
movies.to_csv(
    path_or_buf='dataset/cleaned_movies_with_metadata.csv',
    index=False,
)