In [23]:
import pandas as pd

In [24]:
movies = pd.read_csv('data/movies.csv', sep=',')
ratings = pd.read_csv('data/ratings.csv', sep=',', nrows=10000000)
links = pd.read_csv('data/links.csv', sep=',')
movies_with_plots = pd.read_csv('data/mpst_full_data.csv', sep=',')

In [25]:
movies_with_plots.rename(columns={"imdb_id": "imdbId"}, inplace=True)

user_ids = ratings['userId'].unique()
user2idx = {user_id: i for i, user_id in enumerate(user_ids)}
movie_ids = ratings['movieId'].unique()
movie2idx = {movie_id: i for i, movie_id in enumerate(movie_ids)}

ratings['userId'] = ratings['userId'].map(user2idx)
ratings['movieId'] = ratings['movieId'].map(movie2idx)

# Filter movies with at least 10 votes
movie_counts = ratings.groupby('movieId')['userId'].count()
popular_movies = movie_counts[movie_counts >= 10].index
ratings_filtered = ratings[ratings['movieId'].isin(popular_movies)].copy()

# Filter users who have rated at least 50 movies
user_counts = ratings_filtered.groupby('userId')['movieId'].count()
active_users = user_counts[user_counts >= 50].index
ratings_filtered = ratings_filtered[ratings_filtered['userId'].isin(active_users)].copy()

movies_filtered = movies[movies['movieId'].isin(popular_movies)].copy()
links_filtered = links[links['movieId'].isin(popular_movies)].copy()

# Ensure 'imdbId' follows the correct formatting (prefix with 'tt' and zero-padding to 7 digits)
links_filtered['imdbId'] = 'tt' + links_filtered['imdbId'].astype(str).str.zfill(7)

movies_merged = pd.merge(links_filtered, movies_with_plots, on='imdbId', how='left')

movies_merged.drop(columns=['split', 'synopsis_source', 'imdbId', 'tmdbId'], inplace=True)

# Extract year from the title in movies_filtered and append to movies_merged
movies_filtered['year'] = movies_filtered['title'].str.extract(r'\((\d{4})\)')

movies_merged = pd.merge(movies_merged, movies_filtered[['movieId', 'year', 'genres']], on='movieId', how='left')

movies_merged.dropna(inplace=True)

# Filter ratings based on available movies in the merged DataFrame
ratings_filtered = ratings_filtered[ratings_filtered['movieId'].isin(movies_merged['movieId'])].copy()

In [26]:
movies_merged.head()

Unnamed: 0,movieId,title,plot_synopsis,tags,year,genres
0,1,Toy Story,A boy called Andy Davis (voice: John Morris) u...,"comedy, fantasy, cult, cute, violence, clever,...",1995,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,The film begins in 1869 in the town of Brantfo...,"psychedelic, fantasy",1995,Adventure|Children|Fantasy
2,3,Grumpier Old Men,The feud between Max (Walter Matthau) and John...,"revenge, comedy, prank",1995,Comedy|Romance
3,4,Waiting to Exhale,"""Friends are the People who let you be yoursel...",revenge,1995,Comedy|Drama|Romance
4,5,Father of the Bride Part II,The film begins five years after the events of...,"romantic, comedy, fantasy, sentimental",1995,Comedy


In [27]:
print(ratings['userId'].nunique())

6503


In [28]:
ratings_filtered.shape

(495099, 4)

In [34]:
plot_synopsis = movies[movies['movieId'] == 1].title.values[0]

print(plot_synopsis)

Toy Story (1995)


In [35]:
color = {'one': 1, 
         'two': 2
         }

1