In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix, lil_matrix, hstack
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import seaborn as sns
from fuzzywuzzy import process, fuzz



In [45]:
df_ratings = pd.read_csv(
    "data/ratings.csv",
    dtype={"userId": "int32", "movieId": "int32", "rating": "float32"}
)

# Calculate the reviews count and average rating, then reset index
# Ensure "reviews" is kept as an int
ratings_count = df_ratings.groupby("movieId").size().reset_index(
    name="reviews")
average_ratings = df_ratings.groupby(
    "movieId")["rating"].mean().round(1).reset_index(name="Rating")

# Read in the movies data
df_movies = pd.read_csv(
    "data/movies.csv",
    usecols=["movieId", "title", "genres"],
    dtype={"movieId": "int32", "title": "str", "genres": "str"}
)

# Merge the reviews count and average ratings with the movies data
# Make sure the merges do not change the "reviews" data type
df_merged = df_movies.merge(ratings_count, on="movieId", how="left").merge(
    average_ratings, on="movieId", how="left")

df_filtered_movies = df_merged[df_merged["reviews"] >= 300].dropna()

# Convert "reviews" to int explicitly
df_filtered_movies["reviews"] = df_filtered_movies["reviews"].astype("int32")

# Set "movieId" as the index
df_filtered_movies.set_index("movieId", inplace=True)
df_filtered_movies.reset_index(drop=True, inplace=True)

In [46]:
df_filtered_movies


Unnamed: 0,title,genres,reviews,Rating
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,76813,3.9
1,Jumanji (1995),Adventure|Children|Fantasy,30209,3.3
2,Grumpier Old Men (1995),Comedy|Romance,15820,3.2
3,Waiting to Exhale (1995),Comedy|Drama|Romance,3028,2.9
4,Father of the Bride Part II (1995),Comedy,15801,3.1
...,...,...,...,...
7871,John Wick: Chapter 4 (2023),Action|Crime|Thriller,651,3.7
7872,Dungeons & Dragons: Honor Among Thieves (2023),Action|Adventure|Fantasy,488,3.7
7873,Guardians of the Galaxy Volume 3 (2023),Action|Adventure|Sci-Fi,534,3.8
7874,The Super Mario Bros. Movie (2023),Adventure|Animation|Children|Comedy|Fantasy,366,3.3


In [53]:
df_filtered_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7876 entries, 0 to 7875
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         7876 non-null   object 
 1   genres        7876 non-null   object 
 2   reviews       7876 non-null   int32  
 3   Rating        7876 non-null   float32
 4   bayesian_avg  7876 non-null   float64
dtypes: float32(1), float64(1), int32(1), object(2)
memory usage: 246.2+ KB


## Baysian Average

It makes the data more reliable for items with less reviews in a dataset with a mix of both. We have cut off movies with less than 300 reviews in our cold start so that might not be nessecary, but still cool to have.

In [54]:
m = df_ratings["rating"].mean()
C = df_ratings.groupby("movieId").size().mean()

#Baysian might be used later if our dataset 
df_filtered_movies["bayesian_avg"] = (
    df_filtered_movies["reviews"] * df_filtered_movies["Rating"] + C * m) / (df_filtered_movies["reviews"] + C)     #The actual algorithm
df_filtered_movies["bayesian_avg"] = df_filtered_movies["bayesian_avg"].round(1)                                    #Adding it back inside rounded to 1 deciminal


In [55]:
df_filtered_movies

Unnamed: 0,title,genres,reviews,Rating,bayesian_avg
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,76813,3.9,3.9
1,Jumanji (1995),Adventure|Children|Fantasy,30209,3.3,3.3
2,Grumpier Old Men (1995),Comedy|Romance,15820,3.2,3.2
3,Waiting to Exhale (1995),Comedy|Drama|Romance,3028,2.9,3.0
4,Father of the Bride Part II (1995),Comedy,15801,3.1,3.1
...,...,...,...,...,...
7871,John Wick: Chapter 4 (2023),Action|Crime|Thriller,651,3.7,3.6
7872,Dungeons & Dragons: Honor Among Thieves (2023),Action|Adventure|Fantasy,488,3.7,3.6
7873,Guardians of the Galaxy Volume 3 (2023),Action|Adventure|Sci-Fi,534,3.8,3.7
7874,The Super Mario Bros. Movie (2023),Adventure|Animation|Children|Comedy|Fantasy,366,3.3,3.4


## Sparse Matrix

In [65]:
unique_genres = set()
df_filtered_movies['genres'].str.split('|').apply(unique_genres.update)

genre_counts = {genre: 0 for genre in unique_genres}
for genre in unique_genres:
    df_filtered_movies[genre] = df_filtered_movies['genres'].str.contains(
        genre).astype(int)
    genre_counts[genre] = df_filtered_movies[genre].sum()


  df_filtered_movies[genre] = df_filtered_movies['genres'].str.contains(


In [66]:
print(genre_counts)

{'War': 323, '(no genres listed)': 18, 'Horror': 842, 'Drama': 3678, 'Crime': 1045, 'Romance': 1331, 'IMAX': 149, 'Action': 1566, 'Adventure': 1115, 'Thriller': 1797, 'Mystery': 548, 'Comedy': 2842, 'Film-Noir': 76, 'Western': 140, 'Fantasy': 665, 'Animation': 421, 'Children': 564, 'Documentary': 257, 'Musical': 263, 'Sci-Fi': 847}
