# Front Matter

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score
import warnings
import pickle

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# 1. Data Preparation
## Load and Clean Data

In [2]:
# Load datasets
movies = pd.read_csv(r'C:\Users\pedro\Desktop\Github\DS_340-Movies\Small MovieLens\movies.csv')
ratings = pd.read_csv(r'C:\Users\pedro\Desktop\Github\DS_340-Movies\Small MovieLens\ratings.csv')
tags = pd.read_csv(r'C:\Users\pedro\Desktop\Github\DS_340-Movies\Small MovieLens\tags.csv')

In [3]:
# Display first few rows
print("Movies DataFrame:")
display(movies.head())

print("Ratings DataFrame:")
display(ratings.head())

print("Tags DataFrame:")
display(tags.head())

Movies DataFrame:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Ratings DataFrame:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Tags DataFrame:


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [4]:
print(f"Movies: {movies.duplicated().sum()}")
print(f"Ratings: {ratings.duplicated().sum()}")
print(f"Tags: {tags.duplicated().sum()}")


Number of duplicate rows removed:
Movies: 0
Ratings: 0
Tags: 0


In [9]:
# Check for missing values
print("\nMissing values in Movies DataFrame:")
print(movies.isnull().sum())

print("\nMissing values in Ratings DataFrame:")
print(ratings.isnull().sum())

print("\nMissing values in Tags DataFrame:")
print(tags.isnull().sum())


Missing values in Movies DataFrame:
movieId    0
title      0
genres     0
dtype: int64

Missing values in Ratings DataFrame:
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

Missing values in Tags DataFrame:
userId       0
movieId      0
tag          0
timestamp    0
dtype: int64


In [7]:
# Drop rows in tags where 'tag' is NaN
tags = tags.dropna(subset=['tag']).reset_index(drop=True)

In [10]:
# Select necessary columns
movies = movies[['movieId', 'title', 'genres']]
ratings = ratings[['userId', 'movieId', 'rating']]
tags = tags[['movieId', 'tag']]

In [11]:
# Merge Movies and Ratings to create CF DataFrame
cf = pd.merge(movies, ratings, on='movieId', how='inner')

# Merge Movies and Tags to create CBF DataFrame
cbf = pd.merge(movies, tags, on='movieId', how='inner')

In [12]:
# Check data types
print("\nData types in CF DataFrame:")
print(cf.dtypes)

print("\nData types in CBF DataFrame:")
print(cbf.dtypes)


Data types in CF DataFrame:
movieId      int64
title       object
genres      object
userId       int64
rating     float64
dtype: object

Data types in CBF DataFrame:
movieId     int64
title      object
genres     object
tag        object
dtype: object


In [14]:
# Function to extract year from title
def extract_year(title):
    match = re.search(r'\((\d{4})\)', title)
    if match:
        return int(match.group(1))
    else:
        return np.nan  # Handle cases where year is not found

# Apply the function to create a 'year' column
movies['year'] = movies['title'].apply(extract_year)

# Clean the 'title' by removing the year and converting to lowercase
movies['title_clean'] = movies['title'].apply(lambda x: re.sub(r'\s*\(\d{4}\)', '', x).lower())

# Verify the changes
print("Sample of Movies DataFrame after extracting year and cleaning title:")
display(movies[['movieId', 'title', 'title_clean', 'year']].head())

Sample of Movies DataFrame after extracting year and cleaning title:


Unnamed: 0,movieId,title,title_clean,year
0,1,Toy Story (1995),toy story,1995.0
1,2,Jumanji (1995),jumanji,1995.0
2,3,Grumpier Old Men (1995),grumpier old men,1995.0
3,4,Waiting to Exhale (1995),waiting to exhale,1995.0
4,5,Father of the Bride Part II (1995),father of the bride part ii,1995.0


In [18]:
# Group tags by 'movieId' and concatenate them into a single string
tags_grouped = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()

# Verify the grouped tags
print("\nSample of Grouped Tags DataFrame:")
display(tags_grouped.head())


Sample of Grouped Tags DataFrame:


Unnamed: 0,movieId,tag
0,1,pixar pixar fun
1,2,fantasy magic board game Robin Williams game
2,3,moldy old
3,5,pregnancy remake
4,7,remake


In [16]:
# Merge 'tags_grouped' with 'movies' DataFrame to update 'cbf'
cbf = pd.merge(movies, tags_grouped, on='movieId', how='left')

# Replace NaN tags with empty strings (for movies without tags)
cbf['tag'] = cbf['tag'].fillna('')

# Verify the merged DataFrame
print("\nSample of Content-Based Filtering (CBF) DataFrame after merging tags:")
display(cbf.head())


Sample of Content-Based Filtering (CBF) DataFrame after merging tags:


Unnamed: 0,movieId,title,genres,year,title_clean,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0,toy story,pixar pixar fun
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995.0,jumanji,fantasy magic board game Robin Williams game
2,3,Grumpier Old Men (1995),Comedy|Romance,1995.0,grumpier old men,moldy old
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995.0,waiting to exhale,
4,5,Father of the Bride Part II (1995),Comedy,1995.0,father of the bride part ii,pregnancy remake


In [17]:
# Create 'year_str' column for concatenation
cbf['year_str'] = cbf['year'].astype(str)

# Combine genres, title_clean, tags, and year into the 'related' column
cbf['related'] = cbf['genres'].str.replace('|', ' ') + ' ' + cbf['title_clean'] + ' ' + cbf['tag'] + ' ' + cbf['year_str']

# Verify the 'related' column
print("\nSample of CBF DataFrame with 'related' column:")
display(cbf[['movieId', 'title', 'related']].head())


Sample of CBF DataFrame with 'related' column:


Unnamed: 0,movieId,title,related
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy to...
1,2,Jumanji (1995),Adventure Children Fantasy jumanji fantasy mag...
2,3,Grumpier Old Men (1995),Comedy Romance grumpier old men moldy old 1995.0
3,4,Waiting to Exhale (1995),Comedy Drama Romance waiting to exhale 1995.0
4,5,Father of the Bride Part II (1995),Comedy father of the bride part ii pregnancy r...


In [19]:
# Preprocess the 'related' column
cbf['related'] = cbf['related'].str.lower()  # Ensure lowercase
cbf['related'] = cbf['related'].str.replace(r'\d+', '', regex=True)  # Remove numbers
cbf['related'] = cbf['related'].str.replace(r'[^a-z\s]', '', regex=True)  # Remove special characters
cbf['related'] = cbf['related'].str.strip()  # Remove extra spaces

# Verify the preprocessing
print("\nSample of CBF DataFrame after preprocessing 'related' column:")
display(cbf[['movieId', 'title', 'related']].head())


Sample of CBF DataFrame after preprocessing 'related' column:


Unnamed: 0,movieId,title,related
0,1,Toy Story (1995),adventure animation children comedy fantasy to...
1,2,Jumanji (1995),adventure children fantasy jumanji fantasy mag...
2,3,Grumpier Old Men (1995),comedy romance grumpier old men moldy old
3,4,Waiting to Exhale (1995),comedy drama romance waiting to exhale
4,5,Father of the Bride Part II (1995),comedy father of the bride part ii pregnancy r...


In [20]:
# Check for NaN values in 'related' column
nan_related = cbf['related'].isna().sum()
print(f"\nNumber of NaN values in 'related' column: {nan_related}")


Number of NaN values in 'related' column: 0


In [None]:
# Save CF and CBF DataFrames to CSV (optional)
# Uncomment if you need to save for later use
cf.to_csv(r'C:\Users\pedro\Desktop\Github\DS_440-Project\cf.csv', index=False)
cbf.to_csv(r'C:\Users\pedro\Desktop\Github\DS_440-Project\cbf.csv', index=False)

print("\nCleaned CF and CBF DataFrames are ready.")


Cleaned CF and CBF DataFrames are ready.
