<span style='color:#008000; font-size:20pt; font-weight:bold'>Import Libraries</span>

In [190]:
import re

import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# pd.set_option('display.max_rows', None) # This code will display all of the dataframe
# pd.reset_option('all') # Reset to default

<span style='color:#008000; font-size:20pt; font-weight:bold'>Loading Dataset</span>

In [191]:
# Movie catalogue: '::'-delimited file read without a header row, so the
# column names are supplied explicitly.
movies = pd.read_csv(
    'movielens-1m/movies.dat',
    names=['movieId', 'title', 'genres'],
    sep='::',
    engine='python',  # multi-character separators require the Python parser
    encoding='ISO-8859-1',
)

In [192]:
# Comma-separated movie table with descriptions (comma is read_csv's default separator).
movies_with_des = pd.read_csv('movielens_movies_with_descriptions.csv')

In [193]:
# User table: same '::'-delimited layout, column names supplied explicitly.
users = pd.read_csv(
    'movielens-1m/users.dat',
    names=['userId', 'gender', 'age', 'occupation', 'zip-code'],
    sep='::',
    engine='python',  # multi-character separators require the Python parser
    encoding='ISO-8859-1',
)

In [194]:
# Rating events: one row per (userId, movieId) rating, '::'-delimited.
ratings = pd.read_csv(
    'movielens-1m/ratings.dat',
    names=['userId', 'movieId', 'rating', 'timestamp'],
    sep='::',
    engine='python',  # multi-character separators require the Python parser
    encoding='ISO-8859-1',
)

In [195]:
# Discard the columns that the rest of the notebook does not use.
# Re-binding the result with errors='ignore' (rather than inplace=True) keeps
# this cell idempotent: re-running it after the columns are already gone is a
# no-op instead of a KeyError.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')
users = users.drop(columns=['zip-code'], errors='ignore')

In [196]:

# Pull the four-digit release year out of the "(YYYY)" marker in the title.
year_from_title = movies['title'].str.extract(r'\((\d{4})\)', expand=False)

# If this cell is re-run, the titles no longer contain "(YYYY)" and the
# extraction yields NaN everywhere; fall back to the years captured earlier
# so a re-run does not wipe the column.
if 'year' in movies.columns:
    year_from_title = year_from_title.fillna(movies['year'])
movies['year'] = year_from_title

# Remove the "(YYYY)" marker, collapse runs of whitespace into a single space
# and trim both ends of the title.
movies['title'] = (
    movies['title']
    .str.replace(r'\(\d{4}\)', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)


def fix_title_regex(title):
    """Move a trailing article back to the front of a catalogue-style title.

    "Matrix, The" becomes "The Matrix". A parenthesised suffix that follows
    the article (e.g. an alternative title) is kept at the end:
    "Postman, The (Postino, Il)" becomes "The Postman (Postino, Il)".

    Args:
        title: The title to normalise. Non-string values (such as NaN for a
            missing title) are returned unchanged.

    Returns:
        The title with the article moved to the front, or the input unchanged
        when it does not end in one of the recognised articles.
    """
    if not isinstance(title, str):
        return title

    def _lead_with_article(match):
        rest, article, suffix = match.group(1), match.group(2), match.group(3) or ''
        # Elided articles such as "L'" attach directly to the next word.
        separator = '' if article.endswith("'") else ' '
        return f"{article}{separator}{rest}{suffix}"

    return re.sub(r"^(.*), (The|A|An|L'|Le)( \(.+\))?$", _lead_with_article, title)
# Normalise every title element-wise with fix_title_regex.
movies['title'] = movies['title'].map(fix_title_regex)

<span style='color:#008000; font-size:20pt; font-weight:bold'>Item-based CF</span>

<span style='color:#007ACC; font-size:15pt; font-weight:bold'>Pivot Table</span>

In [197]:
# Users x movies rating matrix: one row per userId, one column per movieId.
user_item_matrix = ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)
# User/movie pairs without a rating are filled with 0.
user_item_matrix = user_item_matrix.fillna(0)

<span style='color:#007ACC; font-size:15pt; font-weight:bold'>Cosine Similarity</span>

In [198]:
# Item-item similarity matrix built from the rating vectors: the user-item
# matrix is transposed so that each row passed to cosine_similarity is one
# movie's vector of user ratings.
# (cosine_similarity is imported in the notebook's import cell.)
item_similarity = cosine_similarity(user_item_matrix.T)

# Label both axes with movieId so similarities can be looked up by movie.
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=user_item_matrix.columns,
    columns=user_item_matrix.columns,
)

<span style='color:#007ACC; font-size:15pt; font-weight:bold'>Recommender</span>

In [199]:
# Recommend movies similar to a given one (traditional item-based CF).
def item_based_recommend(movie_id, top_n=5):
    """Return the movies most similar to ``movie_id`` by rating-vector cosine similarity.

    Reads the notebook-level ``item_similarity_df`` (movieId x movieId
    similarity table) and ``movies`` (metadata with ``movieId``, ``title``
    and ``year`` columns), and prints a one-line header naming the query movie.

    Args:
        movie_id: Identifier of the movie to find neighbours for.
        top_n: Maximum number of recommendations; values below 0 are treated as 0.

    Returns:
        A DataFrame with columns ``movieId``, ``title``, ``year`` and
        ``similarity``, sorted by descending similarity. Candidates without a
        row in ``movies`` are omitted. If ``movie_id`` is missing from the
        similarity table or from ``movies``, a message string is returned instead.
    """
    not_found_message = f"Movie ID {movie_id} not exists in the system."
    if movie_id not in item_similarity_df.columns:
        return not_found_message

    # A movie can have ratings but no metadata row; report it as unknown
    # instead of failing with an IndexError.
    original_movie = movies.loc[movies['movieId'] == movie_id, ['title', 'year']]
    if original_movie.empty:
        return not_found_message
    original_title, original_year = original_movie.iloc[0]

    # Exclude the query movie by label rather than by position: another movie
    # can tie with it at similarity 1.0, so it is not guaranteed to sort first.
    similar_scores = (
        item_similarity_df[movie_id]
        .drop(labels=movie_id)
        .sort_values(ascending=False)
        .head(max(top_n, 0))
    )

    # .assign builds a new frame, so no column is written onto a slice of `movies`.
    recommended_movies = (
        movies.loc[movies['movieId'].isin(similar_scores.index), ['movieId', 'title', 'year']]
        .assign(similarity=lambda frame: frame['movieId'].map(similar_scores))
        .sort_values(by='similarity', ascending=False)
        .reset_index(drop=True)
    )

    print(f"Top {top_n} similarity movies with Movie ID {movie_id}: \n{original_title} ({original_year}):")
    return recommended_movies

In [200]:
# Example: the five nearest neighbours of movieId 1.
item_based_recommend(movie_id=1, top_n=5)

Top 5 similarity movies with Movie ID 1: 
Toy Story (1995):


Unnamed: 0,movieId,title,year,similarity
0,3114,Toy Story 2,1999,0.633104
1,1265,Groundhog Day,1993,0.610826
2,588,Aladdin,1992,0.605849
3,2355,A Bug's Life,1998,0.579382
4,1270,Back to the Future,1985,0.570125
