<span style='color:#008000; font-size:20pt; font-weight:bold'>Import Libraries</span>

In [95]:
import pandas as pd
import re

pd.set_option('display.max_rows', None) # This code will display all of the dataframe
# pd.reset_option('all') # Reset to default

<span style='color:#008000; font-size:20pt; font-weight:bold'>Loading Dataset</span>

In [96]:
movielens_movies = pd.read_csv('movielens-1m/movies.dat', sep='::', engine='python', 
                        names=['movieId', 'title', 'genres'], encoding='ISO-8859-1')

In [98]:
movie_metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

<span style='color:#008000; font-size:20pt; font-weight:bold'>Data Cleaning</span>

<span style='color:#007ACC; font-size:15pt; font-weight:bold'>MovieLens-1M</span>

In [99]:
movielens_movies.tail()

Unnamed: 0,movieId,title,genres
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama
3882,3952,"Contender, The (2000)",Drama|Thriller


In [100]:
# Extracts the year from the movie title using regex pattern that looks for four-digit numbers within parentheses.
movielens_movies['year'] = movielens_movies['title'].str.extract(r'\((\d{4})\)')

# Removes the extracted year (four-digit numbers in parentheses) from the title, 
# ensuring only the movie name remains, and strips any extra spaces.
movielens_movies['title'] = movielens_movies['title'].str.replace(r'\(\d{4}\)', '', regex=True).str.strip()

# Replaces multiple spaces in the movie title with a single space to ensure a clean format.
movielens_movies['title'] = movielens_movies['title'].str.replace(r'\s+', ' ', regex=True).str.strip()

# Title got error E.g: "Contender, The"
# Fix titles by using regex
def fix_title_regex(title):
    return re.sub(r"^(.*), (The|A|An|L'|Le)( \(.+\))?$", r'\2 \1\3', title)
movielens_movies['title'] = movielens_movies['title'].apply(fix_title_regex)

In [101]:
movielens_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  3883 non-null   int64 
 1   title    3883 non-null   object
 2   genres   3883 non-null   object
 3   year     3883 non-null   object
dtypes: int64(1), object(3)
memory usage: 121.5+ KB


<span style='color:#007ACC; font-size:15pt; font-weight:bold'>Movie-Metadata</span>

In [106]:
movie_metadata['release_date'] = movie_metadata['release_date'].astype(str).str[:4]         
movie_metadata['title'] = movie_metadata['title'].str.replace(r'\s+', ' ', regex=True).str.strip()

<span style='color:#007ACC; font-size:15pt; font-weight:bold'>Merging btw MovieLens_Movies and Movie_Metadata</span>

In [None]:
merged = pd.merge(movielens_movies, movie_metadata[['title', 'release_date', 'overview']], 
                    left_on=['title', 'year'], right_on=['title', 'release_date'], how='left')

In [None]:
merged.head(5)

Unnamed: 0,movieId,title,genres,year,release_date,overview
0,1,Toy Story,Animation|Children's|Comedy,1995,1995,"Led by Woody, Andy's toys live happily in his ..."
1,2,Jumanji,Adventure|Children's|Fantasy,1995,1995,When siblings Judy and Peter discover an encha...
2,3,Grumpier Old Men,Comedy|Romance,1995,1995,A family wedding reignites the ancient feud be...
3,4,Waiting to Exhale,Comedy|Drama,1995,1995,"Cheated on, mistreated and stepped on, the wom..."
4,5,Father of the Bride Part II,Comedy,1995,1995,Just when George Banks has recovered from his ...


In [None]:
merged.isna().sum()

movieId           0
title             0
genres            0
year              0
release_date    853
overview        861
dtype: int64

In [None]:
merged = merged.drop_duplicates(subset=['movieId', 'title', 'year'])


In [None]:
merged = merged.rename(columns={'overview':'description'})

In [None]:
merged = merged[['movieId', 'title', 'genres', 'year', 'description']]
merged

Unnamed: 0,movieId,title,genres,year,description
0,1,Toy Story,Animation|Children's|Comedy,1995,"Led by Woody, Andy's toys live happily in his ..."
1,2,Jumanji,Adventure|Children's|Fantasy,1995,When siblings Judy and Peter discover an encha...
2,3,Grumpier Old Men,Comedy|Romance,1995,A family wedding reignites the ancient feud be...
3,4,Waiting to Exhale,Comedy|Drama,1995,"Cheated on, mistreated and stepped on, the wom..."
4,5,Father of the Bride Part II,Comedy,1995,Just when George Banks has recovered from his ...
5,6,Heat,Action|Crime|Thriller,1995,"Obsessive master thief, Neil McCauley leads a ..."
6,7,Sabrina,Comedy|Romance,1995,An ugly duckling having undergone a remarkable...
7,8,Tom and Huck,Adventure|Children's,1995,"A mischievous young boy, Tom Sawyer, witnesses..."
8,9,Sudden Death,Action,1995,International action superstar Jean Claude Van...
9,10,GoldenEye,Action|Adventure|Thriller,1995,James Bond must unmask the mysterious head of ...


In [None]:
merged.isna().sum()

movieId          0
title            0
genres           0
year             0
description    861
dtype: int64

In [None]:
unmatched_titles = movielens_movies[~movielens_movies['title'].isin(movie_metadata['title'])]
unmatched_titles

In [None]:
merged[pd.isna(merged['description'])]

In [None]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3883 entries, 0 to 3888
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   movieId      3883 non-null   int64 
 1   title        3883 non-null   object
 2   genres       3883 non-null   object
 3   year         3883 non-null   object
 4   description  3022 non-null   object
dtypes: int64(1), object(4)
memory usage: 182.0+ KB


In [None]:
missing_descriptions = merged[pd.isna(merged['description'])]
print(f"Number of missing description movies: {len(missing_descriptions)}")

Number of missing description movies: 861


<span style='color:#007ACC; font-size:15pt; font-weight:bold'>Call OMDB API to fill in missing description movies</span>

In [132]:
import requests
import time

API_KEY = "271a55f8"

def fetch_movie_description(title):
    url = f"http://www.omdbapi.com/?t={title}&apikey={API_KEY}"
    response = requests.get(url)
    data = response.json()
    
    if "Plot" in data and data["Plot"] != "N/A":
        return data["Plot"]
    
    return "No description available"


In [None]:
merged['description'] = merged.apply(
    lambda row: fetch_movie_description(row['title']) if pd.isna(row['description']) else row['description'],
    axis=1
)

In [None]:
print(f"Number of missing description movies: {merged['description'].isna().sum()}")

Number of missing description movies: 0


<span style='color:#007ACC; font-size:15pt; font-weight:bold'>Extract to CSV file</span>

In [None]:
merged.to_csv('movielens_movies_with_descriptions.csv', index=False, encoding='utf-8')

# 📌 Bước 1: Đọc dữ liệu

In [None]:

import pandas as pd

# Đọc dữ liệu
ratings = pd.read_csv('/mnt/data/ratings.dat', sep='::', engine='python', names=['userId', 'movieId', 'rating', 'timestamp'], encoding='latin1')
movies = pd.read_csv('/mnt/data/movies.dat', sep='::', engine='python', names=['movieId', 'title', 'genres'], encoding='latin1')
users = pd.read_csv('/mnt/data/users.dat', sep='::', engine='python', names=['userId', 'gender', 'age', 'occupation', 'zip'], encoding='latin1')

# Hiển thị thông tin cơ bản
print(f"Số lượng user: {ratings['userId'].nunique()}")
print(f"Số lượng movie: {ratings['movieId'].nunique()}")
print(f"Số lượng rating: {len(ratings)}")

ratings.head()


# 📊 Bước 2: Thực hiện EDA

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Vẽ biểu đồ phân phối rating
plt.figure(figsize=(8,5))
sns.histplot(ratings['rating'], bins=np.arange(0.5, 5.5, 0.5), kde=True)
plt.xlabel('Rating')
plt.ylabel('Count')
plt.title('Phân phối Rating')
plt.grid(True)
plt.show()


In [None]:

# Phân tích thể loại phim (genres)
all_genres = movies['genres'].str.split('|').explode()
genre_counts = all_genres.value_counts()

# Vẽ biểu đồ thể loại phim
plt.figure(figsize=(12,6))
sns.barplot(x=genre_counts.index, y=genre_counts.values, palette='viridis')
plt.xticks(rotation=90)
plt.xlabel('Genres')
plt.ylabel('Số lượng phim')
plt.title('Tần suất thể loại phim')
plt.show()


In [None]:

# Top 10 phim được đánh giá nhiều nhất
top_n_movies = ratings['movieId'].value_counts().head(10)
top_movies = movies[movies['movieId'].isin(top_n_movies.index)]

print("Top 10 phim có nhiều rating nhất:")
top_movies.merge(top_n_movies, left_on="movieId", right_index=True).rename(columns={"movieId": "Movie ID", "title": "Title", "rating": "Number of Ratings"})


In [None]:

# Tính sparsity của ma trận user-item
num_users = ratings['userId'].nunique()
num_movies = ratings['movieId'].nunique()
num_ratings = len(ratings)
sparsity = 1 - (num_ratings / (num_users * num_movies))

print(f"Tỷ lệ sparsity của ma trận user-item: {sparsity:.2%}")


# 🏗️ Bước 3: Chuẩn bị dữ liệu cho mô hình

In [None]:

# Tạo ma trận user-item (nếu tự code CF)
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
user_item_matrix.fillna(0, inplace=True)  # Điền các giá trị NaN bằng 0

print("Kích thước ma trận user-item:", user_item_matrix.shape)
user_item_matrix.head()


In [None]:

# Chuẩn bị dữ liệu theo format 3 cột (user, item, rating) nếu dùng scikit-surprise
ratings_subset = ratings[['userId', 'movieId', 'rating']]
ratings_subset.head()


## ✂️ Bước 4: Chia tập train/test

In [None]:

from sklearn.model_selection import train_test_split

# Chia dữ liệu theo tỉ lệ 80/20
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

print(f"Số lượng rating trong tập train: {len(train)}")
print(f"Số lượng rating trong tập test: {len(test)}")


In [None]:

# (Tùy chọn) Chia dữ liệu dựa trên thời gian (Time-based Split)
ratings_sorted = ratings.sort_values(by='timestamp')
split_idx = int(len(ratings_sorted) * 0.8)
train_time_based = ratings_sorted.iloc[:split_idx]
test_time_based = ratings_sorted.iloc[split_idx:]

print(f"Số lượng rating trong tập train (time-based): {len(train_time_based)}")
print(f"Số lượng rating trong tập test (time-based): {len(test_time_based)}")
