In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Đọc file movies.csv (sửa đường dẫn nếu chạy trên Colab)
data = pd.read_csv(
    "movie_data/movies.csv",   # đúng đường dẫn
    sep="\t",
    encoding='latin1',
    usecols=["movie_id", "title", "genres"]
)


print("Đọc dữ liệu xong")
print(data.shape)
data.head()


Đọc dữ liệu xong
(3883, 3)


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Thay | bằng khoảng trắng và bỏ dấu -
data["genres"] = data["genres"].apply(lambda s: s.replace("|", " ").replace("-", ""))

print("Tiền xử lý dữ liệu xong")
data.head()


Tiền xử lý dữ liệu xong


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation Children's Comedy
1,2,Jumanji (1995),Adventure Children's Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data["genres"])

print("Tạo TF-IDF xong")
print("TF-IDF shape:", tfidf_matrix.shape)


Tạo TF-IDF xong
TF-IDF shape: (3883, 18)


In [8]:
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim_df = pd.DataFrame(cosine_sim, index=data["title"], columns=data["title"])

print("Ma trận cosine similarity xong")
cosine_sim_df.head(5)


Ma trận cosine similarity xong


title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Bamboozled (2000),Bootmen (2000),Digimon: The Movie (2000),Get Carter (2000),Get Carter (1971),Meet the Parents (2000),Requiem for a Dream (2000),Tigerland (2000),Two Family House (2000),"Contender, The (2000)"
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.0,0.305525,0.197372,0.260194,0.344351,0.0,0.197372,0.425153,0.0,0.0,...,0.344351,0.260194,0.80146,0.0,0.0,0.344351,0.0,0.0,0.0,0.0
Jumanji (1995),0.305525,1.0,0.0,0.0,0.0,0.0,0.0,0.718623,0.0,0.32089,...,0.0,0.0,0.538118,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men (1995),0.197372,0.0,1.0,0.433093,0.573172,0.0,1.0,0.0,0.0,0.0,...,0.573172,0.433093,0.0,0.0,0.0,0.573172,0.0,0.0,0.0,0.0
Waiting to Exhale (1995),0.260194,0.0,0.433093,1.0,0.755606,0.0,0.433093,0.0,0.0,0.0,...,0.755606,1.0,0.0,0.262005,0.0,0.755606,0.655026,0.655026,0.655026,0.343133
Father of the Bride Part II (1995),0.344351,0.0,0.573172,0.755606,1.0,0.0,0.573172,0.0,0.0,0.0,...,1.0,0.755606,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [9]:
seen_movie = "Quick and the Dead, The (1995)"
top_k = 20

top_movies = cosine_sim_df.loc[seen_movie, :].sort_values(ascending=False)[:top_k]

print(f"Top {top_k} phim giống '{seen_movie}':")
top_movies

Top 20 phim giống 'Quick and the Dead, The (1995)':


Unnamed: 0_level_0,"Quick and the Dead, The (1995)"
title,Unnamed: 1_level_1
"Quick and the Dead, The (1995)",1.0
Tashunga (1995),0.89762
True Grit (1969),0.89762
Dances with Wolves (1990),0.858772
"Good, The Bad and The Ugly, The (1966)",0.851792
"Fistful of Dollars, A (1964)",0.851792
Last Man Standing (1996),0.811141
Young Guns II (1990),0.798927
Young Guns (1988),0.798927
Maverick (1994),0.798927


In [10]:
top_movies_df = top_movies.reset_index()
top_movies_df.columns = ["Movie", "Similarity Score"]
top_movies_df


Unnamed: 0,Movie,Similarity Score
0,"Quick and the Dead, The (1995)",1.0
1,Tashunga (1995),0.89762
2,True Grit (1969),0.89762
3,Dances with Wolves (1990),0.858772
4,"Good, The Bad and The Ugly, The (1966)",0.851792
5,"Fistful of Dollars, A (1964)",0.851792
6,Last Man Standing (1996),0.811141
7,Young Guns II (1990),0.798927
8,Young Guns (1988),0.798927
9,Maverick (1994),0.798927
