In [26]:
import pandas as pd
import re
from langdetect import detect


books_df = pd.read_csv('dataset/books_data_with_id.csv')


books_df['publishedDate'] = pd.to_datetime(books_df['publishedDate'], errors='coerce')


def split_categories(cat_list):
    if isinstance(cat_list, str):
        categories = cat_list.strip('[]').replace("'", "").split(', ')
        split_categories = []
        for cat in categories:
            split_categories.extend(cat.replace(' and ', ' & ').split(' & '))
        return [cat.strip() for cat in split_categories if cat.strip()]
    return []


def split_author(author_list):
    if isinstance(author_list, str):
        authors = author_list.strip('[]').replace("'", "").split(', ')
        split_authors = []
        for cat in authors:
            split_authors.extend(cat.replace(' , ', ' & ').split(' & '))
        return [cat.strip() for cat in split_authors if cat.strip()]
    return []


def preprocess_description(description):

    description = description.lower()
    description = re.sub(r'[^a-zA-Z0-9\s]', '', description)

    try:
        if detect(description) == 'en':
            return description
        else:
            return None 
    except:
        return None  

books_df['categories'] = books_df['categories'].apply(lambda x: split_categories(x) if isinstance(x, str) else [])
books_df['authors'] = books_df['authors'].apply(lambda x: split_author(x) if isinstance(x, str) else [])
books_df['description'] = books_df['description'].apply(lambda x: preprocess_description(x) if isinstance(x, str) else None)



In [27]:
books_data_df = books_df[['categories', 'Title', 'authors', 'description','book_id']].dropna(subset=['categories', 'Title', 'authors', 'description'])

books_data_df = books_data_df.sample(n=10000, random_state=1)

print("Sample DataFrame of 5000 books:")
print(books_data_df)
books_data_df = books_data_df[
    books_data_df['categories'].apply(lambda x: len(x) > 0) & 
    books_data_df['authors'].apply(lambda x: len(x) > 0) & 
    books_data_df['description'].apply(lambda x: len(x) > 0)
].reset_index(drop=True)


Sample DataFrame of 5000 books:
                   categories  \
33781                 [Drama]   
47761   ["Childrens stories"]   
210051                [Texas]   
175940  [Juvenile Nonfiction]   
40367        [Adobe Premiere]   
...                       ...   
4763     [Body, Mind, Spirit]   
189982               [Nature]   
188322              [England]   
80345           [Photography]   
130191             [Religion]   

                                                    Title  \
33781                        Sight Unseen and Other Plays   
47761   The TALE OF THE VIRTUAL NIGHTMARE: ARE YOU AFR...   
210051                Slavery and the annexation of Texas   
175940                                One Boy from Kosovo   
40367   Focal Easy Guide to Premiere Pro: For New User...   
...                                                   ...   
4763                                     Ghosts of London   
189982                                            Beetles   
188322         The pos

In [28]:
books_rating_with_id_df = pd.read_csv('dataset/Books_rating_with_id.csv')

books_rating_df = books_rating_with_id_df[books_rating_with_id_df['Title'].isin(books_data_df['Title'])]

books_rating_df.loc[:, 'review/time'] = pd.to_datetime(books_rating_df['review/time'], unit='s')
books_rating_df = books_rating_df.dropna(subset=['User_id'])
books_rating_df = books_rating_df[['User_id', 'review/score', 'book_id', 'Title']].dropna(subset=['User_id', 'review/score', 'book_id', 'Title'])

print("Df mới của book rating:")
print(books_rating_df)

Df mới của book rating:
                User_id  review/score  book_id  \
79       A2HDZHLMT3L5IO           5.0       15   
161      A18YH0DR1GFOGW           4.0       24   
162       AMKC1EJBUXDS2           5.0       24   
163      A37QRFQTGTJH7K           4.0       24   
164       AMKC1EJBUXDS2           5.0       24   
...                 ...           ...      ...   
2999914  A1WD8SD9L65UN3           5.0    78851   
2999915  A2Y1SCM930PZI7           5.0    78851   
2999917  A18DH7N6ZI48L4           5.0    78851   
2999918  A2617Y1UUBS2I3           5.0    78851   
2999919  A3BNL7P0QWDINI           2.0    78851   

                                   Title  
79                  A husband for Kutani  
161      History of Magic and the Occult  
162      History of Magic and the Occult  
163      History of Magic and the Occult  
164      History of Magic and the Occult  
...                                  ...  
2999914               The Red Right Hand  
2999915               The Red R

In [29]:
import pandas as pd
# Đếm số lần đánh giá cho mỗi User_id
user_rating_counts = books_rating_df.groupby('User_id').size().reset_index(name='Rated_Books_Count')

# Lọc các User_id đã đánh giá ít nhất 10 cuốn sách
users_with_at_least_10_ratings = user_rating_counts[user_rating_counts['Rated_Books_Count'] >= 10]['User_id']

# Lọc các hàng trong 'books_rating_df' với User_id thỏa mãn điều kiện
filtered_books_rating_df = books_rating_df[books_rating_df['User_id'].isin(users_with_at_least_10_ratings)]

print("Df mới của book rating sau khi loại bỏ các User_id đánh giá ít hơn 10 lần:")
print(filtered_books_rating_df)


Df mới của book rating sau khi loại bỏ các User_id đánh giá ít hơn 10 lần:
                User_id  review/score  book_id  \
544      A14OJS0VWMOSWO           5.0       79   
1562     A18OBUSMXVE8R0           5.0      126   
1563     A14OJS0VWMOSWO           5.0      126   
3067     A2EMP366TTS6E1           4.0      237   
3068     A3BIWTN2DA0YY2           4.0      237   
...                 ...           ...      ...   
2998086   A96K1ZGW56S2I           3.0   132356   
2998100  A1IOJE0W1NXOSE           4.0   132356   
2999786  A14OJS0VWMOSWO           5.0   132379   
2999787  A3BH49ZKESHDID           5.0   132379   
2999911  A1SKNS2DGG46XM           5.0    78851   

                                                     Title  
544      Alternative Chicago: Unique Destinations Beyon...  
1562                          Chocolate: The Sweet History  
1563                          Chocolate: The Sweet History  
3067                                      Voices in Summer  
3068               

In [30]:
books_data_df = books_data_df[
    books_data_df['categories'].apply(lambda x: len(x) > 0) & 
    books_data_df['authors'].apply(lambda x: len(x) > 0) & 
    books_data_df['description'].apply(lambda x: len(x) > 0)
].reset_index(drop=True)


In [31]:
from sklearn.model_selection import train_test_split

def split_user_group(group, test_size=0.2):
    if len(group) < 2:
        return group, pd.DataFrame()
    train, test = train_test_split(group, test_size=test_size, random_state=1)
    return train, test

train_list = []
test_list = []

for user_id, group in filtered_books_rating_df.groupby('User_id'):
    train_group, test_group = split_user_group(group)
    train_list.append(train_group)
    test_list.append(test_group)

train_data = pd.concat(train_list)
test_data = pd.concat(test_list)

print("Tập train:")
print(train_data)
print("Tập test:")
print(test_data)

Tập train:
                User_id  review/score  book_id  \
1550346  A114YQ7ZT9Y1W5           5.0   110203   
821645   A114YQ7ZT9Y1W5           4.0    58984   
227871   A114YQ7ZT9Y1W5           4.0    15809   
717477   A114YQ7ZT9Y1W5           4.0    51021   
283121   A114YQ7ZT9Y1W5           5.0    19882   
...                 ...           ...      ...   
871464    AYE778OBAAX4O           5.0     7075   
581049    AYE778OBAAX4O           5.0     7075   
2167858   AYE778OBAAX4O           5.0   167038   
2190484   AYE778OBAAX4O           5.0   168749   
1605741   AYE778OBAAX4O           5.0     7075   

                                                     Title  
1550346                             Moby Dick Or the Whale  
821645   Blood meridian, or, The evening redness in the...  
227871                                             3 LIVES  
717477                                      The Immoralist  
283121                                 Homage to Catalonia  
...                   

In [32]:
books_rating_df = train_data

In [33]:
import pandas as pd

def count_books_per_user(data):
    user_book_counts = data.groupby('User_id')['book_id'].count().reset_index()
    user_book_counts.columns = ['User_id', 'Rated_Books_Count']
    return user_book_counts

train_user_counts = count_books_per_user(books_rating_df)

train_user_counts = train_user_counts.sort_values(by='Rated_Books_Count', ascending=False)

print("Số lượng sách đã đánh giá trong train_data cho từng User_id (sắp xếp giảm dần):")
print(train_user_counts)


Số lượng sách đã đánh giá trong train_data cho từng User_id (sắp xếp giảm dần):
            User_id  Rated_Books_Count
15   A14OJS0VWMOSWO                252
430     AFVQZQ8PW0L                171
42   A1D2C0WDCSHUWZ                116
439   AHD101501WCN1                 87
345  A3MAN5CBRX1KEV                 67
..              ...                ...
263  A2WLZD9BY669HY                  8
265   A2XZKGL6DJA03                  8
271  A2ZY5IX8MDRJY7                  8
278  A32IWIJ6UI3YYF                  8
497   AYE778OBAAX4O                  8

[498 rows x 2 columns]


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
import pandas as pd
import numpy as np

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(books_data_df['description'])

mlb_categories = MultiLabelBinarizer()
mlb_authors = MultiLabelBinarizer()



In [35]:
cosine_sim_description = cosine_similarity(tfidf_matrix)
cosine_sim_description_df = pd.DataFrame(cosine_sim_description, index=books_data_df['book_id'], columns=books_data_df['book_id'])
print(cosine_sim_description_df)


book_id    33782   47762     210052    175941    40368     136814    37113   \
book_id                                                                       
33782    1.000000     0.0  0.000000  0.022990  0.000000  0.000000  0.025061   
47762    0.000000     1.0  0.000000  0.000000  0.000000  0.000000  0.000000   
210052   0.000000     0.0  1.000000  0.002778  0.004205  0.001939  0.052051   
175941   0.022990     0.0  0.002778  1.000000  0.002202  0.026534  0.002032   
40368    0.000000     0.0  0.004205  0.002202  1.000000  0.001537  0.003076   
...           ...     ...       ...       ...       ...       ...       ...   
4764     0.000000     0.0  0.000000  0.015724  0.007659  0.009019  0.018607   
189983   0.008162     0.0  0.010013  0.005376  0.010319  0.003125  0.013376   
188323   0.000000     0.0  0.017237  0.005120  0.000000  0.007245  0.000000   
80346    0.004523     0.0  0.004923  0.002578  0.141113  0.007932  0.003601   
130192   0.002202     0.0  0.002397  0.007381  0.001

In [36]:
categories_encoded = mlb_categories.fit_transform(books_data_df['categories'])
authors_encoded = mlb_authors.fit_transform(books_data_df['authors'])

In [37]:

print(books_data_df[['categories', 'authors']].isnull().sum())
print("-------------")

print(books_data_df['categories'].value_counts())
print(books_data_df['authors'].value_counts())
print("-------------")

print(books_data_df[books_data_df['categories'] == ''])
print(books_data_df[books_data_df['authors'] == ''])


categories    0
authors       0
dtype: int64
-------------
categories
[Fiction]                       1530
[Religion]                       623
[History]                        581
[Biography, Autobiography]       429
[Juvenile Fiction]               424
                                ... 
[Missionaries]                     1
[Maori (New Zealand people)]       1
[Free trade]                       1
[Businesspeople]                   1
[Beauty, Personal]                 1
Name: count, Length: 771, dtype: int64
authors
[William Shakespeare]    9
[Agatha Christie]        7
[Anne Perry]             7
[Danielle Steel]         7
[Jane Austen]            6
                        ..
[Gene Seabolt]           1
[Matthew Geller]         1
[Urs Eggli]              1
[Greg Costikyan]         1
[Malcolm A. Jeeves]      1
Name: count, Length: 8929, dtype: int64
-------------
Empty DataFrame
Columns: [categories, Title, authors, description, book_id]
Index: []
Empty DataFrame
Columns: [categories, T

In [38]:
def check_all_zero(encoded_matrix, feature_name, df):
    all_zero_rows = []
    for idx, row in enumerate(encoded_matrix):
        if np.all(row == 0):
            all_zero_rows.append(idx)
    
    if all_zero_rows:
        print(f"Các hàng toàn 0 trong {feature_name}: {all_zero_rows}")
        zero_id_list = df.iloc[all_zero_rows]['book_id'].tolist()  # Giả sử cột ID có tên là 'id'
        print(f"Các ID của sách có hàng mã hóa toàn 0 trong {feature_name}: {zero_id_list}")
    else:
        print(f"Không có hàng nào toàn 0 trong {feature_name}.")

check_all_zero(categories_encoded, "categories", books_data_df)
check_all_zero(authors_encoded, "authors", books_data_df)

Không có hàng nào toàn 0 trong categories.
Không có hàng nào toàn 0 trong authors.


Khoảng cách L2 (Euclidean Distance)

In [39]:
import pandas as pd
import numpy as np
import faiss

categories_encoded = categories_encoded.astype(np.float32)
authors_encoded = authors_encoded.astype(np.float32)

n_categories = categories_encoded.shape[0]
dim_categories = categories_encoded.shape[1]

index_categories = faiss.IndexFlatL2(dim_categories)
index_categories.add(categories_encoded)  

distances_categories, indices_categories = index_categories.search(categories_encoded, k=n_categories) 
similarity_categories = 1 - (distances_categories / np.max(distances_categories))

n_authors = authors_encoded.shape[0]
dim_authors = authors_encoded.shape[1]

index_authors = faiss.IndexFlatL2(dim_authors)
index_authors.add(authors_encoded) 

distances_authors, indices_authors = index_authors.search(authors_encoded, k=n_authors)  
similarity_authors = 1 - (distances_authors / np.max(distances_authors)) 

similarity_categories_df = pd.DataFrame(similarity_categories, index=books_data_df['book_id'], columns=books_data_df['book_id'])
similarity_authors_df = pd.DataFrame(similarity_authors, index=books_data_df['book_id'], columns=books_data_df['book_id'])

print("Ma trận tương đồng cho thể loại:")
print(similarity_categories_df)

print("Ma trận tương đồng cho tác giả:")
print(similarity_authors_df)


Ma trận tương đồng cho thể loại:
book_id  33782     47762     210052    175941    40368     136814    37113   \
book_id                                                                       
33782       1.0  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000   
47762       1.0  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000   
210052      1.0  0.714286  0.714286  0.714286  0.714286  0.714286  0.714286   
175941      1.0  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000   
40368       1.0  0.714286  0.714286  0.714286  0.714286  0.714286  0.714286   
...         ...       ...       ...       ...       ...       ...       ...   
4764        1.0  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000   
189983      1.0  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000   
188323      1.0  1.000000  1.000000  1.000000  1.000000  0.714286  0.714286   
80346       1.0  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000   
130192      1.0  1.

In [40]:
alpha = 0.2  # (description)
beta = 0.4  # (categories)
gamma = 0.4  # (authors)

similarity_matrix = (
    alpha * cosine_sim_description_df +
    beta * similarity_categories_df +
    gamma * similarity_authors_df
) 

print(similarity_matrix)

book_id    33782     47762     210052    175941    40368     136814    37113   \
book_id                                                                         
33782    1.000000  0.800000  0.785185  0.789783  0.785185  0.785185  0.790197   
47762    0.800000  0.985185  0.785185  0.785185  0.785185  0.785185  0.785185   
210052   0.800000  0.685714  0.863492  0.664048  0.664333  0.663880  0.673902   
175941   0.804598  0.785185  0.785741  0.985185  0.785626  0.790492  0.785592   
40368    0.800000  0.670900  0.671741  0.671340  0.870900  0.671207  0.671515   
...           ...       ...       ...       ...       ...       ...       ...   
4764     0.800000  0.785185  0.785185  0.788330  0.786717  0.786989  0.788907   
189983   0.801632  0.785185  0.787188  0.786260  0.787249  0.785810  0.787860   
188323   0.800000  0.800000  0.803447  0.801024  0.785185  0.672349  0.670900   
80346    0.800905  0.770370  0.771355  0.770886  0.798593  0.771957  0.771091   
130192   0.800440  0.785185 

In [41]:
# import pandas as pd

# similar_books_list = []

# for book_id_1 in cosine_sim_matrix.index:
#     for book_id_2 in cosine_sim_matrix.columns:
#         if book_id_1 != book_id_2:
#             similarity = cosine_sim_matrix.loc[book_id_1, book_id_2]
#             if 0 < similarity < 0.9:
#                 similar_books_list.append({
#                     'book_id_1': book_id_1,
#                     'book_id_2': book_id_2,
#                     'similarity_score': similarity
#                 })

# similar_books_df = pd.DataFrame(similar_books_list)

# top_similar_books = similar_books_df.sort_values(by='similarity_score', ascending=False).head(5)

# if not top_similar_books.empty:
#     print("Các cặp sách có giá trị tương đồng cosine > 0 và < 0.9:")
#     for index, row in top_similar_books.iterrows():
#         book_id_1 = row['book_id_1']
#         book_id_2 = row['book_id_2']
#         similarity = row['similarity_score']

#         title1 = books_data_df[books_data_df['book_id'] == book_id_1]['Title'].values[0]
#         title2 = books_data_df[books_data_df['book_id'] == book_id_2]['Title'].values[0]
#         categories1 = books_data_df[books_data_df['book_id'] == book_id_1]['categories'].values[0]
#         categories2 = books_data_df[books_data_df['book_id'] == book_id_2]['categories'].values[0]

#         print(f"Sách '{title1}' với thể loại {categories1} và Sách '{title2}' với thể loại {categories2} có giá trị tương đồng: {similarity}")
# else:
#     print("Không có cặp sách nào có giá trị tương đồng trong khoảng (0, 0.9).")


In [42]:
# import pandas as pd

# book_title = "Brave New World Revisited"
# book_id = books_data_df.loc[books_data_df['Title'] == book_title, 'book_id'].values
# print(book_id)
# if len(book_id) > 0:
#     # Tìm user_id đã đánh giá sách này
#     user_ids = books_rating_df.loc[books_rating_df['book_id'].isin(book_id), 'User_id'].unique()
#     print("User IDs đã đánh giá sách 'History of Magic and the Occult':", user_ids)
# else:
#     print("Không tìm thấy sách có tiêu đề 'History of Magic and the Occult'.")


In [43]:
# book_id = 24

# book_info = books_data_df.loc[books_data_df['book_id'] == book_id]

# if not book_info.empty:
#     title = book_info['Title'].values[0]
#     description = book_info['description'].values[0]
#     categories = book_info['categories'].values[0]
#     author = book_info['authors'].values[0]
#     print(f"Thông tin cuốn sách với book_id {book_id}:")
#     print(f"Tiêu đề: '{title}'")
#     print(f"Mô tả: '{description}'")
#     print(f"Thể loại: {categories}")
#     print(f"Tác giả: {author}")
# else:
#     print(f"Không tìm thấy sách với book_id {book_id}.")


In [44]:
# book_id = 64109

# book_info = books_data_df.loc[books_data_df['book_id'] == book_id]

# if not book_info.empty:
#     title = book_info['Title'].values[0]
#     description = book_info['description'].values[0]
#     categories = book_info['categories'].values[0]
#     author = book_info['authors'].values[0]
#     print(f"Thông tin cuốn sách với book_id {book_id}:")
#     print(f"Tiêu đề: '{title}'")
#     print(f"Mô tả: '{description}'")
#     print(f"Thể loại: {categories}")
#     print(f"Tác giả: {author}")
# else:
#     print(f"Không tìm thấy sách với book_id {book_id}.")


In [45]:
import numpy as np
import pandas as pd

def calculate_similarity(book_id, rated_books):
    if book_id not in similarity_matrix.index:
        return None

    rated_books_in_similarity = [book for book in rated_books if book in similarity_matrix.columns]

    if rated_books_in_similarity:
        similar_scores = similarity_matrix.loc[book_id, rated_books_in_similarity]

        average_similarity = similar_scores.mean()
        if not np.isnan(average_similarity) and average_similarity > 0:
            return average_similarity
    return None

def recommend_books(user_id, top_k=10):
    rated_books = train_data[train_data['User_id'] == user_id]['book_id'].unique()

    if len(rated_books) == 0:
        print(f"Người dùng {user_id} chưa đánh giá sách nào.")
        return []

    similarity_scores = {}

    test_books = test_data['book_id'].unique()
    unrated_books = books_data_df['book_id'][~books_data_df['book_id'].isin(rated_books)]
    unrated_books = unrated_books[unrated_books.isin(test_books)]

    rated_books = [book_id for book_id in rated_books if book_id in similarity_matrix.index]

    for book_id in unrated_books:
        similarity_score = calculate_similarity(book_id, rated_books)
        
        if similarity_score is not None:
            similarity_scores[book_id] = similarity_score

    if similarity_scores:
        suggested_books = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]

        return [(book[0], similarity_scores[book[0]]) for book in suggested_books]
    else:
        print(f"Không có sách nào để gợi ý cho người dùng {user_id}.")
        return []



In [46]:
user_id = 'A14OJS0VWMOSWO'
top_k = 10

suggested_books = recommend_books(user_id, top_k)
print(suggested_books)
if suggested_books:
    for book_id, similarity_score in suggested_books:
        title = books_data_df.loc[books_data_df['book_id'] == book_id, 'Title'].values[0]
        categories = books_data_df.loc[books_data_df['book_id'] == book_id, 'categories'].values[0]
        print(f"Sách '{title}' (ID: {book_id}) với thể loại {categories}, Điểm tương đồng: {similarity_score:.4f}")
else:
    print("Không có cuốn sách nào để gợi ý.")

[(157529, 0.6763149839795598), (44076, 0.676189972889137), (41170, 0.6761572510118246), (163054, 0.6761347646292153), (30038, 0.676133394061653), (49206, 0.6760495374101276), (14960, 0.6760448166450413), (11415, 0.6760312667423983), (65466, 0.6759936348143167), (40004, 0.6759798243675629)]
Sách 'One True Love' (ID: 157529) với thể loại ['Fiction'], Điểm tương đồng: 0.6763
Sách 'The Children's Book of Heroes' (ID: 44076) với thể loại ['Fiction'], Điểm tương đồng: 0.6762
Sách 'Two From Galilee.,' (ID: 41170) với thể loại ['Fiction'], Điểm tương đồng: 0.6762
Sách 'The Last Safe Place on Earth' (ID: 163054) với thể loại ['Fiction'], Điểm tương đồng: 0.6761
Sách 'Animal Dreams' (ID: 30038) với thể loại ['Fiction'], Điểm tương đồng: 0.6761
Sách 'The Rawhide Knot And Other Stories' (ID: 49206) với thể loại ['Fiction'], Điểm tương đồng: 0.6760
Sách 'The Science Fiction Hall of Fame: The Greatest Science Fiction Stories of All Time' (ID: 14960) với thể loại ['Fiction'], Điểm tương đồng: 0.6760


In [47]:
import pandas as pd

def calculate_top_k_hits_and_metrics(user_id, k, train_data, test_data):
    recommended_books = recommend_books(user_id, k)
    book_list = [book_id for book_id, score in recommended_books]

    actual_books = test_data[test_data['User_id'] == user_id]['book_id'].tolist()

    hits = len(set(book_list) & set(actual_books))

    precision = hits / k if k > 0 else 0

    total_relevant_books = len(actual_books) 
    recall = hits / total_relevant_books if total_relevant_books > 0 else 0

    return hits, precision, recall

def evaluate_f1_score(k, train_data, test_data):
    user_ids = train_data['User_id'].unique() 
    total_hits = 0
    total_precision = 0
    total_recall = 0
    total_users = len(user_ids)

    for user_id in user_ids:
        hits, precision, recall = calculate_top_k_hits_and_metrics(user_id, k, train_data, test_data)
        total_hits += hits
        total_precision += precision
        total_recall += recall

    avg_precision = total_precision / total_users if total_users > 0 else 0
    avg_recall = total_recall / total_users if total_users > 0 else 0

    if avg_precision + avg_recall > 0:
        f1_score = 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall)
    else:
        f1_score = 0

    return total_hits, avg_precision, avg_recall, f1_score

k = 10
total_hits, avg_precision, avg_recall, f1_score = evaluate_f1_score(k, train_data, test_data)

print(f"Tổng số hits: {total_hits}")
print(f"Precision trung bình: {avg_precision:.4f}")
print(f"Recall trung bình: {avg_recall:.4f}")
print(f"F1-Score: {f1_score:.4f}")


Tổng số hits: 50
Precision trung bình: 0.0100
Recall trung bình: 0.0273
F1-Score: 0.0147


In [48]:
import pandas as pd

def calculate_top_k_hits_and_metrics(user_id, k, train_data, test_data):
    recommended_books = recommend_books(user_id, k)
    book_list = [book_id for book_id, score in recommended_books]

    actual_books = test_data[test_data['User_id'] == user_id]['book_id'].tolist()

    hits = len(set(book_list) & set(actual_books))

    precision = hits / k if k > 0 else 0

    total_relevant_books = len(actual_books) 
    recall = hits / total_relevant_books if total_relevant_books > 0 else 0

    return hits, precision, recall

def evaluate_f1_score(k, train_data, test_data):
    user_ids = train_data['User_id'].unique() 
    total_hits = 0
    total_precision = 0
    total_recall = 0
    total_users = len(user_ids)

    for user_id in user_ids:
        hits, precision, recall = calculate_top_k_hits_and_metrics(user_id, k, train_data, test_data)
        total_hits += hits
        total_precision += precision
        total_recall += recall

    avg_precision = total_precision / total_users if total_users > 0 else 0
    avg_recall = total_recall / total_users if total_users > 0 else 0

    if avg_precision + avg_recall > 0:
        f1_score = 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall)
    else:
        f1_score = 0

    return total_hits, avg_precision, avg_recall, f1_score

k = 20
total_hits, avg_precision, avg_recall, f1_score = evaluate_f1_score(k, train_data, test_data)

print(f"Tổng số hits: {total_hits}")
print(f"Precision trung bình: {avg_precision:.4f}")
print(f"Recall trung bình: {avg_recall:.4f}")
print(f"F1-Score: {f1_score:.4f}")


Tổng số hits: 86
Precision trung bình: 0.0086
Recall trung bình: 0.0466
F1-Score: 0.0146


In [49]:
import pandas as pd

def calculate_top_k_hits_and_metrics(user_id, k, train_data, test_data):
    recommended_books = recommend_books(user_id, k)
    book_list = [book_id for book_id, score in recommended_books]

    actual_books = test_data[test_data['User_id'] == user_id]['book_id'].tolist()

    hits = len(set(book_list) & set(actual_books))

    precision = hits / k if k > 0 else 0

    total_relevant_books = len(actual_books) 
    recall = hits / total_relevant_books if total_relevant_books > 0 else 0

    return hits, precision, recall

def evaluate_f1_score(k, train_data, test_data):
    user_ids = train_data['User_id'].unique() 
    total_hits = 0
    total_precision = 0
    total_recall = 0
    total_users = len(user_ids)

    for user_id in user_ids:
        hits, precision, recall = calculate_top_k_hits_and_metrics(user_id, k, train_data, test_data)
        total_hits += hits
        total_precision += precision
        total_recall += recall

    avg_precision = total_precision / total_users if total_users > 0 else 0
    avg_recall = total_recall / total_users if total_users > 0 else 0

    if avg_precision + avg_recall > 0:
        f1_score = 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall)
    else:
        f1_score = 0

    return total_hits, avg_precision, avg_recall, f1_score

k = 50
total_hits, avg_precision, avg_recall, f1_score = evaluate_f1_score(k, train_data, test_data)

print(f"Tổng số hits: {total_hits}")
print(f"Precision trung bình: {avg_precision:.4f}")
print(f"Recall trung bình: {avg_recall:.4f}")
print(f"F1-Score: {f1_score:.4f}")


Tổng số hits: 181
Precision trung bình: 0.0073
Recall trung bình: 0.0928
F1-Score: 0.0135
