# Load book data

In [1]:
import pandas as pd
books = pd.read_excel("/content/drive/MyDrive/library/book.xlsx")
books = books[~books["book_id"].isnull()].reset_index(drop = True)
books.head(5)

Unnamed: 0,book_id,title,title_length,author_id,publisher_id,category_id,published_time,score,num_of_score,content
0,1.0,夢土,2.0,78.0,54.0,2,2008.04,0.0,0.0,01. 序曲／演奏弦樂團_x000D_\n02. 祖靈之聲／紀曉君_x000D_\n03. ...
1,4.0,奇蹟的夏天【單碟精裝版】DVD，家用版,19.0,50.0,22.0,4,,0.0,0.0,足球比賽開始！一群年輕原住民小朋友所組成的足球隊，在經歷一連串的比賽後，終於打進了這場準決賽...
2,5.0,大漢溪流域的三峽莊,9.0,25.0,50.0,3,2004.1,0.0,0.0,「2003 客家文化資源調查」叢書輯選4個客家鄉鎮做田野調查，有台北縣的《大漢溪流域的三峽莊...
3,6.0,來去東勢庄,6.0,41.0,50.0,3,2004.0,0.0,0.0,「2003 客家文化資源調查」叢書輯選4個客家鄉鎮做田野調查，有台北縣的《大漢溪流域的三峽莊...
4,7.0,大和志. 一個村落的誕生,12.0,80.0,50.0,1,2004.1,0.0,0.0,「2003 客家文化資源調查」叢書輯選4個客家鄉鎮做田野調查，有台北縣的《大漢溪流域的三峽莊...


# Book data preprocessing

In [3]:
books["published_time"] = books["published_time"].astype("str")
books["published_year"] = books["published_time"].str[:4]
#text cleanup
for col in ["title","content"]:
  books[col] = books[col].fillna("").astype(str).str.strip()
#combine title with content for vectorization
books["text"] = (books["title"] + " "+books["content"]).str.strip()

#one-hot encode author_id, pulisher_id
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
X_meta = ohe.fit_transform(books[["author_id","publisher_id","category_id" ]])
X_meta = X_meta.toarray()

# embbeding text (title & content)
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
texts = books["text"].tolist()
emb = model.encode(texts, batch_size=64, normalize_embeddings=True)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
#concate the data (book_id, title+content embedding, author+publisher+category_id one-hot encoding, score, num_of_score, publish_year)
data_1 = books[["book_id", "score", "num_of_score","published_year"]].reset_index(drop=True)
meta_df = pd.DataFrame(X_meta, columns=ohe.get_feature_names_out(["author_id","publisher_id","category_id"])).reset_index(drop=True)
emb_df = pd.DataFrame(emb).reset_index(drop=True)
processed_data = pd.concat([data_1, meta_df, emb_df], axis=1)
processed_data.head(5)


Unnamed: 0,book_id,score,num_of_score,published_year,author_id_1.0,author_id_2.0,author_id_3.0,author_id_4.0,author_id_5.0,author_id_6.0,...,374,375,376,377,378,379,380,381,382,383
0,1.0,0.0,0.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.013741,0.05548,0.007118,0.01987,-0.0292,0.0597,0.090683,-0.011423,0.016195,0.013536
1,4.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.109407,0.010004,-0.024499,0.024782,-0.039779,0.042379,0.035443,0.05499,-0.037037,0.04533
2,5.0,0.0,0.0,2004.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.002827,0.024591,0.005744,0.025903,-0.022592,0.026379,0.134257,0.032097,0.057082,-0.031892
3,6.0,0.0,0.0,2004.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.003014,0.013898,0.008057,0.039058,-0.016474,0.037983,0.128308,0.05556,0.024469,-0.015784
4,7.0,0.0,0.0,2004.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001062,0.009163,0.012179,0.028383,-0.009795,0.030687,0.130857,0.051157,0.035598,-0.02358


In [5]:
#normalizaing score, num_of_score, year
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
num_cols = ["score","num_of_score", "published_year"]
processed_data["published_year"] = processed_data["published_year"].replace("May ",'2023').replace('01.3','2023')
processed_data["published_year"] = pd.to_numeric(processed_data["published_year"], errors = "coerce")
processed_data["published_year"] = processed_data["published_year"].fillna(processed_data["published_year"].median())
processed_data[num_cols] = scaler.fit_transform(processed_data[num_cols])
processed_data.head(5)


Unnamed: 0,book_id,score,num_of_score,published_year,author_id_1.0,author_id_2.0,author_id_3.0,author_id_4.0,author_id_5.0,author_id_6.0,...,374,375,376,377,378,379,380,381,382,383
0,1.0,0.0,0.0,0.991534,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.013741,0.05548,0.007118,0.01987,-0.0292,0.0597,0.090683,-0.011423,0.016195,0.013536
1,4.0,0.0,0.0,0.990538,0.0,0.0,0.0,0.0,0.0,0.0,...,0.109407,0.010004,-0.024499,0.024782,-0.039779,0.042379,0.035443,0.05499,-0.037037,0.04533
2,5.0,0.0,0.0,0.989542,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.002827,0.024591,0.005744,0.025903,-0.022592,0.026379,0.134257,0.032097,0.057082,-0.031892
3,6.0,0.0,0.0,0.989542,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.003014,0.013898,0.008057,0.039058,-0.016474,0.037983,0.128308,0.05556,0.024469,-0.015784
4,7.0,0.0,0.0,0.989542,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001062,0.009163,0.012179,0.028383,-0.009795,0.030687,0.130857,0.051157,0.035598,-0.02358


# Test similarity fuction: this function is able to find the similar book accroding to the input book_id

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
book_ids = processed_data["book_id"].astype(int).to_numpy()
id2row = {bid: i for i, bid in enumerate(book_ids)}
def similar_books(book_id, k=10):
   # 1. Get the row position
    bid = int(book_id)
    i = id2row.get(bid)
    if i is None:
        return []
    # 2. Feature vectors
    v = processed_data.drop(columns=["book_id"]).iloc[i].values.reshape(1, -1)
    X = processed_data.drop(columns=["book_id"]).values

    # 3. Cosine similarity
    sims = cosine_similarity(v, X).ravel()

    # 4. Exclude itself by **position**, not label
    sims[i] = -1

    # 5. Get top-k row positions
    top = sims.argsort()[::-1][:k]
    # 6. Return a DataFrame of book_id, title, and similarity score
    return books.loc[top, ["book_id","title","author_id","category_id","content"]].assign(score = sims[top])


similar_books(379)  #test 379 水滸傳(上)/ 637 安徒生故事全集１/ 559 等一個人咖啡/ 615聖地之旅以色列.埃及.約旦/886 紐約通








Unnamed: 0,book_id,title,author_id,category_id,content,score
299,378.0,水滸傳(上),276.0,2,《水滸傳》是中國歷史上第一部白話章回小說，與《紅樓夢》、《三國演義》、《西遊記》並列中國古典四大文學名著之一。內容講述梁山泊以宋江為首的綠林好漢，由被迫落草，發展壯大，直至受到朝廷招安，東征西討的歷程。又名《忠義水滸傳》，初名《江湖豪客傳》，一般簡稱《水滸》，作於元末明初，根據《大宋宣和遺事》和《東都事略》等史籍中有關宋朝宋江等三十六人起義造反的記載的基礎上，再進行創作的。 _x000D_\n _x000D_\n 水滸中的一百單八將傳說是三十六個天罡星和七十二個地煞星轉世，他們講究忠和義，愛打抱不平、劫富濟貧，不滿貪官污吏，最後集結梁山，與腐化的朝廷抗爭。小說成功地塑造了宋江、林沖、李逵、魯智深、武松等人物的鮮明形象，也向讀者展示了宋代的政治與社會狀況。 _x000D_\n _x000D_\n 作為中國歷史上，第一部以白話文寫成的長篇小說，《水滸傳》對後世的影響巨大。金聖嘆將《水滸傳》與《離騷》、《莊子》、《史記》、「杜詩」、《西廂記》合稱為「六才子書」。李漁將《水滸傳》與《三國演義》、《西遊記》、《金瓶梅》定為「四大奇書」。 _x000D_\n _x000D_\n---www.books.com.tw/exep/prod/books/ _x000D_,0.999993
1332,382.0,鏡花緣(上),278.0,2,,0.63577
1333,383.0,鏡花緣(下),278.0,2,,0.631415
453,699.0,台灣七色記前記,177.0,2,《台灣七色記》是台灣大河小說史上的曠世經典，橫跨一千六百年，描繪《白版戶》（公元383年河洛人的故事）、《黑水溝》（公元1683年台灣天地會）、《洪豆劫》（公元1786年林爽文事件）、《黃虎印》（公元1895年台灣民主國抗日）、《藍海夢》（公元1945年台灣光復記）、《青山路》（公元1971年退出聯合國）及《紫帽寺》（公元1984年泉州人的故事）這精彩動人的七部曲。而今作者特書這本「前記」，一方面向讀者介紹七色記各集小說的背景、主題及意義，另也表達作者寫作心得，記錄其七年牢中寫作的經驗及各集小說構思、籌畫、設計、著筆的經過。「前記」可以做為七色記的導讀，也可提供給有意寫作長篇歷史小說的讀者參考，做為寫作手冊之用。本書除是理解吾土吾祖的最佳讀物，也是各大專院校指定的優良教材。\n\nfrom http://www.books.com.tw/exep/prod/booksfile.php?item=0010396142,0.554007
274,185.0,浪淘沙下冊,143.0,2,旅加台灣作家東方白傾全生命，以十年文學苦旅寫成的150萬字大河小說《浪淘沙》，為台灣本土文學樹立了一塊閃閃生輝的里程碑，台灣文壇大老鍾肇政喻之為「台灣文學史上最燦爛輝煌的一座金字塔」，葉石濤稱譽已接近世界偉大小說的系譜。_x000D_\n_x000D_\n 《浪淘沙》以台灣歷史為證，以台灣鄉土為懷，描繪台灣自1895年割讓日本迄至當代，三個台灣家族三代人之間的人事滄桑與悲歡離合的故事，反映近百年來台灣人民的歷史運命和精神意志。_x000D_\n---www.books.com.tw/exep/prod/booksfile,0.547895
190,365.0,臥虎藏龍 重出江湖版,267.0,2,本書根據王度廬原作《寶劍金釵》與《臥虎藏龍》二書濃縮改寫，描述武功高強、俠名遠播的李慕白和美若天仙的女俠俞秀蓮的情義糾葛。為了迂腐的義氣而逃避感情的李慕白，在一番波折後，如何解開自己的心結，最後終能和心愛的俞秀蓮相協于飛。而出身富豪官家的玉嬌龍，與土匪頭子半天雲（羅小虎），在因緣際會下，產生一段奇情。_x000D_\n_x000D_\n 兩對個性書義的男女主角，兩段坎坷的戀情，最後卻是：逃避情愛的人，得以相聚。追求愛情的人，卻在一夜情緣後，飄然離去。他們心中的起伏，是這部武俠奇緣故事中，最令人感動的。_x000D_\n _x000D_\n- 博客來http://www.books.com.tw/exep/prod/booksfile.php?item=0010118688,0.545751
999,1208.0,張愛玲未完：解讀張愛玲的作品,866.0,2,（張愛玲未－完解讀張愛玲的作品）是水晶先生繼（張愛玲的小說）後又一力作。全書剖析了（傳奇）中大多數膾炙人口的精采小說，如（沈香屑－第一爐香）、（金鎖記）、（傾城之戀）、（紅玫塊兵白玫塊）、（留情）……等，以及中篇小說（秧歌）。此書係作者經過二十多年的醞釀酵發，在張愛玲女士逝世後新資訊的刺激下，配戴了廣角鏡後的深度書寫，值得向廣大的張迷們推薦。,0.542717
20,23.0,琦君散文選中英對照(紀念珍藏版),9.0,2,舊文學的根柢與新文學的洗禮，琦君的散文承先啟後，深厚的中國人情味與練達的人生觀並陳，早被文學評論家夏志清肯定為：與李後主、李清照同屬中國的抒情傳統，而成就卻比二李高。_x000D_\n_x000D_\n本書從琦君名著《三更有夢書當枕》、《煙愁》、《淚珠與珍珠》等書中，精選〈髻〉、〈一對金手鐲〉、〈桂花雨〉、〈媽媽銀行〉等十二篇名作，採中英對照方式，每篇譯文均為名家翻譯，絕大多數曾在中華民國筆會季刊發表。期使讀者在閱讀兩種語文中，體會琦君的文章為什麼是「人人心中所有，人人筆中所無」，歷久彌新，百看不厭，有中國人處必有琦君的讀者。_x000D_\n_x000D_\n本書特色_x000D_\n_x000D_\n★為紀念琦君逝世一週年，特別推出「紀念珍藏版」。_x000D_\n_x000D_\n★琦君與資深出版人蔡文甫相交近四十年，書信往來超過一百封，本書特別精選若干封琦君書信手稿，略窺他們深厚的交誼。在e-mail盛行的年代，琦君的信件真跡更加彌足珍貴。_x000D_\n_x000D_\n★琦君的先生李唐基，特地為文，述說他們夫婦倆與《琦君散文選中英對照》的譯者們，因合作該書而成為忘年交的故事。_x000D_\n_x000D_\n★名作家彭歌，描寫他自年輕時代即認識的琦君大姐。_x000D_,0.540631
992,1171.0,楊梅三部曲第二部：寒蟬,249.0,2,本書是楊梅三部曲的第二部。它呈現了台灣人在身份轉換後（即日本戰敗），由名為「祖國」實為占領者的中國無端凌遲、任意宰割之下，以血淚編織的一段歷史。作者把這一段歷史，用小說的形式呈現出來，既為當年受難的忠魂代言，又可便利讀者從中領悟歷史的教訓和啟示。_x000D_\n_x000D_\n 由於作者在青少年時期曾經受過戰爭的洗禮，又度過了台灣戰後換了朝代那一段慘無天日的歲月，所以能運用累積的見識，寫就了這一部反映時代和社會的小說。她把當時的老、青、少不同世代的看法和想法，做了傳真式的刻劃，活靈活現，即使不曾身歷其境的人看了，也會感同身受，而且一讀再讀，每次都有新的收穫。_x000D_\n_x000D_\n 台灣的文化生態已墮入一之元的絕路，有心人如要救亡圖存，必須要澈底探究早已在台灣漫延的「心理肺炎」的自然史。為此，＜寒蟬＞可以提供一面鏡子，幫我們從歷史中汲取教訓，從而洗淨「心理肺炎」的病毒，走出一次元的窄路，重新建構一個有台灣個性的高次元文化圈。,0.539792
301,381.0,台灣連翹,127.0,2,吳濁流的作品，是台灣文學傳統一份重要的遺產，《台灣連翹》則是吳濁流一生的最後著作；他對台灣的期許、對中國的幻滅，都在這本書裏有明確的交代，就像他撰寫《亞細亞孤兒》的心情那樣，吳老寫出他在「祖國」來臨之後，思想上和精神上的變化，他留下遺言，這本書要在他去世十年、二十年後才能出版。_x000D_\n_x000D_\n本刊獲得這份原稿，並邀請台灣文學泰斗鍾肇政先生全文譯出，全書結構嚴謹、體魄恢宏、文字精確而細膩，道出台灣戰爭前後政壇的祕辛，愛台灣者、關心台灣者都應在書架上置上此書一冊。,0.537485


In [15]:
pd.set_option('display.max_colwidth', None)
books[books["book_id"] == 379]

Unnamed: 0,book_id,title,title_length,author_id,publisher_id,category_id,published_time,score,num_of_score,content,published_year,text
300,379.0,水滸傳(下),6.0,276.0,177.0,2,1990.1,0.0,0.0,《水滸傳》是中國歷史上第一部白話章回小說，與《紅樓夢》、《三國演義》、《西遊記》並列中國古典四大文學名著之一。內容講述梁山泊以宋江為首的綠林好漢，由被迫落草，發展壯大，直至受到朝廷招安，東征西討的歷程。又名《忠義水滸傳》，初名《江湖豪客傳》，一般簡稱《水滸》，作於元末明初，根據《大宋宣和遺事》和《東都事略》等史籍中有關宋朝宋江等三十六人起義造反的記載的基礎上，再進行創作的。_x000D_\n_x000D_\n 水滸中的一百單八將傳說是三十六個天罡星和七十二個地煞星轉世，他們講究忠和義，愛打抱不平、劫富濟貧，不滿貪官污吏，最後集結梁山，與腐化的朝廷抗爭。小說成功地塑造了宋江、林沖、李逵、魯智深、武松等人物的鮮明形象，也向讀者展示了宋代的政治與社會狀況。_x000D_\n_x000D_\n 作為中國歷史上，第一部以白話文寫成的長篇小說，《水滸傳》對後世的影響巨大。金聖嘆將《水滸傳》與《離騷》、《莊子》、《史記》、「杜詩」、《西廂記》合稱為「六才子書」。李漁將《水滸傳》與《三國演義》、《西遊記》、《金瓶梅》定為「四大奇書」。_x000D_\n_x000D_\n---www.books.com.tw/exep/prod/books/,1990,水滸傳(下) 《水滸傳》是中國歷史上第一部白話章回小說，與《紅樓夢》、《三國演義》、《西遊記》並列中國古典四大文學名著之一。內容講述梁山泊以宋江為首的綠林好漢，由被迫落草，發展壯大，直至受到朝廷招安，東征西討的歷程。又名《忠義水滸傳》，初名《江湖豪客傳》，一般簡稱《水滸》，作於元末明初，根據《大宋宣和遺事》和《東都事略》等史籍中有關宋朝宋江等三十六人起義造反的記載的基礎上，再進行創作的。_x000D_\n_x000D_\n 水滸中的一百單八將傳說是三十六個天罡星和七十二個地煞星轉世，他們講究忠和義，愛打抱不平、劫富濟貧，不滿貪官污吏，最後集結梁山，與腐化的朝廷抗爭。小說成功地塑造了宋江、林沖、李逵、魯智深、武松等人物的鮮明形象，也向讀者展示了宋代的政治與社會狀況。_x000D_\n_x000D_\n 作為中國歷史上，第一部以白話文寫成的長篇小說，《水滸傳》對後世的影響巨大。金聖嘆將《水滸傳》與《離騷》、《莊子》、《史記》、「杜詩」、《西廂記》合稱為「六才子書」。李漁將《水滸傳》與《三國演義》、《西遊記》、《金瓶梅》定為「四大奇書」。_x000D_\n_x000D_\n---www.books.com.tw/exep/prod/books/


# Idea 1: The borrow record was extracted from 2019 to recent. We can get the most recent borrowed book each reader borrowed then use similar book function to get the top 5 similar books to recommend they read it.

In [16]:
#load borrow data
borrow = pd.read_excel("/content/drive/MyDrive/library/borrow.xlsx")
borrow = borrow[["user_id","book_id","create_time"]]
borrow.head(5)

Unnamed: 0,user_id,book_id,create_time
0,1258,1497,2019-01-19 11:03:32
1,1258,1495,2019-01-19 11:03:43
2,1258,1496,2019-01-19 11:03:52
3,1258,1494,2019-01-19 11:03:59
4,1258,1493,2019-01-19 11:04:10


In [17]:
#get the most recent record for each reader
b1 = borrow.groupby(["user_id"])["create_time"].max().reset_index()
processed_b = pd.merge(borrow,b1, how = "inner", on = ["user_id","create_time"])
most_recent_book = processed_b.drop(columns=["create_time"])
most_recent_book.head(5)

Unnamed: 0,user_id,book_id
0,1196,667
1,1086,1297
2,1086,1378
3,1086,913
4,1086,1124


In [18]:
#similar book function
# build a helper map
book_ids = processed_data["book_id"].astype(int).to_numpy()
id2row = {bid: i for i, bid in enumerate(book_ids)}

def similar_books1(book_id, k=10):
    # 1. Get the row position
    bid = int(book_id)
    i = id2row.get(bid)
    if i is None:
        return []

    # 2. Feature vectors
    v = processed_data.drop(columns=["book_id"]).iloc[i].values.reshape(1, -1)
    X = processed_data.drop(columns=["book_id"]).values

    # 3. Cosine similarity
    sims = cosine_similarity(v, X).ravel()

    # 4. Exclude itself by **position**, not label
    sims[i] = -1

    # 5. Get top-k row positions
    top = sims.argsort()[::-1][:k]

    # 6. Return book_ids from the same frame
    top_book = processed_data.iloc[top]["book_id"].tolist()
    return top_book


In [19]:
#use similar function to get the top 5 similar book for each user base on their most recent borrowed book
most_recent_book["recommend"] =  most_recent_book["book_id"].apply(lambda x: similar_books1(x,5))
most_recent_book.head(5)



Unnamed: 0,user_id,book_id,recommend
0,1196,667,"[1925.0, 696.0, 669.0, 681.0, 726.0]"
1,1086,1297,"[1692.0, 84.0, 1590.0, 1198.0, 1922.0]"
2,1086,1378,"[689.0, 1460.0, 671.0, 1124.0, 1499.0]"
3,1086,913,"[1664.0, 1676.0, 1122.0, 1235.0, 1672.0]"
4,1086,1124,"[689.0, 1378.0, 1308.0, 1460.0, 1499.0]"


# Idea 2: Taking average of each reader's borrowed-book vectors to constrcut the user profile, then calculate similarity between user vector and vectors for each book in the book dataframe to get the top 5 books to recommend.



In [20]:
reader = borrow["user_id"].unique().tolist()
vector_col = processed_data.columns.drop("book_id").tolist()
vector_col.insert(0,"user_id")
user_profile = pd.DataFrame(columns=vector_col)

for i in reader:
  book_borrowed = borrow[borrow["user_id"] == i]["book_id"]
  borrowed_vector = pd.merge(book_borrowed, processed_data, how = "left", on = "book_id")
  avg_vector = pd.DataFrame([borrowed_vector.mean(axis = 0)], columns= borrowed_vector.columns)
  avg_vector = avg_vector.drop(columns = ["book_id"])
  avg_vector["user_id"] = i
  avg_vector = avg_vector[vector_col]
  user_profile = pd.concat([user_profile,avg_vector], ignore_index = True)
user_profile.head(5)


  user_profile = pd.concat([user_profile,avg_vector], ignore_index = True)


Unnamed: 0,user_id,score,num_of_score,published_year,author_id_1.0,author_id_2.0,author_id_3.0,author_id_4.0,author_id_5.0,author_id_6.0,...,374,375,376,377,378,379,380,381,382,383
0,1258,1.0,0.333333,0.99446,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002379,0.038077,0.020955,-0.026673,-0.028722,0.045595,0.059412,0.039078,0.035909,0.001812
1,1196,0.933334,1.0,0.99253,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025866,-0.01298,-0.02156,-0.019028,-0.072975,-0.014592,0.098686,-0.048045,-0.032542,0.017272
2,211,0.356604,0.125786,0.990707,0.0,0.0,0.0,0.0,0.0,0.0,...,0.024248,0.005157,0.019774,0.043259,-0.054999,0.016296,0.072959,0.030563,-0.031199,0.011124
3,890,0.46,0.166667,0.99258,0.0,0.0,0.0,0.0,0.0,0.0,...,0.012906,0.012186,0.026665,0.039403,-0.038751,0.029707,0.040878,0.056992,-0.012932,0.014623
4,1086,0.5,0.166667,0.992654,0.0,0.0,0.0,0.0,0.0,0.0,...,0.031456,-0.02154,0.024092,0.059363,-0.048704,-0.015409,0.084825,0.001889,-0.003836,-0.004825


In [21]:
#calculate cosine similarity to get the top 5 similar book to the user profile
def similar_books2(vector,user_id, k=5):
    df = pd.DataFrame([vector])
    v = df.values.reshape(1, -1)
    X = processed_data.drop(columns=["book_id"]).values
    # Cosine similarity
    sims = cosine_similarity(v, X).ravel()

    # Exclude the book already borrowed
    borrowed_book_id = borrow[borrow["user_id"] == user_id]["book_id"].values.tolist()
    book_ids = processed_data["book_id"].astype(int).to_numpy()
    id2row = {bid: i for i, bid in enumerate(book_ids)}
    borrowed_book_idx = []
    for i in borrowed_book_id:
      idx = id2row.get(i)
      borrowed_book_idx.append(idx)

    for id in range(len(sims)):
      if id in borrowed_book_idx:
        sims[id] = -1

    # Get top-k row positions
    top = sims.argsort()[::-1][:k]
    # Return recommend book list
    return books.iloc[top]["book_id"].values.tolist()

user_profile["recommend"] = user_profile.apply(lambda row: similar_books2(row[1:],row["user_id"]), axis= 1)
user_profile.head(5)



Unnamed: 0,user_id,score,num_of_score,published_year,author_id_1.0,author_id_2.0,author_id_3.0,author_id_4.0,author_id_5.0,author_id_6.0,...,375,376,377,378,379,380,381,382,383,recommend
0,1258,1.0,0.333333,0.99446,0.0,0.0,0.0,0.0,0.0,0.0,...,0.038077,0.020955,-0.026673,-0.028722,0.045595,0.059412,0.039078,0.035909,0.001812,"[16.0, 377.0, 1263.0, 428.0, 700.0]"
1,1196,0.933334,1.0,0.99253,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.01298,-0.02156,-0.019028,-0.072975,-0.014592,0.098686,-0.048045,-0.032542,0.017272,"[1925.0, 696.0, 669.0, 681.0, 726.0]"
2,211,0.356604,0.125786,0.990707,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005157,0.019774,0.043259,-0.054999,0.016296,0.072959,0.030563,-0.031199,0.011124,"[899.0, 1107.0, 22.0, 1302.0, 686.0]"
3,890,0.46,0.166667,0.99258,0.0,0.0,0.0,0.0,0.0,0.0,...,0.012186,0.026665,0.039403,-0.038751,0.029707,0.040878,0.056992,-0.012932,0.014623,"[1263.0, 1532.0, 1627.0, 724.0, 744.0]"
4,1086,0.5,0.166667,0.992654,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.02154,0.024092,0.059363,-0.048704,-0.015409,0.084825,0.001889,-0.003836,-0.004825,"[689.0, 1460.0, 671.0, 1308.0, 1179.0]"


In [22]:
avg_recom_result = user_profile[["user_id", "recommend"]]
avg_recom_result.head(5)

Unnamed: 0,user_id,recommend
0,1258,"[16.0, 377.0, 1263.0, 428.0, 700.0]"
1,1196,"[1925.0, 696.0, 669.0, 681.0, 726.0]"
2,211,"[899.0, 1107.0, 22.0, 1302.0, 686.0]"
3,890,"[1263.0, 1532.0, 1627.0, 724.0, 744.0]"
4,1086,"[689.0, 1460.0, 671.0, 1308.0, 1179.0]"


# cold-start probelm?
recomend the most popular books

get all user data and filter out those who didn't borrow any book.