In [55]:
import os

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import TextSplitter, CharacterTextSplitter
from langchain_community.embeddings import DashScopeEmbeddings
from langchain_chroma import Chroma

In [56]:
from dotenv import load_dotenv

load_dotenv()

True

In [57]:
import pandas as pd

books = pd.read_csv("books_cleaned.csv")

In [58]:
books

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine..."
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,Mistaken Identity,9788172235222 On A Train Journey Home To North...
5193,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,Journey to the East,9788173031014 This book tells the tale of a ma...
5194,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623 Wisdom to Create a Life of Passi...
5195,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...


In [59]:
books["tagged_description"].to_csv("tagged_descriptions.txt", sep="\n", index=False, header=False)

In [60]:
raw_documents = TextLoader("tagged_descriptions.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)

Created a chunk of size 1168, which is longer than the specified 0
Created a chunk of size 1214, which is longer than the specified 0
Created a chunk of size 373, which is longer than the specified 0
Created a chunk of size 309, which is longer than the specified 0
Created a chunk of size 483, which is longer than the specified 0
Created a chunk of size 482, which is longer than the specified 0
Created a chunk of size 960, which is longer than the specified 0
Created a chunk of size 188, which is longer than the specified 0
Created a chunk of size 843, which is longer than the specified 0
Created a chunk of size 296, which is longer than the specified 0
Created a chunk of size 197, which is longer than the specified 0
Created a chunk of size 881, which is longer than the specified 0
Created a chunk of size 1088, which is longer than the specified 0
Created a chunk of size 1189, which is longer than the specified 0
Created a chunk of size 304, which is longer than the specified 0
Create

In [61]:
documents[0]

Document(metadata={'source': 'tagged_descriptions.txt'}, page_content='9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, G

In [62]:
embeddings = DashScopeEmbeddings(
    model="text-embedding-v2",
    dashscope_api_key=os.getenv("QWEN_API_KEY"),
)

In [65]:
batch_size = 50

db_books = None

for i in range(0, len(documents), batch_size):
    batch = documents[i:i + batch_size]
    try:
        if db_books is None:
            db_books = Chroma.from_documents(
                documents=batch,
                embedding=embeddings,
            )
        else:
            db_books.add_documents(batch)
        print(f"处理第 {i // batch_size + 1} 批成功")
    except Exception as e:
        print(f"处理第 {i // batch_size + 1} 批失败: {e}")
        break

if db_books:
    print(f"成功存储 {db_books._collection.count()} 条记录")

处理第 1 批成功
处理第 2 批成功
处理第 3 批成功
处理第 4 批成功
处理第 5 批成功
处理第 6 批成功
处理第 7 批成功
处理第 8 批成功
处理第 9 批成功
处理第 10 批成功
处理第 11 批成功
处理第 12 批成功
处理第 13 批成功
处理第 14 批成功
处理第 15 批成功
处理第 16 批成功
处理第 17 批成功
处理第 18 批成功
处理第 19 批成功
处理第 20 批成功
处理第 21 批成功
处理第 22 批成功
处理第 23 批成功
处理第 24 批成功
处理第 25 批成功
处理第 26 批成功
处理第 27 批成功
处理第 28 批成功
处理第 29 批成功
处理第 30 批成功
处理第 31 批成功
处理第 32 批成功
处理第 33 批成功
处理第 34 批成功
处理第 35 批成功
处理第 36 批成功
处理第 37 批成功
处理第 38 批成功
处理第 39 批成功
处理第 40 批成功
处理第 41 批成功
处理第 42 批成功
处理第 43 批成功
处理第 44 批成功
处理第 45 批成功
处理第 46 批成功
处理第 47 批成功
处理第 48 批成功
处理第 49 批成功
处理第 50 批成功
处理第 51 批成功
处理第 52 批成功
处理第 53 批成功
处理第 54 批成功
处理第 55 批成功
处理第 56 批成功
处理第 57 批成功
处理第 58 批成功
处理第 59 批成功
处理第 60 批成功
处理第 61 批成功
处理第 62 批成功
处理第 63 批成功
处理第 64 批成功
处理第 65 批成功
处理第 66 批成功
处理第 67 批成功
处理第 68 批成功
处理第 69 批成功
处理第 70 批成功
处理第 71 批成功
处理第 72 批成功
处理第 73 批成功
处理第 74 批成功
处理第 75 批成功
处理第 76 批成功
处理第 77 批成功
处理第 78 批成功
处理第 79 批成功
处理第 80 批成功
处理第 81 批成功
处理第 82 批成功
处理第 83 批成功
处理第 84 批成功
处理第 85 批成功
处理第 86 批成功
处理第 87 批成功
处理第 88 批成功
处理第 89 批成功
处理第 90 批成功
处理第 91 批成功
处理第 92 批

In [64]:
def retrieve_semantic_recommendations(
        query: str,
        top_k: int = 10
) -> pd.DataFrame:
    recs = db_books.similarity_search(query, k=50)

    books_list = []
    for rec in recs:
        books_list += [int(rec.page_content.strip('"').split()[0])]

    return books[books["isbn13"].isin(books_list)].head(top_k)

In [67]:
retrieve_semantic_recommendations("推荐一些科幻小说", 10)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
123,9780060515225,0060515228,Fragile Things,Neil Gaiman,Fiction,http://books.google.com/books/content?id=ggLzJ...,A mysterious circus terrifies an audience for ...,2006.0,4.01,360.0,43210.0,Fragile Things: Short Fictions and Wonders,9780060515225 A mysterious circus terrifies an...
147,9780060556525,0060556528,Gold,Isaac Asimov,Fiction,http://books.google.com/books/content?id=td_ul...,Gold is the final and crowning achievement of ...,2003.0,3.99,416.0,2393.0,Gold: The Final Science Fiction Collection,9780060556525 Gold is the final and crowning a...
313,9780060936501,0060936509,The Best American Science Writing 2002,Matt Ridley;Alan Lightman,Science,http://books.google.com/books/content?id=p8yA_...,"If, as Matt Ridley suggests, science is simply...",2002.0,3.88,352.0,66.0,The Best American Science Writing 2002,"9780060936501 If, as Matt Ridley suggests, sci..."
549,9780131871656,013187165X,Astronomy,Eric Chaisson;Stephen McMillan,Mathematics,http://books.google.com/books/content?id=1O00A...,This introduction to astronomy features an exc...,2006.0,3.85,499.0,153.0,Astronomy: a beginner's guide to the universe,9780131871656 This introduction to astronomy f...
874,9780143039938,0143039938,The Book of Imaginary Beings,Jorge Luis Borges;Margarita Guerrero;Andrew Hu...,Fiction,http://books.google.com/books/content?id=FuNQP...,A whimsical compendium of mythological creatur...,2006.0,4.09,236.0,4809.0,The Book of Imaginary Beings,9780143039938 A whimsical compendium of mythol...
991,9780192862099,019286209X,The Origins of Life,John Maynard Smith;Eörs Szathmáry,Science,http://books.google.com/books/content?id=nHDbB...,'I can recommend this book as a thoroughly int...,2000.0,4.11,192.0,41.0,The Origins of Life: From the Birth of Life to...,9780192862099 'I can recommend this book as a ...
1070,9780231124270,0231124279,The Poetry of Sylvia Plath,Claire Brennan,Literary Criticism,http://books.google.com/books/content?id=TSrx1...,Liquid Metal brings together 'seminal' essays ...,1999.0,4.23,202.0,25.0,The Poetry of Sylvia Plath,9780231124270 Liquid Metal brings together 'se...
1074,9780237525378,0237525372,Oliver Twist,Pauline Francis;Charles Dickens,Juvenile Nonfiction,http://books.google.com/books/content?id=X6RvT...,This wonderful series is a quick way into a ra...,2003.0,3.66,48.0,92.0,Oliver Twist,9780237525378 This wonderful series is a quick...
1271,9780312852535,0312852533,The Humanoids,Jack Williamson,Fiction,http://books.google.com/books/content?id=vPSl0...,A classic science fiction novel features human...,1996.0,3.75,299.0,880.0,The Humanoids: A Novel,9780312852535 A classic science fiction novel ...
1285,9780312878269,0312878265,Duel,Richard Matheson,Fiction,http://books.google.com/books/content?id=CEM1s...,"Collects horror stories such as ""Third from th...",2003.0,4.05,400.0,1752.0,Duel: Terror Stories by Richard Matheson,9780312878269 Collects horror stories such as ...


In [68]:
retrieve_semantic_recommendations("A fiction book about universe", 10)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
123,9780060515225,60515228,Fragile Things,Neil Gaiman,Fiction,http://books.google.com/books/content?id=ggLzJ...,A mysterious circus terrifies an audience for ...,2006.0,4.01,360.0,43210.0,Fragile Things: Short Fictions and Wonders,9780060515225 A mysterious circus terrifies an...
147,9780060556525,60556528,Gold,Isaac Asimov,Fiction,http://books.google.com/books/content?id=td_ul...,Gold is the final and crowning achievement of ...,2003.0,3.99,416.0,2393.0,Gold: The Final Science Fiction Collection,9780060556525 Gold is the final and crowning a...
224,9780060776091,60776099,Brave New World and Brave New World Revisited,Aldous Huxley,Fiction,http://books.google.com/books/content?id=3h9eN...,"The astonishing novel Brave New World, origina...",2005.0,4.16,340.0,123044.0,Brave New World and Brave New World Revisited,9780060776091 The astonishing novel Brave New ...
332,9780060976095,60976098,Fiskadoro,Denis Johnson,Fiction,http://books.google.com/books/content?id=YSTe7...,"Hailed by the New York Times as ""wildly ambiti...",1995.0,3.54,221.0,1463.0,Fiskadoro,9780060976095 Hailed by the New York Times as ...
358,9780061094156,61094153,Imajica II,Clive Barker,Fiction,http://books.google.com/books/content?id=DZVKS...,The magical tale of ill-fated lovers lost amon...,1995.0,4.42,544.0,2538.0,Imajica II: The Reconciliation,9780061094156 The magical tale of ill-fated lo...
393,9780061238239,61238236,The End of Days,Zecharia Sitchin,History,http://books.google.com/books/content?id=EIBlj...,A conclusion to the Earth Chronicles series br...,2007.0,4.06,336.0,470.0,The End of Days: Armageddon and Prophecies of ...,9780061238239 A conclusion to the Earth Chroni...
440,9780066238500,66238501,The Chronicles of Narnia (adult),C. S. Lewis,Fiction,http://books.google.com/books/content?id=3VGkK...,"Journeys to the end of the world, fantastic cr...",2001.0,4.26,767.0,425445.0,The Chronicles of Narnia (adult),9780066238500 Journeys to the end of the world...
455,9780091898243,91898242,Darwin's Watch,Terry Pratchett;Ian Stewart;Jack Sidney Cohen,Cosmology,http://books.google.com/books/content?id=91OA4...,"Roundworld is in trouble again, and this time ...",2006.0,3.94,344.0,3481.0,Darwin's Watch,"9780091898243 Roundworld is in trouble again, ..."
458,9780099267546,99267543,Timequake,Kurt Vonnegut,Fiction,http://books.google.com/books/content?id=DQg-d...,"According to Vonnegut`s alter ego, science-fic...",1998.0,3.72,219.0,23529.0,Timequake,9780099267546 According to Vonnegut`s alter eg...
534,9780099595816,99595818,Civilwarland in Bad Decline,George Saunders,Short stories,http://books.google.com/books/content?id=ZY7h1...,A brilliant collection of stories from the win...,1997.0,4.22,179.0,12271.0,Civilwarland in Bad Decline,9780099595816 A brilliant collection of storie...
