In [8]:
from src.file_uploader import upload_files
from src.embedding_database import process_documents, faiss_vector_store, save_vector_store, load_vector_store
from src.rag import *
from src.Rag_preprocess import *

In [3]:
# Example Notebook for Using RAG-Based Book Classification Project
# This notebook demonstrates how to use the core functionalities of the project

# Directory holding the input files. Kept relative (resolved against the
# kernel's working directory) so the notebook does not depend on one user's
# absolute local path; change it to point at your own files.
data_dir = "./data"

docs = upload_files(data_dir)  # upload pdf and csv files from data_dir
processed_docs = process_documents(docs)  # add a unique id to each document based on file name and time of upload
print(f"Total documents processed: {len(processed_docs)}")

Total documents processed: 100


In [4]:
# Example: creating a FAISS vector store.
# Passes the processed documents to the project's faiss_vector_store helper and
# keeps the returned store for the save/load and query cells that follow.
new_vector_store = faiss_vector_store(processed_docs) # store the document vectors in a FAISS index

In [10]:
# Example: saving and loading a FAISS vector store.
# The store is written to output_dir and then read back from the same
# directory, so new_vector_store refers to the reloaded store from here on.
output_dir = "./faiss_index"
save_vector_store(new_vector_store, output_dir)
new_vector_store = load_vector_store(output_dir)

FAISS index saved at: ./faiss_index
FAISS index loaded from: ./faiss_index


In [11]:
# Example: querying the RAG system.
# Sends a free-text question together with the vector store to query_rag_chat.
# The call is the cell's last expression, so its return value is shown as the output.
query_text = "tell me about Stephen Kings books?"
query_rag_chat(query_text, new_vector_store)

("Welcome to the world of books! I'd be delighted to tell you about Stephen King's works.\n\nStephen King is a master of horror, suspense, and supernatural fiction. He has written numerous bestselling novels, many of which have been adapted into films and TV shows. Here are some of his notable works:\n\n1. **Cell** (2006): This novel tells the story of Clay Riddell, a graphic artist who finds himself in the midst of a mysterious apocalypse. The book follows Clay as he tries to survive and understand what's happening.\n2. **Desperation** (1996): Set in the desert town of Desperation, Nevada, this novel is about a group of people who are trapped by a supernatural force that's wreaking havoc on their lives.\n\nBoth Cell and Desperation are excellent examples of King's ability to craft gripping stories that explore the darker aspects of human nature. If you're a fan of horror or suspense fiction, I highly recommend checking out these books!\n\nWould you like more information about Stephen 

In [15]:
# Second example query: an informal recommendation request that mixes two genres.
query_text = 'i wanna read some horror with romance book, do u have any sugetions?'
query_rag_chat(query_text, new_vector_store)

('A horror with romance book, what a great combination! I\'d be happy to suggest some books that fit your request.\n\nConsidering you\'re looking for a blend of horror and romance, here are a few suggestions:\n\n1. **Lover Mine (Black Dagger Brotherhood, #8)** by J.R. Ward - This paranormal romance novel combines vampire lore with a dark and suspenseful plot, perfect for fans of the Black Dagger Brotherhood series.\n2. **Steal Away** by Amber Green - This African-American Prohibition-era Historical Ménage has elements of horror, mystery, and romance, set against the vibrant backdrop of 1920s Harlem.\n3. **Desperation** by Stephen King - While not strictly a romance novel, Desperation features a strong romantic subplot amidst its dark and supernatural horror themes.\n4. **The Good Soldier** by Ford Madox Ford - This classic novel explores complex social relationships, passion, and intrigue, with some elements of psychological horror.\n\nHowever, I must say that none of these books seem 

In [16]:
# Third example query: asks for five popular books, names first and details afterwards.
query_text = 'tell me about 5 most popular book i wanna their names firstly and than some info about them'
query_rag_chat(query_text, new_vector_store)

("Here are the names of the 5 most popular books along with some info about them:\n\n**1. The Boy in the Striped Pajamas**\nAuthor: John Boyne\nAverage Rating: 4.15/5 (748,052 ratings)\nDescription: A heart-wrenching and thought-provoking novel about a young boy's journey during WWII.\n\n**2. Shakespeare's Sonnets**\nAuthor: William Shakespeare\nAverage Rating: 4.22/5 (2,493 ratings)\nDescription: A collection of beautiful poems that explore themes of love, beauty, and mortality.\n\n**3. The Ugly American**\nAuthor: Eugene Burdick\nAverage Rating: 4.05/5 (4,463 ratings)\nDescription: A classic novel about the consequences of American arrogance and incompetence abroad.\n\n**4. Fables of the Reconstruction**\nAuthor: Jez Jones\nAverage Rating: 4.13/5 (159 ratings)\nDescription: A sensual and erotic novel that explores themes of desire, secrets, and fantasy.\n\n**5. The Boy in the Striped Pajamas was not a top 5, it was actually... Haroun and the Sea of Stories**\nAuthor: Salman Rushdie\n

Book Classification

In [17]:
# Keep only the documents whose 'source' metadata names a .csv file.
csv_documents = list(
    filter(
        lambda doc: doc.metadata.get('source', '').endswith('.csv'),
        processed_docs,
    )
)

In [18]:
# Candidate genre labels for classification (assumed to be known up front).
genres = [
    'Fiction', 'Classics', 'Nonfiction', 'Fantasy', 'Historical Fiction',
    'Young Adult', 'Mystery', 'Romance', 'Literature', 'Contemporary',
    'Novels', 'Audiobook', 'Thriller', 'Historical', 'Science Fiction',
    'History', 'Adventure', 'Childrens', 'Philosophy', 'Biography',
    'Crime', 'Self Help', 'Psychology', 'Mystery Thriller', 'Humor',
    'Adult', 'Memoir', 'Horror', 'Science Fiction Fantasy', 'Suspense',
]

In [19]:
# Example: extracting the relevant fields from one CSV document for the classification query.
# Index 20 simply picks one sample record to inspect.
# NOTE(review): the displayed result looks like a (title, author, description) tuple;
# confirm the field order in extract_relevant_info.
extract_relevant_info(csv_documents[20].page_content)

("The Heretic's Daughter",
 'Kathleen Kent',
 "Martha Carrier was one of the first women to be accused, tried and hanged as a witch in Salem, Massachusetts. Like her mother, young Sarah Carrier is bright and willful, openly challenging the small, brutal world in which they live. Often at odds with one another, mother and daughter are forced to stand together against the escalating hysteria of the trials and the superstitious tyranny that led to the torture and imprisonment of more than 200 people accused of witchcraft. This is the story of Martha's courageous defiance and ultimate death, as told by the daughter who survived.Kathleen Kent is a tenth generation descendant of Martha Carrier. She paints a haunting portrait, not just of Puritan New England, but also of one family's deep and abiding love in the face of fear and persecution.")

In [20]:
# Example: classifying books into genres.
# Runs classify_books_from_docs over the CSV documents with the candidate genre list.
# This step is slow: the recorded run took about 6 minutes for 51 books (~7 s/book).
classified_books = classify_books_from_docs(csv_documents, genres)

Classifying Books: 100%|██████████| 51/51 [06:07<00:00,  7.20s/book]


In [21]:
# Build a table of the classified books and preview its first five rows.
# The result is bound to its own name on purpose: assigning it back to
# `classified_books_df` would overwrite the imported helper function, and the
# cell would then fail on re-run because the returned table is not callable.
books_with_genres_df = classified_books_df(csv_documents, classified_books)
books_with_genres_df.head(5)

Unnamed: 0,Title,Author,Description,Genres
0,Pygmalion,George Bernard Shaw,"One of George Bernard Shaw's best-known plays,...","[Classics, Literature, Comedy]"
1,The How of Happiness: A Scientific Approach to...,Sonja Lyubomirsky,"An easy-to-follow, life-changing approach desi...","[Nonfiction, Self Help, Psychology]"
2,Fables of the Reconstruction,Jez Jones,"Secrets, fantasies, and desires mingle as an a...","[Romance, Erotic Fiction, Literature]"
3,"Capital: A Critique of Political Economy, Volu...",Karl Marx,"Capital, one of Marx's major and most influent...","[Nonfiction, Philosophy]"
4,Mudbound,Hillary Jordan,"In Jordan's prize-winning debut, prejudice tak...","[Historical Fiction, Contemporary]"


Extracting information from PDF files

In [23]:
# Collect the documents whose 'source' metadata names a .pdf file and show how many there are.
pdf_documents = list(
    filter(
        lambda doc: doc.metadata.get('source', '').endswith('.pdf'),
        processed_docs,
    )
)
len(pdf_documents)

49

In [24]:
# Extract the title information of each book from the PDF documents, assuming we do not
# know in advance how many books the PDFs contain.
# Documents are grouped by page so information is extracted from each group separately,
# preventing issues caused by excessively large contexts.
titles_info = extract_titles_from_grouped_documents(pdf_documents)
titles_info

Processing Groups:   0%|          | 0/3 [00:00<?, ?group/s]

Processing Groups: 100%|██████████| 3/3 [02:25<00:00, 48.58s/group]


['["A Magazine", "MAK: The Architecture of Byoungsoo Cho", \n"Archives 7: Francisco Mangado", "Archives 6: Solano Benítez & Gloria Cabral", \n"Encounters with Plečnik", "Pitsou Kedem Architects – Works and Projects", \n"Robin Boyd: Late Works", "Dudok by Iwan Baan"]',
 '["4380. Immortal: Lost Memoirs of Cornelia Dulac Concerning the Freshwater Polyp Hydra",\n"79. Monique Besten – The Wanderer*",\n"78. Gustafsson&Haapoja – Bud Book",\n"77. Juan Hein – Clouds and Bombs*",\n"84. Jörg Schmeisser Retrospective: Neverending Journeys",\n"83. Ohara Koson: Paradise On Paper Where Flowers Bloom, Birds Sing",\n"82. Rei Naito – Mirror Creation*",\n"81. Tsuyoshi Hisakado – Practice of Spiral Practice of Spiral"]',
 '["Goblins",\n"The Cult of Water",\n"Satan is Real: Two Short Stories",\n"Empty Aphrodite: An Encyclopaedia of Fate",\n"Bruce Hamana Sosei – 100 Beautiful Words in the Way of Tea",\n"Aesthetics as Space",\n"Errant Journal 1: Where are We?",\n"Unpacking My Library"]']

In [25]:
# Create a list with the books' information from the extracted titles.
# NOTE(review): despite the singular name, book_name appears to hold one entry per book
# (the following extraction step reports 24 books); confirm against combine_text_info.
book_name = combine_text_info(titles_info)

In [26]:
# Extract information for each book, based on the list of book information.
# This step is slow: the recorded run took about 24 minutes for 24 books (~60 s/book).
extracted_info = extract_book_info(pdf_documents, book_name, new_vector_store)

Processing books: 100%|██████████| 24/24 [24:11<00:00, 60.50s/book]


In [33]:
# Create a DataFrame from the extracted information and preview the first 10 rows.
# NOTE(review): the helper's name suggests extracted_info holds JSON strings; confirm.
extracted_df = create_dataframe_from_json_strings(extracted_info)
extracted_df.head(10)

Unnamed: 0,ISBN,City,Year,price,book_shop_id,pages,colour,size,language
0,9789077745212,Antwerp,2020,15.5,20253.0,222.0,colour & bw,17 x 21 cm,English
1,9788792700322,Copenhagen,2020,61.7,,408.0,colour & bw,23 x 33 cm,English
2,9788412162523,La Coruña,2020,25.2,,,colour & bw,17 x 24 cm,
3,9784907562212,Tokyo,2020,49.5,20247.0,304.0,colour & bw,17 x 24 cm,Spanish/English
4,9788412162516,La Coruña,2020,25.2,20203.0,,,,
5,9788412162516,La Coruña,2020,25.2,20203.0,304.0,colour & bw,17 x 24 cm,Spanish/English
6,9789895462049,Porto,2020,49.5,,304.0,colour & bw,17 x 24 cm,Spanish/English
7,9789895462049,Tel Aviv,2000,49.5,20161.0,52.0,colour & bw,15 x 21 cm,Slovenian/English
8,9780648435594,Melbourne,2020,38.8,,152.0,colour & bw,24 x 28 cm,English
9,9789462085817,Rotterdam,2020,39.95,,108.0,colour & bw,22 x 30 cm,English
