In [66]:
'''
   Author  : Emon Hasan
   Email   : iconicemon01@gmail.com
   GitHub  : https://github.com/Md-Emon-Hasan
   LinkedIn: https://www.linkedin.com/in/emon-hasan/
   Date    : 01/28/2025
   Time    : 18:48
   Purpose : How Chorama DB works
'''

# Run and Check Compiler
def main():
    print('Hello, World!')

if __name__ == '__main__':
    main()

Hello, World!


In [3]:
import tensorflow as tf
print(tf.__version__)  # Should print 2.10.0

2.10.0


In [5]:
# Import required libraries
import os
from langchain.vectorstores import Chroma  

In [6]:
# Vector database
from langchain.embeddings import HuggingFaceEmbeddings  # Open-source embeddings

In [7]:
from langchain.llms import HuggingFacePipeline  # Using FLAN-T5 for QA

In [8]:
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader  # Load text files

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Split text into chunks

In [10]:
from langchain.chains import RetrievalQA  # Question-answering pipeline

In [11]:
from transformers import pipeline  # Hugging Face pipeline for FLAN-T5

  from .autonotebook import tqdm as notebook_tqdm


In [67]:
from langchain.schema import Document  # Required for metadata handling

In [41]:
# Step 2: Download Example Dataset
import requests
import zipfile

In [42]:
# Download the dataset (collection of text files)
url = "https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip?dl=1"
response = requests.get(url)

In [43]:
# Save it locally
zip_path = "new_articles.zip"
with open(zip_path, "wb") as file:
    file.write(response.content)

In [44]:
# Extract the ZIP file into a folder
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall("new_articles")

In [45]:
# Step 3: Load Documents and Add Metadata (Fix for Missing Source)
directory_path = "new_articles"

In [46]:
# Read all text files and store them as Document objects with metadata
documents = []
for file_name in os.listdir(directory_path):
    if file_name.endswith('.txt'):  # Process only text files
        file_path = os.path.join(directory_path, file_name)
        
        try:
            with open(file_path, encoding='utf-8') as f:
                content = f.read()
        except UnicodeDecodeError:
            with open(file_path, encoding='latin-1') as f:
                content = f.read()

        # Create a Document object with metadata (filename as 'source')
        doc = Document(page_content=content, metadata={"source": file_name})
        documents.append(doc)

In [47]:
print(f"Successfully loaded {len(documents)} documents.")

Successfully loaded 21 documents.


In [48]:
# Step 4: Split Documents into Smaller Chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_texts = text_splitter.split_documents(documents)  # Now retains metadata

In [49]:
print(f"Total Chunks Created: {len(split_texts)}")

Total Chunks Created: 233


In [50]:
# Step 5: Initialize Embeddings Model
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [51]:
# Step 6: Create ChromaDB Vector Store
persist_directory = "db"  # Directory to store the database
vectordb = Chroma.from_documents(documents=split_texts, embedding=embedding, persist_directory=persist_directory)

In [52]:
# Persist the database to disk
vectordb.persist()

In [53]:
# Step 7: Reload Vector Database
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

In [54]:
# Step 8: Create a Retriever (Top 2 Most Relevant Chunks)
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [55]:
# Step 9: Load the FLAN-T5 Model for Question Answering
flan_t5_pipeline = pipeline("text2text-generation", model="google/flan-t5-base", device=0)

Device set to use cpu


In [56]:
# Wrap the model for LangChain
from langchain.llms import HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=flan_t5_pipeline)

In [57]:
# Step 10: Create a Question-Answering (QA) Chain
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)

In [58]:
# Step 11: Function to Process and Display Answers (Fix for Missing Source)
def process_llm_response(llm_response):
    print("AI Answer:", llm_response['result'])
    
    print("\n\nSources:")
    if "source_documents" in llm_response:
        for source in llm_response["source_documents"]:
            # Ensure 'source' key exists in metadata before accessing it
            if "source" in source.metadata:
                print(f"- {source.metadata['source']}")
            else:
                print("- [No source available]")  # Handle missing metadata gracefully
    else:
        print("- [No source documents found]")  # Handle missing source_documents

In [59]:
# Step 12: Ask a Question and Get an Answer
query = "How much money did Microsoft raise?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

AI Answer: $10 billion


Sources:
- [No source available]
- 05-03-chatgpt-everything-you-need-to-know-about-the-ai-powered-chatbot.txt


In [64]:
# break it down
query = "any news for microsoft?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

AI Answer: yes


Sources:
- 05-04-microsoft-doubles-down-on-ai-with-new-bing-features.txt
- [No source available]


In [68]:
# Deleteing the DB

# !zip -r db.zip ./db

In [69]:
# To cleanup, you can delete the collection
# vectordb.delete_collection()
# vectordb.persist()

In [70]:
# delete the directory
# !rm -rf db/