In [3]:
# %pip install -r requirement.txt
#print("requirements done")

In [1]:
import os
from dotenv import load_dotenv
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.vectorstores import Chroma
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load environment variables from .env
load_dotenv(".env")
# Report only whether the key is present. Printing the value itself would store
# the secret in the notebook's saved output.
print("GOOGLE_API_KEY loaded:", bool(os.getenv("GOOGLE_API_KEY")))

[REDACTED: API key removed from saved cell output]


In [3]:
import os

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Where the source .txt files live, and where the Chroma store is persisted
books_dir = './cleaned_txt'
db_dir = os.path.join('./', "db")
persistent_directory = os.path.join(db_dir, "chroma_db_with_metadata")

# Echo both locations so the cell output records which paths are in effect
print(
    f"Books directory: {books_dir}",
    f"Persistent directory: {persistent_directory}",
    sep="\n",
)

def create_vector_database(embeddings, persistent_directory, source_dir=None):
    """Build and persist a Chroma vector store from a directory of ``.txt`` files.

    Nothing is done when ``persistent_directory`` already exists. Otherwise every
    ``.txt`` file in the source directory is loaded, tagged with its file name as
    ``source`` metadata, split into chunks and written to ``persistent_directory``.

    Args:
        embeddings: Embedding function handed to ``Chroma.from_documents``.
        persistent_directory: Directory the Chroma store is persisted to.
        source_dir: Directory holding the ``.txt`` files. Defaults to the
            notebook-level ``books_dir`` when omitted.

    Returns:
        None.

    Raises:
        FileNotFoundError: If the source directory does not exist.
        ValueError: If the source directory contains no ``.txt`` files, or the
            files yield no chunks.
    """
    # An existing directory is treated as an already-built store
    if os.path.exists(persistent_directory):
        print("Vector store already exists. No need to initialize.")
        return

    print("Persistent directory does not exist. Initializing vector store...")

    # Fall back to the notebook-level default only when no directory was passed
    if source_dir is None:
        source_dir = books_dir

    if not os.path.exists(source_dir):
        raise FileNotFoundError(
            f"The directory {source_dir} does not exist. Please check the path."
        )

    # Sort so documents are processed in the same order on every run
    book_files = sorted(f for f in os.listdir(source_dir) if f.endswith(".txt"))

    # Fail before anything is written: a failed build could leave the persistent
    # directory behind, and the existence check above would then skip
    # initialisation on the next call.
    if not book_files:
        raise ValueError(f"No .txt files found in {source_dir}; nothing to index.")

    # Load every file and record which file each document came from
    documents = []
    for book_file in book_files:
        loader = TextLoader(os.path.join(source_dir, book_file))
        for doc in loader.load():
            # Replaces the loader's metadata with just the source file name
            doc.metadata = {"source": book_file}
            documents.append(doc)

    # Split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = text_splitter.split_documents(documents)

    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")

    if not docs:
        raise ValueError(
            f"The .txt files in {source_dir} produced no chunks; nothing to index."
        )

    # `embeddings` is passed to Chroma.from_documents together with the chunks;
    # there is no separate embedding step in this function.
    print("\n--- Creating and persisting vector store ---")
    Chroma.from_documents(docs, embeddings, persist_directory=persistent_directory)
    print("\n--- Finished creating and persisting vector store ---")


Books directory: ./cleaned_txt
Persistent directory: ./db\chroma_db_with_metadata


Start chatting with the AI! Type 'exit' to end the conversation.
Bye....


In [5]:
# %pip install langdetect
# %pip install deep_translator

Note: you may need to restart the kernel to use updated packages.
Collecting deep_translator
  Using cached deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Using cached deep_translator-1.11.4-py3-none-any.whl (42 kB)
Installing collected packages: deep_translator
Successfully installed deep_translator-1.11.4
Note: you may need to restart the kernel to use updated packages.


In [9]:
from langdetect import detect, DetectorFactory

# Pin langdetect's seed so that detecting the same text gives the same
# language code on every run (set once, before detect() is used below).
DetectorFactory.seed = 0

def detect_language(text):
    """Return the language code that langdetect reports for ``text``.

    Args:
        text: The string whose language should be detected.

    Returns:
        The code returned by ``detect`` (e.g. ``"pa"``), or ``"unknown"`` when
        the text is blank, is not a string, or detection fails. The result is
        always a string, so callers can compare it against a language code.
    """
    try:
        # Blank input carries nothing to detect; skip the detector call
        if not text.strip():
            return "unknown"
        return detect(text)
    except Exception:
        # Return a fixed sentinel instead of the exception message, so an error
        # string is never mistaken for a language code.
        return "unknown"

In [6]:
from deep_translator import GoogleTranslator
def english_to_punjabi(english_text):
    """Translate ``english_text`` from English ('en') to Punjabi ('pa')."""
    translator = GoogleTranslator(source='en', target='pa')
    punjabi_text = translator.translate(english_text)
    return punjabi_text

def punjabi_to_english(punjabi_text):
    """Translate ``punjabi_text`` from Punjabi ('pa') to English ('en')."""
    translator = GoogleTranslator(source='pa', target='en')
    english_text = translator.translate(punjabi_text)
    return english_text


In [10]:
# Point the persistence path at the extended store under the same db folder
db_dir = os.path.join('./', "db")
persistent_directory = os.path.join('./', "db", "extended_chroma_db_with_metadata")

In [12]:
# Create the embedding model in this cell. Otherwise `huggingface_embeddings`
# is only assigned by a later cell, and this call raises NameError on a fresh
# kernel (Restart & Run All).
huggingface_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
create_vector_database(huggingface_embeddings, persistent_directory)

Vector store already exists. No need to initialize.


In [15]:
import pickle
# Location of the persisted Chroma store that is queried below
persistent_directory = "db/extended_chroma_db_with_metadata"

# Embedding model used for queries.
# NOTE(review): presumably the same model the store was built with — verify.
huggingface_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Open the existing vector store with that embedding function
db = Chroma(
    embedding_function=huggingface_embeddings,
    persist_directory=persistent_directory,
)

# Retriever over the store: similarity search with k=3 passed as a search kwarg
retriever = db.as_retriever(search_type="similarity", search_kwargs=dict(k=3))

# Gemini chat model, passed to the history-aware retriever below
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

# System instruction for the rewriting step: turn the latest user question into
# a standalone question using the chat history, without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, just "
    "reformulate it if needed and otherwise return it as is."
)

# Prompt layout: system instruction, then prior messages, then the new input
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

# Retriever that first lets the LLM rewrite the question from the chat history
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

# System instruction for the answering step: answer from the retrieved context,
# admit when the answer is unknown, and cite the source. `{context}` is the
# slot for the retrieved documents.
qa_system_prompt = (
    "You are an assistant for question-answering tasks. Use "
    "the following pieces of retrieved context to answer the "
    "question. If you don't know the answer, just say that you "
    "don't know. Use optimal number of sentences to answer the question. "
    "Provide the source as well. "
    "\n\n"
    "{context}"
)

# Prompt layout: system instruction, then prior messages, then the new input
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

# Chain that feeds the retrieved documents into the QA prompt
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# Full pipeline: history-aware retrieval followed by question answering
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)


# Function to simulate a continual chat punjabi to punjabi
def continual_chat():
    """Run an interactive question-answering loop on stdin/stdout.

    Each non-empty line is sent through ``rag_chain`` together with the chat
    history collected so far. When ``detect_language`` reports Punjabi
    (``"pa"``), the query is translated to English before retrieval and the
    answer is translated back to Punjabi before it is printed. The history
    stores the English query and the chain's untranslated answer.

    The loop ends when the user types ``exit`` (any case, surrounding
    whitespace ignored), when stdin is closed, or when the kernel is
    interrupted while waiting for input.
    """
    print("Start chatting with the AI! Type 'exit' to end the conversation.")
    chat_history = []  # Messages exchanged so far, oldest first

    while True:
        try:
            # Strip so that " exit " still exits and blank lines are detectable
            query = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input available: end the session instead of crashing
            print("Bye....")
            break

        if not query:
            # Nothing to ask; do not call language detection or the chain
            continue

        if query.lower() == "exit":
            print("Bye....")
            break

        # Decide once whether this turn is in Punjabi
        is_punjabi = detect_language(query).lower() == "pa"
        if is_punjabi:
            query = punjabi_to_english(query)

        # Echo the (possibly translated) query that is sent to the chain
        print(query)

        # Process the user's query through the retrieval chain
        result = rag_chain.invoke({"input": query, "chat_history": chat_history})
        answer = result["answer"]

        # Reply in the language the user wrote in
        if is_punjabi:
            punjabi_response = english_to_punjabi(str(answer))
            print(f"AI: {punjabi_response}")
        else:
            print("AI:", answer)

        # Record this turn so later questions can refer back to it
        chat_history.append(HumanMessage(content=query))
        chat_history.append(AIMessage(content=answer))


# Entry point: start the interactive chat loop when this code runs as the main
# program. continual_chat() waits on input() until the user types 'exit'.
if __name__ == "__main__":
    continual_chat()


Start chatting with the AI! Type 'exit' to end the conversation.
Hello
AI: Hello! How can I help you today?

What is wheat?
AI: Wheat, scientifically known as *Triticum*, is an annual grass belonging to the family Poaceae (Gramineae).  It's a staple food for a large portion of the world's population, used to make flour for bread, pasta, and other products.  The grain itself contains carbohydrates, protein, fat, minerals, fiber, and moisture.  Wheat straw is also used as animal feed and for various other purposes. (Source: Status Paper on Wheat)

Bye....
