In [1]:
import torch
from langchain import  PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import os
import re
import pickle
import fitz  # PyMuPDF
from langchain.schema import Document
import langdetect
from langchain_huggingface import HuggingFacePipeline
from langchain_huggingface import HuggingFaceEmbeddings
# Function to clean the output
def clean_output(output: str) -> str:
    # Find the position of [/INST]
    start_index = output.find('[/INST]') + len('[/INST]')
    
    # Extract the answer portion after [/INST]
    cleaned_output = output[start_index:].strip()
    
    # Remove any leading or trailing whitespace
    return cleaned_output


# Function to get device
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Function to split text into paragraphs
def split_text_into_paragraphs(text_content):
    paragraphs = text_content.split('#')  # Split paragraphs based on two newlines
    return [paragraph.strip() for paragraph in paragraphs if paragraph.strip()]

# Function to sanitize filenames
def sanitize_filename(filename):
    return re.sub(r'[^a-zA-Z0-9_-]', '_', filename)

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text_content = ''
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            text_content += page.get_text()
    return text_content

# Function to detect language
def detect_language(text):
    try:
        return langdetect.detect(text)
    except:
        return "en"  # Default to English if detection fails

# Function to generate the prompt
def generate_prompt(prompt: str, system_prompt: str) -> str:
    return f"""
[INST] <>
{system_prompt}
<>

{prompt} [/INST]
""".strip()


# Function to create embeddings
def create_embeddings(language):
    if language == "en":
        return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    elif language == "ml":
        return HuggingFaceEmbeddings(model_name="KooAI/KooBERT")
    else:
        raise ValueError("Unsupported language")

# Function to create the database
def create_database(documents, embeddings):
    db = Chroma.from_documents(documents, embeddings)
    return db

# Function to create the retriever
def create_retriever(db):
    search_kwargs = {"k": 3}
    retriever = db.as_retriever(search_kwargs=search_kwargs)
    return retriever

# Function to create the QA chain
def create_qa_chain(llm, retriever, prompt):
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )
    return qa_chain

# Function to create the LLM pipeline
def create_llm_pipeline(model, tokenizer):
    text_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=1024,
        temperature=0.1,
        top_p=0.95,
        repetition_penalty=1.15,
        device=DEVICE
    )
    llm = HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0})
    return llm

def process_pdf_file(filename, pdf_path, embeddings, llm, prompt):
    print(f'\nProcessing: {pdf_path}')
    text_content = extract_text_from_pdf(pdf_path)
    
    language = detect_language(text_content)
    print(f"Detected language: {language}")

    paragraphs = split_text_into_paragraphs(text_content)
    documents = [Document(page_content=paragraph, metadata={"language": language, "source": pdf_path}) for paragraph in paragraphs]
    
    print(f"Number of documents created: {len(documents)}")

    try:
        db = create_database(documents, embeddings)
        retriever = create_retriever(db)
        qa_chain = create_qa_chain(llm, retriever, prompt)
        
        print(f"QA chain created for {filename}")
        return qa_chain, language
    except Exception as e:
        print(f"Error processing file {filename}: {e}")
        return None, language

# System prompt
SYSTEM_PROMPT = "You are a helpful assistant for answering questions based on provided context. Explain the answer in paragraph!!"

def main():
    folder_path = './language/'  # Path to the folder containing PDFs
    model_pickle_path = './model.pkl'

    if os.path.exists(model_pickle_path):
        with open(model_pickle_path, 'rb') as f:
            model, tokenizer = pickle.load(f)
    else:
        MODEL_NAME = "sarvamai/sarvam-2b-v0.5"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
        tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
        with open(model_pickle_path, 'wb') as f:
            pickle.dump((model, tokenizer), f)

    llm = create_llm_pipeline(model, tokenizer)

    # Create the PromptTemplate
    template = generate_prompt(
        "{context}\nQuestion: {question}",
        system_prompt=SYSTEM_PROMPT,
    )
    prompt = PromptTemplate(template=template, input_variables=["context", "question"])

    language_doc_map = {
        "en": "english.pdf",
        "ml": "malayalam.pdf",
    }

    qa_chains = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            text_content = extract_text_from_pdf(pdf_path)
            language = detect_language(text_content)
            embeddings = create_embeddings(language)
            qa_chain, doc_language = process_pdf_file(filename, pdf_path, embeddings, llm, prompt)
            qa_chains[doc_language] = (qa_chain, filename)

    print(f"All PDFs processed. You can now ask questions about these documents.")

    while True:
        user_input = input("\nEnter your query or 'exit' to quit: ").strip()
        if user_input in ['exit', 'quit']:
            print("Exiting the program. Goodbye!")
            break

        if not user_input:
            print("Please enter a query.")
            continue

        query_language = detect_language(user_input)

        if query_language in qa_chains:
            qa_chain, associated_file = qa_chains[query_language]
            expected_doc = language_doc_map.get(query_language)

            if associated_file == expected_doc:
                try:
                    result = qa_chain({"query": user_input})
                    cleaned_answer = clean_output(result['result'])
                    print(f"Answer ({query_language}):", cleaned_answer)
                except Exception as e:
                    print(f"Error while processing query: {e}")
            else:
                print(f"Query language detected as {query_language}, but no matching document found for {expected_doc}.")
        else:
            print(f"No document available for the detected language: {query_language}")

if __name__ == "__main__":
    main()


2024-09-06 07:47:48.836187: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-06 07:47:48.836249: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-06 07:47:48.837664: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Some weights of BertModel were not initialized from the model checkpoint at KooAI/KooBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing: /kaggle/input/language/malayalam.pdf
Detected language: ml
Number of documents created: 34
QA chain created for malayalam.pdf





Processing: /kaggle/input/language/english.pdf
Detected language: en
Number of documents created: 33
QA chain created for english.pdf
All PDFs processed. You can now ask questions about these documents.



Enter your query or 'exit' to quit:  നെൽപ്പാടങ്ങളിൽ കുഴൽപ്പുഴുവിനെ എങ്ങിനെ നശിപ്പിക്കാം ?


  result = qa_chain({"query": user_input})


Answer (ml): നെൽപ്പാടങ്ങളിലെ പുഴുക്കളുടെ ആക്രമണം നിയന്ത്രിക്കാൻ നിരവധി രീതികൾ ഉപയോഗിക്കുന്നുണ്ട്. ഒന്നാമതായി, കള വെള്ളം മൂന്ന് ദിവസം വരെ സൂക്ഷിക്കുന്നത് ഒഴിവാക്കേണ്ടത് അത്യാവശ്യമാണ്. കൂടാതെ, അര ഏക്കർ സ്ഥലത്തേക്ക് 25 കിലോഗ്രാം അറക്ക് പൊടി കലർത്തി ഭൂമിയിൽ പുരട്ടുന്നത് ഫലപ്രദമായ ചികിത്സയാണ്. ഈ പ്രക്രിയയ്ക്ക് ശേഷം കൈറ്റൈൻ അടിസ്ഥാനമാക്കിയുള്ള സൂഡോമോണാസ് (20 ഗ്രാം ഒരു ലിറ്റർ വെള്ളത്തിൽ), അസാദിറാക്റ്റിൻ (750 മില്ലിലിറ്റർ ഒരു ഹെക്ടോറിൽ) എന്നിവയുടെ പ്രയോഗവും പരിഗണിക്കണം. രോഗം ബാധിച്ച പ്രദേശങ്ങൾക്ക് ചുറ്റും ട്രൈക്കോഡെർമ ഹാർസിയാനം (5 CC per hectare) അല്ലെങ്കിൽ ഫിറോമൻ കെണികൾ സ്ഥാപിക്കേണ്ടതുണ്ട്. അവസാനമായി, ബ്ലാസ്റ്റ് രോഗത്തിന്റെ ലക്ഷണങ്ങൾ നിരീക്ഷിക്കുകയും ആവശ്യമെങ്കിൽ ക്വിൻറൽഫോസ് (25 EC per hectare) പോലുള്ള രാസ കീടനാശിനികൾ പ്രയോഗിക്കുകയും വേണം. </s>



Enter your query or 'exit' to quit:  നെൽചെടികളിലെ ഇലപ്പേൻ ഭാദയെപ്പറ്റി വിവരിക്കുക?


Answer (ml): നെല്ല് ചെടികളെ ബാധിക്കുന്ന ഒരു കീടം മൂലമുണ്ടാകുന്ന രോഗമാണ് ലീഫ് ഫാൾഡ് ഡിസീസ് എന്നും അറിയപ്പെടുന്ന നെല്ല് ഇലപ്പേൻ ഭാദ. ചെറിയ പ്രാണികളായ ടാൻസാനിയയിലെ നെല്ല് വിളകളെ നശിപ്പിക്കുന്നതിന് കാരണമാകുന്നതിനാൽ ഈ രോഗം പ്രാധാന്യമർഹിക്കുന്നു. രോഗത്തിന്‍റെ ലക്ഷണങ്ങളിൽ ഇലകളിലെ പാടുകൾ ഉൾപ്പെടുന്നു, അവിടെ അവയ്ക്ക് വെള്ള നിറമുള്ളതോ മഞ്ഞ നിറമുള്ളതോ ആയ അടയാളങ്ങളുണ്ട്. കൂടാതെ, ഇലയുടെ അറ്റങ്ങളും അരികുകളും ചുളിവുകളുള്ളതായി കാണപ്പെടാം, ചിലപ്പോൾ ഇലകൾ കത്തിപ്പോകുകയും ഒടുവിൽ ചത്തുപോകുകയും ചെയ്യും. </s>



Enter your query or 'exit' to quit:  നെല്ലിൽ ഇലപ്പേൻ ലക്ഷണങ്ങൾ എന്തൊക്കെ ?


Answer (ml): നെല്ലിൻറെ മുന്തിരിപ്പഴത്തിലെ ലീഫ് പേൻ എന്നറിയപ്പെടുന്ന ഒരു തരം പ്രാണി മൂലമുണ്ടാകുന്ന അണുബാധയാണ് നെല്ലിൻറെ മുന്തിരിപ്പഴത്തിലെ ലീഫ് പേൻ (യൂറോപ്യൻ ലീഫ് പേൻ). ഈ പ്രാണി ചെടികളുടെ ഇലകളെ ഭക്ഷിക്കുകയും അവയ്ക്ക് കേടുപാടുകൾ വരുത്തുകയും ചെയ്യുന്നു. ഇത് സാധാരണയായി നെല്ലിന്‍റെ വിളയെ ബാധിക്കുന്നുണ്ടെങ്കിലും മറ്റ് തരത്തിലുള്ള ധാന്യങ്ങളിലും ഇത് കാണപ്പെടുന്നു. </s>



Enter your query or 'exit' to quit:  What are the symptoms of Ants and Termite Attacks in rice crops?


Answer (en): Symptoms of ant attacks include eating the seeds and affecting germination, while termite attacks result in missing plants and reduced stand density due to feeding on the roots. Damage is primarily observed in upland rice, where cream-colored, small insects resemble ants with darker heads may be spotted. To manage these issues, increase the seed rate to counteract the effects of ants, apply chlorpyrifos 20 EC as a drench around the affected area, and irrigate the field if possible. </s>



Enter your query or 'exit' to quit:  quit


Exiting the program. Goodbye!
