In [1]:
# from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain.chains.summarize import load_summarize_chain
# from langchain_mistralai import MistralAIEmbeddings, ChatMistralAI
from langchain.text_splitter import CharacterTextSplitter
from langchain_core.documents import Document
import os
import sys
from langchain_community.llms import HuggingFaceHub
from langchain_community.embeddings import HuggingFaceHubEmbeddings
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain_core.prompts  import PromptTemplate
from langchain_community.vectorstores import Chroma, FAISS
from pdfminer.high_level import extract_text
from pathlib import Path
from pdf2image import convert_from_path
import pytesseract
import PyPDF2
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from langchain_community.llms import Cohere
from langchain_community.embeddings import CohereEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/himanshu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/himanshu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/himanshu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Hugging Face Mixtral ChatBot

Enter API Key

In [2]:
# os.environ["HUGGINGFACEHUB_API_TOKEN"]="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"
os.environ["COHERE_API_KEY"] =""
os.environ["HUGGINGFACEHUB_API_TOKEN"]=""

# os.environ["HUGGINGFACEHUB_API_TOKEN"]="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"

Some pdfs have text that can't be copied, so use OCR technique to extract text

Some can be opened by PyPDF2 and some by pdfminer

In [3]:
def clean_paper(input_file):
    text = ''
    if input_file not in ['Publications/15_Nazneen.pdf', 'Publications/Tariq_2019.pdf']:
        with open(input_file, 'rb') as file:
            if input_file not in ["Publications/Tariq2018.pdf", "Publications/Asd_Cry_patterns.pdf"]:
                pdf = PyPDF2.PdfReader(file)
                for page_num in range(len(pdf.pages)):
                    text += pdf.pages[page_num].extract_text()
            else:
                text = extract_text(input_file)
    else:
        images = convert_from_path(input_file)
        for i in range(len(images)):
            text += pytesseract.image_to_string(images[i])
            
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\[Fig.\d+\]', '', text)
    text = re.sub(r'\[Table \d+\]', '', text)
    text = re.sub(r'\[\d+\]', '', text)
    text = ' '.join(text.split())
    return text

def clean_all_papers(input_dir):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    for file_name in os.listdir(input_dir):
        input_file = os.path.join(input_dir, file_name)
        txt = clean_paper(input_file)
        if txt is not None:
            base_name = os.path.splitext(file_name)[0]
            words = nltk.word_tokenize(txt)
            words = [word for word in words if word not in stopwords.words('english')]
            words = [stemmer.stem(word) for word in words]
            words = [lemmatizer.lemmatize(word) for word in words]
            txt = ' '.join(w for w in words)
            
            with open(Path("Processed_Texts")/f"{base_name}.txt", "w") as f:
                f.write(txt)

input_dir = Path("Publications")
clean_all_papers(input_dir)


KeyboardInterrupt: 

Split the txt files as Documents

In [None]:
documents=[]
for file in os.listdir('Processed_Texts'):
        if file.endswith('.txt'):
            text_path = './Processed_Texts/' + file
            # print(text_path)
            loader = TextLoader(text_path)
            documents.extend(loader.load())
print(len(documents))
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunked_documents = text_splitter.split_documents(documents)
print(len(chunked_documents))



15
15


Define the Model - I am using Mistral 7B

Could also use OpenAI,s ChatGPT 3.5 although you need a paid account

In [None]:
llm = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
    }
)


Store with embeddings in a vector database- Can use Chroma, Faiss etc

In [None]:
vectordb = Chroma.from_documents(persist_directory="./chroma_db",embedding=HuggingFaceHubEmbeddings(), documents=chunked_documents)
vectordb.persist()


If we want to make a Conversational ChatBot with Chat History

In [None]:
# pdf_qa = ConversationalRetrievalChain.from_llm(
#     llm=llm,
#     retriever=vectordb.as_retriever(search_kwargs={'k': 1}),
#     return_source_documents=True,
#     verbose=False
# )


In [None]:

# yellow = "\033[0;33m"
# green = "\033[0;32m"
# white = "\033[0;39m"

# chat_history = []
# print(f"{yellow}---------------------------------------------------------------------------------")
# print('Welcome to the DocBot. You are now ready to start interacting with your documents')
# print('---------------------------------------------------------------------------------')
# while True:
#     query = input(f"{green}Prompt: ")
#     if query == "exit" or query == "quit" or query == "q" or query == "f":
#         print('Exiting')
#         sys.exit()
#     if query == '':
#         continue
#     result = pdf_qa.invoke(
#         {"question": query, "chat_history": chat_history})
#     print(f"{white}Answer: " + result["answer"])
#     chat_history.append((query, result["answer"]))

If we want to make a RAG(Retrieval and Generation) model that retrieves information from the cleaned research papers

In [None]:
template = "Question: {question} \n\n <COS> {context} </COS> \n\n Answer:"
prompt = PromptTemplate(input_variables=['context', "question"], template=template)

chain_qa = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever=vectordb.as_retriever(search_type="similarity",search_kwargs={'k': 2}),
    chain_type_kwargs={'prompt': prompt},
    return_source_documents=True,
    chain_type="stuff",
    verbose=False
)


For Testing the RAG

In [None]:

yellow = "\033[0;33m"
green = "\033[0;32m"
white = "\033[0;39m"

print(f"{yellow}---------------------------------------------------------------------------------")
while True:
    query = input(f"{green}Prompt: ")
    if query == "exit" or query == "quit" or query == "q" or query == "f":
        print('Exiting')
        sys.exit()
    if query == '':
        continue
    result = chain_qa.invoke(
        {"query": query})
    print(result["result"])
    print(len(result['source_documents']))

[0;33m---------------------------------------------------------------------------------
Question: What are signs of autism? 

 <COS> scientif report doi postur control detect via comput vision analysi toddler autism spectrum disord geraldin dawson kathleen campbel jordan hashemi steven j lippman n valeri smith kimberli carpent r helen egger steven espinosa saritha vermeer jeffrey baker guillermo sapiro evid suggest differ motor function earli featur autism spectrum disord asd one aspect motor abil develop childhood postur control reflect abil maintain steadi head bodi posit without excess sway observ studi document differ postur control older child asd the present studi use comput vision analysi ass midlin head postur control reflect rate spontan head movement state activ attent toddler month age mean month diagnos asd timeseri data reveal robust group differ rate head movement toddler watch movi depict social nonsoci stimulus toddler asd exhibit significantli higher rate head movemen

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


Get answers of queries and store as csv

In [None]:
import pandas as pd
queries=pd.read_csv("Query Questions - Sheet1.csv", index_col=["SN"])
queries.columns=["Question"]

No Summarization Response

In [None]:
for i in range(queries.shape[0]):
    res = chain_qa.invoke({"query": queries.loc[i+1, "Question"]})
    queries.loc[i+1, "ResultNoSummarization"] = res["result"][res["result"].find("</COS>")+19:]


Summarization Response

Retrieve the most similar 5 unique documents

In [None]:
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={'k': 35})
RelDoc = []
for i in range(queries.shape[0]):
    RelDoc.append([])
    docs = retriever.get_relevant_documents(queries.loc[i+1, "Question"])
    #get 5 unique docs
    j=0
    for doc in docs:
        if doc not in RelDoc[i]:
            queries.loc[i+1, f"Doc_{j+1}"]=doc.metadata["source"]
            j=j+1
            RelDoc[i].append(doc)
            if j>5:
                break


In [None]:
llms = HuggingFaceHub(
    repo_id="kabita-choudhary/finetuned-bart-for-conversation-summary",
    task="summarization",
    model_kwargs={
        "max_new_tokens": 250,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
        "max_length":500,
        "min_length":300
    }
    
)


chain = load_summarize_chain(llm=llms, chain_type="stuff")
for i in range(queries.shape[0]):
    summary = chain.invoke(input=RelDoc[i], question=f"Query {queries.loc[i+1, 'Question']} . Write a summary of about 500 words from the following text to answer the question ")
    print(summary['output_text'])
    queries["ResultSummarization"]=summary["output_text"]



"robotassistedautismspectrumdisorderdiagnosticbas onarticialreason andresaram rezduqueanselmofrizeranetoteodianofreirebasto receivedaprilaccepteddecemb.org, a report on autism spectrum disorder and its symptoms found in the case of a newborn baby. The system developed by robotassist framework is based on a robot oper system. It was developed with the help of a clinician and a robot. It is useful for diagnosing and treating the child with autism. It also serves as a training tool for the clinician. It has a number of useful functions, including: diagnosis, diagnosis, treatment and supervision. It can also be used for children with autism and other mental health problems. It's also useful for people with visual impairment. It helps to monitor the child's physical and mental health. The main aim of the system is to improve the quality of care for autistic children and to prevent them from getting into trouble in the long-term. It should be used in order to improve their mental health and 

Not the best results. The largest summarization model in HuggingFace is facebook's bert large cnn which doesnt perform much better either


In [None]:
llms = HuggingFaceHub(
    repo_id="facebook/bart-large-cnn",
    task="summarization",
    model_kwargs={
        "max_new_tokens": 250,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
        "max_length":500,
        "min_length":300
    }
)


chain = load_summarize_chain(llm=llms, chain_type="stuff")
for i in range(queries.shape[0]):
    summary = chain.invoke(input=RelDoc[i], question=f"Query {queries.loc[i+1, 'Question']} . Write a summary of about 500 words from the following text to answer the question ")
    print(summary['output_text'])
    # queries["ResultSummarization"]=summary["output_text"]



 Write a concise summary of the following:  "robotassistedautismspectrumdisorderdiagnosticbas onarticialreason andresaram. Write a summary of  autism spectrum disord asd neurodevelopment disord affect peopl birth symptom found earli development period the asd diagnosi usual perform sever session behavior observ exhaust screen manual code behavior. The system reli comput vision unstructur scalabl network rgbd sensor built upon robot oper system ro machin learn algorithm autom face analysi also proof concept present particip three typic develop td child three child risk suffer asd keyword childrobot interact autism spectrum disords. The robotassist framework use artifici reason modul assist clinician asd diagnostic process the framework compos respons robot platform flexibl scalabl vision sensor network autom face analysedi algorithm in research take advantag neural model avail open sourc project build complet new pipelin algorithm global recognit track child face among mani face present

Arguably worse than the above one

Save the results to a csv file

In [None]:
queries.to_csv("Results.csv")

In [None]:
res=pd.read_csv("Results.csv", index_col="SN")
res.head()

Unnamed: 0_level_0,Question,ResultNoSummarization,Doc_1,Doc_2,Doc_3,Doc_4,ResultSummarization
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,What are the variety of Multimodal and Multi-m...,he text describes various approaches to using ...,./Processed_Texts/1_Ramırez-Duque_.txt,./Processed_Texts/Tariq2018.txt,./Processed_Texts/Dawson.txt,./Processed_Texts/Young_Behavior.txt,"""a videobas measur identifi autism risk infanc..."
2,"What is Autism Spectrum Disorder, how it is ca...",Autism Spectrum Disorder (ASD) is a neurodevel...,./Processed_Texts/Abbas_2020.txt,./Processed_Texts/Tariq2018.txt,./Processed_Texts/Asd_Cry_patterns.txt,./Processed_Texts/Qiu.txt,"""a videobas measur identifi autism risk infanc..."
3,What is the cure of Autism Spectrum Disorder,The text describes a scientific report on the...,./Processed_Texts/Abbas_2020.txt,./Processed_Texts/Tariq2018.txt,./Processed_Texts/Qiu.txt,./Processed_Texts/15_Nazneen.txt,"""a videobas measur identifi autism risk infanc..."
4,What are Stereotypical and maladaptive behavio...,Stereotypical and maladaptive behaviors in Au...,./Processed_Texts/22_Ouss_ASD.txt,./Processed_Texts/Dawson.txt,./Processed_Texts/Abbas_2020.txt,./Processed_Texts/Young_Behavior.txt,"""a videobas measur identifi autism risk infanc..."
5,How relevant is eye contact and how it can be ...,ye contact is a relevant social behavior indic...,./Processed_Texts/Qiu.txt,./Processed_Texts/Dawson.txt,./Processed_Texts/Young_Behavior.txt,./Processed_Texts/Tariq2018.txt,"""a videobas measur identifi autism risk infanc..."


Things to try:

Hyperparameters like:

    max_new_tokens

    top_k

    temperature

    repetition_penalty

    max_length

    min_length

can be changed

More Models like

    Zephyr

    OpenAI(paid)

    Pert(HuggingFace Paid)

    Yi-ko(9B/6B, Too large to be used through serverless HuggingFaceHub)
    
    Cohere
    
Changing the search type to mmf in retriever

Using another LLM to clean the extracted text as libraries like PyPDF, Pdfminer and OCR aren't perfect and the text data is still very unclean.


