In [None]:
!pip install pypdf2

In [None]:
!sudo apt install tesseract-ocr
!pip install pytesseract
!apt-get install poppler-utils
!pip install pdf2image

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import re
import PyPDF2
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK data packages used by the preprocessing helper below.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Function to preprocess text
def preprocess_text(text):
    """Tokenize ``text``, drop English stop words and lemmatize the rest.

    Stop-word matching lower-cases each token for the lookup only; the token
    handed to the WordNet lemmatizer keeps its original form. No stemming is
    applied.

    Args:
        text: Raw text to process.

    Returns:
        The surviving, lemmatized tokens joined by single spaces.
    """
    english_stop_words = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()

    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token.lower() in english_stop_words:
            continue
        kept_tokens.append(wordnet_lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

# Function to remove figures, tables, and in-text citations
def remove_figures_tables_citations(text):
    """Strip figure/table labels, in-text citations and the reference list.

    Removed items:
      * everything from a "References" heading (a line holding only that word,
        optionally preceded by a section number or followed by a colon, any
        letter case) to the end of ``text``;
      * labels such as "Figure 3", "Fig 3", "Fig. 3" and "Table 3";
      * numeric citations such as "[12]";
      * parenthetical citations of the form "(Author, 2020)".

    The word "references" inside a normal sentence is left untouched.

    Args:
        text: Extracted document text. ``None`` or an empty string yields ''.

    Returns:
        The cleaned text.
    """
    if not text:
        return ''

    # Drop the bibliography first. MULTILINE anchors the heading to a line of
    # its own; DOTALL lets the trailing `.*` run across newlines to the end.
    text = re.sub(
        r'^[ \t]*(?:\d+\.?[ \t]*)?References[ \t]*:?[ \t]*$.*',
        '',
        text,
        flags=re.IGNORECASE | re.MULTILINE | re.DOTALL,
    )
    # Remove figure and table labels (the abbreviation may carry a period).
    text = re.sub(r'\bFigure\s\d+\b', '', text)
    text = re.sub(r'\bFig\.?\s\d+\b', '', text)
    text = re.sub(r'\bTable\s\d+\b', '', text)
    # Remove in-text citations [1], [2], etc.
    text = re.sub(r'\[\d+\]', '', text)
    # Remove parenthetical citations (Author, Year)
    text = re.sub(r'\([A-Za-z]+, \d+\)', '', text)
    return text

# Function to process a single PDF file
def process_pdf(pdf_path):
    """Extract the text layer of one PDF and strip figures/tables/citations.

    Page texts are joined with a newline so that the last word of one page is
    not fused with the first word of the next. Stop-word removal and
    lemmatization (``preprocess_text``) are not applied here.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The cleaned text of the whole document.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extraction has to happen while the file is still open. `or ''`
        # guards against a page that yields no text (None or empty).
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]

    text = '\n'.join(page_texts)
    # Remove figures, tables, citations, and references
    cleaned_text = remove_figures_tables_citations(text)
    return cleaned_text

# Function to process all PDF files in a folder
def process_folder(folder_path, output_folder):
    """Convert every PDF in ``folder_path`` to a cleaned ``.txt`` file.

    Each ``<name>.pdf`` (extension matched case-insensitively) is run through
    ``process_pdf`` and written as ``<name>.txt`` (UTF-8) in ``output_folder``.
    Files are handled in sorted order so repeated runs behave the same way.

    Args:
        folder_path: Directory containing the source PDFs.
        output_folder: Directory for the text files; created (with parents)
            when it does not exist yet.
    """
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(output_folder, exist_ok=True)

    for filename in sorted(os.listdir(folder_path)):
        # Lower-case the name so "paper.PDF" is not silently skipped.
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(folder_path, filename)
        cleaned_text = process_pdf(pdf_path)
        output_file_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.txt")
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(cleaned_text)

# Entry point of the text-extraction pass.
# Source PDFs are read from one Drive folder; the cleaned .txt files go to another.
pdf_folder_path = "/content/drive/MyDrive/publications"
output_folder = "/content/drive/MyDrive/cleaned"

process_folder(folder_path=pdf_folder_path, output_folder=output_folder)



Some PDFs yielded gibberish or no text at all with the method above, so we extract their text using OCR instead. OCR is time-consuming, but it is more reliable in such cases.


In [None]:
import os
import re
from tempfile import TemporaryDirectory
from pathlib import Path

import pytesseract
from pdf2image import convert_from_path
from PIL import Image

# Destination folder for the text files written by main() below.
out_directory = Path("/content/drive/MyDrive/cleaned")

def remove_figures_tables_citations(text):
    """Strip figure/table labels, in-text citations and the reference list.

    Removed items:
      * everything from a "References" heading (a line holding only that word,
        optionally preceded by a section number or followed by a colon, any
        letter case) to the end of ``text``;
      * labels such as "Figure 3", "Fig 3", "Fig. 3" and "Table 3";
      * numeric citations such as "[12]";
      * parenthetical citations of the form "(Author, 2020)".

    The word "references" inside a normal sentence is left untouched.

    Args:
        text: Extracted document text. ``None`` or an empty string yields ''.

    Returns:
        The cleaned text.
    """
    if not text:
        return ''

    # Drop the bibliography first. MULTILINE anchors the heading to a line of
    # its own; DOTALL lets the trailing `.*` run across newlines to the end.
    text = re.sub(
        r'^[ \t]*(?:\d+\.?[ \t]*)?References[ \t]*:?[ \t]*$.*',
        '',
        text,
        flags=re.IGNORECASE | re.MULTILINE | re.DOTALL,
    )
    # Remove figure and table labels (the abbreviation may carry a period).
    text = re.sub(r'\bFigure\s\d+\b', '', text)
    text = re.sub(r'\bFig\.?\s\d+\b', '', text)
    text = re.sub(r'\bTable\s\d+\b', '', text)
    # Remove in-text citations [1], [2], etc.
    text = re.sub(r'\[\d+\]', '', text)
    # Remove parenthetical citations (Author, Year)
    text = re.sub(r'\([A-Za-z]+, \d+\)', '', text)
    return text

def main():
    '''OCR every PDF in the publications2 Drive folder and save the cleaned text.

    For each PDF: render the pages to JPEG images in a temporary directory,
    run Tesseract on every image, join the page texts, undo end-of-line
    hyphenation ("-\\n"), strip figures/tables/citations and write the result
    as UTF-8 to ``out_directory/<pdf name>.txt``.
    '''
    # Path to the folder containing PDF files
    pdf_folder = Path('/content/drive/MyDrive/publications2')

    # The output folder may not exist yet when this cell is run on its own.
    out_directory.mkdir(parents=True, exist_ok=True)

    # Sorted, case-insensitive listing so ".PDF" files are included and the
    # processing order is reproducible.
    pdf_files = sorted(
        name for name in os.listdir(pdf_folder) if name.lower().endswith('.pdf')
    )

    for PDF_file_name in pdf_files:
        PDF_file = pdf_folder / PDF_file_name
        text_file = out_directory / f"{Path(PDF_file_name).stem}.txt"

        with TemporaryDirectory() as tempdir:
            # Part #1: Converting PDF to images (rendered at 500 DPI)
            pdf_pages = convert_from_path(PDF_file, 500)

            image_file_list = []
            for page_enumeration, page in enumerate(pdf_pages, start=1):
                filename = f"{tempdir}/page_{page_enumeration:03}.jpg"
                page.save(filename, "JPEG")
                image_file_list.append(filename)

            # Part #2: Recognizing text from the images using OCR
            page_texts = []
            for image_file in image_file_list:
                # Context manager closes the image file handle after OCR.
                with Image.open(image_file) as page_image:
                    page_texts.append(str(pytesseract.image_to_string(page_image)))

        # Clean the document as a whole so that constructs spanning a page
        # break are seen by the cleaning step in one piece.
        document_text = "".join(page_texts).replace("-\n", "")
        clean_content = remove_figures_tables_citations(document_text)

        # Explicit encoding: OCR output can contain non-ASCII characters.
        with open(text_file, "w", encoding="utf-8") as output_file:
            output_file.write(clean_content)

if __name__ == "__main__":
    main()


## LLM Application



In [None]:
!pip -q install langchain tiktoken chromadb pypdf transformers InstructorEmbedding
!pip -q install accelerate bitsandbytes sentencepiece Xformers

In [None]:
!pip install sentence_transformers==2.2.2

In [None]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader


from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

  from tqdm.autonotebook import trange


## Load multiple documents

In [None]:
# Load the cleaned .txt files. A local copy under /content/cleaned is used when
# it exists; otherwise fall back to the Drive folder that the extraction cells
# write to, so a fresh runtime still finds the files.
import os

cleaned_dir = '/content/cleaned'
if not os.path.isdir(cleaned_dir):
    cleaned_dir = '/content/drive/MyDrive/cleaned'

loader = DirectoryLoader(cleaned_dir, glob="./*.txt", loader_cls=TextLoader)

documents = loader.load()

In [None]:
len(documents)

15

In [None]:
# Split the loaded documents into overlapping chunks (chunk_size=500, overlap=100).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=500,
)
texts = text_splitter.split_documents(documents)

## HF Instructor Embeddings

In [None]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Instructor-XL embedding model, requested on the CUDA device.
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_kwargs=dict(device="cuda"),
    model_name="hkunlp/instructor-xl",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


## Create the DB

In [None]:
# Embed the chunks and store them in a Chroma collection.
# Supplying a persist_directory will store the embeddings on disk.
persist_directory = 'db'

# The Instructor embeddings created earlier are used for the store.
embedding = instructor_embeddings

vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

## QA Retrieval - WizardLM

In [None]:
import torch
import transformers
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline,BitsAndBytesConfig

# Checkpoint shared by the tokenizer and the causal-LM weights.
wizardlm_checkpoint = "TheBloke/wizardLM-7B-HF"

tokenizer = LlamaTokenizer.from_pretrained(wizardlm_checkpoint)

# NOTE(review): this config sets neither load_in_8bit nor load_in_4bit, so
# presumably no quantization is actually requested - confirm the intent.
quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=False)

# Options for loading the weights: automatic device placement, fp16 tensors.
model_load_options = dict(
    quantization_config=quantization_config,
    device_map='auto',
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model = LlamaForCausalLM.from_pretrained(wizardlm_checkpoint, **model_load_options)

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [None]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
import torch

# Text-generation pipeline around the loaded model, wrapped for LangChain.
# Decoding is greedy (do_sample=False). temperature/top_p are not passed:
# they only apply when sampling, and a temperature of 0 is not a valid
# sampling temperature.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # NOTE(review): max_length counts prompt + generated tokens; confirm that
    # 4096 does not exceed the context window of the loaded checkpoint.
    max_length=4096,
    do_sample=False,
    repetition_penalty=1.15
)

local_llm = HuggingFacePipeline(pipeline=pipe)

## Make a Retriever

In [None]:
# Expose the vector store as a retriever; search_kwargs asks for k=5 results per query.
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

## Make a QA Chain

In [None]:
# create the chain to answer questions
from langchain.prompts import PromptTemplate
# Prompt text: the retrieved context is substituted for {context} and the
# user's query for {question}.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:
"""

qa_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# "stuff" chain over the retriever; source documents are returned with the answer.
chain = RetrievalQA.from_chain_type(
    llm=local_llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
    chain_type_kwargs={"prompt": qa_prompt},
)

In [None]:
import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its own line breaks.

    Every newline-separated line is wrapped independently with
    ``textwrap.fill``. After wrapping, anything that precedes the first
    occurrence of "Question: " is dropped; when that marker is absent the
    wrapped text is returned unchanged.

    Args:
        text: Text to wrap.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The wrapped (and possibly trimmed) text.
    """
    wrapped = '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

    marker_position = wrapped.find("Question: ")
    if marker_position == -1:
        return wrapped
    return wrapped[marker_position:]

def process_llm_response(llm_response):
    """Print the wrapped answer, then the source of each supporting document.

    Args:
        llm_response: Mapping with a 'result' string and a 'source_documents'
            sequence whose items expose ``metadata['source']``.
    """
    answer_text = wrap_text_preserve_newlines(llm_response['result'])
    print(answer_text)

    print('\n\nSources:')
    for source_doc in llm_response["source_documents"]:
        print(source_doc.metadata['source'])

    divider = "-" * 50
    print("\n" + divider + "\n")

## Result

## Top 5 retrieved research findings

In [None]:
import pandas as pd

def load_questions(file_path):
    """Read the list of query questions from an Excel sheet.

    The questions are taken from the column headed 'Questions '. If that exact
    header is absent, a header that equals 'Questions' once surrounding
    whitespace is stripped is accepted instead. Empty cells are skipped so
    they are not passed on as queries.

    Args:
        file_path: Path of the Excel workbook.

    Returns:
        List of the question cell values, in sheet order.

    Raises:
        KeyError: If no matching column exists.
    """
    questions_df = pd.read_excel(file_path)

    column_name = 'Questions '
    if column_name not in questions_df.columns:
        # Tolerate headers that differ only in leading/trailing whitespace.
        candidates = [
            col for col in questions_df.columns if str(col).strip() == 'Questions'
        ]
        if not candidates:
            raise KeyError(column_name)
        column_name = candidates[0]

    return questions_df[column_name].dropna().tolist()

def get_relevant_documents_for_queries(queries):
    """Retrieve and print the documents matching each query.

    Uses the module-level ``retriever``. For every query a header, each
    retrieved document and a 50-dash divider are printed.

    Args:
        queries: Sequence of query strings.

    Returns:
        Dict mapping each query to the list returned by the retriever.
    """
    docs_by_query = {}
    for position, question in enumerate(queries, start=1):
        print(f"Query {position}/{len(queries)}: {question}\n")
        hits = retriever.get_relevant_documents(question)
        docs_by_query[question] = hits

        print(f"Relevant Documents for Query {position}:\n")
        for hit in hits:
            print(hit)
        # Divider after the documents of this query
        print('-' * 50)
    return docs_by_query

# Workbook holding the query questions; load them into a list.
questions_file_path = '/content/Query Questions.xlsx'
questions = load_questions(questions_file_path)

# Retrieve (and print) the matching chunks for every question.
relevant_documents = get_relevant_documents_for_queries(questions)


Query 1/13: What are the variety of Multimodal and Multi-modular AI Approaches to Streamline Autism Diagnosis in Young Children

Relevant Documents for Query 1:

page_content='1 Scientific  RepoRtS  |         (2020) 10:5014  | https://doi.org/10.1038/s41598-020-61213-w\nwww.nature.com/scientificreportsMulti-modular Ai Approach to \nStreamline Autism Diagnosis in \nYoung children\nHalim Abbas  1, ford Garberson  1, Stuart Liu-Mayo  1, eric Glover1* & Dennis p . Wall  2\nAutism has become a pressing healthcare challenge. the instruments used to aid diagnosis are time \nand labor expensive and require trained clinicians to administer, leading to long wait times for at-risk' metadata={'source': '/content/cleaned/Abbas_2020.txt'}
page_content='Abbas H, Garberson F, Liu-Mayo S, Glover E, Wall DP. Multi-modular Al Approach to Streamline\nAutism Diagnosis in Young Children. Scientific Reports. 2020; 10(1):5014. https://doi.org/10.1038/\n$41598-020-61213-w PMID: 32193406' metadata={'source': '/

## Summarizing the Top 5 research findings

In [None]:
import pandas as pd
import requests
import jsonpickle

import os

# Hosted inference endpoint of the BART summarisation model.
API_URL = "https://api-inference.huggingface.co/models/slauw87/bart_summarisation"

# SECURITY: the access token is read from the HF_TOKEN environment variable so
# it is never stored in the notebook. Any token that was previously committed
# in this cell should be treated as leaked and revoked in the Hugging Face
# account settings.
_hf_token = os.environ.get("HF_TOKEN", "").strip()

# Without a token no Authorization header is sent (unauthenticated request).
headers = {"Authorization": f"Bearer {_hf_token}"} if _hf_token else {}

def summ(input_texts, timeout=300):
    """Summarise texts with the hosted model behind ``API_URL``.

    Args:
        input_texts: List of text snippets; they are joined with newlines and
            sent as a single input. A bare string is accepted and sent as is.
        timeout: Seconds to wait for the HTTP response. The request asks the
            service to wait for the model, hence the generous default.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the service answers with an error status, so an
            error payload is never mistaken for a summary.
    """
    if isinstance(input_texts, str):
        # Joining a plain string would interleave newlines between characters.
        input_texts = [input_texts]

    payload = {
        "inputs": "\n".join(input_texts),  # one string, snippets separated by newlines
        "options": {"wait_for_model": True},
    }

    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()

def load_questions(file_path):
    """Read the list of query questions from an Excel sheet.

    The questions are taken from the column headed 'Questions '. If that exact
    header is absent, a header that equals 'Questions' once surrounding
    whitespace is stripped is accepted instead. Empty cells are skipped so
    they are not passed on as queries.

    Args:
        file_path: Path of the Excel workbook.

    Returns:
        List of the question cell values, in sheet order.

    Raises:
        KeyError: If no matching column exists.
    """
    questions_df = pd.read_excel(file_path)

    column_name = 'Questions '
    if column_name not in questions_df.columns:
        # Tolerate headers that differ only in leading/trailing whitespace.
        candidates = [
            col for col in questions_df.columns if str(col).strip() == 'Questions'
        ]
        if not candidates:
            raise KeyError(column_name)
        column_name = candidates[0]

    return questions_df[column_name].dropna().tolist()

def get_relevant_documents_for_queries(queries):
    """Retrieve documents for each query and summarise their combined text.

    Uses the module-level ``retriever`` and ``summ``. For every query the
    ``page_content`` of each retrieved document is collected (documents
    lacking that attribute are reported and skipped), the collected texts are
    printed, passed to ``summ`` and the returned summary is printed.

    Args:
        queries: Sequence of query strings.

    Returns:
        Tuple ``(relevant_docs, aggregated_summaries)``: two dicts keyed by
        query, holding the retrieved documents and the ``summ`` result.
    """
    docs_by_query, summary_by_query = {}, {}

    for position, question in enumerate(queries, start=1):
        print(f"Query {position}/{len(queries)}: {question}\n")
        hits = retriever.get_relevant_documents(question)
        docs_by_query[question] = hits
        print(f"Relevant Documents for Query {position}:\n")

        chunk_texts = []
        for hit in hits:
            try:
                chunk_text = hit.page_content
            except AttributeError:
                print(f"Document {hit} does not have a 'page_content' attribute.")
            else:
                chunk_texts.append(chunk_text)

        # Show what is about to be summarised
        print(f"Contents to summarize for Query {position}:")
        print(chunk_texts)

        # One summary over all retrieved texts of this query
        query_summary = summ(chunk_texts)
        summary_by_query[question] = query_summary

        print(query_summary)

    return docs_by_query, summary_by_query


# Load the questions, then retrieve and summarise the documents for each one.
questions_file_path = '/content/Query Questions.xlsx'
questions = load_questions(questions_file_path)

relevant_documents, aggregated_summaries = get_relevant_documents_for_queries(questions)

# Recap: print every aggregated summary once all queries are done.
for query in aggregated_summaries:
    summary = aggregated_summaries[query]
    print(f"\nAggregated Summary for Query: {query}\n")
    print(summary)

Query 1/13: What are the variety of Multimodal and Multi-modular AI Approaches to Streamline Autism Diagnosis in Young Children

Relevant Documents for Query 1:

Contents to summarize for Query 1:
['1 Scientific  RepoRtS  |         (2020) 10:5014  | https://doi.org/10.1038/s41598-020-61213-w\nwww.nature.com/scientificreportsMulti-modular Ai Approach to \nStreamline Autism Diagnosis in \nYoung children\nHalim Abbas  1, ford Garberson  1, Stuart Liu-Mayo  1, eric Glover1* & Dennis p . Wall  2\nAutism has become a pressing healthcare challenge. the instruments used to aid diagnosis are time \nand labor expensive and require trained clinicians to administer, leading to long wait times for at-risk', 'Abbas H, Garberson F, Liu-Mayo S, Glover E, Wall DP. Multi-modular Al Approach to Streamline\nAutism Diagnosis in Young Children. Scientific Reports. 2020; 10(1):5014. https://doi.org/10.1038/\n$41598-020-61213-w PMID: 32193406', 'children. We present a multi-modular, machine learning-based ass

## Final responses

In [None]:
# Reload the questions and answer each one with the QA chain.
questions_file_path = '/content/Query Questions.xlsx'
questions = load_questions(questions_file_path)

for que in questions:
    # Summary produced for this question by the summarisation step
    # (raises KeyError if the question was not summarised).
    con = aggregated_summaries[que]

    # Call the chain with the question and its aggregated summary
    # NOTE(review): `chain` is a RetrievalQA chain built with its own retriever,
    # so presumably the prompt's {context} is filled from retrieved documents and
    # the 'context' entry passed here is not used for the answer - confirm.
    llm_response = chain({'query': que, 'context': con})
    print(con)
    # Process the llm_response
    process_llm_response(llm_response)



[{'summary_text': 'Autism has become a pressing healthcare challenge. The instruments used to aid diagnosis are time and labor expensive and require trained clinicians to administer. Three machine learning modules for the identification of autism via mobile app outperform conventional autism screeners. The accuracy of the combined assessment is similar to gold-standard instruments such as ADOS and ADI-R.'}]
Question: What are the variety of Multimodal and Multi-modular AI Approaches to Streamline Autism Diagnosis in
Young Children
Helpful Answer:
There are various multimodal and multi-modular AI approaches being developed to streamline autism diagnosis in
young children. One such approach is described in the article "Multi-modular Ai Approach to Streamline Autism
Diagnosis in Young Children" by Halim Abbas et al. This approach consists of three machine learning modules
designed for use on a mobile app. These modules include speech recognition, facial expression recognition, and
natural