# Organized_code

In [1]:
!pip install PyPDF2
!pip install pdfreader
!pip install transformers torch
!pip install gensim
!pip install --upgrade lamini
!pip install sentence_transformers
!pip install python-docx
!pip install pickle5
!pip install faiss-cpu rank_bm25 numpy
!pip install sentencepiece
!pip install subword-nmt
!pip install tiktoken
!pip install bpemb

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting pdfreader
  Downloading pdfreader-0.1.15-py3-none-any.whl.metadata (4.3 kB)
Collecting bitarray>=1.1.0 (from pdfreader)
  Downloading bitarray-2.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (34 kB)
Downloading pdfreader-0.1.15-py3-none-any.whl (135 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading bitarray-2.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (288 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.3/288.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0mta

# **Write New Data in file**

In [2]:
import faiss
import numpy as np
import pickle

def create_or_update_faiss_indices(new_embeddings, new_sentences, overwrite=False, index_ip_filename='index_ip.index', index_hnsw_filename='index_hnsw.index', data_filename='faiss_data.pkl'):
    """Create, or append to, a pair of FAISS indices and their sentence list.

    Two indices are maintained side by side: a flat inner-product index that
    receives L2-normalized vectors (so inner product equals cosine similarity)
    and an HNSW index that receives the raw vectors.

    Args:
        new_embeddings: 2-D array-like of shape (n, dim), one row per sentence.
        new_sentences: sequence of n texts, in the same order as the rows.
        overwrite: when True, discard any stored data and start from empty
            indices. When False, load the data referenced by ``data_filename``
            and append to it; if that file does not exist, fresh indices are
            created instead.
        index_ip_filename: output path for the inner-product index.
        index_hnsw_filename: output path for the HNSW index.
        data_filename: pickle holding the two index paths and the sentences.

    Returns:
        None. Writes both index files and the pickle, and prints their paths.
    """
    # FAISS works on float32 matrices; convert once up front.
    new_embeddings = np.asarray(new_embeddings, dtype='float32')
    new_sentences = list(new_sentences)

    # Unit-length copies for the inner-product index. All-zero rows are left
    # as zeros instead of turning into NaN through a division by zero.
    norms = np.linalg.norm(new_embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    normalized_new_embeddings = new_embeddings / norms

    index_ip = None
    index_hnsw = None
    existing_sentences = []

    if not overwrite:
        try:
            # NOTE(review): pickle.load can execute arbitrary code; only point
            # data_filename at files this notebook wrote itself.
            with open(data_filename, 'rb') as file:
                data = pickle.load(file)

            # The stored paths are used for loading; the paths passed as
            # arguments are used for saving below.
            index_ip = faiss.read_index(data['index_ip_filename'])
            index_hnsw = faiss.read_index(data['index_hnsw_filename'])
            existing_sentences = list(data['sentences'])
        except FileNotFoundError:
            print("No existing data found. Creating new indices and data.")
            index_ip = None
            index_hnsw = None
            existing_sentences = []

    if index_ip is None or index_hnsw is None:
        dimension = new_embeddings.shape[1]
        index_ip = faiss.IndexFlatIP(dimension)
        index_hnsw = faiss.IndexHNSWFlat(dimension, 32)
        index_hnsw.hnsw.efConstruction = 40

    # Every path adds the *normalized* vectors to the inner-product index.
    # (Previously the "no existing data" path added the raw vectors, which
    # made its scores something other than cosine similarity.)
    index_ip.add(normalized_new_embeddings)
    index_hnsw.add(new_embeddings)

    sentences = existing_sentences + new_sentences

    # Save the updated indices to separate files
    faiss.write_index(index_ip, index_ip_filename)
    faiss.write_index(index_hnsw, index_hnsw_filename)

    # The pickle records where the indices live plus the sentence for each row.
    data = {
        'index_ip_filename': index_ip_filename,       # Filename of the cosine similarity index
        'index_hnsw_filename': index_hnsw_filename,   # Filename of the HNSW index
        'sentences': sentences                        # Corresponding sentences
    }

    with open(data_filename, 'wb') as file:
        pickle.dump(data, file)

    print(f"FAISS indices saved to {index_ip_filename} and {index_hnsw_filename}")
    print(f"Sentences saved to {data_filename}")


# **Load Data from File**

In [3]:
import faiss
import numpy as np
import pickle
import pickle
def read_faiss_indices(data_filename='faiss_data.pkl'):
    """Load the pickled bundle and attach the two FAISS indices it points to.

    Args:
        data_filename: pickle containing 'index_ip_filename',
            'index_hnsw_filename' and 'sentences'.

    Returns:
        The unpickled dict, extended with 'index_ip' and 'index_hnsw' entries
        holding the indices read from the stored file names.
    """
    with open(data_filename, 'rb') as handle:
        bundle = pickle.load(handle)

    # 'index_ip' <- file named by 'index_ip_filename', and likewise for hnsw.
    loaded = {
        key: faiss.read_index(bundle[key + '_filename'])
        for key in ('index_ip', 'index_hnsw')
    }
    bundle.update(loaded)
    return bundle

# **Semantic Chunking**

In [4]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans


def get_bert_embeddings(sentences):
    """Embed each sentence with bert-base-uncased.

    Args:
        sentences: list of strings; inputs are padded to a common length and
            truncated at 128 tokens.

    Returns:
        NumPy array with one row per sentence, taken from position 0 of the
        model's last hidden state (the [CLS] token).
    """
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased')

    encoded = bert_tokenizer(
        sentences,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128,
    )

    # Inference only: no gradients are tracked for the forward pass.
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state

    cls_vectors = hidden_states[:, 0, :]
    return cls_vectors.numpy()

def semantic_chunking_with_attention(text, n_clusters=5, max_chunk_size=512):
    """Group semantically similar sentences into size-limited chunks.

    The text is split on '.', each sentence is embedded with
    ``get_bert_embeddings``, the embeddings are clustered with KMeans, and the
    sentences of each cluster are packed into chunks.

    Args:
        text: input document.
        n_clusters: requested number of clusters; capped at the number of
            sentences so that short texts do not make KMeans fail.
        max_chunk_size: soft limit, in characters, for one chunk. A single
            sentence longer than the limit still becomes its own chunk.

    Returns:
        List of non-empty chunk strings; an empty list when the text contains
        no sentences.
    """
    # Split the input text into sentences based on '.'
    sentences = [piece.strip() for piece in text.split('.') if piece.strip()]
    if not sentences:
        # Nothing to embed or cluster.
        return []

    embeddings = get_bert_embeddings(sentences)

    # KMeans cannot build more clusters than there are samples.
    effective_clusters = min(n_clusters, len(sentences))
    labels = KMeans(n_clusters=effective_clusters, random_state=5).fit(embeddings).labels_

    # Group sentences by cluster, keeping clusters in order of first appearance.
    clusters = {}
    for sentence, label in zip(sentences, labels):
        clusters.setdefault(label, []).append(sentence)

    chunks = []
    for cluster in clusters.values():
        current_chunk = ""
        for sentence in cluster:
            if len(current_chunk) + len(sentence) + 1 <= max_chunk_size:  # +1 for the period
                current_chunk += sentence + ". "
            else:
                # Flush only if something is buffered; an over-long first
                # sentence used to push an empty string into the result.
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + ". "
        if current_chunk:
            chunks.append(current_chunk.strip())

    return chunks


# **Recursive Chunking**

In [5]:
def division_Chunk(text, chunk_size=400):
    """Pack '.'-separated sentences into chunks of at most ``chunk_size`` words.

    Sentences are accumulated (joined by newlines) until adding the next one
    would exceed the limit. A single sentence longer than the limit is handed
    to ``Recursive_Chunking`` for further splitting.

    Args:
        text: text to split.
        chunk_size: maximum number of whitespace-separated words per chunk.

    Returns:
        List of stripped, non-empty chunks in document order.

    Raises:
        ValueError: if ``chunk_size`` is not positive (the mutual recursion
            with ``Recursive_Chunking`` could never terminate).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of words")

    chunks = []
    chunk1 = ""

    for sentence in text.split("."):
        word_count = len(sentence.split())

        if word_count > chunk_size:
            # Flush what is buffered first so the output keeps document order;
            # previously the buffered sentences were emitted *after* the
            # pieces of the oversized sentence that followed them.
            if chunk1.strip():
                chunks.append(chunk1.strip())
            chunk1 = ""
            chunks.extend(Recursive_Chunking(sentence, chunk_size))
        elif len(chunk1.split()) + word_count <= chunk_size:
            chunk1 += sentence + "\n"
        else:
            chunks.append(chunk1.strip())
            chunk1 = sentence + "\n"

    if chunk1.strip():
        chunks.append(chunk1.strip())

    return chunks

def Recursive_Chunking(text, chunk_size=1000):
    """Split text on blank lines, subdividing paragraphs that are too long.

    Paragraphs with at most ``chunk_size`` words are kept whole. Longer ones
    are cut at 80% of their word count and each part is passed to
    ``division_Chunk`` for sentence-level packing.

    Args:
        text: text to split.
        chunk_size: maximum number of whitespace-separated words per chunk.

    Returns:
        List of stripped, non-empty chunks in document order.

    Raises:
        ValueError: if ``chunk_size`` is not positive (the mutual recursion
            with ``division_Chunk`` could never terminate).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of words")

    result_chunks = []

    for chunk in text.split("\n\n"):
        words = chunk.split()
        if not words:
            # Runs of blank lines produce empty paragraphs; skip them so they
            # are not embedded and indexed as empty strings later on.
            continue

        if len(words) > chunk_size:
            # Uneven 80/20 cut, then let division_Chunk re-pack each part.
            middle = int(0.8 * len(words))
            part1 = " ".join(words[:middle])
            part2 = " ".join(words[middle:])

            result_chunks.extend(division_Chunk(part1, chunk_size))
            result_chunks.extend(division_Chunk(part2, chunk_size))
        else:
            result_chunks.append(chunk.strip())

    return result_chunks

# Example usage





# **SentenceTransformer Embedding**

In [6]:
from sentence_transformers import SentenceTransformer
# Loaded SentenceTransformer instances, keyed by model name, so the weights
# are read once per kernel instead of on every call.
_SENTENCE_MODEL_CACHE = {}


def _get_sentence_model(model_name='all-MiniLM-L6-v2'):
    """Return the cached SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _SENTENCE_MODEL_CACHE:
        _SENTENCE_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODEL_CACHE[model_name]


def get_sentenceTF_embeddings(sentences):
    """Embed a collection of text chunks with all-MiniLM-L6-v2.

    Args:
        sentences: iterable of strings.

    Returns:
        List with one embedding vector per input string, in input order.
        The number of embeddings is also printed.
    """
    model = _get_sentence_model()
    # One batched encode call instead of one call per chunk; list() keeps the
    # historical "list of vectors" return shape.
    embeddings = list(model.encode(list(sentences)))
    print(len(embeddings))
    return embeddings


def Embed_stenteceTF(sentence):
    """Embed a single string (e.g. a user query) with all-MiniLM-L6-v2."""
    return _get_sentence_model().encode(sentence)

2024-08-11 02:13:09.914388: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-11 02:13:09.914494: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-11 02:13:10.098717: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# **Add_Data_From_Word_file**

In [7]:
# from docx import Document
# import re

# def read_word_file(file_path):
#     # Load the document
#     doc = Document(file_path)

#     # Extract headings, subheadings, and points

#     points = []

#     for para in doc.paragraphs:
#         text = para.text.strip()
#         style = para.style.name.lower()
#         points.append(text)

#     return  points

# def replace_double_spaces(text):
#     # Replace all occurrences of two consecutive spaces with '\xa0'
#     cleaned_text = re.sub(r'\xa0',' ' , text)
#     cleaned_text1 = re.sub(r'\u200b','' , cleaned_text)
#     return cleaned_text1
# def read_word_file_(file_path, Chunking_type , Max_Chunk_size, NumberOf_cluster= 20 ):
#   Buddhism = read_word_file(file_path)
#   for i in range(len(Buddhism)):
#     Buddhism[i] = replace_double_spaces(Buddhism[i])
#   Buddhism_Str = "\n".join(Buddhism)
#   if(Chunking_type == "Recursive"):
#       chunks = Recursive_Chunking(Buddhism_Str,chunk_size= Max_Chunk_size)
#   elif(Chunking_type == "Semantic"):
#       chunks = semantic_chunking_with_attention(Buddhism_Str, n_clusters=NumberOf_cluster, max_chunk_size=Max_Chunk_size)
#   embedding =  get_sentenceTF_embeddings(chunks)
#   return embedding, chunks

# Add_Data_From_PDF_file

In [8]:
from PyPDF2 import PdfReader
import re

def read_pdf_file(file_path):
    """Extract the text of every page of a PDF.

    Args:
        file_path: path of the PDF file to read.

    Returns:
        List of stripped page texts in page order; pages for which no text
        is extracted are omitted.
    """
    with open(file_path, 'rb') as handle:
        page_texts = (page.extract_text() for page in PdfReader(handle).pages)
        # Consume the generator while the file is still open.
        return [page_text.strip() for page_text in page_texts if page_text]

def replace_double_spaces(text):
    """Normalize invisible characters left over from text extraction.

    Despite its name, this turns every non-breaking space (U+00A0) into a
    regular space and removes every zero-width space (U+200B).

    Args:
        text: string to clean.

    Returns:
        The cleaned string.
    """
    without_nbsp = text.replace('\xa0', ' ')
    return without_nbsp.replace('\u200b', '')

def read_pdf_file_(file_path, Chunking_type, Max_Chunk_size, NumberOf_cluster=20):
    """Read a PDF, chunk its text and embed the chunks.

    Args:
        file_path: path of the PDF file.
        Chunking_type: "Recursive" (word-count based, via Recursive_Chunking)
            or "Semantic" (cluster based, via semantic_chunking_with_attention).
        Max_Chunk_size: size limit handed to the chosen chunker.
        NumberOf_cluster: number of clusters for "Semantic" chunking; ignored
            for "Recursive".

    Returns:
        Tuple ``(embedding, chunks)``: the chunk embeddings and the chunk texts.

    Raises:
        ValueError: if ``Chunking_type`` is not one of the supported values.
    """
    # Validate before doing any work; an unknown value previously surfaced as
    # an UnboundLocalError on `chunks` only after the whole PDF had been read.
    if Chunking_type not in ("Recursive", "Semantic"):
        raise ValueError(
            f"Unknown Chunking_type {Chunking_type!r}; expected 'Recursive' or 'Semantic'"
        )

    pages = [replace_double_spaces(page_text) for page_text in read_pdf_file(file_path)]
    document_text = "\n".join(pages)

    if Chunking_type == "Recursive":
        chunks = Recursive_Chunking(document_text, chunk_size=Max_Chunk_size)
    else:
        chunks = semantic_chunking_with_attention(document_text, n_clusters=NumberOf_cluster, max_chunk_size=Max_Chunk_size)

    embedding = get_sentenceTF_embeddings(chunks)
    return embedding, chunks


In [9]:
def search_top_k_sentences_cos(data, input, input_embedding, threshold=0.0, k=6):
    """Return the stored sentences most similar to a query embedding.

    Args:
        data: dict with 'index_ip' (an index exposing ``search(queries, k)``)
            and 'sentences' (list aligned with the index rows).
        input: the raw query text; not used by the search and kept only for
            call compatibility.
        input_embedding: 1-D embedding of the query.
        threshold: minimum similarity score a hit needs in order to be kept.
        k: maximum number of hits requested from the index.

    Returns:
        List of at most ``k`` sentences, best match first.
    """
    index_ip = data['index_ip']   # inner-product index
    sentences = data['sentences']

    # Shape (1, dim) float32 query, scaled to unit length so the inner product
    # against normalized stored vectors is a cosine similarity and `threshold`
    # has a stable meaning. A zero vector is passed through unchanged.
    query = np.asarray(input_embedding, dtype='float32').reshape(1, -1)
    query_norm = np.linalg.norm(query)
    if query_norm > 0:
        query = query / query_norm

    distances_ip, indices_ip = index_ip.search(query, k)

    # When the index holds fewer than k vectors the result is padded with id
    # -1; those must be dropped or they would select sentences[-1].
    return [
        sentences[int(idx)]
        for idx, score in zip(indices_ip[0], distances_ip[0])
        if idx >= 0 and score >= threshold
    ]

In [10]:
from rank_bm25 import BM25Okapi
import numpy as np
import tiktoken
from transformers import GPT2TokenizerFast
from typing import List
import time
from bpemb import BPEmb

# Initialize the encoders for tokenization.
# Both objects are created once, when this cell runs, and are read as
# module-level globals by preprocess_func_tiktoken / preprocess_func_bpemb.
# tiktoken encoding looked up by the model name "gpt-4".
tiktoken_encoder = tiktoken.encoding_for_model("gpt-4")
# English BPEmb model with the library's default settings; the cell output
# shows it downloading its model files on first use.
bpemb_en = BPEmb(lang="en")

def preprocess_func_tiktoken(text: str) -> List[str]:
    """Tokenize text for BM25 using the module-level tiktoken encoder.

    The text is lower-cased, encoded to token ids, and each id is rendered
    as a string so the result can serve as a BM25 term list.
    """
    token_ids = tiktoken_encoder.encode(text.lower())
    return list(map(str, token_ids))

def preprocess_func_bpemb(text: str):
    """Tokenize text for BM25 with the module-level BPEmb encoder.

    Returns whatever ``bpemb_en.encode`` produces for ``text``.
    """
    return bpemb_en.encode(text)

def measure_time(func, *args, **kwargs):
    """Call ``func`` and report how long the call took.

    Args:
        func: callable to time.
        *args: positional arguments forwarded to ``func``.
        **kwargs: keyword arguments forwarded to ``func``.

    Returns:
        Tuple ``(result, elapsed_time)`` where ``elapsed_time`` is in seconds.
    """
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted, giving wrong or negative durations.
    start_time = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed_time = time.perf_counter() - start_time
    return result, elapsed_time

def search_top_k_sentences(data, input_embedding, input_text, k, preprocess_func, threshold=0.0, num_candidates=20):
    """Hybrid retrieval: embedding search for candidates, BM25 to re-rank them.

    Args:
        data: dict with 'index_ip' (an index exposing ``search(queries, k)``)
            and 'sentences' (list aligned with the index rows).
        input_embedding: 1-D embedding of the query.
        input_text: raw query text, tokenized for BM25.
        k: maximum number of sentences to return.
        preprocess_func: callable turning a string into a list of tokens; used
            for both the candidates and the query.
        threshold: minimum similarity score for a candidate to be considered.
        num_candidates: number of nearest neighbours fetched from the index
            before re-ranking (defaults to the previously hard-coded 20).

    Returns:
        List of at most ``k`` sentences ordered by descending BM25 score; only
        sentences with a strictly positive BM25 score are returned. Empty when
        no candidate passes ``threshold``.
    """
    index_ip = data['index_ip']  # inner-product index
    sentences = data['sentences']

    # Shape (1, dim) float32 query, scaled to unit length so scores against
    # normalized stored vectors are cosine similarities.
    query = np.asarray(input_embedding, dtype='float32').reshape(1, -1)
    query_norm = np.linalg.norm(query)
    if query_norm > 0:
        query = query / query_norm

    distances_ip, indices_ip = index_ip.search(query, num_candidates)

    # Keep hits above the threshold, drop the -1 padding the index returns
    # when it holds fewer than num_candidates vectors, and de-duplicate while
    # preserving similarity order (so BM25 ties fall back to that order).
    candidate_ids = list(dict.fromkeys(
        int(idx)
        for idx, score in zip(indices_ip[0], distances_ip[0])
        if idx >= 0 and score >= threshold
    ))
    if not candidate_ids:
        return []

    # BM25 statistics are computed over the candidate set only.
    tokenized_sentences = [preprocess_func(sentences[idx]) for idx in candidate_ids]
    bm25 = BM25Okapi(tokenized_sentences)
    bm25_scores = bm25.get_scores(preprocess_func(input_text))

    ranked = sorted(zip(candidate_ids, bm25_scores), key=lambda pair: pair[1], reverse=True)

    # Take the top k by BM25 score, then discard non-positive scores.
    return [sentences[idx] for idx, score in ranked[:k] if score > 0]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.model


100%|██████████| 400869/400869 [00:00<00:00, 1245249.09B/s]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.d100.w2v.bin.tar.gz


100%|██████████| 3784656/3784656 [00:00<00:00, 5068789.10B/s]


In [11]:
import lamini
import json

# Load notebook settings from config.json in the current working directory.
# loadModel() reads the 'lamini_api_key' entry from this dict, so the key
# itself stays out of the notebook source.
with open('config.json', 'r') as file:
    config = json.load(file)
    
def loadModel():
    """Set the Lamini API key from ``config`` and return a Llama-3-8B-Instruct client."""
    lamini.api_key = config['lamini_api_key']
    return lamini.Lamini("meta-llama/Meta-Llama-3-8B-Instruct")

In [12]:
def Genrate_Answer(llm_model, Data, intput, top_k, Threashold, Search_type):
    """Retrieve context for a question and ask the LLM to answer from it.

    Args:
        llm_model: object exposing ``generate(prompt, max_tokens=..., max_new_tokens=...)``.
        Data: dict as returned by ``read_faiss_indices``.
        intput: the user's question.
        top_k: maximum number of context sentences to retrieve.
        Threashold: minimum similarity score for a retrieved sentence.
        Search_type: "Cosine", "Hybrid_TicToken" or "Hybrid_bpemb".

    Returns:
        Whatever ``llm_model.generate`` returns for the assembled prompt.
        The prompt is also printed.

    Raises:
        ValueError: if ``Search_type`` is not one of the supported values.
    """
    # Fail fast on a mistyped search type; it used to fall through silently
    # and send the model a prompt with an empty data section.
    if Search_type not in ("Cosine", "Hybrid_TicToken", "Hybrid_bpemb"):
        raise ValueError(
            f"Unknown Search_type {Search_type!r}; expected 'Cosine', "
            "'Hybrid_TicToken' or 'Hybrid_bpemb'"
        )

    # Llama-3 chat-template markers.
    system_header = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    user_middle = "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
    assitant_footer = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    # Ad-hoc delimiters around the question and the retrieved data.
    start_Question = "<|start_Question|>\n"
    end_Question = "<|end_Question|>\n\n"
    # Was "<!|start_data|>": the stray '!' made the opening marker
    # inconsistent with its "<|end_data|>" counterpart.
    start_data = "<|start_data|>\n"
    end_data = "<|end_data|>\n\n"
    # System instruction, reproduced verbatim (leading newline, trailing
    # newline followed by two spaces).
    String1 = (
        "\n"
        "You are an AI chat bot designed to answer questions based on a the data given along with the question.\n"
        "If the answer doesn't exist wihtin the data, respond back with \"I'm sorry, but I cannot answer that question as it is outside the scope of my dataset.\" Donot use pre-trained data to answer this prompt\n"
        "  "
    )

    encoded_input = Embed_stenteceTF(intput)
    if Search_type == "Cosine":
        top_k_sentences = search_top_k_sentences_cos(Data, intput, encoded_input, k=top_k, threshold=Threashold)
    else:
        # Both hybrid modes differ only in the tokenizer handed to BM25.
        if Search_type == "Hybrid_TicToken":
            preprocess = preprocess_func_tiktoken
        else:
            preprocess = preprocess_func_bpemb
        top_k_sentences = search_top_k_sentences(Data, encoded_input, intput, k=top_k, threshold=Threashold, preprocess_func=preprocess)

    concatenated_text = (
        system_header + String1 + '\n'
        + user_middle
        + start_data + "\n".join(top_k_sentences) + end_data + " \n"
        + start_Question + intput + end_Question
        + assitant_footer
    )
    print(concatenated_text)
    return llm_model.generate(concatenated_text, max_tokens=2048, max_new_tokens=2048)

# **PrepareData**

In [None]:
# Read Data from pdf file: extract the page text, chunk it semantically
# (20 clusters, chunk size limit 600) and embed each chunk.
# NOTE(review): the path is an absolute Kaggle input path; adjust it when
# running elsewhere.
embeddings, chunks = read_pdf_file_(file_path='/kaggle/input/pneumonia-book/Pneumonia Symptoms Diagnosis and Treatment.pdf',Chunking_type='Semantic',Max_Chunk_size = 600,NumberOf_cluster = 20)
print(len(chunks))

In [14]:
# embeddings, chunks = read_word_file_(file_path='Buddhism.docx',Chunking_type='Semantic',Max_Chunk_size = 600,NumberOf_cluster = 20)
# print(len(chunks))

In [15]:
# Persist the chunk embeddings and texts. With overwrite=False the chunks are
# appended when Pneumonia.pkl already exists, so re-running this cell adds the
# same chunks again; pass overwrite=True to rebuild from scratch.
create_or_update_faiss_indices(embeddings, chunks, overwrite=False, index_ip_filename='Pneumonia_ip.index', index_hnsw_filename='Pneumonia_hnsw.index', data_filename='Pneumonia.pkl')

FAISS indices saved to Pneumonia_ip.index and Pneumonia_hnsw.index
Sentences saved to Pneumonia.pkl


In [16]:
# Reload the stored sentences together with both FAISS indices, and create
# the Lamini client used for answer generation.
load_data = read_faiss_indices('Pneumonia.pkl')
llm = loadModel()

In [17]:
# NOTE(review): this prints the entire dict, including every stored sentence,
# which produces a very large cell output.
print(load_data)

{'index_ip_filename': 'Pneumonia_ip.index', 'index_hnsw_filename': 'Pneumonia_hnsw.index', 'sentences': ['Pneumonia: Symptoms, Diagnosis and Treatment : Symptoms, Diagnosis and Treatment, edited by Micaela L. Pneumonia: Symptoms, Diagnosis and Treatment : Symptoms, Diagnosis and Treatment, edited by Micaela L. Pneumonia: Symptoms, Diagnosis and Treatment : Symptoms, Diagnosis and Treatment, edited by Micaela L. Pneumonia: Symptoms, Diagnosis and Treatment : Symptoms, Diagnosis and Treatment, edited by Micaela L. New York  \nPneumonia: Symptoms, Diagnosis and Treatment : Symptoms, Diagnosis and Treatment, edited by Micaela L.', 'Library of Congress Cataloging- in-Publication Data \n \nPneumonia : symptoms, diagnosis, and treatment / editors, Micaela L. †New York  ISBN: \x1c\x1a\x1b\x10\x14\x10\x19\x15\x13\x1b\x14\x10\x1c\x1c\x1b\x10\x15 (eBook)\nPneumonia: Symptoms, Diagnosis and Treatment : Symptoms, Diagnosis and Treatment, edited by Micaela L. Pneumonia: Symptoms, Diagnosis and Treat

In [18]:
# Sample question used for the generation call below.
input_text1 = "Tell me about Pneumonia"

In [19]:
# Answer the sample question: up to 10 context chunks, similarity threshold
# 0.4, hybrid retrieval with BM25 re-ranking over BPEmb tokens.
Genrate_Answer(llm,load_data,input_text1,10,0.4,"Hybrid_bpemb")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

<|begin_of_text|><|start_header_id|>system<|end_header_id|>


You are an AI chat bot designed to answer questions based on a the data given along with the question.
If the answer doesn't exist wihtin the data, respond back with "I'm sorry, but I cannot answer that question as it is outside the scope of my dataset." Donot use pre-trained data to answer this prompt
  
<|eot_id|><|start_header_id|>user<|end_header_id|>

<!|start_data|>
Ortega, Steffani M. Fuster -Jorge, A. Montesdeoca -Melián, M. Mateos -Durán , 
M. Ramos -Real, V. Ramos -Martín, M. In: Pneumonia: Symptoms, Diagnosis and Treatment  ISBN: 978 -1-61209 -685-8 
Editors: M. Ortega , pp. carinii , P. murina  and P. [3] Van der Meer G, Brug SL. [5] Hughes WT. [20] Wazir JF, Ansari NA. [37] Grocott RG. [44] Dei-Cas E, Ali ouat EM, Cailliez JC. [55] Durand -Joly, I. , So ula, F. 30 
[63] Huan g L. [64] Durand -Joly I. [70] Calderon EJ. [99] Pontón J. [100]  Clarkson AB, Merali S. , Chernoff D, Feigal DW, Jr. [128]  Baggish AL, Hi

'Based on the provided data, here\'s what I found about Pneumonia:\n\nPneumonia is an inflammatory condition of the lung, which is one of the most common serious infections, causing two million deaths annually among young and old alike. The clinical presentation in HIV-infected patients may differ from that in other immunocompromised patients, and its diagnosis continues to be challenging because no combination of symptoms, signs, blood chemistries, or radiographic findings is specific to Pneumocystis pneumonia.\n\nPneumonia can be characterized by its onset: early or late. Early onset pneumonia occurs during the first four days of hospitalization and is often caused by Moraxella catarrhalis, Haemophilus influenzae, and Streptococcus pneumoniae.\n\nThe book "Pneumonia: Symptoms, Diagnosis and Treatment" edited by Micaela L. discusses various topics related to pneumonia, including:\n\n* Modes of supportive treatment in patients with pneumonia\n* Pneumocystis jirovecii in AIDS patients\n