In [5]:
import os
import re
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
import uuid
import torch
import numpy as np
import json
import os
import torch.nn.functional as F
import psycopg2

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def get_db_connection():
    return psycopg2.connect(
        dbname="rag_system",
        user="postgres",
        password="",
        host="localhost",
        port="5432"
    )

In [7]:
# Store documents and their chunks (with embeddings) in the database
def store_documents_and_chunks(documents, mapped_document_db):
    conn = get_db_connection()
    cur = conn.cursor()
    
    for doc_id, data in documents.items():
        # Insert the document metadata and get its generated ID
        metadata = json.dumps({
            "document_uuid": doc_id,
            "file_name": data.get("metadata")["file_name"]
            })
        cur.execute("""
            INSERT INTO documents (metadata) 
            VALUES (%s) RETURNING id;
        """, (metadata,))
        document_db_id = cur.fetchone()[0]  # This is the primary key for the document in the `documents` table
        
        # Insert chunks and embeddings for this document
        for chunk_id, text in data["text"].items():
            embedding = mapped_document_db[doc_id][chunk_id]
            cur.execute("""
                INSERT INTO chunks (uuid, document_id, embedding, text)
                VALUES (%s, %s, %s, %s)
                ON CONFLICT (id) DO NOTHING;
            """, (chunk_id, document_db_id, embedding, text))
    
    conn.commit()
    cur.close()
    conn.close()

In [23]:
def retrieve_information_pgvector(query, tokenizer, model, top_k):
    conn = get_db_connection()
    cur = conn.cursor()
    
    # Generate query embedding
    query_inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        query_embedding = model(**query_inputs).last_hidden_state.mean(dim=1).squeeze().tolist()
    
    # Use pgvector to find the top_k most similar chunks
    cur.execute("""
        SELECT c.id, c.uuid, c.text, c.embedding, d.metadata,
               1 - (c.embedding <=> %s::vector) AS similarity
        FROM chunks c
        JOIN documents d ON c.document_id = d.id
        ORDER BY c.embedding <=> %s::vector
        LIMIT %s;
    """, (query_embedding, query_embedding, top_k))
    
    results = cur.fetchall()
    cur.close()
    conn.close()
    
    top_results = [
        {
            row[2]
        }
        for row in results
    ]
    
    return top_results


In [9]:
def chunking(directory_path, tokenizer, chunk_size, para_seperator=" /n /n", separator=" "):
    documents = {}
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        print(filename)
        base = os.path.basename(file_path)
        sku = os.path.splitext(base)[0]
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()

            doc_id = str(uuid.uuid4())    

            paragraphs = re.split(para_seperator, text)
            all_chunks = {} 
            for paragraph in paragraphs:
                words = paragraph.split(separator)
                current_chunk_str = ""
                chunk = []
                for word in words:
                    if current_chunk_str:
                        new_chunk = current_chunk_str + separator + word
                    else:
                        new_chunk = current_chunk_str + word    
                    if len(tokenizer.tokenize(new_chunk)) <= chunk_size:
                        current_chunk_str = new_chunk
                    else:
                        if current_chunk_str:
                            chunk.append(current_chunk_str)
                        current_chunk_str = word
                

                if current_chunk_str:   
                    chunk.append(current_chunk_str)

                for chunk in chunk:
                    chunk_id = str(uuid.uuid4())
                    all_chunks[chunk_id] = chunk
        documents[doc_id] = {"text": all_chunks, "metadata": {"file_name":sku}}
    return documents 

In [10]:
def map_document_embeddings(documents, tokenizer, model):
    mapped_document_db = {}
    for id, dict_content in documents.items():
        mapped_embeddings = {}
        for content_id, text in dict_content["text"].items():
            inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
            with torch.no_grad():
                embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze().tolist()
            mapped_embeddings[content_id] = embeddings
        mapped_document_db[id] = mapped_embeddings
    return mapped_document_db

In [11]:
def retrieve_information(query, tokenizer, model, top_k, mapped_document_db):
    query_inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    query_embeddings = model(**query_inputs).last_hidden_state.mean(dim=1).squeeze()
    query_embeddings=query_embeddings.tolist()
    #converting query embeddings to numpy array
    query_embeddings=np.array(query_embeddings)

    scores = {}
    #Now calculating cosine similarity
    for doc_id, chunk_dict in mapped_document_db.items():
        for chunk_id, chunk_embeddings in chunk_dict.items():
            #converting chunk embedding to numpy array for efficent mathmetical operations
            chunk_embeddings = np.array(chunk_embeddings) 

            #Normalizing chunk embeddings and query embeddings  to get cosine similarity score
            normalized_query = np.linalg.norm(query_embeddings)
            normalized_chunk = np.linalg.norm(chunk_embeddings)

            if normalized_chunk == 0 or normalized_query == 0:
            # this is being done to avoid division with zero which will give wrong results i.e infinity. Hence to avoid this we set score to 0
                score == 0
            else:
            # Now calculationg cosine similarity score
                score = np.dot(chunk_embeddings, query_embeddings)/ (normalized_chunk * normalized_query)  

             #STORING SCORES WITH THE REFERENCE
            scores[(doc_id, chunk_id )] = score   

    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:top_k]

    top_results=[]
    for ((doc_id, chunk_id), score) in sorted_scores:
        results = (doc_id, chunk_id, score)
        top_results.append(results)
    return top_results  

In [12]:
def save_json(path, data):
    with open(path, 'w') as f:
        json.dump(data, f, indent=4)

def read_json(path):
    with open(path, 'r') as f:
        data = json.load(f)
    return data

def retrieve_text(top_results, document_data):
    first_match = top_results[1]
    doc_id = first_match[0]
    chunk_id = first_match[1]
    related_text = document_data[doc_id].get("text")[chunk_id]
    return related_text


In [13]:
def generate_llm_response(model, tokenizer, query, relavent_text):
    input_text = f"""
    You are an intelligent search engine. You will be provided with some retrieved context, as well as the users query.

    Your job is to understand the request, and answer based on the retrieved context.
    Here is context:
    {relavent_text} 
    
    Question: {query}
    """
    print(input_text)
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    print(len(inputs['input_ids'][0]))
    outputs = model.generate(**inputs, max_length=100)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

In [14]:
directory_path = "documents"
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
embeddings_tokenizer = AutoTokenizer.from_pretrained(model_name)
embeddings_model = AutoModel.from_pretrained(model_name)
chunk_size = 200
para_seperator=" /n /n"
separator=" "
top_k = 2

In [15]:
#creating document store with chunk id, doc_id, text
documents = chunking(directory_path, embeddings_tokenizer, chunk_size, para_seperator, separator)
documents

behaviuor1.txt
behaviuor2.txt
behaviuor3.txt


{'3a62556b-9d2a-4bea-bdbc-9d5f5b591536': {'text': {'0d6ba946-c247-4dfc-b53a-ba65ddfc4b64': "Children throw tantrums when they are overwhelmed by strong emotions, overstimulated, or anxious. Young children may have tantrums because of strong emotions, while older children may have tantrums because they don't know how to express or manage their feelings. Overstimulation can also cause tantrums, as children don't always recognize when bright lights or loud noises are bothering them. Anxiety can also cause tantrums, such as when a child feels anxious due to an unexpected event, unrealistic demand, or lack of routine.\n \nTantrums may happen when kids are tired, hungry, or uncomfortable. They can have a meltdown because they can't have something they want (like a toy or candy) or can’t get someone to do what they want (like getting a parent to pay attention to them immediately or getting a sibling to give up the tablet). Learning to deal with frustration is a skill that children gain over",

In [16]:
#now embedding generation and mapping in database
embedded_documents = map_document_embeddings(documents, embeddings_tokenizer, embeddings_model)
embedded_documents

{'3a62556b-9d2a-4bea-bdbc-9d5f5b591536': {'0d6ba946-c247-4dfc-b53a-ba65ddfc4b64': [0.0875849649310112,
   -0.018974564969539642,
   0.13897109031677246,
   0.10450530052185059,
   0.04204026609659195,
   0.035805173218250275,
   0.02092503383755684,
   0.1309826374053955,
   0.08725124597549438,
   0.0328528992831707,
   0.012418163940310478,
   -0.0030634503345936537,
   0.03614470362663269,
   0.02373591996729374,
   0.08710785210132599,
   0.0473153330385685,
   -0.04504794627428055,
   0.0846073105931282,
   -0.05965997278690338,
   -0.14554613828659058,
   0.08232580125331879,
   0.008816184476017952,
   -0.05803896114230156,
   0.017252827063202858,
   -0.12655127048492432,
   0.2694854438304901,
   -0.026039503514766693,
   -0.20236042141914368,
   0.009083428420126438,
   0.017045458778738976,
   0.0034153524320572615,
   -0.23515978455543518,
   -0.05134494975209236,
   -0.044760189950466156,
   -0.12152154743671417,
   -0.048747751861810684,
   0.034897077828645706,
   0.0698

In [17]:
store_documents_and_chunks(documents, embedded_documents)

In [18]:
#saving json
save_json('database/doc_store_2.json', documents) 
save_json('database/vector_store_2.json', embedded_documents)    

In [19]:
llm_model_name = "google/flan-t5-base"  # Replace with your chosen model
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name)

In [20]:
query = "How to make tantrums less likely to happen?"

In [21]:
#Retrieving most relavent data chunks
#top_results = retrieve_information(query, embeddings_tokenizer, embeddings_model, top_k, embedded_documents)

#reading json
#document_data = read_json("database/doc_store_2.json") #read document store

#Retrieving text of relavent chunk embeddings
#relavent_text = retrieve_text(top_results, document_data)

#print(relavent_text)

in whatever situation you’re in. Concentrate on putting your plan into action when the tantrum happens.
    • Accept that you can’t control your child’s emotions or behaviour directly. You can only keep your child safe and guide their behaviour so tantrums are less likely to happen in the future.
    • Accept that it takes time for change to happen. Your child has a lot of growing up to do before tantrums are gone forever. Developing and practising self-regulation skills is a life-long task.
    • Beware of thinking that your child is doing it on purpose or trying to upset you. Children don’t have tantrums deliberately. They’re stuck in a bad habit or don’t have the skills right now to cope with the situation.
    • Keep your sense of humour. But don’t laugh at the tantrum – if you do, it might reward your child with attention. It might also upset your child even more if they think


In [24]:
#Retrieving most relavent data chunks from db
relavent_text = retrieve_information_pgvector(query, embeddings_tokenizer, embeddings_model, top_k)

In [25]:
relavent_text

[{'in whatever situation you’re in. Concentrate on putting your plan into action when the tantrum happens.\n    • Accept that you can’t control your child’s emotions or behaviour directly. You can only keep your child safe and guide their behaviour so tantrums are less likely to happen in the future.\n    • Accept that it takes time for change to happen. Your child has a lot of growing up to do before tantrums are gone forever. Developing and practising self-regulation skills is a life-long task.\n    • Beware of thinking that your child is doing it on purpose or trying to upset you. Children don’t have tantrums deliberately. They’re stuck in a bad habit or don’t have the skills right now to cope with the situation.\n    • Keep your sense of humour. But don’t laugh at the tantrum – if you do, it might reward your child with attention. It might also upset your child even more if they think'},
 {'calm.\n    • Situations that children just can’t cope with – for example, a toddler might ha

In [26]:
response = generate_llm_response(llm_model, llm_tokenizer, query, relavent_text[0])


    You are an intelligent search engine. You will be provided with some retrieved context, as well as the users query.

    Your job is to understand the request, and answer based on the retrieved context.
    Here is context:
    {'in whatever situation you’re in. Concentrate on putting your plan into action when the tantrum happens.\n    • Accept that you can’t control your child’s emotions or behaviour directly. You can only keep your child safe and guide their behaviour so tantrums are less likely to happen in the future.\n    • Accept that it takes time for change to happen. Your child has a lot of growing up to do before tantrums are gone forever. Developing and practising self-regulation skills is a life-long task.\n    • Beware of thinking that your child is doing it on purpose or trying to upset you. Children don’t have tantrums deliberately. They’re stuck in a bad habit or don’t have the skills right now to cope with the situation.\n    • Keep your sense of humour. But don’

In [27]:
response

'Accept that you can’t control your child’s emotions or behaviour directly. You can only keep your child safe and guide their behaviour so tantrums are less likely to happen in the future'