In [0]:
!pip install faiss-cpu openpyxl

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 27.5/27.5 MB 67.5 MB/s eta 0:00:00
Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 250.9/250.9 kB 57.8 MB/s eta 0:00:00
Collecting numpy<3.0,>=1.25.0
  Downloading numpy-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 16.3/16.3 MB 94.3 MB/s eta 0:00:00
Collecting et-xmlfile
  Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: numpy, et-xmlfile, openpyxl, faiss-cpu
  Attempting uninstall: numpy
    Found existing installation: numpy 1.23.5
    Not uninstalling numpy at /databricks/python3/lib/python3.10/site-packages, outside en

In [0]:
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Configuration
KB_FILE = r"Knowledge_base.xlsx"  # Path to the KB file
EMBEDDING_MODEL = "BAAI/bge-base-en-v1.5"  # Pre-trained model for embedding
MODEL_NAME = "microsoft/phi-3-mini-4k-Instruct"  # Hugging Face model name
TOP_K = 3  # Number of similar matches to retrieve

# Initialize Embedding Model and Hugging Face Model
embedding_model = SentenceTransformer(EMBEDDING_MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cpu", #cuda
    torch_dtype=torch.float32,
    trust_remote_code=True
)
# Step 1: Load Knowledge Base
def load_knowledge_base(kb_file):
    kb_df = pd.read_excel(kb_file)
    print(kb_df.head())
    embeddings = kb_df['embedding'].apply(lambda x: json.loads(x)).tolist()
    sms_data = kb_df[['template_id','template_text','Extracted_key_value_pair']]
    return np.array(embeddings), sms_data

# Step 2: Compute Embedding for New SMS
def compute_embedding(sms_text):
    return embedding_model.encode([sms_text])[0]

# Step 3: Build and Query FAISS Index
def build_faiss_index(embeddings):
    dimension = len(embeddings[0])
    try:
        embedding_dim = embeddings.shape[1]
    except IndexError:
        raise
    try:
        embeddings = embeddings.reshape(embeddings.shape[0], embeddings.shape[2])
        # Build FAISS index
        embedding_dim = embeddings.shape[1]
        index = faiss.IndexFlatL2(embedding_dim)

    except Exception as e:
    
        raise

    try:
        index.add(embeddings)
        
    except Exception as e: 
        raise
    return index

def query_faiss_index(index, embeddings, query_embedding, top_k):
    distances, indices = index.search(np.array([query_embedding]), top_k)
    return indices[0], distances[0]

# Step 4: Construct Few-Shot Prompt
def construct_prompt(retrieved_data, new_sms):
    context = ""
    for i, row in retrieved_data.iterrows():
        context += f"SMS: {row['template_text']} -> Extracted: {row['Extracted_key_value_pair']}\n"
    prompt = f"""
    Context:
    {context}
    New SMS: "{new_sms}"
    Extract the key-value pairs from the given input based on the context and return only the key-value pairs in output 
    """
    return prompt.strip()
#Extract and output the structured key-value pair.
# Step 5: Pass Prompt to Hugging Face Model
def query_huggingface_model(prompt):
    inputs = tokenizer(prompt, return_tensors="pt")#.to("cpu")
    output = model.generate(
        **inputs,
        max_new_tokens=180,
        temperature=0.3,
        do_sample=False
    ) #num_beams=3
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response.strip()

# Step 6: Update Knowledge Base
def update_knowledge_base(kb_file, new_template_id, new_embedding, new_sms, extracted_kv):
    new_entry = {
        "template_ID": new_template_id,
        "Embedding": json.dumps(new_embedding.tolist()),
        "Original SMS": new_sms,
        "Extracted Key-Value Pair": json.dumps(extracted_kv)
    }
    kb_df = pd.read_excel(kb_file)
    kb_df = kb_df.append(new_entry, ignore_index=True)
    kb_df.to_excel(kb_file, index=False)

import re

def clean_extracted_text(input_text):
    # Regular expression to find and capture key-value pairs from the Extracted part
    match = re.search(r'Extracted:\s*(\{.*\})', input_text)

    if match:
        # Extract the key-value pairs (the JSON object)
        extracted_json = match.group(1)
        return extracted_json
    else:
        return "No key-value pairs found."



# Main Pipeline
def rag_pipeline(new_sms):
    # Load Knowledge Base
    embeddings, sms_data = load_knowledge_base(KB_FILE)
    # Compute Embedding for New SMS
    new_sms_embedding = compute_embedding(new_sms)
    
    # Build FAISS Index
    faiss_index = build_faiss_index(embeddings)
    
    # Query FAISS for Similar SMS
    indices, _ = query_faiss_index(faiss_index, embeddings, new_sms_embedding, TOP_K)
    retrieved_data = sms_data.iloc[indices]
    
    # Construct Few-Shot Prompt
    prompt = construct_prompt(retrieved_data, new_sms)
    #print("Generated Prompt:", prompt)
    
    # Query Hugging Face Model
    llm_output = query_huggingface_model(prompt)
    #print("LLM Output:", llm_output)
    llm_output=clean_extracted_text(llm_output)
    print("LLM Output:-----", llm_output)



# Example Usage
if __name__ == "__main__":
    new_sms = "You're covered!<NAME> we have received payment of Rs 506.0 for your car insurance GJ05JM0000. Download the ACKO app now<NAME> manage claims and renewals on the go <URL>"
    rag_pipeline(new_sms)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  sender_id  ...                                          embedding
0    ACKOGI  ...  [[0.033985212445259094, -0.025274984538555145,...
1    ACKOGI  ...  [[0.022043786942958832, 0.009573428891599178, ...
2    ACKOGI  ...  [[0.005984822288155556, -0.02892979420721531, ...
3    ACKOGI  ...  [[-0.04338106885552406, -0.025429215282201767,...
4    ACKOGI  ...  [[0.03569294884800911, -0.05052488297224045, 0...

[5 rows x 5 columns]
LLM Output:----- {"insurance_number":'GJ05JM8618","insurance_amount":"Rs 5356.0"}
