In [6]:
import os
import re
import json
import random
import requests
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter



os.makedirs("dataset", exist_ok=True)

# Combined dictionary mapping URLs to filenames (both webpages and PDFs)
url_filename_mapping = {
    "https://www.larsentoubro.com/corporate/about-lt-group/overview/": "Larsen_Toubro_Overview.txt",
    "https://www.larsentoubro.com/corporate/about-lt-group/technology-for-growth/": "Larsen_Toubro_Technology_for_Growth.txt",
    "https://www.larsentoubro.com/corporate/about-lt-group/awards-recognition/": "Larsen_Toubro_Awards_Recognition.txt",
    "https://www.larsentoubro.com/corporate/about-lt-group/leadership/": "Larsen_Toubro_Leadership.txt",
    "https://www.larsentoubro.com/corporate/about-lt-group/facilities/": "Larsen_Toubro_Facilities.txt",
    "https://www.larsentoubro.com/corporate/about-lt-group/experience-centre-mumbai/": "Larsen_Toubro_Experience_Centre_Mumbai.txt",
    "https://www.lntsustainability.com/overview/": "LNT_Sustainability_Overview.txt",
    "https://www.lntsustainability.com/climate-strategy/": "LNT_Climate_Strategy.txt",
    "https://www.lntsustainability.com/environment/": "LNT_Environment.txt",
    "https://www.lntsustainability.com/green-business/": "LNT_Green_Business.txt",
    "https://www.larsentoubro.com/corporate/careers/learning-development/": "LNT_Careers_Learning_Development.txt",
    "https://www.larsentoubro.com/corporate/careers/diversity-equity-inclusion/": "LNT_Careers_Diversity_Equity_Inclusion.txt",
    "https://www.larsentoubro.com/corporate/careers/recruitment-caution/": "LNT_Careers_Recruitment_Caution.txt",
    "https://www.larsentoubro.com/corporate/careers/campus-recruitment/": "LNT_Careers_Campus_Recruitment.txt",
    "https://www.larsentoubro.com/corporate/careers/renew-career-re-entry-for-women/": "LNT_Careers_ReEntry_for_Women.txt",

    # PDF URLs
    "https://annualreview.larsentoubro.com/download/L&T-Annual-Review-2024.pdf": "LT_Annual_Review_2024.pdf",
    "https://annualreview.larsentoubro.com/download/L&T%20Annual%20Review%202023.pdf": "LT_Annual_Review_2023.pdf",
    "https://annualreview.larsentoubro.com/download/Annual%20Review%202022.pdf": "Annual_Review_2022.pdf",
    # "https://annualreview.larsentoubro.com/download/Annual%20Review%202021.pdf": "Annual_Review_2021.pdf",
    # "https://annualreview.larsentoubro.com/download/L&T%20Annual%20Review%202020.pdf": "LT_Annual_Review_2020.pdf"
}


def save_webpages_as_text(urls):
    """Download webpages and save as text files"""
    for url, filename in urls.items():
        try:
            response = requests.get(url)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, "html.parser")
            text = soup.get_text(separator="\n")

            with open(f"dataset/{filename}", "w", encoding="utf-8") as file:
                file.write(text)

            print(f"✅ Saved webpage: {filename}")
        except Exception as e:
            print(f"❌ Error processing {url}: {e}")


def download_and_parse_pdfs(pdf_urls):
    """Download PDFs and extract text into .txt files"""
    for url, filename in pdf_urls.items():
        pdf_filepath = f"dataset/{filename}"
        txt_filename = filename.replace(".pdf", ".txt")
        txt_filepath = f"dataset/{txt_filename}"

        try:
            # Download PDF
            response = requests.get(url, stream=True)
            response.raise_for_status()
            with open(pdf_filepath, "wb") as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
            print(f"✅ Downloaded PDF: {pdf_filepath}")

            # Extract text
            with open(pdf_filepath, "rb") as pdf_file:
                reader = PdfReader(pdf_file)
                extracted_text = "\n".join([page.extract_text() or "" for page in reader.pages])

            with open(txt_filepath, "w", encoding="utf-8") as txt_file:
                txt_file.write(extracted_text)

            print(f"✅ Extracted text saved: {txt_filepath}")

        except Exception as e:
            print(f"❌ Error processing {url}: {e}")


def process_text_files(urls):
    """
    Split URLs into webpage and PDF groups, 
    then send them to their respective functions.
    """
    pdf_urls = {url: filename for url, filename in urls.items() if url.endswith('.pdf')}
    webpage_urls = {url: filename for url, filename in urls.items() if not url.endswith('.pdf')}

    if pdf_urls:
        download_and_parse_pdfs(pdf_urls)
    if webpage_urls:
        save_webpages_as_text(webpage_urls)


# Run processing
process_text_files(url_filename_mapping)

✅ Downloaded PDF: dataset/LT_Annual_Review_2024.pdf
✅ Extracted text saved: dataset/LT_Annual_Review_2024.txt
✅ Downloaded PDF: dataset/LT_Annual_Review_2023.pdf
✅ Extracted text saved: dataset/LT_Annual_Review_2023.txt
✅ Downloaded PDF: dataset/Annual_Review_2022.pdf
✅ Extracted text saved: dataset/Annual_Review_2022.txt
✅ Saved webpage: Larsen_Toubro_Overview.txt
✅ Saved webpage: Larsen_Toubro_Technology_for_Growth.txt
✅ Saved webpage: Larsen_Toubro_Awards_Recognition.txt
✅ Saved webpage: Larsen_Toubro_Leadership.txt
✅ Saved webpage: Larsen_Toubro_Facilities.txt
✅ Saved webpage: Larsen_Toubro_Experience_Centre_Mumbai.txt
✅ Saved webpage: LNT_Sustainability_Overview.txt
✅ Saved webpage: LNT_Climate_Strategy.txt
✅ Saved webpage: LNT_Environment.txt
✅ Saved webpage: LNT_Green_Business.txt
✅ Saved webpage: LNT_Careers_Learning_Development.txt
✅ Saved webpage: LNT_Careers_Diversity_Equity_Inclusion.txt
✅ Saved webpage: LNT_Careers_Recruitment_Caution.txt
✅ Saved webpage: LNT_Careers_Campu

In [7]:
def preprocess_text(text, filename, url):
    """
    Preprocess text by:
    1. Converting to lower case
    2. Removing excessive newlines (\n), keeping max 2 consecutive
    3. Splitting into smaller chunks using Recursive Text Splitter
    4. Storing chunks in a list with metadata
    """

    # Convert text to lowercase
    text = text.lower()

    # Replace 3 or more consecutive newlines with just 2 newlines
    text = re.sub(r'\n{3,}', '\n\n', text)

    # Initialize text splitter for chunking
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100
    )

    # Split text into chunks
    chunks = text_splitter.split_text(text)

    chunks_data = []

    # Add each chunk with its metadata
    for chunk in chunks:
        chunks_data.append({
            "chunk_data": chunk,
            "metadata": {
                "filename": filename,
                "url": url
            }
        }) 
    
    # Print message with processed filename
    print(f"[INFO] Processed and stored chunks for: {filename}")
    return chunks_data


def chunk_data(urls):
    """
    Process multiple text files, generate chunks, 
    and save them into a single JSON file.
    """
    all_chunks_data = []

    # Iterate over each URL and filename pair
    for url, filename in urls.items():
        if filename.endswith('.txt'):
            try:
                # Open and read file content
                with open(f"dataset/{filename}", "r", encoding="utf-8") as file:
                    text = file.read()
                
                # Preprocess and chunk the text
                chunks_data = preprocess_text(text, filename, url)
                all_chunks_data += chunks_data

            except Exception as e:
                print(f"[ERROR] Could not process {filename}: {e}")

    # Save all chunks data into JSON file
    with open("dataset/all_chunks_data.json", "w", encoding="utf-8") as json_file:
        json.dump(all_chunks_data, json_file, ensure_ascii=False, indent=4)
    
    print("[INFO] All chunks processed and saved to 'dataset/all_chunks_data.json'")


# Call the function with the mapping of URLs to filenames
chunk_data(url_filename_mapping)


[INFO] Processed and stored chunks for: Larsen_Toubro_Overview.txt
[INFO] Processed and stored chunks for: Larsen_Toubro_Technology_for_Growth.txt
[INFO] Processed and stored chunks for: Larsen_Toubro_Awards_Recognition.txt
[INFO] Processed and stored chunks for: Larsen_Toubro_Leadership.txt
[INFO] Processed and stored chunks for: Larsen_Toubro_Facilities.txt
[INFO] Processed and stored chunks for: Larsen_Toubro_Experience_Centre_Mumbai.txt
[INFO] Processed and stored chunks for: LNT_Sustainability_Overview.txt
[INFO] Processed and stored chunks for: LNT_Climate_Strategy.txt
[INFO] Processed and stored chunks for: LNT_Environment.txt
[INFO] Processed and stored chunks for: LNT_Green_Business.txt
[INFO] Processed and stored chunks for: LNT_Careers_Learning_Development.txt
[INFO] Processed and stored chunks for: LNT_Careers_Diversity_Equity_Inclusion.txt
[INFO] Processed and stored chunks for: LNT_Careers_Recruitment_Caution.txt
[INFO] Processed and stored chunks for: LNT_Careers_Campus_

In [8]:
from sentence_transformers import SentenceTransformer

# Example sentences
sentences = ["This is an example sentence", "Each sentence is converted"]

# Load the pretrained model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert sentences to embeddings
embeddings = model.encode(sentences)

print(embeddings)


[[ 6.76569417e-02  6.34959564e-02  4.87131029e-02  7.93048963e-02
   3.74480709e-02  2.65277619e-03  3.93749177e-02 -7.09846616e-03
   5.93614057e-02  3.15370224e-02  6.00980818e-02 -5.29052354e-02
   4.06067409e-02 -2.59308442e-02  2.98428535e-02  1.12691487e-03
   7.35148937e-02 -5.03818579e-02 -1.22386597e-01  2.37028375e-02
   2.97265276e-02  4.24768776e-02  2.56337691e-02  1.99514348e-03
  -5.69190495e-02 -2.71597989e-02 -3.29035297e-02  6.60248995e-02
   1.19007193e-01 -4.58791330e-02 -7.26214200e-02 -3.25840116e-02
   5.23414016e-02  4.50553410e-02  8.25305004e-03  3.67024243e-02
  -1.39415674e-02  6.53918684e-02 -2.64272075e-02  2.06393175e-04
  -1.36643527e-02 -3.62811014e-02 -1.95043907e-02 -2.89738420e-02
   3.94270532e-02 -8.84090662e-02  2.62423488e-03  1.36713171e-02
   4.83062454e-02 -3.11566181e-02 -1.17329188e-01 -5.11690676e-02
  -8.85287672e-02 -2.18963325e-02  1.42986281e-02  4.44167778e-02
  -1.34816021e-02  7.43392631e-02  2.66382936e-02 -1.98762678e-02
   1.79191

In [9]:
import json
from sentence_transformers import SentenceTransformer
import chromadb

# Load pre-trained SentenceTransformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize a persistent ChromaDB client and collection
client = chromadb.PersistentClient(path="knowledge_base")
collection = client.get_or_create_collection(name="embedding_collection")


def create_embedding_from_json(json_filename):
    """
    Read the JSON file containing data, generate embeddings using SentenceTransformer,
    and store them in a ChromaDB collection.
    """

    # Load JSON data containing text chunks and metadata
    with open(json_filename, 'r', encoding='utf-8') as file:
        chunk_data = json.load(file)

    # Extract sentences (chunks) and metadata
    sentences = [chunk["chunk_data"] for chunk in chunk_data]
    metadatas = [chunk["metadata"] for chunk in chunk_data]

    # Generate embeddings for all sentences
    embeddings = model.encode(sentences)

    # Store embeddings, metadata, and original sentences in ChromaDB
    for idx, embedding in enumerate(embeddings):
        collection.add(
            ids=[str(idx)],  # unique ID for each chunk
            embeddings=[embedding],
            metadatas=[metadatas[idx]],
            documents=[sentences[idx]]
        )

    # Print status message
    print(f"[INFO] Created embeddings for {len(sentences)} chunks and added them to ChromaDB collection.")


# Run the embedding creation process
create_embedding_from_json('dataset/all_chunks_data.json')


Failed to send telemetry event ClientStartEvent: module 'chromadb' has no attribute '__version__'
Failed to send telemetry event ClientCreateCollectionEvent: module 'chromadb' has no attribute '__version__'
Failed to send telemetry event CollectionAddEvent: module 'chromadb' has no attribute '__version__'


[INFO] Created embeddings for 179 chunks and added them to ChromaDB collection.


In [10]:
num_documents = len(collection.get()['documents'])

print(f"Number of documents in the collection: {num_documents}")

Failed to send telemetry event CollectionGetEvent: module 'chromadb' has no attribute '__version__'


Number of documents in the collection: 194


In [11]:
import numpy as np

def fetch_top_relevant_queries(query, collection, top_k=10):
    """
    Takes a query, encodes it using the SentenceTransformer,
    and fetches the top `top_k` relevant queries from the Chroma collection.
    """
    
    # Encode the query into an embedding
    query_embedding = model.encode([query])

    # Perform a similarity search in the Chroma collection
    results = collection.query(
        query_embeddings=query_embedding,  # The query embedding
        n_results=top_k  # Number of top results to return
    )

    # Process the results
    relevant_queries = []
    for result in results['documents']:
        relevant_queries.append({
            "document": result,  # The chunk or sentence text
            "metadata": results['metadatas'][results['documents'].index(result)],  # Metadata for each chunk
            "score": results['distances'][results['documents'].index(result)]  # Similarity score (distance)
        })

    return relevant_queries


# Define the query
query = "What distinguishes the location of CTEA Madh from the Mysore academy?"

# Fetch top-k relevant queries
top_queries = fetch_top_relevant_queries(query, collection, top_k=10)

# Display the top 10 relevant queries
for i in range(len(top_queries[0]['metadata'])):
    print("\n[RESULT]", i+1)  # Added numbering for readability
    print("Document: " + str(top_queries[0]['document'][i].replace("\n", " ")))  # Replace newlines for cleaner display
    print("Metadata: " + str(top_queries[0]['metadata'][i]))
    print("Similarity Score: " + str(top_queries[0]['score'][i]))
    print("\n")


Failed to send telemetry event CollectionQueryEvent: module 'chromadb' has no attribute '__version__'



[RESULT] 1
Document: How large is the campus area of the academy in Mysore, and what is notable about its location?    What is the setting and unique features of CTEA Madh in Mumbai?  What classroom and practical training facilities are emphasized by these academies?    What kinds of training programs are designed and delivered at these academies?    What are the specific labs offered at CTEA Madh for training purposes?    What types of facilities are available for hands-on training at CTEA Madh?    What kind of infrastructure and environment do the academies offer?    What makes the academies suitable for learning and growth?    How large is the campus area of the academy in Mysore, and what is notable about its location?    What is the setting and unique features of CTEA Madh in Mumbai?  ctea madh is set amidst 31 acres of lush greenery and swaying palm trees against the backdrop of versova creek in mumbai. the academy at mysore is housed in 25 acres in l&t campus, the city known fo

In [12]:
import os
from dotenv import load_dotenv
from openai import AzureOpenAI

# Load environment variables from .env file
load_dotenv()

# Fetch API credentials from environment variables
api_key = os.getenv('AZURE_OPENAI_GPT_4O_API_KEY') 
end_point = os.getenv('AZURE_OPENAI_GPT_4O_ENDPOINT')
api_version = os.getenv('AZURE_OPENAI_GPT_4O_API_VERSION')

# Initialize Azure OpenAI client
client = AzureOpenAI(
    api_key=api_key,
    azure_endpoint=end_point,
    api_version=api_version
)

# Send a chat completion request to GPT-4o
response = client.chat.completions.create(
    model="gpt-4o",  # Azure model deployment name
    messages=[
        {"role": "system", "content": "You are an AI assistant"},
        {"role": "user", "content": "Tell me about the large language models"},
    ],
    max_tokens=256
    # temperature=0.7  # Uncomment to adjust randomness of responses
)

# Print only the model's reply
print(f"[RESPONSE] {response.choices[0].message.content}")


[RESPONSE] Large language models (LLMs) are a type of artificial intelligence (AI) system designed to understand and generate human-like text based on large amounts of input data. These models are trained on extensive datasets, which often consist of text from books, articles, websites, and other publicly available sources. The goal of an LLM is to process and generate natural language that is contextually relevant and coherent.

Below is an overview of key features, concepts, and applications related to large language models:

---

### **Key Features of LLMs**  
1. **Scale and Training**:  
   LLMs are considered "large" because they involve billions (or even trillions) of parameters. Parameters are the numbers within the model that adjust during training to optimize the model's ability to predict text. This scale allows the models to recognize nuanced linguistic patterns and generate high-quality text.

2. **Pretraining and Fine-Tuning**:  
   - **Pretraining**: LLMs are initially tr

In [13]:
import random

# Fix the random seed for reproducibility
random.seed(10)

def get_random_chunk_and_generate_questions(json_file_path):
    """
    Selects a random chunk from the provided JSON file and sends it to OpenAI 
    to generate a list of questions that can be answered using that chunk of text.
    """
    try:
        # Read the JSON file and load the chunks data
        with open(json_file_path, 'r', encoding='utf-8') as json_file:
            chunks_data = json.load(json_file)

        # Select a random chunk from the dataset
        random_chunk = random.choice(chunks_data)
        random_chunk_data = random_chunk['chunk_data']

        # Prepare the prompt for OpenAI
        prompt = (
            f"Please generate a set of questions that could be answered using this information: \n\n"
            f"{random_chunk_data}\n\n"
        )

        # Send the prompt to OpenAI
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are an AI assistant with the ability to generate relevant questions "
                        "based on provided text. Your task is to analyze the text and create "
                        "insightful questions that can be answered using that text. "
                        "Return only the questions in plain text in multiple lines. "
                        "No headings, no titles, no bullet points."
                    )
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            model="gpt-4o",
        )

        # Extract the response text
        questions = chat_completion.choices[0].message.content
        
        return random_chunk, questions

    except Exception as e:
        print(f"[ERROR] {e}")


# Run the function on dataset and print outputs
chunk, llm_output = get_random_chunk_and_generate_questions('dataset/all_chunks_data.json')

# Print the randomly chosen chunk for reference
print("[RANDOM CHUNK SELECTED]\n", chunk)

print("\n[GENERATED QUESTIONS]\n")
print(llm_output)


[RANDOM CHUNK SELECTED]
 {'chunk_data': 'ctea\xa0madh is set amidst 31 acres of lush greenery and swaying palm trees against the backdrop of versova creek in mumbai. the academy at mysore is housed in 25 acres in l&t campus, the city known for its academic and research institutions, heritage buildings and palaces. the academies are at serene and peaceful locations, making them idyllic as well as ideal places to learn and grow.\xa0\xa0\n\nboth the academies have state-of-the-art infrastructure and a learner friendly environment, with robust processes to design and deliver various high quality training programs.\xa0\n\nin addition to excellent classroom facilities, both the academies lay emphasis on providing hands-on training to the participants and have the following labs:\n\nctea madh\xa0\n\ngd&t\n\nmanufacturing excellence\n\nfabrication & cnc \xa0\xa0\n\nfield instrumentation & calibration set-up\n\nembedded & vlsi system\n\nsafety excellence\n\nplc & automation\n\ncad & cam design 

In [14]:
questions = llm_output.split("\n")

qes = questions[0]
augumented_chunk = f"{qes}{chunk['chunk_data']}"
print(augumented_chunk)

Where is CTEA Madh located, and what natural features surround it?  ctea madh is set amidst 31 acres of lush greenery and swaying palm trees against the backdrop of versova creek in mumbai. the academy at mysore is housed in 25 acres in l&t campus, the city known for its academic and research institutions, heritage buildings and palaces. the academies are at serene and peaceful locations, making them idyllic as well as ideal places to learn and grow.  

both the academies have state-of-the-art infrastructure and a learner friendly environment, with robust processes to design and deliver various high quality training programs. 

in addition to excellent classroom facilities, both the academies lay emphasis on providing hands-on training to the participants and have the following labs:

ctea madh 

gd&t

manufacturing excellence

fabrication & cnc   

field instrumentation & calibration set-up

embedded & vlsi system

safety excellence

plc & automation

cad & cam design facility

electr

In [15]:
# List to hold augmented chunks
augumented_chunks = []

# Iterate through each generated question
for question in questions:
    # Create a new chunk by combining the question with the original chunk data
    chunk = {
        "chunk_data": f"{question}\n\n{chunk['chunk_data']}",  # Prepend question before the chunk text
        "metadata": chunk['metadata']  # Keep the original metadata
    }

    # Mark this chunk as augmented in its metadata
    chunk['metadata']['is_augumented'] = True

    # Add the new augmented chunk to the list
    augumented_chunks.append(chunk)

# Final list of augmented chunks
augumented_chunks


[{'chunk_data': 'Where is CTEA Madh located, and what natural features surround it?  \n\nctea\xa0madh is set amidst 31 acres of lush greenery and swaying palm trees against the backdrop of versova creek in mumbai. the academy at mysore is housed in 25 acres in l&t campus, the city known for its academic and research institutions, heritage buildings and palaces. the academies are at serene and peaceful locations, making them idyllic as well as ideal places to learn and grow.\xa0\xa0\n\nboth the academies have state-of-the-art infrastructure and a learner friendly environment, with robust processes to design and deliver various high quality training programs.\xa0\n\nin addition to excellent classroom facilities, both the academies lay emphasis on providing hands-on training to the participants and have the following labs:\n\nctea madh\xa0\n\ngd&t\n\nmanufacturing excellence\n\nfabrication & cnc \xa0\xa0\n\nfield instrumentation & calibration set-up\n\nembedded & vlsi system\n\nsafety exc

In [16]:
import json
from sentence_transformers import SentenceTransformer
import chromadb

# Load the SentenceTransformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def add_augumented_chunks_to_collection(augumented_chunks):
    """
    Takes augmented chunks, generates embeddings using SentenceTransformer,
    and adds them into the existing Chroma collection.
    """

    # Extract sentences (chunk text) and metadata
    sentences = [chunk['chunk_data'] for chunk in augumented_chunks]
    metadatas = [chunk['metadata'] for chunk in augumented_chunks]

    # Generate embeddings for the new chunks
    embeddings = model.encode(sentences)

    # Get the current count of documents in the collection (for unique IDs)
    last_idx = len(collection.get()['documents'])

    # Insert the embeddings and metadata into the Chroma collection
    for idx, embedding in enumerate(embeddings):
        collection.add(
            ids=[str(idx + last_idx)],      # Ensure unique ID by offsetting with last_idx
            embeddings=[embedding],         # Store embedding
            metadatas=[metadatas[idx]],     # Store associated metadata
            documents=[sentences[idx]]      # Store the chunk text
        )

    # Print confirmation
    print(f"[INFO] Created embeddings for {len(sentences)} augmented chunks and added to Chroma collection.")


# Run the function to create embeddings and add to Chroma collection
add_augumented_chunks_to_collection(augumented_chunks)


[INFO] Created embeddings for 10 augmented chunks and added to Chroma collection.


In [17]:
# pip install ipywidgets

In [18]:
from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [19]:
query = "Who is the chairman of L&T?"
top_queries = fetch_top_relevant_queries(query, collection, top_k=10)

# Display the top 10 relevant queries
for i in range(len(top_queries[0]['metadata'])):
    print("Document: " + str(top_queries[0]['document'][i].replace("\n", " ")))
    print("Metadata: " + str(top_queries[0]['metadata'][i]))
    print("Similarity Score: " + str(top_queries[0]['score'][i]))
    print("\n\n")

Document: mint has conferred the award on mr. naik in recognition of his “accomplishments over a long career.” apart from being the l&t group chairman, mr. naik also happens to be the chairman of l&t employee trust which he founded in 2003.  2022  mr. a.m. naik conferred e&y’s lifetime achievement award   l&t group chairman mr a m naik was conferred the  lifetime achievement award by ernst & young, one of the world’s leading management consultancies.     reading out the citation, marico chairman mr harsh mariwala described mr naik as “the nationalist business leader, and one of the best living examples of professionalism, commitment and entrepreneurship”. the citation said that amn built l&t “into a technology powerhouse and one of india’s best-known companies through decades of blood, sweat, and tears”. hailing his commitment to the organisation, the citation said, “mr naik belongs to the rare group of business executives who dedicate their lives to the companies they work at.”
Metada

In [20]:
from sentence_transformers import CrossEncoder

# Load the cross-encoder model
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

query = "Who is the chairman of L&T?"
top_queries = fetch_top_relevant_queries(query, collection, top_k=10)

# Compute cross-encoder scores
for i in range(len(top_queries[0]['metadata'])):
    document_text = str(top_queries[0]['document'][i].replace("\n", " "))
    
    # Compute cross-encoder relevance score
    score = cross_encoder.predict([(query, document_text)])
    
    print("Document: " + document_text)
    print("Similarity Score: " + str(top_queries[0]['score'][i]))
    print("Cross-Encoder Score: " + str(score[0]))  # Displaying the cross-encoder score
    print("\n\n")

Document: mint has conferred the award on mr. naik in recognition of his “accomplishments over a long career.” apart from being the l&t group chairman, mr. naik also happens to be the chairman of l&t employee trust which he founded in 2003.  2022  mr. a.m. naik conferred e&y’s lifetime achievement award   l&t group chairman mr a m naik was conferred the  lifetime achievement award by ernst & young, one of the world’s leading management consultancies.     reading out the citation, marico chairman mr harsh mariwala described mr naik as “the nationalist business leader, and one of the best living examples of professionalism, commitment and entrepreneurship”. the citation said that amn built l&t “into a technology powerhouse and one of india’s best-known companies through decades of blood, sweat, and tears”. hailing his commitment to the organisation, the citation said, “mr naik belongs to the rare group of business executives who dedicate their lives to the companies they work at.”
Simila

In [21]:
from sentence_transformers import CrossEncoder

# Load the cross-encoder model
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

query = "Who is the chairman of L&T?"
top_queries = fetch_top_relevant_queries(query, collection, top_k=10)

# Compute cross-encoder scores and store them with the query data
scored_queries = []
for i in range(len(top_queries[0]['metadata'])):
    document_text = str(top_queries[0]['document'][i].replace("\n", " "))
    
    # Compute cross-encoder relevance score
    score = cross_encoder.predict([(query, document_text)])[0]  
    
    # Append to list with all relevant data
    scored_queries.append({
        "document": document_text,
        "metadata": top_queries[0]['metadata'][i],
        "similarity_score": top_queries[0]['score'][i],
        "cross_encoder_score": score
    })

# Sort the queries based on cross-encoder score in descending order
scored_queries.sort(key=lambda x: x['cross_encoder_score'], reverse=True)

# Display sorted results
for item in scored_queries:
    print("Document: " + item["document"])
    print("Metadata: " + str(item["metadata"]))
    print("Similarity Score: " + str(item["similarity_score"]))
    print("Cross-Encoder Score: " + str(item["cross_encoder_score"]))
    print("\n\n")

Document: mint has conferred the award on mr. naik in recognition of his “accomplishments over a long career.” apart from being the l&t group chairman, mr. naik also happens to be the chairman of l&t employee trust which he founded in 2003.  2022  mr. a.m. naik conferred e&y’s lifetime achievement award   l&t group chairman mr a m naik was conferred the  lifetime achievement award by ernst & young, one of the world’s leading management consultancies.     reading out the citation, marico chairman mr harsh mariwala described mr naik as “the nationalist business leader, and one of the best living examples of professionalism, commitment and entrepreneurship”. the citation said that amn built l&t “into a technology powerhouse and one of india’s best-known companies through decades of blood, sweat, and tears”. hailing his commitment to the organisation, the citation said, “mr naik belongs to the rare group of business executives who dedicate their lives to the companies they work at.”
Metada

In [22]:
# LLM- improvising the actual prompt