In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

In [None]:
# Load the Excel file
excel_path = "ccares_qna_dataset.xlsx"
query_df = pd.read_excel(excel_path)

In [5]:
query_df.head()

Unnamed: 0,Section,Query,Answer
0,"Paragraph 1, Subparagraph 2",To which regions does the Coal Mines Provident...,The scheme applies to all coal mines in West B...
1,"Paragraph 1, Subparagraph 2",Under which act is the Coal Mines Provident Fu...,The scheme is governed by the Coal Mines Provi...
2,"Paragraph 1, Subparagraph 2",What legal provision facilitates the applicati...,The scheme is applied under Sub-section (1) of...
3,"Paragraph 1, Subparagraph 2",Does the scheme include coal mines in partiall...,"Yes, the scheme includes coal mines in partial..."
4,"Paragraph 1, Subparagraph 2",Which provinces are explicitly mentioned as be...,The provinces explicitly mentioned are West Be...


In [6]:
# Initialize the Sentence Transformer model
embedding_model = SentenceTransformer("all-mpnet-base-v2", device="cpu")

In [7]:
# Extract the "Query" column
queries = query_df["Query"].tolist()

In [8]:
queries

['To which regions does the Coal Mines Provident Fund Scheme apply?',
 'Under which act is the Coal Mines Provident Fund Scheme governed?',
 'What legal provision facilitates the application of the Coal Mines Provident Fund Scheme?',
 'Does the scheme include coal mines in partially excluded areas?',
 'Which provinces are explicitly mentioned as being covered under the Coal Mines Provident Fund Scheme?',
 "Which Indian provinces are mentioned as part of the Coal Mines Provident Fund Scheme's applicability?",
 'What does Sub-section (1) of Section 92 of the Government of India Act, 1935, relate to in the context of the scheme?',
 'Is Maharashtra explicitly included under the Coal Mines Provident Fund Scheme?',
 'Which act is cited in the passage as governing the provident fund scheme for coal mines?',
 'Are partially excluded areas included in the applicability of the scheme?',
 'What is the purpose of the Coal Mines Provident Fund and Miscellaneous Provisions Act, 1948, in this context

In [9]:
# Create a DataFrame to hold query and metadata
query_data = pd.DataFrame({
    "query": queries,
    "chunk_char_count": [len(query) for query in queries],
    "chunk_word_count": [len(query.split()) for query in queries],
    "chunk_token_count": [len(query) // 4 for query in queries]  # Approx 4 chars per token
})

In [10]:
query_data

Unnamed: 0,query,chunk_char_count,chunk_word_count,chunk_token_count
0,To which regions does the Coal Mines Provident...,65,11,16
1,Under which act is the Coal Mines Provident Fu...,65,11,16
2,What legal provision facilitates the applicati...,89,13,22
3,Does the scheme include coal mines in partiall...,63,10,15
4,Which provinces are explicitly mentioned as be...,101,15,25
...,...,...,...,...
97,Are dependent parents included in the family o...,78,13,19
98,"Can the definition of ""family"" change over tim...",62,11,15
99,Are adopted children always considered part of...,66,10,16
100,"When were the definitions of ""family"" for male...",75,12,18


In [11]:
# Create embeddings for all queries
print("[INFO] Generating embeddings for queries...")
query_data["embedding"] = list(tqdm(embedding_model.encode(queries, batch_size=32, show_progress_bar=True)))


[INFO] Generating embeddings for queries...


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.11it/s]
100%|██████████| 102/102 [00:00<?, ?it/s]


In [12]:
query_data

Unnamed: 0,query,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,To which regions does the Coal Mines Provident...,65,11,16,"[-0.041524656, -0.049773205, -0.02330386, 0.01..."
1,Under which act is the Coal Mines Provident Fu...,65,11,16,"[-0.043123186, -0.053074803, -0.008909841, 0.0..."
2,What legal provision facilitates the applicati...,89,13,22,"[-0.024936039, -0.049957406, 0.0071940823, 0.0..."
3,Does the scheme include coal mines in partiall...,63,10,15,"[-0.029753016, -0.029991096, -0.017820057, 0.0..."
4,Which provinces are explicitly mentioned as be...,101,15,25,"[-0.034144644, -0.045873385, -0.011126724, 0.0..."
...,...,...,...,...,...
97,Are dependent parents included in the family o...,78,13,19,"[-0.002607187, -0.035035748, -0.012603954, 0.0..."
98,"Can the definition of ""family"" change over tim...",62,11,15,"[-0.0029405584, 0.025271261, -0.024342898, 0.0..."
99,Are adopted children always considered part of...,66,10,16,"[0.003107446, -0.05670504, 0.007287529, 0.0399..."
100,"When were the definitions of ""family"" for male...",75,12,18,"[0.017903931, -0.014754101, -0.005228826, -0.0..."


In [13]:
# Save the DataFrame as csv
output_path = "queries_with_embeddings.csv"
query_data.to_csv(output_path, index=False)
print(f"[INFO] Query embeddings saved to {output_path}")

[INFO] Query embeddings saved to queries_with_embeddings.csv


In [17]:
import torch
from sentence_transformers import util

# Semantic Search Function
def semantic_search(query, query_data, embedding_model, top_k=5):
    # Embed the input query
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    
    # Extract stored embeddings
    stored_embeddings = torch.tensor(query_data["embedding"].tolist(), dtype=torch.float32)
    
    # Compute dot product scores
    scores = util.dot_score(query_embedding, stored_embeddings)[0]
    
    # Get the top-k results
    top_results = torch.topk(scores, k=top_k)
    
    # Display the top results
    print(f"Query: {query}\n")
    for score, idx in zip(top_results.values, top_results.indices):
        # Use .item() to convert idx tensor to integer (as in your notebook)
        result_query = query_data.iloc[idx.item()]["query"]
        print(f"Score: {score:.4f}\nQuery: {result_query}\n")




In [18]:
# Example Usage
input_query = "What is the purpose of the Coal Mines Provident Fund?"
semantic_search(input_query, query_data, embedding_model, top_k=5)

Query: What is the purpose of the Coal Mines Provident Fund?

Score: 0.8600
Query: What is the purpose of the Coal Mines Provident Fund and Miscellaneous Provisions Act, 1948, in this context?

Score: 0.8304
Query: What legal provision facilitates the application of the Coal Mines Provident Fund Scheme?

Score: 0.8144
Query: Under which act is the Coal Mines Provident Fund Scheme governed?

Score: 0.7937
Query: To which regions does the Coal Mines Provident Fund Scheme apply?

Score: 0.7884
Query: Which act is cited in the passage as governing the provident fund scheme for coal mines?

