In [42]:
#conda install pytorch::faiss-cpu
# %pip install sentence-transformers


In [6]:
import json
import faiss
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

# Step 1: Load Embedding Model (BGE-M3)
embedding_model = SentenceTransformer("BAAI/bge-m3")

# Step 2: Load ESG Documents from JSON File (Convert to Lowercase)
def load_documents(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    summarized_texts = [item["Summarized"].lower() for item in data]  # Convert to lowercase
    original_texts = [item["Original"].lower() for item in data]  # Convert to lowercase

    return summarized_texts, original_texts

# Load the JSON file
json_file_path = "test_summaries.json"
summarized_docs, original_docs = load_documents(json_file_path)

# Step 3: Initialize FAISS Index
D = 1024  # Dimension of BGE-M3 embeddings
index = faiss.IndexHNSWFlat(D, 32)
metadata_store = summarized_docs  # Default to summarized text

# Step 4: Initialize BM25 (Lowercased Corpus)
bm25_corpus = [doc.split() for doc in summarized_docs]  # Tokenized text for BM25
bm25 = BM25Okapi(bm25_corpus)

# Step 5: Add Documents to FAISS (Lowercased Text for Embeddings)
def add_document(text):
    embedding = embedding_model.encode(text.lower(), normalize_embeddings=True)  # Encode lowercase text
    index.add(np.array([embedding], dtype='float32'))

for doc in summarized_docs:
    add_document(doc)

# Step 6: Search Function with Fallback to Original Text
def search(query, top_k=5, alpha=0.7):
    global metadata_store  # Default to summarized text
    results = {}

    query = query.lower()  # Convert query to lowercase
    
    # --- BM25 Search ---
    bm25_scores = bm25.get_scores(query.split())
    ranked_indices_bm25 = np.argsort(bm25_scores)[::-1][:top_k]

    # Check if BM25 results are weak (all scores are low)
    if max(bm25_scores) <= 0.5 :  # Adjust threshold if needed
        print("No strong matches in summarized text, switching to original text...")
        metadata_store = original_docs  # Switch to original texts
        bm25_corpus_original = [doc.split() for doc in original_docs]  # Tokenized text for BM25
        bm25_original = BM25Okapi(bm25_corpus_original)
        bm25_scores = bm25_original.get_scores(query.split())
        ranked_indices_bm25 = np.argsort(bm25_scores)[::-1][:top_k]

    for i, idx in enumerate(ranked_indices_bm25):
        results[idx] = alpha * bm25_scores[idx]  # BM25 score weighted by alpha
    
    # --- FAISS Search ---
    query_embedding = embedding_model.encode(query, normalize_embeddings=True)
    faiss_distances, faiss_indices = index.search(np.array([query_embedding], dtype='float32'), top_k)

    for i, idx in enumerate(faiss_indices[0]):
        if idx in results:
            results[idx] += (1 - alpha) * faiss_distances[0][i]  # Combine FAISS score
        else:
            results[idx] = (1 - alpha) * faiss_distances[0][i]

    # --- Final Reranking ---
    ranked_indices = sorted(results.keys(), key=lambda i: results[i], reverse=True)[:top_k]
    
    # Step 7: Store final results with scores
    final_results = {
        "Query": query,
        "Results": []
    }
    
    for i in ranked_indices:
        final_results["Results"].append({
            "Document": metadata_store[i],
            "BM25 Score": bm25_scores[i] if i in ranked_indices_bm25 else None,
            "FAISS Similarity": faiss_distances[0][np.where(faiss_indices[0] == i)[0][0]] if i in faiss_indices[0] else None,
            "Final Score": results[i]
        })
    
    return final_results

# query = "Carbon neutrality policy"
# results = search(query, top_k=3, alpha=0.7)
# print("Top Results (BM25 + FAISS Reranking with Fallback):", results)


In [23]:
query = "waste water"
results = search(query, top_k=3, alpha=0.7)
print("Top Results (BM25 + FAISS Reranking with Fallback):", results)

Top Results (BM25 + FAISS Reranking with Fallback): {'Query': 'waste water', 'Results': [{'Document': '# ethic culture  \n\n[image description: an aqueduct or dam, is the main component for water distribution.]', 'BM25 Score': 8.368772261875115, 'FAISS Similarity': 0.9085475, 'Final Score': 6.130704839503834}, {'Document': '# text and data highlights  \n\n[image description: the acquisition of new solar energy fields reduces carbon emissions.]  \n\nrelationship between loyalty to the brand and aftersales satisfaction', 'BM25 Score': 4.203001777725989, 'FAISS Similarity': None, 'Final Score': 2.942101244408192}, {'Document': '# wastewater reuction levels \n\n the levels of water are unprecedented with a reduction in wastewater production by 15%', 'BM25 Score': 3.3204248728736463, 'FAISS Similarity': 0.69754326, 'Final Score': 2.5335603900421617}]}


In [22]:
from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity(query, retrieved_docs):
    query_embedding = embedding_model.encode(query, normalize_embeddings=True).reshape(1, -1)
    doc_embeddings = np.array([embedding_model.encode(doc, normalize_embeddings=True) for doc in retrieved_docs])

    similarities = cosine_similarity(query_embedding, doc_embeddings)[0]
    return similarities

query = "waste water"
results = search(query, top_k=3, alpha=0.7)
retrieved_docs = [res["Document"] for res in results["Results"]]

similarity_scores = compute_similarity(query, retrieved_docs)
print("Cosine Similarities:", similarity_scores)


Cosine Similarities: [0.45281696 0.3212503  0.59604496]
