1. create vector database to store summarised text embeddings <br>
2. retrieve the embeddings of the documents using semantic search and key word search <br>
3. return top k results through re-ranking <br>
Note: vectors stored in the database should have metadata so original non-summarised text can be retrieved 

In [14]:
from FlagEmbedding import BGEM3FlagModel
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from transformers import AutoTokenizer
import numpy as np
import faiss
from scipy.sparse import csr_matrix, vstack
from sklearn.preprocessing import MinMaxScaler
import json
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk
from nltk.tokenize import word_tokenize

In [2]:
# load json file with data
with open("data.json", "r") as f:
    loaded_data = json.load(f)

In [3]:
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=False)
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [4]:
encoded_vectors = [model.encode(summary) for original, summary in loaded_data]
encoded_vectors = np.array(encoded_vectors)

print("Encoded vectors shape:", encoded_vectors.shape)

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Encoded vectors shape: (350,)


In [5]:
# Extract dense embeddings from each encoded vector
dense_embeddings = np.array([vec['dense_vecs'] for vec in encoded_vectors])

# Check the shape of dense embeddings
print(f"Dense embeddings shape: {dense_embeddings.shape}")

# Print sample embedding
sample_index = 0
print(f"Sample dense embedding: {dense_embeddings[sample_index]}")
print(f"Sample embedding shape: {dense_embeddings[sample_index].shape}")

Dense embeddings shape: (350, 1024)
Sample dense embedding: [-0.02495443  0.06316107 -0.0138085  ...  0.02421267 -0.00490005
 -0.0205813 ]
Sample embedding shape: (1024,)


In [None]:
# Normalize dense embeddings
norms = np.linalg.norm(dense_embeddings, axis=1, keepdims=True)
dense_embeddings_normalized = dense_embeddings / norms

dense_embeddings_normalized = dense_embeddings_normalized.astype('float32')

# create FAISS index
index = faiss.IndexFlatIP(dense_embeddings_normalized.shape[1])
index.add(dense_embeddings_normalized)

print(f"FAISS index created with {index.ntotal} embeddings")

FAISS index created with 350 embeddings


Semantic Search

In [9]:
# Example query
query = "What is corporate social responsibility?"

# Generate dense embedding for the query
query_embedding = model.encode([query], batch_size=1)['dense_vecs']
query_embedding = np.array(query_embedding).astype('float32')

# Normalize query embedding
query_norm = np.linalg.norm(query_embedding, axis=1, keepdims=True)
query_embedding_normalized = query_embedding / query_norm

# Perform semantic search
top_k = 5  # No. of results to retrieve
distances, indices = index.search(query_embedding_normalized, top_k)

# Retrieve the top-k documents and their metadata
top_documents = [loaded_data[i] for i in indices[0]]
top_original_texts = [loaded_data[i]['Original'] for i in indices[0]]

print("Top documents:")
for i, doc in enumerate(top_documents):
    print(f"{i + 1}. Original: {doc['Original']}")
    print(f"   Summarized: {doc['Summarized']}")
    print(f"   Similarity Score: {distances[0][i]}")

Top documents:
1. Original: # Text and data highlights  

[Image Description: The acquisition of new solar energy fields reduces carbon emissions.]  

Relationship between loyalty to the brand and aftersales satisfaction
   Summarized: 'This section discusses the relationship between customer loyalty and aftersales satisfaction, but does not provide direct metrics or indicators related to the target list such as resource use, political spending, policy adherence, turnover rate, wastewater, total production waste, regulatory violations, water recycling, manufacturing waste, ethical violations, advertising violations, professional training, social responsibility, occupational injuries, water discharge, ethical standards, anti-corruption compliance, water consumption, misconduct reports, incident rate, waste output, financial sanctions, production energy, climate impact, inclusion, training hours, consumption rate, waste recycling, social initiatives, minority representation, emissions in

Keyword Search

In [16]:
nltk.download('punkt')

def tokenize(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalnum() and token not in ENGLISH_STOP_WORDS]
    return tokens

# Tokenize the original texts
tokenized_corpus = [tokenize(doc['Summarized']) for doc in loaded_data]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ryant\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\ryant/nltk_data'
    - 'c:\\Users\\ryant\\AppData\\Local\\Programs\\Python\\Python39\\nltk_data'
    - 'c:\\Users\\ryant\\AppData\\Local\\Programs\\Python\\Python39\\share\\nltk_data'
    - 'c:\\Users\\ryant\\AppData\\Local\\Programs\\Python\\Python39\\lib\\nltk_data'
    - 'C:\\Users\\ryant\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
query = "corporate social responsibility"
tokenized_query = tokenize(query)
scores = bm25.get_scores(tokenized_query)

top_k = 5
top_indices = np.argsort(scores)[::-1][:top_k]

# Retrieve the top-k documents and their metadata
top_documents = [loaded_data[i] for i in top_indices]
top_scores = [scores[i] for i in top_indices]

print("Top documents (BM25):")
for i, doc in enumerate(top_documents):
    print(f"{i + 1}. Original: {doc['Original']}")
    print(f"   Summarized: {doc['Summarized']}")
    print(f"   BM25 Score: {top_scores[i]}")

Combine Semantic and Keyword Search