1. create vector database to store summarised text embeddings <br>
2. retrieve the embeddings of the documents using semantic search and key word search <br>
3. return top k results through re-ranking <br>
Note: vectors stored in the database should have metadata so original non-summarised text can be retrieved 

In [1]:
from FlagEmbedding import BGEM3FlagModel
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from transformers import AutoTokenizer
import numpy as np
import faiss
from scipy.sparse import csr_matrix, vstack
from sklearn.preprocessing import MinMaxScaler
import json
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk
from nltk.tokenize import word_tokenize

In [2]:
# download necessary nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ryant\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ryant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ryant\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# load json file with data
with open("data.json", "r") as f:
    loaded_data = json.load(f)

In [4]:
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=False)
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [5]:
encoded_vectors = [model.encode(entry['Summarized']) for entry in loaded_data]
encoded_vectors = np.array(encoded_vectors)

print("Encoded vectors shape:", encoded_vectors.shape)

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Encoded vectors shape: (350,)


In [12]:
# Extract dense embeddings from each encoded vector
dense_embeddings = np.array([vec['dense_vecs'] for vec in encoded_vectors])

# Check the shape of dense embeddings
print(f"Dense embeddings shape: {dense_embeddings.shape}")

# Print sample embedding
print(f"Sample dense embedding: {dense_embeddings[0]}")
print(f"Sample embedding shape: {dense_embeddings[0].shape}")

Dense embeddings shape: (350, 1024)
Sample dense embedding: [-0.05012071 -0.02261945 -0.02164037 ...  0.03180176 -0.02265712
 -0.02943054]
Sample embedding shape: (1024,)


In [13]:
# Normalize dense embeddings
norms = np.linalg.norm(dense_embeddings, axis=1, keepdims=True)
dense_embeddings_normalized = dense_embeddings / norms

dense_embeddings_normalized = dense_embeddings_normalized.astype('float32')
# dense_embeddings = dense_embeddings.astype('float32')

# create FAISS index
index = faiss.IndexFlatIP(dense_embeddings_normalized.shape[1])
index.add(dense_embeddings_normalized)

print(f"FAISS index created with {index.ntotal} embeddings")

FAISS index created with 350 embeddings


In [14]:
# check dense embeddings for any repeated values
unique_embeddings = np.unique(dense_embeddings_normalized, axis=0)
print(f"Number of unique embeddings: {unique_embeddings.shape[0]}")
dense_embeddings_normalized[:10]

Number of unique embeddings: 347


array([[-0.05012071, -0.02261945, -0.02164037, ...,  0.03180176,
        -0.02265712, -0.02943054],
       [-0.0537782 ,  0.01582329, -0.03108685, ...,  0.04655176,
        -0.00060346, -0.01102644],
       [-0.0227494 ,  0.00865165, -0.02520824, ...,  0.04443739,
         0.03599183, -0.01531775],
       ...,
       [-0.03282754, -0.01649688, -0.00893549, ...,  0.06326716,
         0.00532408, -0.03525482],
       [-0.05206808, -0.01915673, -0.05155739, ...,  0.04528717,
         0.00682075, -0.03072113],
       [-0.01994273,  0.03464289, -0.04770726, ...,  0.00985204,
         0.03093441,  0.01254351]], dtype=float32)

Semantic Search

In [None]:
query = "What is corporate social responsibility?"

# Generate dense embedding for the query
query_embedding = model.encode([query])['dense_vecs']
query_embedding = np.array(query_embedding).astype('float32')

# Normalize query embedding
query_norm = np.linalg.norm(query_embedding, axis=1, keepdims=True)
query_embedding_normalized = query_embedding / query_norm

# Semantic search
top_k = 5 
distances, indices = index.search(query_embedding_normalized, top_k)

# Retrieve the top-k documents and their metadata
top_documents = [loaded_data[i] for i in indices[0]]
top_original_texts = [loaded_data[i]['Original'] for i in indices[0]]

print("Top documents:")
for i, doc in enumerate(top_documents):
    print(f"{i + 1}. Original: {doc['Original']}")
    print(f"   Summarized: {doc['Summarized']}")
    print(f"   Similarity Score: {distances[0][i]}")

Top documents:
1. Original: # CORPORATE SOCIAL RESPONSIBILITY REPORT  

[Image Description: A picture of the number 02 with blue lines.]
   Summarized: 'This section reports on corporate social responsibility, including employee turnover rate, number of workplace accidents, injury rates, average training hours per employee, training investment per employee, percentage of minority employees, social related goals and actions taken, number of corruption incidents reported, compliance rate with anti-corruption policies, number of anti-competitive behavior incidents, monetary value of fines imposed, total value of political contributions made, number of marketing compliance incidents or violations reported, and governance related goals and actions taken.'
   Similarity Score: 0.5793351531028748
2. Original: # 940  

[Image Description: A yellow label that says "100" with honeycomb shapes around it.]  

ideas submitters  

incubation participants and coaches
   Summarized: 
   Similarity Sco

Keyword Search

In [56]:
def tokenize(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalnum() and token not in ENGLISH_STOP_WORDS]
    return tokens

# Tokenize the original texts
tokenized_corpus = [tokenize(doc['Summarized']) for doc in loaded_data]

In [57]:
# Create BM25 index
bm25 = BM25Okapi(tokenized_corpus)

In [63]:
query = "corporate social responsibility"
tokenized_query = tokenize(query)
scores = bm25.get_scores(tokenized_query)

top_k = 5
top_indices = np.argsort(scores)[::-1][:top_k]

# Retrieve the top-k documents and their metadata
top_documents = [loaded_data[i] for i in top_indices]
top_scores = [scores[i] for i in top_indices]

print("Top documents (BM25):")
for i, doc in enumerate(top_documents):
    print(f"{i + 1}. Original: {doc['Original']}")
    print(f"   Summarized: {doc['Summarized']}")
    print(f"   BM25 Score: {top_scores[i]}")

Top documents (BM25):
1. Original: # 1.3.1 CSR POLICY  

# “Powered by our diversity, we lead the way the world moves”.  

Stellantis corporate purpose finds its roots in the inextinguishable appetite for mobility of all human beings: our corporate responsibility is to design and offer a range of mobility solutions which are affordable, safe and sustainable.  

In order to protect our ability to provide those solutions in the long run and create shared value for the society in which we operate, guided by our support to the UN Sustainable Development Goals, our Corporate Social Responsibility policy finds its roots in the United Nations Global Compact Principles and embraces:  

■	 a holistic approach of our environmental footprint: our actions strive to bring a tangible impact on climate change and make a wise use of natural resources. Life cycle analysis support our decisions, notably when technology or material related changes impact key features of our products (mass, recyclability,

Combine Semantic and Keyword Search

In [None]:
# semantic search
top_k = 5
distances, indices = index.search(query_embedding, top_k)
top_documents_semantic = [loaded_data[i] for i in indices[0]]
top_scores_semantic = distances[0]

# keyword search
tokenized_query = tokenize(query)
bm25_scores = bm25.get_scores(tokenized_query)
top_indices_bm25 = np.argsort(bm25_scores)[::-1][:top_k]
top_documents_bm25 = [loaded_data[i] for i in top_indices_bm25]
top_scores_bm25 = [bm25_scores[i] for i in top_indices_bm25]

# Normalize scores
semantic_scores_normalized = MinMaxScaler().fit_transform(top_scores_semantic.reshape(-1, 1)).flatten()
bm25_scores_normalized = MinMaxScaler().fit_transform(np.array(top_scores_bm25).reshape(-1, 1)).flatten()

# Combine scores
alpha = 0.5  # Weight for semantic search
combined_scores = alpha * semantic_scores_normalized + (1 - alpha) * bm25_scores_normalized

# Re-rank documents
top_indices_combined = np.argsort(combined_scores)[::-1][:top_k]
top_documents_combined = [loaded_data[i] for i in top_indices_combined]
top_combined_scores = [combined_scores[i] for i in top_indices_combined]

print("Top documents (Combined):")
for i, doc in enumerate(top_documents_combined):
    print(f"{i + 1}. Original: {doc['Original']}")
    print(f"   Summarized: {doc['Summarized']}")
    print(f"   Combined Score: {top_combined_scores[i]}")

Top documents (Combined):
1. Original: # CORPORATE SOCIAL RESPONSIBILITY REPORT  

[Image Description: A picture of the number 02 with blue lines.]
   Summarized: 'This section reports on corporate social responsibility, including employee turnover rate, number of workplace accidents, injury rates, average training hours per employee, training investment per employee, percentage of minority employees, social related goals and actions taken, number of corruption incidents reported, compliance rate with anti-corruption policies, number of anti-competitive behavior incidents, monetary value of fines imposed, total value of political contributions made, number of marketing compliance incidents or violations reported, and governance related goals and actions taken.'
   Combined Score: 0.9999999999999998
2. Original: # Powered By Our Diversity, We Lead The Way The World Moves  

# 2023 CORPORATE SOCIAL RESPONSIBILITY REPORT  

REFERENCE FOR READING THE CSR REPOR 3  

ROM THE CHAIRMAN & THE C