# Define Storage Directory

| Component | BM25 (Sparse) | FAISS (Dense) |
|---|---|---|
| Pre-tokenized corpus | ✅ .json | ✅ .json |
| TF Vectors (tokens) | ✅ .npy | ❌ |
| IDF Scores (tokens) | ✅ .npy | ❌ |
| Token Lengths (bm25_token_lengths) | ✅ .npy | ❌ |
| Avg Token Length (bm25_avg_token_length) | ✅ .json | ❌ |
| Token Embeddings | ❌ | ✅ .npy |
| FAISS Token Index | ❌ | ✅ .faiss |
| Index-to-Token Mapping | ✅ .json | ✅ .json |

In [1]:
import os

# Folder that holds every persisted retrieval artifact; created on first run.
storage_dir = "retrieval_data"
os.makedirs(storage_dir, exist_ok=True)


def _artifact_path(filename):
    """Return the location of `filename` inside the storage folder."""
    return os.path.join(storage_dir, filename)


# Sparse (BM25) artifacts
BM25_TF_PATH = _artifact_path("bm25_tf.npy")
BM25_IDF_PATH = _artifact_path("bm25_idf.npy")
BM25_TOKEN_LENGTHS_PATH = _artifact_path("bm25_token_lengths.npy")
BM25_AVG_TOKEN_LENGTH_PATH = _artifact_path("bm25_avg_token_length.json")

# Pre-tokenized corpus
TOKENIZED_CORPUS_PATH = _artifact_path("tokenized_corpus.json")

# Dense (FAISS) artifacts
FAISS_EMBEDDINGS_PATH = _artifact_path("faiss_token_embeddings.npy")
FAISS_INDEX_PATH = _artifact_path("faiss_token_index.faiss")
FAISS_TOKEN_MAP_PATH = _artifact_path("faiss_token_map.json")

# Tokenize Data with AutoTokenizer

In [4]:
from transformers import AutoTokenizer
import pandas as pd
import json

# Load your dataset
df = pd.read_csv("wikipedia_category_articles.csv")  # Columns: ['Article', 'Article Info']

# Load a tokenizer (Use the same model tokenizer for inference)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")  # Change to match your model

# Empty CSV cells are read as NaN (a float), which the tokenizer cannot
# process. Map them to "" instead of dropping the row so that document
# positions stay aligned with `df`.
articles = df["Article"].fillna("").astype(str)

# Tokenize each article once, then derive the IDs from those same tokens
# rather than running the tokenizer over the text a second time.
tokenized_docs = articles.apply(tokenizer.tokenize)  # Tokenized text
tokenized_ids = tokenized_docs.apply(tokenizer.convert_tokens_to_ids)  # Token IDs (no special tokens)

# Save pre-tokenized output
with open(TOKENIZED_CORPUS_PATH, "w", encoding="utf-8") as f:
    json.dump({"tokenized_docs": tokenized_docs.tolist(), "tokenized_ids": tokenized_ids.tolist()}, f)

print("✅ Tokenized corpus saved!")


✅ Tokenized corpus saved!


# Compute BM25 Weights for Tokens

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import json

# Build the sparse retrieval statistics (term frequencies, IDF, document
# lengths) directly from the subword tokens in `tokenized_docs`.


def _identity_analyzer(tokens):
    """Return an already-tokenized document unchanged.

    Used as the vectorizer's `analyzer` so the tokenizer's subword tokens are
    counted verbatim instead of being re-split from a joined string.
    """
    return tokens


# Fit on the token lists themselves. Joining the tokens with spaces and using
# the default analyzer would lowercase them and drop single-character and
# punctuation tokens, giving a vocabulary that no longer matches the
# tokenizer's tokens.
vectorizer = TfidfVectorizer(analyzer=_identity_analyzer, use_idf=True, norm=None)
vectorizer.fit(tokenized_docs)

# Raw term counts over the same vocabulary (same column order). With
# use_idf=True the fitted vectorizer's transform returns TF * IDF, which is
# not what the TF file should contain.
tf_vectorizer = TfidfVectorizer(
    analyzer=_identity_analyzer,
    vocabulary=vectorizer.vocabulary_,
    use_idf=False,
    norm=None,
)
tf_matrix = tf_vectorizer.fit_transform(tokenized_docs)

# Store term frequencies
# NOTE: densified to keep the existing .npy format; this needs
# n_docs x vocab_size floats in memory.
np.save(BM25_TF_PATH, tf_matrix.toarray())

# Store IDF scores for each token
# NOTE(review): these are scikit-learn's smoothed IDF values,
# ln((1 + n) / (1 + df)) + 1, not the classic BM25 IDF. Kept as-is so existing
# consumers of this file see the same kind of values.
idf_scores = vectorizer.idf_
np.save(BM25_IDF_PATH, idf_scores)

# Store the token -> column mapping; without it the columns of the TF and IDF
# arrays cannot be interpreted once the kernel is gone.
BM25_VOCAB_PATH = os.path.join(storage_dir, "bm25_vocab.json")
with open(BM25_VOCAB_PATH, "w", encoding="utf-8") as f:
    json.dump({token: int(column) for token, column in vectorizer.vocabulary_.items()}, f)

# Store token lengths (number of tokens per document)
token_lengths = np.array([len(tokens) for tokens in tokenized_docs])
np.save(BM25_TOKEN_LENGTHS_PATH, token_lengths)

# Compute and store average token length (as a plain float for JSON)
avg_token_length = float(np.mean(token_lengths))
with open(BM25_AVG_TOKEN_LENGTH_PATH, "w", encoding="utf-8") as f:
    json.dump({"avg_token_length": avg_token_length}, f)

print("✅ BM25 token weights computed and saved!")


✅ BM25 token weights computed and saved!


# Compute FAISS Token Embeddings

In [9]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load embedding model (e.g., MiniLM or your custom model)
model = SentenceTransformer("all-MiniLM-L6-v2")

# Collect every distinct token once, in first-seen order. Flattening all
# occurrences would encode the same token repeatedly and fill the index with
# identical vectors, so a nearest-neighbour query would return copies of one
# token. Row i of the embeddings still corresponds to all_tokens[i].
all_tokens = list(dict.fromkeys(token for doc in tokenized_docs for token in doc))

# Generate one embedding per distinct token
# NOTE(review): the inputs are the tokenizer's raw subword strings, which the
# sentence model re-tokenizes with its own vocabulary — confirm this is the
# intended input.
token_embeddings = model.encode(all_tokens, convert_to_numpy=True)

# Save token embeddings
np.save(FAISS_EMBEDDINGS_PATH, token_embeddings)

print("✅ FAISS token embeddings computed and saved!")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ FAISS token embeddings computed and saved!


# Build FAISS Token Index

In [10]:
import faiss
import json

# Flat L2 index whose width matches the embedding vectors
dimension = token_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)

# Row i of the embeddings becomes entry i of the index
faiss_index.add(token_embeddings)

# Persist the index to disk
faiss.write_index(faiss_index, FAISS_INDEX_PATH)

# Persist the position -> token lookup so index hits can be turned back into
# tokens (JSON stores the integer positions as string keys)
token_map = dict(enumerate(all_tokens))
with open(FAISS_TOKEN_MAP_PATH, "w") as map_file:
    map_file.write(json.dumps(token_map))

print("✅ FAISS token index created and saved!")

✅ FAISS token index created and saved!
