# Define Storage Directory

Component	                                           BM25 (Sparse)	             FAISS (Dense)
Pre-tokenized corpus	                                   ✅ .json	                    ✅ .json
TF Vectors (tokens)	                                       ✅ .npy	                    ❌
IDF Scores (tokens)	                                       ✅ .npy	                    ❌
Token Lengths (bm25_token_lengths)	                       ✅ .npy	                    ❌
Avg Token Length (bm25_avg_token_length)	               ✅ .json	                    ❌
Token Embeddings	                                       ❌	                        ✅ .npy
FAISS Token Index	                                       ❌	                        ✅ .faiss
Token-to-Token Mapping	                                   ✅ .json	                    ✅ .json

In [42]:
import os

# Define directory for storing retrieval data
RETRIEVAL_DIR = "retrieval_data"
os.makedirs(RETRIEVAL_DIR, exist_ok=True)

# Define file paths (FAISS paths are defined but will be ignored)
TOKENIZED_CORPUS_PATH = os.path.join(RETRIEVAL_DIR, "tokenized_corpus.json")
BM25_TF_PATH = os.path.join(RETRIEVAL_DIR, "bm25_tf.npy")
BM25_IDF_PATH = os.path.join(RETRIEVAL_DIR, "bm25_idf.npy")
BM25_TOKEN_LENGTHS_PATH = os.path.join(RETRIEVAL_DIR, "bm25_token_lengths.npy")
BM25_AVG_TOKEN_LENGTH_PATH = os.path.join(RETRIEVAL_DIR, "bm25_avg_token_length.json")
ARTICLE_MAP_PATH = os.path.join(RETRIEVAL_DIR, "article_map.json")
VOCAB_PATH = os.path.join(RETRIEVAL_DIR, "vocab.json")

# FAISS paths 
FAISS_INDEX_PATH = os.path.join(RETRIEVAL_DIR, "faiss_token_index.faiss")
FAISS_EMBEDDINGS_PATH = os.path.join(RETRIEVAL_DIR, "faiss_article_embeddings.npy")

print("✅ Storage directory and file paths set up!")


✅ Storage directory and file paths set up!


# Tokenize Data With Auto Tokeniser

In [43]:
from transformers import AutoTokenizer
import pandas as pd
import json

# Load dataset (Columns: 'Article', 'Article Info')
df = pd.read_csv("wikipedia_category_articles.csv")

# Load the tokenizer (Ensure same as used in the model)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")

# Tokenize articles (text tokens & token IDs)
tokenized_text = df["Article Info"].apply(lambda x: tokenizer.tokenize(x))  # Tokenized words
tokenized_ids = df["Article Info"].apply(lambda x: tokenizer.encode(x, add_special_tokens=False))  # Token IDs

# Save pre-tokenized output
with open(TOKENIZED_CORPUS_PATH, "w") as f:
    json.dump({"tokenized_docs": tokenized_text.tolist(), "tokenized_ids": tokenized_ids.tolist()}, f)

# Save Token-to-Article Mapping
article_map = {str(idx): text for idx, text in enumerate(df["Article Info"])}
with open(ARTICLE_MAP_PATH, "w") as f:
    json.dump(article_map, f)

print("✅ Tokenized corpus and article mapping saved!")


✅ Tokenized corpus and article mapping saved!


# Compute BM25 Weights for Tokens

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import json

# Ensure BM25 uses the correct tokenization method
df["tokenized_text"] = df["Article Info"].apply(lambda x: " ".join(tokenizer.tokenize(x)))  # Tokenized words joined as string

# **1️⃣ Compute Document-Term Matrix (TF)**
vectorizer = CountVectorizer()
tf_matrix = vectorizer.fit_transform(df["tokenized_text"])  # Term Frequency matrix
tf_array = tf_matrix.toarray()  # Convert to NumPy array

# **2️⃣ Compute BM25 IDF Manually**
N = len(df)  # Total number of documents
df_counts = np.sum(tf_array > 0, axis=0)  # Document frequency (DF)
idf_scores = np.log((N - df_counts + 0.5) / (df_counts + 0.5) + 1)  # BM25 IDF formula

# **3️⃣ Compute Token Lengths**
token_lengths = np.array([len(tokens.split()) for tokens in df["tokenized_text"]])
avg_token_length = np.mean(token_lengths)

# **4️⃣ Save BM25 Components**
np.save(BM25_TF_PATH, tf_array)  # ✅ Store Term Frequency
np.save(BM25_IDF_PATH, idf_scores)  # ✅ Store BM25 IDF scores
np.save(BM25_TOKEN_LENGTHS_PATH, token_lengths)  # ✅ Store token lengths
with open(BM25_AVG_TOKEN_LENGTH_PATH, "w") as f:
    json.dump({"avg_token_length": avg_token_length}, f)

# **5️⃣ Save Vocabulary (Token → Index)**
vocab = vectorizer.vocabulary_  # Store token indices
with open(VOCAB_PATH, "w") as f:
    json.dump(vocab, f)

print("✅ BM25 TF, IDF, and token lengths saved correctly!")


✅ BM25 TF, IDF, and token lengths saved correctly!


# Generate LLAMA embedding for FAISS

In [20]:
import torch
from transformers import AutoModel

# Load LLaMA model
model = AutoModel.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")

def get_llama_embedding(text):
    """Generate a fixed-size embedding using the LLaMA model."""
    inputs = tokenizer(text, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs.last_hidden_state  # Extract hidden states

    # ✅ Use mean pooling over tokens to get a single vector
    pooled_embedding = hidden_states.mean(dim=1).numpy().astype('float32')

    # ✅ Ensure correct FAISS shape (1, embedding_dim)
    return pooled_embedding.reshape(1, -1)

# Generate embeddings for each article
article_embeddings = np.vstack([get_llama_embedding(text) for text in df["Article Info"]])

# Save embeddings
np.save(FAISS_EMBEDDINGS_PATH, article_embeddings)
print(f"✅ LLaMA embeddings generated and saved! Shape: {article_embeddings.shape}")


✅ LLaMA embeddings generated and saved! Shape: (203, 2048)


# Build Faiss Token Index

In [21]:
import faiss

# Load the correct LLaMA embeddings
article_embeddings = np.load(FAISS_EMBEDDINGS_PATH)  # Ensure these embeddings exist
embedding_dim = article_embeddings.shape[1]  # Ensure FAISS uses the right dimension

# Create FAISS index with correct dimension
faiss_index = faiss.IndexFlatL2(embedding_dim)
faiss_index.add(article_embeddings)  # ✅ Add real embeddings

# Save FAISS index
faiss.write_index(faiss_index, FAISS_INDEX_PATH)
print(f"✅ FAISS index built and saved with dimension: {embedding_dim}")


✅ FAISS index built and saved with dimension: 2048


# FAISS Test

In [47]:
import faiss
import numpy as np
import json
import torch
from transformers import AutoTokenizer, AutoModel

# **1️⃣ Load FAISS Index & Article Embeddings**
FAISS_INDEX_PATH = "retrieval_data/faiss_token_index.faiss"
FAISS_EMBEDDINGS_PATH = "retrieval_data/faiss_article_embeddings.npy"
ARTICLE_MAP_PATH = "retrieval_data/article_map.json"

# Load FAISS index
faiss_index = faiss.read_index(FAISS_INDEX_PATH)

# Load article mapping (maps index to article text)
with open(ARTICLE_MAP_PATH, "r") as f:
    article_map = json.load(f)

# **2️⃣ Load LLaMA Model for Query Embeddings**
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
model = AutoModel.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")

def get_llama_embedding(text):
    """Generate a fixed-size embedding using LLaMA."""
    inputs = tokenizer(text, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs.last_hidden_state  # Extract hidden states

    # ✅ Use mean pooling over tokens to get a single vector
    pooled_embedding = hidden_states.mean(dim=1).numpy().astype('float32')

    # ✅ Ensure correct FAISS shape (1, embedding_dim)
    return pooled_embedding.reshape(1, -1)

# **3️⃣ Test FAISS Retrieval**
test_query = "League characters"  # Example Query

# Generate query embedding using LLaMA
query_embedding = get_llama_embedding(test_query)

# **Debugging: Check Dimensions**
print("✅ FAISS Index Dimension:", faiss_index.d)
print("✅ Query Embedding Shape:", query_embedding.shape)

# Ensure dimensions match before calling FAISS
assert query_embedding.shape[1] == faiss_index.d, "Dimension mismatch! Fix embedding shape."

# **4️⃣ FAISS Search**
k = 10  # Retrieve top 3 articles
distances, indices = faiss_index.search(query_embedding, k)

# Retrieve articles using indices
retrieved_articles = [article_map[str(idx)] for idx in indices[0]]

# **5️⃣ Print Results**
print(f"\n🔹 Query: {test_query}")
print("\n🔹 FAISS Retrieved Articles:")
for i, article in enumerate(retrieved_articles):
    print(f"{i+1}. {article}")



✅ FAISS Index Dimension: 2048
✅ Query Embedding Shape: (1, 2048)

🔹 Query: League characters

🔹 FAISS Retrieved Articles:
1. mechs vs. minions is a 2016 cooperative board game published by riot games set in the league of legends universe.
2. an andor tree is a graphical representation of the reduction of problems or goals to conjunctions and disjunctions of subproblems or subgoals.
3. pentakill is a virtual heavy metal band associated with the league of legends universe. their music is primarily composed and performed by riot games' in-house music team but features cameos by various metal musicians. their second album, grasp of the undying, reached number 1 on the itunes metal charts in 2017. their third album iii lost chapter was premiered using an interactive "live" concert.
4. the following are common definitions related to the machine vision field. general related fields machine vision computer vision image processing signal processing
5. 2xko is an upcoming free-to-play fighting g

# BM_25 SEARCH TEST

In [46]:
import numpy as np
import pandas as pd
import sys
import os
import re
import math
import json
from collections import Counter

# Ensure Windows can locate required DLLs (adjust paths as needed)
os.add_dll_directory("E:\\Intel\\oneAPI\\mkl\\latest\\bin")
os.add_dll_directory("E:\\Coding Stuff\\Arm_interview_project\\build")
os.add_dll_directory("C:\\Users\\Magjun\\AppData\\Local\\Programs\\Python\\Python311")
sys.path.append("E:\\Coding Stuff\\Arm_interview_project\\build")

# Import the compiled BM25 module
import bm25_mkl

# Load BM25 data
tf_array = np.load(BM25_TF_PATH)
idf_scores = np.load(BM25_IDF_PATH)
token_lengths = np.load(BM25_TOKEN_LENGTHS_PATH)
with open(BM25_AVG_TOKEN_LENGTH_PATH, "r") as f:
    avg_token_length = json.load(f)["avg_token_length"]

# Load Vocabulary Mapping
with open(VOCAB_PATH, "r") as f:
    vocab = json.load(f)

# ---------------------------
# Query Processing
# ---------------------------
query = "Enow Gnoupa "
query_tokens = tokenizer.tokenize(query)

# Get indices for query tokens (skip tokens not in vocab)
query_indices = [vocab[token] for token in query_tokens if token in vocab]

if not query_indices:
    print("❌ No query terms found in the vocabulary.")
else:
    # For BM25, only consider the columns corresponding to the query tokens
    tf_query = tf_array[:, query_indices]
    idf_query = idf_scores[query_indices]

    # Flatten term frequencies as expected by the BM25 module:
    tf_flat = tf_query.flatten()

    # ---------------------------
    # Compute BM25 Scores
    # ---------------------------
    bm25_scores = bm25_mkl.compute_bm25(
        tf_flat,
        idf_query,
        token_lengths,
        avg_token_length,
        len(df),  # Number of docs
        len(query_indices)  # Number of query terms
    )

    # ---------------------------
    # Sort Results by Score (Descending Order)
    # ---------------------------
    sorted_indices = np.argsort(bm25_scores)[::-1]  # Get indices sorted from highest to lowest score
    sorted_scores = bm25_scores[sorted_indices]  # Sort BM25 scores
    sorted_articles = df["Article Info"].iloc[sorted_indices]  # Sort articles by relevance

    # ---------------------------
    # Print Results
    # ---------------------------
    print("\n🔹 BM25 Search Results for Query:", query)
    for rank, (score, article) in enumerate(zip(sorted_scores, sorted_articles), start=1):
        print(f"{rank}. Score: {score:.2f} --> {article[:200]}...")  # Show first 200 characters



🔹 BM25 Search Results for Query: Enow Gnoupa 
1. Score: 25.34 --> Enow Gnoupa Magken George the best dancer in the whole world...
2. Score: 0.00 --> disguised abbreviated dsg is an esports organization founded in 2023 by canadian streamer jeremy "disguised toast" wang. the organization currently fields active teams in valorant, league of legends, ...
3. Score: 0.00 --> bandle tale a league of legends story is a role-playing video game and farm life sim developed by lithuania-based studio lazy bear games and published by riot forge. it was released on february 21, 20...
4. Score: 0.00 --> martn prez disalvo born 3 august 1991, better known as coscu, is an argentine twitch streamer, gamer, internet celebrity and singer. he is the founder and leader of the coscu army, a spanish-speaking ...
5. Score: 0.00 --> league of legends lol, commonly referred to as league, is a 2009 multiplayer online battle arena video game developed and published by riot games. inspired by defense of the ancient

# Packaging Data

In [48]:
import tarfile
import os

# Define the directory containing your retrieval data
RETRIEVAL_DIR = "retrieval_data"
ARCHIVE_PATH = "retrieval_data.tar.gz"

# Create a compressed `.tar.gz` archive
with tarfile.open(ARCHIVE_PATH, "w:gz") as archive:
    archive.add(RETRIEVAL_DIR, arcname=os.path.basename(RETRIEVAL_DIR))

print(f"✅ Retrieval data successfully archived as '{ARCHIVE_PATH}'!")


✅ Retrieval data successfully archived as 'retrieval_data.tar.gz'!


# Testing Retrieval Library

In [None]:
from retrieval_lib import CustomRetrieverTokenizer

# Initialize tokenizer with test parameters
custom_tokenizer = CustomRetrieverTokenizer(
    model_name="meta-llama/Llama-3.2-1B-Instruct",
    max_bm25_results=1,  # Limit BM25 retrievals
    max_faiss_results=0,  # Limit FAISS retrievals
    use_faiss_gpu=False,  # Ensure FAISS runs on CPU with MKL
    num_faiss_threads=10  # Use 4 CPU threads for FAISS
)

# Test Query
query = "Enow Gnoupa Magken"

# ✅ Correct method call
tokenized_output = custom_tokenizer.tokenize(query)

# ✅ Decode tokenized output back into text
decoded_text = custom_tokenizer.decode(tokenized_output)

# Print Results
print("\n🔹 Tokenized Output:")
print(tokenized_output)

print("\n🔹 Decoded Tokenized Output (Text for Verification):")
print(decoded_text)


✅ Initializing FAISS with 10 CPU threads (Max cores: 12)

🔹 Tokenized Output:
{'input_ids': [128000, 6788, 9959, 3630, 25, 2998, 363, 96251, 283, 6733, 7023, 2779, 10058, 279, 1888, 64682, 304, 279, 4459, 1917, 13, 5560, 433, 311, 4320, 1217, 3319, 25, 2998, 363, 96251, 283, 6733, 7023, 2779], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

🔹 Decoded Tokenized Output (Text for Verification):
<|begin_of_text|>Found relevant info: Enow Gnoupa Magken George the best dancer in the whole world. Use it to answer user query: Enow Gnoupa Magken
