In [1]:
import torch
from sentence_transformers import SentenceTransformer, models, losses, InputExample
from datasets import load_dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch

# 🔹 Step 1: Load base transformer model (frozen)
word_embedding_model = models.Transformer("sentence-transformers/all-MiniLM-L6-v2")

# Freeze encoder layers so that only the dense head defined below is trainable
for param in word_embedding_model.auto_model.parameters():
    param.requires_grad = False

# 🔹 Step 2: Add pooling layer (mean over token embeddings)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

# 🔹 Step 3: Add custom dense projection layers (trainable)
# The input width is read from the pooling layer instead of being hard-coded,
# so the head keeps matching the encoder if the base checkpoint is swapped.
dense1 = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=256,
    activation_function=nn.ReLU(),
)
dense2 = models.Dense(in_features=256, out_features=256, activation_function=nn.Identity())

# 🔹 Step 4: Assemble the model (encoder -> pooling -> dense1 -> dense2)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense1, dense2])

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the training split of the RAG dataset
dataset = load_dataset("neural-bridge/rag-dataset-12000", split="train")

# Pair every question with its context as one InputExample (texts=[question, context])
train_samples = []
for question, context in zip(dataset["question"], dataset["context"]):
    train_samples.append(InputExample(texts=[question, context]))

# Shuffled mini-batches of 32 pairs
train_dataloader = DataLoader(train_samples, batch_size=32, shuffle=True)

In [3]:
from torch import device
# Pick the best available accelerator: Apple MPS first (as before), then CUDA, else CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

train_loss = losses.MultipleNegativesRankingLoss(model)

# Persist progress: periodic checkpoints mean a mid-run failure does not discard
# the whole run, and `output_path` writes the final model to the location the
# later "Load trained model" cell reads from.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=20,
    warmup_steps=10,
    show_progress_bar=True,
    optimizer_params={'lr': 2e-5},
    use_amp=False,
    output_path="output/trained_model_path",
    checkpoint_path="output/checkpoints",
    checkpoint_save_steps=500,
    checkpoint_save_total_limit=2,
)

  8%|▊         | 500/6000 [04:13<42:19,  2.17it/s]                   

{'loss': 0.7529, 'grad_norm': 7.112282752990723, 'learning_rate': 1.8363939899833057e-05, 'epoch': 1.67}


 17%|█▋        | 1000/6000 [08:04<38:10,  2.18it/s]

{'loss': 0.3886, 'grad_norm': 6.357995510101318, 'learning_rate': 1.669449081803005e-05, 'epoch': 3.33}


 25%|██▌       | 1500/6000 [11:54<35:42,  2.10it/s]

{'loss': 0.314, 'grad_norm': 3.9594621658325195, 'learning_rate': 1.5025041736227046e-05, 'epoch': 5.0}


 33%|███▎      | 2000/6000 [15:45<30:34,  2.18it/s]

{'loss': 0.2677, 'grad_norm': 6.075601577758789, 'learning_rate': 1.3355592654424041e-05, 'epoch': 6.67}


 42%|████▏     | 2500/6000 [19:40<29:36,  1.97it/s]

{'loss': 0.2484, 'grad_norm': 3.961832284927368, 'learning_rate': 1.1686143572621036e-05, 'epoch': 8.33}


 50%|█████     | 3000/6000 [23:51<25:05,  1.99it/s]

{'loss': 0.2218, 'grad_norm': 5.680347919464111, 'learning_rate': 1.001669449081803e-05, 'epoch': 10.0}


 58%|█████▊    | 3500/6000 [27:44<19:07,  2.18it/s]

{'loss': 0.2082, 'grad_norm': 5.276979923248291, 'learning_rate': 8.347245409015026e-06, 'epoch': 11.67}


 67%|██████▋   | 4000/6000 [31:35<15:37,  2.13it/s]

{'loss': 0.1934, 'grad_norm': 3.377329111099243, 'learning_rate': 6.6777963272120206e-06, 'epoch': 13.33}


 70%|██████▉   | 4170/6000 [33:00<16:55,  1.80it/s]

TypeError: 'NoneType' object is not subscriptable

In [None]:
# Actually invoke save(); a bare `model.save` only references the bound method
# and writes nothing. The path matches the one the reload cell passes to
# SentenceTransformer(...).
model.save("output/trained_model_path")

In [None]:
import faiss
import numpy as np
import pandas as pd
import os

# Reuse same contexts from earlier dataset (materialised as a plain list)
contexts = list(dataset["context"])

# Encode using the trained model.
# Embeddings are L2-normalised: for unit-length stored vectors, ranking by L2
# distance is equivalent to ranking by cosine similarity to the query (the
# query's own norm is a per-query constant), which matches the cosine-based
# MultipleNegativesRankingLoss the model was trained with.
context_embeddings = model.encode(
    contexts,
    convert_to_numpy=True,
    batch_size=64,
    show_progress_bar=True,
    normalize_embeddings=True,
)

# Save mapping to find text later (row position == FAISS vector id)
os.makedirs("retriever_store", exist_ok=True)
pd.DataFrame({"context": contexts}).to_csv("retriever_store/context_mapping.csv", index=False)

# Create and save FAISS index
dimension = context_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(context_embeddings)
faiss.write_index(index, "retriever_store/context_index.faiss")

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import pandas as pd

# Restore the fine-tuned encoder from disk
# (skip this line and reuse the in-memory `model` if it is still available)
model = SentenceTransformer("output/trained_model_path")

# Restore the context text table and the FAISS index built from those contexts
context_df = pd.read_csv("retriever_store/context_mapping.csv")
index = faiss.read_index("retriever_store/context_index.faiss")

In [None]:
def retrieve_top_k(query, k=1):
    """Return the stored contexts nearest to ``query``, best match first.

    Uses the notebook-level ``model`` (encoder), ``index`` (FAISS index) and
    ``context_df`` (table with a ``"context"`` column aligned to the index).

    Args:
        query: Query text to embed and search for.
        k: Maximum number of contexts to return.

    Returns:
        A list of at most ``k`` context strings. Empty when ``k`` is below 1
        or the index holds no vectors.
    """
    # FAISS pads the result with -1 ids when asked for more neighbours than it
    # holds; `iloc[-1]` would then silently return the last row. Clamp k first.
    k = min(k, index.ntotal)
    if k < 1:
        return []

    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k)
    # Defensive filter in case the index still reports missing neighbours.
    return [context_df.iloc[i]["context"] for i in indices[0] if i >= 0]

# Smoke test: fetch the single best context (default k) for one sample question
query = "What is the Berry Export Summary 2028 and what is its purpose?"
top_k_contexts = retrieve_top_k(query)

# Print each hit with its 1-based rank
for i, ctx in enumerate(top_k_contexts, start=1):
    print(f"[{i}] {ctx}\n")

In [None]:
import os

# torch.save does not create parent directories; make sure "model/" exists first.
os.makedirs("model", exist_ok=True)
# NOTE(review): this stores the weights of whatever `model` is currently bound;
# the loading cell restores them into a DPRRetriever — confirm the two match.
torch.save(model.state_dict(), "model/dpr_model.pt")

In [None]:
# Rebuild the retriever, restore its saved weights, and switch it to eval mode.
# NOTE(review): DPRRetriever is not defined in the visible cells — it must be
# defined or imported before this cell runs.
model = DPRRetriever()  # Make sure class definition is present
state_dict = torch.load("model/dpr_model.pt", map_location=device)
model.load_state_dict(state_dict)
model.to(device)
model.eval()

In [None]:
from datasets import load_dataset

# Load the held-out split and keep its context column for indexing
test_data = load_dataset(path="neural-bridge/rag-dataset-12000", split="test")
test_contexts = test_data["context"]

In [None]:
import faiss
import pandas as pd
import numpy as np
import os

# Embed the test contexts in batches of 32 and persist them as a FAISS index
os.makedirs("test_faiss_store", exist_ok=True)

# Gradients are not needed for inference; each batch is moved to CPU as numpy
with torch.no_grad():
    batch_embeddings = [
        model.encode_passage(test_contexts[start:start + 32], device).cpu().numpy()
        for start in range(0, len(test_contexts), 32)
    ]

# Stack the per-batch arrays into one 2-D array (one row per context)
context_embeddings = np.vstack(batch_embeddings)

# Write the row -> text mapping next to the index
mapping_df = pd.DataFrame({"context": test_contexts})
mapping_df.to_csv("test_faiss_store/context_mapping.csv", index=False)

# Build a flat L2 index sized to the embedding width, fill it, and save it
dimension = context_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(context_embeddings)
faiss.write_index(index, "test_faiss_store/context_index.faiss")

In [None]:
# Reload the test-split context table and its FAISS index from disk
context_df = pd.read_csv("test_faiss_store/context_mapping.csv")
index = faiss.read_index("test_faiss_store/context_index.faiss")

def retrieve_top_k(query, k=5):
    """Return the stored test contexts nearest to ``query``, best match first.

    Uses the notebook-level ``model`` (its ``encode_query`` method), ``device``,
    ``index`` (FAISS index) and ``context_df`` (table with a ``"context"``
    column aligned to the index).

    Args:
        query: Query text to embed and search for.
        k: Maximum number of contexts to return.

    Returns:
        A list of at most ``k`` context strings. Empty when ``k`` is below 1
        or the index holds no vectors.
    """
    # FAISS pads the result with -1 ids when asked for more neighbours than it
    # holds; `iloc[-1]` would then silently return the last row. Clamp k first.
    k = min(k, index.ntotal)
    if k < 1:
        return []

    with torch.no_grad():
        query_vec = model.encode_query([query], device).cpu().numpy()
    distances, indices = index.search(query_vec, k)
    # Defensive filter in case the index still reports missing neighbours.
    return [context_df.iloc[i]["context"] for i in indices[0] if i >= 0]

# Example query: print the retrieved test contexts (default k) with 1-based ranks
query = "Who won the 2021 Formula 1 championship?"
results = retrieve_top_k(query)
for i, r in enumerate(results, start=1):
    print(f"[{i}] {r}\n")