In [1]:
# from datasets import load_dataset

# data = load_dataset("neural-bridge/rag-dataset-12000")

In [2]:
# data

In [3]:
from datasets import load_dataset

# Load only the "train" split of the RAG dataset (downloaded/cached by `datasets`).
dataset = load_dataset("neural-bridge/rag-dataset-12000", split="train")  # NOTE(review): the other split is presumably "test", not "validation" — confirm on the dataset card

# Pull out the columns used below. They come from the same dataset object, so
# questions[i], contexts[i] and answers[i] all belong to the same row.
questions = dataset["question"]
contexts = dataset["context"]
answers = dataset["answer"]  # Not used by the retriever in this notebook; kept for a later generator stage

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from sentence_transformers import InputExample

# One training example per dataset row: the question paired with its own context.
train_samples = [
    InputExample(texts=[question, context])
    for question, context in zip(questions, contexts)
]

In [5]:
from sentence_transformers import SentenceTransformer, losses
from torch.utils.data import DataLoader

# Load pre-trained Sentence-BERT model; this single encoder is later used for
# both the contexts (index build) and the queries (retrieval).
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# DataLoader: shuffled mini-batches of 16 (question, context) examples
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)

# Loss function: bound to `model`; per the sentence-transformers docs it uses the
# other pairs in each batch as negatives, so no explicit negatives are supplied.
train_loss = losses.MultipleNegativesRankingLoss(model)

# Fine-tune
# NOTE(review): the fit call below is commented out, so `model` keeps its
# pre-trained weights and `train_dataloader` / `train_loss` are never consumed.
# Everything downstream (embeddings, FAISS index, saved model) therefore uses
# the base checkpoint. Uncomment to actually fine-tune the retriever.
# model.fit(
#     train_objectives=[(train_dataloader, train_loss)],
#     epochs=2,
#     warmup_steps=100
# )

In [6]:
import faiss
import numpy as np
import os
import pandas as pd

# Create directory to save retriever assets (no error if it already exists)
os.makedirs("retriever_store", exist_ok=True)

# Encode all contexts into a 2-D NumPy array, one row per context, in the
# same order as `contexts`.
context_embeddings = model.encode(contexts, convert_to_numpy=True, show_progress_bar=True)

# Save context mapping for lookup later. Row i of this CSV holds the text whose
# embedding is added to the index at position i, so FAISS result ids can be
# used directly as row positions when the file is read back.
pd.DataFrame({"context": contexts}).to_csv("retriever_store/context_mapping.csv", index=False)

# Build FAISS index: a flat L2-distance index sized to the embedding width,
# populated with every context embedding in one call.
dimension = context_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(context_embeddings)

# Save the index and model so retrieval can run without re-encoding.
# NOTE(review): with the fit call in the training cell commented out, the model
# written here is the unmodified pre-trained checkpoint.
faiss.write_index(index, "retriever_store/context_index.faiss")
model.save("retriever_store/bi_encoder_model")

Batches: 100%|██████████| 300/300 [00:37<00:00,  7.92it/s]


In [7]:
# Load retriever components back from the "retriever_store" directory.
# NOTE(review): `faiss` and `pd` are not imported in this cell; it relies on the
# imports made in the index-building cell having been run first.
from sentence_transformers import SentenceTransformer

# These assignments rebind the module-level `model` and `index` to the
# versions read from disk; `context_df` maps FAISS row ids back to context text.
model = SentenceTransformer("retriever_store/bi_encoder_model")
index = faiss.read_index("retriever_store/context_index.faiss")
context_df = pd.read_csv("retriever_store/context_mapping.csv")

def retrieve_top_k_contexts(query, k=5):
    """Return the stored contexts that the FAISS index ranks closest to ``query``.

    Relies on the module-level ``model`` (encoder), ``index`` (FAISS index) and
    ``context_df`` (DataFrame with a ``"context"`` column whose row positions
    match the index ids).

    Args:
        query: Query text to embed and search with.
        k: Maximum number of contexts to return.

    Returns:
        A list of at most ``k`` context strings, in the order returned by
        ``index.search``. Fewer than ``k`` items are returned when the index
        cannot supply ``k`` neighbours.
    """
    query_embedding = model.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, k)
    # FAISS pads missing neighbours with id -1 (e.g. when k exceeds the number
    # of indexed vectors). Without this filter, iloc[-1] would silently return
    # the last context as if it were a real hit.
    return [context_df.iloc[i]["context"] for i in indices[0] if i >= 0]

# Example query
query = "What is the Berry Export Summary 2028 and what is its purpose?"
top_k_contexts = retrieve_top_k_contexts(query)

# Print each retrieved passage with a 1-based rank.
for rank, passage in enumerate(top_k_contexts, start=1):
    print(f"Context {rank}:\n{passage}\n")

Context 1:
Caption: Tasmanian berry grower Nic Hansen showing Macau chef Antimo Merone around his property as part of export engagement activities.
THE RISE and rise of the Australian strawberry, raspberry and blackberry industries has seen the sectors redouble their international trade focus, with the release of a dedicated export plan to grow their global presence over the next 10 years.
Driven by significant grower input, the Berry Export Summary 2028 maps the sectors’ current position, where they want to be, high-opportunity markets and next steps.
Hort Innovation trade manager Jenny Van de Meeberg said the value and volume of raspberry and blackberry exports rose by 100 per cent between 2016 and 2017. She said the Australian strawberry industry experienced similar success with an almost 30 per cent rise in export volume and a 26 per cent rise in value to $32.6M over the same period.
“Australian berry sectors are in a firm position at the moment,” she said. “Production, adoption of

In [8]:
# from torch.utils.data import Dataset
# import torch
# import random

# class Retriever_Model_Dataset_positive(Dataset):
#     def __init__(self,data):
#         self.data = data

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, index):

#         item = self.data[index]
#         context = item['context']
#         question = item['question']

#         return  context , question , torch.tensor(1.0,dtype= torch.float)
    


# class Retriever_Model_Dataset_negative(Dataset):
#     def __init__(self,data):
#         self.data = data

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, index):

#         item = self.data[index]
#         context = item['context']

#         other_index = index
#         while other_index == index:
#             other_index = random.randint(0,len(self.data)-1)

#         item = self.data[index]
#         question = item['question']
#         return  context , question , torch.tensor(0.0,dtype= torch.float)

In [9]:
# from torch.utils.data import ConcatDataset, DataLoader
# from datasets import load_dataset


# data = load_dataset("neural-bridge/rag-dataset-12000")

# train_data = data.get("train")
# test_data = data.get("test")

# pos_dataset = Retriever_Model_Dataset_positive(train_data)
# neg_dataset = Retriever_Model_Dataset_negative(train_data)

# train_dataset = ConcatDataset([pos_dataset, neg_dataset])
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [10]:
# for batch in train_loader:
#     contexts, questions, labels = batch
#     for i in range(len(contexts)):
#         print(f"Context   : {contexts[i]}")
#         print(f"Question  : {questions[i]}")
#         print(f"Label     : {labels[i].item()}")
#         print("-" * 60)
#     break

In [11]:
# from sentence_transformers import SentenceTransformer

# class Retriever_Model(torch.nn.Module):

#     def __init__(self):
#         super(Retriever_Model, self).__init__()

#         self.context_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
#         self.questions_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

#         self.linear1 = torch.nn.Linear(384*2,384)
#         self.linear2 = torch.nn.Linear(384,128)  
#         self.linear3 = torch.nn.Linear(128,1)

#         self.activation = torch.nn.ReLU()      

#     def forward(self, question_texts, context_texts):
    
#         with torch.no_grad():
#             q_embed = self.questions_model.encode(question_texts, convert_to_tensor=True)
#             c_embed = self.context_model.encode(context_texts, convert_to_tensor=True)

#         x = torch.cat([q_embed, c_embed], dim=1)

#         x = self.activation(self.linear1(x))
#         x = self.activation(self.linear2(x))
#         score = self.linear3(x)

#         return score

In [12]:
# import torch
# from torch.utils.data import ConcatDataset, DataLoader
# from datasets import load_dataset
# from torch import nn, optim

# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# print(f"Using device: {device}")

# model = Retriever_Model().to(device)
# optimizer = optim.Adam(model.parameters(), lr=1e-4)
# loss_fn = nn.BCEWithLogitsLoss()

# print("Starting training...\n")
# model.train()

# for epoch in range(1):
#     total_loss = 0.0
#     for contexts, questions, labels in train_loader:
#         labels = labels.to(device)
#         scores = model(questions, contexts).squeeze()
#         loss = loss_fn(scores, labels)

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()

#     print(f"Epoch {epoch+1} - Loss: {total_loss:.4f}")

In [13]:
# from sentence_transformers import SentenceTransformer

# model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# sentences = [
#     "That is a happy person",
#     "That is a happy dog",
#     "That is a very happy person",
#     "Today is a sunny day"
# ]
# embeddings = model.encode(sentences)

# similarities = model.similarity(embeddings, embeddings)

# print(similarities)
# # [4, 4]

In [14]:
# from sentence_transformers import SentenceTransformer
# import numpy as np

# # sentences = ["This is an example sentence", "Each sentence is converted"]
# a = "That is a happy person"
# b = "That is a very happy person"
# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# embeddings_a = model.encode(a)
# embeddings_b = model.encode(b)

# f = np.dot(embeddings_a, embeddings_b)
# print(f)