In [4]:
import pandas as pd
import numpy as np
from datasets import Dataset, load_from_disk
import faiss
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration

In [6]:
# Load the podcast transcripts; the code below reads the 'text' column of this frame.
transcripts_csv = 'podcastdata_dataset.csv'
df = pd.read_csv(transcripts_csv)
def preprocess_tolower(text):
    """Return a lower-cased copy of ``text``.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with every cased character converted to lower case.
    """
    return text.lower()

# Normalise every transcript to lower case before it is split up.
df['text'] = df['text'].apply(preprocess_tolower)

# Cut each transcript into consecutive, non-overlapping slices of at most
# `chunk_chars` characters. The split is purely character based, so a slice
# may begin or end in the middle of a word.
chunk_chars = 1000
passages = []
for transcript in df['text']:
    for start in range(0, len(transcript), chunk_chars):
        passages.append(transcript[start:start + chunk_chars])

# Persist the passages as a single-column, tab-separated file.
passages_df = pd.DataFrame(passages, columns=['Passages'])
passages_df.to_csv('passages.tsv', sep='\t', index=False)

In [7]:

def _title_from_text(example):
    """Return a ``title`` field that is a copy of the example's ``text``."""
    return {"title": example["text"]}


# Wrap the passages in a Dataset with a "text" column, then add a "title"
# column holding the same string as "text" for every row.
dataset = Dataset.from_dict({"text": passages}).map(_title_from_text)

# Placeholder embedding function for the demo: it returns random vectors, not
# real text encodings. A seeded generator is used so that a fresh kernel
# ("Restart & Run All") always produces the same embeddings and FAISS index.
_EMBED_RNG = np.random.default_rng(0)


def embed(texts, dim=768, rng=None):
    """Return one random float32 vector per entry of ``texts``.

    Args:
        texts: A sized collection of passages; only its length is used.
        dim: Width of each vector. Defaults to 768, the value originally
            hard-coded here (presumably the retriever's embedding size --
            TODO confirm against the model config).
        rng: Optional ``numpy.random.Generator`` to draw from. When omitted,
            a module-level generator seeded with 0 is used.

    Returns:
        A ``float32`` array of shape ``(len(texts), dim)`` with values drawn
        uniformly from ``[0, 1)``.
    """
    generator = _EMBED_RNG if rng is None else rng
    return generator.random((len(texts), dim), dtype=np.float32)


# Locations where the passages dataset and its FAISS index are persisted.
dataset_path = "passages_dataset"
index_path = "passages_index.faiss"


def _with_embeddings(examples):
    """Return the "embeddings" column for one batch of passages."""
    return {"embeddings": embed(examples["text"])}


# Compute the embeddings batch by batch and store them as a new column.
dataset = dataset.map(_with_embeddings, batched=True)

# The dataset is written to disk *before* the index is attached.
# NOTE(review): presumably because a dataset with an attached index cannot
# be saved with save_to_disk -- confirm against the datasets documentation.
dataset.save_to_disk(dataset_path)

# Build a FAISS index over the embeddings column and write it to its own file.
dataset.add_faiss_index(column="embeddings", index_name="embeddings")
faiss_index = dataset.get_index("embeddings")
faiss_index.save(index_path)

# Reload both artifacts from disk and re-attach the saved index.
dataset = load_from_disk(dataset_path)
dataset.load_faiss_index(index_name="embeddings", file=index_path)

Map: 100%|██████████| 38000/38000 [00:00<00:00, 45051.51 examples/s]
Map: 100%|██████████| 38000/38000 [00:00<00:00, 100841.20 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 38000/38000 [00:00<00:00, 52284.79 examples/s]
100%|██████████| 38/38 [00:00<00:00, 428.16it/s]


In [None]:

# The tokenizer and the retriever are both loaded from the same checkpoint.
rag_checkpoint = 'facebook/rag-sequence-nq'

tokenizer = RagTokenizer.from_pretrained(rag_checkpoint)

# index_name="custom" is passed together with the in-memory `dataset`
# (including its attached FAISS index) as the retriever's indexed dataset.
retriever = RagRetriever.from_pretrained(
    rag_checkpoint,
    indexed_dataset=dataset,
    index_name="custom",
)


