### Install Packages

In [None]:
# Mount Google Drive at /content/drive; later cells read their input files
# from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Installs the package behind the `faiss` module imported in the FAISS Index section.
!pip install faiss-cpu



In [None]:
# Installs the package that provides the `Dataset` class imported in the Ask Questions section.
!pip install datasets



In [None]:
# Installs the package that provides the DPR and RAG tokenizer/model classes imported below.
!pip install transformers



# Dense Passage Retrieval

In [None]:
import pandas as pd

# Read the knowledge text file. The encoding is stated explicitly so the
# document's non-ASCII punctuation decodes the same way on every platform.
with open('/content/drive/MyDrive/rag-knowledge-doc.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Every line starting with '#' is a heading that opens a new article; the
# lines up to the next heading form that article's body. One body is recorded
# per heading -- even an empty one -- so `titles[k]` always pairs with
# `articles[k]`. Appending a body only when it happened to be non-empty would
# let two adjacent headings, a heading at the very end of the file, or text
# ahead of the first heading shift every later title/article pairing.
titles = []
bodies = []

for line in lines:
    if line.startswith('#'):
        titles.append(line.strip('#').strip())
        bodies.append([])
    elif bodies:
        bodies[-1].append(line)
    # Lines that appear before the first heading have no title and are dropped.

# Join each body back into one string and trim the surrounding whitespace.
articles = [''.join(body).strip() for body in bodies]

# Sanity check: exactly one article body per title.
assert len(articles) == len(titles)




### Corpus Chunking

In [None]:
# Number of whitespace-separated words that go into each passage.
WORDS_PER_PASSAGE = 100

print('Before splitting, {:,} articles.\n'.format(len(titles)))

# Parallel lists: passage_titles[k] is the article title of passages[k].
passage_titles = []
passages = []

print('Splitting...')

for title, article in zip(titles, articles):

    # Skip over any without contents.
    if len(article) == 0:
        print('Skipping empty article:', title)
        continue

    # Split the text on whitespace.
    words = article.split()

    # Walk the words one passage at a time. A dedicated loop variable is used
    # here so the passage offset never shadows the article loop's variables.
    for start in range(0, len(words), WORDS_PER_PASSAGE):

        # Python slices stop at the end of the list, so the last passage of an
        # article may be shorter. Each slice holds at least one word, which
        # means the joined passage is never empty.
        chunk = " ".join(words[start : start + WORDS_PER_PASSAGE])

        # Store the chunk. Every chunk in the article uses the article title.
        passage_titles.append(title)
        passages.append(chunk)

print('  Done.\n')

chunked_corpus = {'title': passage_titles, 'text': passages}

print('After splitting, {:,} "passages".'.format(len(chunked_corpus['title'])))

Before splitting, 30 articles.

Splitting...
Skipping empty article: About Pan Card
Skipping empty article: PAN Card Application Process
Skipping empty article: New Pan Card
Skipping empty article: Updation/Correction in the PAN Card
Skipping empty article: Form 49AA
  Done.

After splitting, 42 "passages".


### Document Tokenization


In [None]:
from transformers import DPRContextEncoderTokenizerFast

# Load the fast tokenizer for the DPR context (passage) encoder checkpoint.
# Loading prints a warning that the checkpoint records a different tokenizer
# class ('DPRQuestionEncoderTokenizer') than the one requested here.
ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained("facebook/dpr-ctx_encoder-multiset-base")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.


In [None]:

num_passages = len(chunked_corpus['title'])

print(f'Tokenizing {num_passages:,} passages for DPR...')

# Encode every (title, passage) pair in a single call. Truncation is enabled,
# padding goes up to the longest pair, and the result comes back as PyTorch
# tensors.
outputs = ctx_tokenizer(
    chunked_corpus["title"],
    chunked_corpus["text"],
    truncation=True,
    padding="longest",
    return_tensors="pt",
)

print('  DONE.')

# Keep only the token ids; this tensor covers the entire corpus.
input_ids = outputs["input_ids"]

Tokenizing 42 passages for DPR...
  DONE.


In [None]:
# Shape of the encoded corpus tensor. In the recorded run it is
# torch.Size([42, 271]): 42 passages, each padded to the longest
# title + passage pair (271 tokens).
print(input_ids.shape)

torch.Size([42, 271])


### Document Encoding

In [None]:
import torch

# Select the device that the encoders and their inputs are moved to.
# `device` is assigned on both branches: without the CPU fallback the name
# would be undefined on a machine without CUDA, and every later `.to(device)`
# call would fail with a NameError.
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    # Fall back to the CPU so the rest of the notebook still runs.
    device = torch.device("cpu")

    print('No GPU available!')


There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
# Import the DPR encoder, the model that will generate the embeddings.
from transformers import DPRContextEncoder

# Load the pretrained DPR context encoder and place it on the selected device
# in one step.
ctx_encoder = DPRContextEncoder.from_pretrained(
    "facebook/dpr-ctx_encoder-multiset-base"
).to(device)


In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Render a duration given in seconds as an h:mm:ss string.

    The value is rounded to the nearest whole second before formatting; the
    text itself is produced by `datetime.timedelta`.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


In [None]:
import time
import math

# Only forward passes are run from here on, so autograd is switched off
# globally.
torch.set_grad_enabled(False)

# Track elapsed time for progress updates.
t0 = time.time()

# Number of passages sent through the encoder at once.
batch_size = 16

# Get the number of passages in the dataset
num_passages = input_ids.size()[0]

# Calculate the number of batches in the dataset.
num_batches = math.ceil(num_passages / batch_size)

# As we embed the passages in batches, accumulate them in this list.
embeds_batches = []

print('Generating embeddings for {:,} passages...'.format(num_passages))

# `step` is the batch number (for progress updates); `start` is the row of the
# first passage in the batch.
for step, start in enumerate(range(0, num_passages, batch_size)):

    # Progress update every 100 batches.
    if step % 100 == 0 and not step == 0:
        # Elapsed time as an h:mm:ss string.
        elapsed = format_time(time.time() - t0)

        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, num_batches, elapsed))

    # The slice width must be `batch_size`, the same value as the loop stride.
    # A literal here would silently skip or duplicate passages as soon as
    # `batch_size` is changed.
    batch_ids = input_ids[start:start + batch_size, :].to(device)

    # Run the encoder!
    # NOTE(review): no attention_mask is passed; confirm the encoder derives
    # one from the padding token so padded positions are ignored.
    encoder_outputs = ctx_encoder(
        batch_ids,
        return_dict=True
    )

    # The embeddings are in the field "pooler_output"; move them to host
    # memory as a NumPy array before storing.
    batch_embeddings = encoder_outputs["pooler_output"].detach().cpu().numpy()
    embeds_batches.append(batch_embeddings)

print('  DONE.')

Generating embeddings for 42 passages...
  DONE.


In [None]:
import numpy as np

# Stitch the per-batch arrays together along the first axis (the default for
# np.concatenate) to get one row per passage.
embeddings = np.concatenate(embeds_batches)

print('Size of dataset embeddings:', embeddings.shape)

Size of dataset embeddings: (42, 768)


### FAISS Index

In [None]:
import faiss

# "The dimension of the embeddings to pass to the HNSW Faiss index."
dim = 768  # matches the width of the embeddings matrix printed above: (42, 768)

# "The number of bi-directional links created for every new element during the
# HNSW index construction."
m = 128

# Let's use the Faiss implementation of HNSW for fast approximate nearest neighbor search.
# The index is created empty here and scores neighbours by inner product;
# vectors are added in the next cell.
index = faiss.IndexHNSWFlat(dim, m, faiss.METRIC_INNER_PRODUCT)

In [None]:
print('Building the FAISS index...')

t0 = time.time()

# Train first, then insert every passage embedding.
# NOTE(review): train() is probably unnecessary for this index type -- confirm.
index.train(embeddings)
index.add(embeddings)

print('  DONE.')

print(f'  Adding embeddings to index took {format_time(time.time() - t0)}')


Building the FAISS index...
  DONE.
  Adding embeddings to index took 0:00:00


### Example Search

In [None]:
from transformers import DPRQuestionEncoder

# Load the pretrained DPR question encoder and place it on the selected device
# in one step.
q_encoder = DPRQuestionEncoder.from_pretrained(
    "facebook/dpr-question_encoder-multiset-base"
).to(device)


In [None]:
from transformers import DPRQuestionEncoderTokenizerFast

# Load the fast tokenizer from the same checkpoint as the question encoder.
q_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained("facebook/dpr-question_encoder-multiset-base")


In [None]:
# Tokenize the question and move the ids to the encoder's device. Dedicated
# names are used so the corpus `input_ids` tensor and the tokenizer `outputs`
# from the document steps are not overwritten; reusing those names here would
# make the document-encoding cell embed the question instead of the corpus
# when it is re-run.
question_ids = q_tokenizer.encode(
    "What are the documents required to apply for the new PAN?",
    return_tensors="pt",
).to(device)

# Encode the question; the embedding is in the field 'pooler_output'. It is
# copied to host memory as a NumPy array for the FAISS search.
question_outputs = q_encoder(question_ids)
q_embed = question_outputs['pooler_output'].cpu().numpy()

print("Query embedding:", q_embed.shape)

Query embedding: (1, 768)


In [None]:
# Find the k most similar passages to the question embedding `q_embed`.
# `D` holds the inner-product scores and `I` the matching row indices.
D, I = index.search(q_embed, k=3)

# Print out the indices and similarity scores
print('Closest matching indeces:', I)
print('Inner Products:', D)

Closest matching indeces: [[1 0 8]]
Inner Products: [[76.29766  75.260544 74.83923 ]]


In [None]:
import textwrap

# Passages are re-flowed to at most 80 characters per line for display.
wrapper = textwrap.TextWrapper(width=80)

# `I` has one row per query; row 0 lists the hits for our single question.
for hit in I[0]:

    print('Index:', hit)

    # The hit number indexes both parallel lists of the chunked corpus.
    print('Article Title:   ', chunked_corpus['title'][hit], '\n')

    print('Passage:')
    print(wrapper.fill(chunked_corpus['text'][hit]))

    print('')


Index: 1
Article Title:    Who needs a Pan card? 

Passage:
All individuals/non-individuals (including foreign citizens/entities) earning
taxable income in India must have a PAN card.

Index: 0
Article Title:    What is Pan card? 

Passage:
The PAN card is a unique ten-digit alphanumeric identification number that is
issued by the Income Tax Department of India to track the tax-related
transactions of individuals and entities. The PAN card is mandatory for any
financial transaction in India, including opening a bank account, buying or
selling property, and filing income tax returns.

Index: 8
Article Title:    Documents required for a new PAN Card 

Passage:
**If you have Aadhaar card** No other document is required. You can get your pan
card through your Aadhaar card in 10 minutes. **If you don’t have an Aadhaar
card** - Passport(Any Country) / OCI Card - Passport Size Photograph - Overseas
address proof with zip code (Supporting documents - Indian NRO/NRE Account
statement or Oversea

# Ask Questions

In [None]:
from datasets import Dataset
import pandas as pd

# Create a DataFrame from the dictionary: one row per passage, with the
# 'title' and 'text' columns of `chunked_corpus`.
df = pd.DataFrame(chunked_corpus)

# Convert the DataFrame into a huggingface Dataset object.
dataset = Dataset.from_pandas(df)

# Check out the object (prints its feature names and row count).
print(dataset)

Dataset({
    features: ['title', 'text'],
    num_rows: 42
})


In [None]:
# Break the embeddings matrix into a list with one row vector per passage.
embs = [embeddings[row, :] for row in range(embeddings.shape[0])]

In [None]:
# Attach each passage's embedding as a new "embeddings" column.
# NOTE(review): this rebinds `dataset` to the result, so re-running the cell
# tries to add the column a second time -- confirm how add_column handles a
# column name that already exists.
dataset = dataset.add_column("embeddings", embs)
dataset

Dataset({
    features: ['title', 'text', 'embeddings'],
    num_rows: 42
})

In [None]:
# Create a fresh HNSW index with the same settings as before. This rebinds the
# name `index`, which until now referred to the index used for the example
# search.
index = faiss.IndexHNSWFlat(dim, m, faiss.METRIC_INNER_PRODUCT)

# Register the index on the dataset's "embeddings" column under the index
# name "embeddings".
dataset.add_faiss_index(column="embeddings", index_name="embeddings", custom_index=index, faiss_verbose=True)

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['title', 'text', 'embeddings'],
    num_rows: 42
})

# Retriever


In [None]:
from transformers import RagRetriever


# Build the RAG retriever on top of our own indexed dataset.
# NOTE(review): when `indexed_dataset` is supplied the retriever may ignore
# `index_name` and expect the FAISS index under the name "embeddings" --
# confirm against the RagRetriever documentation.
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq",  # The specific pre-trained model we'll use.
    use_dummy_dataset=False,
    indexed_dataset=dataset,  # Pass in our dataset
    index_name="embeddings",  # Specify the name of the FAISS index we created
)



Downloading (…)lve/main/config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)_tokenizer/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.


Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)tokenizer/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)tokenizer/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizerFast'.


In [None]:
from transformers import RagTokenizer

# Load the RAG tokenizer from the same checkpoint as the retriever. Below, its
# `question_encoder` member tokenizes questions and `batch_decode` turns
# generated ids back into text.
tokenizer = RagTokenizer.from_pretrained(
    "facebook/rag-sequence-nq"
)




The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

# Generator


In [None]:
from transformers import RagSequenceForGeneration

# Load the pretrained RAG sequence model and attach the retriever built on
# our indexed dataset.
# NOTE(review): the model is not moved to `device` here, so it presumably
# runs on the CPU -- that may explain the response times of several minutes
# recorded below; confirm before changing.
model = RagSequenceForGeneration.from_pretrained(
    "facebook/rag-sequence-nq",
    retriever=retriever
)




Downloading pytorch_model.bin:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/rag-sequence-nq were not used when initializing RagSequenceForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.bias', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing RagSequenceForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagSequenceForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RagSequenceForGeneration were not initialized from the model checkpoint at facebook/rag-sequence-nq and are newly initialized: ['rag.generator.lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to us

### Example Questions

In [None]:
import time

t0 = time.time()

question = "What are the documents required to apply for the new PAN?"

# Turn the question into token ids with the question-encoder half of the RAG
# tokenizer.
input_ids = tokenizer.question_encoder(question, return_tensors="pt")["input_ids"]

# Hand the ids to RAG and let it generate an answer, with the length bounded
# by min_length and max_length.
generated = model.generate(input_ids, max_length=50, min_length=5)

# Decode the generated ids and keep the first (only) sequence as text.
generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

print(f"Q: {question}")
print(f"A: {generated_string}")

print(f'\nResponse took {time.time() - t0:.2f} seconds')

Q: What are the documents required to apply for the new PAN?
A:  a citizenship renunciation letter

Response took 241.23 seconds


In [None]:
def ask_question(question, max_length=50, min_length=3):
    '''
    Answer one question with the RAG model, print the exchange and the time
    taken, and return the answer.

    Parameters:
        question: the question text.
        max_length: upper length bound passed to `model.generate` (default 50).
        min_length: lower length bound passed to `model.generate` (default 3).

    Returns:
        The decoded text of the first generated sequence.

    Relies on the notebook-level `tokenizer` and `model`.
    '''
    t0 = time.time()

    input_ids = tokenizer.question_encoder(question, return_tensors="pt")["input_ids"]

    # Keep the ids on the same device as the model, so this function keeps
    # working if the model is moved to a GPU (a no-op while it is on the CPU).
    input_ids = input_ids.to(model.device)

    generated = model.generate(input_ids, max_length=max_length, min_length=min_length)
    generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

    print ("Q: " + question)
    print ("A: '{:}'".format(generated_string))

    print('\nResponse took %.2f seconds' % (time.time() - t0))
    return generated_string

#### My Own Questions

In [None]:
# Load the sample questions from the spreadsheet on Drive and take the
# 'Question' column as a plain Python list.
questions_df = pd.read_excel('/content/drive/MyDrive/SampleQuestions.xlsx')
questions_list = questions_df['Question'].tolist()

In [None]:
# Ask every sample question in turn and collect the answers in order. The
# running number of questions answered so far is printed after each one.
answers = []
count = 0
for count, question in enumerate(questions_list, start=1):
    answer = ask_question(question)
    print(count)
    answers.append(answer)

Q: What are the documents required to apply for the new pan
A: ' a citizenship renunciation letter'

Response took 223.44 seconds
1
Q: What is the cost/fees of a PAN card?
A: ' us $ 299.90'

Response took 183.50 seconds
2
Q: Can I take the delivery of Pan card at Indian address
A: ' only at an indian address'

Response took 187.19 seconds
3
Q: How long does it usually take to receive the PAN card after applying?
A: ' around 2 - 3 weeks'

Response took 224.00 seconds
4
Q: How to apply for PAN card
A: ' through abc'

Response took 232.43 seconds
5
Q: What is the process to apply for PAN card
A: ' through abc'

Response took 214.96 seconds
6
Q: Can I apply for a PAN card if I am a non-resident Indian (NRI)?
A: ' 49a'

Response took 218.54 seconds
7
Q: Can I apply for pan card without Aadhaar?
A: ' nrs'

Response took 217.40 seconds
8
Q: What are the charges of linking Pan & Aadhaar
A: ' inr 2000/-'

Response took 216.07 seconds
9
Q: How long does Pan & Aadhaar linking take
A: ' 30 june 20