In [1]:
!pip install torch transformers sentence-transformers faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.meta

if using gpu use faiss-gpu

in change runtime type we can change gpu, tpu, we can pay and use

In [2]:
import torch
import faiss
import numpy as np
import textwrap
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer

In [15]:
def load_and_chunk(file_path, chunk_size=300, overlap = 100):
  with open(file_path,'r',encoding='utf-8') as f:
    text = f.read()
  chunks = []
  start = 0
  while start < len(text):
    chunks.append(text[start:start + chunk_size])
    start += chunk_size - overlap
  return chunks

chunks = load_and_chunk('pizza.txt')

In [16]:
len(chunks)

55

In [17]:
chunks[1]

'iginating from the sun-kissed lands of Italy, pizza has evolved into an art form that unites people from diverse backgrounds in a shared love for its mouthwatering combinations. Its history stretches back centuries, with roots tracing back to ancient civilizations like the Greeks, Romans, and Egypti'

In [18]:
#using bert model

embedder = SentenceTransformer("sentence-transformers/bert-base-nli-mean-tokens")

#pretrained model
#uisng bert model and simply encode the chunks, it takes little bit time, we are doing word embedding, that chunks of data
#is converted into embeddings

In [11]:
chunk_embeddings = embedder.encode(chunks, show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [22]:
chunk_embeddings.shape #28 chunks 768 values

(28, 768)

In [21]:
len(chunks)

55

In [23]:
chunk_embeddings[0]

array([-2.20914185e-01,  2.75622010e-01,  5.99907815e-01, -8.37260067e-01,
        1.16918132e-01, -7.86106884e-01, -5.81611335e-01,  9.53733444e-01,
        6.46697342e-01, -6.39264822e-01,  3.26622039e-01,  7.08555162e-01,
        8.10500801e-01,  5.57982028e-01, -4.57581520e-01, -8.88048932e-02,
        5.30135691e-01, -1.81167334e-01, -4.05893922e-01, -1.77109703e-01,
        1.83328222e-02, -4.55846488e-01, -2.47630998e-01,  6.97148979e-01,
        2.23228812e-01,  5.14841080e-01, -2.49408841e-01, -6.41279593e-02,
       -1.24830559e-01,  1.44995809e-01, -5.43251097e-01,  7.62264192e-01,
       -6.98695779e-01, -1.29399979e+00, -5.88165283e-01,  5.91978490e-01,
        7.26240158e-01, -4.13944781e-01, -2.37656265e-01,  6.26204550e-01,
        2.98519462e-01, -7.79117882e-01,  9.83993113e-01,  2.97231581e-02,
       -1.08141232e+00, -2.97381170e-02, -6.21856153e-01,  1.11568105e+00,
        4.35300231e-01, -4.71147090e-01,  1.86175358e+00, -6.37483418e-01,
       -7.33975410e-01, -

In [None]:
These are meaning ful numbers, not bag of words. bag of words only gives us the occurance.
If the word is there then 1 else 0. This is not that kind of data. Each and every word has meaning numbers

Eg. king - man + woman = queen

royal + man - man + woman = queen

Vectors are represented in 2D.

In [26]:
dimension = chunk_embeddings[0].shape[0]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(chunk_embeddings))

In [None]:
#DisDistilGPT2 (short for Distilled-GPT2) is an English-language model pre-trained with the supervision of
#the smallest version of Generative Pre-trained Transformer 2 (GPT-2). Like GPT-2, DistilGPT2 can be used
#to generate text. Users of this model card should also consider information about the design, training, and
#limitations of GPT-2.



In [28]:
gen_model_id = 'distilgpt2'
tokenizer = AutoTokenizer.from_pretrained(gen_model_id)
model = AutoModelForCausalLM.from_pretrained(gen_model_id)
generator = pipeline('text-generation',model=model,tokenizer=tokenizer)

Device set to use cpu


In [31]:
def rag_bert_qa(question, top_k=3, max_tokens=150):
  q_embeddings = embedder.encode([question])
  distances, indices = index.search(np.array(q_embeddings),top_k)
  retrived = "\n".join([chunks[i] for i in indices[0]])


  prompt = f"""Answer the question using context below. Context:{retrived} Question: {question} Answer:"""
  response = generator(prompt,max_new_tokens = max_tokens, do_sample=True, temperature=0.7)
  answer = response[0]['generated_text'].split("Answer:")[-1].strip()
  return textwrap.fill(answer,width=100)

In [None]:
#questions need to be encoded

In [33]:
query = "in which country pizza is more popular?"
print("Question",query)
print("Answer:",rag_bert_qa(query))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question in which country pizza is more popular?
Answer: The United States is the world's most popular pizza destination. The United States is the world's
most popular pizza destination. The United States is the world's most popular pizza destination. The
United States is the world's most popular pizza destination. The United States is the world's most
popular pizza destination. In the United States, the pizza industry is the world's most popular
pizza destination. The United States is the world's most popular pizza destination. The United
States is the world's most popular pizza destination. In the United States, the pizza industry is
the world's most popular pizza destination. The United States is the world's most popular pizza
destination. The United States is the world's most popular pizza destination. In the United States


In [35]:
query = "what is the shape of pizza?"
print("Question",query)
print("Answer:",rag_bert_qa(query))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question what is the shape of pizza?
Answer: it is a pizza. The pizza industry has expanded exponentially, with


In [None]:
# we have only taken small chunks of data to train
# every time we run it may give different results


In [36]:
query = "In which country pizza is first made?"
print("Question",query)
print("Answer:",rag_bert_qa(query))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question In which country pizza is first made?
Answer: Italy. The Italian pizza is made from a mixture of fresh and fresh basil, with a mixture of fresh
and fresh basil, with a mixture of fresh and fresh basil, with a mixture of fresh and fresh basil,
with a mixture of fresh and fresh basil, with a mixture of fresh and fresh basil, with a mixture of
fresh and fresh basil, with a mixture of fresh and fresh basil, with a mixture of fresh and fresh
basil, with a mixture of fresh and fresh basil, with a mixture of fresh and fresh basil, with a
mixture of fresh and fresh basil, with a mixture of fresh and fresh basil, with a mixture of fresh
and fresh basil, with a mixture of fresh and fresh basil, with a mixture of fresh and fresh


In [38]:
query = "what is the taste of the pizza?"
print("Question",query)
print("Answer:",rag_bert_qa(query))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question what is the taste of the pizza?
Answer: it is a pizza that is made from a variety of ingredients. It is a pizza that is made from a variety
of ingredients. It is a pizza that is made from a variety of ingredients. It is a pizza that is made
from a variety of ingredients. It is a pizza that is made from a variety of ingredients. It is a
pizza that is made from a variety of ingredients. It is a pizza that is made from a variety of
ingredients. It is a pizza that is made from a variety of ingredients. It is a pizza that is made
from a variety of ingredients. It is a pizza that is made from a variety of ingredients. It is a
pizza that is made from a variety of ingredients. It is a pizza that is made


In [None]:
# It understandes the context also

#Here it is repeadetly generating the same data because it is trained on small data

# we make temperature = 0.7 means we are giving free hand to generate the text data



In [None]:
It will take lot of time to train, that's why we use gpu

pizza.txt is 10 KB

