In [None]:
!pip install datasets transformers



## Import libraries

In [None]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from datasets import Dataset
import matplotlib.pyplot as plt

In [None]:
!pip install langchain



## Loading and chunking dataset

![](https://miro.medium.com/v2/resize:fit:1127/1*Jq9bEbitg1Pv4oASwEQwJg.png)

In [None]:
!pip install PyPDF2



In [None]:
import PyPDF2

from google.colab import files

# Opens Colab's upload widget and returns a dictionary keyed by uploaded filename.
uploaded_file = files.upload()

# A cancelled upload yields an empty dictionary; fail with a clear message
# instead of the bare StopIteration that next() would raise.
if not uploaded_file:
    raise ValueError("No file was uploaded; re-run this cell and select a PDF.")

# Only the first uploaded file is used by the rest of the notebook.
filename = next(iter(uploaded_file))


In [None]:
# Extract the text of every page of the uploaded PDF into a single string.
with open(filename, "rb") as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    num_pages = len(pdf_reader.pages)
    # Join pages with a newline so the last word of one page does not fuse with
    # the first word of the next; `or ""` guards against a page yielding no text.
    data = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

# A scanned / image-only PDF produces no text; stop here with a clear error
# rather than failing later on an empty list of chunks.
if not data.strip():
    raise ValueError(f"No extractable text found in {filename!r}.")

print("File uploaded and text extracted successfully!")

File uploaded and text extracted successfully!


In [None]:
# Display the raw extracted text to sanity-check the PDF parsing.
data

'Welcome to Smallpdf\nDigital Documents—All In One Place\nAccess Files Anytime, Anywhere Enhance Documents in One Click \nCollaborate With Others With the new Smallpdf experience, you can \nfreely upload, organize, and share digital \ndocuments. When you enable the ‘Storage’ \noption, we’ll also store all processed files here. \nYou can access files stored on Smallpdf from \nyour computer, phone, or tablet. We’ll also \nsync files from the Smallpdf Mobile App to our \nonline portalWhen you right-click on a file, we’ll present \nyou with an array of options to convert, \ncompress, or modify it. \nForget mundane administrative tasks. With \nSmallpdf, you can request e-signatures, send \nlarge files, or even enable the Smallpdf G Suite \nApp for your entire organization. Ready to take document management to the next level? \n'

In [None]:
from langchain.docstore.document import Document as LangchainDocument

# Wrap the whole extracted text in one LangChain Document (no metadata) so it
# can be handed to the text splitter's split_documents().
RAW_KNOWLEDGE_BASE = LangchainDocument(page_content=data)

In [None]:
# Ordered list of split points handed to the recursive text splitter.
# NOTE(review): the structural entries are written in regex syntax
# ("#{1,6}", "\*\*\*+", "---+", "___+"); they only behave as patterns when the
# splitter is told to treat separators as regular expressions.
MARKDOWN_SEPARATORS = [
    # Structural Markdown markers: headings, code fences, horizontal rules.
    *("\n#{1,6} ", "```\n", "\n\\*\\*\\*+\n", "\n---+\n", "\n___+\n"),
    # Whitespace fallbacks from coarsest to finest; the empty string comes last.
    *("\n\n", "\n", " ", ""),
]

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Recursive splitter: tries each separator in order and recurses into pieces
# that are still longer than chunk_size.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # The maximum number of characters in a chunk: we selected this value arbitrarily
    chunk_overlap=100,  # The number of characters to overlap between chunks
    add_start_index=True,  # If `True`, includes chunk's start index in metadata
    strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
    separators=MARKDOWN_SEPARATORS,
    # The separators use regex syntax (e.g. "#{1,6}", "---+"). Without this flag
    # they are escaped and matched literally, so the Markdown patterns never fire.
    is_separator_regex=True,
)

In [None]:
# Split the single knowledge-base document into chunk Documents.
docs_processed = text_splitter.split_documents([RAW_KNOWLEDGE_BASE])

In [None]:
# Inspect the chunks produced by the splitter.
docs_processed

In [None]:
!pip install langchain_community
!pip install sentence-transformers



## Tokenizing/Vectorizing the dataset

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
# Model id passed to HuggingFaceEmbeddings; used for both the chunks and the queries.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

In [None]:
import torch  # local import: this cell runs before the LLM section imports torch

# Fall back to CPU so the notebook still runs on a runtime without a GPU.
EMBEDDING_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Embedding model shared by indexing (chunks) and retrieval (user queries).
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    # multi_process=True starts a worker pool on every embed call, which is pure
    # overhead for the one-text-at-a-time embed_query() calls in this notebook.
    multi_process=False,
    model_kwargs={"device": EMBEDDING_DEVICE},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

  embedding_model = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Embed the first chunk as a quick check that the embedding model works.
emb = embedding_model.embed_query(docs_processed[0].page_content)

In [None]:
# Preview only the first few components; printing the full vector floods the output.
emb[:10]

[-0.01627420447766781,
 -0.01227602083235979,
 0.021282058209180832,
 -0.03335569426417351,
 -0.0036604097113013268,
 -0.005495274439454079,
 0.08040539175271988,
 0.01866147667169571,
 -0.010449361987411976,
 -0.017359865829348564,
 -0.019399654120206833,
 -0.14231932163238525,
 0.05610019713640213,
 0.041807621717453,
 -0.005434860475361347,
 -0.005452435929328203,
 0.00897761806845665,
 0.012530854903161526,
 0.032213274389505386,
 0.025705866515636444,
 0.06727736443281174,
 -0.004610125906765461,
 -0.030497679486870766,
 -0.060319770127534866,
 0.06249995902180672,
 0.028788916766643524,
 -0.03528966009616852,
 -0.06233622878789902,
 -0.059697769582271576,
 -0.16551890969276428,
 0.0006540921167470515,
 -0.014112481847405434,
 0.06551483273506165,
 -0.04733053594827652,
 -0.06113535538315773,
 0.015760814771056175,
 -0.04682299494743347,
 0.062261562794446945,
 0.03365971893072128,
 0.03440278023481369,
 0.03266139701008797,
 -0.0015411797212436795,
 -0.004276457242667675,
 -0.070

In [None]:
import numpy as np

# Confirm the dimensionality of the embedding vector.
np.asarray(emb).shape

(384,)

In [None]:
!pip install pinecone-client



## Storing dataset into a vector database

The vector database used here is [Pinecone](https://pinecone.com).

In [None]:
from google.colab import userdata
# Read the Pinecone API key from the Colab secret named 'PINECONE_KEY'
# instead of hard-coding it in the notebook.
PINECONE_KEY=userdata.get('PINECONE_KEY')

In [None]:
from tqdm.notebook import tqdm
from pinecone import Pinecone

# Pinecone client authenticated with the key loaded from Colab secrets.
pc = Pinecone(api_key=PINECONE_KEY)
# Handle to the index named "ragtest".
# NOTE(review): assumes this index already exists and was created with
# dimension 384 to match the embedding model — confirm in the Pinecone console.
index = pc.Index("ragtest")

In [None]:
# Embed each chunk and build the records to upsert into Pinecone.
# NOTE: only the first 10 chunks are indexed; remove the slice to index the
# whole document.
upsert_data = []

# Wrap the sequence (not the enumerate object) so tqdm knows the total and
# renders a real progress bar instead of "0it".
for i, entry in enumerate(tqdm(docs_processed[:10], desc="Embedding chunks")):
    text = entry.page_content
    vector = embedding_model.embed_query(text)
    upsert_data.append(
        {
            "id": f"vec{i}",
            "values": vector,
            # The raw chunk text is stored so a query can return it as context.
            "metadata": {"text": text},
        }
    )

0it [00:00, ?it/s]

In [None]:
# Write all prepared records into namespace "ns1" of the index.
index.upsert(vectors=upsert_data, namespace="ns1")

{'upserted_count': 5}

## Loading an LLM

In [None]:
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Model id of the chat model loaded below for answer generation.
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [None]:
!pip install bitsandbytes
!pip install accelerate



In [None]:
pip install -U bitsandbytes



In [None]:
import bitsandbytes as bnb

# Report the installed bitsandbytes version.
print(f"bitsandbytes version: {bnb.__version__}")


bitsandbytes version: 0.45.2


In [None]:
# Load the tokenizer and a 4-bit quantized copy of the model named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [None]:
# Text-generation pipeline around the quantized model and its tokenizer.
llm_model = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # Return only the newly generated text, capped at 500 new tokens.
    return_full_text=False,
    max_new_tokens=500,
    # Sampling settings.
    do_sample=True,
    temperature=0.4,
    repetition_penalty=1.1,
)

Device set to use cuda:0


In [None]:
# Smoke-test the pipeline with a raw prompt (no chat markers, no retrieved context).
llm_model("Hey there!")

[{'generated_text': '\n\nI’m so excited to share my latest project with you. I’ve teamed up with the amazing folks at The Spice House to bring you a collection of recipes that feature their delicious spices and seasonings.\n\nThe first recipe in this series is for a savory, spicy dish that will warm you up on even the coldest winter nights: Moroccan Lamb Stew.\n\nThis stew is packed with flavor thanks to a blend of warming spices like cumin, coriander, and cinnamon, as well as a touch of sweetness from dried apricots and raisins. The lamb is tender and juicy, while chickpeas and carrots add hearty texture and nutrition. And the best part? It all comes together in one pot, making cleanup a breeze.\n\nHere’s what you’ll need:\n\n- 2 pounds boneless lamb shoulder, cut into bite-sized pieces\n- Salt and freshly ground black pepper, to taste\n- 1 tablespoon olive oil\n- 1 large onion, chopped\n- 4 garlic cloves, minced\n- 1 tablespoon grated fresh ginger\n- 1 teaspoon ground cumin\n- 1 teas

## Prompting the model

In [None]:
# Chat prompt template with two positional slots, filled via prompt.format():
#   1st {} -> retrieved context, 2nd {} -> the user's question.
# Each turn ends with the end-of-sequence marker `</s>`, as the Zephyr chat
# format expects; without it the system, user and assistant turns are not
# delimited the way the model was trained to see them. The template also starts
# directly at `<|system|>` (no leading newline).
prompt = """<|system|>
You are a helpful assistant that answers on medical questions based on the real information provided from different sources and in the context.
Give the rational and well written response. If you don't have proper info in the context, answer "I don't know"
Respond only to the question asked.</s>
<|user|>
Context:
{}
---
Here is the question you need to answer.

Question: {}</s>
<|assistant|>
"""

In [None]:
from textwrap import fill

user_input = input("User: ")

# Embed the question with the same model that embedded the indexed chunks.
vectorized_input = embedding_model.embed_query(user_input)

# Retrieve the single closest chunk from the namespace populated by the upsert.
context = index.query(
    namespace="ns1",
    vector=vectorized_input,
    top_k=1,
    include_metadata=True
)

# A query can come back with no matches (e.g. empty namespace). Fall back to an
# empty context — the system prompt then tells the model to answer
# "I don't know" — instead of failing with an IndexError on matches[0].
matches = context['matches']
retrieved_text = matches[0]['metadata']['text'] if matches else ""

answer = llm_model(prompt.format(retrieved_text, user_input))

# fill() re-wraps the generated text to a readable line width.
print("AI response: ", fill(answer[0]['generated_text']))

In [None]:
# Show the chunk text that was retrieved as context for the last question.
context['matches'][0]['metadata']['text']