In [None]:
"""
This notebook walks through building a Small Language Model (SLM) specifically tailored for answering healthcare policy or business-related questions.
The SLM is constructed using a combination of retrieval and generation techniques to provide accurate, context-aware responses.
The code takes a custom corpus file (healthcare_corpus.txt) containing healthcare policy documents and splits it into manageable text chunks. These chunks are then embedded into high-dimensional vectors using a
SentenceTransformer model, capturing their semantic meaning. The resulting embeddings are stored in a FAISS index to enable fast and relevant retrieval when users ask questions.
The model is then fine-tuned using LoRA (Low-Rank Adaptation) applied to the Flan-T5-Small architecture. Next, the RAG (Retrieval-Augmented Generation) pipeline is built by integrating the FAISS retriever with the fine-tuned SLM.
This enables the model to retrieve relevant context from the policy corpus and use it to generate accurate answers. The final block of code deploys this system via a Gradio web interface,
allowing real-time interaction with the chatbot.
"""

In [None]:
!pip install transformers datasets accelerate peft bitsandbytes faiss-cpu langchain gradio


Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia

In [None]:
from google.colab import files
# Opens Colab's upload dialog; choose healthcare_corpus.txt, the file read just below.
uploaded = files.upload()

# Read the whole corpus as one string. The encoding is explicit so the result does not
# depend on the runtime's default locale.
with open("healthcare_corpus.txt", encoding="utf-8") as f:
    docs = [f.read()]


Saving healthcare_corpus.txt to healthcare_corpus.txt


In [None]:
!pip install --upgrade \
  transformers \
  datasets \
  accelerate \
  peft \
  bitsandbytes \
  faiss-cpu \
  langchain \
  langchain-community \
  gradio


Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting gradio
  Downloading gradio-5.33.0-py3-none-any.whl.metadata (16 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting gradio-client==1.10.2 (from gradio)
  Downloading gradio_client-1.10.2-py3-none-any.whl.metadata (7.1 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from datacla

In [None]:
# The integrations live in langchain-community / langchain-text-splitters; the old
# `langchain.*` re-export paths are deprecated.
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# 3.1 Split into chunks of roughly 1,000 *characters* (chunk_size counts characters, not
#     tokens), with a 200-character overlap between neighbouring chunks.
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = splitter.split_text(docs[0])
if not texts:
    # Building an index from zero chunks fails with an obscure error; fail early instead.
    raise ValueError("The corpus produced no text chunks; check that the uploaded file is not empty.")

# 3.2 Embed with a lightweight model and index the chunk vectors in FAISS
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_texts(texts, embedder)


  embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import LoraConfig, get_peft_config, get_peft_model

BASE = "google/flan-t5-small"  # compact checkpoint, small enough for a laptop

# 4.1 Tokenizer and base model.
# Flan-T5 is an encoder-decoder model, so it is loaded with AutoModelForSeq2SeqLM
# rather than a causal-LM class. 8-bit loading (load_in_8bit=True) is intentionally
# left out so this cell has no CUDA dependency.
tokenizer = AutoTokenizer.from_pretrained(BASE)
model = AutoModelForSeq2SeqLM.from_pretrained(BASE, device_map="auto")

# 4.2 LoRA settings for a sequence-to-sequence task
lora_cfg = LoraConfig(
    r=8,                       # adapter rank
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)

# Wrap the base model with the adapters and report how many parameters will train
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



trainable params: 344,064 || all params: 77,305,216 || trainable%: 0.4451


In [None]:
from google.colab import files
# Opens Colab's upload dialog. The training cell reads "conversations.csv" from the
# working directory, so the uploaded file must have exactly that name.
uploaded = files.upload() # Select your conversations.csv file

Saving conversations.csv to conversations.csv


In [None]:
# Fine-tune the LoRA-wrapped model on the uploaded input/output pairs.
# Relies on `model` and `tokenizer` created in the model-loading cell.

from datasets import load_dataset
from transformers import Trainer, TrainingArguments

MAX_SEQ_LEN = 128     # inputs and targets are padded/truncated to this many tokens
IGNORE_INDEX = -100   # label value that the cross-entropy loss skips

ds = load_dataset("csv", data_files={"train": "conversations.csv"})

def tokenize_fn(examples):
    """Tokenize a batch of rows from conversations.csv.

    Args:
        examples: batch dict with "input" and "output" columns (lists of strings).

    Returns:
        The tokenized inputs with an added "labels" entry holding the target token
        ids, where every padding position is replaced by IGNORE_INDEX so that
        padding does not contribute to the training loss.
    """
    inputs = tokenizer(examples["input"], truncation=True, padding="max_length", max_length=MAX_SEQ_LEN)
    outputs = tokenizer(examples["output"], truncation=True, padding="max_length", max_length=MAX_SEQ_LEN)
    pad_id = tokenizer.pad_token_id
    # Targets are padded to a fixed length, so without masking most label
    # positions would be pad tokens and would dominate the loss.
    inputs["labels"] = [
        [token_id if token_id != pad_id else IGNORE_INDEX for token_id in label_ids]
        for label_ids in outputs["input_ids"]
    ]
    return inputs

tok_ds = ds["train"].map(tokenize_fn, batched=True, remove_columns=ds["train"].column_names)

training_args = TrainingArguments(
    output_dir="lora-chatbot",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    # Train in full precision: fp16 needs a CUDA device (the model cell deliberately
    # avoids a CUDA dependency) and is numerically fragile for T5-family models.
    fp16=False,
    logging_steps=10,
    report_to=["none"],
    run_name="lora-chatbot-run",
    # PeftModel hides the base model's signature, so Trainer cannot infer the label column.
    label_names=["labels"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_ds,
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
)

trainer.train()


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


TrainOutput(global_step=5, training_loss=18.73194122314453, metrics={'train_runtime': 13.753, 'train_samples_per_second': 1.091, 'train_steps_per_second': 0.364, 'total_flos': 701053009920.0, 'train_loss': 18.73194122314453, 'epoch': 5.0})

In [None]:
# Write the LoRA adapter and the tokenizer files side by side in one folder
checkpoint_dir = "lora-chatbot/checkpoint-1"
model.save_pretrained(checkpoint_dir)
tokenizer.save_pretrained(checkpoint_dir)


('lora-chatbot/checkpoint-1/tokenizer_config.json',
 'lora-chatbot/checkpoint-1/special_tokens_map.json',
 'lora-chatbot/checkpoint-1/spiece.model',
 'lora-chatbot/checkpoint-1/added_tokens.json',
 'lora-chatbot/checkpoint-1/tokenizer.json')

In [None]:
# Long listing of the current working directory, as a sanity check after training/saving.
!ls -l .
# you should see a folder named "lora-chatbot"


total 20
-rw-r--r-- 1 root root  315 Jun  8 17:32 'conversations (1).csv'
-rw-r--r-- 1 root root  315 Jun  8 17:34  conversations.csv
-rw-r--r-- 1 root root 3695 Jun  8 17:30  healthcare_corpus.txt
drwxr-xr-x 4 root root 4096 Jun  8 17:35  lora-chatbot
drwxr-xr-x 1 root root 4096 Jun  5 13:38  sample_data


In [None]:
# Recursively list the training output folder to see which checkpoint sub-folders exist.
!ls -R lora-chatbot

lora-chatbot:
checkpoint-1  checkpoint-5

lora-chatbot/checkpoint-1:
adapter_config.json	   special_tokens_map.json  tokenizer.json
adapter_model.safetensors  spiece.model
README.md		   tokenizer_config.json

lora-chatbot/checkpoint-5:
adapter_config.json	   rng_state.pth	    tokenizer_config.json
adapter_model.safetensors  scheduler.pt		    tokenizer.json
optimizer.pt		   special_tokens_map.json  trainer_state.json
README.md		   spiece.model		    training_args.bin


In [None]:
# Long listings of the working directory, the training output folder, and the
# manually saved adapter folder (to check that the adapter files are present).
!ls -l .
!ls -l lora-chatbot
!ls -l lora-chatbot/checkpoint-1



total 20
-rw-r--r-- 1 root root  315 Jun  8 17:32 'conversations (1).csv'
-rw-r--r-- 1 root root  315 Jun  8 17:34  conversations.csv
-rw-r--r-- 1 root root 3695 Jun  8 17:30  healthcare_corpus.txt
drwxr-xr-x 4 root root 4096 Jun  8 17:35  lora-chatbot
drwxr-xr-x 1 root root 4096 Jun  5 13:38  sample_data
total 8
drwxr-xr-x 2 root root 4096 Jun  8 17:35 checkpoint-1
drwxr-xr-x 2 root root 4096 Jun  8 17:34 checkpoint-5
total 4544
-rw-r--r-- 1 root root     768 Jun  8 17:35 adapter_config.json
-rw-r--r-- 1 root root 1389456 Jun  8 17:35 adapter_model.safetensors
-rw-r--r-- 1 root root    5094 Jun  8 17:35 README.md
-rw-r--r-- 1 root root    2543 Jun  8 17:35 special_tokens_map.json
-rw-r--r-- 1 root root  791656 Jun  8 17:35 spiece.model
-rw-r--r-- 1 root root   20830 Jun  8 17:35 tokenizer_config.json
-rw-r--r-- 1 root root 2422499 Jun  8 17:35 tokenizer.json


In [None]:
# Print the working directory that the relative paths in this notebook resolve against.
!pwd


/content


In [None]:
# One-entry-per-line listing of the working directory.
!ls -1


'conversations (1).csv'
conversations.csv
healthcare_corpus.txt
lora-chatbot
sample_data


In [None]:
# Recursive listing of the training output folder (same check as the earlier `ls -R` cell).
!ls -R lora-chatbot

lora-chatbot:
checkpoint-1  checkpoint-5

lora-chatbot/checkpoint-1:
adapter_config.json	   special_tokens_map.json  tokenizer.json
adapter_model.safetensors  spiece.model
README.md		   tokenizer_config.json

lora-chatbot/checkpoint-5:
adapter_config.json	   rng_state.pth	    tokenizer_config.json
adapter_model.safetensors  scheduler.pt		    tokenizer.json
optimizer.pt		   special_tokens_map.json  trainer_state.json
README.md		   spiece.model		    training_args.bin


In [None]:
## 11 - Check which base model the saved adapter was trained on
import json

adapter_config_path = "lora-chatbot/checkpoint-1/adapter_config.json"

# Parse the adapter's JSON config from disk
with open(adapter_config_path, "r") as config_file:
    config = json.loads(config_file.read())

# Show the base model name recorded in the adapter config
print(config["base_model_name_or_path"])



google/flan-t5-small


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PeftModel, PeftConfig
import torch

# Read the adapter's PEFT config; it records which base model the adapter belongs to
peft_config = PeftConfig.from_pretrained("lora-chatbot/checkpoint-1")

# Tokenizer is taken from the base model, not from the adapter folder
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)

# Original base model, in float32 and placed entirely on the CPU
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    peft_config.base_model_name_or_path,
    device_map={"": "cpu"},
    torch_dtype=torch.float32,
)

# Attach the saved LoRA adapter on top of the base model
model = PeftModel.from_pretrained(base_model, "lora-chatbot/checkpoint-1")

# Switch to inference mode (the returned module is displayed as the cell output)
model.eval()


PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 512)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 512)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=512, out_features=384, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=512, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=384, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
              

In [None]:
# The integrations live in langchain-community / langchain-text-splitters; the old
# `langchain.*` re-export paths are deprecated.
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# 1) Load your text corpus (replace with your filename).
#    The encoding is explicit so the result does not depend on the runtime's default locale.
with open("healthcare_corpus.txt", "r", encoding="utf-8") as f:
    docs = [f.read()]

# 2) Chunk into pieces of roughly 1,000 *characters* (chunk_size counts characters,
#    not tokens), with a 200-character overlap between neighbouring chunks
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = splitter.split_text(docs[0])
if not texts:
    # Building an index from zero chunks fails with an obscure error; fail early instead.
    raise ValueError("healthcare_corpus.txt produced no text chunks; check that the file is not empty.")

# 3) Embed and build a FAISS index
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_texts(texts, embedder)


In [None]:
# Assemble the RAG chain: FAISS retriever + fine-tuned Flan-T5 generator.
# Relies on `model`, `tokenizer` and `vectorstore` from the preceding cells.
from transformers import pipeline
# HuggingFacePipeline is provided by langchain-community; `langchain.llms` is a deprecated path.
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Generation pipeline around the loaded model; sampling is enabled with temperature 0.7
gen_pipe = pipeline(
    "text2text-generation",
    model=model,         # your loaded/flan-t5 model
    tokenizer=tokenizer,
    max_length=256,
    do_sample=True,
    temperature=0.7
)
hf_llm = HuggingFacePipeline(pipeline=gen_pipe)

# Question-answering chain: fetch the 3 most similar chunks for each query and pass
# them to the generator; only the answer is returned (no source documents).
qa = RetrievalQA.from_chain_type(
    llm=hf_llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=False
)


Device set to use cpu
  hf_llm = HuggingFacePipeline(pipeline=gen_pipe)


In [None]:
import gradio as gr

def chat_fn(prompt: str) -> str:
    """Answer one user question with the RAG chain.

    Args:
        prompt: the text typed into the Gradio textbox.

    Returns:
        The chain's generated answer, or a short hint when the prompt is empty
        or only whitespace (so no retrieval/generation is run for blank input).
    """
    question = (prompt or "").strip()
    if not question:
        return "Please enter a question."
    # `invoke` replaces the deprecated `Chain.run`; RetrievalQA reads the question
    # from the "query" key and returns the answer under "result".
    return qa.invoke({"query": question})["result"]

# Input and output widgets for the chat UI
question_box = gr.Textbox(lines=2, placeholder="Ask me…")
answer_box = gr.Textbox()  # or gr.Textbox(label="Response")

# Wire chat_fn between the two textboxes
interface = gr.Interface(
    title="🦜🔗 RAG + LoRA Chatbot",
    description="Ask questions about your uploaded text.",
    fn=chat_fn,
    inputs=question_box,
    outputs=answer_box,
)

# share=True requests a public gradio.live URL in addition to the local one
interface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://89fcb75d767cfb441c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so artifacts can be copied there.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# "lora-chatbot" is the Trainer output_dir configured in the training cell; list its contents.
# NOTE(review): the recorded output shows full-model files (config.json, model.safetensors)
# rather than the checkpoint-* adapter folders listed earlier - presumably from a different
# session; confirm before relying on it.
!ls -1 lora-chatbot


config.json
generation_config.json
model.safetensors
optimizer.pt
rng_state.pth
scheduler.pt
special_tokens_map.json
spiece.model
tokenizer_config.json
tokenizer.json
trainer_state.json
training_args.bin


In [None]:
# assuming `vectorstore` is your FAISS index from Step 3
vectorstore.save_local("faiss_index")
!ls -1 faiss_index


index.faiss
index.pkl


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# adjust the target path as needed
!cp -r lora-chatbot /content/drive/MyDrive/rag-demo/
!cp -r faiss_index  /content/drive/MyDrive/rag-demo/
!cp corpus.txt   /content/drive/MyDrive/rag-demo/


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# List your working directory (one entry per line) to confirm the folder's there
!ls -1

conversations.csv
corpus.txt
drive
faiss_index
lora-chatbot
sample_data


In [None]:
# Bundle your model, index, and text into one zip
!zip -r rag_demo.zip lora-chatbot faiss_index corpus.txt


  adding: lora-chatbot/ (stored 0%)
  adding: lora-chatbot/tokenizer.json (deflated 74%)
  adding: lora-chatbot/spiece.model (deflated 48%)
  adding: lora-chatbot/tokenizer_config.json (deflated 95%)
  adding: lora-chatbot/model.safetensors (deflated 7%)
  adding: lora-chatbot/special_tokens_map.json (deflated 85%)
  adding: lora-chatbot/generation_config.json (deflated 29%)
  adding: lora-chatbot/scheduler.pt (deflated 57%)
  adding: lora-chatbot/rng_state.pth (deflated 24%)
  adding: lora-chatbot/config.json (deflated 62%)
  adding: lora-chatbot/training_args.bin (deflated 52%)
  adding: lora-chatbot/trainer_state.json (deflated 56%)
  adding: lora-chatbot/optimizer.pt (deflated 26%)
  adding: faiss_index/ (stored 0%)
  adding: faiss_index/index.faiss (deflated 7%)
  adding: faiss_index/index.pkl (deflated 68%)
  adding: corpus.txt (deflated 72%)


In [None]:
from google.colab import files
# Send rag_demo.zip (created by the zip cell) to the browser as a download.
files.download("rag_demo.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>