In [1]:
!pip install -U transformers bitsandbytes accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [2]:
!pip install -U trl

Collecting trl
  Downloading trl-0.25.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.25.0-py3-none-any.whl (462 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.25.0


In [3]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.6


In [4]:
import fitz
def extract_text_pdf(pdf_path):
  text_blocks = []
  with fitz.open(pdf_path) as f:
    for page in f:
      text = page.get_text("text").strip()
      if text:
        text_blocks.append(text)

  return text_blocks

In [6]:
doc = extract_text_pdf("/content/Metformin.pdf")

In [7]:
doc

['Metformin is one of the most widely prescribed oral antihyperglycemic agents.\u200b\n Its primary mechanism of action involves the activation of AMP-activated protein kinase \n(AMPK), a central metabolic regulator that promotes glucose uptake and fatty acid oxidation \nwhile inhibiting hepatic gluconeogenesis.\u200b\n Beyond its glycemic control, Metformin has been shown to improve cardiovascular outcomes \nand display anti-inflammatory properties.\u200b\n Recent studies also suggest potential anticancer effects through inhibition of the mTOR \nsignaling pathway and suppression of tumor angiogenesis. \n \nClinical trials have demonstrated that combining Atorvastatin with Ezetimibe results in \nsignificant reductions in low-density lipoprotein cholesterol (LDL-C) levels compared to \nmonotherapy.\u200b\n Ezetimibe acts by inhibiting the Niemann–Pick C1-like 1 (NPC1L1) transporter in the intestinal \nwall, reducing cholesterol absorption, while Atorvastatin inhibits hepatic HMG-CoA red

In [8]:
import re
def split_paragraphs(pages):
  paragraphs = []
  for page_text in pages:
    # Split on double lines breaks on long newlines
    chunks = re.split(r"\n\s*\n", page_text)
    for chunk in chunks:
      clean = chunk.strip()
      if len(clean) > 30: # ignore too short lines
         paragraphs.append(clean)
  return paragraphs

In [9]:
paragraphs = split_paragraphs(doc)

In [10]:
data = [{"text": p} for p in paragraphs]

In [11]:
data

[{'text': 'Metformin is one of the most widely prescribed oral antihyperglycemic agents.\u200b\n Its primary mechanism of action involves the activation of AMP-activated protein kinase \n(AMPK), a central metabolic regulator that promotes glucose uptake and fatty acid oxidation \nwhile inhibiting hepatic gluconeogenesis.\u200b\n Beyond its glycemic control, Metformin has been shown to improve cardiovascular outcomes \nand display anti-inflammatory properties.\u200b\n Recent studies also suggest potential anticancer effects through inhibition of the mTOR \nsignaling pathway and suppression of tumor angiogenesis.'},
 {'text': 'Clinical trials have demonstrated that combining Atorvastatin with Ezetimibe results in \nsignificant reductions in low-density lipoprotein cholesterol (LDL-C) levels compared to \nmonotherapy.\u200b\n Ezetimibe acts by inhibiting the Niemann–Pick C1-like 1 (NPC1L1) transporter in the intestinal \nwall, reducing cholesterol absorption, while Atorvastatin inhibits h

In [12]:
from datasets import Dataset
dataset = Dataset.from_list(data)

In [13]:
dataset

Dataset({
    features: ['text'],
    num_rows: 4
})

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [16]:
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token

In [17]:
def tokenize_fn(examples):
  tokens = tokenizer(examples["text"], truncation = True, padding = "max_length", max_length = 512)
  tokens['labels'] = tokens['input_ids'].copy()
  return tokens

In [18]:
tokenized = dataset.map(tokenize_fn, batched = True, remove_columns = ['text'])

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [19]:
tokenized

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 4
})

In [20]:
model = AutoModelForCausalLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

In [21]:
training_args = TrainingArguments(
    output_dir = "./llama_pharama_model",
    overwrite_output_dir = True,
    num_train_epochs = 2,
    per_device_train_batch_size = 2,
    save_steps = 500,
    save_total_limit = 2,
    logging_steps = 50,
    learning_rate = 2e-5,
    fp16 = True,
    report_to = 'none'
)

In [22]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized
)

In [24]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 44.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 20.12 MiB is free. Process 2658 has 14.72 GiB memory in use. Of the allocated memory 14.43 GiB is allocated by PyTorch, and 154.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### **LORA**

In [23]:
!pip install -U transformers bitsandbytes accelerate



In [25]:
from peft import LoraConfig, get_peft_model, TaskType

In [41]:
device = "cuda" if torch.cuda.is_available else "cpu"

In [26]:
model_name

'TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T'

In [27]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [28]:
if tokenizer.pad_token:
  tokenizer.pad_token = tokenizer.eos_token

In [46]:
from transformers import BitsAndBytesConfig
import torch

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # device_map="auto"
)

In [47]:
lora_config = LoraConfig(
    task_type = TaskType.CAUSAL_LM,
    r = 8,
    lora_alpha = 16,
    target_modules = ["q_proj", "v_proj"],
    lora_dropout = 0.05,
    bias = "none"
)

In [48]:
q_lora_model = get_peft_model(model, lora_config)

In [49]:
q_lora_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Li

In [53]:
training_args = TrainingArguments(
    output_dir = "./llama_pharama_Qmodel",
    overwrite_output_dir = True,
    num_train_epochs = 20,
    per_device_train_batch_size = 2,
    save_steps = 500,
    save_total_limit = 2,
    logging_steps = 50,
    learning_rate = 2e-5,
    fp16 = True,
    report_to = 'none',
)

In [54]:
trainer = Trainer(
    model = q_lora_model,
    args = training_args,
    train_dataset = tokenized
)

In [55]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=40, training_loss=9.324079895019532, metrics={'train_runtime': 24.6595, 'train_samples_per_second': 3.244, 'train_steps_per_second': 1.622, 'total_flos': 254518587555840.0, 'train_loss': 9.324079895019532, 'epoch': 20.0})

In [56]:
trained_model_path = "/content/llama_pharama_Qmodel/checkpoint-40"

In [62]:
tuned_model = AutoModelForCausalLM.from_pretrained(
    trained_model_path
).to(device)

In [63]:
prompt = "Clinical trails demonstrated that combining Atorvastatin with Ezetimibe"

inputs = tokenizer(prompt, return_tensors = "pt").to(device)

In [64]:
outputs = tuned_model.generate(
    **inputs,
    max_new_tokens = 100,
    temperature = 0.8,
    top_p = 0.9,
    do_sample = True,
    repetition_penalty = 1.1
)

In [66]:
print(f"\nModel Output:\n")
print(tokenizer.decode(outputs[0], skip_special_tokens = True))


Model Output:

Clinical trails demonstrated that combining Atorvastatin with Ezetimibe could reduce the risk of heart attacks, strokes and deaths.
Dr. Srinivasan explained the results of the clinical trials for the drug's safety in a recent press release by stating that "the two drugs together provided significant benefits to patients without any serious side effects." Dr. Srinivasan continued by saying that he would like to see more research done on the drug in order to see how it may work on humans. The new drug is expected to be
