In [1]:
# Install necessary libraries
!pip install -U -q transformers datasets peft accelerate bitsandbytes

In [2]:
!pip install evaluate rouge_score

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=cd78f1f5db28203bfd31840c64ae469196954b3863f8570836bb534436c0a91e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.2 rouge_score-0.1.2


In [3]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, BitsAndBytesConfig
from datasets import Dataset, DatasetDict
from transformers import pipeline, BertTokenizer, BertForQuestionAnswering, BertTokenizerFast
from transformers import TrainingArguments, Trainer
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration
import nltk
import evaluate
from datasets import load_dataset
from peft import LoraConfig,get_peft_model,prepare_model_for_kbit_training,TaskType
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

2024-05-21 09:59:13.921011: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-21 09:59:13.921135: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-21 09:59:14.079811: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Load the dataset from Hugging Face
dataset = load_dataset("medalpaca/medical_meadow_medical_flashcards")
dataset = dataset['train']

Downloading readme:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/33955 [00:00<?, ? examples/s]

In [5]:
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
temp_dataset = train_test_split['test']

In [6]:
test_valid_split = temp_dataset.train_test_split(test_size=0.5, seed=42)
val_dataset = test_valid_split['train']
test_dataset = test_valid_split['test']

In [7]:
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [15]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

In [17]:
# Load the model
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small",quantization_config=bnb_config, device_map="auto")
model.config.use_cache = False

In [9]:
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
# Configure LoRA parameters
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    target_modules=target_modules,
    task_type="QUESTION_ANS",
)

In [19]:
# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

In [20]:
model = prepare_model_for_kbit_training(model)

In [21]:
model = get_peft_model(model, lora_config)

In [22]:
model.print_trainable_parameters()

trainable params: 589,824 || all params: 61,096,448 || trainable%: 0.9654


In [23]:
# Load the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [24]:
input_ids = tokenizer("capital of us", return_tensors="pt").input_ids

In [25]:
prefix = "answer the question: "

In [26]:
def preprocess_function(examples):
    inputs = [prefix + question for question in examples["input"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    labels = tokenizer(text_target = examples["output"], max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [27]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/27164 [00:00<?, ? examples/s]

Map:   0%|          | 0/3395 [00:00<?, ? examples/s]

Map:   0%|          | 0/3396 [00:00<?, ? examples/s]

In [28]:
# Set up Rouge score for evaluation
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [29]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

In [30]:
# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=10,
    predict_with_generate=True,
    push_to_hub=False
)



In [31]:
# Set up trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [32]:
# Train the model
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.4132,0.367115,0.370699,0.252565,0.331689,0.335978
2,0.3826,0.355451,0.374304,0.256779,0.333993,0.338463
3,0.3919,0.348194,0.380174,0.263275,0.339573,0.343571
4,0.3772,0.344172,0.379523,0.265326,0.341275,0.345394
5,0.3878,0.341075,0.383533,0.270118,0.345279,0.349008
6,0.3898,0.338328,0.380598,0.267352,0.344404,0.347959
7,0.3768,0.336762,0.384806,0.271353,0.347173,0.351057
8,0.3798,0.335532,0.384231,0.270301,0.346874,0.350628
9,0.3717,0.334476,0.385894,0.271662,0.348247,0.351854
10,0.3863,0.334028,0.385943,0.271374,0.347976,0.351598




TrainOutput(global_step=90550, training_loss=0.39381288374674905, metrics={'train_runtime': 29191.1667, 'train_samples_per_second': 9.306, 'train_steps_per_second': 3.102, 'total_flos': 2598478609612800.0, 'train_loss': 0.39381288374674905, 'epoch': 10.0})

In [46]:
#Save trained model
trainer.model.save_pretrained("t5_finetuned")

In [49]:
model.save_pretrained("./huggingface_model") 

In [50]:
trainer.save_model("./torch_model") 

# **Manual checking with random question from test dataset**


In [59]:
print(dataset['test'].data)

MemoryMappedTable
input: string
output: string
instruction: string
----
input: [["What is the relationship between very low Mg2+ levels, PTH levels, and Ca2+ levels?","What leads to genitourinary syndrome of menopause (atrophic vaginitis)?","What does low REM sleep latency and experiencing hallucinations/sleep paralysis suggest?","What are some possible causes of low PTH and high calcium levels?","How does the level of anti-müllerian hormone relate to ovarian reserve?",...,"What are the typical shapes of urine crystals found in uric acid kidney stones?","How can urine crystals found in uric acid kidney stones be described in terms of their shape?","What specific shapes do urine crystals take in uric acid kidney stones?","What type of paralysis may result from upper motoneuron lesions?","What is the relationship between upper motoneuron lesions and paralysis?"],["What effect do upper motoneuron lesions have on muscle tone?","How do upper motoneuron lesions affect muscle tone?","What is 

In [112]:
input_ids = tokenizer("Is it common practice to fragment RNA and protein prior to analysis?", return_tensors="pt").input_ids

In [113]:
gen_output = model.generate(input_ids=input_ids, max_length=20)[0]

In [114]:
print(tokenizer.decode(gen_output, skip_special_tokens=True))

Yes, it is common practice to fragment RNA and protein prior to analysis.


# **Compute metrics**

In [65]:
# Get answers from the dataset
test_data = dataset['test']
# Initialize the list for references
references = []

# Iterate over the test data and collect the first 500 'output' values
for index, row in enumerate(test_data):
    if index >= 500:
        break
    references.append(row['output'])

In [67]:
def generate_answer(question):
    prompt = question + " ->"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    gen_output = model.generate(input_ids=input_ids, max_length=100)[0]
    response = tokenizer.decode(gen_output, skip_special_tokens=True)
    return response

total_questions = len(test_data)
predictions = []
count = 0

# Iterate over the test data and generate answers
for index, row in enumerate(test_data):
    if count >= 500:
        break
    print(f"Processed question {count + 1} out of {total_questions}")
    question = row["input"]
    predicted_answer = generate_answer(question)
    predictions.append(predicted_answer)
    count += 1

Processed question 1 out of 3396
Processed question 2 out of 3396
Processed question 3 out of 3396
Processed question 4 out of 3396
Processed question 5 out of 3396
Processed question 6 out of 3396
Processed question 7 out of 3396
Processed question 8 out of 3396
Processed question 9 out of 3396
Processed question 10 out of 3396
Processed question 11 out of 3396
Processed question 12 out of 3396
Processed question 13 out of 3396
Processed question 14 out of 3396
Processed question 15 out of 3396
Processed question 16 out of 3396
Processed question 17 out of 3396
Processed question 18 out of 3396
Processed question 19 out of 3396
Processed question 20 out of 3396
Processed question 21 out of 3396
Processed question 22 out of 3396
Processed question 23 out of 3396
Processed question 24 out of 3396
Processed question 25 out of 3396
Processed question 26 out of 3396
Processed question 27 out of 3396
Processed question 28 out of 3396
Processed question 29 out of 3396
Processed question 30 o

In [125]:
# Function to calculate embeddings using the tokenizer
def calculate_embeddings(text_list):
    embeddings = []
    for text in text_list:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model.encoder(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy())
    return embeddings

In [126]:
# Calculate embeddings for references and predictions
reference_embeddings = calculate_embeddings(references)
prediction_embeddings = calculate_embeddings(predictions)

In [128]:
# Compute cosine similarity for each pair of embeddings
cosine_similarities = []
for ref_emb, pred_emb in zip(reference_embeddings, prediction_embeddings):
    ref_emb = ref_emb.reshape(1, -1)
    pred_emb = pred_emb.reshape(1, -1)
    cos_sim = cosine_similarity(ref_emb, pred_emb)[0][0]
    cosine_similarities.append(cos_sim)

In [129]:
# Calculate the average cosine similarity
average_cosine_similarity = sum(cosine_similarities) / len(cosine_similarities)
print(f'Average Cosine Similarity: {average_cosine_similarity:.2f}')

Average Cosine Similarity: 0.68


In [90]:
# Function to calculate BLEU score
def calculate_bleu(reference, prediction):
    reference_tokens = [nltk.word_tokenize(reference)]
    prediction_tokens = nltk.word_tokenize(prediction)
    # Using smoothing function to avoid zero scores for short sequences
    smoothing_function = SmoothingFunction().method1
    bleu_score = sentence_bleu(reference_tokens, prediction_tokens, smoothing_function=smoothing_function)
    return bleu_score

In [130]:
# Calculate BLEU scores for all predictions
bleu_scores = [calculate_bleu(ref, pred) for ref, pred in zip(references, predictions)]

In [131]:
# Calculate average BLEU score
average_bleu_score = sum(bleu_scores) / len(bleu_scores)
print(f'Average BLEU Score: {average_bleu_score:.2f}')

Average BLEU Score: 0.09
