In [1]:
pip install evaluate rouge-score transformers nltk peft

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=764e19e14645bfd7a7ea9bba880cf880bf9c8491d697ce7550693cc915686186
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985

In [3]:
import evaluate
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from nltk.tokenize import sent_tokenize
from peft import LoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from transformers.trainer_utils import get_last_checkpoint


# Fetch the customer-support corpus and hold out 20% of its rows for evaluation
dataset_name = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"
dataset = load_dataset(path=dataset_name, split="train")
df_train = dataset.to_pandas()
train_data, eval_data = train_test_split(
    df_train,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is identical on every run
)

# The base checkpoint provides both the seq2seq model and its tokenizer
model_id = "google/flan-t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Preprocess function
def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of instruction/response pairs for seq2seq training.

    Args:
        sample: Batch mapping with "instruction" and "response" entries
            (the text passed to the module-level ``tokenizer``).
        padding: Padding strategy forwarded to the tokenizer for both the
            inputs and the targets.

    Returns:
        The tokenizer output for the instructions, with an added "labels"
        entry holding the token ids of the responses. When ``padding`` is
        "max_length", pad-token ids inside the labels are replaced by -100.
    """
    shared_kwargs = dict(max_length=256, padding=padding, truncation=True)
    encoded = tokenizer(sample["instruction"], **shared_kwargs)
    target_ids = tokenizer(sample["response"], **shared_kwargs)["input_ids"]

    if padding == "max_length":
        # Swap padding positions for -100, the same label pad id the data collator is given
        pad_id = tokenizer.pad_token_id
        target_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in target_ids
        ]

    encoded["labels"] = target_ids
    return encoded

# Tokenize datasets
# train_test_split keeps the original (shuffled) row labels as the DataFrame index.
# Without preserve_index=False, Dataset.from_pandas stores that index as an extra
# `__index_level_0__` column that is not in the removal list below.
raw_columns = ['flags', 'instruction', 'category', 'intent', 'response']
train_tokenized_dataset = Dataset.from_pandas(train_data, preserve_index=False).map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)
test_tokenized_dataset = Dataset.from_pandas(eval_data, preserve_index=False).map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)

# Wrap the base model with LoRA adapters so only the adapter weights are trained
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Batch collator; labels are padded with the same -100 id that preprocess_function writes
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
)

# Training hyperparameters; loss is logged and a checkpoint is written once per epoch
# NOTE(review): reporting is left at the library default. In the recorded run this made
# wandb prompt interactively for an API key, which blocks an unattended "Run All".
output_dir = "/kaggle/working/lora-flan-t5-small-chat"
training_kwargs = dict(
    output_dir=output_dir,
    learning_rate=1e-3,
    num_train_epochs=3,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
)
training_args = Seq2SeqTrainingArguments(**training_kwargs)

# Assemble the trainer and run fine-tuning
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
    data_collator=data_collator,
)
model.config.use_cache = False
trainer.train()

README.md:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

(…)t_Training_Dataset_27K_responses-v11.csv:   0%|          | 0.00/19.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26872 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/21497 [00:00<?, ? examples/s]

Map:   0%|          | 0/5375 [00:00<?, ? examples/s]

trainable params: 688,128 || all params: 77,649,280 || trainable%: 0.8862


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113737988888614, max=1.0…

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
1344,1.7924
2688,1.5154
4032,1.4453


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=4032, training_loss=1.5843820117768788, metrics={'train_runtime': 1714.3001, 'train_samples_per_second': 37.619, 'train_steps_per_second': 2.352, 'total_flos': 6062296973967360.0, 'train_loss': 1.5843820117768788, 'epoch': 3.0})

In [None]:
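# Optional: save the trained LoRA adapter and the tokenizer to a separate directory.
# Uncommenting this cell requires `import os`. The tokenizer is saved directly because
# the trainer above was created without a tokenizer, so `trainer.tokenizer` is None.
# output_dir = "/kaggle/working/flan_t5/lora-flan-t5-small-chat"
# os.makedirs(output_dir, exist_ok=True)
# trainer.model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)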

In [4]:
# Load the original model and LoRA fine-tuned model
# Use the GPU when one is available and fall back to CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Baseline: the untouched base checkpoint
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)
original_tokenizer = AutoTokenizer.from_pretrained(model_id)

# Resolve the newest checkpoint written under the training output directory instead of
# hard-coding a step number, which changes with dataset size, batch size, GPU count and epochs.
peft_model_id = get_last_checkpoint(output_dir)
if peft_model_id is None:
    raise FileNotFoundError(
        f"No checkpoint-* directory found in {output_dir!r}; run training first."
    )

# Rebuild the base model named in the adapter config, then attach the trained adapter
config = PeftConfig.from_pretrained(peft_model_id)
peft_model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
peft_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
peft_model = PeftModel.from_pretrained(peft_model, peft_model_id).to(device)
peft_model.eval()



PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 512)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 512)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=512, out_features=384, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=512, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=384, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
             

In [5]:
# Evaluate on the first 150 held-out rows: instructions are the prompts,
# responses are the references the generated text is scored against
eval_subset = eval_data.iloc[:150]
test_data = eval_subset['instruction'].tolist()
response = eval_subset['response'].tolist()

# Generate responses using both models
def generate_responses(model, tokenizer, inputs):
    """Generate one sampled response per prompt.

    Each prompt is tokenized on its own (truncated to 256 tokens), moved to
    the device the model lives on, and decoded after nucleus sampling
    (top_p=0.9, max_length=256).

    Args:
        model: Object exposing ``device`` and ``generate(...)``.
        tokenizer: Callable that encodes a prompt into ``input_ids`` and
            ``attention_mask`` tensors and provides ``decode``.
        inputs: Iterable of prompt strings.

    Returns:
        List of decoded response strings, in the same order as ``inputs``.
    """
    # Follow the model's device instead of assuming the default CUDA device,
    # so the function also works for CPU models or models on another GPU.
    device = model.device
    responses = []
    for inp in inputs:
        encoded = tokenizer(inp, return_tensors="pt", truncation=True, max_length=256)
        outputs = model.generate(
            input_ids=encoded.input_ids.to(device),
            attention_mask=encoded.attention_mask.to(device),
            do_sample=True,
            top_p=0.9,
            max_length=256,
        )
        responses.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    return responses

# generate_responses samples tokens, so reseed before each model's pass; otherwise the
# scores below change on every execution and the two models draw from different streams.
GENERATION_SEED = 42

torch.manual_seed(GENERATION_SEED)
original_model_responses = generate_responses(original_model, original_tokenizer, test_data)

torch.manual_seed(GENERATION_SEED)
peft_model_responses = generate_responses(peft_model, peft_tokenizer, test_data)

# Compute ROUGE scores (both models are scored against the same references and settings)
rouge = evaluate.load('rouge')
rouge_kwargs = dict(references=response, use_aggregator=True, use_stemmer=True)
original_model_results = rouge.compute(
    predictions=original_model_responses,
    **rouge_kwargs,
)
peft_model_results = rouge.compute(
    predictions=peft_model_responses,
    **rouge_kwargs,
)

# Print ROUGE scores
print('\nOriginal Model ROUGE Scores:')
print(original_model_results)
print('\nPEFT Model ROUGE Scores:')
print(peft_model_results)


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]


Original Model ROUGE Scores:
{'rouge1': 0.07738113835765134, 'rouge2': 0.017617387788871293, 'rougeL': 0.06259720008228106, 'rougeLsum': 0.06401729186231758}

PEFT Model ROUGE Scores:
{'rouge1': 0.44270525310677267, 'rouge2': 0.181763302085444, 'rougeL': 0.2878179321772827, 'rougeLsum': 0.30936962797949247}


In [7]:
# Score both sets of generations with BLEU against the same references
bleu = evaluate.load('bleu')

# Each prediction gets a list of references; here there is exactly one per example
bleu_references = [[ref] for ref in response]

original_model_bleu = bleu.compute(
    predictions=original_model_responses,
    references=bleu_references,
)
peft_model_bleu = bleu.compute(
    predictions=peft_model_responses,
    references=bleu_references,
)

# Report the baseline first, then the fine-tuned model
for heading, scores in (
    ('\nOriginal Model BLEU Score:', original_model_bleu),
    ('\nPEFT Model BLEU Score:', peft_model_bleu),
):
    print(heading)
    print(scores)


Original Model BLEU Score:
{'bleu': 2.6740041175707e-07, 'precisions': [0.4480836236933798, 0.08786936236391912, 0.029488291413703384, 0.015518913676042677], 'brevity_penalty': 4.104263196064477e-06, 'length_ratio': 0.07460746594572112, 'translation_length': 1435, 'reference_length': 19234}

PEFT Model BLEU Score:
{'bleu': 0.13296333953604048, 'precisions': [0.4446771587577045, 0.18507336513495562, 0.10499055511547133, 0.06617059221450096], 'brevity_penalty': 0.8598662737517796, 'length_ratio': 0.868826037225746, 'translation_length': 16711, 'reference_length': 19234}
