In [1]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.1

In [2]:
from datasets import load_dataset

def create_conversation(sample):
  system_message = "Your task is a three-way classification formulation."
  # prompt = "Giving a premise: '" + sample["premise"] + "' Giving a hypothesis: '" \
  #           + sample["hypothesis"] + "' Predict whether the hypothesis is entailed" \
  #           + ", contradicted, or neutral given the premise."
  prompt = "Giving a premise: '" + sample["premise"] + "' The parse of the premise is:'" \
              + sample["premise_binary_parse"] + "' Giving a hypothesis: '" \
              + sample["hypothesis"] + "' The parse of the hypothesis is:'" +\
              sample["hypothesis_binary_parse"] + "' Predict whether the hypothesis is entailed" \
              + ", contradicted, or neutral given the premise."
  label = sample["label"]
  if label == 0:
    answer = "entailment"
  elif label == 1:
    answer = "neutral"
  elif label == 2:
    answer = "contradiction"
  return {
    "messages": [
      {"role": "system", "content": system_message},
      {"role": "user", "content": prompt},
      {"role": "assistant", "content": answer}
    ]
  }

def create_conversation_eva(sample):
  system_message = "Your task is a three-way classification formulation."
  # prompt = "Giving a premise: '" + sample["sentence1"] + "' Giving a hypothesis: '" \
  #           + sample["sentence2"] + "' Predict whether the hypothesis is entailed" \
  #           + ", contradicted, or neutral given the premise."
  prompt = "Giving a premise: '" + sample["sentence1"] + "' The parse of the premise is:'" \
              + sample["sentence1_binary_parse"] + "' Giving a hypothesis: '" \
              + sample["sentence2"] + "' The parse of the hypothesis is:'" +\
              sample["sentence2_binary_parse"] + "' Predict whether the hypothesis is entailed" \
              + ", contradicted, or neutral given the premise."
  return {
    "messages": [
      {"role": "system", "content": system_message},
      {"role": "user", "content": prompt},
      {"role": "assistant", "content": sample["gold_label"]}
    ]
  }

dataset_train = load_dataset("json", data_files="/content/train.json", split="train")
dataset_eva_m = load_dataset("json", data_files="/content/dev_matched_sampled-1.jsonl", split="train")
dataset_eva_mism = load_dataset("json", data_files="/content/dev_mismatched_sampled-1.jsonl", split="train")
# Convert dataset to OAI messages
dataset_train = dataset_train.map(create_conversation,
                      remove_columns=dataset_train.features,batched=False)
dataset_eva_m = dataset_eva_m.map(create_conversation_eva,
                      remove_columns=dataset_eva_m.features,batched=False)
dataset_eva_mism = dataset_eva_mism.map(create_conversation_eva,
                      remove_columns=dataset_eva_mism.features,batched=False)

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format

model_id = "google/gemma-2b-it"

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model, tokenizer = setup_chat_format(model, tokenizer)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
from peft import LoraConfig
import transformers
from trl import SFTTrainer

peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.05,
        r=256,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)

args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=300,
        # Copied from other hugging face tuning blog posts
        learning_rate=2e-4,
        fp16=True,
        # It makes training faster
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    )
max_seq_length = 512 # max sequence length for model and packing of the dataset

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset_train,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)



In [5]:
trainer.train()
trainer.save_model("gemma_ft_saved")

Step,Training Loss
1,4.0394
2,3.874
3,2.8801
4,3.4526
5,2.5787
6,2.091
7,1.6847
8,1.4754
9,1.4208
10,1.3009




In [6]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline

peft_model_id = "./gemma_ft_saved"

model = AutoPeftModelForCausalLM.from_pretrained(
  peft_model_id,
  device_map="auto",
  torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCau

In [7]:
from tqdm import tqdm
import pandas as pd
import os

def evaluate(sample):
    prompt = pipe.tokenizer.apply_chat_template(sample["messages"][:2], tokenize=False, add_generation_prompt=True)
    outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)
    predicted_answer = outputs[0]['generated_text'][len(prompt):].strip()
    if predicted_answer.lower() == sample["messages"][2]["content"].lower():
        return 1, predicted_answer.lower(), sample["messages"][2]["content"].lower()
    else:
        return 0, predicted_answer.lower(), sample["messages"][2]["content"].lower()

success_rate = []
predictions = []
actuals = []


for s in tqdm(dataset_eva_m):
    x, y, z = evaluate(s)
    success_rate.append(x)
    predictions.append(y)
    actuals.append(z)

accuracy = sum(success_rate)/len(success_rate)
print(f"Accuracy: {accuracy*100:.2f}%")
final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
final_df.to_csv(os.path.join("./",'predictions_match.csv'))

success_rate = []
predictions = []
actuals = []


for s in tqdm(dataset_eva_mism):
    x, y, z = evaluate(s)
    success_rate.append(x)
    predictions.append(y)
    actuals.append(z)

accuracy = sum(success_rate)/len(success_rate)
print(f"Accuracy: {accuracy*100:.2f}%")
final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
final_df.to_csv(os.path.join("./",'predictions_mismatch.csv'))

100%|██████████| 2500/2500 [07:38<00:00,  5.46it/s]


Accuracy: 67.56%


100%|██████████| 2500/2500 [07:43<00:00,  5.39it/s]

Accuracy: 67.72%



