# Fine-tune Llama 2 on iSarcasmEval dataset
### This notebook is inspired by [Fine-tune Llama 2 for sentiment analysis](https://www.kaggle.com/code/lucamassaron/fine-tune-llama-2-for-sentiment-analysis/notebook) by **Luca Massaron** and [Fine-Tuning LLaMA 2](https://www.datacamp.com/tutorial/fine-tuning-llama-2)

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; the Llama 2 checkpoint used later in this
# notebook is fetched from the Hub by name.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install -q -U "torch==2.1.2" tensorboard
!pip install -q -U "transformers==4.36.2" "datasets==2.16.1" "accelerate==0.26.1" "bitsandbytes==0.42.0"
!pip install -q -U git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e
!pip install -q -U git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f
import os
# Expose only GPU 0 to CUDA and switch off tokenizer-level parallelism.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

## Preparing Data

In [None]:
!git clone https://github.com/iabufarha/iSarcasmEval.git

Cloning into 'iSarcasmEval'...
remote: Enumerating objects: 72, done.[K
remote: Counting objects: 100% (72/72), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 72 (delta 25), reused 32 (delta 6), pack-reused 0[K
Receiving objects: 100% (72/72), 535.16 KiB | 5.95 MiB/s, done.
Resolving deltas: 100% (25/25), done.


In [None]:
# Path of the English training split inside the cloned iSarcasmEval repository.
filename ='/content/iSarcasmEval/train/train.En.csv'
# Load only the tweet text and its sarcasm label; bytes that are not valid UTF-8
# are replaced instead of raising a decode error.
X = pd.read_csv(filename,
                 usecols=["tweet", "sarcastic"],
                 encoding="utf-8", encoding_errors="replace")



In [None]:
X

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1
...,...,...
3463,The population spike in Chicago in 9 months is...,0
3464,You'd think in the second to last English clas...,0
3465,I’m finally surfacing after a holiday to Scotl...,0
3466,Couldn't be prouder today. Well done to every ...,0


In [None]:
def generate_prompt(data_point):
    """Build the full training prompt (instruction, tweet and label) for one row.

    Args:
        data_point: Mapping-like row exposing the keys "tweet" and "sarcastic".

    Returns:
        The instruction text followed by ``[<tweet>] = <label>``, with leading and
        trailing whitespace stripped.
    """
    tweet = data_point["tweet"]
    label = data_point["sarcastic"]
    prompt = f"""
            Determine the whether the tweet enclosed in square brackets is sarcastic or non-sarcastic,
            and return the answer as the corresponding label "1" for sarcastic or "0" for non-sarcastic.

            [{tweet}] = {label}
            """
    # Only the outer whitespace is removed; the indentation of the inner lines is
    # part of the prompt text and is kept as is.
    return prompt.strip()

# Format every labelled tweet as a complete training prompt; the frame keeps the
# original row labels as its index.
X = pd.DataFrame(X.apply(generate_prompt, axis=1),
                       columns=["text"])

# Shuffle with a fixed seed so the train/validation split is the same on every run.
# X_train.index is reused later to pick the rows to augment, so an unseeded shuffle
# would silently change both the split and the augmented set between runs.
X = X.sample(frac=1, random_state=42)

# 80% of the shuffled rows go to training, the remainder to validation.
train_size = int(len(X)*0.8)
val_size = len(X)-train_size

X_train = X[:train_size]
X_val = X[train_size:]

In [None]:
X_train

Unnamed: 0,text
359,Determine the whether the tweet enclosed in sq...
632,Determine the whether the tweet enclosed in sq...
2275,Determine the whether the tweet enclosed in sq...
822,Determine the whether the tweet enclosed in sq...
2945,Determine the whether the tweet enclosed in sq...
...,...
2371,Determine the whether the tweet enclosed in sq...
2605,Determine the whether the tweet enclosed in sq...
1715,Determine the whether the tweet enclosed in sq...
793,Determine the whether the tweet enclosed in sq...


In [None]:
X_val

Unnamed: 0,text
2540,Determine the whether the tweet enclosed in sq...
376,Determine the whether the tweet enclosed in sq...
1604,Determine the whether the tweet enclosed in sq...
2237,Determine the whether the tweet enclosed in sq...
1584,Determine the whether the tweet enclosed in sq...
...,...
954,Determine the whether the tweet enclosed in sq...
471,Determine the whether the tweet enclosed in sq...
763,Determine the whether the tweet enclosed in sq...
2024,Determine the whether the tweet enclosed in sq...


In [None]:
# Path of the English Task A test split inside the cloned iSarcasmEval repository.
test_filename ='/content/iSarcasmEval/test/task_A_En_test.csv'

# Load the test text and its label; invalid UTF-8 bytes are replaced.
X_test = pd.read_csv(test_filename,
                 usecols=["text", "sarcastic"],
                #  names = ["tweet", "sarcastic"],
                 encoding="utf-8", encoding_errors="replace")

# The test file calls its text column "text"; rename it to "tweet", the key that
# generate_test_prompt reads.
X_test = X_test.rename(columns={'text':'tweet'})

# Shuffle the row order (no seed is given, so the order differs between runs).
X_test = X_test.sample(frac=1)

def generate_test_prompt(data_point):
    """Build the inference prompt for one row, leaving the label to be generated.

    Args:
        data_point: Mapping-like row exposing the key "tweet".

    Returns:
        The instruction text followed by ``[<tweet>] =``; the surrounding whitespace
        (including the space after the equals sign) is stripped.
    """
    tweet = data_point["tweet"]
    template = f"""
            Determine the whether the tweet enclosed in square brackets is sarcastic or non-sarcastic,
            and return the answer as the corresponding label "1" for sarcastic or "0" for non-sarcastic.

            [{tweet}] = """
    return template.strip()


# Take the labels before X_test is rebound to the prompt frame: both are derived from
# the same shuffled frame, so labels and prompts stay aligned row for row.
y_test = X_test['sarcastic']
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

In [None]:
X_test

Unnamed: 0,text
1153,Determine the whether the tweet enclosed in sq...
852,Determine the whether the tweet enclosed in sq...
944,Determine the whether the tweet enclosed in sq...
135,Determine the whether the tweet enclosed in sq...
840,Determine the whether the tweet enclosed in sq...
...,...
1368,Determine the whether the tweet enclosed in sq...
252,Determine the whether the tweet enclosed in sq...
112,Determine the whether the tweet enclosed in sq...
1208,Determine the whether the tweet enclosed in sq...


## Functions

In [None]:
def evaluate(y_true, y_pred):
    """Print overall accuracy, per-label accuracy, a classification report and a
    confusion matrix for a set of predictions.

    Args:
        y_true: Sequence of ground-truth labels (list, NumPy array or pandas Series).
        y_pred: Sequence of predicted labels, aligned position by position with
            ``y_true``.

    Returns:
        None. All results are printed.
    """
    # Work on positional arrays. Indexing a pandas Series with ``y_true[i]`` is
    # label-based, so a Series with a shuffled index would be matched against the
    # wrong predictions; arrays make the pairing purely positional.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Overall accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Accuracy restricted to the samples of each true label (visited in sorted order)
    for label in np.unique(y_true):
        mask = y_true == label
        accuracy = accuracy_score(y_true[mask], y_pred[mask])
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Precision / recall / F1 per label
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Confusion matrix with rows and columns fixed to the label order [0, 1]
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [None]:
# Dtype used for computation on top of the 4-bit quantized weights.
compute_dtype = torch.float16

# 4-bit NF4 quantization with double quantization enabled.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)


In [None]:
# Use the first CUDA GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"working on {device}")

working on cuda:0


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",trust_remote_code=True,)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                             device_map=device,
                                              torch_dtype=compute_dtype,
                                              quantization_config=bnb_config,)

model.config.use_cache = False
model.config.pretraining_tp = 1
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model, tokenizer = setup_chat_format(model, tokenizer)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [None]:
def predict(test, model, tokenizer):
    """Generate a 0/1 sarcasm prediction for every prompt in ``test``.

    Args:
        test: DataFrame with a "text" column holding one inference prompt per row.
        model: Causal language model used for generation.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        A list of ints (1 = sarcastic, 0 = non-sarcastic), in the row order of
        ``test``. Answers containing neither "1" nor "0" are counted as 0.
    """
    # Build the generation pipeline once; it does not depend on the prompt, so
    # re-creating it for every row only adds overhead.
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=2
                   )
    y_pred = []
    # Iterate over the `test` argument (not a notebook-level global) so the function
    # scores whatever frame the caller passes in.
    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        result = pipe(prompt)
        # Keep the text after the last "=", i.e. what follows the prompt's "[tweet] =".
        answer = result[0]['generated_text'].split("=")[-1]
        # "1" wins when present; everything else (including no digit) maps to 0.
        y_pred.append(1 if "1" in answer else 0)
    return y_pred

## Fine-tuning

In [None]:
# Wrap the training and validation prompt frames as datasets for the trainer.
train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_val)

In [None]:
output_dir="trained_weigths"

# LoRA adapter settings: rank 64, alpha 16, dropout 0.1, bias terms not trained.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory where the trainer writes its outputs
    num_train_epochs=4,                       # number of training epochs
    per_device_train_batch_size=4,            # batch size per device during training
    gradient_accumulation_steps=1,            # batches accumulated per optimizer update (1 = no accumulation)
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,                         # log every 25 steps
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=True,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="tensorboard",                  # report metrics to tensorboard
    evaluation_strategy="epoch"               # run evaluation at the end of every epoch
)



In [None]:
# Supervised fine-tuning of `model` with the LoRA config, trained on the prompt
# strings in the "text" column and evaluated on the held-out validation prompts.
# Sequences are capped at 1024 tokens and packing is disabled.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=1024,
    packing=False,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    }
)

Map:   0%|          | 0/2774 [00:00<?, ? examples/s]

Map:   0%|          | 0/694 [00:00<?, ? examples/s]

In [None]:
trainer.train()

## Evaluate

In [None]:
y_pred = predict(X_test, model, tokenizer)

100%|██████████| 1400/1400 [06:56<00:00,  3.36it/s]


In [None]:
evaluate(y_test.tolist(), y_pred)

Accuracy: 0.821
Accuracy for label 0: 0.837
Accuracy for label 1: 0.730

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.84      0.89      1200
           1       0.43      0.73      0.54       200

    accuracy                           0.82      1400
   macro avg       0.69      0.78      0.71      1400
weighted avg       0.87      0.82      0.84      1400


Confusion Matrix:
[[1004  196]
 [  54  146]]


# Data Augmentation: Back Translation

In [None]:
#https://gist.github.com/kylegallatin/fcadec9e4c5071251fe96ef5643307c4
#https://huggingface.co/docs/transformers/model_doc/marian

from transformers import MarianTokenizer, MarianMTModel
# English -> French model and tokenizer (first leg of the round trip), moved to CUDA.
model_name_1 = 'Helsinki-NLP/opus-mt-en-fr'
model_1 = MarianMTModel.from_pretrained(model_name_1).to('cuda')
tokenizer_1 = MarianTokenizer.from_pretrained(model_name_1)

# French -> English model and tokenizer (second leg of the round trip), moved to CUDA.
model_name_2 = 'Helsinki-NLP/opus-mt-fr-en'
model_2 = MarianMTModel.from_pretrained(model_name_2).to('cuda')
tokenizer_2 = MarianTokenizer.from_pretrained(model_name_2)

def back_translate(text):
  """Paraphrase English text by translating it to French and back to English.

  Args:
    text: A string, or a list of strings, to back-translate.

  Returns:
    A list of back-translated strings, one per input text. A single string
    input therefore yields a one-element list.
  """
  # Tokenized inputs are moved to the device each model actually lives on, rather
  # than to a hard-coded 'cuda', so the function follows wherever the models are.
  forward_inputs = tokenizer_1(text, return_tensors="pt", padding=True).to(model_1.device)
  forward_ids = model_1.generate(**forward_inputs)
  french_texts = tokenizer_1.batch_decode(forward_ids, skip_special_tokens=True)

  # Second leg: French back to English.
  backward_inputs = tokenizer_2(french_texts, return_tensors="pt", padding=True).to(model_2.device)
  backward_ids = model_2.generate(**backward_inputs)
  return tokenizer_2.batch_decode(backward_ids, skip_special_tokens=True)

def bt_data_point(data_point):
  """Back-translate the "tweet" field of one row.

  The value is converted with ``str`` first, then passed to ``back_translate``,
  whose result is returned unchanged.
  """
  tweet_text = str(data_point["tweet"])
  return back_translate(tweet_text)

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

In [None]:
# Re-read the raw training CSV: `X` was rebound to the prompt frame earlier, so the
# original "tweet" / "sarcastic" columns are loaded again here.
X_original = pd.read_csv(filename,
                 usecols=["tweet", "sarcastic"],
                 encoding="utf-8", encoding_errors="replace")

In [None]:
# Select the raw rows that belong to the training split. This relies on X_train still
# carrying the original row labels as its index, so validation rows are not augmented.
X_to_augment = X_original.loc[X_train.index]

In [None]:
# Peek at one tweet selected for augmentation. Positional access is used because a
# hard-coded row label is only present when the shuffle happens to place that row in
# the training split; otherwise the lookup raises KeyError.
X_to_augment.iloc[0].tweet

'One of my workmates got me bubble tea for free bc she works in one of the shops 😭 my heart 😭'

In [None]:
# Work on a copy so the new column added next does not modify X_to_augment.
X_augmented = X_to_augment.copy()

In [None]:
# bt_data_point returns back_translate's list (one string per input), so each row's
# result is a one-element list. Unwrap it so the column holds plain strings that can
# be compared with "tweet" and formatted into prompts; values that are already
# strings are left untouched.
X_augmented["back_translated"] = X_augmented.apply(bt_data_point, axis=1).map(
    lambda result: result[0] if isinstance(result, list) else result
)

In [None]:
X_augmented

Unnamed: 0,tweet,sarcastic,back_translated
1348,1 in 1000 men will develop breast cancer in th...,0,1 in 1000 men will develop breast cancer in th...
3385,Some man just cat called me by saying “let me ...,0,"A cat called me and said, ""Let me take you hom..."
1039,Drake literally ended everyone in the industry...,0,Drake literally put an end to everyone in the ...
2979,@SenRickScott @marcorubio \nDoctors today are ...,0,@SenRickScott @marcorubio Doctors are today wh...
1575,@jbrylinsabres15 @JoeCaramagna @FriedgeHNIC @N...,0,@jbrylinsabres15 @JoeCaramagna @FriedgeHNIC @N...
...,...,...,...
644,Got my first shot. Feeling pretty good. Kinda ...,1,I had my first shot. Feeling pretty good. A li...
1581,Is only 12 hours away from my first #raceforli...,0,Is just 12 hours from my first #raceforlife in...
763,Currently bombing aces on the volleyball court...,1,Currently bombing ace on the volleyball field ...
1799,Get the Police involved.\nRefuse to sail until...,0,Engage the police. Refuse to navigate until th...


In [None]:
# Keep only the rows whose back-translation differs from the original tweet, and
# retain just the back-translated text and its label.
X_augmented = X_augmented[X_augmented["tweet"]!=X_augmented["back_translated"]][["back_translated","sarcastic"]]

In [None]:
# Rename the column to "tweet" so these rows have the same schema as the raw rows.
X_augmented_ = X_augmented.rename(columns={'back_translated': 'tweet'})

In [None]:
X_augmented_

Unnamed: 0,tweet,sarcastic
1348,1 in 1000 men will develop breast cancer in th...,0
3385,"A cat called me and said, ""Let me take you hom...",0
1039,Drake literally put an end to everyone in the ...,0
2979,@SenRickScott @marcorubio Doctors are today wh...,0
1575,@jbrylinsabres15 @JoeCaramagna @FriedgeHNIC @N...,0
...,...,...
644,I had my first shot. Feeling pretty good. A li...,1
1581,Is just 12 hours from my first #raceforlife in...,0
763,Currently bombing ace on the volleyball field ...,1
1799,Engage the police. Refuse to navigate until th...,0


In [None]:
# Stack the original training rows and the back-translated rows into one frame with a
# fresh 0..n-1 index. X_augmented is rebound here to the combined frame.
X_augmented = pd.concat([X_to_augment, X_augmented_], ignore_index=True)

In [None]:
# Turn the combined original + back-translated rows into training prompts.
# X_augmented_ is rebound here: from this point on it is the prompt frame.
X_augmented_ = pd.DataFrame(X_augmented.apply(generate_prompt, axis=1),
                       columns=["text"])

In [None]:
X_val

Unnamed: 0,text
361,Determine the whether the tweet enclosed in sq...
1460,Determine the whether the tweet enclosed in sq...
3386,Determine the whether the tweet enclosed in sq...
720,Determine the whether the tweet enclosed in sq...
3439,Determine the whether the tweet enclosed in sq...
...,...
1543,Determine the whether the tweet enclosed in sq...
1267,Determine the whether the tweet enclosed in sq...
1129,Determine the whether the tweet enclosed in sq...
3053,Determine the whether the tweet enclosed in sq...


In [None]:
# Augmented prompts become the training set; the validation set is the same X_val
# frame used for the first run.
train_data_aug = Dataset.from_pandas(X_augmented_)
eval_data = Dataset.from_pandas(X_val)

In [None]:
# Same output directory and the same settings as the first training run.
output_dir="trained_weigths"

# LoRA adapter settings: rank 64, alpha 16, dropout 0.1, bias terms not trained.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory where the trainer writes its outputs
    num_train_epochs=4,                       # number of training epochs
    per_device_train_batch_size=4,            # batch size per device during training
    gradient_accumulation_steps=1,            # batches accumulated per optimizer update (1 = no accumulation)
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,                         # log every 25 steps
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=True,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="tensorboard",                  # report metrics to tensorboard
    evaluation_strategy="epoch"               # run evaluation at the end of every epoch
)



In [None]:
# Second fine-tuning run, this time on the augmented training prompts.
# NOTE(review): `model` is the same object that was passed to the first SFTTrainer and
# trained there; presumably it still carries that run's adapter weights, so this run
# would continue from the first fine-tune rather than from the freshly loaded base
# model. Confirm this is intended - for a clean comparison of "with vs. without
# augmentation", reload the base model before building this trainer.
trainer2 = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data_aug,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=1024,
    packing=False,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    }
)

Map:   0%|          | 0/5471 [00:00<?, ? examples/s]

Map:   0%|          | 0/694 [00:00<?, ? examples/s]

In [None]:
trainer2.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.7353,1.020386
2,0.7472,1.0394
3,0.6897,1.104806
4,0.5626,1.151065


TrainOutput(global_step=5472, training_loss=0.8574707309405009, metrics={'train_runtime': 3840.3171, 'train_samples_per_second': 5.698, 'train_steps_per_second': 1.425, 'total_flos': 8.077981083979776e+16, 'train_loss': 0.8574707309405009, 'epoch': 4.0})

In [None]:
y_pred = predict(X_test, model, tokenizer)

100%|██████████| 1400/1400 [06:57<00:00,  3.35it/s]


# Result

In [None]:
evaluate(y_test.tolist(), y_pred)

Accuracy: 0.804
Accuracy for label 0: 0.805
Accuracy for label 1: 0.795

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.81      0.88      1200
           1       0.40      0.80      0.54       200

    accuracy                           0.80      1400
   macro avg       0.68      0.80      0.71      1400
weighted avg       0.88      0.80      0.83      1400


Confusion Matrix:
[[966 234]
 [ 41 159]]
