# Multi-Task Learning DA with 
### This notebook is  inspired by [Fine-tune Llama 2 for sentiment analysis](https://www.kaggle.com/code/lucamassaron/fine-tune-llama-2-for-sentiment-analysis/notebook)  by **Luca Massaron** and [Fine-Tuning LLaMA 2](https://www.datacamp.com/tutorial/fine-tuning-llama-2)

In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [3]:
import numpy as np
import pandas as pd
import re

## Preparing Data

In [4]:
!git clone https://github.com/iabufarha/iSarcasmEval.git
!git clone https://github.com/headacheboy/data-of-multimodal-sarcasm-detection.git

fatal: destination path 'iSarcasmEval' already exists and is not an empty directory.
fatal: destination path 'data-of-multimodal-sarcasm-detection' already exists and is not an empty directory.


In [5]:
import re
import random
random.seed(42)

In [6]:
filename ='back_translation.csv'
X = pd.read_csv(filename,
                 usecols=["tweet", "sarcastic"],
                 encoding="utf-8", encoding_errors="replace")



In [7]:
def generate_prompt(data_point):
    return f"""
            Determine the whether the tweet enclosed in square brackets is sarcastic or non-sarcastic,
            and return the answer as the corresponding label "1" for sarcastic or "0" for non-sarcastic.

            Some words like "really", "never", "actually", etc. can be considered a symbol of sarcasm and express some contradictory and criticized attitudes.

            We also notice that some misspelled words (e.g., "so"->"soooo", "love"->"looove", "sure"->"sureeee") and capitalized words (not located at the beginning of a sentence) can sometimes exaggerate the emotional expression.


            [{data_point["tweet"]}] = {data_point["sarcastic"]}
            """.strip()

X = pd.DataFrame(X.apply(generate_prompt, axis=1),
                       columns=["text"])

X = X.sample(frac=1)

train_size = int(len(X)*0.8)
val_size = len(X)-train_size

X_train = X[:train_size]
X_val = X[train_size:]

In [8]:
X_train

Unnamed: 0,text
6524,Determine the whether the tweet enclosed in sq...
6392,Determine the whether the tweet enclosed in sq...
5593,Determine the whether the tweet enclosed in sq...
5552,Determine the whether the tweet enclosed in sq...
4732,Determine the whether the tweet enclosed in sq...
...,...
441,Determine the whether the tweet enclosed in sq...
5720,Determine the whether the tweet enclosed in sq...
3325,Determine the whether the tweet enclosed in sq...
2582,Determine the whether the tweet enclosed in sq...


In [9]:
test_filename ='iSarcasmEval/test/task_A_En_test.csv'

X_test = pd.read_csv(test_filename,
                 usecols=["text", "sarcastic"],
                #  names = ["tweet", "sarcastic"],
                 encoding="utf-8", encoding_errors="replace")

X_test = X_test.rename(columns={'text':'tweet'})

X_test = X_test.sample(frac=1)

def generate_test_prompt(data_point):
    return f"""
            Determine whether the tweet enclosed in square brackets is sarcastic or non-sarcastic,
            and return the answer as the corresponding label "1" for sarcastic or "0" for non-sarcastic.

            Some words like "really", "never", "actually", etc. can be considered a symbol of sarcasm and express some contradictory and criticized attitudes.

            We also notice that some misspelled words (e.g., "so"->"soooo", "love"->"looove", "sure"->"sureeee") and capitalized words (not located at the beginning of a sentence) can sometimes exaggerate the emotional expression.

            [{data_point["tweet"]}] = """.strip()


y_test = X_test['sarcastic']
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

## Functions

In [10]:
def evaluate(y_true, y_pred):
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true))
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [11]:
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)


In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"working on {device}")

working on cuda:0


In [13]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",trust_remote_code=True,)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                             device_map=device,
                                              torch_dtype=compute_dtype,
                                              quantization_config=bnb_config,)

model.config.use_cache = False
model.config.pretraining_tp = 1
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model, tokenizer = setup_chat_format(model, tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
def predict(test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["text"]
        pipe = pipeline(task="text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_new_tokens=2
                       )
        result = pipe(prompt)
        answer = result[0]['generated_text'].split("=")[-1]
        if "1" in answer:
            y_pred.append(1)
        elif "0" in answer:
            y_pred.append(0)
        else:
            y_pred.append(2)
    return y_pred

In [15]:
X_sentiment = pd.read_csv('mtl.csv')

In [16]:
import pandas as pd

# Assuming X_sentiment is your DataFrame

# Removing the 'Unnamed: 0' column
X_sentiment = X_sentiment.drop(columns=['Unnamed: 0'])

# Renaming 'clean_comment' column to 'text'
X_sentiment = X_sentiment.rename(columns={'clean_comment': 'text'}).sample(n=5000, random_state=1)


In [17]:
X_sentiment

Unnamed: 0,text
9953,Determine the whether the sentiement of tweet ...
3850,Determine the whether the sentiement of tweet ...
4962,Determine the whether the sentiement of tweet ...
3886,Determine the whether the sentiement of tweet ...
5437,Determine the whether the sentiement of tweet ...
...,...
2241,Determine the whether the sentiement of tweet ...
3661,Determine the whether the sentiement of tweet ...
701,Determine the whether the sentiement of tweet ...
2426,Determine the whether the sentiement of tweet ...


In [18]:
X_train = np.concatenate([X_train,X_sentiment])

In [19]:
X_train_df = pd.DataFrame(X_train, columns=['text'])
X_val_df = pd.DataFrame(X_val, columns=['text'])

## Fine-tuning

In [20]:
train_data = Dataset.from_pandas(X_train_df)
eval_data = Dataset.from_pandas(X_val_df)

In [21]:
output_dir="trained_weigths"

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=4,                       # number of training epochs
    per_device_train_batch_size=4,            # batch size per device during training
    gradient_accumulation_steps=1,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,                         # log every 10 steps
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=True,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="tensorboard",                  # report metrics to tensorboard
    evaluation_strategy="epoch"               # save checkpoint every epoch
)



In [22]:
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=1024,
    packing=False,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    }
)

Map:   0%|          | 0/10548 [00:00<?, ? examples/s]

Map:   0%|          | 0/1388 [00:00<?, ? examples/s]

In [23]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.7762,0.480163
2,0.5829,0.469262
3,0.6199,0.465138
4,0.4964,0.468604


TrainOutput(global_step=10548, training_loss=0.7341067060449605, metrics={'train_runtime': 11082.2059, 'train_samples_per_second': 3.807, 'train_steps_per_second': 0.952, 'total_flos': 2.6374778837434368e+17, 'train_loss': 0.7341067060449605, 'epoch': 4.0})

## Evaluate

In [24]:
y_pred = predict(X_test, model, tokenizer)

100%|██████████████████████████████████████████████████████████████████████████████| 1400/1400 [07:39<00:00,  3.05it/s]


In [25]:
evaluate(y_test.tolist(), y_pred)

Accuracy: 0.821
Accuracy for label 0: 0.819
Accuracy for label 1: 0.830

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.82      0.89      1200
           1       0.43      0.83      0.57       200

    accuracy                           0.82      1400
   macro avg       0.70      0.82      0.73      1400
weighted avg       0.89      0.82      0.84      1400


Confusion Matrix:
[[983 217]
 [ 34 166]]
