# Fine-tune Llama 2 on FigLang2020 dataset
### This notebook is inspired by [Fine-tune Llama 2 for sentiment analysis](https://www.kaggle.com/code/lucamassaron/fine-tune-llama-2-for-sentiment-analysis/notebook) by **Luca Massaron** and by the DataCamp tutorial [Fine-Tuning LLaMA 2](https://www.datacamp.com/tutorial/fine-tuning-llama-2)

In [None]:
from huggingface_hub import login, notebook_login

# Authenticate with the Hugging Face Hub interactively. The training arguments
# further down set push_to_hub=True, so a token with write access is needed.
notebook_login()
# Non-interactive alternative; never commit a real token to the notebook.
# login('[YOUR HF API KEY]')

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
!pip install -q -U "torch==2.1.2" tensorboard
!pip install -q -U "transformers==4.36.2" "datasets==2.16.1" "accelerate==0.26.1" "bitsandbytes==0.42.0"
!pip install -q -U git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e
!pip install -q -U git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f
import os
# Expose only GPU 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Switch off the tokenizers library's own parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas chained-assignment and deprecation warnings.
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
# import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m76.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.8/209.8 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.2/89.2 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.15.0 requires tensorboard<2.16,>=2.15, but you have tensorboard 2.16.2 which is incompatible.
torchaudio 2.2.1+cu121 requires torch==2.2.1, but you have torch 2.1.2 which is incompatible.
torchtext 0.17.1 requires torch==2.2.1, but you have torch 2.1.2 which is incompatible.
torchvision 0.17.1+cu121 requires torch==

In [None]:
import numpy as np
import pandas as pd
import re

# Without Context

## Preparing Data

### load files via upload (provided in the repository)

In [None]:
from google.colab import files
# Colab-only: opens an upload dialog. The uploaded CSV is expected to be the
# file named in `filename_train` in the next cell.
uploaded = files.upload()

In [None]:
# CSV read below; only its "tweet" and "sarcastic" columns are used in this section.
filename_train = 'FigLang2020_twitter_training_updated.csv'
# filename_test = 'FigLang2020_twitter_testing_updated.csv'

### replace abbreviation

In [None]:
#https://stackoverflow.com/questions/57503786/python-replace-abbreviation-in-text
# Token-level replacements. Keys are matched exactly (case-sensitive) against
# whitespace-separated tokens, so a key that itself contains spaces
# (e.g. '. . .') can never match a single token.
replacers = {'dm': 'direct message',
 'thx': 'thanks',
 'dming': 'direct messaging',
 'dmed': 'direct messaged',
 'plz': 'please',
 'u': 'you',
 'asap': 'as soon as possible',
 '...': '',
 '. . .': '',
 'r': 'are',
 'U':'You',
 'idk':"i don't know",
 'omg':'oh my god',
 'sry':'sorry',
  'fb':'facebook',
 }


def replace_abb(x):
    """Expand known abbreviations in a list of tokens and join them with spaces.

    Args:
        x: iterable of string tokens (e.g. the result of ``str.split()``).

    Returns:
        A single string with every token found in ``replacers`` substituted.
        Tokens whose replacement is the empty string (such as ``'...'``) are
        dropped entirely, so they no longer leave double or leading spaces
        behind in the joined text.
    """
    replaced = (replacers.get(token, token) for token in x)
    return ' '.join(token for token in replaced if token)


### replace USERNAME

In [None]:
# Mask every @-mention (an "@" followed by non-whitespace) with a <user> placeholder.
# Raw string avoids the invalid "\s" escape of the previous non-raw pattern.
replace_username = lambda x: re.sub(r'@\S+', '<user>', x)

# replace_hashtag = lambda x: re.sub('#[^\s]+','<TAG>',x)

# Mask http:// and https:// links with a <url> placeholder.
# The previous pattern 'http?s?://' made the "p" optional and so also matched "htt://".
replace_link = lambda x: re.sub(r'https?://\S+', '<url>', x)

### demojize

In [None]:
# ! pip install emoji

In [None]:
import pandas as pd
from emoji import emojize, demojize

In [None]:
def replace_emoji(text):
  """Return `text` with emoji converted by `demojize` (default delimiters).

  The previous version computed the converted string but never returned it,
  so every caller received None.
  """
  return demojize(text)#delimiters=(":", ":")

### process

In [None]:
def process(text):
    """Clean one tweet: expand abbreviations, then mask mentions and links.

    The input is coerced to ``str`` and split on whitespace before the
    abbreviation step. Hashtag and emoji replacement are intentionally not
    applied here (those steps are disabled).
    """
    tokens = str(text).split()
    expanded = replace_abb(tokens)
    without_mentions = replace_username(expanded)
    return replace_link(without_mentions)

In [None]:
import re
import random
# NOTE(review): this seeds only Python's built-in `random` module. The
# DataFrame.sample shuffles below presumably draw from NumPy's RNG, so this
# call alone does not make the split reproducible — confirm.
random.seed(42)

In [None]:
# Load only the tweet text and its sarcasm label; bytes that cannot be decoded
# as UTF-8 are replaced instead of raising.
X = pd.read_csv(filename_train,
                 usecols=["tweet", "sarcastic"],
                 encoding="utf-8", encoding_errors="replace")

# Preview the raw frame.
X

Unnamed: 0,sarcastic,tweet
0,1,@USER @USER @USER I don't get this .. obviousl...
1,1,@USER @USER trying to protest about . Talking ...
2,1,@USER @USER @USER He makes an insane about of ...
3,1,@USER @USER Meanwhile Trump won't even release...
4,1,@USER @USER Pretty Sure the Anti-Lincoln Crowd...
...,...,...
4995,0,@USER You don't . I have purchased a lot on Am...
4996,0,@USER #Emotions you say 🤔 never knew that I th...
4997,0,"@USER @USER @USER You are so right ... "" Yes !..."
4998,0,@USER @USER @USER Another lazy delusional vote...


In [None]:
# Shuffle all rows. A fixed random_state makes the train/validation/test split
# reproducible across kernel restarts (random.seed() does not affect pandas).
X = X.sample(frac=1, random_state=42)

In [None]:
# 70% of the shuffled rows go to train+validation, the remaining 30% to test.
total_train_size = int(len(X)*0.7)

# Within the 70%: 80% training, 20% validation.
train_size = int(len(X)*0.7*0.8)
val_size = int(len(X)*0.7*0.2)

# Take explicit copies: both frames are modified later, and writing into a
# slice of the original frame is a chained-assignment hazard.
# NOTE: this cell shrinks X, so re-running it without reloading splits again.
X_test = X[total_train_size:].copy()
X = X[:total_train_size].copy()

In [None]:
# Rewrite each tweet with its emoji converted by demojize, using square
# brackets as the delimiters.
X['tweet'] = X['tweet'].apply(
    lambda tweet: demojize(tweet, delimiters=("[", "]"))
)

X

Unnamed: 0,sarcastic,tweet
1501,1,@USER @USER Okay . I was just having a little ...
2586,0,@USER @USER @USER I grew up in Crowborough . I...
2653,0,scripted tv show is not the way to a person's ...
1055,1,@USER @USER Don't forget that evocative sense ...
705,1,@USER @USER It ’ s funny how there aren ’ t an...
...,...,...
2087,1,@USER funny how 2DS friend said earlier that t...
1889,1,@USER @USER @USER You think I'm ACTUALLY argui...
4622,0,"And if it all feels to heavy , just remember :..."
1591,1,@USER @USER @USER They guy calling for a relig...


In [None]:
# Clean every training/validation tweet (abbreviations, mentions, links).
X['tweet'] = X['tweet'].apply(process)

In [None]:
# Preview the cleaned tweets.
X

Unnamed: 0,sarcastic,tweet
1501,1,<user> <user> Okay . I was just having a littl...
2586,0,<user> <user> <user> I grew up in Crowborough ...
2653,0,scripted tv show is not the way to a person's ...
1055,1,<user> <user> Don't forget that evocative sens...
705,1,<user> <user> It ’ s funny how there aren ’ t ...
...,...,...
2087,1,<user> funny how 2DS friend said earlier that ...
1889,1,<user> <user> <user> You think I'm ACTUALLY ar...
4622,0,"And if it all feels to heavy , just remember :..."
1591,1,<user> <user> <user> They guy calling for a re...


In [None]:
def generate_prompt(data_point):
    """Build one training example: instructions, the bracketed tweet and its label.

    Args:
        data_point: mapping (e.g. a DataFrame row) with "tweet" and "sarcastic".

    Returns:
        The prompt text ending in ``[<tweet>] = <label>``.

    The instruction wording and blank-line layout now match
    ``generate_test_prompt`` exactly, so the text the model is fine-tuned on
    is the same text it sees at inference time. Previously the training
    prompt read "Determine the whether" and had an extra blank line.
    """
    return f"""
            Determine whether the tweet enclosed in square brackets is sarcastic or non-sarcastic,
            and return the answer as the corresponding label "1" for sarcastic or "0" for non-sarcastic.

            Some words like "really", "never", "actually", etc. can be considered a symbol of sarcasm and express some contradictory and criticized attitudes.

            We also notice that some misspelled words (e.g., "so"->"soooo", "love"->"looove", "sure"->"sureeee") and capitalized words (not located at the beginning of a sentence) can sometimes exaggerate the emotional expression.

            [{data_point["tweet"]}] = {data_point["sarcastic"]}
            """.strip()

# Replace the frame with a single "text" column holding one full prompt per row.
# NOTE(review): this overwrites X, so re-running the cell on the prompt frame
# fails (no "tweet" column left); restart from the load cell instead.
X = pd.DataFrame(X.apply(generate_prompt, axis=1),
                       columns=["text"])


# The first train_size rows are used for training, the remainder for validation.
X_train = X[:train_size]
X_val = X[train_size:]

In [None]:
# Preview the training prompts.
X_train

Unnamed: 0,text
1501,Determine the whether the tweet enclosed in sq...
2586,Determine the whether the tweet enclosed in sq...
2653,Determine the whether the tweet enclosed in sq...
1055,Determine the whether the tweet enclosed in sq...
705,Determine the whether the tweet enclosed in sq...
...,...
1093,Determine the whether the tweet enclosed in sq...
4839,Determine the whether the tweet enclosed in sq...
1376,Determine the whether the tweet enclosed in sq...
4913,Determine the whether the tweet enclosed in sq...


In [None]:
# Preview the validation prompts.
X_val

Unnamed: 0,text
1922,Determine the whether the tweet enclosed in sq...
3173,Determine the whether the tweet enclosed in sq...
1764,Determine the whether the tweet enclosed in sq...
743,Determine the whether the tweet enclosed in sq...
4318,Determine the whether the tweet enclosed in sq...
...,...
2087,Determine the whether the tweet enclosed in sq...
1889,Determine the whether the tweet enclosed in sq...
4622,Determine the whether the tweet enclosed in sq...
1591,Determine the whether the tweet enclosed in sq...


In [None]:
# X_test = pd.read_csv(filename_test,
#                  usecols=["tweet", "sarcastic"],
#                 #  names = ["tweet", "sarcastic"],
#                  encoding="utf-8", encoding_errors="replace")

# X_test = X_test.rename(columns={'text':'tweet'})

# Shuffle the held-out rows with a fixed random_state so the evaluation order
# (and the y_test / y_pred alignment shown in outputs) is reproducible.
X_test = X_test.sample(frac=1, random_state=42)

def generate_test_prompt(data_point):
    """Build the inference prompt for one row: instructions plus the bracketed tweet.

    The text stops right after the "=" sign so that the model's continuation
    is the predicted label. Only ``data_point["tweet"]`` is read.
    """
    tweet = data_point["tweet"]
    template = f"""
            Determine whether the tweet enclosed in square brackets is sarcastic or non-sarcastic,
            and return the answer as the corresponding label "1" for sarcastic or "0" for non-sarcastic.

            Some words like "really", "never", "actually", etc. can be considered a symbol of sarcasm and express some contradictory and criticized attitudes.

            We also notice that some misspelled words (e.g., "so"->"soooo", "love"->"looove", "sure"->"sureeee") and capitalized words (not located at the beginning of a sentence) can sometimes exaggerate the emotional expression.

            [{tweet}] = """
    # Surrounding whitespace (including the trailing space after "=") is removed.
    return template.strip()


# Ground-truth labels, in the same row order as the prompts built below.
y_test = X_test['sarcastic']
# Give the test tweets the same cleaning the training tweets received
# (demojize with square-bracket delimiters, then process). Previously the test
# prompts were built from raw tweets, so the model was evaluated on text
# formatted differently from what it was fine-tuned on.
X_test = X_test.assign(
    tweet=X_test['tweet'].apply(
        lambda tweet: process(demojize(tweet, delimiters=("[", "]")))
    )
)
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

In [None]:
# Preview the test prompts.
X_test

Unnamed: 0,text
3533,Determine whether the tweet enclosed in square...
2397,Determine whether the tweet enclosed in square...
492,Determine whether the tweet enclosed in square...
4857,Determine whether the tweet enclosed in square...
3440,Determine whether the tweet enclosed in square...
...,...
4261,Determine whether the tweet enclosed in square...
3056,Determine whether the tweet enclosed in square...
3511,Determine whether the tweet enclosed in square...
4363,Determine whether the tweet enclosed in square...


## Functions

In [None]:
def evaluate(y_true, y_pred):
    """Print overall accuracy, per-label accuracy, a classification report and a confusion matrix.

    Args:
        y_true: ground-truth labels (list, array or pandas Series).
        y_pred: predicted labels in the same order as ``y_true``.

    Returns:
        None; all results are printed.
    """
    # Work on plain lists so pairing is positional. Indexing a pandas Series
    # with a shuffled index by 0..n-1 would look rows up by label instead.
    y_true = list(y_true)
    y_pred = list(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Per-label accuracy, reported in a deterministic (sorted) label order.
    for label in sorted(set(y_true)):
        pairs = [(truth, pred) for truth, pred in zip(y_true, y_pred)
                 if truth == label]
        label_y_true = [truth for truth, _ in pairs]
        label_y_pred = [pred for _, pred in pairs]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Confusion matrix over 0 and 1 plus any other label that actually occurs
    # (e.g. a fallback label for unparsable predictions), so such rows are not
    # silently dropped from the matrix.
    labels = sorted({0, 1} | set(y_true) | set(y_pred))
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=labels)
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [None]:
# Dtype used for computation on top of the 4-bit weights.
compute_dtype = torch.float16

# 4-bit NF4 quantization with double quantization enabled.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)


In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"working on {device}")

working on cuda:0


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# NOTE(review): trust_remote_code=True permits running code shipped with the
# Hub repository; presumably not needed for this checkpoint — confirm and drop.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",trust_remote_code=True)
# Base model, quantized according to bnb_config and placed on `device`.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                             device_map=device,
                                              torch_dtype=compute_dtype,
                                              quantization_config=bnb_config)

# Generation cache is turned off (training below uses gradient checkpointing).
model.config.use_cache = False
model.config.pretraining_tp = 1
# Reuse the EOS token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# NOTE(review): setup_chat_format presumably installs a chat template and
# extra special tokens and may override the pad-token choice above; the
# prompts in this notebook are plain text — confirm this call is needed.
model, tokenizer = setup_chat_format(model, tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def predict(test, model, tokenizer):
    """Generate a label for every prompt in `test`.

    Args:
        test: DataFrame with a "text" column of prompts ending in "=".
        model: causal language model handed to the text-generation pipeline.
        tokenizer: tokenizer matching `model`.

    Returns:
        A list with one int per row of `test`: 1 (sarcastic), 0
        (non-sarcastic), or 2 when neither digit appears in the model's answer.

    Fixes over the previous version: the `test` argument is now actually used
    (the old body read the global `X_test`), and the pipeline is built once
    instead of once per row.
    """
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=2
                   )
    y_pred = []
    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        result = pipe(prompt)
        # The generated text is split on "="; the last piece is taken as the
        # model's answer (the prompt itself ends with "=").
        answer = result[0]['generated_text'].split("=")[-1]
        if "1" in answer:
            y_pred.append(1)
        elif "0" in answer:
            y_pred.append(0)
        else:
            # Neither digit found: record an explicit "unparsable" label.
            y_pred.append(2)
    return y_pred

## Fine-tuning

In [None]:
# Convert the prompt frames into datasets.Dataset objects for the trainer.
train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_val)

In [None]:
# Used as the output directory and, with push_to_hub=True, as the repository id.
repo_name="llama2-7b-without-context"

# LoRA adapter hyperparameters.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir=repo_name,                     # directory to save and repository id
    num_train_epochs=4,                       # number of training epochs
    per_device_train_batch_size=4,            # batch size per device during training
    gradient_accumulation_steps=1,            # update after every batch (no accumulation)
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    save_steps=0,                             # NOTE(review): presumably unused because save_strategy="epoch" — confirm
    logging_steps=25,                         # log every 25 steps
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=True,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="tensorboard",                  # report metrics to tensorboard
    evaluation_strategy="epoch",              # evaluate at the end of every epoch
    save_strategy="epoch",                    # save a checkpoint every epoch
    push_to_hub=True,
)



In [None]:
# Supervised fine-tuning on the "text" column of the prompt datasets, with the
# LoRA configuration defined above. Packing is disabled.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=1024,
    packing=False,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    }
)

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

In [None]:
# Run fine-tuning (about 54 minutes in the recorded run below).
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.4112,0.550131
2,0.3723,0.542638
3,0.3621,0.540924
4,0.3565,0.547233


Checkpoint destination directory llama2-7b-without-context/checkpoint-700 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory llama2-7b-without-context/checkpoint-1400 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory llama2-7b-without-context/checkpoint-2100 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory llama2-7b-without-context/checkpoint-2800 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=2800, training_loss=0.5424081798962185, metrics={'train_runtime': 3225.9053, 'train_samples_per_second': 3.472, 'train_steps_per_second': 0.868, 'total_flos': 9.665483079745536e+16, 'train_loss': 0.5424081798962185, 'epoch': 4.0})

In [None]:
# Report loss on the validation prompts.
trainer.evaluate()

{'eval_loss': 0.547232985496521,
 'eval_runtime': 53.9981,
 'eval_samples_per_second': 12.963,
 'eval_steps_per_second': 1.63,
 'epoch': 4.0}

In [None]:
# Upload the training output to the Hugging Face Hub (requires the login above).
trainer.push_to_hub()

events.out.tfevents.1713073888.52e170c5729c.2439.5:   0%|          | 0.00/359 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/K-kiron/llama2-7b-without-context/commit/37eb3da6c510760085d2fb53497a0c4374511e09', commit_message='End of training', commit_description='', oid='37eb3da6c510760085d2fb53497a0c4374511e09', pr_url=None, pr_revision=None, pr_num=None)

## Evaluate

In [None]:
# Generate a label for every held-out test prompt.
y_pred = predict(X_test, model, tokenizer)

100%|██████████| 1500/1500 [10:41<00:00,  2.34it/s]


In [None]:
# Compare predictions with the ground truth (as a plain list, in prompt order).
evaluate(y_test.tolist(), y_pred)

Accuracy: 0.789
Accuracy for label 0: 0.778
Accuracy for label 1: 0.802

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.78      0.79       784
           1       0.77      0.80      0.78       716

    accuracy                           0.79      1500
   macro avg       0.79      0.79      0.79      1500
weighted avg       0.79      0.79      0.79      1500


Confusion Matrix:
[[610 174]
 [142 574]]


# With Context

## Preparing Data

### load files via upload (provided in the repository)

In [None]:
from google.colab import files
# Colab-only: opens an upload dialog. The uploaded CSV is expected to be the
# file named in `filename_train` in the next cell.
uploaded = files.upload()

Saving FigLang2020_twitter_training_updated_context.csv to FigLang2020_twitter_training_updated_context.csv


In [None]:
# CSV read below; its "tweet", "sarcastic" and "context" columns are loaded in this section.
filename_train = 'FigLang2020_twitter_training_updated_context.csv'
# filename_test = 'FigLang2020_twitter_testing_updated.csv'

### replace abbreviation

In [None]:
#https://stackoverflow.com/questions/57503786/python-replace-abbreviation-in-text
# NOTE(review): repeats the definition from the "Without Context" section.
# Token-level replacements. Keys are matched exactly (case-sensitive) against
# whitespace-separated tokens, so a key that itself contains spaces
# (e.g. '. . .') can never match a single token.
replacers = {'dm': 'direct message',
 'thx': 'thanks',
 'dming': 'direct messaging',
 'dmed': 'direct messaged',
 'plz': 'please',
 'u': 'you',
 'asap': 'as soon as possible',
 '...': '',
 '. . .': '',
 'r': 'are',
 'U':'You',
 'idk':"i don't know",
 'omg':'oh my god',
 'sry':'sorry',
  'fb':'facebook',
 }


def replace_abb(x):
    """Expand known abbreviations in a list of tokens and join them with spaces.

    Args:
        x: iterable of string tokens (e.g. the result of ``str.split()``).

    Returns:
        A single string with every token found in ``replacers`` substituted.
        Tokens whose replacement is the empty string (such as ``'...'``) are
        dropped entirely, so they no longer leave double or leading spaces
        behind in the joined text.
    """
    replaced = (replacers.get(token, token) for token in x)
    return ' '.join(token for token in replaced if token)


### replace USERNAME

In [None]:
# Mask every @-mention (an "@" followed by non-whitespace) with a <user> placeholder.
# Raw string avoids the invalid "\s" escape of the previous non-raw pattern.
replace_username = lambda x: re.sub(r'@\S+', '<user>', x)

# replace_hashtag = lambda x: re.sub('#[^\s]+','<TAG>',x)

# Mask http:// and https:// links with a <url> placeholder.
# The previous pattern 'http?s?://' made the "p" optional and so also matched "htt://".
replace_link = lambda x: re.sub(r'https?://\S+', '<url>', x)

### demojize

In [None]:
# ! pip install emoji

In [None]:
import pandas as pd
from emoji import emojize, demojize

def replace_emoji(text):
  """Return `text` with emoji converted by `demojize` (default delimiters).

  The previous version computed the converted string but never returned it,
  so every caller received None.
  """
  return demojize(text)#delimiters=(":", ":")

### process

In [None]:
def process(text):
    """Clean one tweet: expand abbreviations, then mask mentions and links.

    The input is coerced to ``str`` and split on whitespace before the
    abbreviation step. Hashtag and emoji replacement are intentionally not
    applied here (those steps are disabled).
    """
    tokens = str(text).split()
    expanded = replace_abb(tokens)
    without_mentions = replace_username(expanded)
    return replace_link(without_mentions)

In [None]:
import re
import random
# NOTE(review): this seeds only Python's built-in `random` module. The
# DataFrame.sample shuffles below presumably draw from NumPy's RNG, so this
# call alone does not make the split reproducible — confirm.
random.seed(42)

In [None]:
# Load the tweet text, its sarcasm label and the accompanying context column;
# bytes that cannot be decoded as UTF-8 are replaced instead of raising.
# NOTE(review): "context" is loaded but the prompt builders below never read it.
X = pd.read_csv(filename_train,
                 usecols=["tweet", "sarcastic", "context"],
                 encoding="utf-8", encoding_errors="replace")

# Preview the raw frame.
X

Unnamed: 0,sarcastic,tweet,context
0,1,A minor child deserves privacy and should be k...,A minor child deserves privacy and should be k...
1,1,@USER @USER Why is he a loser ? He's just a Pr...,@USER @USER Why is he a loser ? He's just a Pr...
2,1,Donald J . Trump is guilty as charged . The ev...,Donald J . Trump is guilty as charged . The ev...
3,1,Jamie Raskin tanked Doug Collins . Collins loo...,Jamie Raskin tanked Doug Collins . Collins loo...
4,1,Man ... y ’ all gone “ both sides ” the apocal...,Man ... y ’ all gone “ both sides ” the apocal...
...,...,...,...
4995,0,@USER Apologies for the inconvenience you face...,@USER Apologies for the inconvenience you face...
4996,0,"@USER 🤔 idk tho , I think I ’ m #hungry . But ...","@USER 🤔 idk tho , I think I ’ m #hungry . But ..."
4997,0,"@USER @USER @USER Peace to you , and two count...","@USER @USER @USER Peace to you , and two count..."
4998,0,Bernie Sanders told Elizabeth Warren in privat...,Bernie Sanders told Elizabeth Warren in privat...


In [None]:
# Shuffle all rows. A fixed random_state makes the train/validation/test split
# reproducible across kernel restarts (random.seed() does not affect pandas).
X = X.sample(frac=1, random_state=42)

In [None]:
# 70% of the shuffled rows go to train+validation, the remaining 30% to test.
total_train_size = int(len(X)*0.7)

# Within the 70%: 80% training, 20% validation.
train_size = int(len(X)*0.7*0.8)
val_size = int(len(X)*0.7*0.2)

# Take explicit copies: both frames are modified later, and writing into a
# slice of the original frame is a chained-assignment hazard.
# NOTE: this cell shrinks X, so re-running it without reloading splits again.
X_test = X[total_train_size:].copy()
X = X[:total_train_size].copy()

In [None]:
# Rewrite each tweet with its emoji converted by demojize, using square
# brackets as the delimiters. Only the "tweet" column is touched.
X['tweet'] = X['tweet'].apply(
    lambda tweet: demojize(tweet, delimiters=("[", "]"))
)

X

Unnamed: 0,sarcastic,tweet,context
1501,1,Blah blah blah . Just listen to this guy . He ...,Blah blah blah . Just listen to this guy . He ...
2586,0,There ’ s a beautiful #WolfMoon rising over Cr...,There ’ s a beautiful #WolfMoon rising over Cr...
2653,0,#TheBachelor I have no idea why such beautiful...,#TheBachelor I have no idea why such beautiful...
1055,1,"In other news , I'm going to take up art . [ar...","In other news , I'm going to take up art . 🎨 <..."
705,1,Is this photo real ? If so please tell me how ...,Is this photo real ? If so please tell me how ...
...,...,...,...
2087,1,A particular highlight came during the @USER V...,A particular highlight came during the @USER V...
1889,1,@USER This is same guy that says Marvel films ...,@USER This is same guy that says Marvel films ...
4622,0,Venus has entered Pisces where it will stay un...,Venus has entered Pisces where it will stay un...
1591,1,@USER have you no shame ? seriously ... I'm co...,@USER have you no shame ? seriously ... I'm co...


In [None]:
# Clean every training/validation tweet (abbreviations, mentions, links);
# the "context" column is left as loaded.
X['tweet'] = X['tweet'].apply(process)

In [None]:
# Preview the cleaned tweets next to the untouched context column.
X

Unnamed: 0,sarcastic,tweet,context
1501,1,Blah blah blah . Just listen to this guy . He ...,Blah blah blah . Just listen to this guy . He ...
2586,0,There ’ s a beautiful #WolfMoon rising over Cr...,There ’ s a beautiful #WolfMoon rising over Cr...
2653,0,#TheBachelor I have no idea why such beautiful...,#TheBachelor I have no idea why such beautiful...
1055,1,"In other news , I'm going to take up art . [ar...","In other news , I'm going to take up art . 🎨 <..."
705,1,Is this photo real ? If so please tell me how ...,Is this photo real ? If so please tell me how ...
...,...,...,...
2087,1,A particular highlight came during the <user> ...,A particular highlight came during the @USER V...
1889,1,<user> This is same guy that says Marvel films...,@USER This is same guy that says Marvel films ...
4622,0,Venus has entered Pisces where it will stay un...,Venus has entered Pisces where it will stay un...
1591,1,<user> have you no shame ? seriously I'm comp...,@USER have you no shame ? seriously ... I'm co...


In [None]:
def generate_prompt(data_point):
    """Build one training example: instructions, the bracketed tweet and its label.

    Args:
        data_point: mapping (e.g. a DataFrame row) with "tweet" and "sarcastic".

    Returns:
        The prompt text ending in ``[<tweet>] = <label>``.

    The instruction wording and blank-line layout now match
    ``generate_test_prompt`` exactly, so the text the model is fine-tuned on
    is the same text it sees at inference time. Previously the training
    prompt read "Determine the whether" and had an extra blank line.

    NOTE(review): ``data_point["context"]`` is not used; the sentence that
    would insert it is kept below as a disabled comment.
    """
    # Here is the context of the tweet {data_point["context"]}, indicating the linguistic context, which can be used as a reference to determine whether the tweet enclosed in square brackets is sarcastic or non-sarcastic.
    return f"""
            Determine whether the tweet enclosed in square brackets is sarcastic or non-sarcastic,
            and return the answer as the corresponding label "1" for sarcastic or "0" for non-sarcastic.

            Some words like "really", "never", "actually", etc. can be considered a symbol of sarcasm and express some contradictory and criticized attitudes.

            We also notice that some misspelled words (e.g., "so"->"soooo", "love"->"looove", "sure"->"sureeee") and capitalized words (not located at the beginning of a sentence) can sometimes exaggerate the emotional expression.

            [{data_point["tweet"]}] = {data_point["sarcastic"]}
            """.strip()

# Replace the frame with a single "text" column holding one full prompt per row.
# NOTE(review): this overwrites X, so re-running the cell on the prompt frame
# fails (no "tweet" column left); restart from the load cell instead.
X = pd.DataFrame(X.apply(generate_prompt, axis=1),
                       columns=["text"])


# The first train_size rows are used for training, the remainder for validation.
X_train = X[:train_size]
X_val = X[train_size:]

In [None]:
# Preview the training prompts.
X_train

Unnamed: 0,text
1501,Determine the whether the tweet enclosed in sq...
2586,Determine the whether the tweet enclosed in sq...
2653,Determine the whether the tweet enclosed in sq...
1055,Determine the whether the tweet enclosed in sq...
705,Determine the whether the tweet enclosed in sq...
...,...
1093,Determine the whether the tweet enclosed in sq...
4839,Determine the whether the tweet enclosed in sq...
1376,Determine the whether the tweet enclosed in sq...
4913,Determine the whether the tweet enclosed in sq...


In [None]:
# Preview the validation prompts.
X_val

Unnamed: 0,text
1922,Determine the whether the tweet enclosed in sq...
3173,Determine the whether the tweet enclosed in sq...
1764,Determine the whether the tweet enclosed in sq...
743,Determine the whether the tweet enclosed in sq...
4318,Determine the whether the tweet enclosed in sq...
...,...
2087,Determine the whether the tweet enclosed in sq...
1889,Determine the whether the tweet enclosed in sq...
4622,Determine the whether the tweet enclosed in sq...
1591,Determine the whether the tweet enclosed in sq...


In [None]:
# X_test = pd.read_csv(filename_test,
#                  usecols=["tweet", "sarcastic"],
#                 #  names = ["tweet", "sarcastic"],
#                  encoding="utf-8", encoding_errors="replace")

# X_test = X_test.rename(columns={'text':'tweet'})

# Shuffle the held-out rows with a fixed random_state so the evaluation order
# (and the y_test / y_pred alignment shown in outputs) is reproducible.
X_test = X_test.sample(frac=1, random_state=42)

def generate_test_prompt(data_point):
    """Build the inference prompt for one row: instructions plus the bracketed tweet.

    The text stops right after the "=" sign so that the model's continuation
    is the predicted label. Only ``data_point["tweet"]`` is read; the context
    sentence below remains disabled.
    """
    # Here is the context of the tweet {data_point["context"]}, indicating the linguistic context, which can be used as a reference to determine whether the tweet enclosed in square brackets is sarcastic or non-sarcastic.
    tweet = data_point["tweet"]
    template = f"""
            Determine whether the tweet enclosed in square brackets is sarcastic or non-sarcastic,
            and return the answer as the corresponding label "1" for sarcastic or "0" for non-sarcastic.

            Some words like "really", "never", "actually", etc. can be considered a symbol of sarcasm and express some contradictory and criticized attitudes.

            We also notice that some misspelled words (e.g., "so"->"soooo", "love"->"looove", "sure"->"sureeee") and capitalized words (not located at the beginning of a sentence) can sometimes exaggerate the emotional expression.

            [{tweet}] = """
    # Surrounding whitespace (including the trailing space after "=") is removed.
    return template.strip()


# Ground-truth labels, in the same row order as the prompts built below.
y_test = X_test['sarcastic']
# Give the test tweets the same cleaning the training tweets received
# (demojize with square-bracket delimiters, then process). Previously the test
# prompts were built from raw tweets, so the model was evaluated on text
# formatted differently from what it was fine-tuned on.
X_test = X_test.assign(
    tweet=X_test['tweet'].apply(
        lambda tweet: process(demojize(tweet, delimiters=("[", "]")))
    )
)
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

In [None]:
# Preview the test prompts.
X_test

Unnamed: 0,text
3533,Determine whether the tweet enclosed in square...
2397,Determine whether the tweet enclosed in square...
492,Determine whether the tweet enclosed in square...
4857,Determine whether the tweet enclosed in square...
3440,Determine whether the tweet enclosed in square...
...,...
4261,Determine whether the tweet enclosed in square...
3056,Determine whether the tweet enclosed in square...
3511,Determine whether the tweet enclosed in square...
4363,Determine whether the tweet enclosed in square...


## Functions

In [None]:
def evaluate(y_true, y_pred):
    """Print overall accuracy, per-label accuracy, a classification report and a confusion matrix.

    Args:
        y_true: ground-truth labels (list, array or pandas Series).
        y_pred: predicted labels in the same order as ``y_true``.

    Returns:
        None; all results are printed.
    """
    # Work on plain lists so pairing is positional. Indexing a pandas Series
    # with a shuffled index by 0..n-1 would look rows up by label instead.
    y_true = list(y_true)
    y_pred = list(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Per-label accuracy, reported in a deterministic (sorted) label order.
    for label in sorted(set(y_true)):
        pairs = [(truth, pred) for truth, pred in zip(y_true, y_pred)
                 if truth == label]
        label_y_true = [truth for truth, _ in pairs]
        label_y_pred = [pred for _, pred in pairs]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Confusion matrix over 0 and 1 plus any other label that actually occurs
    # (e.g. a fallback label for unparsable predictions), so such rows are not
    # silently dropped from the matrix.
    labels = sorted({0, 1} | set(y_true) | set(y_pred))
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=labels)
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [None]:
# Dtype used for computation on top of the 4-bit weights.
compute_dtype = torch.float16

# 4-bit NF4 quantization with double quantization enabled.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)


In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"working on {device}")

working on cuda:0


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# A fresh base model is loaded here, so this section's fine-tuning does not
# start from the adapter trained in the previous section.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# Hub repository; presumably not needed for this checkpoint — confirm and drop.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",trust_remote_code=True)
# Base model, quantized according to bnb_config and placed on `device`.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                             device_map=device,
                                              torch_dtype=compute_dtype,
                                              quantization_config=bnb_config)

# Generation cache is turned off (training below uses gradient checkpointing).
model.config.use_cache = False
model.config.pretraining_tp = 1
# Reuse the EOS token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# NOTE(review): setup_chat_format presumably installs a chat template and
# extra special tokens and may override the pad-token choice above; the
# prompts in this notebook are plain text — confirm this call is needed.
model, tokenizer = setup_chat_format(model, tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def predict(test, model, tokenizer):
    """Generate a label for every prompt in `test`.

    Args:
        test: DataFrame with a "text" column of prompts ending in "=".
        model: causal language model handed to the text-generation pipeline.
        tokenizer: tokenizer matching `model`.

    Returns:
        A list with one int per row of `test`: 1 (sarcastic), 0
        (non-sarcastic), or 2 when neither digit appears in the model's answer.

    Fixes over the previous version: the `test` argument is now actually used
    (the old body read the global `X_test`), and the pipeline is built once
    instead of once per row.
    """
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=2
                   )
    y_pred = []
    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        result = pipe(prompt)
        # The generated text is split on "="; the last piece is taken as the
        # model's answer (the prompt itself ends with "=").
        answer = result[0]['generated_text'].split("=")[-1]
        if "1" in answer:
            y_pred.append(1)
        elif "0" in answer:
            y_pred.append(0)
        else:
            # Neither digit found: record an explicit "unparsable" label.
            y_pred.append(2)
    return y_pred

## Fine-tuning

In [None]:
# Convert the prompt frames into datasets.Dataset objects for the trainer.
train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_val)

In [None]:
# Used as the output directory and, with push_to_hub=True, as the repository id.
repo_name="llama2-7b-context-combine"

# LoRA adapter hyperparameters.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir=repo_name,                     # directory to save and repository id
    num_train_epochs=4,                       # number of training epochs
    per_device_train_batch_size=4,            # batch size per device during training
    gradient_accumulation_steps=1,            # update after every batch (no accumulation)
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    save_steps=0,                             # NOTE(review): presumably unused because save_strategy="epoch" — confirm
    logging_steps=25,                         # log every 25 steps
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=True,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="tensorboard",                  # report metrics to tensorboard
    evaluation_strategy="epoch",              # evaluate at the end of every epoch
    save_strategy="epoch",                    # save a checkpoint every epoch
    push_to_hub=True,
)



In [None]:
# Supervised fine-tuning on the "text" column of the prompt datasets, with the
# LoRA configuration defined above. Packing is disabled.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=1024,
    packing=False,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    }
)

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

In [None]:
# Run fine-tuning. NOTE(review): the recorded output below stops at epoch 3 of 4.
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,1.0176,1.328869
2,1.0067,1.31182
3,0.9754,1.307567


Epoch,Training Loss,Validation Loss
1,1.0176,1.328869
2,1.0067,1.31182
3,0.9754,1.307567


In [None]:
# Report loss on the validation prompts.
trainer.evaluate()

In [None]:
# Upload the training output to the Hugging Face Hub (requires the login above).
trainer.push_to_hub()

## Evaluate

In [None]:
# Generate a label for every held-out test prompt.
y_pred = predict(X_test, model, tokenizer)

 84%|████████▍ | 1257/1500 [12:08<02:23,  1.69it/s]

In [None]:
# Compare predictions with the ground truth (as a plain list, in prompt order).
evaluate(y_test.tolist(), y_pred)

Accuracy: 0.799
Accuracy for label 0: 0.846
Accuracy for label 1: 0.747

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.85      0.81       784
           1       0.82      0.75      0.78       716

    accuracy                           0.80      1500
   macro avg       0.80      0.80      0.80      1500
weighted avg       0.80      0.80      0.80      1500


Confusion Matrix:
[[663 121]
 [181 535]]
