# Sentiment Analysis (SA) with LLaMA

In [1]:
# installations for colab
#!pip install huggingface_hub transformers torch accelerate datasets peft urlextract

In [2]:
#!pip install -U bitsandbytes

In [None]:
# huggingface login necessary because llama is gated
# the token is read from the HF_TOKEN environment variable so it never ends up in a git commit;
# if the variable is not set, login() falls back to its interactive prompt
import os

from huggingface_hub import login
login(token=os.environ.get("HF_TOKEN"))

In [None]:
import pandas as pd

df_tweets_bigtech_10k_sample = pd.read_csv("./tweets_bigtech_10ksample.csv")
df_tweets_bigtech_10k_sample.head()

In [None]:
# loading the chat model in 8-bit; the LoRA config is prepared here but not applied in this cell
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

# quantization options go through BitsAndBytesConfig; passing load_in_8bit / llm_int8_*
# directly to from_pretrained is deprecated
quantization_config_8bit = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)

model = AutoModelForSequenceClassification.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    quantization_config=quantization_config_8bit,
    torch_dtype=torch.float16,
    device_map="auto"
)

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1
)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

# Predicting with pure llama-chat

In [None]:
# text-classification pipeline around the notebook-level model and tokenizer;
# for one input text it returns a list like [{"label": "LABEL_1", "score": ...}]
from transformers import pipeline as build_pipeline

pipeline = build_pipeline("text-classification", model=model, tokenizer=tokenizer)
pipeline("The new Iphone is not good.")

In [None]:
df_tweets_bigtech_10k_sample["predicted_sentiment"] = df_tweets_bigtech_10k_sample["text"].apply(lambda x: pipeline(x)[0]["label"])

In [None]:
df_tweets_bigtech_10k_sample["predicted_sentiment"].value_counts()

In [None]:
# adapting predicted labels to fit format
df_tweets_bigtech_10k_sample["predicted_sentiment"] = df_tweets_bigtech_10k_sample["predicted_sentiment"].apply(lambda x: 2 if x == "LABEL_1" else 0)

In [None]:
# removing neutral tweets, since model only predicted negative and positive
df_tweets_bigtech_10k_sample = df_tweets_bigtech_10k_sample[df_tweets_bigtech_10k_sample["sentiment"] != 1]

In [None]:
df_tweets_bigtech_10k_sample.head()

In [None]:
df_tweets_bigtech_10k_sample["sentiment"].value_counts()

In [None]:
df_tweets_bigtech_10k_sample["predicted_sentiment"].value_counts()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def performance_metrics(df, label, prediction):
    """Print accuracy and weighted precision, recall and F1 for two columns of ``df``.

    ``label`` names the column with the true classes, ``prediction`` the column with
    the predicted classes. The scores are only printed; nothing is returned.
    """
    y_true = df[label]
    y_pred = df[prediction]

    # all four scores are computed before anything is printed
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average="weighted")
    recall = recall_score(y_true, y_pred, average="weighted")
    f1 = f1_score(y_true, y_pred, average="weighted")

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")

performance_metrics(df_tweets_bigtech_10k_sample, "sentiment", "predicted_sentiment")

# Fine-tuning LLaMA

In [None]:
#converting polarity to sentiment and trying to get llama to predict three labels (negative, neutral, positive)
def polarity_to_sentiment(polarity):
    """Map a polarity score to a sentiment class id.

    Returns 2 (positive) for values above 0.2, 1 (neutral) for values within
    [-0.2, 0.2] and 0 (negative) for everything else, including NaN.
    """
    if polarity > 0.2:
        return 2
    # not above 0.2 here, so a lower-bound check is enough for the neutral band
    if polarity >= -0.2:
        return 1
    return 0

df_tweets_bigtech_10k_sample["sentiment"] = df_tweets_bigtech_10k_sample["polarity"].apply(polarity_to_sentiment)
df_tweets_bigtech_10k_sample.head()

In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_tweets_bigtech_10k_sample)

## fine-tuning with Tweets BigTech

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    num_labels=3 # specifying num_labels resolves runtime CUDA error
)

In [None]:
# pad token definitions for both tokenizer and model allow for batch size > 1
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

In [None]:
import pandas as pd
from datasets import Dataset

df = pd.read_csv("/content/tweets_bigtech_20ksample.csv")
df.dropna(inplace=True)

# model expects column explicitly called "labels"
df = df.rename({"sentiment" : "labels"}, axis=1)
df.head()

In [None]:
dataset = Dataset.from_pandas(df)

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the notebook-level ``tokenizer``.

    Every sequence is padded or truncated to 512 tokens.
    """
    texts = examples["text"]
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(texts, **encoding_options)

In [None]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,  # rank of the low-rank adaptation
    lora_alpha=32,  # scaling factor
    target_modules=["q_proj"],#, "v_proj"],  # target modules for LoRA
    lora_dropout=0.1,  # dropout rate
    bias="none",  # bias handling
    task_type="SEQ_CLS" # sentiment analysis is a form of sequence classification
)

model = get_peft_model(model, lora_config)

In [None]:
from transformers import TrainingArguments

# experimenting with different training argument definitions

# batch size >= 64 results in out of memory error even with best GPU (A100)
training_args = TrainingArguments(
    output_dir="./llama2-sentiment-analysis",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=5e-5,
    fp16=True,
    save_total_limit=2
)

In [None]:
# fixed seed so the held-out test rows stay the same across kernel restarts
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [None]:
from transformers import Trainer

class CustomTrainer(Trainer):
    """Trainer whose ``compute_loss`` accepts ``num_items_in_batch`` without forwarding it to the model."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute the loss for one batch.

        When a label smoother is configured and the batch contains "labels", the labels are
        taken out of ``inputs`` and the smoother computes the loss; otherwise the loss is read
        from the model output. Returns ``(loss, outputs)`` if ``return_outputs`` is set,
        else just the loss.
        """
        smoothing_applies = self.label_smoother is not None and "labels" in inputs
        labels = inputs.pop("labels") if smoothing_applies else None

        # num_items_in_batch is deliberately not passed on
        outputs = model(**inputs)

        # saving past state if it exists
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is None:
            # not using .loss here since the model may return tuples instead of ModelOutput
            if isinstance(outputs, dict):
                loss = outputs["loss"]
            else:
                loss = outputs[0]
        else:
            loss = self.label_smoother(outputs, labels)

        if return_outputs:
            return (loss, outputs)
        return loss

In [None]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

In [None]:
trainer.train()

In [None]:
model.save_pretrained("./llama2-sentiment-analysis-finetuned")
tokenizer.save_pretrained("./llama2-sentiment-analysis-finetuned")

### applying the model

In [None]:
# loading the model and applying it
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

llama_ft = AutoModelForSequenceClassification.from_pretrained(
    "./llama2-sentiment-analysis-finetuned",
    quantization_config=quantization_config,
    device_map="auto",
    num_labels=3 # specifying num_labels resolves runtime CUDA error
)

tokenizer_llama_ft = AutoTokenizer.from_pretrained("./llama2-sentiment-analysis-finetuned")

# device_map="auto" has already placed the quantized model, so it is not moved again
# (calling .to() on a bitsandbytes-quantized model can raise a ValueError);
# `device` is only used to move the tokenized inputs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
df_tweets_bigtech_test = pd.DataFrame(test_dataset)

In [None]:
tokens = tokenizer_llama_ft(df_tweets_bigtech_test["text"].tolist(), padding=True, truncation=True, return_tensors="pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokens = {key: value.to(device) for key, value in tokens.items()}

In [None]:
llama_ft.config.pad_token_id = llama_ft.config.eos_token_id

In [None]:
import torch

# setting the model to evaluation-mode
llama_ft.eval()

batch_size = 32
# ceiling division: "floor division + 1" would add an empty batch whenever the
# number of rows is an exact multiple of batch_size
num_batches = (len(df_tweets_bigtech_test) + batch_size - 1) // batch_size
predictions = []

# performing evaluation in batches to not overload GPU
for i in range(num_batches):
  start_idx = i * batch_size
  end_idx = min((i + 1) * batch_size, len(df_tweets_bigtech_test))
  # .iloc makes the positional slicing explicit, independent of the index labels
  batch_texts = df_tweets_bigtech_test["text"].iloc[start_idx:end_idx].tolist()
  batch_tokens = tokenizer_llama_ft(batch_texts, padding=True, truncation=True, return_tensors="pt")
  batch_tokens = {key: value.to(device) for key, value in batch_tokens.items()}

  with torch.no_grad():
      outputs = llama_ft(**batch_tokens)
      batch_predictions = torch.argmax(outputs.logits, dim=-1)
      predictions.extend(batch_predictions.cpu().tolist())

df_tweets_bigtech_test["llama_ft_prediction"] = predictions

In [14]:
def llama_pred(model, tokenizer, df, text_col, batch_size=32, max_length=128):
    """Classify the texts in ``df[text_col]`` batch-wise and store the predicted class ids.

    Args:
        model: sequence-classification model; it is called with the tokenizer output and
            must return an object with a ``logits`` attribute.
        tokenizer: tokenizer matching ``model``; its pad token is set to its EOS token.
        df: pandas DataFrame holding the texts; it is modified in place.
        text_col: name of the column in ``df`` that contains the texts.
        batch_size: number of texts per forward pass.
        max_length: every text is padded/truncated to this many tokens.

    Returns:
        ``df`` with the additional column "llama_ft_prediction" (argmax over the logits).
    """
    # setting padding tokens
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

    # the inputs must be on the device the model already lives on; the model itself
    # is not moved, only switched to evaluation-mode
    device = model.device
    model.eval()

    texts = df[text_col].tolist()
    predictions = []

    # range(start, stop, step) never yields an empty trailing batch, also when the
    # number of rows is an exact multiple of batch_size
    for start_idx in range(0, len(texts), batch_size):
        batch_texts = texts[start_idx:start_idx + batch_size]
        batch_tokens = tokenizer(batch_texts, padding="max_length", truncation=True, return_tensors="pt", max_length=max_length)
        batch_tokens = {key: value.to(device) for key, value in batch_tokens.items()}

        with torch.no_grad():
            outputs = model(**batch_tokens)
        predictions.extend(torch.argmax(outputs.logits, dim=-1).cpu().tolist())

    df["llama_ft_prediction"] = predictions
    return df

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def performance_metrics(df, label, prediction):
    """Report classification quality of ``df[prediction]`` against ``df[label]``.

    Prints accuracy plus precision, recall and F1, the latter three with
    weighted averaging. Returns nothing.
    """
    columns = (df[label], df[prediction])
    weighted = {"average": "weighted"}

    accuracy = accuracy_score(*columns)
    precision = precision_score(*columns, **weighted)
    recall = recall_score(*columns, **weighted)
    f1 = f1_score(*columns, **weighted)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")

In [None]:
performance_metrics(df_tweets_bigtech_test, "labels", "llama_ft_prediction")

## fine-tuning with brand sentiment analysis (SA) data

In [None]:
import pandas as pd
df_brd_tr = pd.read_csv("Dataset - Train.csv")
df_brd_tst = pd.read_csv("Dataset - Test.csv")
df_brd = pd.concat([df_brd_tr, df_brd_tst])
df_brd = df_brd.drop("Tweet", axis=1)
df_brd.head()

In [None]:
df_brd = df_brd[df_brd["tweet_text"].notna()]
len(df_brd)

In [None]:
df_brd["labels"] = df_brd["is_there_an_emotion_directed_at_a_brand_or_product"].replace({"Negative emotion" : 0, "Positive emotion" : 2, "No emotion toward brand or product" : 1, "I can't tell" : 1})

In [None]:
import re
from urlextract import URLExtract
extractor = URLExtract()

def format_tweet(tweet):
    """Mask URLs and mark account mentions in a tweet.

    Non-string values are returned unchanged. In strings, every URL found by the
    notebook-level ``extractor`` is replaced with "{{URL}}" and account mentions are
    rewritten by the regex below, e.g. "hi @user" becomes "hi {@user@}".
    NOTE: the leading ``\\b`` needs a word character before the mention, so a mention
    at the very start of the text is left as is.
    """
    if isinstance(tweet, str):
        # mask web urls
        for found_url in extractor.find_urls(tweet):
            tweet = tweet.replace(found_url, "{{URL}}")
        # format twitter account
        tweet = re.sub(r"\b(\s*)(@[\S]+)\b", r'\1{\2@}', tweet)
    return tweet

In [None]:
df_brd["text"] = df_brd["tweet_text"].apply(format_tweet)

In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_brd)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# fixed seed so the held-out test rows stay the same across kernel restarts
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./llama2-sentiment-analysis",
    per_device_train_batch_size=40,
    per_device_eval_batch_size=40,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=500,
    learning_rate=5e-5,
    save_total_limit=2
)

In [None]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

In [None]:
trainer.train()

In [None]:
model.save_pretrained("./llama2-sentiment-analysis-finetuned-brdsa")
tokenizer.save_pretrained("./llama2-sentiment-analysis-finetuned-brdsa")

In [None]:
trainer.evaluate(test_dataset)

### applying the model

In [None]:
# AutoModelForSequenceClassification is the class used below, so that is what gets imported
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import BitsAndBytesConfig
from peft import PeftModel

# first loading base model, then reloading fine-tuned model on top of it with PEFT
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForSequenceClassification.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=quantization_config,
    device_map="auto",
    num_labels=3
)

# directory the adapter and tokenizer are saved to after fine-tuning on the brand data
brdsa_dir = "./llama2-sentiment-analysis-finetuned-brdsa"

model_ft = PeftModel.from_pretrained(
    base_model,
    brdsa_dir
)

tokenizer_ft = AutoTokenizer.from_pretrained(brdsa_dir)

In [None]:
df_tweets_bigtech_10k = pd.read_csv("tweets_bigtech_10ksample.csv")
df_tweets_bigtech_10k.dropna(inplace=True)
df_tweets_bigtech_2k = df_tweets_bigtech_10k.sample(n=2000)
df_tweets_bigtech_2k.head()

In [None]:
llama_pred(model_ft, tokenizer_ft, df_tweets_bigtech_2k, "text")

In [None]:
performance_metrics(df_tweets_bigtech_2k, "sentiment", "llama_ft_prediction")

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# second evaluation function to confirm accuracy of results
def evaluate_sentiment(df, true_col, pred_col):
    """Evaluate predictions on the rows where both columns are present.

    Rows with a missing value in ``true_col`` or ``pred_col`` are dropped first. A
    classification report is printed, and a dict with accuracy, weighted precision,
    recall, F1 and the number of evaluated rows ("num_samples") is returned.
    """
    complete_rows = df.dropna(subset=[true_col, pred_col])
    y_true = complete_rows[true_col]
    y_pred = complete_rows[pred_col]

    overall_accuracy = accuracy_score(y_true, y_pred)
    # fourth element (support) is not needed
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')

    print(classification_report(y_true, y_pred))

    return dict(
        accuracy=overall_accuracy,
        precision=weighted_scores[0],
        recall=weighted_scores[1],
        f1=weighted_scores[2],
        num_samples=len(complete_rows),
    )

metrics = evaluate_sentiment(df_tweets_bigtech_2k, "sentiment", "llama_ft_prediction")
print(f"Accuracy: {metrics['accuracy']:.4f}")
print(f"F1 Score: {metrics['f1']:.4f}")

In [None]:
llama_pred(model_ft, tokenizer_ft, df_brd, "text")

In [None]:
performance_metrics(df_brd, "labels", "llama_ft_prediction")

## fine-tuning with sentiment corpus

In [4]:
import pandas as pd
df_sc = pd.read_json("sentiment_corpus.json", orient="records")
df_sc.head()

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_name = "meta-llama/Llama-2-7b-hf"
llama_t = AutoTokenizer.from_pretrained(model_name)
llama = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    num_labels=3 # specifying num_labels resolves runtime CUDA error
)

llama_t.pad_token = llama_t.eos_token
llama.config.pad_token_id = llama.config.eos_token_id

In [6]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

llama = prepare_model_for_kbit_training(llama)

lora_config = LoraConfig(
    r=16,  # rank of the low-rank adaptation
    lora_alpha=32,  # scaling factor
    target_modules=["k_proj", "q_proj", "v_proj", "o_proj"],  # target modules for LoRA
    lora_dropout=0.05,  # dropout rate
    bias="none",  # bias handling
    task_type="SEQ_CLS" # sentiment analysis is a form of sequence classification
)

llama = get_peft_model(llama, lora_config)

In [7]:
from datasets import Dataset

ds = Dataset.from_pandas(df_sc)

def tokenize_function(examples):
    """Tokenize the "text" field of a batch with ``llama_t``.

    Every sequence is padded or truncated to 128 tokens.
    """
    batch_texts = examples["text"]
    return llama_t(
        batch_texts,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

In [8]:
tokenized_dataset = ds.map(tokenize_function, batched=True)

In [9]:
train_test_split1 = tokenized_dataset.train_test_split(test_size=0.15, seed=42)
train_val_dataset = train_test_split1['train']
test_dataset = train_test_split1['test']

train_test_split2 = train_val_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split2['train']
val_dataset = train_test_split2['test']

In [10]:
from transformers import TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, precision_score

def compute_metrics(pred):
    """Turn a prediction object into accuracy and weighted precision/recall/F1.

    ``pred.label_ids`` holds the true class ids and ``pred.predictions`` the scores
    per class; the predicted class is the argmax over the last axis.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # fourth element (support) is not needed
    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": weighted[0],
        "recall": weighted[1],
        "f1": weighted[2],
    }

# class names, kept for reporting only
label_names = ["negative", "neutral", "positive"]

# batch size >= 64 results in out of memory error even with best GPU (A100)
training_args = TrainingArguments(
    output_dir="./llama2-sentiment-analysis",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=1e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    save_total_limit=2,
    # TrainingArguments.label_names expects the input keys that hold the labels, not the
    # class names; with the class names the Trainer finds no labels at evaluation time and
    # skips the eval loss and compute_metrics
    label_names=["labels"]
)

In [None]:
trainer = CustomTrainer(
    model=llama,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=llama_t
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate(test_dataset)

In [None]:
llama.save_pretrained("./llama2-sentiment-corpus")
llama_t.save_pretrained("./llama2-sentiment-corpus")

In [None]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader
from tqdm import tqdm

# class names, kept for reporting only
label_names = ["negative", "neutral", "positive"]

# seven epochs with one checkpoint kept per epoch, so each epoch can be evaluated afterwards
training_args = TrainingArguments(
    output_dir="./sentiment-llama-2-7b",
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    num_train_epochs=7,
    save_total_limit=7,
    # TrainingArguments.label_names expects the input keys that hold the labels, not the
    # class names; with the class names the Trainer finds no labels at evaluation time and
    # reports no eval loss
    label_names=["labels"]
)

trainer = Trainer(
    model=llama,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=llama_t
)

trainer.train()

In [12]:
# AutoModelForSequenceClassification is the class used below, so that is what gets imported
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import BitsAndBytesConfig
from peft import PeftModel

# 4-bit NF4 quantization with double quantization, computing in bfloat16
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# quantized base model with a 3-class classification head; adapters are loaded on top of it
base_llama = AutoModelForSequenceClassification.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=quantization_config,
    device_map="auto",
    num_labels=3
)

In [15]:
# Evaluates every saved checkpoint on the validation split and remembers the one
# with the highest weighted F1-score.
best_f1 = 0
best_epoch = 0  # 0-based; printed as best_epoch+1 at the end

val_df = val_dataset.to_pandas()

id_to_sentiment = {0: "negative", 1: "neutral", 2: "positive"}

# checkpoint folder suffixes — presumably the global step at the end of each epoch; verify
# against the contents of ./sentiment-llama-2-7b
folders = [113, 226, 339, 452, 565, 678, 784]
for epoch in range(7):
    foldernr = folders[epoch]
    checkpoint_dir = f"./sentiment-llama-2-7b/checkpoint-{str(foldernr)}"
    print(f"Loading model from {checkpoint_dir}")

    # NOTE: this rebinds the notebook-level names `model` and `tokenizer`
    # NOTE(review): the same base_llama object is handed to PeftModel.from_pretrained in every
    # iteration — confirm that loading a new adapter onto an already adapted base gives the
    # intended weights (otherwise unload the previous adapter first)
    model = PeftModel.from_pretrained(
    base_llama,
    checkpoint_dir)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

    # llama_pred writes its prediction column into the frame it receives, hence the copy
    val_df_copy = val_df.copy()
    val_df_with_preds = llama_pred(
        model,
        tokenizer,
        val_df_copy,
        text_col="text"
    )

    true_labels = val_df_with_preds["labels"].tolist()
    pred_labels = val_df_with_preds["llama_ft_prediction"].tolist()

    # output_dict=True returns the report as a nested dict instead of a string
    report = classification_report(
        true_labels,
        pred_labels,
        target_names=list(id_to_sentiment.values()),
        output_dict=True
    )

    accuracy = report["accuracy"]
    precision = report["weighted avg"]["precision"]
    recall = report["weighted avg"]["recall"]
    f1 = report["weighted avg"]["f1-score"]

    print(f"Epoch {epoch+1}: Accuracy={accuracy:.4f}, Precision={precision:.4f}, Recall={recall:.4f}, F1={f1:.4f}")

    print("\nPer-class metrics:")
    for sentiment_class in id_to_sentiment.values():
        print(f"{sentiment_class}: F1={report[sentiment_class]['f1-score']:.4f}, " +
              f"Precision={report[sentiment_class]['precision']:.4f}, " +
              f"Recall={report[sentiment_class]['recall']:.4f}")

    # strict comparison: on a tie the earlier epoch is kept
    if f1 > best_f1:
        best_f1 = f1
        best_epoch = epoch

print(f"Best epoch according to F1-score: {best_epoch+1}, F1: {best_f1:.4f}")

### applying the model

### after 2 epochs

In [None]:
# AutoModelForSequenceClassification is the class used below, so that is what gets imported
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import BitsAndBytesConfig
from peft import PeftModel

# first loading base model, then reloading fine-tuned model on top of it with PEFT
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForSequenceClassification.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=quantization_config,
    device_map="auto",
    num_labels=3
)

model_corpus_ft = PeftModel.from_pretrained(
    base_model,
    "./llama2-sentiment-corpus"
)

tokenizer_corpus_ft = AutoTokenizer.from_pretrained("./llama2-sentiment-corpus")

In [18]:
df_corpus_test = test_dataset.to_pandas()

In [None]:
llama_pred(model_corpus_ft, tokenizer_corpus_ft, df_corpus_test, "text")

In [23]:
# extended function for performance metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def performance_metrics(df, label, prediction):
    """Print overall scores and a per-class report for ``df[prediction]`` vs. ``df[label]``.

    Accuracy and weighted precision, recall and F1 are printed with four decimals,
    followed by a classification report that names the classes negative, neutral and
    positive. Nothing is returned.
    """
    y_true, y_pred = df[label], df[prediction]
    averaging = {"average": "weighted"}

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, **averaging)
    recall = recall_score(y_true, y_pred, **averaging)
    f1 = f1_score(y_true, y_pred, **averaging)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    print("\nClassification Report:")
    class_names = ["negative", "neutral", "positive"]
    print(classification_report(y_true, y_pred, target_names=class_names, digits=4))

In [None]:
performance_metrics(df_corpus_test, "labels", "llama_ft_prediction")

In [None]:
df_corpus_test.head()

### after new method with checkpointing

In [16]:
# first loading base model, then reloading fine-tuned model on top of it with PEFT
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_llama = AutoModelForSequenceClassification.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=quantization_config,
    device_map="auto",
    num_labels=3
)

llama_corpus_4 = PeftModel.from_pretrained(
    base_llama,
    "./sentiment-llama-2-7b/checkpoint-452"
)

llama_corpus_4_t = AutoTokenizer.from_pretrained("./sentiment-llama-2-7b/checkpoint-452")

In [19]:
llama_pred(llama_corpus_4, llama_corpus_4_t, df_corpus_test, "text")

In [24]:
performance_metrics(df_corpus_test, "labels", "llama_ft_prediction")