<a href="https://colab.research.google.com/github/Joongeun/MIT-Internship-2024/blob/main/climategpt_finetuning_LoRa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Install packages and log in to huggingface to upload finetuned model
!pip install accelerate peft bitsandbytes transformers trl wandb datasets torch
!huggingface-cli login

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer
import pandas as pd

In [None]:
# Base checkpoint to fine-tune, plus the pre-split train/test CSV files.
base_model = "eci-io/climategpt-7b"

# Both CSVs are requested with split="train"; each call returns a single Dataset.
train_ds, test_ds = (
    load_dataset("csv", data_files=csv_file, split="train")
    for csv_file in ("sf_train.csv", "sf_test.csv")
)
train_ds, test_ds

(Dataset({
     features: ['text', 'labels'],
     num_rows: 205
 }),
 Dataset({
     features: ['text', 'labels'],
     num_rows: 89
 }))

In [None]:
# 4-bit NF4 quantization settings for loading the base model:
# matmuls are computed in float16 and double quantization is left off.
compute_dtype = torch.float16

quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

In [None]:
# Load the base checkpoint with the quantization settings defined above.
# device_map={"": 0} maps the root module to device index 0
# (presumably the first GPU -- confirm for multi-GPU setups).
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map={"": 0},
    quantization_config=quant_config,
)
# Config tweaks applied before training.
# NOTE(review): `pretraining_tp = 1` looks like the usual Llama-family setting -- confirm.
model.config.pretraining_tp = 1
model.config.use_cache = False

In [None]:
# Tokenizer that matches the base checkpoint.
# NOTE(review): trust_remote_code=True allows code shipped with the model repo to run
# locally -- confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Pad on the right, reusing the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# LoRA adapter configuration (rank 64, alpha 16, 10% dropout, no bias terms).
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

# Training hyper-parameters. Checkpoints and logs are written to ./results.
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-5,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,  # -1: train for num_train_epochs instead of a fixed step count
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    hub_model_id="finetuned-climategpt-7b", #Name of finetuned model when saving
    # Never hard-code a Hub token in the notebook. Read it from the HF_TOKEN
    # environment variable; when it is unset this is None, and the token cached by
    # `huggingface-cli login` is used instead.
    hub_token=os.environ.get("HF_TOKEN"),
)

# Supervised fine-tuning on the raw "text" column, one example per sequence (no packing).
# NOTE(review): passing `dataset_text_field`, `max_seq_length` and `tokenizer` directly
# to SFTTrainer appears to depend on the installed trl version -- TODO confirm.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_ds,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)
trainer.train()

In [None]:
# Directory name for an optional local copy of the fine-tuned model; it is only
# used by the two commented-out save calls below.
new_model="finetuned_climategpt"
# Optional local save (disabled):
# trainer.model.save_pretrained(new_model)
# trainer.tokenizer.save_pretrained(new_model)
# Upload the trained model to the Hugging Face Hub.
trainer.push_to_hub()

('finetuned_climategpt/tokenizer_config.json',
 'finetuned_climategpt/special_tokens_map.json',
 'finetuned_climategpt/tokenizer.json')

In [None]:
#Evaluating fine-tuned model on test dataset
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Marker after which the model is expected to emit the single-digit label.
ANSWER_MARKER = "ANSWER: "

test_texts = test_ds["text"]
test_labels = test_ds["labels"]

# The model was last used for training; switch to eval mode so dropout
# (including the LoRA dropout) is disabled while generating.
model.eval()
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

rows = []
for ind, text in enumerate(test_texts):
    # Cut the prompt right after the answer marker. If the `text` column already
    # contains the gold answer (it is the same column used for fine-tuning -- verify
    # against the CSV), feeding it unmodified and parsing prompt + generation would
    # read the label back from the prompt instead of from the model.
    # When the marker is absent the text is used unchanged.
    head, marker, _ = text.partition(ANSWER_MARKER)
    prompt = head + marker

    # return_full_text=False: only the newly generated text is returned,
    # so the prediction can never come from the prompt itself.
    generated = pipe(
        prompt,
        num_return_sequences=1,
        max_new_tokens=1,
        return_full_text=False,
    )[0]["generated_text"]

    # First non-whitespace character of the generation; an empty string is recorded
    # (instead of raising IndexError) when the model produced nothing usable.
    result = generated.strip()[:1]
    print(str(ind)+":", str(result))
    rows.append(result)

# Persist the raw predictions so they can be scored (and re-scored) later.
df = pd.DataFrame({"outputs": rows})
df.to_csv("finetuned_model_outputs.csv", index=False)

def compute_accuracy(path, preds_col, y_true=None):
    """Score predictions stored in a CSV file against the gold labels.

    Args:
        path: Path of the CSV file holding the model predictions.
        preds_col: Name of the column in that file containing the predictions.
        y_true: Gold labels, in the same order as the predictions. When omitted,
            the notebook-level ``test_labels`` is used, so existing two-argument
            calls keep working.

    Returns:
        dict with the keys "accuracy", "precision", "recall",
        "f1 for each class" (per-class F1, ``average=None``) and "f1 weighted".
        Precision, recall and the weighted F1 use ``average="weighted"``.
    """
    if y_true is None:
        # Backward-compatible default: fall back to the labels loaded by the notebook.
        y_true = test_labels
    y_pred = pd.read_csv(path)[preds_col].tolist()

    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred, average="weighted")
    precision = precision_score(y_true=y_true, y_pred=y_pred, average="weighted")
    f1 = f1_score(y_true=y_true, y_pred=y_pred, average=None)
    f1_weighted = f1_score(y_true=y_true, y_pred=y_pred, average="weighted")
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1 for each class": f1, "f1 weighted": f1_weighted}

# Score the predictions written by the inference loop and show the metrics.
path, preds_col = "finetuned_model_outputs.csv", "outputs"

scores = compute_accuracy(path=path, preds_col=preds_col)
print(scores)

0: 1
1: 5
2: 5
3: 1
4: 1
5: 1
6: 1
7: 1
8: 1
9: 1
10: 5
11: 1
12: 1
13: 4
14: 1
15: 1
16: 2
17: 1
18: 1
19: 1
20: 1
21: 2
22: 2
23: 1
24: 1
25: 4
26: 1
27: 5
28: 5
29: 1
30: 2
31: 1
32: 5
33: 1
34: 5
35: 5
36: 1
37: 5
38: 1
39: 5
40: 4
41: 2
42: 1
43: 1
44: 5
45: 1
46: 5
47: 5
48: 1
49: 2
50: 5
51: 3
52: 1
53: 4
54: 1
55: 4
56: 1
57: 1
58: 1
59: 1
60: 1
61: 1
62: 1
63: 2
64: 5
65: 2
66: 1
67: 2
68: 4
69: 1
70: 5
71: 1
72: 1
73: 1
74: 2
75: 1
76: 4
77: 1
78: 1
79: 4
80: 1
81: 1
82: 1
83: 5
84: 1
85: 5
86: 1
87: 1
88: 2
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1 for each class': array([1., 1., 1., 1., 1.]), 'f1 weighted': 1.0}


In [None]:
#Importing fine-tuned model from huggingface hub and seeing if it works
# Imports are repeated here so this smoke test also runs on its own in a fresh session.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name_or_path = "Joon007/finetuned-climategpt-7b" #path/to/your/model/or/name/on/hub
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(model_name_or_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# Single example prompt; it ends with the answer marker so the next token is the label.
prompt = "This is a climate-misinformation classification task. Your task is that of telling whether the given text presents a contrarian claim regarding climate change. Your reply should be: 1: incorrect/inaccurate/flawed reasoning, 2: unsupported/misleading, 3: lacks context/imprecise/partially correct, 4: mostly accurate/mostly correct, or 5: accurate/correct. Your reply should contain only the corresponding number and nothing else (i.e., 1, 2, 3, 4, or 5). Terminate your response after including the number. Don't say im_end The claim is: \"Most likely the primary control knob [on climate change] is the ocean waters and this environment that we live in.\" ANSWER: "

# Calling the tokenizer (rather than .encode) also yields the attention mask,
# which is passed to generate() explicitly.
encoded = tokenizer(prompt, return_tensors="pt").to(device)
inputs = encoded["input_ids"]
outputs = model.generate(
    inputs,
    attention_mask=encoded["attention_mask"],
    num_return_sequences=1,
    max_new_tokens=1,
)
# Prints the prompt followed by the single generated token.
print(tokenizer.decode(outputs[0]))