## Fine-tune Mixtral-7B for sentiment


In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import os
os.environ["WANDB_DISABLED"] = "true"

In the following cell there are all the other imports for running the notebook

In [4]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

[2024-02-22 14:12:01,100] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [5]:
print(f"pytorch version {torch.__version__}")

pytorch version 2.1.2+cu121


In [6]:
filename = "/home/jomondal/experiments/mywork/llama_exp/dataset/fin_sentiment/all-data.csv"

df = pd.read_csv(filename, 
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")

X_train = list()
X_test = list()
for sentiment in ["positive", "neutral", "negative"]:
    train, test  = train_test_split(df[df.sentiment==sentiment], 
                                    train_size=300,
                                    test_size=300, 
                                    random_state=42)
    X_train.append(train)
    X_test.append(test)

X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)

eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
X_eval = df[df.index.isin(eval_idx)]
X_eval = (X_eval
          .groupby('sentiment', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
X_train = X_train.reset_index(drop=True)

def generate_prompt(data_point):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{data_point["text"]}] = {data_point["sentiment"]}
            """.strip()

def generate_test_prompt(data_point):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{data_point["text"]}] = """.strip()

X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1), 
                       columns=["text"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1), 
                      columns=["text"])

y_true = X_test.sentiment
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

In [7]:
def evaluate(y_true, y_pred):
    labels = ['positive', 'neutral', 'negative']
    mapping = {'positive': 2, 'neutral': 1, 'none':1, 'negative': 0}
    def map_func(x):
        return mapping.get(x, 1)
    
    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true)) 
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [8]:
model_name = "/home/jomondal/experiments/mywork/pretrained_models/Mixtral-8x7B-Instruct-v0.1"
trained_model_name = f"/home/jomondal/experiments/mywork/myTasks/mlsp/trained_models/sentiment_fintech_{model_name.split('/')[-1]}"

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

# tokenizer = AutoTokenizer.from_pretrained(model_name, 
#                                           trust_remote_code=True,
#                                          )

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [9]:
def predict(test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["text"]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens = 1, 
                        temperature = 0.001,
                       )
        result = pipe(prompt,pad_token_id=tokenizer.eos_token_id)
        answer = result[0]['generated_text'].split("=")[-1]
        if "positive" in answer:
            y_pred.append("positive")
        elif "negative" in answer:
            y_pred.append("negative")
        elif "neutral" in answer:
            y_pred.append("neutral")
        else:
            y_pred.append("none")
    return y_pred

In [10]:
y_pred = predict(test, model, tokenizer)

100%|█████████████████████████████████████████████████████████████████████████████| 900/900 [06:31<00:00,  2.30it/s]


In [11]:
evaluate(y_true, y_pred)

Accuracy: 0.502
Accuracy for label 0: 0.160
Accuracy for label 1: 0.880
Accuracy for label 2: 0.467

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.16      0.27       300
           1       0.39      0.88      0.54       300
           2       0.80      0.47      0.59       300

    accuracy                           0.50       900
   macro avg       0.72      0.50      0.47       900
weighted avg       0.72      0.50      0.47       900


Confusion Matrix:
[[ 48 252   0]
 [  1 264  35]
 [  1 159 140]]


In [12]:
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=64,
    bias="none",
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir="logs",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8, # 4
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    evaluation_strategy="epoch"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    max_seq_length=1024,
)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Detected kernel version 5.4.17, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [13]:
# Train model
trainer.train()

Epoch,Training Loss,Validation Loss
0,0.7504,0.812775
2,0.5696,0.809627


TrainOutput(global_step=336, training_loss=0.7159555227983565, metrics={'train_runtime': 2343.4953, 'train_samples_per_second': 1.152, 'train_steps_per_second': 0.143, 'total_flos': 7.635113166665318e+16, 'train_loss': 0.7159555227983565, 'epoch': 2.99})

In [14]:
# Save trained model
trainer.model.save_pretrained(trained_model_name)
trainer.tokenizer.save_pretrained(trained_model_name)

('/home/jomondal/experiments/mywork/myTasks/mlsp/trained_models/sentiment_fintech_Mixtral-8x7B-Instruct-v0.1/tokenizer_config.json',
 '/home/jomondal/experiments/mywork/myTasks/mlsp/trained_models/sentiment_fintech_Mixtral-8x7B-Instruct-v0.1/special_tokens_map.json',
 '/home/jomondal/experiments/mywork/myTasks/mlsp/trained_models/sentiment_fintech_Mixtral-8x7B-Instruct-v0.1/tokenizer.model',
 '/home/jomondal/experiments/mywork/myTasks/mlsp/trained_models/sentiment_fintech_Mixtral-8x7B-Instruct-v0.1/added_tokens.json',
 '/home/jomondal/experiments/mywork/myTasks/mlsp/trained_models/sentiment_fintech_Mixtral-8x7B-Instruct-v0.1/tokenizer.json')

In [15]:
y_pred = predict(test, model, tokenizer)
evaluate(y_true, y_pred)

100%|█████████████████████████████████████████████████████████████████████████████| 900/900 [07:02<00:00,  2.13it/s]

Accuracy: 0.851
Accuracy for label 0: 0.957
Accuracy for label 1: 0.760
Accuracy for label 2: 0.837

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       300
           1       0.81      0.76      0.78       300
           2       0.81      0.84      0.83       300

    accuracy                           0.85       900
   macro avg       0.85      0.85      0.85       900
weighted avg       0.85      0.85      0.85       900


Confusion Matrix:
[[287  11   2]
 [ 17 228  55]
 [  5  44 251]]





In [16]:
evaluation = pd.DataFrame({'text': X_test["text"], 
                           'y_true':y_true, 
                           'y_pred': y_pred},
                         )
evaluation.to_csv("results/fintech_sentiment/test_predictions.csv", index=False)