In [None]:
# Check which GPUs are visible on this instance.
# The leading `!` hands the rest of the line to the system shell.
!nvidia-smi

In [None]:
# Install required libraries

%%capture
%pip install -q huggingface_hub
%pip install -q -U trl transformers accelerate bitsandbytes xformers
%pip install -q -U datasets einops wandb evaluate

In [None]:
# Install git lfs

%%capture
%conda install --yes -c conda-forge git-lfs

%git lfs install --yes

In [None]:
# Import Libraries

from datasets import load_dataset
import torch
from transformers import Trainer, TrainingArguments
import wandb
from huggingface_hub import notebook_login
import evaluate
import numpy as np
from transformers import AutoConfig, AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import os

In [None]:
# Log in to Weights & Biases (WandB) so training runs can be reported there
wandb.login()

In [None]:
# Set the WANDB_PROJECT environment variable for this kernel via the %env magic.
# NOTE(review): presumably this is the W&B project that runs get logged under — confirm.
%env WANDB_PROJECT=sentiment_finance

In [None]:
# Log in to your Hugging Face account
# Get your API token here: https://huggingface.co/settings/tokens

notebook_login()

In [None]:
# Load dataset (load_dataset is already imported in the imports cell at the top)
dataset = load_dataset("financial_phrasebank", "sentences_50agree", split="train")

In [None]:
# Create train and test datasets
# The first TRAIN_SIZE rows are used for training and every remaining row is
# held out for evaluation. Using len(dataset) as the upper bound keeps all rows
# after the split point instead of relying on a hard-coded row count.
# NOTE(review): the split is positional (no shuffling) — confirm the rows are
# not ordered by label, otherwise the two splits have different class mixes.
TRAIN_SIZE = 3873
train_dataset = dataset.select(range(TRAIN_SIZE))
test_dataset = dataset.select(range(TRAIN_SIZE, len(dataset)))

# Every row must land in exactly one split
assert len(train_dataset) + len(test_dataset) == len(dataset)

print(train_dataset[0])
print(test_dataset[0])

In [None]:
# Load the "accuracy" metric through the `evaluate` library

accuracy = evaluate.load("accuracy")

In [None]:
# Metric callback used during evaluation

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: a pair ``(predictions, labels)``; the predicted class is
            taken as the argmax over the last axis of ``predictions``.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids
        against the reference labels.
    """
    scores, references = eval_pred

    # Pick the highest-scoring class, then cast both arrays to int32
    predicted_ids = np.argmax(scores, axis=-1).astype(np.int32)
    references = references.astype(np.int32)

    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Define DistilBERT as our base model:
base_model_name = "distilbert-base-uncased"

# Names of the multiclass labels, listed in class-id order.
# Both lookup tables are derived from this single tuple so they cannot drift apart.
class_names = ("Negative", "Neutral", "Positive")
label2id = {name: idx for idx, name in enumerate(class_names)}
id2label = {str(idx): name for idx, name in enumerate(class_names)}

# Add labels to model config
config = AutoConfig.from_pretrained(
    base_model_name,
    label2id=label2id,
    id2label=id2label,
)

In [None]:
# Set device map according to instance
# Alternative: device_map = "auto"
# NOTE(review): the empty-string key presumably places the whole model on device index 0 — confirm.
device_map = {'': 0}

In [None]:
# Create model object

# Free memory on the current CUDA device in whole GiB, minus 2 GiB of headroom
# (clamped to at least 1 so the budget can never become zero or negative).
free_gib = max(int(torch.cuda.mem_get_info()[0] / 1024**3) - 2, 1)

model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    config=config,
    device_map=device_map,
    # Use the token stored by notebook_login(); replaces the deprecated use_auth_token.
    token=True,
    # max_memory is a mapping from device to budget, not a bare string.
    # It only takes effect when device_map is a placement strategy such as "auto".
    max_memory={0: f"{free_gib}GiB"},
    # trust_remote_code is intentionally not set: DistilBERT ships with
    # transformers, so no code from the Hub repository needs to be executed.
)

In [None]:
# Add more settings to model config

# NOTE(review): use_cache / pretraining_tp look like options aimed at
# decoder-style LLMs; confirm that DistilBERT actually reads them.
for option, value in (("use_cache", False), ("pretraining_tp", 1)):
    setattr(model.config, option, value)

In [None]:
# Create a function to get parameters of the model

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: any object exposing ``named_parameters()`` whose parameters
            provide ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    Prints one line with the trainable count, the total count and the trainable
    percentage. A model without parameters reports 0.0% instead of raising
    ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the division: a parameter-free model has nothing to train
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

# Report the parameter counts of the loaded model
print_trainable_parameters(model)

In [None]:
# Initialize tokenizer object from pre-trained model and test.
# Reuse base_model_name so the tokenizer always matches the checkpoint the model was loaded from.

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
print(tokenizer.encode("Hello this is a test"))

In [None]:
# Create tokenized train and test datasets with truncation
def preprocess_function(examples):
    """Tokenize the ``sentence`` field of a batch with truncation enabled."""
    sentences = examples["sentence"]
    return tokenizer(sentences, truncation=True)

# Apply the tokenizer to both splits in batched mode
train_dataset, test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

In [None]:
# Create data collator with padding
# NOTE(review): presumably pads each batch to its longest sequence at collation time — confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Set repository name and output folder

repo_name = "distilbert-finance"
output_dir = repo_name

# Set training args
training_args = TrainingArguments(
    output_dir=output_dir,

    # High batch size as it is a small model
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    logging_steps=20,  # Log training metrics every 20 steps
    weight_decay=0.001,
    # Number of prediction steps whose outputs are accumulated before being
    # moved to the CPU; lower this value if evaluation runs out of GPU memory.
    eval_accumulation_steps=64,
    num_train_epochs=10,
    lr_scheduler_type="cosine",  # Cosine learning-rate decay
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`
    # (needs transformers >= 4.41, which the `-U` install cell provides).
    eval_strategy="steps",
    eval_steps=20,  # Explicit: evaluate on the same cadence as logging
    save_strategy="no",
    #save_steps = 500,
    seed=42,
    save_safetensors=True,
    push_to_hub=True,
    gradient_checkpointing=True,
    auto_find_batch_size=True,  # Good for avoiding OOM
    #optim="paged_adamw_8bit",

    # other args and kwargs here
    report_to="wandb",  # enable logging to W&B
    run_name="distilbert (cosine2)",  # name of the W&B run (optional)
)

# Set Sequence length
# NOTE(review): this value is not passed to the Trainer below; it is kept only
# in case later cells read it. Truncation is applied by the tokenizer call.
max_seq_length = 512

# Initialize trainer object with args and dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument
    # (needs transformers >= 4.46, which the `-U` install cell provides).
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Train the model; evaluation follows the strategy configured in TrainingArguments

trainer.train()

In [None]:
# Empty PyTorch's CUDA cache after training to reduce the risk of OOM in later cells

torch.cuda.empty_cache()

In [None]:
# Finish the current WandB run so its logging is closed out

wandb.finish()

In [None]:
# Saves final checkpoint, model and tokenizer

# Build the path from training_args.output_dir rather than from output_dir
# itself, so re-running this cell does not nest "final_checkpoint" folders.
output_dir = os.path.join(training_args.output_dir, "final_checkpoint")
trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Upload the model to the Hub
# The first positional parameter of Trainer.push_to_hub is the commit message,
# not the repository name; the target repository is taken from TrainingArguments.
# Alternative without the Trainer:
# model.push_to_hub("distilbert-finance")
# tokenizer.push_to_hub("distilbert-finance")
trainer.push_to_hub(commit_message="End of training")