In [1]:
!pip install datasets
!pip install trl



In [2]:
from enum import Enum
from functools import partial
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, set_seed
from datasets import load_dataset
from trl import SFTTrainer

In [3]:
from datasets import DatasetDict

In [4]:
dataset_name = "FinGPT/fingpt-sentiment-train"
text_column = "input"
label_column = "output"
dataset = load_dataset(dataset_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 76772
    })
})

In [6]:
train_dataset = dataset["train"]  # Access the train dataset within the DatasetDict

# Define the desired test size (e.g., 20%)
test_size = 0.2

# Split the train dataset into train and test subsets
train_test_split = train_dataset.train_test_split(test_size=test_size)

# Access the train and test datasets after the split
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# (Optional) Create a new DatasetDict to hold the split datasets
new_data = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

In [7]:
new_data

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 61417
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 15355
    })
})

In [8]:
# wandb.init(project="prompt_learning_methods", name="prompt_tuning")
seed = 42
device = "cuda"
max_length = 64
lr = 1e-4
num_epochs = 10
batch_size = 8
set_seed(seed)

In [9]:
import numpy as np
classes = list(np.unique(dataset["train"]["output"]))

In [10]:
classes

['mildly negative',
 'mildly positive',
 'moderately negative',
 'moderately positive',
 'negative',
 'neutral',
 'positive',
 'strong negative',
 'strong positive']

In [11]:
from collections import Counter
Counter(new_data["train"]["output"])

Counter({'neutral': 23402,
         'moderately negative': 2326,
         'positive': 17279,
         'moderately positive': 4904,
         'negative': 9436,
         'mildly negative': 1682,
         'mildly positive': 2032,
         'strong positive': 175,
         'strong negative': 181})

In [12]:
tokenizer = AutoTokenizer.from_pretrained("Locutusque/TinyMistral-248M")
model = AutoModelForCausalLM.from_pretrained("Locutusque/TinyMistral-248M")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
from types import new_class
new_classes = []
for class_name in classes:
  string_list = class_name.split()
  if len(string_list) > 1:
    new_classes.append(string_list[0]+"_"+string_list[1])
  else:
    new_classes.append(string_list[0])
new_classes

['mildly_negative',
 'mildly_positive',
 'moderately_negative',
 'moderately_positive',
 'negative',
 'neutral',
 'positive',
 'strong_negative',
 'strong_positive']

In [14]:
[len(tokenizer(class_label)["input_ids"]) for class_label in new_classes]

[5, 6, 5, 6, 2, 2, 2, 4, 5]

In [15]:
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in new_classes])
print(f"{target_max_length=}")

target_max_length=6


In [16]:
def preprocess_function(examples):
  batch_size = len(examples[text_column])
  inputs = [f"{text_column} : {x}\noutput : " for x in examples[text_column]]
  targets = [str(x) for x in examples[label_column]]
  model_inputs = tokenizer(inputs)
  labels = tokenizer(targets, add_special_tokens=False)  # don't add bos token because we concatenate with inputs
  for i in range(batch_size):
    sample_input_ids = model_inputs["input_ids"][i]
    label_input_ids = labels["input_ids"][i] + [tokenizer.eos_token_id]
    # print(i, sample_input_ids, label_input_ids)
    model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
    labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
    model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])

  for i in range(batch_size):
    sample_input_ids = model_inputs["input_ids"][i]
    label_input_ids = labels["input_ids"][i]
    model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (max_length - len(sample_input_ids)) + sample_input_ids
    model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs["attention_mask"][i]
    labels["input_ids"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids
    model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])

    model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
    labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [17]:
def change_output_class(examples):
  joined_outputs = []
  output_value = examples["output"]
  for value in output_value:
    if len(value.split()) > 1:  # Check for multiple words
      joined_outputs.append("_".join(value.split()))
    else:
      joined_outputs.append(value)  # Keep single words unchanged
  examples["output"] = joined_outputs
  return examples

In [18]:
dataset_2 = new_data.map(
    change_output_class,
    batched=True)

Map:   0%|          | 0/61417 [00:00<?, ? examples/s]

Map:   0%|          | 0/15355 [00:00<?, ? examples/s]

In [19]:
dataset_2["train"][26]

{'input': "Netflix lost a million subscribers (and that's a good thing!) | Engadget Podcast Read our full story on Engadget: Get More Engadget: • Like us on Facebook: http://www.facebook.com/engadget • Follow us on Twitter: http://www.twitter.com/engadget • Follow us on Instagram: http://www.instagram.com/engadget • Read more: http://www.engadget.com",
 'output': 'mildly_negative',
 'instruction': 'What is the sentiment of this news? Please choose an answer from {strong negative/moderately negative/mildly negative/neutral/mildly positive/moderately positive/strong positive}.'}

In [20]:
train_dataset = dataset_2["train"].map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

Map:   0%|          | 0/61417 [00:00<?, ? examples/s]

In [21]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 61417
})

In [22]:
def test_preprocess_function(examples):
  batch_size = len(examples[text_column])
  inputs = [f"{text_column} : {x}\noutput : " for x in examples[text_column]]
  model_inputs = tokenizer(inputs)
  for i in range(batch_size):
    sample_input_ids = model_inputs["input_ids"][i]
    model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (max_length - len(sample_input_ids)) + sample_input_ids

    model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs["attention_mask"][i]
    model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
    model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])

  return model_inputs

In [23]:
test_dataset = dataset_2["test"].map(
    test_preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/15355 [00:00<?, ? examples/s]

In [24]:
test_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 15355
})

In [25]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 61417
})

In [26]:
!pip install peft



In [27]:
from peft import get_peft_model, LoraConfig, TaskType
peft_config = LoraConfig(r=8,lora_alpha=16,lora_dropout=0.1,task_type=TaskType.CAUSAL_LM)

In [28]:
model.resize_token_embeddings(len(tokenizer))
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 319,488 || all params: 248,343,552 || trainable%: 0.1286


In [29]:
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 8
logging_steps = 5
learning_rate = 5e-4
max_grad_norm = 1.0
max_steps = 250
num_train_epochs=1
warmup_ratio = 0.1
lr_scheduler_type = "cosine"
max_seq_length = 2048

In [30]:
output_dir = "openhathi_instruct"

In [31]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    save_strategy="no",
    evaluation_strategy="epoch",
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    weight_decay=0.1,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    fp16=True,
    num_train_epochs=num_train_epochs
)




In [32]:
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset= train_dataset,
    eval_dataset= test_dataset,
    tokenizer=tokenizer,
    packing=True,
    max_seq_length=max_seq_length,
)



In [33]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,0.1201,No log


TrainOutput(global_step=7677, training_loss=0.6903103836714422, metrics={'train_runtime': 4590.917, 'train_samples_per_second': 13.378, 'train_steps_per_second': 1.672, 'total_flos': 5083957882257408.0, 'train_loss': 0.6903103836714422, 'epoch': 0.9999837178631323})

In [39]:
trainer.save_model()



In [34]:
dataset_2["test"][0]

{'input': "The presentation material can be viewed on the company 's website in English after the conference .",
 'output': 'neutral',
 'instruction': 'What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}.'}

In [37]:
import random
total_rows = len(test_dataset)
# Sample 5 random indices without replacement
random_indices = random.sample(range(total_rows), 5)
# Select the data points using the indices
sampled_data = test_dataset.select(random_indices)

In [42]:
import numpy as np

In [45]:
input_ids = sampled_data["input_ids"]
attention_mask = sampled_data["attention_mask"]
input_ids = np.array(input_ids)
attention_mask = np.array(attention_mask)
# Make predictions using your model
predictions = model(input_ids, attention_mask=attention_mask)
# Access the predicted labels or probabilities depending on your model output
predicted_labels = predictions.logits.argmax(-1)  # For classification models

AttributeError: 'numpy.ndarray' object has no attribute 'device'

In [54]:
model.eval()
i = 36
inputs = tokenizer(f'{text_column} : {dataset_2["test"][i]["input"]}\nLabel : ', return_tensors="pt")
print(dataset_2["test"][i]["input"])
print("\n")
print(dataset_2["test"][i]["output"])

External net sales from the printing business fell by 43.7 % , partly due to the termination of the printing contract between Ilkka-Yhtyma 's printing house I-print Oy and sector player HSS Media AB in December 2009 and the fall in printing prices .


negative


In [56]:
inputs

{'input_ids': tensor([[32000,  2787,   714,  1529,  2885,  2512,  6292,   477,   272, 19246,
          1955,  5970,   486, 28705, 28781, 28770, 28723, 28787,  1239,  1200,
         19208,  2940,   298,   272,  1850,  2235,   302,   272, 19246,  5001,
          1444,  2661, 28729,  2117, 28733, 28802,   407, 28724,   705,   464,
         28713, 19246,  2134,   315, 28733,  2031,   451, 28724,   304,  9642,
          4385,   382,  1383,  9347, 18322,   297,  4925, 28705, 28750, 28734,
         28734, 28774,   304,   272,  2949,   297, 19246,  8506,   842,    13,
          4565,   714, 28705]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}

In [59]:
input_ids = test_dataset[i]["input_ids"]
attention_mask = test_dataset[i]["attention_mask"]

In [66]:
np.array(input_ids)

array([32000,  2787,   714,  1529,  2885,  2512,  6292,   477,   272,
       19246,  1955,  5970,   486, 28705, 28781, 28770, 28723, 28787,
        1239,  1200, 19208,  2940,   298,   272,  1850,  2235,   302,
         272, 19246,  5001,  1444,  2661, 28729,  2117, 28733, 28802,
         407, 28724,   705,   464, 28713, 19246,  2134,   315, 28733,
        2031,   451, 28724,   304,  9642,  4385,   382,  1383,  9347,
       18322,   297,  4925, 28705, 28750, 28734, 28734, 28774,   304,
         272])

In [67]:
with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=10, eos_token_id=tokenizer.eos_token_id
    )

Setting `pad_token_id` to `eos_token_id`:32001 for open-end generation.


In [68]:
print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0])

input : External net sales from the printing business fell by 43.7 % , partly due to the termination of the printing contract between Ilkka-Yhtyma 's printing house I-print Oy and sector player HSS Media AB in December 2009 and the fall in printing prices .
Label :  negative
