In [33]:
#Make essential imports
import torch
from transformers import BloomForCausalLM, BloomTokenizerFast, TrainingArguments, DataCollatorForLanguageModeling, Trainer
from tokenizers import Tokenizer
from tokenizers.processors import TemplateProcessing
from datasets import Dataset
import pandas as pd

In [2]:
#Detect device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
#Load the raw dataset
df = pd.read_json('original_data.json')
df

Unnamed: 0,instruction,input,output
0,Generate a physics excercise in the style of P...,,Due corpi vengono fatti cadere con velocità in...
1,Generate a physics excercise in the style of P...,,Un corpo viene lanciato verticalmente verso l’...
2,Generate a physics excercise in the style of P...,,Un corpo viene lanciato verticalmente verso l’...
3,Generate a physics excercise in the style of P...,,Un corpo viene lanciato verticalmente verso l’...
4,Generate a physics excercise in the style of P...,,Due corpi di massa diversa vengono lanciati ve...
...,...,...,...
449,Generate a physics exercise in the style of Po...,,In quanto tempo (come ordine di grandezza) un ...
450,Generate a math exercise in the style of Polim...,,Un triangolo equilatero ha un lato di lunghezz...
451,Generate a physics exercise in the style of po...,,Un oggetto viene lanciato verticalmente verso ...
452,Generate a math exercise in the style of Polim...,,Un negozio offre uno sconto del 30% su tutti i...


In [4]:
#Create a column containing the full prompt and answer
for i in range(len(df)):
    df.at[i, 'text'] = f"Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: {df.at[i, 'instruction']}. ### Response:{df.at[i, 'output']}"

df

Unnamed: 0,instruction,input,output,text
0,Generate a physics excercise in the style of P...,,Due corpi vengono fatti cadere con velocità in...,Below is an instruction that describes a task....
1,Generate a physics excercise in the style of P...,,Un corpo viene lanciato verticalmente verso l’...,Below is an instruction that describes a task....
2,Generate a physics excercise in the style of P...,,Un corpo viene lanciato verticalmente verso l’...,Below is an instruction that describes a task....
3,Generate a physics excercise in the style of P...,,Un corpo viene lanciato verticalmente verso l’...,Below is an instruction that describes a task....
4,Generate a physics excercise in the style of P...,,Due corpi di massa diversa vengono lanciati ve...,Below is an instruction that describes a task....
...,...,...,...,...
449,Generate a physics exercise in the style of Po...,,In quanto tempo (come ordine di grandezza) un ...,Below is an instruction that describes a task....
450,Generate a math exercise in the style of Polim...,,Un triangolo equilatero ha un lato di lunghezz...,Below is an instruction that describes a task....
451,Generate a physics exercise in the style of po...,,Un oggetto viene lanciato verticalmente verso ...,Below is an instruction that describes a task....
452,Generate a math exercise in the style of Polim...,,Un negozio offre uno sconto del 30% su tutti i...,Below is an instruction that describes a task....


In [9]:
#Mix the dataset
df = df.sample(frac = 1, ignore_index=True)
df

Unnamed: 0,instruction,input,output,text
0,Generate a physics exercise in the style of Po...,,Un filo rettilineo infinitamente lungo è perco...,Below is an instruction that describes a task....
1,Generate a physics excercise in the style of P...,,Un blocco di massa m = 1 kg è posto su un pian...,Below is an instruction that describes a task....
2,Generate a physics excercise in the style of P...,,In un sistema di riferimento cartesiano ortogo...,Below is an instruction that describes a task....
3,Generate a physics exercise in the style of Po...,,Un corpo viene lanciato orizzontalmente da una...,Below is an instruction that describes a task....
4,Generate a physics exercise in the style of Po...,,Un inventore afferma di poter costruire una ma...,Below is an instruction that describes a task....
...,...,...,...,...
449,Generate a physics excercise in the style of P...,,"Una tazza di latte, di massa 250 g, viene risc...",Below is an instruction that describes a task....
450,Generate a physics excercise in the style of P...,,Due sorgenti di onde luminose emettono luce di...,Below is an instruction that describes a task....
451,Generate a physics exercise in the style of Po...,,Un raggio di luce passa attraverso un prisma d...,Below is an instruction that describes a task....
452,Generate a physics exercise in the style of Po...,,Un recipiente a pareti rigide contiene argon i...,Below is an instruction that describes a task....


In [10]:
#Split between train and test datasets
train_len = int(len(df)*0.7)
df_train = df.iloc[:train_len]
df_test = df.iloc[train_len:-1]

In [11]:
#Turn the df into a Dataset object
data_train = Dataset.from_pandas(df_train)
data_test = Dataset.from_pandas(df_test)

In [12]:
#Defnition of the tokenize function
def tokenize(element):
    return {'input_ids':tokenizer.encode(element['text']).ids}

In [23]:
#Load tokenizer
bloom_path = "bigscience/bloomz-560m"
tokenizer = Tokenizer.from_pretrained(bloom_path) 
#Add a post-processor to add the BOS and EOS tokens
tokenizer.post_processor = TemplateProcessing(single="<s> $0 </s>", special_tokens=[("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>"))])

In [27]:
tokenizer.enable_padding()
tokenizer.padding

{'length': None,
 'pad_to_multiple_of': None,
 'pad_id': 0,
 'pad_token': '[PAD]',
 'pad_type_id': 0,
 'direction': 'right'}

In [15]:
#Preprocess the dataset with BLOOM's tokenizer
X_train = data_train.map(tokenize, remove_columns=data_train.column_names)
X_test = data_test.map(tokenize, remove_columns=data_test.column_names)
X_train, X_test

Map: 100%|███████████████████████████████████████████████████████████████████| 317/317 [00:00<00:00, 2734.20 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████| 136/136 [00:00<00:00, 2427.53 examples/s]


(Dataset({
     features: ['input_ids'],
     num_rows: 317
 }),
 Dataset({
     features: ['input_ids'],
     num_rows: 136
 }))

In [17]:
#Load model
model = BloomForCausalLM.from_pretrained(bloom_path).to(device)

In [36]:
#Setup training arguments and load datacollator
#We have to load the tokenizer as a diferent object because the one created before does not has the pad function
bloom_tokenizer = BloomTokenizerFast.from_pretrained(bloom_path)
data_collator = DataCollatorForLanguageModeling(bloom_tokenizer, mlm=False)

args = TrainingArguments(
    "output",
    fp16=False,
    gradient_accumulation_steps= 1,
    per_device_train_batch_size = 2,
    learning_rate = 2e-5,
    num_train_epochs=2,
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    tokenizer=bloom_tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset = X_train,
    eval_dataset = X_test
)

In [37]:
trainer.train()

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,2.8358
20,1.9335
30,1.7293
40,1.4446
50,1.482
60,1.4802
70,1.1056
80,1.0631
90,1.1916
100,1.0725


TrainOutput(global_step=318, training_loss=0.8627694845199585, metrics={'train_runtime': 869.1559, 'train_samples_per_second': 0.729, 'train_steps_per_second': 0.366, 'total_flos': 195432962482176.0, 'train_loss': 0.8627694845199585, 'epoch': 2.0})

In [38]:
#Save model manually to output folder
trainer.save_model("output")

In [40]:
#Load trained model
modelo = BloomForCausalLM.from_pretrained('output')
modelo

BloomForCausalLM(
  (transformer): BloomModel(
    (word_embeddings): Embedding(250880, 1024)
    (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0-23): 24 x BloomBlock(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (

In [41]:
#Example inference
prompt = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Generate a physics excercise in the style of Polimi exams. The problem should be exclusively about kinematics. ### Response:'
input = torch.tensor([tokenizer.encode(prompt).ids])
output = modelo.generate(input, max_new_tokens=200)
tokenizer.decode(output[0].detach().numpy())

'Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Generate a physics excercise in the style of Polimi exams. The problem should be exclusively about kinematics. ### Response:Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Response:Un corpo viene lanciato verticalmente verso l’alto con una velocità iniziale di 20 m/s. Qual è l’altezza massima raggiunta dal corpo? A) 30 m B) 40 m C) 50 m D) 60 m E) 80 m'