In [None]:
!pip install transformers
!pip install transformers[torch]
!pip install accelerate -U

!pip install evaluate

In [None]:
from transformers import Trainer, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM
from transformers import DataCollatorWithPadding, DataCollatorForLanguageModeling, TrainingArguments
from datasets import load_dataset
from torch.nn import Softmax
import pandas as pd
import numpy as np
import evaluate
import torch

In [3]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Tokenizer

*   The Tokenizer is a callable
*   It outputs a dictionary-like, which has the format expected by AutoModel







In [None]:
s1 = "This school is pretty difficult"
s2 = "PoliTo requires a lot of studying"

In [None]:
#Notice the return_tensor that has to match pytorch library "pt"
print( "Type:\t" , type(tokenizer(s1,return_tensors="pt")) )
print( tokenizer(s1,return_tensors="pt") )
print("")
print( "string:\t\t",s1.split(" ") )
print( "tokens: \t" , [ tokenizer.convert_ids_to_tokens( x ) for x in tokenizer(s1,return_tensors="pt")["input_ids"] ])
print("")
print( "input_ids:\t",tokenizer(s1,return_tensors="pt")["input_ids"] )
print( "token_type_ids:\t",tokenizer(s1,return_tensors="pt")["token_type_ids"] )
print( "attention_mask:\t",tokenizer(s1,return_tensors="pt")["attention_mask"] )

Type:	 <class 'transformers.tokenization_utils_base.BatchEncoding'>
{'input_ids': tensor([[ 101, 1188, 1278, 1110, 2785, 2846,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

string:		 ['This', 'school', 'is', 'pretty', 'difficult']
tokens: 	 [['[CLS]', 'This', 'school', 'is', 'pretty', 'difficult', '[SEP]']]

input_ids:	 tensor([[ 101, 1188, 1278, 1110, 2785, 2846,  102]])
token_type_ids:	 tensor([[0, 0, 0, 0, 0, 0, 0]])
attention_mask:	 tensor([[1, 1, 1, 1, 1, 1, 1]])


For sequences of two sentences it produce a single input for the model



In [None]:
#Notice the 'token_type_ids' that tells you if the token belongs to first or second sentence
print( tokenizer(s1,s2,return_tensors="pt") )
print("")
print( "string 1:\t",s1.split(" ") )
print( "string 2:\t",s2.split(" ") )
print( "tokens: \t" , [ tokenizer.convert_ids_to_tokens( x ) for x in tokenizer(s1,s2,return_tensors="pt") ["input_ids"] ])
print("")
print( "input_ids:\t",tokenizer(s1,s2,return_tensors="pt") ["input_ids"] )
print( "token_type_ids:\t",tokenizer(s1,s2,return_tensors="pt") ["token_type_ids"] )
print( "attention_mask:\t",tokenizer(s1,s2,return_tensors="pt") ["attention_mask"] )

{'input_ids': tensor([[  101,  1188,  1278,  1110,  2785,  2846,   102, 17129,  1182,  1942,
          1186,  5315,   170,  1974,  1104,  5076,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

string 1:	 ['This', 'school', 'is', 'pretty', 'difficult']
string 2:	 ['PoliTo', 'requires', 'a', 'lot', 'of', 'studying']
tokens: 	 [['[CLS]', 'This', 'school', 'is', 'pretty', 'difficult', '[SEP]', 'Pol', '##i', '##T', '##o', 'requires', 'a', 'lot', 'of', 'studying', '[SEP]']]

input_ids:	 tensor([[  101,  1188,  1278,  1110,  2785,  2846,   102, 17129,  1182,  1942,
          1186,  5315,   170,  1974,  1104,  5076,   102]])
token_type_ids:	 tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
attention_mask:	 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


# Model

In [None]:
s1 = "This school is pretty difficult"
s2 = "PoliTo requires a lot of studying"

*   Generate a suitable input for the model
*   Through "**" called unpacking procedure pass the arguments to the model

In [None]:
input = tokenizer(s1,s2, return_tensors="pt")
output = model(**input)

The output of the model has the logits of the classification process relative to each label

In [None]:
print( "Type:\t" , type(output) )
print( output )
print("")
print("Logits:\t" , output.logits)

Type:	 <class 'transformers.modeling_outputs.SequenceClassifierOutput'>
SequenceClassifierOutput(loss=None, logits=tensor([[0.5498, 0.1653]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

Logits:	 tensor([[0.5498, 0.1653]], grad_fn=<AddmmBackward0>)


We have to transform them to probabilities using softmax

In [None]:
softmax = Softmax(dim=1)
probabilities = softmax(output.logits)
print("Logits:\t\t" , output.logits)
print("Probabilities:\t" , probabilities)

Logits:		 tensor([[0.5498, 0.1653]], grad_fn=<AddmmBackward0>)
Probabilities:	 tensor([[0.5950, 0.4050]], grad_fn=<SoftmaxBackward0>)


The logits and probabilities tell us that the second label is the one predicted by the model

In [None]:
predicted_label = probabilities.argmax()
print("Label:\t" , predicted_label)

Label:	 tensor(0)


In [None]:
model.config.id2label

{0: 'LABEL_0', 1: 'LABEL_1'}

In [None]:
model.config.id2label[int(predicted_label)]

'LABEL_0'

# Dataset

In [None]:
import pandas as pd

df_val_agnostic = pd.read_json("/content/val.model-agnostic.json")
df_val_aware = pd.read_json("/content/val.model-aware.json")
df_trial = pd.read_json("/content/trial-v1.json")

In [None]:
df_trial = df_trial[["hyp","tgt","label"]]
df_val_aware = df_val_aware[["hyp","tgt","label"]]
df_val_agnostic = df_val_agnostic[["hyp","tgt","label"]]

In [None]:
df = pd.concat([df_trial , df_val_aware , df_val_agnostic])
df.head()

Unnamed: 0,hyp,tgt,label
0,"A district of Kowloon, China.",The Chaoshan region where the Teochew dialect ...,Hallucination
1,(Wicca) A witchdoctor.,(South Africa) A traditional tribal diviner or...,Hallucination
2,(nautical) A halyard.,(nautical) A staff that carries a flag or penn...,Not Hallucination
3,The quality of being rational.,(archaic) The quality or state of being rational.,Not Hallucination
4,(uncountable) The study of trees.,The worship of trees.,Hallucination


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1080 entries, 0 to 498
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   hyp     1080 non-null   object
 1   tgt     1080 non-null   object
 2   label   1080 non-null   object
dtypes: object(3)
memory usage: 33.8+ KB


In [None]:
df["x"] = df.apply( lambda x : [ x["hyp"] , x["tgt"] ] , axis = 1)
df.head()

Unnamed: 0,hyp,tgt,label,x
0,"A district of Kowloon, China.",The Chaoshan region where the Teochew dialect ...,Hallucination,"[A district of Kowloon, China., The Chaoshan r..."
1,(Wicca) A witchdoctor.,(South Africa) A traditional tribal diviner or...,Hallucination,"[(Wicca) A witchdoctor., (South Africa) A trad..."
2,(nautical) A halyard.,(nautical) A staff that carries a flag or penn...,Not Hallucination,"[(nautical) A halyard., (nautical) A staff tha..."
3,The quality of being rational.,(archaic) The quality or state of being rational.,Not Hallucination,"[The quality of being rational., (archaic) The..."
4,(uncountable) The study of trees.,The worship of trees.,Hallucination,"[(uncountable) The study of trees., The worshi..."


# Dataset HugginFace

In [16]:
from datasets import load_dataset

ds = load_dataset("json", data_files=["/content/val.model-aware.json" , "/content/val.model-agnostic.json" , "/content/trial-v1.json"])
ds = ds['train'].train_test_split(train_size=0.8)
ds = ds.select_columns(['tgt', 'hyp', 'label'])
ds = ds.with_format("torch")
ds

DatasetDict({
    train: Dataset({
        features: ['tgt', 'hyp', 'label'],
        num_rows: 864
    })
    test: Dataset({
        features: ['tgt', 'hyp', 'label'],
        num_rows: 216
    })
})

In [21]:
ds = load_dataset("json", data_files=["/content/val.model-agnostic.json"])
ds2 = load_dataset("json", data_files=["/content/trial-v1.json"])
#ds = ds['train'].train_test_split(train_size=0.8)
ds['test'] = ds2['train']
ds = ds.select_columns(['tgt', 'hyp', 'label'])
ds

DatasetDict({
    train: Dataset({
        features: ['tgt', 'hyp', 'label'],
        num_rows: 499
    })
    test: Dataset({
        features: ['tgt', 'hyp', 'label'],
        num_rows: 80
    })
})

In [22]:
def preprocess(example , tokenizer):
  #single = [f"{hyp}; {tgt}" for hyp,tgt in zip(example["hyp"], example["tgt"])]
  #model_input = tokenizer( single , max_length = 1024, truncation = True )
  model_input = tokenizer(example["hyp"], example["tgt"] , max_length = 1024, truncation = True )
  model_input["label"] = [1 if t == "Hallucination" else 0 for t in example["label"]]
  return model_input

In [23]:
ds = ds.map( lambda x : preprocess(x , tokenizer) , batched = True)
ds = ds.remove_columns(["hyp" , "tgt"])
ds

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 499
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 80
    })
})

# Train Loop

In [24]:
!pip install accelerate -U



In [27]:
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [28]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

BATCH_SIZE = 100
NUM_EPOCHS = 25

training_args = TrainingArguments(
    output_dir="local_model",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=1,
    load_best_model_at_end=True,
)
optimizer = torch.optim.NAdam( model.parameters() )
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma = .95)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator = data_collator,
    tokenizer = tokenizer,
    train_dataset = ds["train"],
    eval_dataset = ds["test"],
    compute_metrics = compute_metrics,
    #optimizers = (optimizer , scheduler )
)

In [29]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6996,0.693765,0.475
2,0.7005,0.694834,0.475
3,0.6738,0.683603,0.5
4,0.6324,0.66125,0.575
5,0.5715,0.639967,0.625
6,0.5222,0.59338,0.6875
7,0.4626,0.533724,0.7375
8,0.427,0.51782,0.7625
9,0.3642,0.51576,0.75
10,0.3344,0.542593,0.7625


TrainOutput(global_step=125, training_loss=0.27329531228542325, metrics={'train_runtime': 318.5185, 'train_samples_per_second': 39.166, 'train_steps_per_second': 0.392, 'total_flos': 502798546238820.0, 'train_loss': 0.27329531228542325, 'epoch': 25.0})

# Bert model pre-train

In [32]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained( checkpoint )
model = AutoModelForMaskedLM.from_pretrained( checkpoint )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [46]:
ds = load_dataset("json", data_files=["/content/val.model-agnostic.json"])
ds = ds.select_columns(["hyp"])
ds

DatasetDict({
    train: Dataset({
        features: ['hyp'],
        num_rows: 499
    })
})

In [41]:
def preprocess_ML( example , tokenizer):
    return tokenizer( example["hyp"] , max_length = 1024, truncation = True)


In [47]:
ds = ds.map( lambda x : preprocess_ML(x , tokenizer)  , batched = True)
ds = ds.remove_columns(["hyp"])
ds = ds['train'].train_test_split(train_size=0.8)
ds

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 399
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
})

In [63]:
#tokenizer.pad_token = tokenizer.sep_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [64]:
training_args = TrainingArguments(
    output_dir="my_pretrain_bert_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=25,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    tokenizer = tokenizer,
    args=training_args,
    train_dataset= ds["train"],
    eval_dataset= ds["test"],
    data_collator=data_collator,
)

In [65]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.507431
2,No log,2.254484
3,No log,1.652493
4,No log,1.817558
5,No log,1.863397
6,No log,2.258225
7,No log,1.78153
8,No log,1.782266
9,No log,1.837048
10,1.683900,1.428883


TrainOutput(global_step=1250, training_loss=1.2983341186523438, metrics={'train_runtime': 138.1084, 'train_samples_per_second': 72.226, 'train_steps_per_second': 9.051, 'total_flos': 97002283098600.0, 'train_loss': 1.2983341186523438, 'epoch': 25.0})

In [2]:
pretrained_model = AutoModelForSequenceClassification.from_pretrained("/content/my_pretrain_bert_model/checkpoint-1000")
pretrained_tokenizer = AutoTokenizer.from_pretrained("/content/my_pretrain_bert_model/checkpoint-1000")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/my_pretrain_bert_model/checkpoint-1000 and are newly initialized: ['classifier.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
ds = load_dataset("json", data_files=["/content/val.model-agnostic.json"])
ds2 = load_dataset("json", data_files=["/content/trial-v1.json"])
#ds = ds['train'].train_test_split(train_size=0.8)
ds['test'] = ds2['train']
ds = ds.select_columns(['tgt', 'hyp', 'label'])
ds

DatasetDict({
    train: Dataset({
        features: ['tgt', 'hyp', 'label'],
        num_rows: 499
    })
    test: Dataset({
        features: ['tgt', 'hyp', 'label'],
        num_rows: 80
    })
})

In [6]:
def preprocess(example , tokenizer):
  #single = [f"{hyp}; {tgt}" for hyp,tgt in zip(example["hyp"], example["tgt"])]
  #model_input = tokenizer( single , max_length = 1024, truncation = True )
  model_input = tokenizer(example["hyp"], example["tgt"] , max_length = 1024, truncation = True )
  model_input["label"] = [1 if t == "Hallucination" else 0 for t in example["label"]]
  return model_input

In [7]:
ds = ds.map( lambda x : preprocess(x , pretrained_tokenizer) , batched = True)
ds = ds.remove_columns(["hyp" , "tgt"])
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 499
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 80
    })
})

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=pretrained_tokenizer)

In [9]:
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [13]:
BATCH_SIZE = 100
NUM_EPOCHS = 25

training_args = TrainingArguments(
    output_dir="pretrain_local_model",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=1,
    load_best_model_at_end=True,
)
#optimizer = torch.optim.NAdam( model.parameters() )
#scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma = .95)

trainer = Trainer(
    model=pretrained_model,
    args=training_args,
    data_collator = data_collator,
    tokenizer = pretrained_tokenizer,
    train_dataset = ds["train"],
    eval_dataset = ds["test"],
    compute_metrics = compute_metrics,
    #optimizers = (optimizer , scheduler )
)

In [14]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6532,0.662501,0.5875
2,0.6214,0.629212,0.625
3,0.5882,0.575773,0.675
4,0.5153,0.526305,0.725
5,0.4394,0.500969,0.7375
6,0.3501,0.498095,0.7
7,0.3489,0.476459,0.775
8,0.2862,0.506519,0.775
9,0.1785,0.540076,0.7625
10,0.1813,0.539197,0.7875


TrainOutput(global_step=125, training_loss=0.18821196886152028, metrics={'train_runtime': 301.9976, 'train_samples_per_second': 41.308, 'train_steps_per_second': 0.414, 'total_flos': 502798546238820.0, 'train_loss': 0.18821196886152028, 'epoch': 25.0})