In [8]:
from datasets import load_dataset
from transformers import AutoTokenizer, Trainer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
import evaluate

In [45]:
dataset = load_dataset("yelp_review_full")
print(type(dataset))
print(len(dataset))
print(dataset.keys())
example = dataset["train"][66]
print(type(example))
example

Found cached dataset yelp_review_full (/root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)


  0%|          | 0/2 [00:00<?, ?it/s]

<class 'datasets.dataset_dict.DatasetDict'>
2
dict_keys(['train', 'test'])
<class 'dict'>


{'label': 3,
 'text': 'I belong to this gym...  I live in the South section of Pittsburgh, and I find that this gym is not too far from me.  The staff is friendly, the equipment is quite good.  You get two free personal training sessions when you join.  They have lots of weights (which my boyfriend uses) and a decent cardio room.  The only thing I would say is to increase some of the cardio equipment.  Water is only $1 a bottle!'}

In [10]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))

Loading cached processed dataset at /root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-aad1af4c7095bfa1.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-29f27748f0b54d01.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-11a7619c6a3c070f.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-3c5c2a245be1b332.arrow


In [11]:
# The warning "Some weights of the model checkpoint at bert-base-cased were not used when initializing" is normal,
# because the pretrained head of the BERT model is discarded
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

# Freezing the last layer
# https://github.com/huggingface/transformers/issues/400
for name, param in model.named_parameters():
    print(name)
    if 'classifier' not in name: # classifier layer
        param.requires_grad = False


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.weight
bert.enc

In [12]:
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [13]:
# https://huggingface.co/docs/transformers/v4.28.1/en/main_classes/trainer#transformers.TrainingArguments
training_args = TrainingArguments(
    output_dir="../assets/22_training_head", 
    evaluation_strategy="epoch",

    eval_steps=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    seed=0,
    
    
    save_strategy='steps',
    save_steps=1,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
    

)
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.623453,0.21


TrainOutput(global_step=100, training_loss=1.6406195068359375, metrics={'train_runtime': 146.724, 'train_samples_per_second': 0.682, 'train_steps_per_second': 0.682, 'total_flos': 26311814246400.0, 'train_loss': 1.6406195068359375, 'epoch': 1.0})

# Inference

In [17]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("../assets/22_training_head/checkpoint-100")

In [43]:
batch = tokenizer(
    ['I belong to this gym...  I live in the South section of Pittsburgh'],
    padding = True,
    truncation = True,
    max_length = 512,
    return_tensors = 'pt'
)

In [44]:
import torch
import torch.nn.functional as F
with torch.no_grad():
    outputs = model(**batch)
    predictions = F.softmax(outputs.logits, dim=1)
    labels = torch.argmax(predictions, dim=1)
labels

tensor([0])