In [2]:
import transformers as tr

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the tokenizer that belongs to the cased BERT-base checkpoint.
TOKENIZER_CHECKPOINT = "bert-base-cased"
tokenizer = tr.AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 29.0kB/s]
Downloading: 100%|██████████| 570/570 [00:00<00:00, 590kB/s]
Downloading: 100%|██████████| 208k/208k [00:00<00:00, 672kB/s]  
Downloading: 100%|██████████| 426k/426k [00:00<00:00, 1.02MB/s] 


In [9]:
# Three sentences of different lengths, so the padding is visible in the output.
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]

# Padding and truncation are both enabled and PyTorch tensors are requested;
# in the printed result the shorter rows are padded to the longest row.
encoding_options = dict(padding=True, truncation=True, return_tensors="pt")
encoded_input = tokenizer(batch_sentences, **encoding_options)
print(encoded_input)

{'input_ids': tensor([[  101,  1252,  1184,  1164,  1248,  6462,   136,   102,     0,     0,
             0,     0,     0,     0,     0],
        [  101,  1790,   112,   189,  1341,  1119,  3520,  1164,  1248,  6462,
           117, 21902,  1643,   119,   102],
        [  101,  1327,  1164,  5450, 23434,   136,   102,     0,     0,     0,
             0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [10]:
# Turn the first encoded row back into text so the special tokens
# ([CLS], [SEP], [PAD]) added by the tokenizer become visible.
first_row_ids = encoded_input["input_ids"][0]
tokenizer.decode(first_row_ids)

'[CLS] But what about second breakfast? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [12]:
from datasets import load_dataset

# Fetch the Yelp review dataset (later calls reuse the local cache, per the
# log below) and show one training record as a sanity check.
dataset = load_dataset("yelp_review_full")
train_split = dataset["train"]
train_split[100]

Downloading builder script: 4.41kB [00:00, 2.21MB/s]                   
Downloading metadata: 2.04kB [00:00, 1.02MB/s]                 


Downloading and preparing dataset yelp_review_full/yelp_review_full (download: 187.06 MiB, generated: 496.94 MiB, post-processed: Unknown size, total: 684.00 MiB) to C:\Users\Ameno\.cache\huggingface\datasets\yelp_review_full\yelp_review_full\1.0.0\e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf...


Downloading data: 100%|██████████| 196M/196M [01:30<00:00, 2.16MB/s] 
                                                                                         

Dataset yelp_review_full downloaded and prepared to C:\Users\Ameno\.cache\huggingface\datasets\yelp_review_full\yelp_review_full\1.0.0\e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 13.61it/s]


{'label': 0,
 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. 

In [13]:
def _tokenize_batch(batch):
    """Tokenize the ``text`` column of one batch with max-length padding and truncation."""
    return tokenizer(batch['text'], padding="max_length", truncation=True)


# batched=True hands the helper whole batches of rows instead of single rows.
tokenized_datasets = dataset.map(_tokenize_batch, batched=True)

100%|██████████| 650/650 [07:15<00:00,  1.49ba/s]
100%|██████████| 50/50 [00:28<00:00,  1.78ba/s]


In [25]:
# Work on a tiny, reproducibly shuffled slice of each split so the demo
# training run finishes quickly.
SHUFFLE_SEED = 42
SUBSET_ROWS = range(10)

small_train_dataset = tokenized_datasets["train"].shuffle(seed=SHUFFLE_SEED).select(SUBSET_ROWS)
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=SHUFFLE_SEED).select(SUBSET_ROWS)

Loading cached shuffled indices for dataset at C:\Users\Ameno\.cache\huggingface\datasets\yelp_review_full\yelp_review_full\1.0.0\e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf\cache-385f4f630def7870.arrow
Loading cached shuffled indices for dataset at C:\Users\Ameno\.cache\huggingface\datasets\yelp_review_full\yelp_review_full\1.0.0\e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf\cache-ea5369b944bbd613.arrow


In [26]:
# Display the subset's summary (feature columns and row count) as the cell output.
small_train_dataset

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10
})

In [17]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained bert-base-cased weights into a sequence-classification
# model with 5 output labels. The pre-training head weights stored in the
# checkpoint are not used by this architecture (see the warning printed below).
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=5,
)

Downloading: 100%|██████████| 416M/416M [02:18<00:00, 3.15MB/s] 
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceC

In [22]:
from transformers import TrainingArguments, Trainer


# TODO: reproduce this training setup by hand in plain PyTorch afterwards.
# Checkpoints/logs go to "test_trainer"; evaluation is scheduled once per epoch.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
)

In [19]:
import numpy as np
from datasets import load_metric

# Accuracy metric object; compute_metrics() calls its .compute() method.
# NOTE(review): load_metric appears to be deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version before upgrading.
metric = load_metric("accuracy")

Downloading builder script: 3.19kB [00:00, 3.27MB/s]                   


In [20]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``metric`` object.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class indices
        (arg-max over the last axis of the logits) against the labels.
    """
    scores, references = eval_pred
    # The highest-scoring class along the last axis is the prediction.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

In [27]:
# Bundle the model, the training arguments, both small data subsets and the
# metric callback into a single Trainer instance.
trainer = Trainer(
    model,
    training_args,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

In [28]:
# Fine-tune on the small training subset. The log below shows an evaluation
# pass after each epoch; the returned TrainOutput is displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 10
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6

[A
[A
[A***** Running Evaluation *****
  Num examples = 10
  Batch size = 8


[A[A

                                                     
[A                                             

[A[A                                       


 33%|███▎      | 125/375 [4:12:45<2:20:33, 33.73s/it]
[A

[A[A

[A[A

{'eval_loss': 1.4118900299072266, 'eval_accuracy': 0.5, 'eval_runtime': 19.055, 'eval_samples_per_second': 0.525, 'eval_steps_per_second': 0.105, 'epoch': 1.0}



[A
[A***** Running Evaluation *****
  Num examples = 10
  Batch size = 8


[A[A

                                                     
[A                                               

[A[A                                       


 33%|███▎      | 125/375 [4:13:50<2:20:33, 33.73s/it]
[A

[A[A

[A[A

{'eval_loss': 1.3822016716003418, 'eval_accuracy': 0.4, 'eval_runtime': 20.385, 'eval_samples_per_second': 0.491, 'eval_steps_per_second': 0.098, 'epoch': 2.0}



[A
[A***** Running Evaluation *****
  Num examples = 10
  Batch size = 8


[A[A

                                                     
[A                                               

[A[A                                       


 33%|███▎      | 125/375 [4:14:53<2:20:33, 33.73s/it]
[A

[A[A

[A[A

Training completed. Do not forget to share your model on huggingface.co/models =)


                                                     
[A                                               

 33%|███▎      | 125/375 [4:14:53<2:20:33, 33.73s/it]
100%|██████████| 6/6 [03:19<00:00, 33.20s/it]

{'eval_loss': 1.365817666053772, 'eval_accuracy': 0.2, 'eval_runtime': 18.452, 'eval_samples_per_second': 0.542, 'eval_steps_per_second': 0.108, 'epoch': 3.0}
{'train_runtime': 199.384, 'train_samples_per_second': 0.15, 'train_steps_per_second': 0.03, 'train_loss': 1.199616511662801, 'epoch': 3.0}





TrainOutput(global_step=6, training_loss=1.199616511662801, metrics={'train_runtime': 199.384, 'train_samples_per_second': 0.15, 'train_steps_per_second': 0.03, 'train_loss': 1.199616511662801, 'epoch': 3.0})

In [None]:
import torch as ts

class hugBert(ts.nn.Module):
  """BERT encoder followed by one bias-free linear classification layer.

  Hand-written PyTorch counterpart of the sequence-classification model that
  is fine-tuned with ``Trainer`` elsewhere in this notebook.
  """

  def __init__(self, num_classes, pretrained_name='bert-base-cased'):
    """Build the encoder and the classification layer.

    Args:
      num_classes: Number of output units of the classification layer.
      pretrained_name: Checkpoint passed to ``BertModel.from_pretrained``.
        Defaults to ``'bert-base-cased'`` so the encoder's vocabulary matches
        the ``bert-base-cased`` tokenizer this notebook uses to produce the
        input ids; ids from one vocabulary are meaningless to a checkpoint
        trained with another.
    """
    super().__init__()
    # No device placement here: moving only the encoder would leave `fc` on
    # the CPU. The caller moves the whole module at once with `.to(device)`.
    self.bert = tr.BertModel.from_pretrained(pretrained_name, return_dict=True)
    # Take the input width from the loaded checkpoint instead of hard-coding
    # 768, so other BERT sizes work as well.
    self.fc = ts.nn.Linear(self.bert.config.hidden_size, num_classes, bias=False)

  def forward(self, x_input_ids, x_type_ids, attn_mask):
    """Run the encoder and classify its pooled output.

    Args:
      x_input_ids: Token ids, forwarded to BERT as its first argument.
      x_type_ids: Forwarded to BERT as ``token_type_ids``.
      attn_mask: Forwarded to BERT as ``attention_mask``.
      (All three are presumably (batch, seq_len) integer tensors as produced
      by the tokenizer — confirm against the caller.)

    Returns:
      The output of the linear layer applied to BERT's ``pooler_output``;
      no softmax or other normalisation is applied.
    """
    outputs = self.bert(x_input_ids, token_type_ids=x_type_ids, attention_mask=attn_mask)
    pred = self.fc(outputs.pooler_output)
    return pred

# Prefer the GPU when torch can see one; otherwise stay on the CPU.
device = ts.device('cuda') if ts.cuda.is_available() else ts.device('cpu')

# Build the 5-class model, then move all of its parameters to that device.
model = hugBert(num_classes=5)
model = model.to(device)
