In [3]:
import transformers as tr
import torch as ts
# use own pytorch training. matching saved with noted model weights is probably easier than. also try using own data set

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tokenizer = tr.AutoTokenizer.from_pretrained("bert-base-cased")

In [5]:
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
print(encoded_input)

{'input_ids': tensor([[  101,  1252,  1184,  1164,  1248,  6462,   136,   102,     0,     0,
             0,     0,     0,     0,     0],
        [  101,  1790,   112,   189,  1341,  1119,  3520,  1164,  1248,  6462,
           117, 21902,  1643,   119,   102],
        [  101,  1327,  1164,  5450, 23434,   136,   102,     0,     0,     0,
             0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [6]:
tokenizer.decode(encoded_input["input_ids"][0])

'[CLS] But what about second breakfast? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [7]:
from datasets import load_dataset

dataset = load_dataset("yelp_review_full")
dataset["train"][100]

Reusing dataset yelp_review_full (C:\Users\Ameno\.cache\huggingface\datasets\yelp_review_full\yelp_review_full\1.0.0\e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)
100%|██████████| 2/2 [00:00<00:00,  8.10it/s]


{'label': 0,
 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. 

In [8]:
tokenized_datasets = dataset.map(lambda x: tokenizer(x['text'], padding="max_length", truncation=True), batched=True)

Loading cached processed dataset at C:\Users\Ameno\.cache\huggingface\datasets\yelp_review_full\yelp_review_full\1.0.0\e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf\cache-1212de8c9c92504b.arrow
Loading cached processed dataset at C:\Users\Ameno\.cache\huggingface\datasets\yelp_review_full\yelp_review_full\1.0.0\e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf\cache-7a0846779351e1d6.arrow


In [9]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(4))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(4))

Loading cached shuffled indices for dataset at C:\Users\Ameno\.cache\huggingface\datasets\yelp_review_full\yelp_review_full\1.0.0\e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf\cache-385f4f630def7870.arrow
Loading cached shuffled indices for dataset at C:\Users\Ameno\.cache\huggingface\datasets\yelp_review_full\yelp_review_full\1.0.0\e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf\cache-5550b17c557af15e.arrow


In [10]:
small_train_dataset

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4
})

In [11]:
from transformers import AutoModelForSequenceClassification

# Fetches the pre- trained model of BERT, removes the head and replaces it with a classification head with specified output nodes 
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

# Freeze base model parameters
for param in model.base_model.parameters():
    param.requires_grad = False

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [17]:
from transformers import TrainingArguments, Trainer


# DO THIS MANUALLY WITH PYTORCH AFTERWARDS 
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", per_device_train_batch_size=2, per_device_eval_batch_size=2)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [13]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

In [14]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    print("Logits,labels:", logits, labels)
    print("Length Logits,labels:", len(logits), len(labels))
    predictions = np.argmax(logits, axis=1)
    print("Predictions:", predictions)
    return metric.compute(predictions=predictions, references=labels)

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset, 
    compute_metrics=compute_metrics
)

trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 6
 33%|███▎      | 2/6 [00:08<00:16,  4.16s/it]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4
  Batch size = 8

 33%|███▎      | 2/6 [00:14<00:16,  4.16s/it]

Logits,labels: [[-0.538383    0.5337215   0.5731572  -0.11133076 -0.12448423]
 [-0.5120326   0.57388365  0.47389227 -0.11375438 -0.11103021]
 [-0.58758616  0.5968258   0.5394944  -0.01521367 -0.06999421]
 [-0.54773515  0.5843127   0.5431256   0.02034825 -0.19257933]] [2 4 1 4]
Length Logits,labels: 4 4
Predictions: [2 1 1 1]
{'eval_loss': 1.5561915636062622, 'eval_accuracy': 0.5, 'eval_runtime': 6.2501, 'eval_samples_per_second': 0.64, 'eval_steps_per_second': 0.16, 'epoch': 1.0}


 67%|██████▋   | 4/6 [00:21<00:10,  5.32s/it]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4
  Batch size = 8

 67%|██████▋   | 4/6 [00:26<00:10,  5.32s/it]

Logits,labels: [[-0.5214137   0.5049325   0.5695592  -0.14008394 -0.10587934]
 [-0.49494272  0.5447287   0.47052875 -0.14284314 -0.09227729]
 [-0.56935954  0.566539    0.53444016 -0.04554288 -0.0502921 ]
 [-0.5297557   0.55436563  0.538118   -0.00963444 -0.1729612 ]] [2 4 1 4]
Length Logits,labels: 4 4
Predictions: [2 1 1 1]
{'eval_loss': 1.5449827909469604, 'eval_accuracy': 0.5, 'eval_runtime': 5.292, 'eval_samples_per_second': 0.756, 'eval_steps_per_second': 0.189, 'epoch': 2.0}


100%|██████████| 6/6 [00:33<00:00,  5.54s/it]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4
  Batch size = 8
                                             
100%|██████████| 6/6 [00:39<00:00,  5.54s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 6/6 [00:39<00:00,  6.64s/it]

Logits,labels: [[-0.51801574  0.49247348  0.5720341  -0.15254135 -0.0962552 ]
 [-0.49158627  0.5320278   0.47309422 -0.15553589 -0.08244899]
 [-0.5655383   0.55357647  0.53641474 -0.05851236 -0.0402116 ]
 [-0.5259829   0.5415562   0.54015946 -0.02245489 -0.16300747]] [2 4 1 4]
Length Logits,labels: 4 4
Predictions: [2 1 1 1]
{'eval_loss': 1.539475679397583, 'eval_accuracy': 0.5, 'eval_runtime': 6.031, 'eval_samples_per_second': 0.663, 'eval_steps_per_second': 0.166, 'epoch': 3.0}
{'train_runtime': 39.8541, 'train_samples_per_second': 0.301, 'train_steps_per_second': 0.151, 'train_loss': 1.8410495122273762, 'epoch': 3.0}





TrainOutput(global_step=6, training_loss=1.8410495122273762, metrics={'train_runtime': 39.8541, 'train_samples_per_second': 0.301, 'train_steps_per_second': 0.151, 'train_loss': 1.8410495122273762, 'epoch': 3.0})

In [None]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 24
  Batch size = 8





[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A

Logits,labels: [[ 0.18258709  0.6449136   0.02170461 -0.05187801  0.04040352]
 [ 0.1297358   0.52309656  0.13992578  0.01721375  0.04056168]
 [ 0.12446832  0.3988679   0.29450804 -0.11364052  0.19103867]
 [ 0.11484936  0.43675023  0.18905312 -0.10919274  0.20120177]
 [ 0.0477355   0.4001835   0.16998968 -0.07382765  0.20749728]
 [ 0.16096789  0.5558137   0.16383743 -0.02628025  0.16251692]
 [ 0.10272156  0.6948503   0.02308258  0.05988012 -0.01831058]
 [-0.00119486  0.49840537  0.10345562 -0.08012985  0.19469029]
 [ 0.18575524  0.51793563  0.19671038 -0.08756423  0.17177215]
 [ 0.14862874  0.4860131   0.15348169 -0.05234826  0.1371122 ]
 [ 0.16855292  0.5516685   0.1913124  -0.05412991  0.10209301]
 [ 0.06834032  0.4326025   0.26622927 -0.17369732  0.2609436 ]
 [ 0.11839488  0.41477114  0.26771176 -0.17259559  0.25046068]
 [ 0.10908683  0.47811532  0.20536703 -0.128439    0.20818639]
 [ 0.18329838  0.42602223  0.15838951 -0.12605736  0.18591505]
 [ 0.0658401   0.4718269   0.20397963 -0

100%|██████████| 3/3 [00:34<00:00, 11.57s/it]


{'eval_loss': 1.650819182395935,
 'eval_accuracy': 0.20833333333333334,
 'eval_runtime': 51.0834,
 'eval_samples_per_second': 0.47,
 'eval_steps_per_second': 0.059,
 'epoch': 3.0}

In [None]:

ts.save(model.state_dict(), "../model/bert.pt")

In [None]:
class hugBert(ts.nn.Module):
  def __init__(self, num_classes):
    super(hugBert, self).__init__()
    # device = ts.device('cuda' if ts.cuda.is_available() else 'cpu')
    self.bert = tr.BertModel.from_pretrained('bert-base-cased', return_dict=True)#.to(device)
    self.classifier = ts.nn.Linear(768, num_classes)#, bias=False)
  
  # def forward(self, *args, **kwargs):
    
    
  def forward(self, input_ids, token_type_ids, attention_mask, labels):
    # outputs = self.bert(*args, **kwargs)
    print("im here")
    outputs = self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)

    # print("im herenow", outputs)
    pred = self.classifier(outputs[1])#outputs.pooler_output)
    print("pred:",pred)
    return pred

# device = ts.device('cuda' if ts.cuda.is_available() else 'cpu')
myModel = hugBert(5)#.to(device)


loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at C:\Users\Ameno/.cache\huggingface\transformers\a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "return_dict": false,
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file https://huggingface.co/bert-base-cased/resolve/main/

In [None]:
myModel.load_state_dict(ts.load("../model/bert.pt"))

<All keys matched successfully>

In [None]:
myModel.eval()
testing_args = TrainingArguments(output_dir="test_trainer1", per_device_eval_batch_size=2)#, evaluation_strategy="epoch")
evaluator = Trainer(
    model=myModel,
    args=testing_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset, 
    compute_metrics=compute_metrics,
)
evaluator.evaluate()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the evaluation set  don't have a corresponding argument in `hugBert.forward` and have been ignored: text. If text are not expected by `hugBert.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 24
  Batch size = 8


im here
pred: tensor([[ 0.1826,  0.6449,  0.0217, -0.0519,  0.0404],
        [ 0.1297,  0.5231,  0.1399,  0.0172,  0.0406],
        [ 0.1245,  0.3989,  0.2945, -0.1136,  0.1910],
        [ 0.1148,  0.4368,  0.1891, -0.1092,  0.2012],
        [ 0.0477,  0.4002,  0.1700, -0.0738,  0.2075],
        [ 0.1610,  0.5558,  0.1638, -0.0263,  0.1625],
        [ 0.1027,  0.6949,  0.0231,  0.0599, -0.0183],
        [-0.0012,  0.4984,  0.1035, -0.0801,  0.1947]])






[A[A[A[A

im here






[A[A[A[A

pred: tensor([[ 0.1858,  0.5179,  0.1967, -0.0876,  0.1718],
        [ 0.1486,  0.4860,  0.1535, -0.0523,  0.1371],
        [ 0.1686,  0.5517,  0.1913, -0.0541,  0.1021],
        [ 0.0683,  0.4326,  0.2662, -0.1737,  0.2609],
        [ 0.1184,  0.4148,  0.2677, -0.1726,  0.2505],
        [ 0.1091,  0.4781,  0.2054, -0.1284,  0.2082],
        [ 0.1833,  0.4260,  0.1584, -0.1261,  0.1859],
        [ 0.0658,  0.4718,  0.2040, -0.0485,  0.2178]])
im here






[A[A[A[A

pred: tensor([[ 0.1459,  0.5513,  0.1596, -0.0636,  0.1044],
        [ 0.2238,  0.5377,  0.1196, -0.0678,  0.1676],
        [ 0.1532,  0.5909,  0.1280, -0.0536,  0.0773],
        [ 0.1729,  0.5308,  0.1734, -0.0843,  0.1461],
        [ 0.1579,  0.4627,  0.2737, -0.2181,  0.3360],
        [ 0.1801,  0.4877,  0.2296, -0.1265,  0.1675],
        [ 0.1012,  0.5931,  0.0573,  0.0957,  0.0299],
        [ 0.2419,  0.4604,  0.1317, -0.1441,  0.1572]])
Logits,labels: [[ 0.1297358   0.52309656  0.13992578  0.01721375  0.04056168]
 [ 0.12446832  0.3988679   0.29450804 -0.11364052  0.19103867]
 [ 0.11484936  0.43675023  0.18905312 -0.10919274  0.20120177]
 [ 0.0477355   0.4001835   0.16998968 -0.07382765  0.20749728]
 [ 0.16096789  0.5558137   0.16383743 -0.02628025  0.16251692]
 [ 0.10272156  0.6948503   0.02308258  0.05988012 -0.01831058]
 [-0.00119486  0.49840537  0.10345562 -0.08012985  0.19469029]
 [ 0.14862874  0.4860131   0.15348169 -0.05234826  0.1371122 ]
 [ 0.16855292  0.5516685   0.19131

ValueError: Mismatch in the number of predictions (21) and references (24)