In [1]:
import os

from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoTokenizer,\
AutoModelForTokenClassification, AutoModelForMaskedLM, TrainingArguments, Trainer,AutoModelForSequenceClassification,DataCollatorWithPadding
import evaluate
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset,Dataset, DatasetDict


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#@title Load Dataset
hf_dataset = load_dataset("bgglue/bgglue","xnlibg")
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 392702
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 5010
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 2490
    })
})

In [3]:
label_list = hf_dataset["train"].features["label"].names
label_list, len(label_list)

(['entailment', 'neutral', 'contradiction'], 3)

In [4]:
# Load model
model_checkpoint = "mor40/BulBERT-chitanka-model"
model_raw = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mor40/BulBERT-chitanka-model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
hf_dataset["train"][0]

{'premise': 'концептуално крем краде има две основни измерения - продукт и география .',
 'hypothesis': 'продукт и география са това , което прави крем краде работа .',
 'label': 1}

0 - entailment
1 - neutral
2 - contradiction


In [6]:
def preprocess_function(examples):
        # Tokenize the texts
        return tokenizer(
            examples["premise"],
            examples["hypothesis"],
            padding=True,
            truncation=True,
            return_tensors='pt'
        )
eval_dataset = hf_dataset["validation"].map(
                preprocess_function,
                batched=True,
                desc="Running tokenizer on validation dataset",
                remove_columns = ["premise", "hypothesis"]
)
test_dataset = hf_dataset["test"].map(
                preprocess_function,
                batched=True,
                desc="Running tokenizer on test dataset",
                remove_columns = ["premise", "hypothesis"]
)

Running tokenizer on validation dataset: 100%|██████████| 2490/2490 [00:00<00:00, 3102.74 examples/s]


In [7]:
eval_dataset = eval_dataset.rename_column("label", "labels")
test_dataset = test_dataset.rename_column("label", "labels")

In [8]:
model = AutoModelForSequenceClassification.from_pretrained("mor40/BulBERT-xnli-2epochs")

In [9]:
example = eval_dataset[0]

In [10]:
with torch.no_grad():
    logits = model(**example).logits[0]
logits

AttributeError: 'list' object has no attribute 'size'

In [34]:
example_pt

NameError: name 'example_pt' is not defined

In [11]:
id2label ={
      0 : "entailment",
      1 :'neutral',
      2 : 'contradiction'
}
def get_predictions(example, model):
      inputs = tokenizer( example["premise"],example["hypothesis"],padding=True,truncation=True,return_tensors='pt')
      # Run through model
      with torch.no_grad():
            logits = model(**inputs).logits[0]

      label_id = np.argmax(logits).item()
      label_as_word = id2label[label_id]
      example["predicted_label_id"] = label_id
      example["predicted_label"] = label_as_word
      return example
dataset_validation = hf_dataset["validation"].map(get_predictions,fn_kwargs={"model": model})
dataset_test= hf_dataset["test"].map(get_predictions,fn_kwargs={"model": model})

Map: 100%|██████████| 2490/2490 [05:18<00:00,  7.81 examples/s]
Map: 100%|██████████| 5010/5010 [08:56<00:00,  9.33 examples/s]


In [12]:
dataset_validation

Dataset({
    features: ['premise', 'hypothesis', 'label', 'predicted_label_id', 'predicted_label'],
    num_rows: 2490
})

In [13]:
correct = 0
for i, example in enumerate(dataset_validation):
  if example["predicted_label_id"] == example["label"]: correct+=1

print("Accuracy: " ,correct / len(dataset_validation))

Accuracy:  0.7016064257028113


In [14]:
predictions = []

for i, example in enumerate(dataset_test):
  rec = {}
  rec['index'] = i
  rec['label'] = example["predicted_label"]
  predictions.append(rec)

In [None]:
predictions

In [17]:
import jsonlines

with jsonlines.open('predictions_xnli_2epochs.jsonl', 'w') as writer:
    writer.write_all(predictions)