In [None]:
#@title Install libraries
!pip install --quiet datasets transformers evaluate seqeval accelerate -U jsonlines imbalanced-learn

In [None]:
#@title Load Libraries
import os
import torch
import evaluate
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset,Dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorForTokenClassification, \
AutoModelForTokenClassification, AutoModelForMaskedLM, TrainingArguments, Trainer,AutoModelForSequenceClassification,DataCollatorWithPadding

In [None]:
#@title Load model
model_checkpoint = "mor40/BulBERT-chitanka-model"

# Tokenizer and model both come from the same pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Sequence-classification model configured for two labels
model_raw = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mor40/BulBERT-chitanka-model and are newly initialized: ['classifier.weight', 'classifier.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#@title Load Dataset
# "ct21t1" configuration of bgglue, without the tweet_id column
hf_dataset = (
    load_dataset("bgglue/bgglue", "ct21t1")
    .remove_columns(["tweet_id"])
)
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['id_str', 'topic_id', 'tweet_text', 'labels'],
        num_rows: 3000
    })
    validation: Dataset({
        features: ['id_str', 'topic_id', 'tweet_text', 'labels'],
        num_rows: 350
    })
    test: Dataset({
        features: ['id_str', 'topic_id', 'tweet_text', 'labels'],
        num_rows: 357
    })
})

## Train Model

In [None]:
# Switches hf_dataset to pandas output in place; a split must be reset
# (reset_format) before it is mapped with functions that expect plain rows.
hf_dataset.set_format(type="pandas")

# First five training rows
hf_dataset["train"][0:5]

Unnamed: 0,tweet_text,labels
0,Препоръките към държавите-членки в рамките на ...,0
1,"За тия, дет си мислят, че няма вирус https://t...",0
2,"Отменят се част от противоепидемичните мерки, ...",0
3,Oпазването на биоразнообразието 🦋🐅🐘🌴 остава пр...,0
4,Кратък обзор над мерките в сградостроителствот...,0


In [None]:
# Class distribution of the training split before oversampling
hf_dataset["train"]["labels"].value_counts()


0    2608
1     392
Name: labels, dtype: int64

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Texts and labels of the training split
X = hf_dataset["train"]['tweet_text']
y = hf_dataset["train"]['labels']

# The texts are reshaped into a single-column 2-D array before resampling
X_arr = np.array(X).reshape(-1, 1)

# Random oversampling with a fixed seed
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_arr, y)

# Back from the single-column array to a flat list of texts
flattened_X = list(X_resampled[:, 0])

# Balanced training frame built from the resampled texts and labels
result_dataset = pd.DataFrame({"tweet_text": flattened_X, "labels": y_resampled})
result_dataset

Unnamed: 0,tweet_text,labels
0,Препоръките към държавите-членки в рамките на ...,0
1,"За тия, дет си мислят, че няма вирус https://t...",0
2,"Отменят се част от противоепидемичните мерки, ...",0
3,Oпазването на биоразнообразието 🦋🐅🐘🌴 остава пр...,0
4,Кратък обзор над мерките в сградостроителствот...,0
...,...,...
5211,Посланик на #Азербайджан: #Армения започна таз...,1
5212,САЩ и Великобритания спряха за COVID-19 медика...,1
5213,"Супер интересно, досега не знаех, че ваксини м...",1
5214,Андрю Куомо продължава да използва думата Евро...,1


In [None]:
# Class distribution after oversampling
result_dataset["labels"].value_counts()

0    2608
1    2608
Name: labels, dtype: int64

In [None]:
# Convert the oversampled DataFrame into a datasets.Dataset
balanced_train_dataset = Dataset.from_pandas(result_dataset)

In [None]:
def tokenize(batch):
    """Tokenize the "tweet_text" field of a batch with the notebook-level tokenizer.

    Truncation is enabled; no padding is requested here.
    """
    texts = batch["tweet_text"]
    return tokenizer(texts, truncation=True)


# The validation split is still in pandas format at this point; switch it back
# (in place) so that map() hands regular batches to tokenize.
hf_dataset["validation"].reset_format()

# batch_size=None: each split is tokenized as one single batch
validation_tokenzied = hf_dataset["validation"].map(tokenize, batched=True, batch_size=None)
train_tokenzied = balanced_train_dataset.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/5216 [00:00<?, ? examples/s]

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

In [None]:
#@title Define model training args
# Accuracy metric loaded through the `evaluate` library; used by compute_metrics below
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(scores, labels)``; the predicted class of each row
            is the argmax of ``scores`` along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids
        and the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


# Collator that pads the examples of a batch with the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): the recorded training output and the "Test Model" section refer to a
# 5-epoch run ("BulBERT-ct21-5pochs"), while these arguments request 8 epochs and the
# "BulBERT-ct21-8pochs" output directory — confirm which run is intended.
args = TrainingArguments(
    output_dir="BulBERT-ct21-8pochs",
    num_train_epochs=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate and save a checkpoint at the end of every epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=True,
)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Interactive Hugging Face Hub login; the Train cell pushes the model to the Hub
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
#@title Drop a stale trainer before (re-)training
# Safe on a fresh kernel: nothing happens when no `trainer` exists yet.
# `args` and `model_raw` are intentionally kept, because the Train cell below needs
# both; deleting them here makes Restart & Run All fail with a NameError.
# To retrain from scratch, re-run the "Load model" cell instead of deleting `model_raw`.
if globals().pop("trainer", None) is not None and torch.cuda.is_available():
    # Release cached GPU memory that is no longer occupied
    torch.cuda.empty_cache()

In [None]:
#@title Train
trainer = Trainer(
    model=model_raw,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_tokenzied,
    eval_dataset=validation_tokenzied,
    compute_metrics=compute_metrics,
)

# Fine-tune, then upload the result to the Hub
trainer.train()

# Last expression of the cell, so the value returned by push_to_hub is displayed
trainer.push_to_hub(commit_message="Training complete")

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.489052,0.774286
2,No log,0.547468,0.825714
3,No log,0.78894,0.82
4,0.288000,0.943759,0.828571
5,0.288000,1.005149,0.84


'https://huggingface.co/mor40/BulBERT-ct21-5pochs/tree/main/'

## Test Model

In [None]:
# AutoModelForSequenceClassification is already imported from `transformers` in the
# "Load Libraries" cell. The former import from `transformers.tools.text_classification`
# used a non-public module path that newer transformers releases do not ship.
# NOTE(review): this is the 5-epoch checkpoint, while the TrainingArguments cell
# currently targets "BulBERT-ct21-8pochs" — confirm which model should be evaluated.
checkpoint = "mor40/BulBERT-ct21-5pochs"

# Rebinds the notebook-level `tokenizer` to the fine-tuned checkpoint's tokenizer
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



Downloading (…)lve/main/config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/748k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.54M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
def get_predictions(example, model):
  """Classify one tweet and store the predicted class id on the example.

  Args:
    example: Mapping with a "tweet_text" entry (a single dataset row).
    model: Torch model whose forward output exposes ``.logits``. It is
      expected to be in eval mode already; this function does not change
      the model's mode.

  Returns:
    The same ``example`` object with an added integer "predicted_label".

  Note:
    Tokenization uses the notebook-level ``tokenizer``.
  """
  inputs = tokenizer(example["tweet_text"], return_tensors="pt", truncation=True)

  # Run on whatever device the model lives on, so GPU models work too
  device = next(model.parameters()).device
  inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

  with torch.no_grad():
    logits = model(**inputs).logits[0]

  # argmax stays in torch: np.argmax cannot read a tensor that lives on CUDA
  example["predicted_label"] = int(logits.argmax(dim=-1).item())

  return example

# `hf_dataset` was switched to the pandas format earlier in the notebook and only the
# validation split was reset afterwards. with_format(type=None) returns a plain-Python
# view without mutating `hf_dataset`, so get_predictions receives regular rows for
# both splits regardless of the current format.
dataset_validation = hf_dataset["validation"].with_format(type=None).map(get_predictions, fn_kwargs={"model": model})
dataset_test = hf_dataset["test"].with_format(type=None).map(get_predictions, fn_kwargs={"model": model})

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

Map:   0%|          | 0/357 [00:00<?, ? examples/s]

In [None]:
# Number of validation rows whose prediction equals the gold label
correct = sum(
    1
    for example in dataset_validation
    if example["predicted_label"] == example["labels"]
)

print("Accuracy: " ,correct / len(dataset_validation))


Accuracy:  0.84


In [None]:
# Show the test split, which now also carries the "predicted_label" column
dataset_test

Dataset({
    features: ['id_str', 'topic_id', 'tweet_text', 'labels', 'predicted_label'],
    num_rows: 357
})

In [None]:
# One record per test row: topic, tweet id and the predicted label
predictions = [
    {
        'topic_id': example['topic_id'],
        'id_str': example['id_str'],
        'label': example["predicted_label"],
    }
    for example in dataset_test
]

In [None]:
# Preview only the first records instead of dumping the whole list
# (it holds one entry per test row)
predictions[:5]

[{'topic_id': 'covid-19', 'id_str': '1241620737565360128', 'label': 1},
 {'topic_id': 'covid-19', 'id_str': '1293890411052900352', 'label': 1},
 {'topic_id': 'covid-19', 'id_str': '1294530807349030913', 'label': 0},
 {'topic_id': 'covid-19', 'id_str': '1290911256312975362', 'label': 0},
 {'topic_id': 'covid-19', 'id_str': '1256499987636248576', 'label': 0},
 {'topic_id': 'covid-19', 'id_str': '1285586200183570434', 'label': 0},
 {'topic_id': 'covid-19', 'id_str': '1248186232540143616', 'label': 0},
 {'topic_id': 'covid-19', 'id_str': '1239830696589504513', 'label': 0},
 {'topic_id': 'covid-19', 'id_str': '1271089562266161152', 'label': 0},
 {'topic_id': 'covid-19', 'id_str': '1243435022163546112', 'label': 0},
 {'topic_id': 'covid-19', 'id_str': '1261179521014915073', 'label': 1},
 {'topic_id': 'covid-19', 'id_str': '1299276141577015297', 'label': 0},
 {'topic_id': 'covid-19', 'id_str': '1295945515994144770', 'label': 0},
 {'topic_id': 'covid-19', 'id_str': '1264909985177362438', 'labe

In [None]:
import jsonlines

# Write every prediction record to a JSON Lines file
output_path = 'predictions_ct21-5-epochs.jsonl'
with jsonlines.open(output_path, mode='w') as writer:
    writer.write_all(predictions)