In [1]:
# Load Libraries
import os
import torch
import evaluate
import pandas as pd
import numpy as np
from datasets import load_dataset,Dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorForTokenClassification, \
AutoModelForTokenClassification, AutoModelForMaskedLM, TrainingArguments, Trainer,AutoModelForSequenceClassification,DataCollatorWithPadding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the tokenizer and the pretrained checkpoint; the model is wrapped with a
# sequence-classification head that has two output labels.
model_checkpoint = "mor40/BulBERT-chitanka-model"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model_raw = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mor40/BulBERT-chitanka-model and are newly initialized: ['classifier.bias', 'classifier.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Fetch the "ct21t1" configuration of bgglue, then drop the identifier columns
# that are not needed for training.
hf_dataset = load_dataset("bgglue/bgglue", "ct21t1")
hf_dataset = hf_dataset.remove_columns(["tweet_id", "id_str", "topic_id"])
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['tweet_text', 'labels'],
        num_rows: 3000
    })
    validation: Dataset({
        features: ['tweet_text', 'labels'],
        num_rows: 350
    })
    test: Dataset({
        features: ['tweet_text', 'labels'],
        num_rows: 357
    })
})

In [4]:
# Switch every split to pandas output (in place) so that indexing returns
# pandas objects, then preview the first five training rows.
hf_dataset.set_format(type="pandas")
hf_dataset["train"][:5]

Unnamed: 0,tweet_text,labels
0,Препоръките към държавите-членки в рамките на ...,0
1,"За тия, дет си мислят, че няма вирус https://t...",0
2,"Отменят се част от противоепидемичните мерки, ...",0
3,Oпазването на биоразнообразието 🦋🐅🐘🌴 остава пр...,0
4,Кратък обзор над мерките в сградостроителствот...,0


In [5]:
# Count how many training rows carry each label value (class balance check).
hf_dataset["train"]["labels"].value_counts()

labels
0    2608
1     392
Name: count, dtype: int64

In [6]:
from imblearn.over_sampling import RandomOverSampler

# Text and label columns of the training split.
X = hf_dataset["train"]["tweet_text"]
y = hf_dataset["train"]["labels"]

# The texts are passed to the sampler as a single-column 2-D array.
X_arr = np.array(X)[:, np.newaxis]

# Random oversampling with a fixed seed so the resampled set is reproducible.
oversampler = RandomOverSampler(
    sampling_strategy="auto",
    random_state=42,
)
X_resampled, y_resampled = oversampler.fit_resample(X_arr, y)

# Undo the single-column wrapping: take the only entry of each row.
flattened_X = [row[0] for row in X_resampled]

# Assemble the resampled texts and labels into one frame and show the new class counts.
result_dataset = pd.DataFrame(
    {
        "tweet_text": flattened_X,
        "labels": y_resampled,
    }
)
result_dataset["labels"].value_counts()

labels
0    2608
1    2608
Name: count, dtype: int64

In [7]:
# Turn the oversampled DataFrame back into a `datasets.Dataset` for tokenization.
balanced_train_dataset = Dataset.from_pandas(result_dataset)

In [8]:
def tokenize(batch):
    """Tokenize the ``tweet_text`` entries of ``batch``.

    Uses the notebook-level ``tokenizer`` with truncation enabled and returns
    whatever the tokenizer call produces. No padding argument is passed here.
    """
    texts = batch["tweet_text"]
    return tokenizer(texts, truncation=True)


# The validation split was switched to pandas output earlier; restore the
# default format (in place) before mapping the tokenizer over it.
hf_dataset["validation"].reset_format()

# Tokenize both splits. batch_size=None hands each split to `tokenize` as one batch.
train_tokenzied = balanced_train_dataset.map(
    function=tokenize,
    batched=True,
    batch_size=None,
)
validation_tokenzied = hf_dataset["validation"].map(
    function=tokenize,
    batched=True,
    batch_size=None,
)

Map:   0%|          | 0/5216 [00:00<?, ? examples/s]

Map: 100%|██████████| 5216/5216 [00:00<00:00, 6351.33 examples/s]


In [9]:
#@title Define model training args
# Accuracy metric object, loaded once and reused on every evaluation pass.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy for one evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class
    is the arg-max of the predictions along axis 1, and the result is whatever
    ``accuracy.compute`` returns for those predictions against the labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)


# Collator that pads the examples of each batch using the notebook's tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(
    output_dir="BulBERT-ct21-5pochs",
    # Evaluate and save a checkpoint once per epoch.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Upload the results to the Hugging Face Hub.
    push_to_hub=True,
)