In [1]:
import gzip
import json
import os
import pandas as pd
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer, pipeline
from torch import cuda
from sklearn.utils import shuffle
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"


2024-04-17 15:06:10.792881: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [23]:
def read_folder(folder_path):
    dataframes = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.jsonl.gz'):
            with gzip.open(os.path.join(folder_path, filename), 'rt', encoding='utf-8') as file:
                for line in file:
                    json_data = json.loads(line)
                    df = pd.DataFrame(json_data)
                    dataframes.append(df)
    if dataframes:
        aggregated_df = pd.concat(dataframes, ignore_index=True)
        return aggregated_df
    else:
        print("No jsonl files found in the directory.")
        return None

In [24]:
def reduce():
    # csak a test
    test = os.path.join(os.getcwd(), 'test')
    df = read_folder(test)
    df.drop('uuid', axis=1, inplace=True)
    return df.groupby('major_topic_pred').apply(lambda x: x.sample(n=min(10, len(x)))).reset_index(drop=True)


In [25]:
reduced_df = reduce()
df = shuffle(reduced_df)
labels = df['major_topic_pred'].unique().tolist()
number_of_labels = len(labels)
id_to_label = {_id: label for _id, label in enumerate(labels)}
label_to_id = {label: _id for _id, label in enumerate(labels)}
print(id_to_label)
df['major_topic_pred_index'] = df['major_topic_pred'].map(lambda x: label_to_id[x])
df.head(10)

{0: 20, 1: 23, 2: 13, 3: 7, 4: 9, 5: 4, 6: 21, 7: 12, 8: 8, 9: 18, 10: 6, 11: 5, 12: 1, 13: 16, 14: 14, 15: 10, 16: 999, 17: 17, 18: 19, 19: 3, 20: 15, 21: 2}


Unnamed: 0,title,lead,article,domain,url,date_of_creation,cc_date,tags,doc_similarity,major_topic_pred,major_topic_pred_index
188,Az MSZP szerint a választási iroda meghekkelte...,Az MSZP-Párbeszéd csütörtöki sajtótájékoztatój...,Sajtótájékoztatót tartott csütörtökön Budapest...,hvg.hu,http://hvg.hu/itthon/20180412_Toth_Bertalan_sz...,2018-04-12T18:45:00,2018-04-23T13:38:32,listázás,0.591405,20,0
208,Kokas Ignác hagyatéka: parasztházban nyílt kép...,A Kossuth-díjas Kokas Ignác festőművész képein...,Kokas Ignác hagyatékának egy részét vásárolta ...,feol.hu,https://www.feol.hu/kultura/helyi-kultura/koka...,2019-09-30T11:30:00,2020-04-08T01:38:45,Kósa Judit,0.600307,23,1
110,Így számítják ki a nyugdíjat 2014-ben,2014-ben is a nettó átlagkeresettől és a szolg...,A nyugdíj alapjául szolgáló átlagkeresetnél 20...,hvg.hu,https://hvg.hu/gazdasag/20131230_Igy_szamitjak...,2013-12-30T15:50:00,2019-09-15T10:58:07,nyugdíj,0.841047,13,2
116,Aprópénz a rezsiutalvány az ellenzékben ígért ...,A Fidesz a 2006-os választás előtt azzal kampá...,"Talán már feledésbe merült, de a 2006-os orszá...",24.hu,https://24.hu/fn/gazdasag/2019/09/04/rezsiutal...,2019-09-04T06:01:00,2019-09-19T02:34:35,Gazdaság,0.91587,13,2
61,Hegyekben áll a turisták szemete a Maldív-szig...,Thilafushi egy hulladékból épített sziget az I...,Az Indiai-óceánban található Maldív-szigeteket...,origo.hu,https://www.origo.hu/utazas/20170316-ide-hordj...,2017-03-17T05:00:00,2022-08-15T15:31:10,hulladék,0.421541,7,3
66,A téli etetés növeli nagyra a trófeát,Jóval több takarmány fogy a nagy hó és a zord ...,"A vadgazdálkodási terv része az etetés, ám ily...",kisalfold.hu,http://www.kisalfold.hu/mosonmagyarovari_hirek...,2013-01-23T08:35:00,2017-07-25T21:27:43,Mosonmagyaróvár,0.610558,7,3
189,Őrsi Gergely: Kérdés nélkül vállalom a drogtes...,"Tételesen cáfol minden, vele szemben – név nél...","Miből ismert magára, hogy a nyilvánosság elé k...",index.hu,https://index.hu/belfold/2019/12/10/angyal_ugy...,2019-12-10T07:21:00,2021-01-16T19:14:25,őrsi gergely,0.546769,20,0
87,300 afgán család éhségsztrájkol Belgiumban,Éhségsztrájkot tart egy hete háromszáz afgán c...,Egy hete éhségsztrájkot tart egy brüsszeli bel...,origo.hu,https://www.origo.hu/nagyvilag/20030731300afga...,,2019-12-16T11:31:00,ENSZ,0.93195,9,4
38,Szamurájdarázs ölheti meg az összes poloskát,"Nem elég, hogy küzdünk a régi ellenségeinkkel,...",A poloskáknak egész szép kis arzenálja települ...,nlc.hu,https://nlc.hu/ezvan/20190607/poloska-szamuraj...,2019-06-07T00:00:00,2021-03-08T00:23:03,poloska harlekinkatica kártevő szamurájdarázs,0.525787,4,5
192,Talán mégsem az összes mezőhegyesi árverési er...,Csak azokra a földekre nem köthetik meg a mező...,Lázár János Miniszterelnökséget vezető miniszt...,hvg.hu,https://hvg.hu/gazdasag/20160301_mezohegyes_ar...,2016-03-01T11:54:00,2019-12-09T23:35:06,kormánybiztos,0.529447,21,6


In [26]:
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cpu'

In [27]:
model = BertForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=number_of_labels,
    id2label=id_to_label,
    label2id=label_to_id
)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [28]:
tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")
size = df.shape[0]
half = size // 2
three_fourth = (3 * size) // 4
train_texts = list(df['article'][:half])
val_texts = list(df['article'][half:three_fourth])
test_texts = list(df['article'][three_fourth:])
print(len(train_texts), len(val_texts), len(test_texts))


110 55 55


In [29]:
train_labels = list(df['major_topic_pred_index'][:half])
val_labels = list(df['major_topic_pred_index'][half:three_fourth])
test_labels = list(df['major_topic_pred_index'][three_fourth:])
print(len(train_labels), len(val_labels), len(test_labels))

110 55 55


In [30]:
class DataLoader(Dataset):
    def __init__(self, encodings, _labels):
        self.encodings = encodings
        self.labels = _labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [31]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_dataloader = DataLoader(train_encodings, train_labels)
val_dataloader = DataLoader(val_encodings, val_labels)
test_dataset = DataLoader(test_encodings, test_labels)

In [32]:
training_args = TrainingArguments(
    output_dir='./z',
    do_train=True,
    do_eval=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=50,
    weight_decay=0.01,
    logging_strategy='steps',
    logging_dir='./logs',
    logging_steps=50,
    evaluation_strategy='steps',
    eval_steps=50,
    save_strategy='steps',
    load_best_model_at_end=True,
    gradient_accumulation_steps=4
)

In [33]:
def compute_metrics(predictions):
    labels = predictions.label_ids
    preds = predictions.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

In [34]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
    compute_metrics=compute_metrics
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 844.975, 'train_samples_per_second': 0.651, 'train_steps_per_second': 0.018, 'train_loss': 3.1433509826660155, 'epoch': 4.29}


TrainOutput(global_step=15, training_loss=3.1433509826660155, metrics={'train_runtime': 844.975, 'train_samples_per_second': 0.651, 'train_steps_per_second': 0.018, 'train_loss': 3.1433509826660155, 'epoch': 4.29})

In [35]:
def predict(text):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    inputs = tokenizer(text, padding='longest', truncation=True, max_length=512, return_tensors='pt').to(device)

    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    pred_label_idx = probs.argmax()

    pred_label = model.config.id2label[pred_label_idx.item()]
    return probs.cpu(), pred_label_idx.cpu(), pred_label

predict("Egyre t\u00f6bb a t\u00f6meges H1N1-megbetegedes...")

(tensor([[0.0274, 0.0413, 0.0463, 0.0452, 0.0735, 0.0298, 0.0773, 0.0756, 0.0274,
          0.0360, 0.0378, 0.0390, 0.0484, 0.0430, 0.0355, 0.0474, 0.0387, 0.0518,
          0.0594, 0.0293, 0.0447, 0.0450]], grad_fn=<SoftmaxBackward0>),
 tensor(6),
 21)

In [36]:
model_path = 'poltextlab-like-classification-model'
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

('poltextlab-like-classification-model/tokenizer_config.json',
 'poltextlab-like-classification-model/special_tokens_map.json',
 'poltextlab-like-classification-model/vocab.txt',
 'poltextlab-like-classification-model/added_tokens.json',
 'poltextlab-like-classification-model/tokenizer.json')

In [37]:
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer=BertTokenizerFast.from_pretrained(model_path)
nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

In [38]:
nlp("Egyre t\u00f6bb a t\u00f6meges H1N1-megbetegedes, sok a beteg a korhazakban, betegseg, rak, illness angolul")

[{'label': 21, 'score': 0.08608993142843246}]

In [39]:
nlp("Iran lebombazta irzaelt bombakkal, raketakkal")

[{'label': 9, 'score': 0.07747523486614227}]

In [41]:
nlp("Az MSZP-Párbeszéd csütörtöki sajtótájékoztatój...	")

[{'label': 21, 'score': 0.10057864338159561}]