In [1]:
import gzip
import json
import os
import pandas as pd
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer, pipeline
from torch import cuda
from sklearn.utils import shuffle
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [2]:
def read_folder(folder_path):
    dataframes = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.jsonl.gz'):
            with gzip.open(os.path.join(folder_path, filename), 'rt', encoding='utf-8') as file:
                for line in file:
                    json_data = json.loads(line)
                    df = pd.DataFrame(json_data)
                    dataframes.append(df)
    if dataframes:
        aggregated_df = pd.concat(dataframes, ignore_index=True)
        return aggregated_df
    else:
        print("No jsonl files found in the directory.")
        return None

In [3]:
def reduce():
    # csak a test
    test = os.path.join(os.getcwd(), 'test')
    df = read_folder(test)
    df.drop('uuid', axis=1, inplace=True)
    return df.groupby('major_topic_pred').apply(lambda x: x.sample(n=min(5, len(x)))).reset_index(drop=True)


In [4]:
reduced_df = reduce()
df = shuffle(reduced_df)
labels = df['major_topic_pred'].unique().tolist()
number_of_labels = len(labels)
id_to_label = {_id: label for _id, label in enumerate(labels)}
label_to_id = {label: _id for _id, label in enumerate(labels)}
print(id_to_label)
df['major_topic_pred_index'] = df['major_topic_pred'].map(lambda x: label_to_id[x])
df.head(10)

{0: 15, 1: 8, 2: 4, 3: 7, 4: 5, 5: 23, 6: 2, 7: 12, 8: 10, 9: 9, 10: 3, 11: 1, 12: 14, 13: 6, 14: 16, 15: 18, 16: 20, 17: 19, 18: 21, 19: 13, 20: 999, 21: 17}


Unnamed: 0,title,lead,article,domain,url,date_of_creation,cc_date,tags,doc_similarity,major_topic_pred,major_topic_pred_index
67,Fájó Hungerit-vereség Dunaújvárosban,Egyszer vezetett a Hungerit MetalCom Szentes e...,Dunaújváros–Hungerit MetalCom-Szentes 11-10 (2...,delmagyar.hu,https://www.delmagyar.hu/sport/fajo_hungerit-v...,2011-03-07T20:59:00,2018-12-15T03:03:18,vízilabda,0.200436,15,0
39,Fontos szintek alá zuhant az olaj ára,"Nagyot estek szerdán a világpiaci olajárak, íg...",A szerdai esést nagyrészt az amerikai nyersola...,portfolio.hu,https://www.portfolio.hu/uzlet/20181018/fontos...,2018-10-18T09:13:00,2019-11-19T08:48:58,"opec,",0.604739,8,1
15,Magosz: elfogadhatatlan a fejlesztési terv,A Magyar Gazdakörök és Gazdaszövetkezetek Szöv...,"Az elnök szerint a középtávú stratégiai terv, ...",origo.hu,https://www.origo.hu/gazdasag/20060906magosz.html,2006-09-06T12:40:00,2019-08-25T17:07:46,Jakab István,0.678365,4,2
30,Akár teljesen el is tűnhetnek az izlandi glecc...,Az amerikai űrkutatási hivatal (NASA) műholdas...,"Nagyjából 750 négyzetkilométerrel zsugorodtak,...",kisalfold.hu,https://www.kisalfold.hu/egyperces/2021/06/aka...,2021-06-05T11:00:00,2023-02-06T05:28:33,#globális felmelegedés,0.653863,7,3
24,Mérsékletet kér az unió a sztrájkolóktól,A német ipari dolgozókat mérsékletre szólított...,A sztrájk legprominensebb támadója Pedro Solbe...,origo.hu,https://www.origo.hu/gazdasag/hirek/20020507me...,2002-05-10T11:01:00,2020-01-22T08:39:15,tüntetés,0.621243,5,4
104,"Pornósztár a szomszédban, Zoolanderék zsarubőrben","Beindult az év eleji filmdömping, s ez azt jel...",A napokban dobták be a köztudatba a The Girl N...,origo.hu,https://www.origo.hu/filmklub/20040116pornoszt...,,2023-03-20T22:59:16,Kardhal,0.486126,23,5
8,Nem volt gyűlölet-bűncselekmény a terhes roma ...,A rendőrség szerint csak becsületsértés történ...,"Gyöngyöspatán, március elején fekete maszkos, ...",index.hu,https://index.hu/belfold/2011/06/01/nem_volt_g...,2011-06-01T12:56:00,2021-03-01T23:18:27,gyűlölet,0.766326,2,6
50,Nem csalt adót Tasnádi,A bíróság helybenhagyta a Pesti Központi Kerül...,Jogerősen felmentette Tasnádi Pétert az adócsa...,index.hu,http://index.hu/bulvar/tsndptr0524/,2005-05-24T10:42:00,2017-12-12T18:40:51,Bulvár,0.873905,12,7
49,Fontos részletek derültek ki a 3-as metró megh...,Február 17-én került sor a BKK lakossági fórum...,Az eseményen a közlekedési központot Rácz Zolt...,portfolio.hu,https://www.portfolio.hu/gazdasag/20200220/fon...,2020-02-20T10:30:00,2020-03-31T01:59:28,"uniós támogatás,",0.46658,10,8
51,Rendőröket vádolnak egy turistanő megerőszakol...,"Három francia rendőrt vádolnak azzal, hogy iro...",Vád alá helyezték a párizsi rendőrség elitalak...,nlc.hu,https://nlc.hu/ezvan/20140428/megeroszakolas-f...,2014-04-28T00:00:00,2021-06-23T01:32:03,francia Franciaország rendőrség megerőszakolás...,0.796423,12,7


In [5]:
model = BertForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=number_of_labels,
    id2label=id_to_label,
    label2id=label_to_id
)
device = 'cuda' if cuda.is_available() else 'cpu'
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [6]:
tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")
size = df.shape[0]
half = size // 2
three_fourth = (3 * size) // 4
train_texts = list(df['article'][:half])
val_texts = list(df['article'][half:three_fourth])
test_texts = list(df['article'][three_fourth:])
print(len(train_texts), len(val_texts), len(test_texts))


55 27 28


In [7]:
train_labels = list(df['major_topic_pred_index'][:half])
val_labels = list(df['major_topic_pred_index'][half:three_fourth])
test_labels = list(df['major_topic_pred_index'][three_fourth:])
print(len(train_labels), len(val_labels), len(test_labels))

55 27 28


In [8]:
class DataLoader(Dataset):
    def __init__(self, encodings, _labels):
        self.encodings = encodings
        self.labels = _labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [9]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_dataloader = DataLoader(train_encodings, train_labels)
val_dataloader = DataLoader(val_encodings, val_labels)
test_dataset = DataLoader(test_encodings, test_labels)

In [10]:
training_args = TrainingArguments(
    output_dir='./z',
    do_train=True,
    do_eval=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=50,
    weight_decay=0.01,
    logging_strategy='steps',
    logging_dir='./logs',
    logging_steps=50,
    evaluation_strategy='steps',
    eval_steps=50,
    save_strategy='steps',
    load_best_model_at_end=True
)

In [11]:
def compute_metrics(predictions):
    labels = predictions.label_ids
    preds = predictions.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
    compute_metrics=compute_metrics
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


TrainOutput(global_step=20, training_loss=3.0876028060913088, metrics={'train_runtime': 1491.1479, 'train_samples_per_second': 0.184, 'train_steps_per_second': 0.013, 'total_flos': 72368533248000.0, 'train_loss': 3.0876028060913088, 'epoch': 5.0})

In [15]:
def predict(text):
    inputs = tokenizer(text, padding='longest', truncation=True, max_length=512, return_tensors='pt')

    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    pred_label_idx = probs.argmax()

    pred_label = model.config.id2label[pred_label_idx.item()]
    return probs, pred_label_idx, pred_label

predict("Egyre t\u00f6bb a t\u00f6meges H1N1-megbetegedes...")

(tensor([[0.0433, 0.0428, 0.0294, 0.0438, 0.0301, 0.0439, 0.0427, 0.0661, 0.0559,
          0.0381, 0.0423, 0.0295, 0.0546, 0.0500, 0.0473, 0.0569, 0.0419, 0.0591,
          0.0518, 0.0299, 0.0483, 0.0524]], grad_fn=<SoftmaxBackward0>),
 tensor(7),
 12)

In [16]:
model_path = 'poltextlab-like-classification-model'
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

('poltextlab-like-classification-model\\tokenizer_config.json',
 'poltextlab-like-classification-model\\special_tokens_map.json',
 'poltextlab-like-classification-model\\vocab.txt',
 'poltextlab-like-classification-model\\added_tokens.json',
 'poltextlab-like-classification-model\\tokenizer.json')

In [18]:
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer=BertTokenizerFast.from_pretrained(model_path)
nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

In [21]:
nlp("Egyre t\u00f6bb a t\u00f6meges H1N1-megbetegedes, sok a beteg a korhazakban, betegseg, rak, illness angolul")

[{'label': 999, 'score': 0.07323040813207626}]

In [20]:
nlp("Iran lebombazta irzaelt bombakkal, raketakkal")

[{'label': 12, 'score': 0.07266714423894882}]