In [1]:
import gzip
import json
import os
import pandas as pd
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer, pipeline
from torch import cuda
from sklearn.utils import shuffle
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [2]:
def read_folder(folder_path):
    dataframes = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.jsonl.gz'):
            with gzip.open(os.path.join(folder_path, filename), 'rt', encoding='utf-8') as file:
                for line in file:
                    json_data = json.loads(line)
                    df = pd.DataFrame(json_data)
                    dataframes.append(df)
    if dataframes:
        aggregated_df = pd.concat(dataframes, ignore_index=True)
        return aggregated_df
    else:
        print("No jsonl files found in the directory.")
        return None

In [3]:
def reduce():
    # csak a test
    test = os.path.join(os.getcwd(), 'test')
    df = read_folder(test)
    df.drop('uuid', axis=1, inplace=True)
    return df.groupby('major_topic_pred').apply(lambda x: x.sample(n=min(5, len(x)))).reset_index(drop=True)


In [4]:
reduced_df = reduce()
df = shuffle(reduced_df)
labels = df['major_topic_pred'].unique().tolist()
number_of_labels = len(labels)
id_to_label = {_id: label for _id, label in enumerate(labels)}
label_to_id = {label: _id for _id, label in enumerate(labels)}
print(id_to_label)
df['major_topic_pred_index'] = df['major_topic_pred'].map(lambda x: label_to_id[x])
df.head(10)

{0: 9, 1: 8, 2: 18, 3: 12, 4: 15, 5: 17, 6: 13, 7: 1, 8: 5, 9: 20, 10: 4, 11: 23, 12: 21, 13: 10, 14: 2, 15: 3, 16: 14, 17: 19, 18: 16, 19: 7, 20: 999, 21: 6}


Unnamed: 0,title,lead,article,domain,url,date_of_creation,cc_date,tags,doc_similarity,major_topic_pred,major_topic_pred_index
44,"Cáfolja a Frontex, hogy erőszakosak az uniós h...",Az európai uniós határ- és partvédelmi ügynöks...,A Frontex a dpa német hírügynökségnek e-mailbe...,hvg.hu,https://hvg.hu/vilag/20190805_frontex_hatarors...,2019-08-05T20:26:00,2020-02-23T06:00:06,határőr,0.483312,9,0
36,Fontos szintek alá zuhant az olaj ára,"Nagyot estek szerdán a világpiaci olajárak, íg...",A szerdai esést nagyrészt az amerikai nyersola...,portfolio.hu,https://www.portfolio.hu/uzlet/20181018/fontos...,2018-10-18T09:13:00,2019-11-19T08:48:58,wti,0.604739,8,1
83,Kedden rajtol a 77. Genfi Autószalon,A sajtó számára kedden nyitja meg kapuit Európ...,Idén március 8-án nyitja meg a kapuit a nagyér...,origo.hu,https://www.origo.hu/auto/20070303autocsodak.html,2007-03-05T10:57:00,2020-01-18T08:22:35,Ford Motor Company,0.688749,18,2
51,Galambos Lajos: Nem vagyok szökésben,"Nincs oka szökni vagy Mexikóban maradni, hisze...","""Nem vagyok szökésben, hiszen nincs ellenem ér...",origo.hu,https://www.origo.hu/teve/20160323-galambos-la...,2016-03-23T08:10:00,2021-09-22T09:05:33,Tévé,0.615171,12,3
67,"Kettő otthon, kettő az úton: folytatódik az NB...",A tizenkilencedik fordulóval folytatódnak az N...,"2019. 03. 17., 15.00 óra\n\nIváncsa KSE (8.)– ...",feol.hu,https://www.feol.hu/sport/helyi-sport/ketto-ot...,2019-03-15T21:32:00,2019-03-24T07:49:33,Iváncsa KSE,0.424394,15,4
78,A legmeghökkentőbb számítógép-konfigurációk,A számítástechnika - és ezen belül a gépek - r...,A számítástechnika - és ezen belül a gépek - r...,hvg.hu,https://hvg.hu/tudomany/20090921_furcsa_erdeke...,2009-09-22T05:14:00,2019-04-21T20:52:27,billentyűzet,0.697015,17,5
59,Így számítják ki a nyugdíjat 2014-ben,2014-ben is a nettó átlagkeresettől és a szolg...,A nyugdíj alapjául szolgáló átlagkeresetnél 20...,hvg.hu,https://hvg.hu/gazdasag/20131230_Igy_szamitjak...,2013-12-30T15:50:00,2019-09-15T10:58:07,szolgálati idő,0.841047,13,6
68,"Fucsovics egy helyet javított, Medvegyev megel...",Fucsovics Márton egy helyet javítva múlt heti ...,"A rangsorban következő magyar játékos, Balázs ...",index.hu,https://index.hu/sport/tenisz/2021/05/10/fucso...,2021-05-10T08:55:00,2021-06-15T21:56:42,novak djokovics,0.593206,15,4
53,Enyhítették Hunvald büntetését,A Kúrián folytatott eljárásban az ügyész a fel...,Harmadfokú döntés született kedden Gál György ...,24.hu,https://24.hu/kozelet/2014/07/01/enyhitettek-h...,2014-07-01T10:10:00,2020-09-28T05:02:26,Közélet,0.400061,12,3
1,"Aki hátralép, veszíthet... – Szijjártó Péter n...","„Az útépítések, a digitális infrastruktúra fej...",Egyebek mellett erről is beszélt lapunknak teg...,zaol.hu,https://www.zaol.hu/gazdasag/aki-hatralep-vesz...,2018-04-07T08:00:00,2020-08-08T17:56:29,helyi gazdaság,0.559537,1,7


In [5]:
model = BertForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=number_of_labels,
    id2label=id_to_label,
    label2id=label_to_id
)
device = 'cuda' if cuda.is_available() else 'cpu'
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [6]:
tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")
size = df.shape[0]
half = size // 2
three_fourth = (3 * size) // 4
train_texts = list(df['article'][:half])
val_texts = list(df['article'][half:three_fourth])
test_texts = list(df['article'][three_fourth:])
print(len(train_texts), len(val_texts), len(test_texts))


55 27 28


In [7]:
train_labels = list(df['major_topic_pred'][:half])
val_labels = list(df['major_topic_pred'][half:three_fourth])
test_labels = list(df['major_topic_pred'][three_fourth:])
print(len(train_labels), len(val_labels), len(test_labels))

55 27 28


In [8]:
class DataLoader(Dataset):
    def __init__(self, encodings, _labels):
        self.encodings = encodings
        self.labels = _labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [9]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_dataloader = DataLoader(train_encodings, train_labels)
val_dataloader = DataLoader(val_encodings, val_labels)
test_dataset = DataLoader(test_encodings, test_labels)

In [10]:
training_args = TrainingArguments(
    output_dir='./z',
    do_train=True,
    do_eval=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=50,
    weight_decay=0.01,
    logging_strategy='steps',
    logging_dir='./logs',
    logging_steps=50,
    evaluation_strategy='steps',
    eval_steps=50,
    save_strategy='steps',
    load_best_model_at_end=True
)

In [11]:
def compute_metrics(predictions):
    labels = predictions.label_ids
    preds = predictions.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
    compute_metrics=compute_metrics
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


IndexError: Target 999 is out of bounds.

In [ ]:
def predict(text):
    inputs = tokenizer(text, padding='True', truncation=True, max_length=512, return_tensors='pr')

    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    pred_label_idx = probs.argmax()

    pred_label = model.config.id2label[pred_label_idx.item()]
    return probs, pred_label_idx, pred_label

predict("Egyre t\u00f6bb a t\u00f6meges H1N1-megbetegedes...")

In [ ]:
model_path = 'poltextlab-like-classification-model'
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

In [ ]:
def reload():
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer=BertTokenizerFast.from_pretrained(model_path)
    nlp = pipeline('sentiment analysis', model=model, tokenizer=tokenizer)

    # nlp(...)
