This notebook trains a topic classifier on the Presseportal articles that were scraped in notebook 6 (Scraping Presseportal articles).
Parts of this code were generated with the help of ChatGPT and adapted to the specific needs of this study.

In [None]:
!pip install -U transformers datasets evaluate accelerate




In [None]:
from datasets import load_dataset
from transformers import pipeline
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.utils import resample

# Load the scraped article table.
df = pd.read_csv(
    "master_presseportal_news_articles_dataset.csv",
    engine="python",
    encoding="utf-8",
)

# Remove categories that either don't have enough datapoints or aren't topics,
# and drop every row that has a missing value in any column.
excluded_categories = ["Presseschau", "People", "Fashion / Beauty"]
df = (
    df[~df["category"].isin(excluded_categories)]
    .dropna()
    .reset_index(drop=True)
)

# Fold the finance-related topics into the single "Wirtschaft" category.
finance_aliases = dict.fromkeys(["Handel", "Finanzen", "Wirtschaft"], "Wirtschaft")
df["category"] = df["category"].replace(finance_aliases)

# Noise removal: keep only what follows the first "(ots)" marker
# (the text is left unchanged when the marker is absent).
article_body = df["text"].astype(str).str.split("(ots)", n=1, regex=False).str[-1]

# Prefix the title, then normalise line breaks and runs of whitespace to single spaces.
combined_text = df["title"].astype(str) + ": " + article_body
for whitespace_pattern in (r"\s*\n+\s*", r"\s*\r+\s*", r"\s+"):
    combined_text = combined_text.str.replace(whitespace_pattern, " ", regex=True)
df["text"] = combined_text.str.strip()

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Stratified 80 / 10 / 10 split: first hold out 20% of the rows, then halve
# that hold-out into validation and test. Both steps stratify on the category
# column and use the same fixed seed.
train_df, temp_df = train_test_split(
    df, stratify=df["category"], test_size=0.2, random_state=42
)
val_df, test_df = train_test_split(
    temp_df, stratify=temp_df["category"], test_size=0.5, random_state=42
)

def oversample_to_min_count(df_in, target_col="category", min_count=2000, random_state=42):
    """Upsample rare classes so each one contributes at least ``min_count`` rows.

    Classes are processed in ``value_counts()`` order. A class with fewer than
    ``min_count`` rows is replaced by ``min_count`` rows drawn with replacement
    (via ``resample``); every other class is kept as is. The pieces are
    concatenated, shuffled with ``random_state`` and re-indexed from 0.

    Args:
        df_in: DataFrame containing ``target_col``.
        target_col: Name of the class column.
        min_count: Minimum number of rows per class in the result.
        random_state: Seed used for both the resampling and the final shuffle.

    Returns:
        A new, shuffled DataFrame with a fresh 0..n-1 index.
    """
    def _rows_for(label, n_rows):
        # All rows of one class, upsampled only if the class is too small.
        subset = df_in[df_in[target_col] == label]
        if n_rows >= min_count:
            return subset
        return resample(subset, replace=True, n_samples=min_count, random_state=random_state)

    class_sizes = df_in[target_col].value_counts()
    balanced = pd.concat(
        [_rows_for(label, n_rows) for label, n_rows in class_sizes.items()],
        axis=0,
    )
    shuffled = balanced.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

# Only the training split is oversampled; validation and test keep their
# original class distribution.
train_df_over = oversample_to_min_count(
    train_df, target_col="category", min_count=2000, random_state=42
)

# Wrap the three pandas frames (with fresh indices) in a Hugging Face DatasetDict.
split_frames = {"train": train_df_over, "validation": val_df, "test": test_df}
datasets = DatasetDict({
    split_name: Dataset.from_pandas(frame.reset_index(drop=True))
    for split_name, frame in split_frames.items()
})


In [None]:
# Class balance of the cleaned frame, followed by a peek at its first rows.
category_counts = df["category"].value_counts()
print(category_counts)
print(df.head())

category
Wirtschaft              17826
Medien / Kultur         17815
Politik                  6371
Panorama                 5712
Gesundheit / Medizin     4442
Auto / Verkehr           3153
Soziales                 1805
Bau / Immobilien         1688
Umwelt                   1624
Wissen / Bildung         1393
Netzwelt                 1154
Tourismus / Urlaub       1094
Sport                     998
Name: count, dtype: int64
                  date         category  \
0  2023-12-31 16:43:00            Sport   
1  2023-12-31 15:09:00         Panorama   
2  2023-12-31 13:00:00       Wirtschaft   
3  2023-12-31 12:59:00  Medien / Kultur   
4  2023-12-31 12:38:00            Sport   

                                               title              source  \
0  Mit allen Wassern gewaschen: 21. Silvesterschw...  Achensee Tourismus   
1  Fünf Millionäre zum neuen Jahr / MillionenKrac...           WestLotto   
2  KfW Research: KfW-ifo-Fachkräftebarometer - Fa...                 KfW   
3  Russland 

In [None]:
# Number of examples per split (validation, test, train - in that order).
for split_name in ("validation", "test", "train"):
    print(len(datasets[split_name]))

6507
6508
58256


In [None]:
# Check how the categories are distributed within each split.
for split in ("train", "validation", "test"):
    df_split = datasets[split].to_pandas()
    class_counts = df_split["category"].value_counts()
    print(f"\n{split.upper()} ({len(df_split)})")
    print(class_counts)



TRAIN (58256)
category
Wirtschaft              14261
Medien / Kultur         14252
Politik                  5097
Panorama                 4570
Gesundheit / Medizin     3554
Auto / Verkehr           2522
Bau / Immobilien         2000
Sport                    2000
Umwelt                   2000
Tourismus / Urlaub       2000
Soziales                 2000
Wissen / Bildung         2000
Netzwelt                 2000
Name: count, dtype: int64

VALIDATION (6507)
category
Wirtschaft              1782
Medien / Kultur         1781
Politik                  637
Panorama                 571
Gesundheit / Medizin     444
Auto / Verkehr           315
Soziales                 180
Bau / Immobilien         168
Umwelt                   163
Wissen / Bildung         140
Netzwelt                 116
Tourismus / Urlaub       110
Sport                    100
Name: count, dtype: int64

TEST (6508)
category
Wirtschaft              1783
Medien / Kultur         1782
Politik                  637
Panorama            

In [None]:
import os
import pandas as pd
import torch
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)
from datasets import DatasetDict

# Sets the TOKENIZERS_PARALLELISM environment variable to "false"
# (presumably to avoid the tokenizers fork warning - confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

model_name = "dbmdz/bert-base-german-uncased"

# category name -> integer id, numbered in the order in which the categories
# are returned by df["category"].unique()
unique_categories = df["category"].unique()
category_labels = dict(zip(unique_categories, range(len(unique_categories))))
# inverse lookup: integer id -> category name
id2label = {idx: name for name, idx in category_labels.items()}

# Persist the category <-> label_id mapping next to the model outputs.
mapping_df = pd.DataFrame(
    list(category_labels.items()), columns=["category", "label_id"]
).sort_values("label_id")

print(mapping_df)
mapping_df.to_csv("category_label_mapping.csv", index=False)

def encode_labels(example):
    """Add the integer class id of ``example["category"]`` under the key ``"labels"``.

    The id is looked up in the module-level ``category_labels`` mapping; the
    example is modified in place and returned.
    """
    label_id = category_labels[example["category"]]
    example["labels"] = label_id
    return example
# Add the integer "labels" field to the examples of every split in the DatasetDict.
datasets = datasets.map(encode_labels)

# Load the tokenizer that belongs to the checkpoint named in model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    """Tokenize ``example["text"]`` with the module-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 512 tokens.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(example["text"], **tokenizer_options)

# Tokenize all splits in batches.
tokenized_datasets = datasets.map(tokenize_function, batched=True)

# Load the pretrained encoder with a fresh classification head.
# id2label / label2id are stored in the model config so that the saved model
# reports category names instead of generic "LABEL_<n>" placeholders.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(category_labels),
    id2label=id2label,
    label2id=category_labels,
)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision / recall / F1.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        Dict with the keys ``accuracy``, ``precision_macro``, ``recall_macro``
        and ``f1_macro``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    macro_options = {"average": "macro", "zero_division": 0}
    macro_scorers = (
        ("precision_macro", precision_score),
        ("recall_macro", recall_score),
        ("f1_macro", f1_score),
    )

    scores = {"accuracy": accuracy_score(labels, preds)}
    for metric_name, scorer in macro_scorers:
        scores[metric_name] = scorer(labels, preds, **macro_options)
    return scores

# Training configuration for a single fine-tuning epoch.
training_args = TrainingArguments(
    # output locations / reporting
    output_dir="./final_model",
    logging_dir="./logs",
    report_to="none",
    # evaluate, checkpoint and log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # optimisation settings
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Train on the oversampled training split; the validation split is used for
# the per-epoch evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

trainer.train()

# Run the model over the held-out test split exactly once. The single
# predict() pass yields both the aggregate metrics and the raw predictions,
# instead of evaluating and then predicting on the same data.
# metric_key_prefix="eval" keeps the "eval_*" metric names in the results file.
test_output = trainer.predict(tokenized_datasets["test"], metric_key_prefix="eval")
logits = test_output.predictions
y_true = test_output.label_ids
y_pred = np.argmax(logits, axis=-1)

# Macro evaluation: aggregate metrics plus the epoch they were measured at.
eval_results = dict(test_output.metrics)
eval_results["epoch"] = trainer.state.epoch
pd.DataFrame([eval_results]).to_csv("bert_base_german_uncased_eval_results.csv", index=False)

# Fix the label order once so the confusion matrix and the per-class report
# use the same row/column order.
label_ids_sorted = sorted(id2label.keys())
target_names = [id2label[i] for i in label_ids_sorted]

# Confusion matrix (rows: true category, columns: predicted category)
cm = confusion_matrix(y_true, y_pred, labels=label_ids_sorted)
cm_df = pd.DataFrame(cm, index=target_names, columns=target_names)
cm_df.to_csv("confusion_matrix_categories.csv")

# Per-class metrics
report_dict = classification_report(
    y_true,
    y_pred,
    labels=label_ids_sorted,
    target_names=target_names,
    output_dict=True,
    zero_division=0
)
report_df = pd.DataFrame(report_dict).transpose()
report_df.to_csv("per_class_metrics_categories.csv")

print ("Process finished")

                category  label_id
0                  Sport         0
1               Panorama         1
2             Wirtschaft         2
3        Medien / Kultur         3
4       Wissen / Bildung         4
5   Gesundheit / Medizin         5
6                Politik         6
7               Soziales         7
8         Auto / Verkehr         8
9               Netzwelt         9
10      Bau / Immobilien        10
11                Umwelt        11
12    Tourismus / Urlaub        12


Map:   0%|          | 0/58256 [00:00<?, ? examples/s]

Map:   0%|          | 0/6507 [00:00<?, ? examples/s]

Map:   0%|          | 0/6508 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/58256 [00:00<?, ? examples/s]

Map:   0%|          | 0/6507 [00:00<?, ? examples/s]

Map:   0%|          | 0/6508 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.6307,0.565907,0.809129,0.717444,0.776323,0.741623


Process finished


In [None]:
# Save the fine-tuned model and tokenizer, then download them together with
# the evaluation artefacts.
import shutil
import tempfile
from pathlib import Path

from google.colab import files

trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")

# "./final_model" is also the Trainer's output_dir, so it contains the
# per-epoch "checkpoint-*" directories as well. Archive a staged copy without
# them so the download holds only the final model and tokenizer files.
with tempfile.TemporaryDirectory() as staging_root:
    staging_dir = Path(staging_root) / "final_model"
    shutil.copytree(
        "./final_model",
        staging_dir,
        ignore=shutil.ignore_patterns("checkpoint-*"),
    )
    shutil.make_archive("final_model", "zip", str(staging_dir))

for artefact in (
    'final_model.zip',
    'per_class_metrics_categories.csv',
    'confusion_matrix_categories.csv',
    'bert_base_german_uncased_eval_results.csv',
):
    files.download(artefact)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Macro evaluation on the test split, shown as a one-column table
# (metric name -> value).
eval_results = trainer.evaluate(tokenized_datasets["test"])
eval_df = pd.DataFrame([eval_results]).T.rename(columns={0: "value"})
print(eval_df)

                              value
eval_loss                  0.539316
eval_accuracy              0.818070
eval_precision_macro       0.728600
eval_recall_macro          0.779441
eval_f1_macro              0.749456
eval_runtime             198.571900
eval_samples_per_second   32.774000
eval_steps_per_second      4.099000
epoch                      1.000000


In [None]:
# Per-class precision / recall / F1 / support for the test predictions,
# returned as a nested dict keyed by category name.
report_dict = classification_report(
    y_true, y_pred,
    labels=label_ids_sorted, target_names=target_names,
    zero_division=0, output_dict=True,
)

In [None]:
# Display the per-class metrics dictionary as the cell output.
report_dict

{'Sport': {'precision': 0.7168141592920354,
  'recall': 0.81,
  'f1-score': 0.7605633802816901,
  'support': 100.0},
 'Panorama': {'precision': 0.6835748792270532,
  'recall': 0.4956217162872154,
  'f1-score': 0.5746192893401015,
  'support': 571.0},
 'Wirtschaft': {'precision': 0.8541545613015689,
  'recall': 0.8244531688166012,
  'f1-score': 0.839041095890411,
  'support': 1783.0},
 'Medien / Kultur': {'precision': 0.9416299559471366,
  'recall': 0.9595959595959596,
  'f1-score': 0.9505280711506392,
  'support': 1782.0},
 'Wissen / Bildung': {'precision': 0.6709677419354839,
  'recall': 0.7482014388489209,
  'f1-score': 0.7074829931972789,
  'support': 139.0},
 'Gesundheit / Medizin': {'precision': 0.7540650406504065,
  'recall': 0.8355855855855856,
  'f1-score': 0.7927350427350427,
  'support': 444.0},
 'Politik': {'precision': 0.827768014059754,
  'recall': 0.7394034536891679,
  'f1-score': 0.7810945273631841,
  'support': 637.0},
 'Soziales': {'precision': 0.7447916666666666,
  'r