# BERT Training for Narrative Classification

This notebook demonstrates how to train a BERT model for narrative classification using the modular code structure.

In [None]:
import sys
import logging
from datetime import datetime
import os
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from tqdm import tqdm
import ast

# Import our custom modules
from dataset import prepare_data, CustomDataset
from model import predict, initialize_model
from trainer import train_bert
from modules.utils import debug_misclassifications, setup_logging

In [None]:
# Configure logging through the project helper first; what it installs is
# defined in modules.utils and is not visible from this notebook.
setup_logging()
# Create logs directory if it doesn't exist
logs_dir = os.path.join(os.getcwd(), "code", "logs")
os.makedirs(logs_dir, exist_ok=True)

# Setup logging with specified directory
# The timestamp in the file name gives every run its own log file.
log_filename = os.path.join(
    logs_dir, f"preprocessing_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
)
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers; if setup_logging() installs any, this call may have no effect
# and no log file would be written — confirm.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.FileHandler(log_filename), logging.StreamHandler(sys.stdout)],
)

# Get logger
logger = logging.getLogger(__name__)

## Processing Summary Class

First, let's define our ProcessingSummary class to track and display results.

In [None]:
class ProcessingSummary:
    """Collect information about a run and print it as a plain-text report.

    ``steps_completed`` and ``ml_results`` are filled through the ``add_*``
    methods; ``document_stats`` is a plain dict the caller assigns directly.
    """

    def __init__(self):
        # The start time is the reference point for the reported duration.
        self.start_time = datetime.now()
        self.steps_completed = []
        self.ml_results = {}
        self.document_stats = {}

    def add_step(self, step_name, details=None):
        """Append a step entry stamped with the current wall-clock time.

        Args:
            step_name: Text shown for the step in the timeline.
            details: Optional mapping printed as indented key/value lines.
        """
        self.steps_completed.append(
            {
                "name": step_name,
                "timestamp": datetime.now().strftime("%H:%M:%S"),
                "details": details,
            }
        )

    def add_ml_result(self, model_type, metrics):
        """Store ``metrics`` under ``model_type``, replacing any earlier entry."""
        self.ml_results[model_type] = metrics

    def display_summary(self):
        """Print timing, document statistics, the step timeline and ML metrics."""
        elapsed = (datetime.now() - self.start_time).total_seconds()
        minutes, seconds = (int(part) for part in divmod(elapsed, 60))
        heavy_rule = "=" * 80
        light_rule = "-" * 80

        print("\n" + heavy_rule)
        print(f"{'PROCESSING SUMMARY':^80}")
        print(heavy_rule)

        print("\nGENERAL INFORMATION")
        print(light_rule)
        print(f"Total Processing Time: {minutes}m {seconds}s")
        print(f"Steps Completed: {len(self.steps_completed)}")

        print("\nDOCUMENT STATISTICS")
        print(light_rule)
        for stat_name, stat_value in self.document_stats.items():
            heading = stat_name.replace("_", " ").title()
            # Floats are shown with two decimals; everything else as-is.
            shown = f"{stat_value:.2f}" if isinstance(stat_value, float) else stat_value
            print(f"{heading}: {shown}")

        print("\nPROCESSING TIMELINE")
        print(light_rule)
        for entry in self.steps_completed:
            print(f"\n[{entry['timestamp']}] {entry['name']}")
            entry_details = entry.get("details")
            if not entry_details:
                continue
            for detail_key, detail_value in entry_details.items():
                print(f"  └─ {detail_key}: {detail_value}")

        # The ML section is omitted entirely when nothing was recorded.
        if self.ml_results:
            print("\nML RESULTS")
            print(light_rule)
            for model_name, model_metrics in self.ml_results.items():
                print(f"\n{model_name}:")
                for metric_name, metric_value in model_metrics.items():
                    is_float = isinstance(metric_value, float)
                    shown = f"{metric_value:.4f}" if is_float else metric_value
                    print(f"  └─ {metric_name}: {shown}")

        print("\n" + heavy_rule + "\n")


# Initialize processing summary
summary = ProcessingSummary()

## Set Up Paths and Load Data

Define paths and load the preprocessed data files.

In [None]:
# Define paths
# base_path points two directory levels above the current working directory,
# so the cell depends on where the notebook kernel was started.
base_path = os.path.join(os.getcwd(), "..", "..")
output_dir = os.path.join(base_path, "outputs")
os.makedirs(output_dir, exist_ok=True)

# Load preprocessed data
# Three CSVs are read from base_path: the full frame plus the "ua" and "cc"
# variants (presumably Ukraine War and Climate Change subsets — confirm).
try:
    df_normalized = pd.read_csv(os.path.join(base_path, "df_normalized.csv"))
    df_normalized_ua = pd.read_csv(os.path.join(base_path, "df_normalized_ua.csv"))
    df_normalized_cc = pd.read_csv(os.path.join(base_path, "df_normalized_cc.csv"))

    # Update summary
    # Row counts feed the "DOCUMENT STATISTICS" section of the report.
    summary.document_stats = {
        "total_documents": len(df_normalized),
        "ua_documents": len(df_normalized_ua),
        "cc_documents": len(df_normalized_cc),
    }

    # The same counts are recorded as the details of the timeline entry.
    summary.add_step(
        "Data Loading",
        {
            "total_documents": len(df_normalized),
            "ua_documents": len(df_normalized_ua),
            "cc_documents": len(df_normalized_cc),
        },
    )

except Exception as e:
    # Log the failure and re-raise so the cell still stops with the original error.
    logger.error(f"Error loading data: {str(e)}")
    raise

## Training Options

Define functions to handle different training scenarios.

In [None]:
def get_ml_choice():
    """Show the training menu and prompt until a number from 1 to 5 is entered.

    Returns:
        int: The selected menu option (1-5).
    """
    menu_lines = (
        "\nSelect processing option:",
        "1. Train BERT on all data",
        "2. Train BERT on UA data only",
        "3. Train BERT on CC data only",
        "4. Run all BERT training variations",
        "5. Skip training",
    )
    for menu_line in menu_lines:
        print(menu_line)

    valid_choices = range(1, 6)
    while True:
        try:
            selected = int(input("\nEnter your choice (1-5): "))
        except ValueError:
            # Non-numeric input: complain and ask again.
            print("Please enter a valid number.")
            continue
        if selected in valid_choices:
            return selected
        print("Please enter a number between 1 and 5.")


def run_selected_ml(
    choice, df_normalized, df_normalized_ua, df_normalized_cc, base_path, summary
):
    """Run the BERT training variant(s) selected by ``choice``.

    Args:
        choice: Menu option. 1, 2 and 3 train on the full, UA and CC frame
            respectively; 4 trains on all three in that order; any other
            value does nothing.
        df_normalized: Frame passed to ``train_bert`` for the full run.
        df_normalized_ua: Frame passed to ``train_bert`` for the UA run.
        df_normalized_cc: Frame passed to ``train_bert`` for the CC run.
        base_path: Forwarded unchanged to ``train_bert``.
        summary: Object whose ``add_ml_result`` receives each run's results.
    """
    # choice -> (wording used in log lines, wording used in the summary key, frame)
    variants = {
        1: ("full", "Full", df_normalized),
        2: ("UA", "UA", df_normalized_ua),
        3: ("CC", "CC", df_normalized_cc),
    }

    def train_and_record(log_tag, title_tag, frame):
        # Train once, store the results in the summary, then log them.
        results = train_bert(frame, base_path)
        summary.add_ml_result(f"BERT ({title_tag} Dataset)", results)
        logger.info(f"BERT training on {log_tag} data completed. Results: {results}")

    if choice in variants:
        selected_variant = variants[choice]
        logger.info(f"Starting BERT training on {selected_variant[0]} dataset...")
        train_and_record(*selected_variant)
    elif choice == 4:
        logger.info("Starting BERT training on all variations...")
        for variant in variants.values():
            train_and_record(*variant)

## Model Analysis and Debugging

Analyze model performance and debug misclassifications.

In [None]:
# Locate the trained model. The run directory is pinned to "bert_20250113";
# today's date is only used in the name of the analysis file written below.
current_date = datetime.now().strftime("%Y%m%d")
model_path = os.path.join(base_path, "models", "bert_20250113")

# Load the model and tokenizer
from model import load_model_and_tokenizer

model, tokenizer = load_model_and_tokenizer(model_path)

# Load label mapping
import json

# Read the mapping as UTF-8 explicitly so that label text does not depend on
# the platform's default encoding.
with open(
    os.path.join(model_path, "label_mapping.json"), "r", encoding="utf-8"
) as f:
    label_mapping = json.load(f)

# Analyze misclassifications
misclassifications = debug_misclassifications(
    dataset=df_normalized,
    model=model,
    tokenizer=tokenizer,
    label_mapping=label_mapping,
    dataset_type="Training",
)

# Save misclassification analysis
# Path components are joined separately instead of embedding "/" in one of them.
misclassifications_path = os.path.join(
    output_dir, "analysis", f"misclassifications_{current_date}.csv"
)
os.makedirs(os.path.dirname(misclassifications_path), exist_ok=True)
misclassifications.to_csv(misclassifications_path, index=False)

print("\nMisclassified Examples:")
# `display` is provided by the notebook environment, not imported here.
display(misclassifications)
print(f"\nAnalysis saved to: {misclassifications_path}")

## Description of Deep Learning baseline and used methods

As Deep Learning model we used BERT. We have trained the model using labeled data and measured the performance of the model using the following metrics:
* Accuracy
* Recall
* Precision
* F1 Score

Before training the model, we have normalized and tokenized the data.
1. Normalization and tokenization: Cleaning and tokenizing the narratives, so that the model can be trained.
2. Label mapping: Creating a mapping between all unique class labels and integers to meet BERT's requirements for training

For training and testing the model, we used an 80% training / 20% testing split.
By splitting the narratives, we followed different approaches to be able to evaluate the differences between them:
* Handling both, Ukraine War and Climate Change narratives in one dataframe, so that the model could learn from all the data
* Splitting narratives into two dataframes, containing only Ukraine War narratives or only Climate Change narratives each
* Using stratification to improve the distribution of classes between training and testing sets.

# Analysis

When the narratives were split by topic, the metrics for Climate Change improved, while the metrics for the Ukraine War worsened.
With all metrics at 0.5135 when both topics were handled together in one dataframe, the model shows a moderate performance.

After that, we used only Climate Change narratives and all resulting metrics got much better: 0.7143.
Using only Ukraine War narratives, we received 0.4167 for all metrics, which is the worst result, although it is the topic with the highest amount of narratives that we had for training.

### Analyzing single classes

To be able to analyze differences in prediction of different classes, we analyzed the confusion matrices for each class individually.

For both topics, we received a similar distribution for the confusion matrix of the class *Other*. Again for both topics, the model predicted class "Other" correctly as positive. It also predicted other classes to be from class *Other*.

In [None]:
import matplotlib.pyplot as plt

# Image files in the order they fill the 2x2 grid (left to right, top to
# bottom), each paired with the title shown above it.
images = ["cc_other.png", "cc_all.png", "ua_other.png", "ua_all.png"]
labels = [
    "Climate Change Class Other",
    "Climate Change all remaining classes",
    "Ukraine War class Other",
    "Ukraine War all remaining classes",
]
img_base_path = "../../../info/screens/"
fig, ax = plt.subplots(2, 2, figsize=(10, 8))

# ax.flat walks the 2x2 axes array row by row, which matches the image order.
for axis, img_name, label in zip(ax.flat, images, labels):
    # os.path.join avoids the doubled separator that manual "/" concatenation
    # produced, because img_base_path already ends with one.
    img = plt.imread(os.path.join(img_base_path, img_name))
    axis.imshow(img)
    axis.axis("off")
    axis.set_title(label)
plt.show()

#### Climate Change

For Climate Change narratives, we received for class *Other* the following Confusion Matrix:
[[0, 4], [0, 10]]

Calculating the metrics for this specific class:
* Accuracy: 71,43 %
* Precision: 71,43 %
* Recall: 100%
* F1 Score: 83,33 %

For all other classes, we received the following equal Confusion Matrix:
[[13, 0], [1, 0]]
* Accuracy: 92,86 %
* Precision: 0 %
* Recall: 0%
* F1 Score: 0 %


#### Ukraine War

For Ukraine War narratives, we received for class *Other* the following Confusion Matrix:
[[0, 14], [0, 10]]

Calculating the metrics for this specific class:
* Accuracy: 41,7 %
* Precision: 41,7 %
* Recall: 100%
* F1 Score: 58,8%

For all other classes, we received the following equal Confusion Matrix:
[[23, 0], [1, 0]]
* Accuracy: 95,8%
* Precision: 0%
* Recall: 0%
* F1 Score: 0%


For both approaches, we can see that all classes except *Other* have no counts for *True Positives*, but a very high count for *True Negatives*. On the other hand, we have the class *Other*, where all samples of class *Other* are classified correctly as *Other*, but we still have almost as many or even more narratives that were classified as *Other* although they belong to a different class.

From those unequally distributed results we see that BERT's prediction performance is good only for class *Other* and for no other class.

One possible cause is data imbalance. The class *Other* occurs most frequently in the dataset, so the model may have learned to predict this class reliably while failing on all other classes.

## Possible solutions
Although we have used stratification to receive a better distribution of all classes, there are still some solutions that we should consider in the next part.

Apparently, the class *Other* is overrepresented in the data set. Since we had fewer than 200 narratives available, one possible solution to improve the predictions is to use more data.
In the context of this task we also have narratives in other languages available, which we can use to obtain more data and reduce the imbalance.

The issue could also be that the class labeling was not carried out cleanly, so that we have the broad class *Other*, to which most of the narratives belong. Specifying the classes more accurately could help to balance the dataset.





## Qualitative Analysis

While finishing up our project, we accidentally deleted one of the result runs from our BERT model. Because of this, we can’t reproduce the predictions the model originally made, which were the basis for this qualitative analysis.

This notebook contains our qualitative analysis, and all the outputs are already included in the markdown and code cells exactly as they were when we first ran it. Thus, rerunning the notebook is not possible!

Since we had the freedom to choose the format for this analysis, we think this shouldn’t be a big issue as long as the outputs are left as they are.

In [None]:
# NOTE: base_path is redefined here as the parent of the current working
# directory, replacing the value set in the data-loading cell.
base_path = os.path.dirname(os.path.abspath(os.getcwd()))
# Path of label_mapping.json inside the bert_20250113 model folder.
# NOTE(review): despite its name this is a file path, and it is not used in
# the cells visible here — confirm it is still needed.
label_directory = os.path.join(
    base_path, "models", "bert_20250113", "label_mapping.json"
)
# Load predicted_dataframe.csv from base_path into `dataset`.
input_file_full = os.path.join(base_path, "predicted_dataframe.csv")
dataset = pd.read_csv(input_file_full)

In [None]:
# Derive texts, labels and the label mapping from the loaded frame and preview
# the first three entries. This rebinds `labels` and `label_mapping`, which
# earlier cells used for plot titles and the stored mapping.
texts, labels, label_mapping = prepare_data(dataset)
print(f"Sample text: {texts[:3]}")
print(f"Sample label: {labels[:3]}")

# Train on the loaded frame; what train_bert returns and where it writes is
# defined in the trainer module, not here.
training_results = train_bert(dataset, base_path)
print(f"Training Results: {training_results}")

In [None]:
# Enable tqdm's pandas integration so DataFrame.progress_apply is available.
tqdm.pandas()
# Join the path components separately. The former literal "models\bert_20250113"
# contained "\b", which Python reads as a backspace character, so the resulting
# path never pointed at the model folder.
model_path = os.path.join(base_path, "models", "bert_20250113")


# Preparing predictions
def classify_row(row):
    """Return the predicted label for one row, or ``None`` if prediction fails.

    The row's ``tokens_normalized`` value is passed to ``predict`` together
    with the module-level ``model_path``. Any exception is reported on stdout
    with the row's name and turned into ``None`` so a bulk apply keeps going.
    """
    try:
        label, _unused = predict(row["tokens_normalized"], model_path)
    except Exception as e:
        print(f"Error for row {row.name}: {e}")
        return None
    else:
        return label


# Predicting
# Run classify_row on every row (axis=1) with a progress bar and store the
# result in a new "predicted_narrative" column.
dataset["predicted_narrative"] = dataset.progress_apply(classify_row, axis=1)
# Saving is left disabled, so running this cell does not overwrite the CSV.
# dataset.to_csv(os.path.join(base_path, "predicted_dataframe.csv"), index=False)

In [None]:
# Reload predicted_dataframe.csv from disk, replacing the in-memory `dataset`
# (including any predictions computed in the previous cell), and preview it.
dataset = pd.read_csv(os.path.join(base_path, "predicted_dataframe.csv"))
dataset.head()

Here, we will pick one class label and analyze why some articles are getting predicted and especially why most of
the news articles that are actually assigned to that class are not predicted to be in this class.

We will again choose one class for qualitative analysis which has at least one *True Positive* and some *False Negatives* in English and Russian both, that we can compare with each other.

### English

In [None]:
# Split by language. Explicit copies are taken because columns of
# english_dataset are overwritten further down, and assigning into a filtered
# selection of `dataset` triggers pandas' SettingWithCopyWarning.
english_dataset = dataset[dataset["language"] == "EN"].copy()
russian_dataset = dataset[dataset["language"] == "RU"].copy()


# Backparse both narrative columns
def parse_and_normalize(column):
    """Turn string-encoded dicts in ``column`` into dicts ordered by key.

    String cells are parsed with ``ast.literal_eval`` and rebuilt in sorted
    item order; cells of any other type are returned unchanged.
    """

    def to_sorted_dict(cell):
        if not isinstance(cell, str):
            return cell
        return dict(sorted(ast.literal_eval(cell).items()))

    return column.apply(to_sorted_dict)


# Replace the string-encoded label dicts in both narrative columns with
# parsed, key-sorted dicts.
english_dataset.loc[:, "temp_narrative"] = parse_and_normalize(
    english_dataset["temp_narrative"]
)
english_dataset.loc[:, "predicted_narrative"] = parse_and_normalize(
    english_dataset["predicted_narrative"]
)

# Getting unique classes from both narrative columns
# Each dict becomes a sorted tuple of its items so it can be stored in a set.
# NOTE(review): assumes every cell is a dict; a missing value (e.g. NaN from a
# failed prediction) would fail on .items() — confirm.
all_classes = set(
    tuple(sorted(d.items())) for d in english_dataset["temp_narrative"]
) | set(tuple(sorted(d.items())) for d in english_dataset["predicted_narrative"])

class_summary_list = []

# Creating binary labels
# One-vs-rest evaluation: for every distinct label, "is this label" is scored
# as a binary problem and its confusion matrix is plotted.
for target_class in all_classes:
    y_true = (
        english_dataset["temp_narrative"]
        .apply(lambda x: tuple(sorted(x.items())) == target_class)
        .astype(int)
    )
    y_pred = (
        english_dataset["predicted_narrative"]
        .apply(lambda x: tuple(sorted(x.items())) == target_class)
        .astype(int)
    )

    # Calculating CMs
    # labels=[0, 1] forces a 2x2 matrix even when only one of the two values
    # occurs, so the unpacking below and the two display labels always fit.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # scikit-learn lays the matrix out as [[TN, FP], [FN, TP]], so ravel()
    # yields TN, FP, FN, TP in that order.
    tn, fp, fn, tp = cm.ravel().tolist()

    class_summary_list.append(
        {"target_class": dict(target_class), "TP": tp, "FN": fn, "FP": fp, "TN": tn}
    )

    disp = ConfusionMatrixDisplay(
        confusion_matrix=cm, display_labels=["Negative", "Positive"]
    )
    disp.plot(cmap="Blues")
    plt.title(f"Confusion Matrix for Target Class: {dict(target_class)}")
    plt.show()

We will choose class *narrative: Hidden plots by secret schemes of powerful groups, subnarrative: Climate agenda has hidden motives* for the qualitative analysis of English.

We will print out the needed columns of the texts that were predicted with a different label, although they had the label mentioned above. Also, we will print news articles where the model predicted the label correctly to inspect those outputs.

In [None]:
# Narrative / subnarrative pair under inspection.
analyzing_class = {
    "narrative": "Hidden plots by secret schemes of powerful groups",
    "subnarrative": "Climate agenda has hidden motives",
}

# Rows whose true label is the inspected class; shared by both filters below.
has_target_label = english_dataset["temp_narrative"] == analyzing_class

# True positives: predicted label and true label both match the class.
true_positive_english = english_dataset[
    (english_dataset["predicted_narrative"] == analyzing_class) & has_target_label
]

# False negatives: true label matches the class but the prediction does not.
false_negatives_english = english_dataset[
    (english_dataset["predicted_narrative"] != analyzing_class) & has_target_label
]

# Print the row labels of each group so single articles can be looked up.
for heading, subset in (
    ("True Positive Indices:", true_positive_english),
    ("\nFalse Negative Indices:", false_negatives_english),
):
    print(heading)
    print(subset.index.tolist())

### True positive content:

In [None]:
# Print the "content" of the true-positive row with index label 153; the label
# is hard-coded (presumably one of the indices printed by the previous cell).
print(true_positive_english.loc[153, "content"])

Reading through this news article, we can see that there are two sentences in the beginning where the author already points in this direction, describing the new regulations as being introduced "under the guise of fighting 'climate change'". He also says that most dishwashers that are sold today already align with the planned regulations and that the government should focus on other topics, which makes the direction of the text clear. Those two cases have probably led to the correct assumption that there are hidden plots by powerful groups (the government).
Having the text part "under the guise" likely led to the assumption that the climate agenda has hidden motives.

We can assume that the texts the model was trained with contained words like "hidden", "climate agenda" or "regime", because those words are all contained in the news article that was predicted correctly by the model. Those words also carry the information which likely led the human annotator to classify the text under that label.

Further, the sentence that "Critics have also poked major holes in the regime's claim [...]" could also contribute to the fact that the model predicted exactly this narrative-subnarrative pair, as the government's attempt to reduce energy consumption is being criticized.



### False negative content:

In [None]:
# Print the "content" of the false-negative row with index label 104; the label
# is hard-coded (presumably one of the indices printed earlier).
print(false_negatives_english.loc[104, "content"])

In [None]:
# Print the "content" of the false-negative row with index label 293; the label
# is hard-coded (presumably one of the indices printed earlier).
print(false_negatives_english.loc[293, "content"])

In the first article, there are some sentences that slightly indicate the topic *Climate Change*. Especially the sentence saying "Caroline van der Plas founded [the movement]" and the following part about "nitrogen hoax", "climate change lies" and "[...] buy up most of the farmers" should lead the model to predict the actual label, as those words and sentences have likely been used in other news articles with that true label during training.

Nevertheless, the second part of the article starts getting religious with text parts like "Accept Jesus Christ as our saviour" or "Read the bible, fast and pray" or "Amen". Those words, and especially this topic and tone of writing, do not contribute anything to the true label. Thus, this drift away from the climate change topic could lead the model to assume the text belongs to a different label, especially if there were some religious texts that were predicted as Other-Other or as a different label into which religious topics would fit.

The second article contains little Climate Change information throughout the whole text, which is understandable if one knows what kind of meetup took place in Dubai and why influential people are joining it. Parts like "Globalist oligarchs met [...]" or "Unelected Bond villains who want to decide our fate." and "Mega Rich Elite [...]" try to frame the politicians as a powerful group in a pejorative way.
Still, the model did not predict this news article as the true label. One reason for that could be, that the article is written in a sarcastic and pejorative way, which could lead the model to predict the label Other-Other, because the climate change content of the text is a bit hidden in political and social concepts that need to be understood.