# Discourse Markers

## Imports and Config

In [None]:
import torch
from transformers import (
    Trainer,
    AutoConfig,
    TrainingArguments,
    RobertaTokenizerFast,
    EarlyStoppingCallback,
    DataCollatorWithPadding,
    RobertaForSequenceClassification,
)

import os
import evaluate
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import Dataset, load_dataset
from IPython.core.pylabtools import figsize
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

## Dataset

In [None]:
# Load the raw discourse-marker corpus from data/en.csv.
raw_csv_path = os.path.join('data', 'en.csv')
df = pd.read_csv(raw_csv_path)
# Preview the first rows (rendered as the cell output).
df.head()

### Discourse Mapping

* NOTE: This classification is a computational interpretation. Many markers are polysemous (they belong to multiple classes depending on context). This map provides a best-fit, single-class assignment for each marker.

In [None]:
# Lookup table from a lower-case discourse marker to its single class tag
# ('CDM', 'EDM', 'IDM' or 'TDM'). Each group is expanded with dict.fromkeys,
# so every marker listed in a tuple maps to the tag that follows the tuple;
# insertion order is the order in which the markers are listed.
dm_to_class_map = {
    # == Contrastive Discourse Markers (CDMs) ==
    # Show opposition, contrast, concession, or correction
    **dict.fromkeys(
        (
            'although', 'but', 'by comparison', 'by contrast', 'conversely',
            'however', 'in contrast', 'instead', 'nevertheless', 'nonetheless',
            'on the contrary', 'on the other hand', 'otherwise', 'rather',
            'regardless', 'still', 'though', 'yet',
        ),
        'CDM',
    ),

    # == Elaborative Discourse Markers (EDMs) ==
    # Add info, specify, rephrase, give examples, or add speaker stance
    **dict.fromkeys(
        (
            'absolutely', 'actually', 'additionally', 'admittedly', 'again',
            'also', 'alternately', 'alternatively', 'altogether', 'amazingly',
            'and', 'anyway', 'apparently', 'arguably', 'basically', 'besides',
            'certainly', 'clearly', 'coincidentally', 'collectively',
            'curiously', 'elsewhere', 'especially', 'essentially', 'evidently',
            'for example', 'for instance', 'fortunately', 'frankly', 'further',
            'furthermore', 'generally', 'happily', 'here', 'honestly',
            'hopefully', 'ideally', 'importantly', 'in fact', 'in other words',
            'in particular', 'in short', 'in sum', 'incidentally', 'indeed',
            'interestingly', 'ironically', 'likewise', 'locally', 'luckily',
            'maybe', 'meaning', 'moreover', 'mostly', 'namely', 'nationally',
            'naturally', 'notably', 'obviously', 'oddly', 'only', 'optionally',
            'or', 'overall', 'particularly', 'perhaps', 'personally', 'plus',
            'preferably', 'presumably', 'probably', 'realistically', 'really',
            'remarkably', 'sadly', 'separately', 'seriously', 'significantly',
            'similarly', 'specifically', 'strangely', 'supposedly', 'surely',
            'surprisingly', 'technically', 'thankfully', 'theoretically',
            'together', 'truly', 'truthfully', 'undoubtedly', 'unfortunately',
            'unsurprisingly', 'well',
        ),
        'EDM',
    ),

    # == Implicative Discourse Markers (IDMs) ==
    # Show result, consequence, or inference
    **dict.fromkeys(
        (
            'accordingly', 'as a result', 'because of that', 'because of this',
            'by doing this', 'consequently', 'hence', 'in turn', 'inevitably',
            'so', 'thereby', 'therefore', 'thus',
        ),
        'IDM',
    ),

    # == Temporal Discourse Markers (TDMs) ==
    # Show time or sequence
    **dict.fromkeys(
        (
            'afterward', 'already', 'by then', 'currently', 'eventually',
            'finally', 'first', 'firstly', 'frequently', 'gradually',
            'historically', 'immediately', 'in the end', 'in the meantime',
            'increasingly', 'initially', 'lastly', 'lately', 'later',
            'meantime', 'meanwhile', 'next', 'normally', 'now', 'occasionally',
            'often', 'once', 'originally', 'presently', 'previously',
            'recently', 'second', 'secondly', 'simultaneously', 'slowly',
            'sometimes', 'soon', 'subsequently', 'suddenly', 'then',
            'thereafter', 'third', 'thirdly', 'traditionally', 'typically',
            'ultimately', 'usually',
        ),
        'TDM',
    ),
}

In [None]:
# Normalise each marker (stringify, lower-case, trim) before the lookup;
# markers missing from dm_to_class_map get None as their label.
normalized_markers = (str(marker).lower().strip() for marker in df['dm'])
df['label'] = [dm_to_class_map.get(marker) for marker in normalized_markers]

# Keep only the rows whose marker was mapped to a class.
print(f'Original size: {len(df)}')
df = df[df['label'].notna()].copy()
print(f'Size after filtering: {len(df)}')

In [None]:
import csv

# Persist the labelled, filtered frame; QUOTE_ALL wraps every field in quotes.
filtered_csv_path = 'data/dm_en.csv'
df.to_csv(filtered_csv_path, quoting=csv.QUOTE_ALL, index=False)

In [None]:
# Reload the labelled dataset written by the export cell, so the notebook can
# be resumed from here.
filtered_csv_path = 'data/dm_en.csv'
df = pd.read_csv(filtered_csv_path)
# Preview the first rows (rendered as the cell output).
df.head()

### Exploration

In [None]:
# Distribution of examples across the discourse-marker classes.
sns.displot(data=df, x='label')

In [None]:
# Join both sentences of every pair with a single space and record the
# character length of the result.
# Column-wise string operations replace the two row-by-row `iterrows()` passes.
# Missing sentences are treated as empty strings, because `' '.join(...)`
# raises TypeError on a NaN value.
first_sentences = df['s1'].fillna('').astype(str)
second_sentences = df['s2'].fillna('').astype(str)
df['text'] = first_sentences + ' ' + second_sentences
df['full_length'] = df['text'].str.len()

In [None]:
# Switch seaborn's default palette to "plasma".
sns.set_palette(palette="plasma")

# Stacked histogram of the `full_length` column, one colour per class label.
plt.figure(figsize=(10, 4))
sns.histplot(
    data=df,
    x='full_length',
    hue='label',
    bins=75,
    multiple='stack',
    shrink=.8,
)

## Pre-processing

In [None]:
# Wrap the DataFrame in a `datasets.Dataset` and encode the string `label`
# column as a class-label feature, then show the resulting schema.
dataset = Dataset.from_pandas(df).class_encode_column("label")
print(dataset.features)

In [None]:
# Both splits share the same seed and are stratified on the label column.
stratified_split = {"seed": 42, "stratify_by_column": "label"}

# First hold out 20% of the data, then cut that hold-out in half to obtain the
# validation and test sets.
train_test_split = dataset.train_test_split(test_size=0.2, **stratified_split)
test_val_split = train_test_split['test'].train_test_split(
    test_size=0.5, **stratified_split
)

train_dataset, val_dataset, test_dataset = (
    train_test_split['train'],
    test_val_split['train'],
    test_val_split['test'],
)

for split_name, split in (
    ("Train", train_dataset),
    ("Validation", val_dataset),
    ("Test", test_dataset),
):
    print(f"{split_name} size: {len(split)}")

In [None]:
# Class names come from the encoded label feature; a name's position in the
# list is its integer id.
class_names = train_dataset.features['label'].names
num_labels = len(class_names)

# Forward (id -> name) and reverse (name -> id) lookups.
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in id2label.items()}

print(f"Number of labels: {num_labels}")
print(f"The labels: {class_names}")
print(f"id2label map: {id2label}")

In [None]:
model_id = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)


def tokenize(batch):
    """Tokenize a batch of sentence pairs with the module-level tokenizer.

    Args:
        batch: Mapping that provides the first sentences under 's1' and the
            second sentences under 's2'.

    Returns:
        The tokenizer's output for the (s1, s2) pairs, truncated to at most
        512 tokens and left unpadded.
    """
    first_sentences, second_sentences = batch['s1'], batch['s2']
    encoding_options = {
        "truncation": True,
        "max_length": 512,
        # No padding here: DataCollatorWithPadding pads per batch later.
        "padding": False,
    }
    return tokenizer(first_sentences, second_sentences, **encoding_options)

In [None]:
# Apply `tokenize` in batched mode to every split, in train/val/test order.
train_dataset, val_dataset, test_dataset = [
    split.map(tokenize, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

In [None]:
# Expose only the model inputs and the label, returned as torch tensors.
columns_to_keep = ["input_ids", "attention_mask", "label"]
for tokenized_split in (train_dataset, val_dataset, test_dataset):
    tokenized_split.set_format("torch", columns=columns_to_keep)

## Metrics

In [None]:
# Load the four `evaluate` metrics once so compute_metrics can reuse them.
metric_f1, metric_accuracy, metric_precision, metric_recall = [
    evaluate.load(metric_name)
    for metric_name in ("f1", "accuracy", "precision", "recall")
]

def compute_metrics(eval_pred):
    """Turn raw evaluation output into a flat dict of scalar metrics.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Dict with the keys 'accuracy', 'f1_weighted', 'f1_macro',
        'precision' and 'recall' (precision and recall are weighted averages).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Every metric is scored on the same prediction/reference pair.
    scored = {"predictions": predictions, "references": labels}

    # 'weighted' is good for unbalanced classes.
    f1_weighted = metric_f1.compute(average="weighted", **scored)
    # 'macro' treats all classes equally.
    f1_macro = metric_f1.compute(average="macro", **scored)
    accuracy = metric_accuracy.compute(**scored)
    precision = metric_precision.compute(average="weighted", **scored)
    recall = metric_recall.compute(average="weighted", **scored)

    return {
        "accuracy": accuracy["accuracy"],
        "f1_weighted": f1_weighted["f1"],
        "f1_macro": f1_macro["f1"],
        "precision": precision["precision"],
        "recall": recall["recall"],
    }

## Model

In [None]:
# Register the label vocabulary on the config in BOTH directions, together with
# the matching number of labels. Setting only `id2label` leaves `label2id` at
# its default placeholder mapping, so the saved config would be inconsistent.
config = AutoConfig.from_pretrained(
    model_id,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
# Pretrained RoBERTa encoder with a sequence-classification head built from
# the config above.
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

In [None]:
# Directory used for checkpoints, logs and the final saved model.
repository_id = "./models/roberta-base-dm-4class"
# Pads each batch on the fly, since tokenization was done without padding.
data_collator = DataCollatorWithPadding(tokenizer)

Training set: ~350k examples / batch size 32 ≈ 11k steps per epoch.

In [None]:
# Evaluation, checkpointing and logging all run on the same step cadence.
steps_between_checkpoints = 2000

training_args = TrainingArguments(
    # Where things go.
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    report_to="tensorboard",

    # Optimisation.
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=1000,

    # Step-based evaluation / checkpoint / logging schedule.
    eval_strategy="steps",
    eval_steps=steps_between_checkpoints,
    save_strategy="steps",
    save_steps=steps_between_checkpoints,
    logging_strategy="steps",
    logging_steps=steps_between_checkpoints,

    # Checkpoint selection: macro-F1 picks the best model, reloaded at the end.
    metric_for_best_model="f1_macro",
    load_best_model_at_end=True,
    save_total_limit=2,
)

class_weights (TODO): not applied yet — the `Trainer` below uses its default loss.

In [None]:
# Early stopping with a patience of 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

## Train

In [None]:
# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

## Evaluation

In [None]:
# The trainer's log history becomes one DataFrame row per logged entry.
log_history = trainer.state.log_history
df_logs = pd.DataFrame(log_history)

# Keep only the entries that carry an evaluation loss.
has_eval_loss = df_logs['eval_loss'].notna()
df_eval = df_logs[has_eval_loss]
df_eval.head()

In [None]:
plotted_columns = ['step', 'eval_loss', 'eval_f1_weighted', 'eval_f1_macro']
df_to_plot = df_eval.loc[:, plotted_columns]

# Long format: one (step, metric, value) row per point, so seaborn can draw
# one line per metric via `hue`.
df_melted = pd.melt(
    df_to_plot,
    id_vars=['step'],
    var_name='metric',
    value_name='value',
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=df_melted, x='step', y='value', hue='metric', ax=ax)
ax.set_title('Training and Validation Metrics over Steps')
ax.set_xlabel('Training Step')
ax.set_ylabel('Value')
ax.legend(title='Metric')
ax.grid(True)
plt.show()

In [None]:
# Evaluate with no explicit dataset argument; the returned metrics dict is the
# cell output. NOTE(review): presumably this scores the `eval_dataset` passed
# to the Trainer (the validation split) — confirm.
trainer.evaluate()

In [None]:
# Take the class names from the dataset's label feature instead of a
# hand-typed list, so the display order matches the integer label ids.
class_names = test_dataset.features['label'].names

print("Running predictions on test set...")
predictions_output = trainer.predict(test_dataset)

y_pred_logits = predictions_output.predictions
y_true = predictions_output.label_ids

# Predicted class = index of the largest logit.
y_pred = np.argmax(y_pred_logits, axis=-1)
# Pin the label ids so the matrix has one row/column per class even if a class
# never appears in y_true or y_pred. Otherwise the matrix shrinks and no
# longer lines up with `class_names` when it is plotted.
cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(class_names)))

In [None]:
print("Plotting confusion matrix...")
# Rows are true classes, columns are predicted classes.
disp = ConfusionMatrixDisplay(cm, display_labels=class_names)
disp.plot(cmap=plt.cm.Blues)
disp.ax_.set_title('Confusion Matrix on Test Set')
plt.show()

## Saving Model

In [None]:
# Save the trainer's current model to the output directory...
trainer.save_model(repository_id)
# ...and save the tokenizer to the same directory.
tokenizer.save_pretrained(repository_id)

# END