# Imports


In [1]:
import torch
import os
import evaluate
from tqdm import tqdm
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from transformers import BertTokenizer, BertConfig , BertModel, DistilBertModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score
import torch.nn.functional as F
import torch.nn as nn
from transformers import AutoModel, pipeline
from datasets import Dataset

# Setup logging
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Set the device and load the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


  from .autonotebook import tqdm as notebook_tqdm
2025-02-18 12:44:46.328680: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739882686.394971   52005 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739882686.413570   52005 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-18 12:44:46.524893: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Agrument Analyses
This part load dataset will be use to train and test BERT

This dataset was also created by our `llama_week_labeling.py` script.

We use dataset from [`US Election 2020 - Presidential Debates`](https://www.kaggle.com/datasets/headsortails/us-election-2020-presidential-debates) collection and start to create the labels:
- `Restatement`: The second statement restates or reinforces the first.
- `Counterargument`: The second statement opposes the first.
- `Neutral`: No clear relationship between the statements.

In [2]:
# Define the labels
label2id = {"Neutral": 0, "Counterargument": 1, "Restatement": 2}
id2label = {v: k for k, v in label2id.items()}  # Reverse mapping

# Load datasets for training
df_agrument_1=pd.read_csv(os.path.join('../Datasets','us_debates','agrument','1st_presidential_agrument.csv'))
# Only one speaker
df_agrument_2=pd.read_csv(os.path.join('../Datasets','us_debates','agrument','trump_town_hall_agrument.csv'))

# Map the labels
df_agrument_1["label_map"]=df_agrument_1["label"].map(label2id)
df_agrument_2["label_map"]=df_agrument_2["label"].map(label2id)

# Print some dataset information
df_agrument_1.info()
df_agrument_1.head()
df_agrument_2.info()
df_agrument_2.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 745 entries, 0 to 744
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   speaker1    745 non-null    object
 1   statement1  745 non-null    object
 2   speaker2    745 non-null    object
 3   statement2  745 non-null    object
 4   label       745 non-null    object
 5   label_map   745 non-null    int64 
dtypes: int64(1), object(5)
memory usage: 35.1+ KB


Unnamed: 0,speaker1,statement1,speaker2,statement2,label,label_map
0,Vice President Joe Biden,"How you doing, man?",President Donald J. Trump,How are you doing?,Neutral,0
1,Vice President Joe Biden,"How you doing, man?",President Donald J. Trump,"Thank you very much, Chris. I will tell you ve...",Counterargument,1
2,Vice President Joe Biden,I’m well.,President Donald J. Trump,How are you doing?,Neutral,0
3,Vice President Joe Biden,I’m well.,President Donald J. Trump,"Thank you very much, Chris. I will tell you ve...",Neutral,0
4,Vice President Joe Biden,I’m well.,President Donald J. Trump,And we won the election and therefore we have ...,Counterargument,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416 entries, 0 to 415
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   speaker1    416 non-null    object
 1   statement1  416 non-null    object
 2   speaker2    416 non-null    object
 3   statement2  416 non-null    object
 4   label       416 non-null    object
 5   label_map   416 non-null    int64 
dtypes: int64(1), object(5)
memory usage: 19.6+ KB


Unnamed: 0,speaker1,statement1,speaker2,statement2,label,label_map
0,Savannah Guthrie,"It’s nothing but noise. What? Okay. All right,...",President Trump,"I’m feeling great, I don’t know about you. How...",Neutral,0
1,Savannah Guthrie,"It’s nothing but noise. What? Okay. All right,...",President Trump,"It’s great to be back in my home state, Florid...",Counterargument,1
2,Savannah Guthrie,"Tonight, Donald Trump in the arena. His first ...",President Trump,"I’m feeling great, I don’t know about you. How...",Neutral,0
3,Savannah Guthrie,"Tonight, Donald Trump in the arena. His first ...",President Trump,"It’s great to be back in my home state, Florid...",Neutral,0
4,Savannah Guthrie,"Tonight, Donald Trump in the arena. His first ...",President Trump,My goal is to fight for you and fight for your...,Neutral,0


In [3]:
# Concat the 2 early datasets
df_trainer_final = pd.concat([df_agrument_1, df_agrument_2])
df_trainer_final.head(10)
df_trainer_final.info()

Unnamed: 0,speaker1,statement1,speaker2,statement2,label,label_map
0,Vice President Joe Biden,"How you doing, man?",President Donald J. Trump,How are you doing?,Neutral,0
1,Vice President Joe Biden,"How you doing, man?",President Donald J. Trump,"Thank you very much, Chris. I will tell you ve...",Counterargument,1
2,Vice President Joe Biden,I’m well.,President Donald J. Trump,How are you doing?,Neutral,0
3,Vice President Joe Biden,I’m well.,President Donald J. Trump,"Thank you very much, Chris. I will tell you ve...",Neutral,0
4,Vice President Joe Biden,I’m well.,President Donald J. Trump,And we won the election and therefore we have ...,Counterargument,1
5,Vice President Joe Biden,"Well, first of all, thank you for doing this a...",President Donald J. Trump,"Thank you very much, Chris. I will tell you ve...",Neutral,0
6,Vice President Joe Biden,"Well, first of all, thank you for doing this a...",President Donald J. Trump,And we won the election and therefore we have ...,Counterargument,1
7,Vice President Joe Biden,"Well, first of all, thank you for doing this a...",President Donald J. Trump,"Thank you, Joe.",Neutral,0
8,Vice President Joe Biden,The American people have a right to have a say...,President Donald J. Trump,And we won the election and therefore we have ...,Counterargument,1
9,Vice President Joe Biden,The American people have a right to have a say...,President Donald J. Trump,"Thank you, Joe.",Neutral,0


<class 'pandas.core.frame.DataFrame'>
Index: 1161 entries, 0 to 415
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   speaker1    1161 non-null   object
 1   statement1  1161 non-null   object
 2   speaker2    1161 non-null   object
 3   statement2  1161 non-null   object
 4   label       1161 non-null   object
 5   label_map   1161 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 63.5+ KB


## Training and testing
We decided to use `distilbert-base-uncased` model with the following train hyperparamenters

In [4]:
#Define our models hyperparameters
bert_model_name = 'distilbert-base-uncased' # smaller bert model
num_classes = 6
max_length = 128
batch_size = 8
num_epochs = 10
learning_rate = 2e-5
warmup_steps=500  # number of warmup steps for learning rate scheduler

In [5]:
# Compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    report = classification_report(labels, predictions, output_dict=True)
    
    # Make last report and last accuracy a global variable
    logger.info("\n📊 Classification Report:\n%s", classification_report(labels, predictions, digits=4))
    logger.info(f"Validation Accuracy: {report["accuracy"]:.4f}")

    classification_report_metrics = {
        "accuracy": report["accuracy"],
        "precision": report["weighted avg"]["precision"],
        "recall": report["weighted avg"]["recall"],
        "f1": report["weighted avg"]["f1-score"],
    }
    return classification_report_metrics

In [None]:
def tokenize_function(examples):
    return tokenizer(examples["statement1"], examples["statement2"], padding="max_length", truncation=True, return_tensors="pt")

def train_agrument_clasifier(model, statement1_train_list: list, statement2_train_list: list, labels_list: list, tokenizer):

    x = list(zip(statement1_train_list, statement2_train_list))
    y = labels_list

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.35, stratify=y)

    # Do a split with stratify to preserve class distribution
    train_data = Dataset.from_dict({
        'statement1': [t[0] for t in x_train],
        'statement2': [t[1] for t in x_train],
        'label': [int(label) for label in y_train]
    })

    test_data = Dataset.from_dict({
        'statement1': [t[0] for t in x_test],
        'statement2': [t[1] for t in x_test],
        'label': [int(label) for label in y_test]
    })

    train_data = train_data.map(tokenize_function, batched=True)
    test_data = test_data.map(tokenize_function, batched=True)

    # Remove original text columns (keep only tokenized inputs)
    train_data = train_data.remove_columns(["statement1", "statement2"])
    test_data = test_data.remove_columns(["statement1", "statement2"])
    
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        save_total_limit=2,  # limit the total amount of checkpoints, delete the older checkpoints
        eval_steps=100, # Perform evaluation every 100 steps
        save_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        metric_for_best_model="precision",  # Metric to use for selecting the best model
        greater_is_better=True,  # Whether a higher value of the metric is better
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,  # training data
        eval_dataset=test_data,  # evaluation data
        compute_metrics=compute_metrics
    )

    trainer.train()
    trainer.save_model("../models/distilbert_agrument_classifier")
    tokenizer.save_pretrained("../models/distilbert_agrument_classifier")

    # Reload with new model
    model = AutoModelForSequenceClassification.from_pretrained("../models/distilbert_agrument_classifier").to(device)
    tokenizer = BertTokenizer.from_pretrained("../models/distilbert_agrument_classifier")

In [8]:
# Load default model
config = BertConfig.from_pretrained(bert_model_name, num_labels=len(label2id), label2id=label2id, id2label=id2label)
model = AutoModelForSequenceClassification.from_pretrained(bert_model_name, config=config).to(device)
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Training
statement1_train_list=df_trainer_final["statement1"].to_list()
statement2_train_list=df_trainer_final["statement2"].to_list()
labels_list=df_trainer_final["label_map"].to_list()

train_agrument_clasifier(model, statement1_train_list, statement2_train_list, labels_list, tokenizer)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'enc

{'loss': 0.592, 'grad_norm': 19.568687438964844, 'learning_rate': 2e-05, 'epoch': 5.26}


100%|██████████| 950/950 [06:16<00:00,  2.52it/s]


{'train_runtime': 376.9641, 'train_samples_per_second': 20.002, 'train_steps_per_second': 2.52, 'train_loss': 0.4909568224455181, 'epoch': 10.0}


### Evaluate models perfomance
To evaluate the model we gonna give some agrument pre classified by llama:

In [9]:
# Function to test the sentiment of a text
nli_model = pipeline("text-classification", model="../models/distilbert_agrument_classifier", device=device)

def test_model(statement1, statement2, model, tokenizer):
    input_text=f"{statement1} </s></s> {statement2}"
    model_result=model(input_text, truncation=True)
    print(f"Classification is {model_result[0]['label']}")

# Test sentiment prediction
test_model("The deal is that it’s going to wipe out pre-existing conditions. And, by the way, the 200,000 people that have died on his watch, how many of those have survived? Well, there’s seven million people that contracted COVID."
           ,"And if you were here, it wouldn’t be 200, it would be two million people because you were very late on the draw. You didn’t want me to ban China, which was heavily infected. You didn’t want me to ban Europe", 
           nli_model, tokenizer) # Counterargument
test_model("People want to be safe.","Those states are not doing well that are shut down right now.", nli_model, tokenizer) # Counterargument
test_model("People want to be safe.", "Because it’s a political thing.", nli_model, tokenizer) # Neutral

Classification is Counterargument
Classification is Counterargument
Classification is Neutral
