# Imports


In [8]:
import torch
import os
import evaluate
from tqdm import tqdm
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from transformers import BertTokenizer, BertConfig , BertModel, DistilBertModel, AutoModelForSequenceClassification, BertForNextSentencePrediction, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score
import torch.nn.functional as F
import torch.nn as nn
from transformers import AutoModel
from sklearn.metrics import f1_score
from datasets import Dataset

# Setup logging
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Set the device and load the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Fact and Agrument Classification

This part load dataset will be use to train and test BERT

This dataset was created by our `llama_week_labeling.py` script using `unsloth/llama-3-8b-Instruct-bnb-4bit` model running locally

This dataset need to be created using a week labeling, in this case using llama text generation model, since we can't find usable dataset. 

We use dataset from [`US Election 2020 - Presidential Debates`](https://www.kaggle.com/datasets/headsortails/us-election-2020-presidential-debates) collection and start to create the labels

In [9]:
# Load datasets
df_fact_agrument_1=pd.read_csv(os.path.join('../Datasets','us_debates','fact-agrument','1st_presidential_fact_agument.csv'))
df_fact_agrument_2=pd.read_csv(os.path.join('../Datasets','us_debates','fact-agrument','vice_presidential_fact_agrument.csv'))

# Some info about each dataset
df_fact_agrument_1.head(10)
df_fact_agrument_1.info()
df_fact_agrument_1['label'].unique()

df_fact_agrument_2.head(10)
df_fact_agrument_2.info()
df_fact_agrument_2['label'].unique()



Unnamed: 0,speaker,statement,label
0,Vice President Joe Biden,"How you doing, man?",Argument
1,Vice President Joe Biden,I’m well.,Argument
2,Vice President Joe Biden,"Well, first of all, thank you for doing this a...",Argument
3,Vice President Joe Biden,The American people have a right to have a say...,Argument
4,Vice President Joe Biden,"Now, what’s at stake here is the President’s m...",Argument
5,Vice President Joe Biden,"And that ended when we, in fact, passed the Af...",Argument
6,Vice President Joe Biden,He’s elected to the next election.,Argument
7,Vice President Joe Biden,That’s simply not true.,Argument
8,Vice President Joe Biden,Open discussion.,Argument
9,Vice President Joe Biden,"Number one, he knows what I proposed. What I p...",Argument


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 563 entries, 0 to 562
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   speaker    563 non-null    object
 1   statement  563 non-null    object
 2   label      563 non-null    object
dtypes: object(3)
memory usage: 13.3+ KB


array(['Argument', 'Fact'], dtype=object)

Unnamed: 0,speaker,statement,label
0,Kamala Harris,"Thank you, Susan. Well, the American people ha...",Argument
1,Kamala Harris,"Can you imagine if you knew on January 28th, a...",Argument
2,Kamala Harris,… right to reelection based on this.,Argument
3,Kamala Harris,Absolutely. Whatever the vice president is cla...,Argument
4,Kamala Harris,"No. But Susan, this is important. And I want t...",Argument
5,Kamala Harris,"Mr. Vice President, I’m speaking.",Argument
6,Kamala Harris,I’m speaking.,Argument
7,Kamala Harris,Thank you. So I want to ask the American peopl...,Argument
8,Kamala Harris,… when your children-,Argument
9,Kamala Harris,… couldn’t see your parents because you were a...,Argument


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   speaker    157 non-null    object
 1   statement  157 non-null    object
 2   label      157 non-null    object
dtypes: object(3)
memory usage: 3.8+ KB


array(['Argument', 'Fact'], dtype=object)

We also gonna include the `Politifact Fact Check` since the debate we get are more agruments that facts.

We need to create a new dataset from this one with the same format as the one above, and concat the 3

In [10]:
df_polifact = pd.read_json(os.path.join('../Datasets','politifact_factcheck_data.json'), lines=True)
# Cut dataset to randomly 1300 rows, to prevent to many facts
df_polifact = df_polifact.sample(n=1300, random_state=42)

# Dataset for training
df_trainer_final=pd.DataFrame(columns=["speaker", "statement", "label"])

# We add the speaker but is not needed 
# (the speaker information was not use to prevent biases)
df_trainer_final["speaker"]=df_polifact["statement_originator"]
df_trainer_final["statement"]=df_polifact["statement"]
df_trainer_final["label"]="Fact"

# Concat with the 2 early datasets and shuffe dataset
df_trainer_final = pd.concat([df_trainer_final, df_fact_agrument_1, df_fact_agrument_2])
df_trainer_final = df_trainer_final.sample(frac=1, random_state=50).reset_index(drop=True)

# At last we gonna map the lables
label2id = {"Fact": 0, "Argument": 1}
id2label = {v: k for k, v in label2id.items()}  # Reverse mapping
df_trainer_final["label_map"]=df_trainer_final["label"].map(label2id)

df_trainer_final.head(10)
df_trainer_final.info()

Unnamed: 0,speaker,statement,label,label_map
0,Mike Pence,And I’m going to speak up on behalf of what th...,Argument,1
1,Barack Obama,"""Already we've identified $2 trillion in defic...",Fact,0
2,Kamala Harris,… for the Recovery Act that brought America ba...,Argument,1
3,Vice President Joe Biden,"Because you in fact passed that, that was your...",Argument,1
4,Vice President Joe Biden,Mr. Vice-,Fact,0
5,David Beckham,"""Swaziland has the highest rate of HIV infecti...",Fact,0
6,Facebook posts,Quotes Mike Pence as saying that people with p...,Fact,0
7,Cindy O’Laughlin,"""When comparing state by state, the data clear...",Fact,0
8,Barack Obama,"""We've recovered (from the recession) faster a...",Fact,0
9,President Donald J. Trump,"A lot of people, between drugs and alcohol and...",Argument,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   speaker    2020 non-null   object
 1   statement  2020 non-null   object
 2   label      2020 non-null   object
 3   label_map  2020 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 63.3+ KB


## Training and testing
We decided to use `distilbert-base-uncased` model with the following train hyperparamenters

In [11]:
#Define our models hyperparameters
bert_model_name = 'distilbert-base-uncased' # smaller bert model
num_classes = 6
max_length = 128
batch_size = 8
num_epochs = 10
learning_rate = 2e-5
warmup_steps=500  # number of warmup steps for learning rate scheduler

In [12]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    report = classification_report(labels, predictions, output_dict=True)
    
    # Make last report and last accuracy a global variable
    logger.info("\n📊 Classification Report:\n%s", classification_report(labels, predictions, digits=4))
    logger.info(f"Validation Accuracy: {report["accuracy"]:.4f}")

    classification_report_metrics = {
        "accuracy": report["accuracy"],
        "precision": report["weighted avg"]["precision"],
        "recall": report["weighted avg"]["recall"],
        "f1": report["weighted avg"]["f1-score"],
    }
    return classification_report_metrics

In [15]:
def train_fact_agrument_clasifier(model, statement_list: list, labels_list: list, tokenizer):

    def tokenize_function(examples):
        return tokenizer(examples["statement"], padding="max_length", truncation=True, return_tensors="pt")

    # Do a split with stratify to preserve class distribution
    x_train, x_test, y_train, y_test = train_test_split(statement_list, labels_list, test_size=0.35, stratify=labels_list)

    # Create new train and test dataset
    train_data = Dataset.from_dict({
        'statement': [t[0] for t in x_train],
        'label': [int(label) for label in y_train]
    })

    test_data = Dataset.from_dict({
        'statement': [t[0] for t in x_test],
        'label': [int(label) for label in y_test]
    })

    # Map the train and text dataset with the tokeneizer
    train_data = train_data.map(tokenize_function, batched=True)
    test_data = test_data.map(tokenize_function, batched=True)

    # Remove original text columns (keep only tokenized inputs)
    train_data = train_data.remove_columns(["statement"])
    test_data = test_data.remove_columns(["statement"])
    
    # Define training agruments
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        save_total_limit=2,  # limit the total amount of checkpoints, delete the older checkpoints
        eval_steps=100, # Perform evaluation every 100 steps
        save_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        metric_for_best_model="precison",  # Metric to use for selecting the best model
        greater_is_better=True,  # Whether a higher value of the metric is better
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,  # training data
        eval_dataset=test_data,  # evaluation data
        compute_metrics=compute_metrics
    )

    trainer.train()
    trainer.save_model("../models/distilbert_fact_agrument_classifier")
    tokenizer.save_pretrained("../models/distilbert_fact_agrument_classifier")

    # Reload with new model
    model = AutoModelForSequenceClassification.from_pretrained("../models/distilbert_fact_agrument_classifier").to(device)
    tokenizer = BertTokenizer.from_pretrained("../models/distilbert_fact_agrument_classifier")

In [16]:
# Load default model
config = BertConfig.from_pretrained(bert_model_name, num_labels=len(label2id), label2id=label2id, id2label=id2label)
model = AutoModelForSequenceClassification.from_pretrained(bert_model_name, config=config).to(device)
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Training
statement_train_list=df_trainer_final["statement"].to_list()
labels_list=df_trainer_final["label_map"].to_list()

train_fact_agrument_clasifier(model, statement_train_list, labels_list, tokenizer)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'enc

{'loss': 0.4786, 'grad_norm': 17.310762405395508, 'learning_rate': 2e-05, 'epoch': 3.03}


 61%|██████    | 1000/1650 [06:50<04:38,  2.34it/s]
 61%|██████    | 1000/1650 [06:50<04:38,  2.34it/s]

{'loss': 0.3842, 'grad_norm': 2.7215335369110107, 'learning_rate': 1.1304347826086957e-05, 'epoch': 6.06}


 91%|█████████ | 1500/1650 [10:23<01:02,  2.41it/s]
 91%|█████████ | 1500/1650 [10:23<01:02,  2.41it/s]

{'loss': 0.3543, 'grad_norm': 3.4246113300323486, 'learning_rate': 2.6086956521739132e-06, 'epoch': 9.09}


100%|█████████▉| 1649/1650 [11:24<00:00,  2.45it/s]
100%|██████████| 1650/1650 [11:30<00:00,  2.39it/s]


{'train_runtime': 690.4151, 'train_samples_per_second': 19.018, 'train_steps_per_second': 2.39, 'train_loss': 0.400108115456321, 'epoch': 10.0}


### Evaluate models perfomance
To evaluate the model we gonna give some text samples from polifact website and some text from the debates dataset

In [None]:
# Reload new model
model = AutoModelForSequenceClassification.from_pretrained("../models/distilbert_fact_agrument_classifier").to(device)

# Function to test the sentiment of a text
def test_model(text, model, tokenizer):
    labels = {0: "Fact", 1: "Argument"}
    # Tokenize the text and add padding/truncation
    encoding = tokenizer(text, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    # Make the model prediction
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1).item()  # Convert logits to predicted class index

    # Output the prediction label based on the model's class index
    predicted_label = labels[predictions]  # Map the predicted index to the label
    print(f"Sentiment for text is {predicted_label}")


# Test sentiment prediction
test_model("The poverty rate decreased by 3% in the last two years", model, tokenizer) # Fact
test_model("FEMA sent $59M LAST WEEK to luxury hotels in New York City to house illegal migrants… That money is meant for American disaster relief.", model, tokenizer) # Agrument
test_model("But many people are catching it. Many people are getting this disease that was sent to us by China, and it shouldn’t have been allowed to happen.", model, tokenizer) # Agrument

Sentiment for text is Fact
Sentiment for text is Argument
Sentiment for text is Argument
