
# Political Fact-Checking with Transformers

This project trains an AI model to classify political statements as **true** or **false**. 
We leverage transformer-based architectures, preprocessing techniques, and multiple classification heads to evaluate performance.

## Workflow Overview
1. **Load and Preprocess Data**: We utilize datasets like LIAR and POLITIFACT.
2. **Model Selection**: Different transformer-based architectures are tested.
3. **Training and Evaluation**: Models are trained and evaluated for accuracy.
4. **Results Analysis**: Performance is compared across different architectures.

---


### Here we test diferent combinations of arquitecture to evaluate and get the best option

## 1. Importing Dependencies
We import PyTorch, transformers, and other essential libraries.

In [1]:
# Standard libraries
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm
import pandas as pd

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Transformers
from transformers import (
    RobertaModel,
    RobertaTokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
    AutoModel,
    AutoTokenizer,
    get_scheduler
)


  from .autonotebook import tqdm as notebook_tqdm
2025-02-18 12:13:20.097472: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739880800.108728   27105 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739880800.111972   27105 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-18 12:13:20.124145: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 2. Dataset Class Definition
We define a PyTorch Dataset class for handling tokenization and labels.

In [2]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, labels, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data[idx]["sentence"]
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            sentence,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

## 3. Data Preprocessing
This function cleans and processes the dataset, mapping labels and removing unnecessary columns.

In [3]:
def data_preprocessing(dataset):

  #turn label either false or true
  dataset['label']=[1 if x=="true"or x=="mostly-true" or x=="half-true" or x=="barely-true" else 0 for x in dataset[1]]

  #Dropping unwanted columns
  dataset = dataset.drop(labels=[0,1,8,9,10,11,12] ,axis=1)
  #Dealing with empty datapoints for metadata columns - subject, speaker, job, state,affiliation, context
  meta = []
  for i in range(len(dataset)):
      subject = dataset[3][i]
      if subject == 0:
          subject = 'None'

      speaker =  dataset[4][i]
      if speaker == 0:
          speaker = 'None'

      job =  dataset[5][i]
      if job == 0:
          job = 'None'

      state =  dataset[6][i]
      if state == 0:
          state = 'None'

      affiliation =  dataset[7][i]
      if affiliation == 0:
          affiliation = 'None'

      context =  dataset[13][i]
      if context == 0 :
          context = 'None'

      meta.append(str(subject) + ' ' + str(speaker) + ' ' + str(job) + ' ' + str(state) + ' ' + str(affiliation) + ' ' + str(context)) #combining all the meta data columns into a single column

  #Adding cleaned and combined metadata column to the dataset
  dataset[14] = meta
  dataset["sentence"] = dataset[14].astype('str')+" "+dataset[2] #Combining metadata and the text columns into single columns

  dataset = dataset.drop([2,3,4,5,6,7,13,14], axis=1) #dropping metadata columns, as we have merged them into a single column
  dataset.dropna() #Dropping if there are still any null values

  return dataset

## 4. Model Architectures
We define different classification heads for the transformer-based model.

In [4]:
# Define different architectures for the classification head
def get_classification_head(hidden_size, head_type="basic", dropout_rate=0.1):
    if head_type == "basic":
        return nn.Linear(hidden_size, 2)  # Simple linear layer
    elif head_type == "mlp":
        return nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, 2)
        )
    elif head_type == "gelu_norm":
        return nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.GELU(),
            nn.LayerNorm(256),
            nn.Dropout(dropout_rate),
            nn.Linear(256, 2)
        )
    else:
        raise ValueError("Unknown head type")

# Define different base models
BASE_MODELS = {
    "bert": "bert-base-uncased",
    "roberta": "roberta-base",
    "distilbert": "distilbert-base-uncased"
}

## 5. Transformer-Based Model
We create a model that integrates a transformer backbone with the classification head.

In [5]:
class TransformerClassifier(nn.Module):
    def __init__(self, model_name, head_type="basic", dropout_rate=0.1):
        super(TransformerClassifier, self).__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        hidden_size = self.transformer.config.hidden_size
        self.classifier = get_classification_head(hidden_size, head_type, dropout_rate)

    def forward(self, input_ids, attention_mask):
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        return self.classifier(pooled_output)

## 6. Hyperparameters
Setting up hyperparameters like batch size, learning rate, and device configuration.

In [6]:
# Define hyperparameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 3
batch_size = 16
learning_rate = 2e-5

## 7. Loading Datasets
We load and combine multiple fact-checking datasets for training and evaluation.

In [7]:

import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Loading the data

# Load the LIAR dataset from TSV
df1 = pd.read_csv(os.path.join('../Datasets','LIAR.tsv'), sep="\t", header=None)
# Load the POLITIFACT dataset
df2 = pd.read_json(os.path.join('../Datasets','politifact_factcheck_data.json'), lines=True)

#---------------Preprocess LIAR dataset----------------------
df1 = data_preprocessing(df1)

#---------------Preprocess Politifact dataset----------------------
df2["sentence"] = df2["statement_originator"] + " said: " + df2["statement"] + " (" + df2["statement_date"] + " via " + df2["statement_source"] + ")"

# Encode labels: 1 for true, mostly-true, half-true, barely-true; 0 otherwise
df2['label'] = df2['verdict'].apply(lambda x: 1 if x in ['true', 'mostly-true', 'half-true', 'barely-true'] else 0)

# Handle metadata and combine into a single column
meta_columns = ["statement_originator", "statement_date", "statement_source"]
df2['metadata'] = df2[meta_columns].fillna('None').astype(str).agg(' '.join, axis=1)

# Create sentence column combining metadata and statement
df2["sentence"] = df2["metadata"] + " " + df2["statement"]

# Drop original columns (optional)
df2 = df2.drop(columns=meta_columns + ["statement", "verdict","metadata","factcheck_analysis_link","factchecker", "factcheck_date"])


df = pd.concat([df1, df2])
df.fillna("None", inplace=True)
df = df.dropna()


#split data
train_sentences, val_sentences, train_labels, val_labels = train_test_split(df["sentence"].tolist(), df["label"].tolist(), test_size=0.2)

train_data = [{"sentence" : stm, "label" : vrd} for stm, vrd in zip(train_sentences, train_labels)]
val_data = [{"sentence" : stm, "label" : vrd} for stm, vrd in zip(val_sentences, val_labels)]

print("Train data size:", len(train_data))
print("train data", train_data[0])




Train data size: 27154
train data {'sentence': 'candidates-biography,city-government,education,state-finances,taxes scott-walker Milwaukee County Executive Wisconsin republican a speech Says because of his actions, Wisconsin property taxes today are lower than they were four years ago.', 'label': 1}


## 8. Training and Evaluation
Training multiple transformer models and evaluating their performance.

In [8]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

# Train and evaluate different models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

results = {}
best_model = None  # To store the best model
best_f1_score = 0  # Initialize the best F1 score

for model_key, model_name in BASE_MODELS.items():
    for head_type in ["basic", "mlp", "gelu_norm"]:
        print(f"Training {model_key} with {head_type} head...")

        # Tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Create DataLoaders, from pytorch
        train_dataset = CustomDataset(train_data, tokenizer, train_labels)
        val_dataset = CustomDataset(val_data, tokenizer, val_labels)
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

        model = TransformerClassifier(model_name, head_type).to(device)
        optimizer = AdamW(model.parameters(), lr=learning_rate)
        scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs)
        criterion = nn.CrossEntropyLoss()

        model.train()
        for epoch in range(epochs):
            total_loss = 0
            all_preds = []
            all_labels = []

            # Adding a progress bar for the training loop using tqdm
            for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}", unit="batch"):
                optimizer.zero_grad()

                # Move data to device
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                # Forward pass
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)

                # Calculate loss
                loss = criterion(outputs, labels)

                # Backward pass and optimization
                loss.backward()
                optimizer.step()
                scheduler.step()

                total_loss += loss.item()

                # Store predictions and true labels for evaluation
                _, preds = torch.max(outputs, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

            # Compute metrics after each epoch
            accuracy = accuracy_score(all_labels, all_preds)
            precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

            print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_dataloader):.4f}")
            print(f"Epoch {epoch+1}/{epochs}, Accuracy: {accuracy:.4f}")
            print(f"Epoch {epoch+1}/{epochs}, Precision: {precision:.4f}")
            print(f"Epoch {epoch+1}/{epochs}, Recall: {recall:.4f}")
            print(f"Epoch {epoch+1}/{epochs}, F1 Score: {f1:.4f}")

            # Save the best model based on F1 score
            if f1 > best_f1_score:
                best_f1_score = f1
                best_model = model.state_dict()  # Save the model's state_dict

        # Save results for this model and head type
        results[(model_key, head_type)] = {
            "loss": total_loss / len(train_dataloader),
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1
        }

# Save the best model
if best_model is not None:
    torch.save(best_model, "best_model.pth")
    print("\nBest model saved with F1 score:", best_f1_score)

# Print comparison
print("\nModel Comparison Results:")
for key, metrics in results.items():
    print(f"{key}: Loss {metrics['loss']:.4f}, Accuracy {metrics['accuracy']:.4f}, Precision {metrics['precision']:.4f}, Recall {metrics['recall']:.4f}, F1 Score {metrics['f1']:.4f}")


Device: cuda
Training bert with basic head...


Epoch 1/3: 100%|██████████| 1698/1698 [05:08<00:00,  5.51batch/s]


Epoch 1/3, Loss: 0.5603
Epoch 1/3, Accuracy: 0.7077
Epoch 1/3, Precision: 0.7109
Epoch 1/3, Recall: 0.7077
Epoch 1/3, F1 Score: 0.7011


Epoch 2/3: 100%|██████████| 1698/1698 [05:29<00:00,  5.15batch/s]


Epoch 2/3, Loss: 0.4589
Epoch 2/3, Accuracy: 0.7789
Epoch 2/3, Precision: 0.7795
Epoch 2/3, Recall: 0.7789
Epoch 2/3, F1 Score: 0.7771


Epoch 3/3: 100%|██████████| 1698/1698 [05:29<00:00,  5.16batch/s]


Epoch 3/3, Loss: 0.2966
Epoch 3/3, Accuracy: 0.8716
Epoch 3/3, Precision: 0.8715
Epoch 3/3, Recall: 0.8716
Epoch 3/3, F1 Score: 0.8714
Training bert with mlp head...


Epoch 1/3: 100%|██████████| 1698/1698 [05:36<00:00,  5.04batch/s]


Epoch 1/3, Loss: 0.5586
Epoch 1/3, Accuracy: 0.7090
Epoch 1/3, Precision: 0.7127
Epoch 1/3, Recall: 0.7090
Epoch 1/3, F1 Score: 0.7020


Epoch 2/3:  43%|████▎     | 732/1698 [02:25<03:11,  5.05batch/s]


KeyboardInterrupt: 

# 9. Save the model

In [None]:
# Define save path
save_dir = "../models"
os.makedirs(save_dir, exist_ok=True)

# Save the trained model
model_save_path = os.path.join(save_dir, "fact_checker_model.pt")
torch.save(best_model.state_dict(), model_save_path)

# Save the tokenizer
tokenizer_save_path = os.path.join(save_dir, "tokenizer")
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model saved at {model_save_path}")
print(f"Tokenizer saved at {tokenizer_save_path}")

# 10 Simulate real world usage

In [None]:
def load_model(model_path, model_name, head_type="basic"):
    model = TransformerClassifier(model_name, head_type)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    return model

def classify_statement(statement, model, tokenizer, max_length=128):
    # Tokenize input
    inputs = tokenizer(statement, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Get model prediction
    with torch.no_grad():
        output = model(**inputs)
        prediction = torch.argmax(output, dim=1).item()

    return "True" if prediction == 1 else "False"

# Load saved model and tokenizer
loaded_model = load_model("../models/fact_checker_model.pt", model_name)
loaded_tokenizer = tokenizer.from_pretrained("../models/tokenizer")

# Test with an example statement
test_statement = "The government increased healthcare funding by 20% this year."
print(f"Statement: '{test_statement}'")
print(f"Prediction: {classify_statement(test_statement, loaded_model, loaded_tokenizer)}")
