# Imports


In [None]:
import torch
from tqdm import tqdm
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from transformers import BertTokenizer, BertConfig , BertModel, DistilBertModel, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score
import torch.nn.functional as F
import torch.nn as nn
from transformers import AutoModel


  from .autonotebook import tqdm as notebook_tqdm
2025-02-15 19:03:06.927318: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739646186.940205  333389 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739646186.944173  333389 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-15 19:03:06.960520: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Fact Analyses
We need to classifie agruments as `true` or `false`.

For that we will use the `Politifact Fact Check` and `Snopes Fact-news Data` datasets from the EDA

## Define Data Processing
This defines a Custom Dataset class to treat our data for Fact Checking

In [None]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, labels, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data[idx]["sentence"]
        label = self.labels[idx]
        encoding = self.tokenizer(
            sentence,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

### Pre Processing Data function
We frist start to tranform all the labels in true or false.

Only the labels `true` and `mostly-true` are considered as `true: 1` and the other labels are considered `false: 0`.

One area of improvement in the future, is to use the other labels provided by the datasets of the websites (or even create custom ones).

We also remove all the unwanted colums and add `None` to empty values of the remaining columns.

In [2]:
def data_preprocessing(dataset):
  
  #turn label either false or true
  dataset['label']=[1 if x=="true"or x=="mostly-true" or x=="half-true" else 0 for x in dataset[1]] 

  #Dropping unwanted columns
  dataset = dataset.drop(labels=[0,1,8,9,10,11,12] ,axis=1)
  #Dealing with empty datapoints for metadata columns - subject, speaker, job, state,affiliation, context
  meta = []
  for i in range(len(dataset)):
      subject = dataset[3][i]
      if subject == 0:
          subject = 'None'

      speaker =  dataset[4][i]
      if speaker == 0:
          speaker = 'None'

      job =  dataset[5][i]
      if job == 0:
          job = 'None'

      state =  dataset[6][i]
      if state == 0:
          state = 'None'

      affiliation =  dataset[7][i]
      if affiliation == 0:
          affiliation = 'None'

      context =  dataset[13][i]
      if context == 0 :
          context = 'None'

      meta.append(str(subject) + ' ' + str(speaker) + ' ' + str(job) + ' ' + str(state) + ' ' + str(affiliation) + ' ' + str(context)) #combining all the meta data columns into a single column
  
  #Adding cleaned and combined metadata column to the dataset
  dataset[14] = meta
  dataset["sentence"] = dataset[14].astype('str')+" "+dataset[2] #Combining metadata and the text columns into single columns

  dataset = dataset.drop([2,3,4,5,6,7,13,14], axis=1) #dropping metadata columns, as we have merged them into a single column
  dataset.dropna() #Dropping if there are still any null values

  return dataset

## Defining BERT Train functions
- Define BERT Model with classification head
- Define the function to train the model
- Define evaluate method
- Define function to predict a verdict


In [None]:
import torch.nn as nn
from transformers import AutoModel
from transformers import (
    BertForSequenceClassification,    
    BertTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    AdamW)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.preprocessing import label_binarize

# Classification Head
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name):
        super(BERTClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, 2)  # Binary classification
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        x = self.fc(x)
        return self.softmax(x)


# Train Function
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    progress_bar = tqdm(data_loader, desc="Training", leave=True)
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

# Evaluation Function
def evaluate(model, data_loader,device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

# Label predict Function
def predict_label(text, model, tokenizer, device, label_encoder, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        pred_class = torch.argmax(outputs, dim=1).item()

    return label_encoder.inverse_transform([pred_class])[0]  # Return actual label
    

## Training and testing
We decided to use `distilbert-base-uncased` model with the following train paramenters

**Jaime uma pequena explicaçao porque os usaste estes parametros e o destilbert**

In [4]:
#Define our models hyperparameters
bert_model_name = 'distilbert-base-uncased'
num_classes = 6
max_length = 128
batch_size = 8
num_epochs = 10
learning_rate = 2e-5

### Combining Both DS

In [5]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Loading the data

# Load the LIAR dataset from TSV
#df1 = pd.read_csv(os.path.join('Datasets','LIAR.tsv'), sep="\t", header=None)
# Load the POLITIFACT dataset
df2 = pd.read_json(os.path.join('Datasets','politifact_factcheck_data.json'), lines=True)

#---------------Preprocess LIAR dataset----------------------
#df1 = data_preprocessing(df1)

#---------------Preprocess Politifact dataset----------------------
df2["sentence"] = df2["statement_originator"] + " said: " + df2["statement"] + " (" + df2["statement_date"] + " via " + df2["statement_source"] + ")"

# Encode labels: 1 for true, mostly-true, half-true, barely-true; 0 otherwise
df2['label'] = df2['verdict'].apply(lambda x: 1 if x in ['true', 'mostly-true'] else 0)

# Handle metadata and combine into a single column
meta_columns = ["statement_originator", "statement_date", "statement_source"]
df2['metadata'] = df2[meta_columns].fillna('None').astype(str).agg(' '.join, axis=1)

# Create sentence column combining metadata and statement
df2["sentence"] = df2["metadata"] + " " + df2["statement"]

# Drop original columns (optional)
df2 = df2.drop(columns=meta_columns + ["statement", "verdict","metadata","factcheck_analysis_link","factchecker", "factcheck_date"])


#df = pd.concat([df1, df2])
df = df2
df.fillna("None", inplace=True)
df = df.dropna()

#split data
train_sentences, val_sentences, train_labels, val_labels = train_test_split(df["sentence"].tolist(), df["label"].tolist(), test_size=0.2)	

train_data = [{"sentence" : stm, "label" : vrd} for stm, vrd in zip(train_sentences, train_labels)]
val_data = [{"sentence" : stm, "label" : vrd} for stm, vrd in zip(val_sentences, val_labels)]

print("Train data size:", len(train_data))
print("train data", train_data[0])




Train data size: 16921
train data {'sentence': 'Marco Rubio 12/17/2014 other "The reason why Cubans don\'t have access to 21st century telecommunications — like smart phones, like access to the Internet — is because it is illegal in Cuba."', 'label': 1}


In [6]:
df.head(500)

Unnamed: 0,sentence,label
0,Barack Obama 6/11/2008 speech John McCain oppo...,1
1,"Matt Gaetz 6/7/2022 television ""Bennie Thompso...",0
2,Kelly Ayotte 5/18/2016 news Says Maggie Hassan...,1
3,"Bloggers 2/1/2021 blog ""BUSTED: CDC Inflated C...",0
4,"Bobby Jindal 8/30/2015 television ""I'm the onl...",0
...,...,...
495,"Dana Young 10/20/2016 other Says she ""voted fo...",0
496,"Viral image 1/1/2021 social_media ""President-e...",0
497,Michael MacDonald 3/24/2021 statement Michigan...,0
498,Vincent Fort 7/6/2010 other Says Democratic op...,0


In [7]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)

# Create DataLoaders, from pytorch (this gives me an error)
train_dataset = CustomDataset(train_data, tokenizer,train_labels)
val_dataset = CustomDataset(val_data, tokenizer,val_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Set the device and load the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name).to(device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


### Training and save the model


In [None]:
# Optimizer and scheduler
# AdamW is a class from the huggingface library
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Total number of training steps, being the number of batches * number of epochs
total_steps = len(train_dataloader) * num_epochs 
num_warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=total_steps)


for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train(model, train_dataloader, optimizer, scheduler, device)
        accuracy, report = evaluate(model, val_dataloader,device)
        print(f"Validation Accuracy: {accuracy:.4f}")
        print(report)

torch.save(model.state_dict(), os.path.join("models","distilbert_fact_classifier.pth"))



Epoch 1/10


Training: 100%|██████████| 2116/2116 [01:56<00:00, 18.18it/s]


Validation Accuracy: 0.7455
              precision    recall  f1-score   support

           0       0.76      0.94      0.84      3050
           1       0.62      0.23      0.34      1181

    accuracy                           0.75      4231
   macro avg       0.69      0.59      0.59      4231
weighted avg       0.72      0.75      0.70      4231

Epoch 2/10


Training: 100%|██████████| 2116/2116 [02:02<00:00, 17.32it/s]


Validation Accuracy: 0.7317
              precision    recall  f1-score   support

           0       0.80      0.84      0.82      3050
           1       0.52      0.45      0.49      1181

    accuracy                           0.73      4231
   macro avg       0.66      0.65      0.65      4231
weighted avg       0.72      0.73      0.73      4231

Epoch 3/10


Training: 100%|██████████| 2116/2116 [02:07<00:00, 16.59it/s]


Validation Accuracy: 0.7299
              precision    recall  f1-score   support

           0       0.80      0.83      0.82      3050
           1       0.52      0.48      0.50      1181

    accuracy                           0.73      4231
   macro avg       0.66      0.65      0.66      4231
weighted avg       0.72      0.73      0.73      4231

Epoch 4/10


Training: 100%|██████████| 2116/2116 [02:09<00:00, 16.34it/s]


Validation Accuracy: 0.7143
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      3050
           1       0.49      0.52      0.50      1181

    accuracy                           0.71      4231
   macro avg       0.65      0.65      0.65      4231
weighted avg       0.72      0.71      0.72      4231

Epoch 5/10


Training: 100%|██████████| 2116/2116 [02:10<00:00, 16.23it/s]


Validation Accuracy: 0.7393
              precision    recall  f1-score   support

           0       0.78      0.89      0.83      3050
           1       0.55      0.36      0.44      1181

    accuracy                           0.74      4231
   macro avg       0.67      0.62      0.63      4231
weighted avg       0.72      0.74      0.72      4231

Epoch 6/10


Training: 100%|██████████| 2116/2116 [02:11<00:00, 16.11it/s]


Validation Accuracy: 0.7374
              precision    recall  f1-score   support

           0       0.79      0.87      0.83      3050
           1       0.54      0.40      0.46      1181

    accuracy                           0.74      4231
   macro avg       0.66      0.63      0.64      4231
weighted avg       0.72      0.74      0.72      4231

Epoch 7/10


Training: 100%|██████████| 2116/2116 [02:11<00:00, 16.11it/s]


Validation Accuracy: 0.7329
              precision    recall  f1-score   support

           0       0.80      0.84      0.82      3050
           1       0.53      0.44      0.48      1181

    accuracy                           0.73      4231
   macro avg       0.66      0.64      0.65      4231
weighted avg       0.72      0.73      0.73      4231

Epoch 8/10


Training: 100%|██████████| 2116/2116 [02:11<00:00, 16.14it/s]


Validation Accuracy: 0.7379
              precision    recall  f1-score   support

           0       0.81      0.84      0.82      3050
           1       0.53      0.48      0.51      1181

    accuracy                           0.74      4231
   macro avg       0.67      0.66      0.66      4231
weighted avg       0.73      0.74      0.73      4231

Epoch 9/10


Training: 100%|██████████| 2116/2116 [02:12<00:00, 15.97it/s]


Validation Accuracy: 0.7393
              precision    recall  f1-score   support

           0       0.79      0.87      0.83      3050
           1       0.54      0.40      0.46      1181

    accuracy                           0.74      4231
   macro avg       0.67      0.64      0.65      4231
weighted avg       0.72      0.74      0.73      4231

Epoch 10/10


Training: 100%|██████████| 2116/2116 [02:12<00:00, 15.98it/s]


Validation Accuracy: 0.7317
              precision    recall  f1-score   support

           0       0.80      0.84      0.82      3050
           1       0.52      0.46      0.49      1181

    accuracy                           0.73      4231
   macro avg       0.66      0.65      0.65      4231
weighted avg       0.72      0.73      0.73      4231



### Evaluate models perfomance
To evaluate the model we gonna give some text samples from polifact website:

In [46]:
# Reload new model
model.load_state_dict(torch.load('./models/distilbert_fact_classifier.pth', weights_only=False))
model = model.to(device)

# Function to test the sentiment of a text
def test_sentiment(text, model, tokenizer):
    label_encoder = {0: 'False', 1: 'True'}
    sentiment = predict_label(text, model, tokenizer, device, label_encoder)
    print(f"Predicted sentiment: {sentiment}")

# Test sentiment prediction
test_sentiment("The poverty rate decreased by 3% in the last two years", model, tokenizer) # True
test_sentiment("FEMA sent $59M LAST WEEK to luxury hotels in New York City to house illegal migrants… That money is meant for American disaster relief.", model, tokenizer) # False
test_sentiment("Former USAID Administrator Samantha Power’s net worth “skyrocketed” from $6.7 million to $30 million in three years.", model, tokenizer) # False

Predicted sentiment: True
Predicted sentiment: False
Predicted sentiment: False
