### Same mode, but we only classify either true or false


### Define Data processing functions and imports


In [1]:
import torch
from tqdm import tqdm
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from transformers import BertTokenizer, BertModel,DistilBertModel  # Changed imports
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score
import torch.nn.functional as F
import torch.nn as nn
from transformers import AutoModel





class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, labels, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data[idx]["sentence"]
        label = self.labels[idx]
        encoding = self.tokenizer(
            sentence,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }




  from .autonotebook import tqdm as notebook_tqdm


### Pre Processing Data function

In [13]:
def data_preprocessing(dataset):
  
  #turn label either false or true
  dataset['label']=[1 if x=="true"or x=="mostly-true" or x=="half-true" or x=="barely-true" else 0 for x in dataset[1]] 

  #Dropping unwanted columns
  dataset = dataset.drop(labels=[0,1,8,9,10,11,12] ,axis=1)
  #Dealing with empty datapoints for metadata columns - subject, speaker, job, state,affiliation, context
  meta = []
  for i in range(len(dataset)):
      subject = dataset[3][i]
      if subject == 0:
          subject = 'None'

      speaker =  dataset[4][i]
      if speaker == 0:
          speaker = 'None'

      job =  dataset[5][i]
      if job == 0:
          job = 'None'

      state =  dataset[6][i]
      if state == 0:
          state = 'None'

      affiliation =  dataset[7][i]
      if affiliation == 0:
          affiliation = 'None'

      context =  dataset[13][i]
      if context == 0 :
          context = 'None'

      meta.append(str(subject) + ' ' + str(speaker) + ' ' + str(job) + ' ' + str(state) + ' ' + str(affiliation) + ' ' + str(context)) #combining all the meta data columns into a single column
  
  #Adding cleaned and combined metadata column to the dataset
  dataset[14] = meta
  dataset["sentence"] = dataset[14].astype('str')+" "+dataset[2] #Combining metadata and the text columns into single columns

  dataset = dataset.drop([2,3,4,5,6,7,13,14], axis=1) #dropping metadata columns, as we have merged them into a single column
  dataset.dropna() #Dropping if there are still any null values

  return dataset

### Defining BERT Model with classification head 

In [3]:
import torch.nn as nn
from transformers import AutoModel
from transformers import (
    BertForSequenceClassification,    
    BertTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    AdamW)

class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name):
        super(BERTClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, 2)  # Binary classification
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_embedding)
        x = self.fc(x)
        return self.softmax(x)



    

### Define the function to train the model


In [4]:

def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    progress_bar = tqdm(data_loader, desc="Training", leave=True)
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

### Define evaluate method

In [5]:
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.preprocessing import label_binarize

def evaluate(model, data_loader,device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)


### Define the function to predict a verdict

In [6]:
def predict_label(text, model, tokenizer, device, label_encoder, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        pred_class = torch.argmax(outputs, dim=1).item()

    return label_encoder.inverse_transform([pred_class])[0]  # Return actual label


### Training and testing

In [7]:
#Define our models hyperparameters
bert_model_name = 'distilbert-base-uncased'
num_classes = 6
max_length = 128
batch_size = 8
num_epochs = 10
learning_rate = 2e-5

### Combining Both DS

In [19]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Loading the data

# Load the LIAR dataset from TSV
df1 = pd.read_csv("../data/LIAR.tsv", sep="\t", header=None)
# Load the POLITIFACT dataset
df2 = pd.read_csv("../data/politifact_data.csv") 

#---------------Preprocess LIAR dataset----------------------
df1 = data_preprocessing(df1)

#---------------Preprocess Politifact dataset----------------------
df2["sentence"] = df2["statement_originator"] + " said: " + df2["statement"] + " (" + df2["statement_date"] + " via " + df2["statement_source"] + ")"

# Encode labels: 1 for true, mostly-true, half-true, barely-true; 0 otherwise
df2['label'] = df2['verdict'].apply(lambda x: 1 if x in ['true', 'mostly-true', 'half-true', 'barely-true'] else 0)

# Handle metadata and combine into a single column
meta_columns = ["statement_originator", "statement_date", "statement_source"]
df2['metadata'] = df2[meta_columns].fillna('None').astype(str).agg(' '.join, axis=1)

# Create sentence column combining metadata and statement
df2["sentence"] = df2["metadata"] + " " + df2["statement"]

# Drop original columns (optional)
df2 = df2.drop(columns=meta_columns + ["statement", "verdict","metadata","factcheck_analysis_link","factchecker", "factcheck_date"])


df = pd.concat([df1, df2])
df.fillna("None", inplace=True)
df = df.dropna()


#split data
train_sentences, val_sentences, train_labels, val_labels = train_test_split(df["sentence"].tolist(), df["label"].tolist(), test_size=0.2)	

train_data = [{"sentence" : stm, "label" : vrd} for stm, vrd in zip(train_sentences, train_labels)]
val_data = [{"sentence" : stm, "label" : vrd} for stm, vrd in zip(val_sentences, val_labels)]

print("Train data size:", len(train_data))
print("train data", train_data[0])




Train data size: 27154
train data {'sentence': 'corrections-and-updates,economy,poverty larry-elder Radio talk show host nan talk-show-host comments on CNN If black America were a country, itd be the 15th wealthiest nation in the world.', 'label': 0}


In [15]:
df.head(500)

Unnamed: 0,label,sentence
0,0,abortion dwayne-bohac State representative Tex...
1,1,"energy,history,job-accomplishments scott-surov..."
2,1,foreign-policy barack-obama President Illinois...
3,0,health-care blog-posting nan nan none a news r...
4,1,"economy,jobs charlie-crist nan Florida democra..."
...,...,...
495,1,"foreign-policy,public-health tom-daschle nan n..."
496,1,"federal-budget,transportation sarah-palin nan ..."
497,1,"state-budget,state-finances,transportation jef..."
498,1,"education,state-budget virginia-education-asso..."


In [21]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

# Create DataLoaders, from pytorch
train_dataset = CustomDataset(train_data, tokenizer,train_labels)
val_dataset = CustomDataset(val_data, tokenizer,val_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

### Set up model and device


In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

model = BERTClassifier(bert_model_name).to(device)

Device: cuda


### Training the model


In [22]:
# Optimizer and scheduler
# AdamW is a class from the huggingface library
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Total number of training steps, being the number of batches * number of epochs
total_steps = len(train_dataloader) * num_epochs 
num_warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=total_steps)


for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train(model, train_dataloader, optimizer, scheduler, device)
        accuracy, report = evaluate(model, val_dataloader,device)
        print(f"Validation Accuracy: {accuracy:.4f}")
        print(report)



Epoch 1/10


Training: 100%|██████████| 3395/3395 [05:49<00:00,  9.71it/s]


Validation Accuracy: 0.7154
              precision    recall  f1-score   support

           0       0.75      0.53      0.62      2993
           1       0.70      0.86      0.77      3796

    accuracy                           0.72      6789
   macro avg       0.72      0.70      0.70      6789
weighted avg       0.72      0.72      0.71      6789

Epoch 2/10


Training: 100%|██████████| 3395/3395 [06:17<00:00,  9.00it/s]


Validation Accuracy: 0.7284
              precision    recall  f1-score   support

           0       0.76      0.56      0.65      2993
           1       0.71      0.86      0.78      3796

    accuracy                           0.73      6789
   macro avg       0.74      0.71      0.71      6789
weighted avg       0.73      0.73      0.72      6789

Epoch 3/10


Training:   3%|▎         | 97/3395 [00:10<05:49,  9.43it/s]


KeyboardInterrupt: 

### Save the final model


In [None]:
torch.save(model.state_dict(), "bert_classifier.pth")

### Evaluate models perfomance

In [None]:
# Test sentiment prediction
test_text = "The poverty rate decreased by 3% in the last two years"
sentiment = predict_label(test_text, model, tokenizer, device,label_encoder)
print(f"Predicted sentiment: {sentiment}")

Predicted sentiment: mostly-true
