## Deep Learning & BERT


### Setup and Imports

In [1]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## 2. Load and Preprocess Dataset
train_df = pd.read_csv('liar_dataset/train.tsv', sep='\t')
valid_df = pd.read_csv('liar_dataset/valid.tsv', sep='\t')

In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@w+|\#','', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [4]:
train_df.head()

Unnamed: 0,2635.json,false,Says the Annies List political group supports third-trimester abortions on demand.,abortion,dwayne-bohac,State representative,Texas,republican,0,1,0.1,0.2,0.3,a mailer
0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
2,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
3,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN
4,12465.json,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece


In [5]:
# data sets are without headers .. lets give some headers i.e defining column names

# Define column names
column_names = ['statement_id',	'label', 'statement', 'subject', 'speaker', 'speaker_job', 'state', 'party', 'barely_true_counts',	'false_Count',	'half_true_counts',	'mostly_true_counts', 'pants_on_fire_counts', 'context_venue']

# Read TSV files without headers
train_df = pd.read_csv('liar_dataset/train.tsv', sep='\t', names=column_names)
valid_df = pd.read_csv('liar_dataset/valid.tsv', sep='\t', names=column_names)
test_df = pd.read_csv('liar_dataset/test.tsv', sep='\t', names=column_names)


In [6]:
train_df['clean_text'] = train_df['statement'].apply(clean_text)
valid_df['clean_text'] = valid_df['statement'].apply(clean_text)

In [7]:
## 3. Deep Learning Dataset Preparation
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_len)
        self.labels = [int(label) for label in labels.tolist()]  # ensure numeric

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

In [9]:
label2id = {label: idx for idx, label in enumerate(train_df['label'].unique())}

# Apply mapping to both datasets
train_df['label_id'] = train_df['label'].map(label2id)
valid_df['label_id'] = valid_df['label'].map(label2id)


In [10]:
train_dataset = NewsDataset(train_df['clean_text'], train_df['label_id'], tokenizer)
valid_dataset = NewsDataset(valid_df['clean_text'], valid_df['label_id'], tokenizer)


In [11]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16)

In [12]:
## 4. BERT Model Fine-Tuning
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [13]:
from torch.optim import AdamW


In [14]:
optimizer = AdamW(model.parameters(), lr=2e-5)


# Training loop
epochs = 2
model.train()

for epoch in range(epochs):
    loop = tqdm(train_loader, leave=True)
    total_loss = 0
    for batch in loop:
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Progress bar update
        loop.set_description(f"Epoch {epoch + 1}")
        loop.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1} finished. Avg loss: {total_loss / len(train_loader):.4f}")