# 7. BERT (Fine Tunning)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, precision_score, recall_score, f1_score

from datasets import Dataset # Crucial for Hugging Face data compatibility
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import torch
from transformers import TrainingArguments, Trainer


  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# --- 1. Prepare Data ( Loading from CSV) ---
fnd_bert_df = pd.read_csv("../data/processed/balanced_fake_news_dataset.csv")
    

In [3]:
print("Dataset loaded successfully from CSV.")
print(f"Initial DataFrame shape: {fnd_bert_df.shape}")
print("DataFrame head:")
print(fnd_bert_df.head())
print("\nDataFrame info:")
fnd_bert_df.info()

Dataset loaded successfully from CSV.
Initial DataFrame shape: (62030, 8)
DataFrame head:
                                                text  label original_label  \
0  Says comprehensive immigration reform will add...      0      half-true   
1  Ellen DeGeneres makes joke about Jennifer Anis...      0              0   
2  When we lower tax rates, we generate more in r...      0      half-true   
3  Karma it s a beautiful thing A massive makeshi...      0              0   
4  Ellen DeGeneres' wife Portia de Rossi makes he...      0              0   

               dataset                                         clean_text  \
0                 LIAR  says comprehensive immigration reform will add...   
1  FakeNewsNet_Minimal  ellen degeneres makes joke about jennifer anis...   
2                 LIAR  when we lower tax rates we generate more in re...   
3                 ISOT  karma it s a beautiful thing a massive makeshi...   
4  FakeNewsNet_Minimal  ellen degeneres wife portia de r

In [4]:
fnd_bert_df = fnd_bert_df[['clean_text', 'label']].rename(columns={'clean_text': 'text'})
fnd_bert_df['label'] = fnd_bert_df['label'].astype(int) # Ensure labels are integers (0 or 1)

In [5]:
# Convert pandas DataFrame to Hugging Face Dataset
hf_dataset = Dataset.from_pandas(fnd_bert_df)
num_labels = fnd_bert_df['label'].nunique()

In [6]:
fnd_bert_df.head()

Unnamed: 0,text,label
0,says comprehensive immigration reform will add...,0
1,ellen degeneres makes joke about jennifer anis...,0
2,when we lower tax rates we generate more in re...,0
3,karma it s a beautiful thing a massive makeshi...,0
4,ellen degeneres wife portia de rossi makes her...,0


In [7]:
# --- 2. Initialize Tokenizer and Model ---
model_name = "bert-base-uncased" # A common BERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# --- 3. Tokenization Function ---
def tokenize_function(examples):
    # padding=True will pad to the longest sequence in the batch
    # max_length=128 is a common length, adjust if your texts are longer/shorter
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=128)

In [9]:
# Apply tokenization
tokenized_hf_dataset = hf_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/62030 [00:00<?, ? examples/s]

Map: 100%|██████████| 62030/62030 [00:23<00:00, 2618.98 examples/s]


In [10]:
# Rename 'label' to 'labels' as Hugging Face Trainer expects 'labels'
tokenized_hf_dataset = tokenized_hf_dataset.rename_column("label", "labels")
# Remove original text column and set format for PyTorch
tokenized_hf_dataset = tokenized_hf_dataset.remove_columns(["text"])
tokenized_hf_dataset.set_format("torch")

In [11]:
print(tokenized_hf_dataset[0])

{'labels': tensor(0), 'input_ids': tensor([ 101, 2758, 7721, 7521, 5290, 2097, 5587, 4551, 2000, 2256, 4610,  102,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0

In [12]:
# --- 4. Split Dataset ---
# Splits into 80% training and 20% validation.
train_test_split = tokenized_hf_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test'] # Using 'test' as validation set for Trainer

In [13]:
print(f"\nTraining dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")



Training dataset size: 49624
Evaluation dataset size: 12406


In [14]:
# --- 5. Define compute_metrics Function (Simplified) ---
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1': f1_score(labels, predictions, average='binary', zero_division=0),
        'precision': precision_score(labels, predictions, average='binary', zero_division=0),
        'recall': recall_score(labels, predictions, average='binary', zero_division=0)
    }

In [15]:
import transformers
print(transformers.__version__)


4.40.1


In [16]:
# --- 6. Define Training Arguments (SIMPLIFIED FOR COMPATIBILITY) ---
training_args = TrainingArguments(
    output_dir='../models/results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=32,      # use higher if your GPU can handle it
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,    
    seed=42,                       # <--- Enables mixed precision on GPU (speeds up training)
)


In [17]:
# --- 7. Data Collator ---
# Ensures dynamic padding for batches, making training more efficient.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [18]:
# --- 8. Initialize and Train the Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer, # Pass tokenizer to trainer for data_collator
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [19]:
trainer.train()

  1%|          | 50/4653 [04:32<7:00:43,  5.48s/it]

{'loss': 0.4216, 'grad_norm': 16.099550247192383, 'learning_rate': 4.9473457984096284e-05, 'epoch': 0.03}


  2%|▏         | 100/4653 [09:07<6:56:06,  5.48s/it]

{'loss': 0.3031, 'grad_norm': 3.3180601596832275, 'learning_rate': 4.8946915968192565e-05, 'epoch': 0.06}


  3%|▎         | 150/4653 [13:39<6:48:01,  5.44s/it]

{'loss': 0.3247, 'grad_norm': 3.491584300994873, 'learning_rate': 4.840962819686224e-05, 'epoch': 0.1}


  4%|▍         | 200/4653 [18:12<6:47:11,  5.49s/it]

{'loss': 0.2982, 'grad_norm': 1.7096076011657715, 'learning_rate': 4.787234042553192e-05, 'epoch': 0.13}


  5%|▌         | 250/4653 [22:46<6:42:21,  5.48s/it]

{'loss': 0.2785, 'grad_norm': 3.079103469848633, 'learning_rate': 4.7335052654201596e-05, 'epoch': 0.16}


  6%|▋         | 300/4653 [27:21<6:37:41,  5.48s/it]

{'loss': 0.2987, 'grad_norm': 2.8390426635742188, 'learning_rate': 4.679776488287127e-05, 'epoch': 0.19}


  8%|▊         | 350/4653 [31:55<6:33:13,  5.48s/it]

{'loss': 0.2846, 'grad_norm': 2.7651307582855225, 'learning_rate': 4.626047711154094e-05, 'epoch': 0.23}


  9%|▊         | 400/4653 [36:29<6:28:46,  5.48s/it]

{'loss': 0.2728, 'grad_norm': 2.3923697471618652, 'learning_rate': 4.572318934021062e-05, 'epoch': 0.26}


 10%|▉         | 450/4653 [41:03<6:23:59,  5.48s/it]

{'loss': 0.2563, 'grad_norm': 2.92568302154541, 'learning_rate': 4.51859015688803e-05, 'epoch': 0.29}


 11%|█         | 500/4653 [45:37<6:19:24,  5.48s/it]

{'loss': 0.2857, 'grad_norm': 2.6446657180786133, 'learning_rate': 4.4648613797549974e-05, 'epoch': 0.32}


 12%|█▏        | 550/4653 [50:11<6:14:53,  5.48s/it]

{'loss': 0.247, 'grad_norm': 3.995162010192871, 'learning_rate': 4.4111326026219644e-05, 'epoch': 0.35}


 13%|█▎        | 600/4653 [54:45<6:10:30,  5.48s/it]

{'loss': 0.251, 'grad_norm': 1.3295079469680786, 'learning_rate': 4.357403825488932e-05, 'epoch': 0.39}


 14%|█▍        | 650/4653 [3:13:46<6:03:44,  5.45s/it]     

{'loss': 0.25, 'grad_norm': 3.600801467895508, 'learning_rate': 4.3036750483559e-05, 'epoch': 0.42}


 15%|█▌        | 700/4653 [3:18:18<5:58:24,  5.44s/it]

{'loss': 0.2446, 'grad_norm': 3.8947651386260986, 'learning_rate': 4.2499462712228675e-05, 'epoch': 0.45}


 15%|█▌        | 716/4653 [3:19:45<5:57:08,  5.44s/it]

KeyboardInterrupt: 