In [14]:
!pip install -q transformers==4.36.2 peft==0.9.0 accelerate==0.25.0


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
# Sanity check: print the transformers version that the kernel actually imports,
# so it can be compared against the version pinned in the install cell.
import transformers
print(transformers.__version__)

4.36.2


# Dataset Initialization and Preprocessing

In [16]:
import pandas as pd

# Read the Truth Seeker CSV from the Kaggle input mount and preview the first rows.
DATASET_CSV = '/kaggle/input/truth-seeker-model/Truth_Seeker_Model_Dataset.csv'
df = pd.read_csv(DATASET_CSV)
print(df.head())

   Unnamed: 0      author                                          statement  \
0           0  D.L. Davis  End of eviction moratorium means millions of A...   
1           1  D.L. Davis  End of eviction moratorium means millions of A...   
2           2  D.L. Davis  End of eviction moratorium means millions of A...   
3           3  D.L. Davis  End of eviction moratorium means millions of A...   
4           4  D.L. Davis  End of eviction moratorium means millions of A...   

   target  BinaryNumTarget                 manual_keywords  \
0    True              1.0  Americans, eviction moratorium   
1    True              1.0  Americans, eviction moratorium   
2    True              1.0  Americans, eviction moratorium   
3    True              1.0  Americans, eviction moratorium   
4    True              1.0  Americans, eviction moratorium   

                                               tweet 5_label_majority_answer  \
0  @POTUS Biden Blunders - 6 Month Update\n\nInfl...            Mo

In [17]:
# Class balance check: number of rows per binary target value.
label_counts = df['BinaryNumTarget'].value_counts()
print(label_counts)

BinaryNumTarget
1.0    68930
0.0    65268
Name: count, dtype: int64


In [18]:
import re

# Patterns are compiled once instead of on every call; they are applied in this order.
_URL_PATTERN = re.compile(r"http\S+|www\S+")
_MENTION_PATTERN = re.compile(r"@\w+")
_NON_ALNUM_PATTERN = re.compile(r"[^A-Za-z0-9\s]")


def clean_text(text):
    """Normalize a raw tweet for tokenization.

    Steps, in order:
      1. drop URLs (anything starting with ``http`` or ``www`` up to the next whitespace),
      2. drop ``@mentions``,
      3. drop every character that is not an ASCII letter, a digit, or whitespace
         (this also removes ``#``, so hashtags keep their word but lose the symbol),
      4. strip leading/trailing whitespace.

    Interior whitespace (including newlines and the gaps left by removed tokens)
    is preserved as-is.

    Args:
        text: The raw tweet. Non-string values (for example ``None`` or the float
            NaN that pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned string; ``""`` when ``text`` is not a string.
    """
    # Missing cells would otherwise make re.sub raise TypeError and abort the whole .apply().
    if not isinstance(text, str):
        return ""

    text = _URL_PATTERN.sub("", text)        # URLs must go first, while ':' and '/' are still present
    text = _MENTION_PATTERN.sub("", text)    # mentions must go before '@' is stripped as a special character
    text = _NON_ALNUM_PATTERN.sub("", text)  # remaining punctuation, '#', emoji, non-ASCII letters
    return text.strip()

# Run the cleaner over every tweet, replacing the column with its cleaned version.
df['tweet'] = df['tweet'].map(clean_text)

# Preview the first few cleaned tweets
cleaned_preview = df['tweet'].head()
print(cleaned_preview)


0    Biden Blunders  6 Month Update\n\nInflation De...
1    Not as many people are literally starving and ...
2    THE SUPREME COURT is siding with super rich pr...
3    Biden Blunders\n\nBroken campaign promises Inf...
4    I agree The confluence of events right now is ...
Name: tweet, dtype: object


In [19]:
from sklearn.model_selection import train_test_split

# The CSV stores the target as floats (1.0 / 0.0). Classification losses expect
# integer class indices, so convert once here instead of patching dtypes downstream.
binary_labels = df['BinaryNumTarget'].astype(int)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['tweet'].tolist(),
    binary_labels.tolist(),
    test_size=0.1,           # 10% for validation
    random_state=42,         # fixed seed so the split is reproducible
    stratify=binary_labels   # Keep the label distribution balanced
)

print(f"Training samples: {len(train_texts)}, Validation samples: {len(val_texts)}")


Training samples: 120778, Validation samples: 13420


In [20]:
from transformers import AutoTokenizer

# Tokenizer matching the roberta-base checkpoint
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# Both splits are encoded with the same settings: truncate to at most 128 tokens
# and pad every sequence in the call to a common length.
encode_options = {"truncation": True, "padding": True, "max_length": 128}

train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)

# Show the token ids of the first training example
first_input_ids = train_encodings['input_ids'][0]
print(first_input_ids)




[0, 100, 40, 45, 3568, 10, 11445, 5, 200, 86, 198, 7632, 54, 34, 45, 5335, 26999, 40, 45, 109, 24, 38, 33, 26999, 142, 38, 109, 679, 2866, 635, 187, 38, 33, 5335, 26999, 38, 109, 45, 240, 7, 3568, 10, 11445, 166, 240, 7, 912, 42, 20175, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [21]:
import torch

class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with classification labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a sequence
            holding one entry per example. Only ``.items()`` and integer
            indexing of each value are used.
        labels: Sequence with one class label per example. Labels may be ints or
            integer-valued floats; they are always emitted as int64 tensors,
            which is the dtype classification losses require for class indices.

    Raises:
        ValueError: If any feature in ``encodings`` has a different number of
            examples than ``labels``.
    """

    def __init__(self, encodings, labels):
        # Fail early on misaligned inputs instead of hitting an IndexError (or
        # silently dropping examples) in the middle of training.
        for key, values in encodings.items():
            if len(values) != len(labels):
                raise ValueError(
                    f"encodings[{key!r}] has {len(values)} examples but labels has {len(labels)}"
                )
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including a ``labels`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit int64: float labels (e.g. 1.0) would otherwise yield a float tensor.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap each split's encodings and labels in a NewsDataset
val_dataset = NewsDataset(val_encodings, val_labels)
train_dataset = NewsDataset(train_encodings, train_labels)

# Inspect the first training example
first_example = train_dataset[0]
print(first_example)


{'input_ids': tensor([    0,   100,    40,    45,  3568,    10, 11445,     5,   200,    86,
          198,  7632,    54,    34,    45,  5335, 26999,    40,    45,   109,
           24,    38,    33, 26999,   142,    38,   109,   679,  2866,   635,
          187,    38,    33,  5335, 26999,    38,   109,    45,   240,     7,
         3568,    10, 11445,   166,   240,     7,   912,    42, 20175,     2,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1, 

In [22]:
from transformers import AutoModelForSequenceClassification

import torch

# roberta-base with a 2-class sequence-classification head (Fake/Real)
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Use the GPU when one is present; otherwise leave the model on CPU and say so.
if not torch.cuda.is_available():
    print("⚠️ CUDA not available, running on CPU")
else:
    model.to('cuda')


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Pin the loss selection to single-label classification (cross-entropy over the 2 classes).
# The per-batch `.long()` label cast belongs inside the loops that iterate over batches:
# no `batch` variable exists at this point on a fresh kernel, so casting here raised NameError.
model.config.problem_type = "single_label_classification"  # must be single-label

# Training Loop

In [32]:
from torch.utils.data import DataLoader
from transformers import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=8)

# Optimizer: PyTorch's AdamW replaces the deprecated `transformers.AdamW`.
# eps and weight_decay are set explicitly to the deprecated class's defaults
# (1e-6 and 0.0) so the hyperparameters carry over unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Training parameters
num_epochs = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Force correct problem type
model.config.problem_type = "single_label_classification"


def _prepare_batch(batch, device):
    """Move every tensor of a collated batch to `device` and cast labels to int64."""
    batch = {k: v.to(device) for k, v in batch.items()}
    batch['labels'] = batch['labels'].long()  # important for CrossEntropyLoss
    return batch


def _train_one_epoch(model, loader, optimizer, device, description):
    """Run one optimization pass over `loader`; return the mean per-batch loss."""
    model.train()
    running_loss = 0.0
    # The description is fixed for the whole epoch, so set it once here.
    loop = tqdm(loader, leave=True, desc=description)
    for batch in loop:
        optimizer.zero_grad()
        outputs = model(**_prepare_batch(batch, device))
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for both the running sum and the progress bar.
        batch_loss = loss.item()
        running_loss += batch_loss
        loop.set_postfix(batch_loss=batch_loss)
    return running_loss / len(loader)


def _evaluate(model, loader, device):
    """Score `model` on `loader` without gradients.

    Returns:
        (mean per-batch loss, list of true labels, list of predicted labels)
    """
    model.eval()
    running_loss = 0.0
    predictions = []
    targets = []
    with torch.no_grad():
        for batch in loader:
            batch = _prepare_batch(batch, device)
            outputs = model(**batch)
            running_loss += outputs.loss.item()

            # Predicted class = index of the largest logit
            predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            targets.extend(batch['labels'].cpu().numpy())
    return running_loss / len(loader), targets, predictions


for epoch in range(num_epochs):
    avg_train_loss = _train_one_epoch(model, train_loader, optimizer, device, f"Epoch {epoch+1}")

    # ===== Validation =====
    avg_val_loss, all_labels, all_preds = _evaluate(model, val_loader, device)
    val_acc = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')

    print(f"\nEpoch {epoch+1} summary:")
    print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")
    print(f"Precision: {precision:.4f} | Recall: {recall:.4f} | F1-score: {f1:.4f}\n")


Epoch 1: 100%|██████████| 15098/15098 [52:32<00:00,  4.79it/s, batch_loss=0.00112] 



Epoch 1 summary:
Train Loss: 0.0951 | Val Loss: 0.0627 | Val Acc: 0.9805
Precision: 0.9684 | Recall: 0.9945 | F1-score: 0.9812



Epoch 2: 100%|██████████| 15098/15098 [52:32<00:00,  4.79it/s, batch_loss=0.0278]  



Epoch 2 summary:
Train Loss: 0.0413 | Val Loss: 0.0392 | Val Acc: 0.9879
Precision: 0.9826 | Recall: 0.9939 | F1-score: 0.9882



Epoch 3: 100%|██████████| 15098/15098 [52:31<00:00,  4.79it/s, batch_loss=0.774]   



Epoch 3 summary:
Train Loss: 0.0279 | Val Loss: 0.0449 | Val Acc: 0.9881
Precision: 0.9818 | Recall: 0.9952 | F1-score: 0.9885



# Evaluation

In [42]:
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.eval()  # Set model to evaluation mode

all_preds = []
all_labels = []
val_loss = 0

with torch.no_grad():
    for batch in val_loader:
        # Move batch to the selected device
        batch = {k: v.to(device) for k, v in batch.items()}
        batch['labels'] = batch['labels'].long()  # class indices must be int64 for the loss

        # Single forward pass per batch. Passing `labels` makes the model return the loss
        # alongside the logits, so no separate loss function is needed.
        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels'],
        )
        val_loss += outputs.loss.item()

        # Get predictions: index of the largest logit
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

# Compute average loss (mean over batches)
val_loss /= len(val_loader)

# Compute metrics
accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')

print(f"Validation Loss: {val_loss:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")


Validation Loss: 0.0449
Accuracy: 0.9881
Precision: 0.9818, Recall: 0.9952, F1-score: 0.9885


# Saving the Model

In [33]:
# Save model
# Persist the fine-tuned model into a directory under the Kaggle working area.
model_save_path = '/kaggle/working/roberta-fake-news-model'
model.save_pretrained(model_save_path)

# Save tokenizer
# Written to the same directory as the model. The call is deliberately the cell's
# last expression, so the notebook displays the tuple of file paths it returns.
tokenizer.save_pretrained(model_save_path)


('/kaggle/working/roberta-fake-news-model/tokenizer_config.json',
 '/kaggle/working/roberta-fake-news-model/special_tokens_map.json',
 '/kaggle/working/roberta-fake-news-model/vocab.json',
 '/kaggle/working/roberta-fake-news-model/merges.txt',
 '/kaggle/working/roberta-fake-news-model/added_tokens.json',
 '/kaggle/working/roberta-fake-news-model/tokenizer.json')

In [34]:
import shutil

import os

# Path to the saved model folder
model_folder = '/kaggle/working/roberta-fake-news-model'
zip_path = '/kaggle/working/roberta-fake-news-model.zip'

# make_archive appends the ".zip" extension itself, so it needs the target path without it.
# splitext removes only the trailing extension; str.replace('.zip', '') would also strip
# any ".zip" occurring earlier in the path.
archive_base = os.path.splitext(zip_path)[0]

# Zip the folder's contents (paths inside the archive are relative to model_folder)
archive_path = shutil.make_archive(base_name=archive_base, format='zip', root_dir=model_folder)

# Report the path make_archive actually wrote rather than the one we assumed.
print(f"Model zipped at: {archive_path}")


Model zipped at: /kaggle/working/roberta-fake-news-model.zip
