In [1]:
import pandas as pd
import numpy as np
import re
from transformers import BertTokenizer, BertModel, AdamW
import torch
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from tqdm import tqdm

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#df = pd.read_csv("balanced_news_dataset.csv")
df = pd.read_csv("fakeandTrue_news_dataset.csv")
# Combine 'title' and 'text' into a single column for input
df["input_text"] = df["title"] + " " + df["text"]
df.head()

Unnamed: 0,title,text,label,input_text
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,0,Donald Trump Sends Out Embarrassing New Year’...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,0,Drunk Bragging Trump Staffer Started Russian ...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",0,Sheriff David Clarke Becomes An Internet Joke...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",0,Trump Is So Obsessed He Even Has Obama’s Name...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,0,Pope Francis Just Called Out Donald Trump Dur...


In [2]:
import nltk
from nltk.corpus import stopwords
# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nisha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def preprocess_text(text):
    if pd.isna(text):  # Handle NaN values
        return None
    # Remove URLs only
    text = re.sub(r'http\S+', '', text)
    # Keep case, numbers, @mentions, #hashtags
    text = re.sub(r'[^a-zA-Z\s@#0-9]', '', text)  
    return text.strip()  # No lowercasing/stopword removal

df.head()
df['cleaned_text'] = df['input_text'].apply(preprocess_text)
df = df[df['cleaned_text'].str.split().str.len() >= 10]  # Step 1: Filter short texts

"""# Split data (60% train, 30% test, 10% validation)
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)
test_df, val_df = train_test_split(temp_df, test_size=0.25, random_state=42)"""

'# Split data (60% train, 30% test, 10% validation)\ntrain_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)\ntest_df, val_df = train_test_split(temp_df, test_size=0.25, random_state=42)'

In [4]:
df = df.drop_duplicates(subset=['cleaned_text'], keep='first')
# --- NEW: SUBSET SAMPLING ---
"""desired_total = 5000  # Set your target size here
if len(df) > desired_total:
    df, _ = train_test_split(
        df,
        train_size=desired_total,
        random_state=42,
        stratify=df['label']
    )

# --- Proceed with splits ---
train_df, temp_df = train_test_split(
    df, 
    test_size=0.3, 
    random_state=42,
    stratify=df['label']
)

val_df, test_df = train_test_split(
    temp_df, 
    test_size=0.5, 
    random_state=42,
    stratify=temp_df['label']
)"""
train_df, temp_df = train_test_split(
    df, 
    test_size=0.4, #0.3
    random_state=42,
    stratify=df['label']  # Preserve class distribution
)

val_df, test_df = train_test_split(
    temp_df, 
    test_size=0.25, #0.5
    random_state=42,
    stratify=temp_df['label']
)

In [5]:


# Verify no overlap
train_texts = set(train_df['cleaned_text'])
val_texts = set(val_df['cleaned_text'])
test_texts = set(test_df['cleaned_text'])
print(len(train_texts & val_texts))
print(len(train_df['cleaned_text'])+ len(val_df['cleaned_text']))
print(f"Train-Val Overlap: {len(train_texts & val_texts)}")  # Should be 0
print(f"Train-Val Overlap: {len(train_texts & val_texts)}")
print(f"Train-Test Overlap: {len(train_texts & test_texts)}")  # Should be 0
print(len(train_texts ),  len(val_texts),len(train_df))

0
60427
Train-Val Overlap: 0
Train-Val Overlap: 0
Train-Test Overlap: 0
40285 20142 40285


In [6]:
from transformers import BertTokenizerFast, BertModel
import torch
from datasets import Dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Initialize fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-uncased',
    padding='max_length',     # ENFORCE fixed-length
    truncation=True,          # Force truncation
    max_length=128,
    return_tensors='pt'   ,    # Direct tensor conversion
    return_token_type_ids=False
)


"""def process_data(df):
    dataset = Dataset.from_pandas(df)
    
    # Tokenize text
    dataset = dataset.map(
        lambda x: tokenizer(
            x['cleaned_text'],
            max_length=128,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        ),
        batched=True,
        batch_size=1000,
        remove_columns=['cleaned_text']
    )
    
    # Convert labels
    dataset = dataset.map(
        lambda x: {'labels': torch.tensor(x['label'], dtype=torch.long)},
        batched=True
    )
    
    # Set tensor format with labels
    dataset.set_format(
        type='torch',
        columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'],
        device=device
    )
    
    return dataset
"""
# 1. MODIFIED DATA PROCESSING (Keep data on CPU)
def process_data(df):
    dataset = Dataset.from_pandas(df)
    
    # Tokenize on CPU
    dataset = dataset.map(
        lambda x: tokenizer(
            x['cleaned_text'],
            max_length=128,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        ),
        batched=True,
        batch_size=1000,
        remove_columns=['cleaned_text']
    )
    
    # Keep labels on CPU
    dataset = dataset.map(
        lambda x: {'labels': x['label']},  # Keep as int, not tensor
        batched=True
    )
    
    # Set format to CPU tensors
    dataset.set_format(
        type='torch',
        columns=['input_ids', 'attention_mask', 'labels'],
        device='cpu'  # CRITICAL CHANGE
    )
    
    return dataset

# 4. Process all splits
train_dataset = process_data(train_df)
val_dataset = process_data(val_df)
test_dataset = process_data(test_df)

Map:   0%|          | 0/40285 [00:00<?, ? examples/s]

Map:   0%|          | 0/40285 [00:00<?, ? examples/s]

Map:   0%|          | 0/20142 [00:00<?, ? examples/s]

Map:   0%|          | 0/20142 [00:00<?, ? examples/s]

Map:   0%|          | 0/6715 [00:00<?, ? examples/s]

Map:   0%|          | 0/6715 [00:00<?, ? examples/s]

In [None]:
def verify_dataset(dataset):
    print("Feature lengths:")
    print(f"Input IDs: {dataset[0]['input_ids'].shape}")
    print(f"Attention Mask: {dataset[0]['attention_mask'].shape}")
    print(f"Labels: {dataset[0]['labels'] if 'labels' in dataset.features else 'No labels'}")

    lengths = [len(x['input_ids']) for x in dataset]
    assert len(set(lengths)) == 1, "Varying sequence lengths detected!"
    print("All sequences have length:", lengths[0])

verify_dataset(train_dataset)
verify_dataset(val_dataset)

In [7]:
import torch.nn as nn
# ------------------------------------
# 3. Custom BERT Model (3-layer CLS concat)
# ------------------------------------
"""class BERTClassifier(nn.Module):
    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        self.bert.config.output_hidden_states = True  # Enable selectively
        
        # Calculate hidden_size once
        hidden_size = bert_model.config.hidden_size
        self.cls_layer = nn.Linear(3 * hidden_size, 2)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        # Get only last 3 layers' hidden states
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=[-3, -2, -1],  # Layer indices -3, -2, -1
            return_dict=True
        )
        
        # Simplified CLS extraction
        hidden_states = outputs.hidden_states  # Tuple of 3 layers
        cls_embeddings = torch.cat([
            hidden_states[0][:, 0, :],  # -3rd layer
            hidden_states[1][:, 0, :],  # -2nd
            hidden_states[2][:, 0, :]   # -1st
        ], dim=1)
        
        logits = self.cls_layer(cls_embeddings)
        
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        
        return {'loss': loss, 'logits': logits}  # Return dict for .loss access

# Initialize with optimized settings
bert_model = BertModel.from_pretrained(
    "bert-base-uncased",
    output_hidden_states=False  # Controlled in forward
)
model = BERTClassifier(bert_model).to(device)

        

"""

# ===============
# 2. Model Setup
# ===============
class BERTClassifier(nn.Module):
    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        self.bert.config.output_hidden_states = True
        
        hidden_size = 768  # Fixed for bert-base-uncased
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(3 * hidden_size, 2).float()
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None,**kwargs):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        
        # Last 3 transformer layers (indices 10,11,12)
        hidden_states = outputs.hidden_states[-3:]  
        # Validate layer selection
        assert len(hidden_states) == 3, f"Expected 3 layers, got {len(hidden_states)}"
        for hs in hidden_states:
            assert hs.size(-1) == 768, f"Invalid hidden size {hs.size()}"
        # CLS tokens with FP32 casting
        cls_embeddings = torch.cat([
            hidden_states[0][:, 0, :].float(),  # Layer 10
            hidden_states[1][:, 0, :].float(),  # Layer 11
            hidden_states[2][:, 0, :].float()   # Layer 12
        ], dim=1)
        
        pooled = self.dropout(cls_embeddings)
        logits = self.classifier(pooled)
        
        loss = self.loss_fn(logits, labels) if labels is not None else None
            
        return {'loss': loss, 'logits': logits, 'embeddings': cls_embeddings}
    
  
bert_model = BertModel.from_pretrained("bert-base-uncased")
model = BERTClassifier(bert_model).to(device)

In [8]:
print(len(train_dataset))

40285


In [None]:
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if f"encoder.layer.{i}." in n],
        "lr": 5e-5 * (0.9 ** (11 - i))  # Gradual decay
    } for i in range(12)
] + [
    {
        "params": [p for n, p in model.named_parameters() if "cls_layer" in n],
        "lr": 1e-4  # Slightly reduced
    }
]
optimizer = AdamW(optimizer_grouped_parameters)

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # Debug async ops
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'backend:cudaMallocAsync'  # AMD fix

# Force synchronous mode
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.sdp_kernel(enable_flash=False)

In [None]:
# Add to start of training loop
dummy_input = tokenizer("This is a test news article.", return_tensors='pt').to(device)
with torch.no_grad():
    outputs = model(**dummy_input)
    print(f"Dummy CLS3 shape: {outputs['embeddings'].shape}")
# Expected: torch.Size([1, 2304])

In [9]:
# 1. Add this at the start of your code
torch.backends.cuda.matmul.allow_tf32 = True  # Enable TensorFloat-32
torch.backends.cudnn.benchmark = True  # Auto-optimize convolution algorithms

In [10]:
from torch.amp import GradScaler, autocast
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import OneCycleLR
# Config (Paper-Aligned with GPU Constraints)
batch_size = 6  # Max your GPU can handle
"""effective_batch = 24  # Paper uses 24 (6*4 but scaled to your GPU)
grad_accum_steps = effective_batch // batch_size  # 3 steps"""
grad_accum_steps = 4
max_lr = 2e-5  # As per paper
num_epochs = 1  # Must stay at 1 epoch
mixed_precision = True
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    #pin_memory=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=32,
   # pin_memory=True
)
# ======================
# 3. Optimizer & Scheduler
# ======================
optimizer = AdamW([
    {'params': model.bert.parameters(), 'lr': 2e-5},
    {'params': model.classifier.parameters(), 'lr': 1e-4}
], weight_decay=0.01)

total_steps = len(train_loader) // grad_accum_steps
scheduler = OneCycleLR(
    optimizer,
    max_lr=max_lr,
    total_steps=total_steps,
    pct_start=0.3,
    div_factor=25.0,
    final_div_factor=100.0
)

# ==================
# 4. Training Loop
# ==================
scaler = GradScaler(
    enabled=mixed_precision,
    growth_interval=2000
    )
best_acc = 0

for epoch in range(num_epochs):
    # Training
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    
    for batch_idx, batch in enumerate(tqdm(train_loader)):
        with autocast(device_type='cuda', dtype=torch.float16,enabled=mixed_precision):
            # Memory-optimized transfer
            inputs = {
                'input_ids': batch['input_ids'].contiguous().to(device, non_blocking=True),
                'attention_mask': batch['attention_mask'].contiguous().to(device, non_blocking=True),
                'labels': batch['labels'].contiguous().to(device, non_blocking=True)
            }
            
            outputs = model(**inputs)
            loss = outputs['loss'] / grad_accum_steps
        
        # Backprop with scaling
        scaler.scale(loss).backward()
        torch.cuda.synchronize()
        
        if (batch_idx + 1) % grad_accum_steps == 0:
            # Gradient management
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5) # Reduced from 1.0
            
            # Optimizer step
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            torch.cuda.synchronize()
            # LR scheduling
            scheduler.step()
            
            # Memory cleanup
            torch.cuda.empty_cache()
        
        total_loss += loss.item() * grad_accum_steps

    # Validation
    model.eval()
    val_correct = 0
    val_total = 0
    val_loss = 0
    
    with torch.no_grad(), autocast(device_type='cuda', dtype=torch.float16,enabled=mixed_precision):
        for val_batch in val_loader:
            inputs = {
                'input_ids': val_batch['input_ids'].contiguous().to(device),
                'attention_mask': val_batch['attention_mask'].contiguous().to(device),
                'labels': val_batch['labels'].contiguous().to(device)
            }
            
            outputs = model(**inputs)
            val_loss += outputs['loss'].item()
            
            # Precision metrics
            preds = torch.argmax(outputs['logits'], dim=1)
            val_correct += (preds == inputs['labels']).sum().item()
            val_total += inputs['labels'].size(0)
    
    # Statistics
    avg_loss = total_loss / len(train_loader)
    val_acc = val_correct / val_total
    print(f"Epoch {epoch+1} | Train Loss: {avg_loss:.4f} | Val Acc: {val_acc*100:.2f}%")
    
    # Save best
    if val_acc > best_acc:
        torch.save(model.state_dict(), 'bert_cls3.pth')
        best_acc = val_acc



  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 6715/6715 [09:05<00:00, 12.31it/s]


Epoch 1 | Train Loss: 0.1689 | Val Acc: 97.47%


In [11]:
test_loader = DataLoader(
    test_dataset,
    batch_size=64,  # From paper's evaluation section
    shuffle=False,  # Critical for proper evaluation
   
)

In [12]:
# --------------------------------------------------
# Step 5: Extract CLS3 embeddings (Eq.5)
# --------------------------------------------------
def extract_embeddings(model, dataloader, device='cuda'):
    """Generate contextualized embeddings using CLS3 concatenation"""
    model.eval()
    all_embeddings = []
    all_labels = []
    
    with torch.no_grad(), autocast(device_type='cuda', dtype=torch.float16):
        for batch in tqdm(dataloader, desc="Extracting embeddings"):
            inputs = {
                'input_ids': batch['input_ids'].to(device, non_blocking=True),
                'attention_mask': batch['attention_mask'].to(device, non_blocking=True)
            }
            
            # Forward pass (returns dict with 'embeddings')
            outputs = model(**inputs)
            
            # Paper's Eq.5: concat(H_{CLS}^L, H_{CLS}^{L-1}, H_{CLS}^{L-2})
            embeddings = outputs['embeddings'].cpu().numpy().astype(np.float32)
            labels = batch['labels'].cpu().numpy()
            
            all_embeddings.append(embeddings)
            all_labels.append(labels)
    
    return np.concatenate(all_embeddings), np.concatenate(all_labels)

# Run extraction
train_embs, train_labels = extract_embeddings(model, train_loader)
val_embs, val_labels = extract_embeddings(model, val_loader)
test_embs, test_labels = extract_embeddings(model, test_loader)

Extracting embeddings: 100%|██████████| 6715/6715 [02:02<00:00, 54.83it/s]
Extracting embeddings: 100%|██████████| 630/630 [00:30<00:00, 20.36it/s]
Extracting embeddings: 100%|██████████| 105/105 [00:10<00:00, 10.47it/s]


In [13]:
# Verify no overlap between splits
train_texts = set(train_df['cleaned_text'])
val_texts = set(val_df['cleaned_text'])
test_texts = set(test_df['cleaned_text'])

assert len(train_texts & val_texts) == 0, "Train-Val leakage!"
assert len(train_texts & test_texts) == 0, "Train-Test leakage!"
assert len(val_texts & test_texts) == 0, "Val-Test leakage!"

In [21]:
import optuna
import lightgbm as lgb
from lightgbm import Dataset
import numpy as np
from sklearn.metrics import accuracy_score
from optuna.integration import LightGBMPruningCallback
# Create native Dataset objects
dtrain = Dataset(train_embs, label=train_labels, free_raw_data=False)
dval = Dataset(val_embs, label=val_labels, reference=dtrain, free_raw_data=False)
def objective(trial):
    params = {
        'boosting_type': 'dart',
        'num_leaves': trial.suggest_int('num_leaves', 20, 40),
        'learning_rate': trial.suggest_float('lr', 0.01, 0.1, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 0.1, log=True),
        'objective': 'binary',
        'metric': 'binary_error',  # Must match pruning metric
        'verbosity': -1,
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
    }
    
    pruning_callback = LightGBMPruningCallback(
        trial, 
        metric='binary_error',  # Prune based on error (lower=better)
        valid_name='valid_0'
    )
    
    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dval],
        num_boost_round=1000,
        callbacks=[
            pruning_callback,
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(0)
        ]
    )
    
    # Use binary_error for consistency with minimization
    error_rate = model.best_score['valid_0']['binary_error']
    return error_rate  # Minimizing error

study = optuna.create_study(direction='minimize')  # Change to minimize
study.optimize(objective, n_trials=50)
best_params = study.best_params
print(best_params)

[I 2025-03-21 21:15:36,226] A new study created in memory with name: no-name-a0428a6e-6c80-440c-896e-f8470498c867
[I 2025-03-21 21:17:45,375] Trial 0 finished with value: 0.023830801310694073 and parameters: {'num_leaves': 33, 'lr': 0.015195139950383966, 'min_child_samples': 40, 'subsample': 0.9233480114710293, 'colsample_bytree': 0.731938676589442, 'reg_alpha': 0.057427965448657324}. Best is trial 0 with value: 0.023830801310694073.
[I 2025-03-21 21:20:37,277] Trial 1 finished with value: 0.023483268791579782 and parameters: {'num_leaves': 37, 'lr': 0.05293451641664748, 'min_child_samples': 10, 'subsample': 0.9966715057368878, 'colsample_bytree': 0.7799620554978535, 'reg_alpha': 0.001708555587204413}. Best is trial 1 with value: 0.023483268791579782.
[I 2025-03-21 21:22:32,844] Trial 2 finished with value: 0.02363221129977162 and parameters: {'num_leaves': 30, 'lr': 0.01826582823697512, 'min_child_samples': 30, 'subsample': 0.7443647262769147, 'colsample_bytree': 0.7505251809895565, '

{'num_leaves': 36, 'lr': 0.08653128172922714, 'min_child_samples': 13, 'subsample': 0.8823417631926579, 'colsample_bytree': 0.8632146551823339, 'reg_alpha': 0.008949561447719115}


In [22]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, classification_report

# 1. Paper-specified data merging (60% train + 30% val = 90% final train)
final_train_embs = np.concatenate([train_embs, val_embs])
final_train_labels = np.concatenate([train_labels, val_labels])

# 2. Create dataset objects with paper's split ratio
dtrain = lgb.Dataset(final_train_embs, label=final_train_labels)
dtest = lgb.Dataset(test_embs, label=test_labels, reference=dtrain)

# 3. Final model parameters (adjusted for paper's DART boosting)
final_params = {
    'boosting_type': 'dart',
    'num_leaves': 36,
    'learning_rate': 0.0865,
    'min_child_samples': 13,
    'subsample': 0.8823,
    'colsample_bytree': 0.8632,
    'reg_alpha': 0.00895,
    'objective': 'binary',
    'metric': 'binary_error',
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'seed': 42,
    'num_threads': 8  # Match your 8-core Ryzen
}

# 4. Paper's training protocol (Table 2 specs)
final_model = lgb.train(
    final_params,
    dtrain,
    num_boost_round=1000,
    valid_sets=[dtest],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(50)  # Print every 50 iterations
    ]
)

# 5. Paper-style evaluation metrics
test_preds = final_model.predict(test_embs) > 0.5  # Binary threshold

print("\n=== Paper-Comparable Results ===")
print(f"Test Accuracy: {accuracy_score(test_labels, test_preds)*100:.2f}%")
print(f"F1-Score: {f1_score(test_labels, test_preds)*100:.2f}%")
print(f"Precision: {precision_score(test_labels, test_preds)*100:.2f}%")
print("\nClassification Report:")
print(classification_report(test_labels, test_preds, target_names=['Real', 'Fake']))

# 6. Save model for deployment
final_model.save_model('optimized_lightgbm.txt')

[50]	valid_0's binary_error: 0.0239762
[100]	valid_0's binary_error: 0.0238273
[150]	valid_0's binary_error: 0.0232316
[200]	valid_0's binary_error: 0.0230827
[250]	valid_0's binary_error: 0.0229337
[300]	valid_0's binary_error: 0.0227848
[350]	valid_0's binary_error: 0.0229337
[400]	valid_0's binary_error: 0.0226359
[450]	valid_0's binary_error: 0.0226359
[500]	valid_0's binary_error: 0.0226359
[550]	valid_0's binary_error: 0.0229337
[600]	valid_0's binary_error: 0.0235294
[650]	valid_0's binary_error: 0.0229337
[700]	valid_0's binary_error: 0.0232316
[750]	valid_0's binary_error: 0.0230827
[800]	valid_0's binary_error: 0.0230827
[850]	valid_0's binary_error: 0.0233805
[900]	valid_0's binary_error: 0.0232316
[950]	valid_0's binary_error: 0.0229337
[1000]	valid_0's binary_error: 0.0230827

=== Paper-Comparable Results ===
Test Accuracy: 97.69%
F1-Score: 97.79%
Precision: 98.25%

Classification Report:
              precision    recall  f1-score   support

        Real       0.97      0

<lightgbm.basic.Booster at 0x22a9dfb0250>

In [26]:
from sklearn.metrics import confusion_matrix
# After model training, generate predictions for ALL splits
train_preds = final_model.predict(train_embs) > 0.5
val_preds = final_model.predict(val_embs) > 0.5
test_preds = final_model.predict(test_embs) > 0.5

def print_confusion(title, y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    print(f"\n=== {title} ===")
    print("                Predicted")
    print("             |   Real   |   Fake  |")
    print("------------------------------------")
    print(f"Actual Real |   {cm[0,0]:4d}   |   {cm[0,1]:4d}  |")
    print("------------------------------------")
    print(f"Actual Fake |   {cm[1,0]:4d}   |   {cm[1,1]:4d}  |")
    print("------------------------------------")

# Print all confusion matrices
print_confusion("Original Training Set", train_labels, train_preds)
print_confusion("Validation Set", val_labels, val_preds)
print_confusion("Test Set", test_labels, test_preds)

# Optional: Combined dataset confusion matrix
full_embs = np.concatenate([train_embs, val_embs, test_embs])
full_labels = np.concatenate([train_labels, val_labels, test_labels])
full_probs = final_model.predict(full_embs)
full_preds = full_probs > 0.5
print_confusion("Entire Combined Dataset", full_labels, full_preds)


=== Original Training Set ===
                Predicted
             |   Real   |   Fake  |
------------------------------------
Actual Real |   19195   |      0  |
------------------------------------
Actual Fake |      0   |   21090  |
------------------------------------

=== Validation Set ===
                Predicted
             |   Real   |   Fake  |
------------------------------------
Actual Real |   9597   |      0  |
------------------------------------
Actual Fake |      0   |   10545  |
------------------------------------

=== Test Set ===
                Predicted
             |   Real   |   Fake  |
------------------------------------
Actual Real |   3138   |     61  |
------------------------------------
Actual Fake |     94   |   3422  |
------------------------------------

=== Entire Combined Dataset ===
                Predicted
             |   Real   |   Fake  |
------------------------------------
Actual Real |   31930   |     61  |
---------------------------

In [28]:
from sklearn.metrics import recall_score,roc_auc_score
# Calculate comprehensive metrics
print("\n=== Full Dataset Evaluation ===")
print(f"Accuracy: {accuracy_score(full_labels, full_preds)*100:.2f}%")
print(f"F1-Score: {f1_score(full_labels, full_preds)*100:.2f}%")
print(f"Precision: {precision_score(full_labels, full_preds)*100:.2f}%") 
print(f"Recall/Sensitivity: {recall_score(full_labels, full_preds)*100:.2f}%")
print(f"ROC AUC: {roc_auc_score(full_labels, full_probs)*100:.2f}%")

# Detailed classification report
print("\nClassification Report (Full Dataset):")
print(classification_report(full_labels, full_preds, 
                            target_names=['Real', 'Fake'],
                            digits=4))

# Confusion matrix with normalized view
print("\nNormalized Confusion Matrix (Full Dataset):")
cm = confusion_matrix(full_labels, full_preds)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

print("                Predicted")
print("             |   Real    |   Fake   |")
print("-------------------------------------")
print(f"Actual Real | {cm_normalized[0,0]:>7.2%} | {cm_normalized[0,1]:>7.2%} |")
print("-------------------------------------")
print(f"Actual Fake | {cm_normalized[1,0]:>7.2%} | {cm_normalized[1,1]:>7.2%} |")
print("-------------------------------------")


=== Full Dataset Evaluation ===
Accuracy: 99.77%
F1-Score: 99.78%
Precision: 99.83%
Recall/Sensitivity: 99.73%
ROC AUC: 99.99%

Classification Report (Full Dataset):
              precision    recall  f1-score   support

        Real     0.9971    0.9981    0.9976     31991
        Fake     0.9983    0.9973    0.9978     35151

    accuracy                         0.9977     67142
   macro avg     0.9977    0.9977    0.9977     67142
weighted avg     0.9977    0.9977    0.9977     67142


Normalized Confusion Matrix (Full Dataset):
                Predicted
             |   Real    |   Fake   |
-------------------------------------
Actual Real |  99.81% |   0.19% |
-------------------------------------
Actual Fake |   0.27% |  99.73% |
-------------------------------------


In [23]:
from sklearn.metrics import recall_score
# 5. Generate test predictions
test_pred_probs = final_model.predict(test_embs)
test_preds = (test_pred_probs > 0.5).astype(int)

# 6. Save predictions with original text (paper-style)
import pandas as pd

# Assuming test_df contains original text and labels
results_df = pd.DataFrame({
    'text': test_df['cleaned_text'],
    'true_label': test_labels,
    'predicted_label': test_preds,
    'confidence': test_pred_probs
})

# Save to CSV with paper's metrics
results_df.to_csv('paper_metrics_predictions.csv', index=False)

# 7. Paper-style formatted output
from sklearn.metrics import confusion_matrix

print("\n=== Experimental Results ===")
print(f"Dataset: hehe works")
print(f"Split Ratio: 60-30-10")
print(f"Embedding Method: CLS3 Concatenation")
print(f"Classifier: LightGBM (DART)")

# Detailed metrics
cm = confusion_matrix(test_labels, test_preds)
print("\nConfusion Matrix:")
print(cm)
print(f"\nAccuracy: {accuracy_score(test_labels, test_preds)*100:.2f}%")
print(f"Precision: {precision_score(test_labels, test_preds)*100:.2f}%") 
print(f"Recall: {recall_score(test_labels, test_preds)*100:.2f}%")
print(f"F1-Score: {f1_score(test_labels, test_preds)*100:.2f}%")

# Sample predictions
print("\n=== Prediction Samples ===")
print(results_df.sample(5, random_state=42)[
    ['text', 'true_label', 'predicted_label', 'confidence']
].to_markdown(index=False))


=== Experimental Results ===
Dataset: hehe works
Split Ratio: 60-30-10
Embedding Method: CLS3 Concatenation
Classifier: LightGBM (DART)

Confusion Matrix:
[[3138   61]
 [  94 3422]]

Accuracy: 97.69%
Precision: 98.25%
Recall: 97.33%
F1-Score: 97.79%

=== Prediction Samples ===
| text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

In [None]:
# Final Memory Cleanup
torch.cuda.empty_cache()