In [1]:
import os
import pandas as pd
import random
from PIL import Image
from torchvision import transforms
from tqdm import tqdm

# Load your Excel dataset
df = pd.read_excel("updated_dataset_with_visualizations.xlsx")

# Folder to save augmented images
augmented_folder = "augmented_images"
os.makedirs(augmented_folder, exist_ok=True)

# Define your augmentation transform (same as in training)
train_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomCrop(224),
    transforms.RandomRotation(20),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.1),
    transforms.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1)),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomAdjustSharpness(1.5, p=0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    transforms.RandomErasing(p=0.3, scale=(0.02, 0.2), ratio=(0.3, 3.3), value=0),
])

# Remove Normalize & Tensor for saving augmented images as .jpg
save_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomCrop(224),
    transforms.RandomRotation(20),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.1),
    transforms.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1)),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomAdjustSharpness(1.5, p=0.5),
])

# Number of images to generate (30% more)
num_augmented = int(len(df) * 0.3)
sampled_df = df.sample(n=num_augmented, random_state=42).reset_index(drop=True)

# Store new rows for augmented data
augmented_rows = []

for idx, row in tqdm(sampled_df.iterrows(), total=len(sampled_df), desc="Augmenting"):
    orig_path = row['visualization_path']
    label = row['label']
    
    try:
        image = Image.open(orig_path).convert("RGB")
        augmented_image = save_transform(image)
        
        # Save augmented image
        new_filename = f"aug_{idx}_{os.path.basename(orig_path)}"
        new_path = os.path.join(augmented_folder, new_filename)
        augmented_image.save(new_path)

        # Append to new DataFrame
        augmented_rows.append({'visualization_path': new_path, 'label': label})
    
    except Exception as e:
        print(f"❌ Failed to augment {orig_path}: {e}")

# Create final DataFrame with original + augmented
augmented_df = pd.concat([df, pd.DataFrame(augmented_rows)], ignore_index=True)

# Save the new combined dataset
augmented_df.to_excel("augmented_dataset.xlsx", index=False)
print("✅ Augmentation complete. New dataset saved to 'augmented_dataset.xlsx'")


Augmenting: 100%|██████████| 5092/5092 [02:39<00:00, 31.89it/s]


✅ Augmentation complete. New dataset saved to 'augmented_dataset.xlsx'


In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from PIL import Image, ImageFile
import os
from sklearn.metrics import classification_report, confusion_matrix
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
import torch.cuda.amp as amp  # Mixed Precision Training

# ✅ Allow truncated images to load instead of crashing
ImageFile.LOAD_TRUNCATED_IMAGES = True

# ✅ Enhanced Data Augmentation (Fix for RandomErasing)
train_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomCrop(224),
    transforms.RandomRotation(20),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.1),
    transforms.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1)),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomAdjustSharpness(1.5, p=0.5),
    transforms.ToTensor(),  # ✅ Move ToTensor BEFORE RandomErasing
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    transforms.RandomErasing(p=0.3, scale=(0.02, 0.2), ratio=(0.3, 3.3), value=0),  # ✅ Now works
])

val_test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# ✅ Custom Dataset Class
class FakeNewsImageDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx]['visualization_path']
        label = int(self.dataframe.iloc[idx]['label'])

        try:
            image = Image.open(img_path).convert('RGB')
        except (OSError, IOError) as e:
            print(f"Skipping corrupt image: {img_path} - {e}")
            return None

        if self.transform:
            image = self.transform(image)

        return image, label

# ✅ Custom collate function to remove None values (bad images)
def collate_fn(batch):
    batch = [item for item in batch if item is not None]
    if not batch:
        return torch.zeros((0, 3, 224, 224)), torch.zeros(0, dtype=torch.long)
    return torch.utils.data.dataloader.default_collate(batch)

# ✅ Load datasets
train_df = pd.read_csv("train_data-1.csv")
val_df = pd.read_csv("val_data-1.csv")
test_df = pd.read_csv("test_data-1.csv")

train_dataset = FakeNewsImageDataset(train_df, transform=train_transform)
val_dataset = FakeNewsImageDataset(val_df, transform=val_test_transform)
test_dataset = FakeNewsImageDataset(test_df, transform=val_test_transform)

# ✅ Compute class weights for balancing
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=train_df['label'].values)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# ✅ Fix Weighted Sampler
class_weights_np = class_weights.numpy()
sample_weights = train_df['label'].map(lambda x: 1.0 / class_weights_np[x]).astype(float).values
sample_weights = torch.tensor(sample_weights, dtype=torch.double)
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

# ✅ Define DataLoaders
batch_size = 32  # ✅ Increased batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# ✅ Define Improved Model (ResNeXt-50)
class ModifiedResNeXt(nn.Module):
    def __init__(self):
        super(ModifiedResNeXt, self).__init__()
        resnext = models.resnext50_32x4d(weights=models.ResNeXt50_32X4D_Weights.IMAGENET1K_V1)
        self.features = nn.Sequential(*list(resnext.children())[:-1])  # Remove final FC layer
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.BatchNorm1d(2048),
            nn.Dropout(0.5),
            nn.Linear(2048, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),
            nn.Linear(512, 2)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

# ✅ Define Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ModifiedResNeXt().to(device)

# ✅ Loss Function, Optimizer & Scheduler
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device), label_smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2)

# ✅ Training Loop
num_epochs = 25
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    scheduler.step()
    
    print(f"\nEpoch {epoch+1}/{num_epochs} Training Loss: {running_loss/len(train_loader):.4f}")

    # ✅ Validation After Each Epoch
    model.eval()
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    val_acc = 100 * correct / total
    print(f"Validation Accuracy: {val_acc:.2f}%")

# ✅ Evaluate on Test Set
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_acc = 100 * correct / total
print(f"\n✅ Test Accuracy: {test_acc:.2f}%")

# ✅ Save Best Model
torch.save(model.state_dict(), 'best_resnext50_model-2.pth')



Epoch 1/25 Training Loss: 0.7497
Validation Accuracy: 52.87%

Epoch 2/25 Training Loss: 0.6941
Validation Accuracy: 60.53%

Epoch 3/25 Training Loss: 0.6694
Validation Accuracy: 61.11%

Epoch 4/25 Training Loss: 0.6466
Validation Accuracy: 62.70%

Epoch 5/25 Training Loss: 0.6339
Validation Accuracy: 62.12%

Epoch 6/25 Training Loss: 0.6499
Validation Accuracy: 60.32%

Epoch 7/25 Training Loss: 0.6366
Validation Accuracy: 64.52%

Epoch 8/25 Training Loss: 0.6221
Validation Accuracy: 67.08%

Epoch 9/25 Training Loss: 0.6020
Validation Accuracy: 66.87%

Epoch 10/25 Training Loss: 0.5708
Validation Accuracy: 68.59%

Epoch 11/25 Training Loss: 0.5470
Validation Accuracy: 72.52%

Epoch 12/25 Training Loss: 0.5140
Validation Accuracy: 72.86%

Epoch 13/25 Training Loss: 0.4859
Validation Accuracy: 75.29%

Epoch 14/25 Training Loss: 0.4671
Validation Accuracy: 75.34%

Epoch 15/25 Training Loss: 0.4573
Validation Accuracy: 76.25%

Epoch 16/25 Training Loss: 0.5262
Validation Accuracy: 74.79%

