In [1]:
import random
import os
import glob
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, f1_score, matthews_corrcoef,
    confusion_matrix, ConfusionMatrixDisplay,
    classification_report, precision_recall_fscore_support
)

from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

import torch
import torch.nn as nn
import torch.nn.init as init
from torch.hub import load_state_dict_from_url
from torchvision import models


import math
from inspect import isfunction
import torch
import torch.nn as nn
import torch.nn.functional as F


import warnings
warnings.filterwarnings(
    "ignore", 
    category=UserWarning, 
    module="torchvision.models._utils"
)

In [2]:
class CFG:
    """Notebook-wide configuration constants."""
    # Training schedule
    EPOCHS = 30
    BATCH_SIZE = 32

    # Seed handed to the RNGs / splitters that read CFG.SEED
    SEED = 3170310

    # Input image geometry; IMAGE_SIZE is (height, width, channels)
    HEIGHT = 224
    WIDTH = 224
    CHANNELS = 3
    IMAGE_SIZE = (HEIGHT, WIDTH, CHANNELS)

In [3]:
def seed_everything(seed=CFG.SEED):
    """
    Seed the Python, NumPy and PyTorch random number generators and switch
    cuDNN to its deterministic, non-benchmarking mode.

    Parameters:
    ----------
    seed : int, default CFG.SEED
        Value passed to every seeding function.
    """
    seeders = (
        random.seed,                 # Python random module
        np.random.seed,              # NumPy
        torch.manual_seed,           # PyTorch CPU
        torch.cuda.manual_seed,      # PyTorch GPU
        torch.cuda.manual_seed_all,  # All GPUs
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Favour reproducibility over cuDNN autotuning
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(CFG.SEED)

In [4]:
# Dataset root and its train/test sub-directories (each keeps a trailing slash)
DATASET_PATH = "/kaggle/input/skin-cancer-malignant-vs-benign/"
TRAIN_PATH = DATASET_PATH + 'train/'
TEST_PATH = DATASET_PATH + 'test/'

In [5]:
%%time
# Collect the .jpg files found under the train and test roots.
# NOTE: glob is called without recursive=True, so "**" behaves like a plain "*"
# and matches exactly one directory level below each root.
train_images = glob.glob(f"{TRAIN_PATH}**/*.jpg")
test_images = glob.glob(f"{TEST_PATH}**/*.jpg")

CPU times: user 7.05 ms, sys: 3.02 ms, total: 10.1 ms
Wall time: 95 ms


In [6]:
# Number of image paths collected per split, and across both splits
train_size, test_size = len(train_images), len(test_images)
total = sum((train_size, test_size))

# Print the counts as a small report
for report_line in (
    f'train samples count:\t\t{train_size}',
    f'test samples count:\t\t{test_size}',
    '=======================================',
    f'TOTAL:\t\t\t\t{total}',
):
    print(report_line)

train samples count:		2637
test samples count:		660
TOTAL:				3297


In [7]:
def generate_labels(image_paths):
    """
    Derive one label per image from the name of the directory that holds it.

    Parameters:
    ----------
    image_paths : iterable of str
        Paths of the form ``.../<label>/<file>``.

    Returns
    -------
    list of str
        Parent-directory name of every path, in input order. A path without a
        parent directory yields an empty string.
    """
    # os.path handles the platform separator and repeated separators, which a
    # manual split on '/' does not.
    return [os.path.basename(os.path.dirname(path)) for path in image_paths]


def build_df(image_paths, labels):
    """
    Build a shuffled DataFrame of image paths with string and integer labels.

    Prints the unique labels and the encoded label distribution as a sanity
    check. No validation is performed: every label other than 'malignant'
    (including unexpected values) is encoded as 1.

    Parameters:
    ----------
    image_paths : sequence of str
        Image file paths.
    labels : sequence of str
        Label for each path, aligned with `image_paths`.

    Returns
    -------
    pandas.DataFrame
        Columns 'image_path', 'label' and 'label_encoded' (0 for 'malignant',
        1 otherwise), rows shuffled with CFG.SEED and re-indexed from 0.
    """
    df = pd.DataFrame({
        'image_path': image_paths,
        'label': labels
    })

    # Show which labels were found before they are collapsed to 0/1
    print("Unique labels before encoding:", df['label'].unique())

    # Vectorised encoding: avoids a Python call per row and also yields a
    # proper (empty) integer column when the frame has no rows.
    df['label_encoded'] = df['label'].ne('malignant').astype('int64')
    print("Label distribution after encoding:", df['label_encoded'].value_counts())

    return df.sample(frac=1, random_state=CFG.SEED).reset_index(drop=True)

In [8]:
# Build the DataFrames
# Labels are derived from the image paths by generate_labels; build_df adds the
# integer encoding, prints its sanity checks and shuffles the rows.
train_df = build_df(train_images, generate_labels(train_images))
test_df = build_df(test_images, generate_labels(test_images))

Unique labels before encoding: ['benign' 'malignant']
Label distribution after encoding: label_encoded
1    1440
0    1197
Name: count, dtype: int64
Unique labels before encoding: ['benign' 'malignant']
Label distribution after encoding: label_encoded
1    360
0    300
Name: count, dtype: int64


In [9]:
# Stratified hold-out: 15% of the training rows become the validation set.
# Only the two index arrays are kept; the matching label arrays are discarded.
_split_options = {
    "test_size": 0.15,
    "stratify": train_df.label_encoded,
    "random_state": CFG.SEED,
}
train_split_idx, val_split_idx, _, _ = train_test_split(
    train_df.index,
    train_df.label_encoded,
    **_split_options,
)

In [10]:
# Get new training and validation data.
# The split arrays contain values taken from train_df.index (labels), so rows
# are selected with the label-based .loc instead of the positional .iloc. The
# two agree only while the index is 0..n-1; .loc stays correct otherwise.
train_new_df = train_df.loc[train_split_idx].reset_index(drop=True)
val_df = train_df.loc[val_split_idx].reset_index(drop=True)

# View shapes
train_new_df.shape, val_df.shape

((2241, 3), (396, 3))

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torchvision.transforms as transforms

class SkinCancerDataset(Dataset):
    """
    Dataset that loads one RGB image per DataFrame row.

    Parameters:
    ----------
    df : pandas.DataFrame
        Frame with an 'image_path' and a 'label_encoded' column.
    transform : callable, optional
        Applied to the loaded PIL image before it is returned.
    """
    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        """Number of rows (images) in the frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """
        Load the image at positional index `idx`.

        Returns
        -------
        tuple
            (image, label) where image is the RGB image after `transform`
            (if any) and label is the row's 'label_encoded' value.
        """
        # Fetch the row once instead of indexing the frame per column
        row = self.df.iloc[idx]

        # convert() returns a new, fully loaded image, so the underlying file
        # can be closed as soon as the conversion is done.
        with Image.open(row['image_path']) as img:
            image = img.convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, row['label_encoded']

class AugmentedSkinCancerDataset(Dataset):
    """
    Dataset that exposes every image once unaugmented plus `augment_times`
    randomly augmented variants.

    Index layout: item ``i`` maps to row ``i // (augment_times + 1)``; within a
    row, slot 0 is the resized original and slots 1..augment_times apply the
    matching random augmentation pipeline.

    Parameters:
    ----------
    df : pandas.DataFrame
        Frame with an 'image_path' and a 'label_encoded' column.
    transform : callable, optional
        Base transform applied last to every item (original or augmented).
    augment_times : int, default 4
        Number of augmented variants per image.
    image_size : tuple of 2 int, default (224, 224)
        Size every image is resized to before augmentation.
    """
    def __init__(self, df, transform=None, augment_times=4, image_size=(224, 224)):
        self.df = df
        self.base_transform = transform
        self.augment_times = augment_times

        # Built once here rather than on every __getitem__ call
        self.resize = transforms.Resize(image_size)

        # One pipeline per augmented slot; each draws fresh random parameters
        # on every call.
        self.augmentations = [
            transforms.Compose([
                transforms.RandomChoice([
                    transforms.RandomRotation(90),
                    transforms.RandomHorizontalFlip(p=0.8),
                    transforms.RandomVerticalFlip(p=0.8),
                ]),
                transforms.ColorJitter(
                    brightness=0.2,
                    contrast=0.2,
                    saturation=0.2,
                    hue=0.1
                ),
                transforms.RandomAffine(
                    degrees=30,
                    translate=(0.1, 0.1),
                    scale=(0.8, 1.2),
                    shear=10
                ),
                transforms.RandomPerspective(distortion_scale=0.2, p=0.5),
            ]) for _ in range(augment_times)
        ]

    def __len__(self):
        """Number of rows times (augment_times + 1)."""
        return len(self.df) * (self.augment_times + 1)

    def __getitem__(self, idx):
        """
        Return the (image, label) pair for flat index `idx`.

        Returns
        -------
        tuple
            (image, label) where label is the row's 'label_encoded' value.
        """
        # Split the flat index into (source row, variant slot)
        original_idx, aug_idx = divmod(idx, self.augment_times + 1)
        row = self.df.iloc[original_idx]

        # convert() returns a new, fully loaded image, so the file can be
        # closed before the transforms run.
        with Image.open(row['image_path']) as img:
            image = self.resize(img.convert('RGB'))

        # Slot 0 is the untouched original; the others are augmented
        if aug_idx != 0:
            image = self.augmentations[aug_idx - 1](image)

        if self.base_transform:
            image = self.base_transform(image)

        return image, row['label_encoded']

# Shared geometry and per-channel normalisation constants for all pipelines
# (the mean/std values look like the usual ImageNet statistics - confirm).
_TARGET_HW = (224, 224)
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]


def _tensor_and_normalize():
    """Return fresh ToTensor + Normalize steps shared by every pipeline."""
    return [
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]


# Resize + light random augmentation + tensor conversion
train_transforms = transforms.Compose([
    transforms.Resize(_TARGET_HW),
    transforms.RandomVerticalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(90),
    *_tensor_and_normalize(),
])

# Deterministic pipeline: resize + tensor conversion
val_transforms = transforms.Compose([
    transforms.Resize(_TARGET_HW),
    *_tensor_and_normalize(),
])

# Tensor conversion only (no resize)
base_transforms = transforms.Compose(_tensor_and_normalize())

# Training data: each image plus 4 augmented variants, then the base transform
train_dataset = AugmentedSkinCancerDataset(
    df=train_new_df, transform=base_transforms, augment_times=4
)

# Validation and test data share the deterministic pipeline
val_dataset, test_dataset = (
    SkinCancerDataset(frame, transform=val_transforms)
    for frame in (val_df, test_df)
)

# Options common to all three loaders
_loader_options = dict(batch_size=CFG.BATCH_SIZE, num_workers=4)

# Only the training loader shuffles; validation/test keep dataset order
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_options)

In [12]:
# Compare the number of training rows with the length of the augmented dataset
# (each row is exposed augment_times + 1 times).
print(f"Original dataset size: {len(train_new_df)}")
print(f"Augmented dataset size: {len(train_dataset)}")

Original dataset size: 2241
Augmented dataset size: 11205


# Customised Blocks 

In [13]:
def round_channels(channels,
                   divisor=8):
    """
    Align a (possibly fractional) channel count to a multiple of `divisor`.

    The count is rounded to the nearest multiple, never below `divisor` itself,
    and bumped by one more `divisor` if rounding lost more than 10% of the
    requested channels.

    Parameters:
    ----------
    channels : int or float
        Original number of channels.
    divisor : int, default 8
        Alignment value.

    Returns
    -------
    int
        Aligned number of channels.
    """
    half_step = divisor / 2.0
    nearest_multiple = int(channels + half_step) // divisor * divisor
    # Never go below a single `divisor`
    aligned = nearest_multiple if nearest_multiple >= divisor else divisor
    # Rounding down must not remove more than 10% of the requested channels
    if float(aligned) < 0.9 * channels:
        aligned += divisor
    return aligned


class Swish(nn.Module):
    """
    Swish activation, x * sigmoid(x), from 'Searching for Activation Functions,'
    https://arxiv.org/abs/1710.05941.
    """
    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate


class HSigmoid(nn.Module):
    """
    Hard (piecewise-linear) sigmoid, relu6(x + 3) / 6, from 'Searching for MobileNetV3,'
    https://arxiv.org/abs/1905.02244.
    """
    def forward(self, x):
        shifted = x + 3.0
        # In-place clamp acts on the temporary `shifted`, not on the input
        clipped = F.relu6(shifted, inplace=True)
        return clipped / 6.0


class HSwish(nn.Module):
    """
    H-Swish activation, x * relu6(x + 3) / 6, from 'Searching for MobileNetV3,'
    https://arxiv.org/abs/1905.02244.

    Parameters:
    ----------
    inplace : bool
        Forwarded to relu6, which is applied to the temporary ``x + 3.0``.
    """
    def __init__(self, inplace=False):
        super().__init__()
        self.inplace = inplace

    def forward(self, x):
        clipped = F.relu6(x + 3.0, inplace=self.inplace)
        return (x * clipped) / 6.0


def get_activation_layer(activation):
    """
    Turn an activation specification into an activation module.

    Parameters:
    ----------
    activation : function, or str, or nn.Module
        A zero-argument factory function (called and its result returned), one
        of the names "relu", "relu6", "swish", "hswish", "sigmoid", "hsigmoid",
        or an already built module (returned unchanged).

    Returns
    -------
    nn.Module
        Activation layer.

    Raises
    ------
    NotImplementedError
        If `activation` is a string that is not one of the known names.
    """
    assert (activation is not None)

    # Factory function: let the caller build the layer
    if isfunction(activation):
        return activation()

    # Anything that is not a name must already be a module
    if not isinstance(activation, str):
        assert (isinstance(activation, nn.Module))
        return activation

    # Name -> lazy builder, so only the requested layer is instantiated
    builders = {
        "relu": lambda: nn.ReLU(inplace=True),
        "relu6": lambda: nn.ReLU6(inplace=True),
        "swish": lambda: Swish(),
        "hswish": lambda: HSwish(inplace=True),
        "sigmoid": lambda: nn.Sigmoid(),
        "hsigmoid": lambda: HSigmoid(),
    }
    if activation not in builders:
        raise NotImplementedError()
    return builders[activation]()


def conv1x1(in_channels,
            out_channels,
            stride=1,
            groups=1,
            bias=False):
    """
    Build a bare 1x1 convolution (no normalisation, no activation).

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    stride : int or tuple/list of 2 int, default 1
        Strides of the convolution.
    groups : int, default 1
        Number of groups.
    bias : bool, default False
        Whether the layer uses a bias vector.
    """
    conv_options = dict(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=1,
        stride=stride,
        groups=groups,
        bias=bias,
    )
    return nn.Conv2d(**conv_options)


def conv3x3(in_channels,
            out_channels,
            stride=1,
            padding=1,
            dilation=1,
            groups=1,
            bias=False):
    """
    Build a bare 3x3 convolution (no normalisation, no activation).

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    stride : int or tuple/list of 2 int, default 1
        Strides of the convolution.
    padding : int or tuple/list of 2 int, default 1
        Padding value for convolution layer.
    dilation : int or tuple/list of 2 int, default 1
        Dilation value for convolution layer.
    groups : int, default 1
        Number of groups.
    bias : bool, default False
        Whether the layer uses a bias vector.
    """
    conv_options = dict(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=3,
        stride=stride,
        padding=padding,
        dilation=dilation,
        groups=groups,
        bias=bias,
    )
    return nn.Conv2d(**conv_options)


def depthwise_conv3x3(channels,
                      stride):
    """
    Build a bare 3x3 depthwise convolution (one group per channel, padding 1,
    no bias).

    Parameters:
    ----------
    channels : int
        Number of input/output channels.
    stride : int or tuple/list of 2 int
        Strides of the convolution.
    """
    conv_options = dict(
        in_channels=channels,
        out_channels=channels,
        kernel_size=3,
        stride=stride,
        padding=1,
        groups=channels,
        bias=False,
    )
    return nn.Conv2d(**conv_options)


class ConvBlock(nn.Module):
    """
    Convolution followed by optional Batch normalization and optional activation.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    kernel_size : int or tuple/list of 2 int
        Convolution window size.
    stride : int or tuple/list of 2 int
        Strides of the convolution.
    padding : int or tuple/list of 2 int
        Padding value for convolution layer.
    dilation : int or tuple/list of 2 int, default 1
        Dilation value for convolution layer.
    groups : int, default 1
        Number of groups.
    bias : bool, default False
        Whether the layer uses a bias vector.
    use_bn : bool, default True
        Whether to use BatchNorm layer.
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function or name of activation function; None disables the
        activation step.
    """
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 dilation=1,
                 groups=1,
                 bias=False,
                 use_bn=True,
                 bn_eps=1e-5,
                 activation=(lambda: nn.ReLU(inplace=True))):
        super().__init__()
        # Flags consulted by forward() to skip the optional stages
        self.use_bn = use_bn
        self.activate = activation is not None

        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias)
        if use_bn:
            self.bn = nn.BatchNorm2d(out_channels, eps=bn_eps)
        if self.activate:
            self.activ = get_activation_layer(activation)

    def forward(self, x):
        out = self.conv(x)
        out = self.bn(out) if self.use_bn else out
        out = self.activ(out) if self.activate else out
        return out


def conv1x1_block(in_channels,
                  out_channels,
                  stride=1,
                  padding=0,
                  groups=1,
                  bias=False,
                  use_bn=True,
                  bn_eps=1e-5,
                  activation=(lambda: nn.ReLU(inplace=True))):
    """
    ConvBlock preset with a 1x1 kernel.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    stride : int or tuple/list of 2 int, default 1
        Strides of the convolution.
    padding : int or tuple/list of 2 int, default 0
        Padding value for convolution layer.
    groups : int, default 1
        Number of groups.
    bias : bool, default False
        Whether the layer uses a bias vector.
    use_bn : bool, default True
        Whether to use BatchNorm layer.
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function or name of activation function.
    """
    # Positional order follows ConvBlock: in, out, kernel_size, stride, padding
    return ConvBlock(
        in_channels,
        out_channels,
        1,
        stride,
        padding,
        groups=groups,
        bias=bias,
        use_bn=use_bn,
        bn_eps=bn_eps,
        activation=activation)


def conv3x3_block(in_channels,
                  out_channels,
                  stride=1,
                  padding=1,
                  dilation=1,
                  groups=1,
                  bias=False,
                  use_bn=True,
                  bn_eps=1e-5,
                  activation=(lambda: nn.ReLU(inplace=True))):
    """
    ConvBlock preset with a 3x3 kernel.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    stride : int or tuple/list of 2 int, default 1
        Strides of the convolution.
    padding : int or tuple/list of 2 int, default 1
        Padding value for convolution layer.
    dilation : int or tuple/list of 2 int, default 1
        Dilation value for convolution layer.
    groups : int, default 1
        Number of groups.
    bias : bool, default False
        Whether the layer uses a bias vector.
    use_bn : bool, default True
        Whether to use BatchNorm layer.
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function or name of activation function.
    """
    # Positional order follows ConvBlock: in, out, kernel_size, stride, padding
    return ConvBlock(
        in_channels,
        out_channels,
        3,
        stride,
        padding,
        dilation=dilation,
        groups=groups,
        bias=bias,
        use_bn=use_bn,
        bn_eps=bn_eps,
        activation=activation)


def conv5x5_block(in_channels,
                  out_channels,
                  stride=1,
                  padding=2,
                  dilation=1,
                  groups=1,
                  bias=False,
                  bn_eps=1e-5,
                  activation=(lambda: nn.ReLU(inplace=True))):
    """
    ConvBlock preset with a 5x5 kernel. `use_bn` is not exposed, so the
    ConvBlock default applies.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    stride : int or tuple/list of 2 int, default 1
        Strides of the convolution.
    padding : int or tuple/list of 2 int, default 2
        Padding value for convolution layer.
    dilation : int or tuple/list of 2 int, default 1
        Dilation value for convolution layer.
    groups : int, default 1
        Number of groups.
    bias : bool, default False
        Whether the layer uses a bias vector.
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function or name of activation function.
    """
    # Positional order follows ConvBlock: in, out, kernel_size, stride, padding
    return ConvBlock(
        in_channels,
        out_channels,
        5,
        stride,
        padding,
        dilation=dilation,
        groups=groups,
        bias=bias,
        bn_eps=bn_eps,
        activation=activation)


def conv7x7_block(in_channels,
                  out_channels,
                  stride=1,
                  padding=3,
                  bias=False,
                  use_bn=True,
                  activation=(lambda: nn.ReLU(inplace=True))):
    """
    ConvBlock preset with a 7x7 kernel. Dilation, groups and `bn_eps` are not
    exposed, so the ConvBlock defaults apply.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    stride : int or tuple/list of 2 int, default 1
        Strides of the convolution.
    padding : int or tuple/list of 2 int, default 3
        Padding value for convolution layer.
    bias : bool, default False
        Whether the layer uses a bias vector.
    use_bn : bool, default True
        Whether to use BatchNorm layer.
    activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function or name of activation function.
    """
    # Positional order follows ConvBlock: in, out, kernel_size, stride, padding
    return ConvBlock(
        in_channels,
        out_channels,
        7,
        stride,
        padding,
        bias=bias,
        use_bn=use_bn,
        activation=activation)


def dwconv_block(in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=1,
                 dilation=1,
                 bias=False,
                 use_bn=True,
                 bn_eps=1e-5,
                 activation=(lambda: nn.ReLU(inplace=True))):
    """
    ConvBlock preset for depthwise convolution: the number of groups is set to
    `out_channels`.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels (also used as the group count).
    kernel_size : int or tuple/list of 2 int
        Convolution window size.
    stride : int or tuple/list of 2 int, default 1
        Strides of the convolution.
    padding : int or tuple/list of 2 int, default 1
        Padding value for convolution layer.
    dilation : int or tuple/list of 2 int, default 1
        Dilation value for convolution layer.
    bias : bool, default False
        Whether the layer uses a bias vector.
    use_bn : bool, default True
        Whether to use BatchNorm layer.
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function or name of activation function.
    """
    # Positional order follows ConvBlock: in, out, kernel_size, stride, padding
    return ConvBlock(
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        dilation=dilation,
        groups=out_channels,
        bias=bias,
        use_bn=use_bn,
        bn_eps=bn_eps,
        activation=activation)


def dwconv3x3_block(in_channels,
                    out_channels,
                    stride=1,
                    padding=1,
                    dilation=1,
                    bias=False,
                    bn_eps=1e-5,
                    activation=(lambda: nn.ReLU(inplace=True))):
    """
    dwconv_block preset with a 3x3 kernel.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    stride : int or tuple/list of 2 int, default 1
        Strides of the convolution.
    padding : int or tuple/list of 2 int, default 1
        Padding value for convolution layer.
    dilation : int or tuple/list of 2 int, default 1
        Dilation value for convolution layer.
    bias : bool, default False
        Whether the layer uses a bias vector.
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function or name of activation function.
    """
    # Positional order follows dwconv_block: in, out, kernel_size, stride, padding
    return dwconv_block(
        in_channels,
        out_channels,
        3,
        stride,
        padding,
        dilation=dilation,
        bias=bias,
        bn_eps=bn_eps,
        activation=activation)


def dwconv5x5_block(in_channels,
                    out_channels,
                    stride=1,
                    padding=2,
                    dilation=1,
                    bias=False,
                    bn_eps=1e-5,
                    activation=(lambda: nn.ReLU(inplace=True))):
    """
    dwconv_block preset with a 5x5 kernel.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    stride : int or tuple/list of 2 int, default 1
        Strides of the convolution.
    padding : int or tuple/list of 2 int, default 2
        Padding value for convolution layer.
    dilation : int or tuple/list of 2 int, default 1
        Dilation value for convolution layer.
    bias : bool, default False
        Whether the layer uses a bias vector.
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function or name of activation function.
    """
    # Positional order follows dwconv_block: in, out, kernel_size, stride, padding
    return dwconv_block(
        in_channels,
        out_channels,
        5,
        stride,
        padding,
        dilation=dilation,
        bias=bias,
        bn_eps=bn_eps,
        activation=activation)


class DwsConvBlock(nn.Module):
    """
    Depthwise separable convolution: a depthwise block (in_channels ->
    in_channels) followed by a 1x1 pointwise block (in_channels ->
    out_channels), each with its own BatchNorm and activation.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    kernel_size : int or tuple/list of 2 int
        Convolution window size.
    stride : int or tuple/list of 2 int
        Strides of the convolution.
    padding : int or tuple/list of 2 int
        Padding value for convolution layer.
    dilation : int or tuple/list of 2 int, default 1
        Dilation value for convolution layer.
    bias : bool, default False
        Whether the layer uses a bias vector.
    use_bn : bool, default True
        Whether to use BatchNorm layer.
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    dw_activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function after the depthwise convolution block.
    pw_activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function after the pointwise convolution block.
    """
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 dilation=1,
                 bias=False,
                 use_bn=True,
                 bn_eps=1e-5,
                 dw_activation=(lambda: nn.ReLU(inplace=True)),
                 pw_activation=(lambda: nn.ReLU(inplace=True))):
        super().__init__()
        # Settings applied identically to both stages
        shared = dict(bias=bias, use_bn=use_bn, bn_eps=bn_eps)

        # Spatial filtering, channel count unchanged
        self.dw_conv = dwconv_block(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            activation=dw_activation,
            **shared)
        # Channel mixing to the requested width
        self.pw_conv = conv1x1_block(
            in_channels=in_channels,
            out_channels=out_channels,
            activation=pw_activation,
            **shared)

    def forward(self, x):
        return self.pw_conv(self.dw_conv(x))


def dwsconv3x3_block(in_channels,
                     out_channels,
                     stride=1,
                     padding=1,
                     dilation=1,
                     bias=False,
                     bn_eps=1e-5,
                     dw_activation=(lambda: nn.ReLU(inplace=True)),
                     pw_activation=(lambda: nn.ReLU(inplace=True))):
    """
    DwsConvBlock preset with a 3x3 depthwise kernel.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    stride : int or tuple/list of 2 int, default 1
        Strides of the convolution.
    padding : int or tuple/list of 2 int, default 1
        Padding value for convolution layer.
    dilation : int or tuple/list of 2 int, default 1
        Dilation value for convolution layer.
    bias : bool, default False
        Whether the layer uses a bias vector.
    bn_eps : float, default 1e-5
        Small float added to variance in Batch norm.
    dw_activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function after the depthwise convolution block.
    pw_activation : function or str or None, default nn.ReLU(inplace=True)
        Activation function after the pointwise convolution block.
    """
    # Positional order follows DwsConvBlock: in, out, kernel_size, stride, padding
    return DwsConvBlock(
        in_channels,
        out_channels,
        3,
        stride,
        padding,
        dilation=dilation,
        bias=bias,
        bn_eps=bn_eps,
        dw_activation=dw_activation,
        pw_activation=pw_activation)


class PreConvBlock(nn.Module):
    """
    Pre-activation convolution block: Batch normalization, optional ReLU, then
    the convolution.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    kernel_size : int or tuple/list of 2 int
        Convolution window size.
    stride : int or tuple/list of 2 int
        Strides of the convolution.
    padding : int or tuple/list of 2 int
        Padding value for convolution layer.
    dilation : int or tuple/list of 2 int, default 1
        Dilation value for convolution layer.
    bias : bool, default False
        Whether the layer uses a bias vector.
    return_preact : bool, default False
        Whether to also return the tensor fed into the convolution (the
        normalised and, if enabled, activated input). It's used by PreResNet.
    activate : bool, default True
        Whether to apply the ReLU before the convolution.
    """
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 dilation=1,
                 bias=False,
                 return_preact=False,
                 activate=True):
        super().__init__()
        self.activate = activate
        self.return_preact = return_preact

        # Normalisation acts on the block input, hence `in_channels`
        self.bn = nn.BatchNorm2d(in_channels)
        if activate:
            self.activ = nn.ReLU(inplace=True)
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias)

    def forward(self, x):
        x = self.bn(x)
        if self.activate:
            x = self.activ(x)
        conv_out = self.conv(x)
        # `x` now holds the pre-convolution tensor
        return (conv_out, x) if self.return_preact else conv_out


def pre_conv1x1_block(in_channels,
                      out_channels,
                      stride=1,
                      bias=False,
                      return_preact=False,
                      activate=True):
    """
    PreConvBlock preset with a 1x1 kernel and no padding.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    stride : int or tuple/list of 2 int, default 1
        Strides of the convolution.
    bias : bool, default False
        Whether the layer uses a bias vector.
    return_preact : bool, default False
        Whether return pre-activation.
    activate : bool, default True
        Whether activate the convolution block.
    """
    # Positional order follows PreConvBlock: in, out, kernel_size, stride, padding
    return PreConvBlock(
        in_channels,
        out_channels,
        1,
        stride,
        0,
        bias=bias,
        return_preact=return_preact,
        activate=activate)


def pre_conv3x3_block(in_channels,
                      out_channels,
                      stride=1,
                      padding=1,
                      dilation=1,
                      return_preact=False,
                      activate=True):
    """
    Build a PreConvBlock configured with a 3x3 convolution kernel.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    stride : int or tuple/list of 2 int, default 1
        Strides of the convolution.
    padding : int or tuple/list of 2 int, default 1
        Padding value for the convolution layer.
    dilation : int or tuple/list of 2 int, default 1
        Dilation value for the convolution layer.
    return_preact : bool, default False
        Whether the block also returns the tensor fed into the convolution.
    activate : bool, default True
        Whether the block applies its activation before the convolution.
    """
    # Everything except the kernel size is forwarded from the caller; `bias` is left at
    # PreConvBlock's own default because this factory does not expose it.
    block_cfg = dict(
        stride=stride,
        padding=padding,
        dilation=dilation,
        return_preact=return_preact,
        activate=activate)
    return PreConvBlock(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=3,
        **block_cfg)


def channel_shuffle(x,
                    groups):
    """
    Channel shuffle operation from 'ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices,'
    https://arxiv.org/abs/1707.01083.

    The channel axis is viewed as (groups, channels_per_group), those two axes are swapped, and the
    result is flattened back to the original 4D shape. Divisibility of the channel count by `groups`
    is not checked here.

    Parameters:
    ----------
    x : Tensor
        Input tensor with 4 dimensions (unpacked as batch, channels, height, width).
    groups : int
        Number of groups.

    Returns
    -------
    Tensor
        Tensor of the same shape as `x` with its channels interleaved across groups.
    """
    n, c, h, w = x.shape
    per_group = c // groups
    grouped = x.view(n, groups, per_group, h, w)
    # The transpose yields a non-contiguous tensor, so it has to be materialized before `view`.
    interleaved = grouped.transpose(1, 2).contiguous()
    return interleaved.view(n, c, h, w)


class ChannelShuffle(nn.Module):
    """
    Module wrapper around `channel_shuffle` that remembers the number of groups.

    Parameters:
    ----------
    channels : int
        Number of channels. Only used to validate divisibility at construction time.
    groups : int
        Number of groups.

    Raises
    ------
    ValueError
        If `channels` is not divisible by `groups`.
    """
    def __init__(self,
                 channels,
                 groups):
        super().__init__()
        remainder = channels % groups
        if remainder:
            raise ValueError('channels must be divisible by groups')
        self.groups = groups

    def forward(self, x):
        return channel_shuffle(x, groups=self.groups)


def channel_shuffle2(x,
                     groups):
    """
    Channel shuffle operation from 'ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices,'
    https://arxiv.org/abs/1707.01083. The alternative version.

    Here the channel axis is viewed as (channels_per_group, groups) before the two axes are swapped,
    i.e. the opposite factorization to `channel_shuffle`. Divisibility of the channel count by
    `groups` is not checked here.

    Parameters:
    ----------
    x : Tensor
        Input tensor with 4 dimensions (unpacked as batch, channels, height, width).
    groups : int
        Number of groups.

    Returns
    -------
    Tensor
        Tensor of the same shape as `x` with its channels regrouped.
    """
    n, c, h, w = x.shape
    per_group = c // groups
    paired = x.view(n, per_group, groups, h, w)
    # The transpose yields a non-contiguous tensor, so it has to be materialized before `view`.
    regrouped = paired.transpose(1, 2).contiguous()
    return regrouped.view(n, c, h, w)


class ChannelShuffle2(nn.Module):
    """
    Module wrapper around `channel_shuffle2` (the alternative shuffle) that remembers the number of groups.

    Parameters:
    ----------
    channels : int
        Number of channels. Only used to validate divisibility at construction time.
    groups : int
        Number of groups.

    Raises
    ------
    ValueError
        If `channels` is not divisible by `groups`.
    """
    def __init__(self,
                 channels,
                 groups):
        super().__init__()
        remainder = channels % groups
        if remainder:
            raise ValueError('channels must be divisible by groups')
        self.groups = groups

    def forward(self, x):
        return channel_shuffle2(x, groups=self.groups)


class SEBlock(nn.Module):
    """
    Squeeze-and-Excitation block from 'Squeeze-and-Excitation Networks,' https://arxiv.org/abs/1709.01507.

    The input is pooled to 1x1, passed through two 1x1 convolutions with an activation in between,
    gated by the output activation, and the resulting per-channel weights multiply the input.

    Parameters:
    ----------
    channels : int
        Number of channels.
    reduction : int, default 16
        Squeeze reduction value.
    round_mid : bool, default False
        Whether to derive the middle channel count with `round_channels(channels / reduction)`
        instead of integer division (presumably rounds to a multiple of 8 - confirm against
        `round_channels`).
    mid_activation : default factory returning nn.ReLU(inplace=True)
        Activation specification after the first convolution; handed to `get_activation_layer`.
    out_activation : default factory returning nn.Sigmoid()
        Activation specification after the last convolution; handed to `get_activation_layer`.
    """
    def __init__(self,
                 channels,
                 reduction=16,
                 round_mid=False,
                 mid_activation=(lambda: nn.ReLU(inplace=True)),
                 out_activation=(lambda: nn.Sigmoid())):
        super().__init__()
        if round_mid:
            mid_channels = round_channels(float(channels) / reduction)
        else:
            mid_channels = channels // reduction

        self.pool = nn.AdaptiveAvgPool2d(output_size=1)
        self.conv1 = conv1x1(in_channels=channels, out_channels=mid_channels, bias=True)
        self.activ = get_activation_layer(mid_activation)
        self.conv2 = conv1x1(in_channels=mid_channels, out_channels=channels, bias=True)
        self.sigmoid = get_activation_layer(out_activation)

    def forward(self, x):
        squeezed = self.pool(x)
        excited = self.conv2(self.activ(self.conv1(squeezed)))
        gate = self.sigmoid(excited)
        return x * gate


class IBN(nn.Module):
    """
    Instance-Batch Normalization block from 'Two at Once: Enhancing Learning and Generalization Capacities via IBN-Net,'
    https://arxiv.org/abs/1807.09441.

    The channels are split into two consecutive sections; one is instance-normalized, the other
    batch-normalized, and the two results are concatenated back along the channel axis.

    Parameters:
    ----------
    channels : int
        Number of channels.
    first_fraction : float, default 0.5
        Fraction of the channels (rounded down) that forms the first section.
    inst_first : bool, default True
        Whether instance normalization is applied to the first section (and batch normalization to
        the second) rather than the other way round.
    """
    def __init__(self,
                 channels,
                 first_fraction=0.5,
                 inst_first=True):
        super().__init__()
        self.inst_first = inst_first
        head = int(math.floor(channels * first_fraction))
        tail = channels - head
        self.split_sections = [head, tail]

        # The registration order follows the section order, which also fixes the state_dict key order.
        if inst_first:
            self.inst_norm = nn.InstanceNorm2d(num_features=head, affine=True)
            self.batch_norm = nn.BatchNorm2d(num_features=tail)
        else:
            self.batch_norm = nn.BatchNorm2d(num_features=head)
            self.inst_norm = nn.InstanceNorm2d(num_features=tail, affine=True)

    def forward(self, x):
        first, second = torch.split(x, split_size_or_sections=self.split_sections, dim=1)
        if self.inst_first:
            first_norm, second_norm = self.inst_norm, self.batch_norm
        else:
            first_norm, second_norm = self.batch_norm, self.inst_norm
        first = first_norm(first.contiguous())
        second = second_norm(second.contiguous())
        return torch.cat((first, second), dim=1)


class Identity(nn.Module):
    """
    Pass-through block: `forward` hands back the very object it was given.
    """
    def __init__(self):
        super().__init__()

    def forward(self, x):
        unchanged = x
        return unchanged


class DualPathSequential(nn.Sequential):
    """
    Sequential container whose modules carry a pair of values (x1, x2) from one to the next.
    Modules run in the order they were added.

    Parameters:
    ----------
    return_two : bool, default True
        Whether `forward` returns both values or only the first one.
    first_ordinals : int, default 0
        Number of leading modules that are invoked through the ordinal scheme.
    last_ordinals : int, default 0
        Number of trailing modules that are invoked through the ordinal scheme.
    dual_path_scheme : function
        Callable `(module, x1, x2) -> (x1, x2)` used for the regular modules. By default the module
        is called with both values.
    dual_path_scheme_ordinal : function
        Callable `(module, x1, x2) -> (x1, x2)` used for the ordinal modules. By default the module
        is applied to `x1` only and `x2` is passed along untouched.
    """
    def __init__(self,
                 return_two=True,
                 first_ordinals=0,
                 last_ordinals=0,
                 dual_path_scheme=(lambda module, x1, x2: module(x1, x2)),
                 dual_path_scheme_ordinal=(lambda module, x1, x2: (module(x1), x2))):
        super().__init__()
        self.return_two = return_two
        self.first_ordinals = first_ordinals
        self.last_ordinals = last_ordinals
        self.dual_path_scheme = dual_path_scheme
        self.dual_path_scheme_ordinal = dual_path_scheme_ordinal

    def forward(self, x1, x2=None):
        modules = list(self._modules.values())
        tail_start = len(modules) - self.last_ordinals
        for idx, module in enumerate(modules):
            is_ordinal = idx < self.first_ordinals or idx >= tail_start
            scheme = self.dual_path_scheme_ordinal if is_ordinal else self.dual_path_scheme
            x1, x2 = scheme(module, x1, x2)
        return (x1, x2) if self.return_two else x1


class Concurrent(nn.Sequential):
    """
    Container that feeds the same input to every child module and joins their outputs.
    It reuses the sequential container only for module bookkeeping.

    Parameters:
    ----------
    axis : int, default 1
        The axis along which the outputs are joined.
    stack : bool, default False
        Whether to join with `torch.stack` (new dimension) instead of `torch.cat`.
    """
    def __init__(self,
                 axis=1,
                 stack=False):
        super().__init__()
        self.axis = axis
        self.stack = stack

    def forward(self, x):
        branch_outs = tuple(module(x) for module in self._modules.values())
        combine = torch.stack if self.stack else torch.cat
        return combine(branch_outs, dim=self.axis)


class ParametricSequential(nn.Sequential):
    """
    Sequential container that forwards the same keyword arguments to every child module.
    Modules run in the order they were added.
    """
    def __init__(self, *args):
        super().__init__(*args)

    def forward(self, x, **kwargs):
        out = x
        for layer in self._modules.values():
            out = layer(out, **kwargs)
        return out


class ParametricConcurrent(nn.Sequential):
    """
    Container that calls every child module with the same input and keyword arguments and
    concatenates the outputs.

    Parameters:
    ----------
    axis : int, default 1
        The axis along which the outputs are concatenated.
    """
    def __init__(self, axis=1):
        super().__init__()
        self.axis = axis

    def forward(self, x, **kwargs):
        pieces = tuple(module(x, **kwargs) for module in self._modules.values())
        return torch.cat(pieces, dim=self.axis)


class Hourglass(nn.Module):
    """
    A hourglass block: a chain of down modules, then a chain of up modules, with a skip connection
    added in at every level on the way up.

    Parameters:
    ----------
    down_seq : nn.Sequential
        Down modules as sequential.
    up_seq : nn.Sequential
        Up modules as sequential. Must have as many modules as `down_seq`.
    skip_seq : nn.Sequential
        Skip connection modules as sequential. Must have as many modules as `down_seq`.
    merge_type : str, default 'add'
        How up and skip outputs are merged. Only 'add' is accepted.
    return_first_skip : bool, default False
        Whether to additionally return the last skip output that was computed (the one produced by
        `skip_seq[0]`). Used in ResAttNet.
    """
    def __init__(self,
                 down_seq,
                 up_seq,
                 skip_seq,
                 merge_type="add",
                 return_first_skip=False):
        super().__init__()
        depth = len(down_seq)
        assert len(up_seq) == depth
        assert len(skip_seq) == depth
        assert merge_type in ["add"]
        self.merge_type = merge_type
        self.return_first_skip = return_first_skip
        self.depth = depth

        self.down_seq = down_seq
        self.up_seq = up_seq
        self.skip_seq = skip_seq

    def forward(self, x, **kwargs):
        # Remember the input of every level (index 0 is the block input) for the skip connections.
        level_inputs = [x]
        for down in self.down_seq._modules.values():
            x = down(x)
            level_inputs.append(x)

        y = None
        last_step = len(level_inputs) - 1
        for step in range(last_step + 1):
            if step > 0:
                level = self.depth - step
                y = self.skip_seq[level](level_inputs[level])
                if (y is not None) and (self.merge_type == "add"):
                    x = x + y
            if step < last_step:
                x = self.up_seq[self.depth - 1 - step](x)

        return (x, y) if self.return_first_skip else x


class SesquialteralHourglass(nn.Module):
    """
    A sesquialteral hourglass block: a first down pass, an up pass merged with the first set of
    skip outputs, and a second down pass merged with the second set of skip outputs.

    Parameters:
    ----------
    down1_seq : nn.Sequential
        The first down modules as sequential.
    skip1_seq : nn.Sequential
        The first skip connection modules as sequential (one more module than `down1_seq`).
    up_seq : nn.Sequential
        Up modules as sequential.
    skip2_seq : nn.Sequential
        The second skip connection modules as sequential (same length as `skip1_seq`).
    down2_seq : nn.Sequential
        The second down modules as sequential.
    merge_type : str, default 'cat'
        How a path output and a skip output are merged: 'cat' (channel concatenation) or 'add'.
    """
    def __init__(self,
                 down1_seq,
                 skip1_seq,
                 up_seq,
                 skip2_seq,
                 down2_seq,
                 merge_type="cat"):
        super().__init__()
        depth = len(down1_seq)
        assert depth == len(up_seq)
        assert depth == len(down2_seq)
        assert len(skip1_seq) == len(skip2_seq)
        assert depth == len(skip1_seq) - 1
        assert merge_type in ["cat", "add"]
        self.merge_type = merge_type
        self.depth = depth

        self.down1_seq = down1_seq
        self.skip1_seq = skip1_seq
        self.up_seq = up_seq
        self.skip2_seq = skip2_seq
        self.down2_seq = down2_seq

    def _merge(self, x, y):
        # A missing skip output leaves the path value untouched.
        if y is None:
            return x
        if self.merge_type == "cat":
            return torch.cat((x, y), dim=1)
        if self.merge_type == "add":
            return x + y
        return x

    def forward(self, x, **kwargs):
        # First down pass: collect one skip output per level, including the block input level.
        first_skips = [self.skip1_seq[0](x)]
        for level in range(self.depth):
            x = self.down1_seq[level](x)
            first_skips.append(self.skip1_seq[level + 1](x))

        # Up pass starts from the deepest first-pass skip output.
        x = first_skips[self.depth]
        second_skips = [self.skip2_seq[0](x)]
        for level in range(self.depth):
            x = self.up_seq[level](x)
            x = self._merge(x, first_skips[self.depth - 1 - level])
            second_skips.append(self.skip2_seq[level + 1](x))

        # The last second-pass skip module is applied to the path value again before descending.
        x = self.skip2_seq[self.depth](x)
        for level in range(self.depth):
            x = self.down2_seq[level](x)
            x = self._merge(x, second_skips[self.depth - 1 - level])
        return x


class MultiOutputSequential(nn.Sequential):
    """
    Sequential container that also collects intermediate outputs.
    Modules run in the order they were added; the output of every module carrying a truthy
    `do_output` attribute is recorded.
    """
    def __init__(self):
        super().__init__()

    def forward(self, x):
        tapped = []
        for module in self._modules.values():
            x = module(x)
            if getattr(module, "do_output", False):
                tapped.append(x)
        # The final output always comes first, followed by the recorded outputs in execution order.
        return [x, *tapped]


class Flatten(nn.Module):
    """
    Flatten module: keeps the first dimension and collapses all remaining ones into a single axis.
    """

    def forward(self, x):
        batch = x.size(0)
        return x.view(batch, -1)

# SqueezeNet

# Train Model Function

In [14]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over `loader`: a training pass when `optimizer` is given, otherwise evaluation.

    Returns a tuple (mean per-batch loss, list of true labels, list of predicted class indices).
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is the same as eval()
    total_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.set_grad_enabled(is_training):
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)

            if is_training:
                optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            if is_training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Guard against an empty loader so the mean does not divide by zero.
    mean_loss = total_loss / max(len(loader), 1)
    return mean_loss, all_labels, all_preds


def train_model(model, train_loader, val_loader, criterion, optimizer,
                num_epochs=CFG.EPOCHS, patience=7, min_lr=1e-6,
                save_path='best_model.pth', early_stopping=False):
    """
    Train `model`, validate after every epoch, and checkpoint the best validation accuracy.

    A ReduceLROnPlateau scheduler (mode='max', factor=0.1, patience=3) is created internally and
    stepped on the validation accuracy. Training stops early once the learning rate of the first
    parameter group has dropped to `min_lr` or below. Per-epoch metrics are printed.

    Parameters:
    ----------
    model : nn.Module
        Model to train. Batches are moved to the device of the model's parameters.
    train_loader, val_loader : iterable of (images, labels) batches
        Training and validation data.
    criterion : callable
        Loss applied as `criterion(outputs, labels)`.
    optimizer : torch.optim.Optimizer
        Optimizer that updates the model's parameters.
    num_epochs : int, default CFG.EPOCHS
        Maximum number of epochs.
    patience : int, default 7
        Number of consecutive epochs without a new best validation accuracy tolerated before
        stopping. Only consulted when `early_stopping` is True.
    min_lr : float, default 1e-6
        Lower bound handed to the scheduler and threshold for the learning-rate stop.
    save_path : str, default 'best_model.pth'
        Where the state_dict of the best model is saved.
    early_stopping : bool, default False
        Whether to stop once `patience` is exhausted. Off by default, which preserves the previous
        behaviour where `patience` had no effect.

    Returns
    -------
    float
        Best validation accuracy observed.
    """
    best_accuracy = 0.0

    # Follow the model's own device instead of guessing from CUDA availability; otherwise a model
    # that was never moved to the GPU would receive CUDA batches and fail.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.1, patience=3, min_lr=min_lr
    )

    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        train_loss, train_labels, train_preds = _run_epoch(
            model, train_loader, criterion, device, optimizer=optimizer)
        train_accuracy = accuracy_score(train_labels, train_preds)
        train_f1 = f1_score(train_labels, train_preds, average='weighted')

        val_loss, val_labels, val_preds = _run_epoch(model, val_loader, criterion, device)
        val_accuracy = accuracy_score(val_labels, val_preds)
        val_f1 = f1_score(val_labels, val_preds, average='weighted')

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, Train F1: {train_f1:.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")

        # Step the scheduler on validation accuracy and report only actual changes.
        old_lr = optimizer.param_groups[0]['lr']
        scheduler.step(val_accuracy)
        new_lr = optimizer.param_groups[0]['lr']
        if old_lr != new_lr:
            print(f"Learning rate decreased from {old_lr:.6f} to {new_lr:.6f}")

        # Checkpoint on every strict improvement of the validation accuracy.
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), save_path)
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if early_stopping and epochs_without_improvement >= patience:
            print(f"\nEarly stopping triggered after {epoch+1} epochs!")
            break

        # Once the scheduler has reached its floor, further epochs cannot lower the LR any more.
        if new_lr <= min_lr:
            print(f"\nLearning rate {new_lr:.6f} is too small. Stopping training!")
            break

    print(f"Best Validation Accuracy: {best_accuracy:.4f}")
    return best_accuracy

# Experiment 1 [EfficientNet + SqueezeNet + CBAM + SE Block]

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class ChannelAttention(nn.Module):
    """
    Channel attention gate: global average- and max-pooled descriptors go through one shared
    two-layer 1x1-convolution bottleneck, are summed, and squashed with a sigmoid.

    Parameters:
    ----------
    channels : int
        Number of input (and output) channels.
    reduction : int, default 16
        Divisor used to size the bottleneck (`channels // reduction` hidden channels).
    """
    def __init__(self, channels, reduction=16):
        super().__init__()
        hidden = channels // reduction
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        self.fc = nn.Sequential(
            nn.Conv2d(channels, hidden, 1, bias=False),
            nn.ReLU(inplace=True),
            nn.Conv2d(hidden, channels, 1, bias=False),
        )

    def forward(self, x):
        # The same bottleneck is applied to both pooled descriptors, average first.
        descriptors = [self.fc(pool(x)) for pool in (self.avg_pool, self.max_pool)]
        return torch.sigmoid(descriptors[0] + descriptors[1])

class SpatialAttention(nn.Module):
    """
    Spatial attention gate: the channel-wise mean and max maps are stacked, convolved down to a
    single map, and squashed with a sigmoid.

    Parameters:
    ----------
    kernel_size : int, default 7
        Kernel size of the convolution; padding is `kernel_size // 2`.
    """
    def __init__(self, kernel_size=7):
        super().__init__()
        same_pad = kernel_size // 2
        self.conv = nn.Conv2d(2, 1, kernel_size, padding=same_pad, bias=False)

    def forward(self, x):
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True).values
        stats = torch.cat([channel_mean, channel_max], dim=1)
        return torch.sigmoid(self.conv(stats))

class CBAM(nn.Module):
    """
    Attention block that rescales the input by a channel attention gate and then rescales the
    result by a spatial attention gate.

    Parameters:
    ----------
    channels : int
        Number of channels, forwarded to ChannelAttention.
    reduction : int, default 16
        Reduction value, forwarded to ChannelAttention.
    kernel_size : int, default 7
        Kernel size, forwarded to SpatialAttention.
    """
    def __init__(self, channels, reduction=16, kernel_size=7):
        super().__init__()
        self.channel_att = ChannelAttention(channels, reduction)
        self.spatial_att = SpatialAttention(kernel_size)

    def forward(self, x):
        channel_refined = x * self.channel_att(x)
        # The spatial gate is computed from the already channel-refined features.
        return channel_refined * self.spatial_att(channel_refined)

class HybridFireUnit(nn.Module):
    """
    Fire-style unit: a 1x1 squeeze block feeds parallel 1x1 and 3x3 expand blocks whose outputs are
    concatenated along the channel axis and refined by CBAM.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    squeeze_channels : int
        Number of output channels of the squeeze block.
    expand_channels : int
        Number of output channels of each expand block; the unit outputs twice as many.
    residual : bool, default True
        Whether to add the input to the output. Only honoured when `in_channels` equals
        `expand_channels * 2`.
    """
    def __init__(self, in_channels, squeeze_channels, expand_channels, residual=True):
        super().__init__()
        out_channels = expand_channels * 2
        self.residual = residual and (in_channels == out_channels)

        self.squeeze = conv1x1_block(
            in_channels=in_channels, out_channels=squeeze_channels, activation="relu")
        self.expand1x1 = conv1x1_block(
            in_channels=squeeze_channels, out_channels=expand_channels, activation="relu")
        self.expand3x3 = conv3x3_block(
            in_channels=squeeze_channels, out_channels=expand_channels, activation="relu")

        self.cbam = CBAM(out_channels)

    def forward(self, x):
        squeezed = self.squeeze(x)
        expanded = torch.cat([self.expand1x1(squeezed), self.expand3x3(squeezed)], dim=1)
        attended = self.cbam(expanded)
        if not self.residual:
            return attended
        return attended + x

class EfficientFireBlock(nn.Module):
    """
    Stage building block: a 1x1 convolution block, a HybridFireUnit, an SE block, and an optional
    max-pooling step.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    squeeze_channels : int
        Number of output channels of the leading 1x1 block; the fire unit squeezes to a quarter of
        this value.
    expand_channels : int
        Channel count handed to the SE block; each fire-unit expand branch gets
        `expand_channels // 2` channels.
    stride : int, default 1
        When greater than 1, a max pooling with this kernel size and stride is appended.
    """
    def __init__(self, in_channels, squeeze_channels, expand_channels, stride=1):
        super().__init__()
        self.conv1 = conv1x1_block(
            in_channels=in_channels, out_channels=squeeze_channels, activation="swish")
        self.fire = HybridFireUnit(
            in_channels=squeeze_channels,
            squeeze_channels=squeeze_channels // 4,
            expand_channels=expand_channels // 2)
        self.se = SEBlock(channels=expand_channels, reduction=4, mid_activation="swish")
        self.pool = nn.MaxPool2d(kernel_size=stride, stride=stride) if stride > 1 else None

    def forward(self, x):
        out = self.se(self.fire(self.conv1(x)))
        return out if self.pool is None else self.pool(out)

class HybridNet(nn.Module):
    """
    Classification network built from a strided 3x3 stem, a list of stages made of two
    EfficientFireBlock modules each, global average pooling, dropout, and a linear classifier.

    Parameters:
    ----------
    init_channels : int, default 32
        Number of output channels of the stem block.
    channels : sequence of int, default (64, 128, 256, 512)
        Output width of every stage, before scaling. The sequence is only read, never modified.
    scale_factor : float, default 1.0
        Multiplier applied to every stage width (the product is truncated to int).
    num_classes : int, default 1000
        Number of classification classes.

    Raises
    ------
    ValueError
        If a scaled stage width is odd. Each fire unit emits `2 * (width // 2)` channels while the
        following SE block is built for `width` channels, so an odd width can only fail later with
        a channel mismatch inside `forward`.
    """
    def __init__(self,
                 init_channels=32,
                 channels=(64, 128, 256, 512),
                 scale_factor=1.0,
                 num_classes=1000):
        super(HybridNet, self).__init__()

        channels = [int(c * scale_factor) for c in channels]
        odd_widths = [c for c in channels if c % 2 != 0]
        if odd_widths:
            raise ValueError(
                f"scaled stage widths must be even, got {odd_widths} "
                f"(scale_factor={scale_factor})")

        self.init_block = conv3x3_block(
            in_channels=3,
            out_channels=init_channels,
            stride=2,
            activation="swish")

        self.stages = nn.ModuleList()
        in_channels = init_channels

        for i, out_channels in enumerate(channels):
            stage = nn.Sequential(
                EfficientFireBlock(
                    in_channels=in_channels,
                    squeeze_channels=out_channels // 2,
                    expand_channels=out_channels,
                    # Every stage but the first halves the resolution after its first block.
                    stride=2 if i > 0 else 1),
                EfficientFireBlock(
                    in_channels=out_channels,
                    squeeze_channels=out_channels // 2,
                    expand_channels=out_channels)
            )
            self.stages.append(stage)
            in_channels = out_channels

        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(channels[-1], num_classes)

        self._initialize_weights()

    def _initialize_weights(self):
        """Re-initialize every Conv2d (Kaiming normal), BatchNorm2d (1/0) and Linear (N(0, 0.01)) layer."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Return the classifier output for `x`, run through the stem, all stages and the pooled head."""
        x = self.init_block(x)

        for stage in self.stages:
            x = stage(x)

        x = self.global_pool(x)
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        x = self.fc(x)
        return x

def create_model_1(num_classes=1000, scale=1.0):
    """
    Build the Experiment 1 HybridNet with a 32-channel stem.

    Parameters:
    ----------
    num_classes : int, default 1000
        Number of classification classes.
    scale : float, default 1.0
        Passed to HybridNet as `scale_factor`.
    """
    net_kwargs = {
        "init_channels": 32,
        "scale_factor": scale,
        "num_classes": num_classes,
    }
    return HybridNet(**net_kwargs)

In [24]:
# Experiment 1 driver: build the model, train it with train_model, then score the best
# checkpoint on the test set.

# Initialize model
model = create_model_1(num_classes=2, scale = 1.0)  # Two output classes
# Count of trainable parameters only (requires_grad=True).
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(num_params)

# Transfer to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Setup optimizer and scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
# NOTE(review): this scheduler is not passed to train_model, which builds its own
# ReduceLROnPlateau internally; nothing in this cell ever steps this instance.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.1, patience=3
)

# Calculate class weights (inverse of each class's count in the training frame)
# NOTE(review): assumes 'label_encoded' holds the labels 0 and 1 - confirm against train_new_df.
class_counts = train_new_df['label_encoded'].value_counts()
class_weights = torch.FloatTensor([1/class_counts[0], 1/class_counts[1]]).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Train the model (the returned best validation accuracy is not stored here)
train_model(
    model=model, 
    train_loader=train_loader, 
    val_loader=val_loader, 
    criterion=criterion, 
    optimizer=optimizer, 
    num_epochs=100
)

# Load the best model (train_model saves its best state_dict to 'best_model.pth')
model.load_state_dict(torch.load('best_model.pth', weights_only = True))
model.eval()

# Evaluate on test set
# correct_test starts as an int and becomes a tensor after the first batch is added.
correct_test = 0
total_test = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, preds = torch.max(outputs, 1)
        correct_test += torch.sum(preds == labels)
        total_test += labels.size(0)

# NOTE(review): if test_loader yields no batches, correct_test is still the int 0 and
# `.double()` raises AttributeError.
test_accuracy = correct_test.double() / total_test
print(f"Test Accuracy: {test_accuracy:.4f}")

1186418
Epoch 1/100
Train Loss: 0.6590, Train Acc: 0.5921, Train F1: 0.5924
Val Loss: 0.4703, Val Acc: 0.7677, Val F1: 0.7679
Epoch 2/100
Train Loss: 0.5456, Train Acc: 0.7091, Train F1: 0.7071
Val Loss: 0.4220, Val Acc: 0.7778, Val F1: 0.7768
Epoch 3/100
Train Loss: 0.5165, Train Acc: 0.7325, Train F1: 0.7315
Val Loss: 0.4004, Val Acc: 0.8081, Val F1: 0.8085
Epoch 4/100
Train Loss: 0.5005, Train Acc: 0.7457, Train F1: 0.7451
Val Loss: 0.3786, Val Acc: 0.8182, Val F1: 0.8183
Epoch 5/100
Train Loss: 0.4791, Train Acc: 0.7562, Train F1: 0.7560
Val Loss: 0.3865, Val Acc: 0.8131, Val F1: 0.8135
Epoch 6/100
Train Loss: 0.4666, Train Acc: 0.7635, Train F1: 0.7632
Val Loss: 0.3618, Val Acc: 0.8207, Val F1: 0.8209
Epoch 7/100
Train Loss: 0.4534, Train Acc: 0.7770, Train F1: 0.7770
Val Loss: 0.3427, Val Acc: 0.8460, Val F1: 0.8462
Epoch 8/100
Train Loss: 0.4457, Train Acc: 0.7803, Train F1: 0.7804
Val Loss: 0.3384, Val Acc: 0.8359, Val F1: 0.8357
Epoch 9/100
Train Loss: 0.4326, Train Acc: 0.788

# Experiment 2 [Previous Highest Accuracy Model]

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class ChannelAttention(nn.Module):
    """
    Channel attention gate for Experiment 2: global average- and max-pooled descriptors share one
    bottleneck of two biased `conv1x1` layers, are summed, and squashed with a sigmoid.

    NOTE: this class shares its name with the Experiment 1 ChannelAttention; whichever definition
    is executed last is the one in effect.

    Parameters:
    ----------
    channels : int
        Number of input (and output) channels.
    reduction_ratio : int, default 16
        Divisor used to size the bottleneck (`channels // reduction_ratio` hidden channels).
    """
    def __init__(self, channels, reduction_ratio=16):
        super().__init__()
        reduced = channels // reduction_ratio
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        self.fc = nn.Sequential(
            conv1x1(channels, reduced, bias=True),
            nn.ReLU(inplace=True),
            conv1x1(reduced, channels, bias=True),
        )

    def forward(self, x):
        from_avg = self.fc(self.avg_pool(x))
        from_max = self.fc(self.max_pool(x))
        return torch.sigmoid(from_avg + from_max)

class SpatialAttention(nn.Module):
    """
    Spatial attention gate for Experiment 2: the channel-wise mean and max maps are stacked, passed
    through a `conv3x3_block` (2 -> 1 channels, no activation), and squashed with a sigmoid.

    NOTE: this class shares its name with the Experiment 1 SpatialAttention; whichever definition
    is executed last is the one in effect.
    """
    def __init__(self):
        super().__init__()
        self.conv = conv3x3_block(2, 1, activation=None)

    def forward(self, x):
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True).values
        stats = torch.cat([channel_mean, channel_max], dim=1)
        return torch.sigmoid(self.conv(stats))

class CBAMBlock(nn.Module):
    """
    Attention block that rescales the input by a channel attention gate and then rescales the
    result by a spatial attention gate.

    Parameters:
    ----------
    channels : int
        Number of channels, forwarded to ChannelAttention.
    reduction_ratio : int, default 16
        Reduction ratio, forwarded to ChannelAttention.
    """
    def __init__(self, channels, reduction_ratio=16):
        super().__init__()
        self.channel_attention = ChannelAttention(channels, reduction_ratio)
        self.spatial_attention = SpatialAttention()

    def forward(self, x):
        channel_refined = x * self.channel_attention(x)
        # The spatial gate is computed from the already channel-refined features.
        return channel_refined * self.spatial_attention(channel_refined)

class EnhancedDenseBlock(nn.Module):
    """
    Dense-style layer with two bottlenecked branches (one with a single 3x3 block, one with two).
    The branch outputs are concatenated, refined by CBAM, and appended to the input, so the output
    has `in_channels + 2 * growth_rate` channels.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    growth_rate : int
        Number of channels produced by each branch.
    bottleneck_ratio : int, default 4
        Multiplier of `growth_rate` giving the width of each branch's 1x1 bottleneck.
    """
    def __init__(self, in_channels, growth_rate, bottleneck_ratio=4):
        super().__init__()
        mid_channels = growth_rate * bottleneck_ratio

        self.branch1 = nn.Sequential(
            conv1x1_block(in_channels, mid_channels),
            conv3x3_block(mid_channels, growth_rate),
        )
        self.branch2 = nn.Sequential(
            conv1x1_block(in_channels, mid_channels),
            conv3x3_block(mid_channels, growth_rate),
            conv3x3_block(growth_rate, growth_rate),
        )

        self.cbam = CBAMBlock(growth_rate * 2)

    def forward(self, x):
        new_features = torch.cat([self.branch1(x), self.branch2(x)], dim=1)
        new_features = self.cbam(new_features)
        # Dense connectivity: the input is carried along in front of the new features.
        return torch.cat([x, new_features], dim=1)

class TransitionBlock(nn.Module):
    """
    Transition between dense blocks: a 1x1 convolution block followed by 2x2 average pooling
    with stride 2.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels of the 1x1 block.
    """
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = conv1x1_block(in_channels, out_channels)
        self.pool = nn.AvgPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        return self.pool(self.conv(x))

class InitialBlock(nn.Module):
    """
    Network stem: two stride-2 3x3 convolution blocks (the first producing half of the output
    channels) followed by a CBAM block.

    Parameters:
    ----------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    """
    def __init__(self, in_channels, out_channels):
        super().__init__()
        half_channels = out_channels // 2
        self.conv1 = conv3x3_block(in_channels, half_channels, stride=2)
        self.conv2 = conv3x3_block(half_channels, out_channels, stride=2)
        self.cbam = CBAMBlock(out_channels)

    def forward(self, x):
        features = self.conv2(self.conv1(x))
        return self.cbam(features)

class EnhancedNetwork(nn.Module):
    """DenseNet-style classifier assembled from ``EnhancedDenseBlock`` stages.

    ``features`` holds, in order: the ``InitialBlock`` stem, one dense stage
    per entry of ``block_config`` (each layer adds ``2 * growth_rate``
    channels to the running width), a ``TransitionBlock`` that halves the
    width after every stage except the last, and a final
    BatchNorm / ReLU / global-average-pool. ``output`` is Dropout(0.2) plus a
    linear classifier.

    Args:
        in_channels: channels of the input image.
        init_channels: channels produced by the stem.
        growth_rate: per-branch growth of every dense layer.
        block_config: number of dense layers in each stage.
        num_classes: size of the final linear layer.
    """

    def __init__(self,
                 in_channels=3,
                 init_channels=32,
                 growth_rate=32,
                 block_config=(3, 4, 8, 6),
                 num_classes=1000):
        super().__init__()

        width = init_channels
        last_stage = len(block_config) - 1

        self.features = nn.Sequential()
        self.features.add_module("init_block", InitialBlock(in_channels, width))

        for i, depth in enumerate(block_config):
            stage = nn.Sequential()
            for j in range(depth):
                stage.add_module(
                    f"dense_layer_{j+1}",
                    EnhancedDenseBlock(width, growth_rate),
                )
                # Each dense layer appends both branch outputs to its input.
                width += growth_rate * 2
            self.features.add_module(f"dense_block_{i+1}", stage)

            if i == last_stage:
                continue  # no transition after the final dense stage

            halved = width // 2
            self.features.add_module(
                f"transition_{i+1}", TransitionBlock(width, halved)
            )
            width = halved

        self.features.add_module("final_bn", nn.BatchNorm2d(width))
        self.features.add_module("final_relu", nn.ReLU(inplace=True))
        self.features.add_module("final_pool", nn.AdaptiveAvgPool2d((1, 1)))

        self.output = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(width, num_classes),
        )

        self._initialize_weights()

    def _initialize_weights(self):
        """Re-initialise every Conv2d, BatchNorm2d and Linear sub-module.

        Conv2d: Kaiming-normal (fan_out, relu) weights, zero bias when present.
        BatchNorm2d: weight 1, bias 0. Linear: N(0, 0.01) weights, zero bias.
        """
        for layer in self.modules():
            if isinstance(layer, nn.BatchNorm2d):
                nn.init.constant_(layer.weight, 1)
                nn.init.constant_(layer.bias, 0)
                continue
            if isinstance(layer, nn.Linear):
                nn.init.normal_(layer.weight, 0, 0.01)
                nn.init.constant_(layer.bias, 0)
                continue
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        pooled = self.features(x)
        flat = pooled.view(pooled.size(0), -1)
        return self.output(flat)

def create_model_2(num_classes=2, **kwargs):
    """Build the Experiment-2 ``EnhancedNetwork``.

    Args:
        num_classes: size of the final classification layer.
        **kwargs: optional overrides for the remaining ``EnhancedNetwork``
            constructor arguments (``in_channels``, ``init_channels``,
            ``growth_rate``, ``block_config``).

    Returns:
        A freshly constructed ``EnhancedNetwork``.
    """
    # Defaults first, caller overrides second. Passing the defaults as explicit
    # keywords *and* forwarding **kwargs made every valid override fail with
    # "got multiple values for keyword argument".
    config = {
        "in_channels": 3,
        "init_channels": 32,
        "growth_rate": 32,
        "block_config": (3, 4, 8, 6),
    }
    config.update(kwargs)
    return EnhancedNetwork(num_classes=num_classes, **config)

In [17]:
# Experiment 2 driver: build the model, train it, reload the saved checkpoint
# and report accuracy on the test loader.

# Initialize model
model = create_model_2() 
# Count only parameters that will receive gradients.
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(num_params)

# Transfer to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Setup optimizer and scheduler
# NOTE(review): `scheduler` is neither passed to train_model() below nor
# stepped anywhere in this cell. Unless train_model() reads it as a notebook
# global, the learning rate never changes -- TODO confirm.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.1, patience=3
)

# Calculate class weights
# Inverse-frequency weights: each class is weighted by 1 / (its row count).
# Assumes 'label_encoded' contains exactly the labels 0 and 1 -- TODO confirm.
class_counts = train_new_df['label_encoded'].value_counts()
class_weights = torch.FloatTensor([1/class_counts[0], 1/class_counts[1]]).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Train the model
train_model(
    model=model, 
    train_loader=train_loader, 
    val_loader=val_loader, 
    criterion=criterion, 
    optimizer=optimizer, 
    num_epochs=30
)

# Load the best model
# NOTE(review): presumably train_model() writes 'best_model.pth'; the same
# file name is used by the other experiment cells in this notebook, so verify
# the file on disk belongs to this run.
model.load_state_dict(torch.load('best_model.pth', weights_only = True))
model.eval()

# Evaluate on test set
# `correct_test` starts as an int and becomes a 0-dim tensor after the first
# batch, which is why `.double()` is called on it below.
correct_test = 0
total_test = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        # Predicted class = index of the largest logit.
        _, preds = torch.max(outputs, 1)
        correct_test += torch.sum(preds == labels)
        total_test += labels.size(0)

test_accuracy = correct_test.double() / total_test
print(f"Test Accuracy: {test_accuracy:.4f}")

4016344
Epoch 1/30
Train Loss: 0.5299, Train Acc: 0.7223, Train F1: 0.7222
Val Loss: 0.3749, Val Acc: 0.8030, Val F1: 0.8031
Epoch 2/30
Train Loss: 0.4630, Train Acc: 0.7700, Train F1: 0.7700
Val Loss: 0.3439, Val Acc: 0.8434, Val F1: 0.8436
Epoch 3/30
Train Loss: 0.4331, Train Acc: 0.7854, Train F1: 0.7855
Val Loss: 0.3144, Val Acc: 0.8561, Val F1: 0.8564
Epoch 4/30
Train Loss: 0.4096, Train Acc: 0.7966, Train F1: 0.7967
Val Loss: 0.3081, Val Acc: 0.8384, Val F1: 0.8375
Epoch 5/30
Train Loss: 0.3983, Train Acc: 0.8043, Train F1: 0.8046
Val Loss: 0.3304, Val Acc: 0.8510, Val F1: 0.8513
Epoch 6/30
Train Loss: 0.3864, Train Acc: 0.8150, Train F1: 0.8153
Val Loss: 0.2794, Val Acc: 0.8813, Val F1: 0.8815
Epoch 7/30
Train Loss: 0.3747, Train Acc: 0.8194, Train F1: 0.8197
Val Loss: 0.2753, Val Acc: 0.8737, Val F1: 0.8740
Epoch 8/30
Train Loss: 0.3602, Train Acc: 0.8303, Train F1: 0.8306
Val Loss: 0.2801, Val Acc: 0.8838, Val F1: 0.8839
Epoch 9/30
Train Loss: 0.3554, Train Acc: 0.8306, Train 

In [23]:
import torch
# Ask PyTorch's caching allocator to release cached GPU memory that is not
# currently in use, so the next experiment starts with more free memory.
# NOTE(review): memory of tensors that are still referenced (e.g. the global
# `model` from the previous experiment) is not released by this call.
torch.cuda.empty_cache()

# Experiment 3 [EfficientNet + SqueezeNet + LSTM + CBAM]

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class ImprovedChannelAttention(nn.Module):
    """Channel-attention mask built from global average and max descriptors.

    Both pooled descriptors go through the same bottleneck
    (1x1 conv -> BatchNorm -> ReLU -> 1x1 conv); their sum is squashed with a
    sigmoid. The module returns the mask only, shape ``(B, C, 1, 1)``; the
    caller multiplies it into the feature map.

    Args:
        in_channels: channels of the incoming feature map.
        reduction_ratio: divisor used to size the bottleneck
            (``in_channels // reduction_ratio`` channels).

    NOTE(review): the bottleneck contains BatchNorm2d applied to 1x1 spatial
    maps, so in training mode it needs more than one sample per batch.
    """

    def __init__(self, in_channels, reduction_ratio=16):
        super().__init__()
        hidden = in_channels // reduction_ratio

        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        bottleneck = [
            nn.Conv2d(in_channels, hidden, 1, bias=False),
            nn.BatchNorm2d(hidden),
            nn.ReLU(inplace=True),
            nn.Conv2d(hidden, in_channels, 1, bias=False),
        ]
        self.shared_mlp = nn.Sequential(*bottleneck)

    def forward(self, x):
        """Return the sigmoid channel mask for ``x``."""
        avg_logits = self.shared_mlp(self.avg_pool(x))
        max_logits = self.shared_mlp(self.max_pool(x))
        logits = avg_logits + max_logits
        return torch.sigmoid(logits)

class ImprovedSpatialAttention(nn.Module):
    """Spatial-attention mask computed from per-pixel channel statistics.

    The channel-wise mean and max maps are stacked into a 2-channel tensor,
    passed through conv -> BatchNorm -> ReLU and squashed with a sigmoid. The
    module returns the mask only, shape ``(B, 1, H, W)``.

    Args:
        kernel_size: size of the square convolution kernel; padding is
            ``kernel_size // 2``.

    NOTE(review): the ReLU directly before the sigmoid makes every mask value
    >= 0.5, so this layer can never suppress a location by more than half.
    Confirm this is intended.
    """

    def __init__(self, kernel_size=7):
        super().__init__()
        same_pad = kernel_size // 2
        self.conv = nn.Sequential(
            nn.Conv2d(2, 1, kernel_size, padding=same_pad, bias=False),
            nn.BatchNorm2d(1),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        """Return the sigmoid spatial mask for ``x``."""
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True).values
        stats = torch.cat((channel_mean, channel_max), dim=1)
        return torch.sigmoid(self.conv(stats))

class ImprovedCBAMBlock(nn.Module):
    """Apply channel attention, then spatial attention, to a feature map.

    Each attention module returns a mask; this block multiplies the masks into
    the features. The spatial mask is computed from the channel-refined map.

    Args:
        in_channels: channels of the incoming feature map.
        reduction_ratio: forwarded to ``ImprovedChannelAttention``.
        kernel_size: forwarded to ``ImprovedSpatialAttention``.
    """

    def __init__(self, in_channels, reduction_ratio=16, kernel_size=7):
        super().__init__()
        self.channel_attention = ImprovedChannelAttention(in_channels, reduction_ratio)
        self.spatial_attention = ImprovedSpatialAttention(kernel_size)

    def forward(self, x):
        """Return ``x`` re-weighted by both attention masks."""
        channel_refined = x * self.channel_attention(x)
        return channel_refined * self.spatial_attention(channel_refined)

class SEBlock(nn.Module):
    """Squeeze-and-Excitation gate implemented with linear layers.

    The input is globally average-pooled to ``(B, C)``, passed through
    Linear -> ReLU -> Linear -> Sigmoid, and the resulting per-channel gates
    are multiplied back into the input.

    Args:
        in_channels: channels of the incoming 4-D feature map.
        reduction_ratio: divisor used to size the hidden linear layer.
    """

    def __init__(self, in_channels, reduction_ratio=16):
        super().__init__()
        hidden = in_channels // reduction_ratio
        self.squeeze = nn.AdaptiveAvgPool2d(1)
        self.excitation = nn.Sequential(
            nn.Linear(in_channels, hidden),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, in_channels),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return ``x`` scaled channel-wise by the learned gates."""
        batch, chans, _, _ = x.size()
        pooled = self.squeeze(x).view(batch, chans)
        gates = self.excitation(pooled)
        return x * gates.view(batch, chans, 1, 1)

class ImprovedFireBlock(nn.Module):
    """Fire-style block: 1x1 squeeze, parallel 1x1 / 3x3 expand, SE and CBAM.

    The two expand outputs are concatenated on the channel axis (giving
    ``2 * expand_channels`` channels) before the SE gate and the CBAM block
    are applied, in that order.

    Args:
        in_channels: channels entering the squeeze convolution.
        squeeze_channels: channels after the squeeze convolution.
        expand_channels: channels produced by each expand branch.
    """

    def __init__(self, in_channels, squeeze_channels, expand_channels):
        super().__init__()
        fused_channels = expand_channels * 2

        self.squeeze = self._conv_bn_relu(in_channels, squeeze_channels, 1)
        self.expand_1x1 = self._conv_bn_relu(squeeze_channels, expand_channels, 1)
        self.expand_3x3 = self._conv_bn_relu(
            squeeze_channels, expand_channels, 3, padding=1
        )

        self.se = SEBlock(fused_channels)
        self.cbam = ImprovedCBAMBlock(fused_channels)

    @staticmethod
    def _conv_bn_relu(in_ch, out_ch, kernel, padding=0):
        """Bias-free conv followed by BatchNorm and an in-place ReLU."""
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, kernel, padding=padding, bias=False),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        """Squeeze, expand along both branches, then apply SE and CBAM."""
        squeezed = self.squeeze(x)
        pointwise = self.expand_1x1(squeezed)
        spatial = self.expand_3x3(squeezed)
        expanded = torch.cat([pointwise, spatial], 1)
        return self.cbam(self.se(expanded))

class ImprovedResidualBlock(nn.Module):
    """Two 3x3 conv/BN stages with an SE gate and an identity skip connection.

    The channel count is unchanged, so the input is added to the gated output
    directly; a ReLU is applied after the addition.

    Args:
        in_channels: channels of both the input and the output.
    """

    def __init__(self, in_channels):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
        )
        # Second stage has no activation: ReLU follows the residual add.
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(in_channels),
        )
        self.relu = nn.ReLU(inplace=True)
        self.se = SEBlock(in_channels)

    def forward(self, x):
        """Return ``relu(se(conv2(conv1(x))) + x)``."""
        shortcut = x
        out = self.se(self.conv2(self.conv1(x)))
        out += shortcut
        return self.relu(out)

class EnhancedHybridNet(nn.Module):
    """Conv stem, four fire blocks, two residual blocks, a BiLSTM and an MLP head.

    After the residual blocks the ``(B, C, H, W)`` map is flattened to a
    sequence of ``H * W`` positions with ``C`` features each and run through a
    2-layer bidirectional LSTM. The LSTM emits ``2 * 96 = 192`` features per
    position, which equals the channel count, so the output can be viewed back
    to ``(B, C, H, W)`` before the classifier pools and classifies it.

    Args:
        num_classes: size of the final linear layer.
        input_channels: channels of the input image.
    """

    def __init__(self, num_classes=1000, input_channels=3):
        super().__init__()

        stem_layers = [
            nn.Conv2d(input_channels, 32, 3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        ]
        self.features = nn.Sequential(*stem_layers)

        # Each fire block outputs 2 * expand_channels, which feeds the next one.
        self.fire1 = ImprovedFireBlock(32, 16, 32)
        self.fire2 = ImprovedFireBlock(64, 24, 48)
        self.fire3 = ImprovedFireBlock(96, 32, 64)
        self.fire4 = ImprovedFireBlock(128, 48, 96)

        self.residual1 = ImprovedResidualBlock(192)
        self.residual2 = ImprovedResidualBlock(192)

        self.lstm = nn.LSTM(
            input_size=192,
            hidden_size=96,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=0.1,
        )

        head_layers = [
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.BatchNorm1d(192),
            nn.Dropout(0.5),
            nn.Linear(192, 512),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

        self._initialize_weights()

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        feats = self.features(x)
        for fire in (self.fire1, self.fire2, self.fire3, self.fire4):
            feats = fire(feats)
        feats = self.residual2(self.residual1(feats))

        n, c, h, w = feats.size()
        # (B, C, H, W) -> (B, H*W, C): one LSTM step per spatial position.
        tokens = feats.view(n, c, -1).permute(0, 2, 1)
        encoded, _ = self.lstm(tokens)
        # Back to (B, C, H, W); relies on the LSTM output width equalling C.
        restored = encoded.permute(0, 2, 1).view(n, c, h, w)
        return self.classifier(restored)

    def _initialize_weights(self):
        """Re-initialise Conv2d, BatchNorm2d, Linear and LSTM sub-modules.

        Conv2d: Kaiming-normal (fan_out, relu). BatchNorm2d: weight 1, bias 0.
        Linear: N(0, 0.01) weights, zero bias when present. LSTM: orthogonal
        weights, zero biases. BatchNorm1d layers are not matched by any branch
        and keep their constructor defaults.
        """
        for layer in self.modules():
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(layer, nn.BatchNorm2d):
                nn.init.constant_(layer.weight, 1)
                nn.init.constant_(layer.bias, 0)
            elif isinstance(layer, nn.Linear):
                nn.init.normal_(layer.weight, 0, 0.01)
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0)
            elif isinstance(layer, nn.LSTM):
                for pname, tensor in layer.named_parameters():
                    if 'weight' in pname:
                        nn.init.orthogonal_(tensor)
                        continue
                    if 'bias' in pname:
                        nn.init.constant_(tensor, 0)

def create_model_3(num_classes=1000, pretrained=False):
    """Build an ``EnhancedHybridNet`` for Experiment 3.

    Args:
        num_classes: size of the final classification layer.
        pretrained: must be falsy; no pretrained weights exist for this model.

    Returns:
        A freshly initialised ``EnhancedHybridNet``.

    Raises:
        ValueError: if ``pretrained`` is truthy.
    """
    # Reject the unsupported option before building anything, so no time or
    # memory is spent constructing a network that is immediately discarded.
    if pretrained:
        raise ValueError("Pretrained model is not available for EnhancedHybridNet")
    return EnhancedHybridNet(num_classes=num_classes)

In [17]:
# Experiment 3 driver: build the model, train it, reload the saved checkpoint
# and report accuracy on the test loader.

# Initialize model
model = create_model_3(num_classes=2)  # Set to 2 classes
# Count only parameters that will receive gradients.
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(num_params)

# Transfer to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


# Setup optimizer and scheduler
# NOTE(review): `scheduler` is neither passed to train_model() below nor
# stepped anywhere in this cell. Unless train_model() reads it as a notebook
# global, the learning rate never changes -- TODO confirm.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.1, patience=3
)

# Calculate class weights
# Inverse-frequency weights: each class is weighted by 1 / (its row count).
# Assumes 'label_encoded' contains exactly the labels 0 and 1 -- TODO confirm.
class_counts = train_new_df['label_encoded'].value_counts()
class_weights = torch.FloatTensor([1/class_counts[0], 1/class_counts[1]]).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Train the model
train_model(
    model=model, 
    train_loader=train_loader, 
    val_loader=val_loader, 
    criterion=criterion, 
    optimizer=optimizer, 
    num_epochs=35
)

# Load the best model
# NOTE(review): presumably train_model() writes 'best_model.pth'; the same
# file name is used by the other experiment cells in this notebook, so verify
# the file on disk belongs to this run.
model.load_state_dict(torch.load('best_model.pth', weights_only = True))
model.eval()

# Evaluate on test set
# `correct_test` starts as an int and becomes a 0-dim tensor after the first
# batch, which is why `.double()` is called on it below.
correct_test = 0
total_test = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        # Predicted class = index of the largest logit.
        _, preds = torch.max(outputs, 1)
        correct_test += torch.sum(preds == labels)
        total_test += labels.size(0)

test_accuracy = correct_test.double() / total_test
print(f"Test Accuracy: {test_accuracy:.4f}")

1999156
Epoch 1/35
Train Loss: 0.5679, Train Acc: 0.6963, Train F1: 0.6953
Val Loss: 0.4188, Val Acc: 0.7854, Val F1: 0.7822
Epoch 2/35
Train Loss: 0.5086, Train Acc: 0.7433, Train F1: 0.7427
Val Loss: 0.3707, Val Acc: 0.8081, Val F1: 0.8063
Epoch 3/35
Train Loss: 0.4806, Train Acc: 0.7614, Train F1: 0.7611
Val Loss: 0.3484, Val Acc: 0.8182, Val F1: 0.8165
Epoch 4/35
Train Loss: 0.4735, Train Acc: 0.7700, Train F1: 0.7698
Val Loss: 0.3555, Val Acc: 0.8308, Val F1: 0.8297
Epoch 5/35
Train Loss: 0.4577, Train Acc: 0.7691, Train F1: 0.7688
Val Loss: 0.3403, Val Acc: 0.8207, Val F1: 0.8192
Epoch 6/35
Train Loss: 0.4414, Train Acc: 0.7859, Train F1: 0.7860
Val Loss: 0.3177, Val Acc: 0.8561, Val F1: 0.8559
Epoch 7/35
Train Loss: 0.4239, Train Acc: 0.7898, Train F1: 0.7899
Val Loss: 0.3315, Val Acc: 0.8232, Val F1: 0.8226
Epoch 8/35
Train Loss: 0.4090, Train Acc: 0.8015, Train F1: 0.8015
Val Loss: 0.3564, Val Acc: 0.8333, Val F1: 0.8318
Epoch 9/35
Train Loss: 0.4031, Train Acc: 0.8081, Train 

# Experiment 4 [ShuffleNet + EfficientNet + LSTM + CBAM]

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class CBAM(nn.Module):
    """Chain channel attention and spatial attention over a feature map.

    The ``ChannelAttention`` and ``SpatialAttention`` classes defined in this
    cell return the already re-weighted features (not a mask), so this block
    simply composes them without any extra multiplication.

    Args:
        channels: channels of the incoming feature map.
        reduction_ratio: forwarded to ``ChannelAttention``.
    """

    def __init__(self, channels, reduction_ratio=16):
        super().__init__()
        self.channel_attention = ChannelAttention(channels, reduction_ratio)
        self.spatial_attention = SpatialAttention()

    def forward(self, x):
        """Return ``spatial_attention(channel_attention(x))``."""
        return self.spatial_attention(self.channel_attention(x))

class ChannelAttention(nn.Module):
    """Channel attention that returns the re-weighted features.

    Global average- and max-pooled descriptors go through the same two-layer
    1x1 bottleneck; the summed result is squashed with a sigmoid and
    multiplied into the input.

    Args:
        channels: channels of the incoming feature map.
        reduction_ratio: divisor used to size the bottleneck.

    NOTE(review): this rebinds the notebook-level name ``ChannelAttention``.
    ``CBAMBlock`` (defined in an earlier cell) multiplies its input by this
    module's output, so building a ``CBAMBlock`` after this cell has run would
    apply ``x * (x * gate)``. Re-run the earlier definitions before reusing it.
    """

    def __init__(self, channels, reduction_ratio):
        super().__init__()
        squeezed = channels // reduction_ratio

        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        self.fc = nn.Sequential(
            conv1x1(channels, squeezed, bias=True),
            nn.ReLU(inplace=True),
            conv1x1(squeezed, channels, bias=True),
        )

    def forward(self, x):
        """Return ``x`` scaled by the sigmoid channel gate."""
        avg_logits = self.fc(self.avg_pool(x))
        max_logits = self.fc(self.max_pool(x))
        gate = torch.sigmoid(avg_logits + max_logits)
        return x * gate

class SpatialAttention(nn.Module):
    """Spatial attention that returns the re-weighted features.

    The channel-wise mean and max maps are stacked, passed through
    ``conv3x3_block(2, 1, activation=None)`` and a sigmoid, and the resulting
    single-channel mask is multiplied into the input.
    """

    def __init__(self):
        super().__init__()
        self.conv = conv3x3_block(2, 1, activation=None)

    def forward(self, x):
        """Return ``x`` scaled by the sigmoid spatial mask."""
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True).values
        stacked = torch.cat((channel_mean, channel_max), dim=1)
        mask = torch.sigmoid(self.conv(stacked))
        return x * mask

class CustomSEBlock(nn.Module):
    """Squeeze-and-Excitation gate built from 1x1 convolutions.

    Args:
        channels: channels of the incoming feature map.
        reduction_ratio: divisor used to size the middle 1x1 convolution.
    """

    def __init__(self, channels, reduction_ratio=16):
        super().__init__()
        squeezed = channels // reduction_ratio

        self.pool = nn.AdaptiveAvgPool2d(1)
        self.conv1 = conv1x1(channels, squeezed, bias=True)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv1x1(squeezed, channels, bias=True)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` scaled by per-channel gates in (0, 1)."""
        gate = self.pool(x)
        for stage in (self.conv1, self.relu, self.conv2, self.sigmoid):
            gate = stage(gate)
        return x * gate

class HybridBlock(nn.Module):
    """Split-transform-merge block with an SE path, a channel-shuffle path and CBAM.

    The input is split in half on the channel axis. The first half goes
    through ``effi_path`` (1x1 block, depthwise 3x3, ``CustomSEBlock``), the
    second through ``shuffle_path`` (grouped 1x1 block, ``ChannelShuffle``,
    depthwise 3x3). The outputs are concatenated, refined by ``CBAM`` and added
    to the (optionally projected) input before a final ReLU.

    Args:
        in_channels: channels of the input; must be even.
        out_channels: channels of the output; must be even.
        stride: stride passed to both depthwise convolutions and to the
            identity projection.
        groups: group count for the shuffle path.

    Raises:
        ValueError: if ``in_channels`` or ``out_channels`` is odd. Each path is
            built for exactly ``in_channels // 2`` inputs and
            ``out_channels // 2`` outputs, so odd counts cannot line up with
            the split and the concatenation in ``forward``.
    """

    def __init__(self, in_channels, out_channels, stride=1, groups=4):
        super().__init__()

        # Fail here with a clear message instead of deep inside forward().
        if in_channels % 2 != 0 or out_channels % 2 != 0:
            raise ValueError(
                "HybridBlock requires even in_channels and out_channels, "
                f"got in_channels={in_channels}, out_channels={out_channels}"
            )

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride = stride

        split_channels = in_channels // 2
        half_out = out_channels // 2

        self.effi_path = nn.Sequential(
            conv1x1_block(split_channels, half_out),
            depthwise_conv3x3(half_out, stride),
            CustomSEBlock(half_out, reduction_ratio=4)
        )

        self.shuffle_path = nn.Sequential(
            conv1x1_block(split_channels, half_out, groups=groups),
            ChannelShuffle(half_out, groups),
            depthwise_conv3x3(half_out, stride)
        )

        self.cbam = CBAM(out_channels)

        # Project the skip connection whenever its shape would not match.
        self.adjust_identity = None
        if stride != 1 or in_channels != out_channels:
            self.adjust_identity = conv1x1_block(
                in_channels, out_channels, stride=stride)

    def forward(self, x):
        """Return ``relu(cbam(cat(effi(x1), shuffle(x2))) + identity)``."""
        # One split for every stride: the former stride==1 / stride!=1 branches
        # performed the same half/half split and were otherwise identical.
        half = self.in_channels // 2
        first_half, second_half = x[:, :half], x[:, half:]

        merged = torch.cat(
            [self.effi_path(first_half), self.shuffle_path(second_half)], dim=1
        )
        out = self.cbam(merged)

        if self.adjust_identity is not None:
            identity = self.adjust_identity(x)
        else:
            identity = x

        return F.relu(out + identity)

class SpatialLSTM(nn.Module):
    """Bidirectional LSTM run independently along every row of a feature map.

    Each image row becomes one sequence of ``W`` steps with ``C`` features, so
    the LSTM sees a batch of ``B * H`` sequences. The output has
    ``2 * hidden_size`` channels and the same spatial size as the input.

    Args:
        input_size: number of input channels ``C``.
        hidden_size: hidden units per LSTM direction.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True)

    def forward(self, x):
        """Map ``(B, C, H, W)`` to ``(B, 2 * hidden_size, H, W)``."""
        n, c, h, w = x.size()

        # Channels-last, then fold the rows into the batch axis: (B*H, W, C).
        rows = x.permute(0, 2, 3, 1).contiguous().view(n * h, w, c)
        encoded, _ = self.lstm(rows)

        # Unfold the rows and return to channels-first layout.
        grid = encoded.view(n, h, w, -1)
        return grid.permute(0, 3, 1, 2).contiguous()

class HybridNet(nn.Module):
    """Experiment-4 classifier: conv stem, four ``HybridBlock`` stages, a
    row-wise BiLSTM, global average pooling, dropout and a linear head.

    Args:
        num_classes: size of the final linear layer.
        input_channels: channels of the input image.
    """

    def __init__(self, num_classes=1000, input_channels=3):
        super().__init__()
        self.init_conv = conv3x3_block(
            in_channels=input_channels,
            out_channels=64,
            stride=2)

        # Every stage doubles the channel count and starts with a stride-2 block.
        self.stage1 = self._make_stage(64, 128, blocks=3)
        self.stage2 = self._make_stage(128, 256, blocks=4)
        self.stage3 = self._make_stage(256, 512, blocks=6)
        self.stage4 = self._make_stage(512, 1024, blocks=3)

        # 2 * 512 hidden units keep the channel count at 1024 for the head.
        self.spatial_lstm = SpatialLSTM(1024, 512)
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(1024, num_classes)

        self._initialize_weights()

    def _make_stage(self, in_channels, out_channels, blocks):
        """Return one stride-2 ``HybridBlock`` followed by ``blocks - 1`` stride-1 blocks."""
        stage = [HybridBlock(in_channels, out_channels, stride=2)]
        stage.extend(
            HybridBlock(out_channels, out_channels, stride=1)
            for _ in range(1, blocks)
        )
        return nn.Sequential(*stage)

    def _initialize_weights(self):
        """Re-initialise every Conv2d, BatchNorm2d and Linear sub-module.

        Conv2d: Kaiming-normal (fan_out, relu) weights, zero bias when present.
        BatchNorm2d: weight 1, bias 0. Linear: N(0, 0.01) weights, zero bias.
        The LSTM is not matched and keeps its constructor defaults.
        """
        for layer in self.modules():
            if isinstance(layer, nn.BatchNorm2d):
                nn.init.constant_(layer.weight, 1)
                nn.init.constant_(layer.bias, 0)
                continue
            if isinstance(layer, nn.Linear):
                nn.init.normal_(layer.weight, 0, 0.01)
                nn.init.constant_(layer.bias, 0)
                continue
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        feats = self.init_conv(x)
        for stage in (self.stage1, self.stage2, self.stage3, self.stage4):
            feats = stage(feats)

        feats = self.spatial_lstm(feats)

        pooled = self.global_pool(feats)
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(self.dropout(flat))

def create_model_4(num_classes=1000, input_channels=3):
    """Build the Experiment-4 ``HybridNet``.

    Args:
        num_classes: size of the final classification layer.
        input_channels: channels of the input image.

    Returns:
        A freshly constructed ``HybridNet``.
    """
    return HybridNet(num_classes=num_classes, input_channels=input_channels)

In [21]:
# Experiment 4 driver: build the model, train it, reload the saved checkpoint
# and report accuracy on the test loader.

# Initialize model
model = create_model_4(num_classes=2)  # Set to 2 classes
# Count only parameters that will receive gradients.
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(num_params)

# Transfer to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Setup optimizer and scheduler
# NOTE(review): `scheduler` is neither passed to train_model() below nor
# stepped anywhere in this cell. Unless train_model() reads it as a notebook
# global, the learning rate never changes -- TODO confirm.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.1, patience=3
)

# Calculate class weights
# Inverse-frequency weights: each class is weighted by 1 / (its row count).
# Assumes 'label_encoded' contains exactly the labels 0 and 1 -- TODO confirm.
class_counts = train_new_df['label_encoded'].value_counts()
class_weights = torch.FloatTensor([1/class_counts[0], 1/class_counts[1]]).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Train the model
train_model(
    model=model, 
    train_loader=train_loader, 
    val_loader=val_loader, 
    criterion=criterion, 
    optimizer=optimizer, 
    num_epochs=35
)

# Load the best model
# NOTE(review): presumably train_model() writes 'best_model.pth'; the same
# file name is used by the other experiment cells in this notebook, so verify
# the file on disk belongs to this run.
model.load_state_dict(torch.load('best_model.pth', weights_only = True))
model.eval()

# Evaluate on test set
# `correct_test` starts as an int and becomes a 0-dim tensor after the first
# batch, which is why `.double()` is called on it below.
correct_test = 0
total_test = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        # Predicted class = index of the largest logit.
        _, preds = torch.max(outputs, 1)
        correct_test += torch.sum(preds == labels)
        total_test += labels.size(0)

test_accuracy = correct_test.double() / total_test
print(f"Test Accuracy: {test_accuracy:.4f}")

9711562
Epoch 1/35
Train Loss: 0.5497, Train Acc: 0.7115, Train F1: 0.7118
Val Loss: 0.4047, Val Acc: 0.7929, Val F1: 0.7914
Epoch 2/35
Train Loss: 0.4981, Train Acc: 0.7453, Train F1: 0.7452
Val Loss: 0.3784, Val Acc: 0.8207, Val F1: 0.8207
Epoch 3/35
Train Loss: 0.4719, Train Acc: 0.7645, Train F1: 0.7645
Val Loss: 0.3715, Val Acc: 0.8258, Val F1: 0.8261
Epoch 4/35
Train Loss: 0.4587, Train Acc: 0.7736, Train F1: 0.7736
Val Loss: 0.3683, Val Acc: 0.8308, Val F1: 0.8310
Epoch 5/35
Train Loss: 0.4501, Train Acc: 0.7757, Train F1: 0.7759
Val Loss: 0.3575, Val Acc: 0.8283, Val F1: 0.8275
Epoch 6/35
Train Loss: 0.4364, Train Acc: 0.7804, Train F1: 0.7805
Val Loss: 0.3614, Val Acc: 0.8258, Val F1: 0.8255
Epoch 7/35
Train Loss: 0.4340, Train Acc: 0.7871, Train F1: 0.7873
Val Loss: 0.3412, Val Acc: 0.8409, Val F1: 0.8412
Epoch 8/35
Train Loss: 0.4276, Train Acc: 0.7912, Train F1: 0.7913
Val Loss: 0.3316, Val Acc: 0.8510, Val F1: 0.8513
Epoch 9/35
Train Loss: 0.4233, Train Acc: 0.7935, Train 

# Experiment 4 [ShuffleNet + EfficientNet + LSTM + CBAM] (Reduced-Parameter Version)

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class ChannelAttention(nn.Module):
    """Channel-attention mask from global average and max descriptors.

    Both pooled descriptors pass through the same bias-free 1x1 bottleneck;
    their sum is squashed with a sigmoid. Unlike the class of the same name
    in the previous experiment cell, this one returns the mask only
    (shape ``(B, C, 1, 1)``) and leaves the multiplication to the caller.

    Args:
        channels: channels of the incoming feature map.
        reduction: divisor used to size the bottleneck.
    """

    def __init__(self, channels, reduction=16):
        super().__init__()
        squeezed = channels // reduction

        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        bottleneck = [
            nn.Conv2d(channels, squeezed, 1, bias=False),
            nn.ReLU(inplace=True),
            nn.Conv2d(squeezed, channels, 1, bias=False),
        ]
        self.fc = nn.Sequential(*bottleneck)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid channel mask for ``x``."""
        avg_logits = self.fc(self.avg_pool(x))
        max_logits = self.fc(self.max_pool(x))
        return self.sigmoid(avg_logits + max_logits)

class SpatialAttention(nn.Module):
    """Spatial-attention mask from per-pixel channel mean and max.

    Returns the single-channel sigmoid mask only; the caller multiplies it
    into the features.

    Args:
        kernel_size: accepted for signature compatibility with ``CBAM``.
            NOTE(review): the value is currently ignored; the convolution is
            always built with ``conv3x3_block``. Confirm whether a
            ``kernel_size``-sized convolution was intended.
    """

    def __init__(self, kernel_size=7):
        super().__init__()
        self.conv = conv3x3_block(2, 1, activation=None)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid spatial mask for ``x``."""
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True).values
        stacked = torch.cat((channel_mean, channel_max), dim=1)
        return self.sigmoid(self.conv(stacked))

class CBAM(nn.Module):
    """Multiply a channel mask, then a spatial mask, into a feature map.

    The attention sub-modules of this cell return masks, so the
    multiplication happens here. The spatial mask is computed from the
    channel-refined features.

    Args:
        channels: channels of the incoming feature map.
        reduction: forwarded to ``ChannelAttention``.
        kernel_size: forwarded to ``SpatialAttention``.
    """

    def __init__(self, channels, reduction=16, kernel_size=7):
        super().__init__()
        self.channel_att = ChannelAttention(channels, reduction)
        self.spatial_att = SpatialAttention(kernel_size)

    def forward(self, x):
        """Return ``x`` re-weighted by both attention masks."""
        channel_refined = x * self.channel_att(x)
        return channel_refined * self.spatial_att(channel_refined)

class LightweightShuffle(nn.Module):
    """Bottleneck residual unit: 1x1 block, channel shuffle, depthwise 3x3, 1x1 block.

    The main path runs at ``out_channels // 4`` width. The skip connection is
    the identity when the input and output shapes match, otherwise a 1x1
    projection with the same stride.

    Args:
        in_channels: channels of the input.
        out_channels: channels of the output.
        stride: stride of the depthwise convolution (and of the projection).
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.stride = stride
        mid_channels = out_channels // 4

        self.conv1 = conv1x1_block(in_channels, mid_channels)
        self.shuffle = ChannelShuffle(mid_channels, 2)
        self.dwconv = dwconv3x3_block(mid_channels, mid_channels, stride=stride)
        self.conv2 = conv1x1_block(mid_channels, out_channels)

        # Any non-unit stride changes the spatial size, not only stride == 2,
        # so the skip path needs a projection in all of those cases.
        if stride != 1 or in_channels != out_channels:
            self.shortcut = conv1x1_block(in_channels, out_channels, stride=stride)
        else:
            self.shortcut = None

    def forward(self, x):
        """Return ``main_path(x) + skip(x)``."""
        out = self.conv1(x)
        out = self.shuffle(out)
        out = self.dwconv(out)
        out = self.conv2(out)

        # Compare against None explicitly: truthiness of an nn.Module falls
        # back to __len__ for container modules, which can be 0.
        if self.shortcut is not None:
            residual = self.shortcut(x)
        else:
            residual = x

        return out + residual

class TemporalBlock(nn.Module):
    """Bidirectional LSTM over the flattened spatial positions of a feature map.

    Despite the name, the "time" axis is the row-major sequence of ``H * W``
    pixel positions. Each direction has ``channels // 2`` hidden units, so for
    an even ``channels`` the output can be viewed back to the input shape.

    Args:
        channels: channels of the input feature map.
    """

    def __init__(self, channels):
        super().__init__()
        self.lstm = nn.LSTM(channels, channels // 2, bidirectional=True, batch_first=True)

    def forward(self, x):
        """Map ``(B, C, H, W)`` to a tensor of the same shape."""
        n, c, h, w = x.size()
        # (B, C, H, W) -> (B, H*W, C): one LSTM step per spatial position.
        tokens = x.view(n, c, -1).permute(0, 2, 1)
        encoded, _ = self.lstm(tokens)
        return encoded.permute(0, 2, 1).view(n, c, h, w)

class HybridBlock(nn.Module):
    """``LightweightShuffle`` -> ``CBAM`` -> ``TemporalBlock`` applied in sequence.

    Args:
        in_channels: channels of the input.
        out_channels: channels of the output.
        stride: forwarded to ``LightweightShuffle``.
        reduction: forwarded to ``CBAM``.
    """

    def __init__(self, in_channels, out_channels, stride=1, reduction=16):
        super().__init__()
        self.shuffle = LightweightShuffle(in_channels, out_channels, stride)
        self.cbam = CBAM(out_channels, reduction)
        self.temporal = TemporalBlock(out_channels)

    def forward(self, x):
        """Run the three sub-modules one after another."""
        for stage in (self.shuffle, self.cbam, self.temporal):
            x = stage(x)
        return x

class HybridNet(nn.Module):
    """Reduced-parameter Experiment-4 classifier.

    A 3x3 stem is followed by four stages of ``HybridBlock`` (each stage starts
    with a stride-2 block and doubles the channel count), global average
    pooling, dropout and a linear head.

    Args:
        num_classes: size of the final linear layer.
        input_size: accepted but not used anywhere in this class.
    """

    def __init__(self, num_classes=1000, input_size=224):
        super().__init__()

        # Stem
        self.init_block = conv3x3_block(3, 32, stride=2)

        # Backbone stages
        self.stage1 = self._make_stage(32, 64, blocks=2)
        self.stage2 = self._make_stage(64, 128, blocks=3)
        self.stage3 = self._make_stage(128, 256, blocks=4)
        self.stage4 = self._make_stage(256, 512, blocks=3)

        # Classification head
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(512, num_classes)

        self._initialize_weights()

    def _make_stage(self, in_channels, out_channels, blocks):
        """Return one stride-2 ``HybridBlock`` followed by ``blocks - 1`` stride-1 blocks."""
        stage = [HybridBlock(in_channels, out_channels, stride=2)]
        stage.extend(
            HybridBlock(out_channels, out_channels, stride=1)
            for _ in range(1, blocks)
        )
        return nn.Sequential(*stage)

    def _initialize_weights(self):
        """Re-initialise every Conv2d, BatchNorm2d and Linear sub-module.

        Conv2d: Kaiming-normal (fan_out, relu) weights, zero bias when present.
        BatchNorm2d: weight 1, bias 0. Linear: N(0, 0.01) weights, zero bias.
        LSTM layers are not matched and keep their constructor defaults.
        """
        for layer in self.modules():
            if isinstance(layer, nn.BatchNorm2d):
                nn.init.constant_(layer.weight, 1)
                nn.init.constant_(layer.bias, 0)
                continue
            if isinstance(layer, nn.Linear):
                nn.init.normal_(layer.weight, 0, 0.01)
                nn.init.constant_(layer.bias, 0)
                continue
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        feats = self.init_block(x)
        for stage in (self.stage1, self.stage2, self.stage3, self.stage4):
            feats = stage(feats)

        pooled = self.global_pool(feats)
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(self.dropout(flat))

def create_model_4_0(num_classes=1000, input_size=224):
    """Build the reduced-parameter ``HybridNet`` used by Experiment 4.

    Args:
        num_classes: size of the final classification layer.
        input_size: forwarded unchanged to ``HybridNet``.

    Returns:
        A freshly constructed ``HybridNet``.
    """
    model = HybridNet(num_classes=num_classes, input_size=input_size)
    return model

In [24]:
# Experiment 4 (reduced-parameter) driver: build the model, train it, reload
# the saved checkpoint and report accuracy on the test loader.

# Initialize model
model = create_model_4_0(num_classes=2)  
# Count only parameters that will receive gradients.
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(num_params)

# Transfer to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Setup optimizer and scheduler
# NOTE(review): `scheduler` is neither passed to train_model() below nor
# stepped anywhere in this cell. Unless train_model() reads it as a notebook
# global, the learning rate never changes -- TODO confirm.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.1, patience=3
)

# Calculate class weights
# Inverse-frequency weights: each class is weighted by 1 / (its row count).
# Assumes 'label_encoded' contains exactly the labels 0 and 1 -- TODO confirm.
class_counts = train_new_df['label_encoded'].value_counts()
class_weights = torch.FloatTensor([1/class_counts[0], 1/class_counts[1]]).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Train the model
train_model(
    model=model, 
    train_loader=train_loader, 
    val_loader=val_loader, 
    criterion=criterion, 
    optimizer=optimizer, 
    num_epochs=35
)

# Load the best model
# NOTE(review): presumably train_model() writes 'best_model.pth'; the same
# file name is used by the other experiment cells in this notebook, so verify
# the file on disk belongs to this run.
model.load_state_dict(torch.load('best_model.pth', weights_only = True))
model.eval()

# Evaluate on test set
# `correct_test` starts as an int and becomes a 0-dim tensor after the first
# batch, which is why `.double()` is called on it below.
correct_test = 0
total_test = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        # Predicted class = index of the largest logit.
        _, preds = torch.max(outputs, 1)
        correct_test += torch.sum(preds == labels)
        total_test += labels.size(0)

test_accuracy = correct_test.double() / total_test
print(f"Test Accuracy: {test_accuracy:.4f}")

7502098
Epoch 1/35
Train Loss: 0.6379, Train Acc: 0.6301, Train F1: 0.6309
Val Loss: 0.4980, Val Acc: 0.7652, Val F1: 0.7654
Epoch 2/35
Train Loss: 0.5714, Train Acc: 0.7016, Train F1: 0.7011
Val Loss: 0.4834, Val Acc: 0.7677, Val F1: 0.7677
Epoch 3/35
Train Loss: 0.5432, Train Acc: 0.7158, Train F1: 0.7156
Val Loss: 0.4513, Val Acc: 0.8030, Val F1: 0.8030
Epoch 4/35
Train Loss: 0.5369, Train Acc: 0.7235, Train F1: 0.7229
Val Loss: 0.4425, Val Acc: 0.7980, Val F1: 0.7983
Epoch 5/35
Train Loss: 0.5241, Train Acc: 0.7338, Train F1: 0.7336
Val Loss: 0.4268, Val Acc: 0.8106, Val F1: 0.8109
Epoch 6/35
Train Loss: 0.5203, Train Acc: 0.7356, Train F1: 0.7354
Val Loss: 0.4188, Val Acc: 0.8232, Val F1: 0.8233
Epoch 7/35
Train Loss: 0.5089, Train Acc: 0.7432, Train F1: 0.7430
Val Loss: 0.3977, Val Acc: 0.8207, Val F1: 0.8211
Epoch 8/35
Train Loss: 0.4973, Train Acc: 0.7527, Train F1: 0.7526
Val Loss: 0.3907, Val Acc: 0.8182, Val F1: 0.8174
Epoch 9/35
Train Loss: 0.4890, Train Acc: 0.7557, Train 

# Experiment 5 [SeResNet + OctResNet + CBAM + SE Block]

In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class EfficientAttention(nn.Module):
    """Channel gate built from global average pooling and a 1x1-conv bottleneck.

    The input is pooled to one value per channel, squeezed to
    ``max(channels // 16, 4)`` channels, expanded back to ``channels`` and
    passed through a sigmoid; the input is then multiplied by that gate.

    Args:
        channels: number of channels of the tensors passed to ``forward``.
    """

    def __init__(self, channels):
        super().__init__()
        squeeze_width = max(channels // 16, 4)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        gate_layers = [
            nn.Conv2d(channels, squeeze_width, kernel_size=1, bias=False),
            nn.ReLU(inplace=True),
            nn.Conv2d(squeeze_width, channels, kernel_size=1, bias=False),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*gate_layers)

    def forward(self, x):
        """Return ``x`` rescaled channel-wise by its gate (same shape as ``x``)."""
        pooled = self.avg_pool(x)
        gate = self.fc(pooled)
        return x * gate

class DepthwiseSeparableConv(nn.Module):
    """3x3 depthwise convolution followed by a 1x1 pointwise convolution.

    Each convolution is bias-free and followed by batch norm and ReLU.

    Args:
        in_channels: channels of the input (also the depthwise group count).
        out_channels: channels produced by the pointwise convolution.
        stride: stride of the depthwise convolution.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        # One 3x3 filter per input channel (groups == in_channels).
        depthwise = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=in_channels,
            bias=False,
        )
        depthwise_norm = nn.BatchNorm2d(in_channels)
        # 1x1 convolution that mixes channels and sets the output width.
        pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        pointwise_norm = nn.BatchNorm2d(out_channels)
        self.conv = nn.Sequential(
            depthwise,
            depthwise_norm,
            nn.ReLU(inplace=True),
            pointwise,
            pointwise_norm,
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        """Apply the depthwise/pointwise stack to ``x``."""
        result = self.conv(x)
        return result

class UltraLightBlock(nn.Module):
    """Residual block made of two depthwise-separable convolutions.

    The first convolution maps to ``out_channels // 2`` channels (and applies
    ``stride``), the second maps to ``out_channels``. The input is added back,
    through a 1x1 conv + batch-norm projection when the stride or channel count
    changes. Blocks with ``out_channels >= 64`` finish with an
    ``EfficientAttention`` gate applied after the addition.

    Args:
        in_channels: channels of the input tensor.
        out_channels: channels of the output tensor.
        stride: stride applied by the first convolution and the projection.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        bottleneck_width = out_channels // 2

        self.conv1 = DepthwiseSeparableConv(in_channels, bottleneck_width, stride)
        self.conv2 = DepthwiseSeparableConv(bottleneck_width, out_channels)

        if out_channels >= 64:
            self.attention = EfficientAttention(out_channels)
        else:
            self.attention = None

        # A projection is only needed when the identity path would not match
        # the main path in resolution or width.
        shape_changes = stride != 1 or in_channels != out_channels
        if shape_changes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = None

    def forward(self, x):
        """Return the (optionally gated) sum of the main path and the skip path."""
        main = self.conv2(self.conv1(x))
        skip = x if self.shortcut is None else self.shortcut(x)
        fused = main + skip
        if self.attention is None:
            return fused
        return self.attention(fused)

class UltraLightHybridNet(nn.Module):
    """Small classifier: strided conv stem, four ``UltraLightBlock`` stages, MLP head.

    Stage widths are 32, 64, 128 and 256 with 1, 2, 2 and 1 blocks; the last
    three stages downsample by 2. Features are globally average-pooled and
    classified by ``Linear(256, 128) -> ReLU -> Dropout(0.2) -> Linear``.

    Args:
        num_classes: size of the output logits vector.
        input_channels: channels of the input images.
    """

    def __init__(self, num_classes=1000, input_channels=3):
        super().__init__()

        # Stem: 3x3 conv with stride 2, batch norm, ReLU.
        self.conv1 = nn.Sequential(
            nn.Conv2d(input_channels, 16, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
        )

        # (in_channels, out_channels, blocks, stride) for layer1 .. layer4.
        stage_specs = (
            (16, 32, 1, 1),
            (32, 64, 2, 2),
            (64, 128, 2, 2),
            (128, 256, 1, 2),
        )
        for index, (width_in, width_out, depth, stride) in enumerate(stage_specs, start=1):
            stage = self._make_layer(width_in, width_out, blocks=depth, stride=stride)
            setattr(self, f"layer{index}", stage)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        head_layers = [
            nn.Linear(256, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes),
        ]
        self.fc = nn.Sequential(*head_layers)

        self._initialize_weights()

    def _make_layer(self, in_channels, out_channels, blocks, stride):
        """Build one stage: a first block that may change shape, then same-shape blocks."""
        first = UltraLightBlock(in_channels, out_channels, stride=stride)
        rest = [UltraLightBlock(out_channels, out_channels) for _ in range(1, blocks)]
        return nn.Sequential(first, *rest)

    def _initialize_weights(self):
        """Kaiming-normal init for conv weights; batch norm set to weight 1, bias 0.

        Linear layers are not touched here and keep their construction-time values.
        """
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                continue
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return class logits for a batch of images."""
        out = self.conv1(x)
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)
        pooled = torch.flatten(self.avgpool(out), 1)
        return self.fc(pooled)

def create_model_5(num_classes=1000, **kwargs):
    """Build the Experiment 5 network.

    Args:
        num_classes: number of output logits.
        **kwargs: forwarded unchanged to ``UltraLightHybridNet``.

    Returns:
        A newly constructed ``UltraLightHybridNet``.
    """
    return UltraLightHybridNet(num_classes=num_classes, **kwargs)

In [28]:
# Build the Experiment 5 network with a two-class head and print its trainable parameter count
model = create_model_5(num_classes=2)
num_params = sum(param.numel() for param in model.parameters() if param.requires_grad)
print(num_params)

# Use CUDA when it is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Adam optimizer and a plateau scheduler (mode='max': tracks a metric that should increase)
# NOTE(review): `scheduler` is not passed to train_model below — confirm train_model uses it.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.1, patience=3
)

# Loss weights: reciprocal of each label's training count, ordered [label 0, label 1]
class_counts = train_new_df['label_encoded'].value_counts()
class_weights = torch.FloatTensor([1 / class_counts[label] for label in (0, 1)]).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Run training (train_model is defined elsewhere in the notebook)
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=35,
)

# Restore the checkpoint and switch to inference mode
model.load_state_dict(torch.load('best_model.pth', weights_only=True))
model.eval()

# Count correct argmax predictions over the test set
correct_test = 0
total_test = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, preds = torch.max(outputs, dim=1)
        correct_test = correct_test + (preds == labels).sum()
        total_test = total_test + labels.shape[0]

test_accuracy = correct_test.double() / total_test
print(f"Test Accuracy: {test_accuracy:.4f}")

187634
Epoch 1/35
Train Loss: 0.5527, Train Acc: 0.7146, Train F1: 0.7150
Val Loss: 0.3938, Val Acc: 0.8157, Val F1: 0.8160
Epoch 2/35
Train Loss: 0.4824, Train Acc: 0.7551, Train F1: 0.7547
Val Loss: 0.3615, Val Acc: 0.8131, Val F1: 0.8124
Epoch 3/35
Train Loss: 0.4651, Train Acc: 0.7616, Train F1: 0.7613
Val Loss: 0.3587, Val Acc: 0.8182, Val F1: 0.8184
Epoch 4/35
Train Loss: 0.4571, Train Acc: 0.7697, Train F1: 0.7695
Val Loss: 0.3616, Val Acc: 0.8434, Val F1: 0.8434
Epoch 5/35
Train Loss: 0.4489, Train Acc: 0.7773, Train F1: 0.7772
Val Loss: 0.3449, Val Acc: 0.8384, Val F1: 0.8387
Epoch 6/35
Train Loss: 0.4371, Train Acc: 0.7813, Train F1: 0.7814
Val Loss: 0.3508, Val Acc: 0.8308, Val F1: 0.8308
Epoch 7/35
Train Loss: 0.4352, Train Acc: 0.7847, Train F1: 0.7849
Val Loss: 0.3167, Val Acc: 0.8535, Val F1: 0.8537
Epoch 8/35
Train Loss: 0.4274, Train Acc: 0.7870, Train F1: 0.7870
Val Loss: 0.3103, Val Acc: 0.8712, Val F1: 0.8714
Epoch 9/35
Train Loss: 0.4189, Train Acc: 0.7930, Train F

# Experiment 6 [ SeResNet + PeleeNet + CBAM]

In [15]:
import torch
import torch.nn as nn
import math
from torch.nn import init

class ChannelAttention(nn.Module):
    """Channel attention gate used by ``LightCBAM``.

    Average- and max-pooled channel descriptors go through one shared
    two-layer 1x1-conv MLP (hidden width ``max(8, channels // reduction_ratio)``);
    the two results are summed and squashed with a sigmoid.

    ``forward`` returns the gate only; the caller multiplies it into the
    feature map. ``conv1x1`` is a helper defined outside this cell.

    Args:
        channels: channels of the input feature map.
        reduction_ratio: divisor used to derive the hidden width.
    """

    def __init__(self, channels, reduction_ratio=16):
        super().__init__()
        hidden_width = max(8, channels // reduction_ratio)

        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)
        self.shared_mlp = nn.Sequential(
            conv1x1(channels, hidden_width, bias=True),
            nn.ReLU(inplace=True),
            conv1x1(hidden_width, channels, bias=True),
        )

    def forward(self, x):
        """Return the sigmoid channel gate computed from ``x``."""
        from_avg = self.shared_mlp(self.avg_pool(x))
        from_max = self.shared_mlp(self.max_pool(x))
        return torch.sigmoid(from_avg + from_max)

class SpatialAttention(nn.Module):
    """Spatial attention gate used by ``LightCBAM``.

    The channel-wise mean and max maps are stacked into a 2-channel tensor,
    reduced to one channel by ``conv3x3_block`` (a helper defined outside this
    cell, called with ``activation=None``) and squashed with a sigmoid.

    ``forward`` returns the gate only; the caller multiplies it into the
    feature map.
    """

    def __init__(self):
        super().__init__()
        self.conv = conv3x3_block(2, 1, activation=None)

    def forward(self, x):
        """Return the single-channel sigmoid gate computed from ``x``."""
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True)[0]
        stacked = torch.cat([channel_mean, channel_max], dim=1)
        return torch.sigmoid(self.conv(stacked))

class LightCBAM(nn.Module):
    """Channel gate followed by spatial gate, each multiplied into the features.

    Relies on ``ChannelAttention`` and ``SpatialAttention`` returning gates
    (not already-gated tensors).

    Args:
        channels: channels of the input feature map.
    """

    def __init__(self, channels):
        super().__init__()
        self.channel_att = ChannelAttention(channels)
        self.spatial_att = SpatialAttention()

    def forward(self, x):
        """Return ``x`` after channel gating and then spatial gating."""
        channel_refined = x * self.channel_att(x)
        return channel_refined * self.spatial_att(channel_refined)

class LightResBlock(nn.Module):
    """Residual block: two 3x3 conv blocks, ``LightCBAM``, skip add, ReLU.

    The skip path is the identity unless the stride or channel count changes,
    in which case a ``conv1x1_block`` projection without activation is used.
    ``conv3x3_block`` / ``conv1x1_block`` are helpers defined outside this cell.

    Args:
        in_channels: channels of the input tensor.
        out_channels: channels of the output tensor.
        stride: stride of the first conv block and of the projection.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        self.conv1 = conv3x3_block(
            in_channels=in_channels,
            out_channels=out_channels,
            stride=stride,
        )
        # No activation here: ReLU is applied after the residual addition.
        self.conv2 = conv3x3_block(
            in_channels=out_channels,
            out_channels=out_channels,
            activation=None,
        )
        self.attention = LightCBAM(out_channels)

        needs_projection = stride != 1 or in_channels != out_channels
        self.shortcut = (
            conv1x1_block(
                in_channels=in_channels,
                out_channels=out_channels,
                stride=stride,
                activation=None,
            )
            if needs_projection
            else None
        )

        self.activ = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``ReLU(attention(conv2(conv1(x))) + skip(x))``."""
        if self.shortcut is None:
            skip = x
        else:
            skip = self.shortcut(x)

        out = self.attention(self.conv2(self.conv1(x)))
        return self.activ(out + skip)

class DenseFeatureBlock(nn.Module):
    """Two-branch dense block that appends ``2 * growth_rate`` channels to its input.

    Both branches start with a 1x1 bottleneck of width
    ``growth_rate * bottleneck_ratio``; branch 1 follows with one 3x3 conv
    block, branch 2 with two. The input and both branch outputs are
    concatenated along the channel axis.

    Args:
        in_channels: channels of the input tensor.
        growth_rate: channels produced by each branch.
        bottleneck_ratio: multiplier giving the bottleneck width.
    """

    def __init__(self, in_channels, growth_rate, bottleneck_ratio=2):
        super().__init__()
        squeeze_width = growth_rate * bottleneck_ratio

        # Short branch: one 3x3 conv block after the bottleneck.
        self.branch1 = nn.Sequential(
            conv1x1_block(in_channels, squeeze_width),
            conv3x3_block(squeeze_width, growth_rate),
        )
        # Long branch: two stacked 3x3 conv blocks after the bottleneck.
        self.branch2 = nn.Sequential(
            conv1x1_block(in_channels, squeeze_width),
            conv3x3_block(squeeze_width, growth_rate),
            conv3x3_block(growth_rate, growth_rate),
        )

    def forward(self, x):
        """Return ``cat([x, branch1(x), branch2(x)])`` along dim 1."""
        short_out = self.branch1(x)
        long_out = self.branch2(x)
        return torch.cat((x, short_out, long_out), dim=1)

class TransitionBlock(nn.Module):
    """1x1 conv block followed by 2x2 average pooling with stride 2.

    Args:
        in_channels: channels of the input tensor.
        out_channels: channels produced by the 1x1 conv block.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = conv1x1_block(in_channels, out_channels)
        self.pool = nn.AvgPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        """Return the pooled output of the 1x1 conv block."""
        return self.pool(self.conv(x))

class LightAttentionNet(nn.Module):
    """Classifier built from dense feature blocks and attention residual blocks.

    Layout: 3x3 conv stem (stride 2) and 3x3 max pool (stride 2), then one stage
    per entry of ``channels``. Stage ``i`` (0-based) holds ``i + 1``
    ``DenseFeatureBlock`` modules, one ``LightResBlock`` mapping to
    ``channels[i]``, and, except for the last stage, a ``TransitionBlock``.
    Features are globally average-pooled and classified by dropout + linear.

    Args:
        channels: output width of each stage. The default list is never mutated.
        init_channels: width of the stem convolution.
        growth_rates: per-stage growth rate for the dense blocks; must have the
            same length as ``channels``.
        num_classes: number of output logits.

    Raises:
        ValueError: if ``channels`` and ``growth_rates`` differ in length.
    """

    def __init__(self,
                 channels=[32, 64, 128, 256],
                 init_channels=32,
                 growth_rates=[16, 24, 32, 48],
                 num_classes=1000):
        super(LightAttentionNet, self).__init__()

        # zip() below stops at the shorter sequence, so a length mismatch would
        # silently build fewer stages than requested. Fail loudly instead.
        if len(channels) != len(growth_rates):
            raise ValueError(
                "channels and growth_rates must have the same length, got "
                f"{len(channels)} and {len(growth_rates)}"
            )

        self.features = nn.Sequential()
        self.features.add_module("init_conv", conv3x3_block(
            in_channels=3,
            out_channels=init_channels,
            stride=2))

        self.features.add_module("init_pool", nn.MaxPool2d(
            kernel_size=3,
            stride=2,
            padding=1))

        in_channels = init_channels

        for i, (out_channels, growth_rate) in enumerate(zip(channels, growth_rates)):
            stage = nn.Sequential()

            # Deeper stages get more dense blocks: stage i has i + 1 of them.
            for j in range(i + 1):
                stage.add_module(f"dense{j + 1}", DenseFeatureBlock(
                    in_channels=in_channels,
                    growth_rate=growth_rate))
                # Each dense block concatenates two branches of growth_rate channels.
                in_channels += growth_rate * 2

            stage.add_module("res_att", LightResBlock(
                in_channels=in_channels,
                out_channels=out_channels))

            # Every stage but the last halves the spatial resolution.
            if i != len(channels) - 1:
                stage.add_module("transition", TransitionBlock(
                    in_channels=out_channels,
                    out_channels=out_channels))

            self.features.add_module(f"stage{i + 1}", stage)
            in_channels = out_channels

        self.features.add_module("final_pool", nn.AdaptiveAvgPool2d(1))

        self.output = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(in_channels, num_classes)
        )

        self._initialize_weights()

    def _initialize_weights(self):
        """Initialise conv (Kaiming normal, fan_out), batch norm (1/0) and linear (std 0.001) layers."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def forward(self, x):
        """Return class logits for a batch of images."""
        x = self.features(x)
        # final_pool leaves a 1x1 spatial map; flatten it to (batch, features).
        x = x.view(x.size(0), -1)
        x = self.output(x)
        return x

def create_model_6(num_classes=1000, **kwargs):
    """Build the Experiment 6 network.

    Args:
        num_classes: number of output logits.
        **kwargs: forwarded unchanged to ``LightAttentionNet``.

    Returns:
        A newly constructed ``LightAttentionNet``.
    """
    network = LightAttentionNet(num_classes=num_classes, **kwargs)
    return network

In [16]:
# Build the Experiment 6 network with a two-class head and print its trainable parameter count
model = create_model_6(num_classes=2)
num_params = sum(param.numel() for param in model.parameters() if param.requires_grad)
print(num_params)

# Use CUDA when it is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Adam optimizer and a plateau scheduler (mode='max': tracks a metric that should increase)
# NOTE(review): `scheduler` is not passed to train_model below — confirm train_model uses it.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.1, patience=3
)

# Loss weights: reciprocal of each label's training count, ordered [label 0, label 1]
class_counts = train_new_df['label_encoded'].value_counts()
class_weights = torch.FloatTensor([1 / class_counts[label] for label in (0, 1)]).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Run training (train_model is defined elsewhere in the notebook)
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=35,
)

# Restore the checkpoint and switch to inference mode
model.load_state_dict(torch.load('best_model.pth', weights_only=True))
model.eval()

# Count correct argmax predictions over the test set
correct_test = 0
total_test = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, preds = torch.max(outputs, dim=1)
        correct_test = correct_test + (preds == labels).sum()
        total_test = total_test + labels.shape[0]

test_accuracy = correct_test.double() / total_test
print(f"Test Accuracy: {test_accuracy:.4f}")

3454906
Epoch 1/35
Train Loss: 0.5471, Train Acc: 0.7050, Train F1: 0.7013
Val Loss: 0.3750, Val Acc: 0.8207, Val F1: 0.8195
Epoch 2/35
Train Loss: 0.4787, Train Acc: 0.7568, Train F1: 0.7568
Val Loss: 0.3380, Val Acc: 0.8384, Val F1: 0.8382
Epoch 3/35
Train Loss: 0.4524, Train Acc: 0.7797, Train F1: 0.7799
Val Loss: 0.3318, Val Acc: 0.8535, Val F1: 0.8538
Epoch 4/35
Train Loss: 0.4223, Train Acc: 0.7970, Train F1: 0.7973
Val Loss: 0.3241, Val Acc: 0.8384, Val F1: 0.8380
Epoch 5/35
Train Loss: 0.4076, Train Acc: 0.8025, Train F1: 0.8029
Val Loss: 0.3122, Val Acc: 0.8611, Val F1: 0.8611
Epoch 6/35
Train Loss: 0.3891, Train Acc: 0.8141, Train F1: 0.8145
Val Loss: 0.2851, Val Acc: 0.8687, Val F1: 0.8688
Epoch 7/35
Train Loss: 0.3653, Train Acc: 0.8207, Train F1: 0.8211
Val Loss: 0.2647, Val Acc: 0.8914, Val F1: 0.8916
Epoch 8/35
Train Loss: 0.3610, Train Acc: 0.8293, Train F1: 0.8296
Val Loss: 0.2919, Val Acc: 0.8813, Val F1: 0.8812
Epoch 9/35
Train Loss: 0.3589, Train Acc: 0.8282, Train 

# Experiment 7 [ EfficientNet + SeResNet + CBAM] 

In [15]:
import torch
import torch.nn as nn
import math
from torch.nn import init

class CBAM(nn.Module):
    """Channel attention followed by spatial attention.

    Both sub-modules are expected to return the already-gated tensor, so their
    outputs are chained directly rather than multiplied into ``x`` here.

    Args:
        channels: channels of the input feature map.
        reduction_ratio: passed to ``ChannelAttention``.
    """

    def __init__(self, channels, reduction_ratio=16):
        super().__init__()
        self.channel_attention = ChannelAttention(channels, reduction_ratio)
        self.spatial_attention = SpatialAttention()

    def forward(self, x):
        """Return ``spatial_attention(channel_attention(x))``."""
        channel_refined = self.channel_attention(x)
        return self.spatial_attention(channel_refined)

class ChannelAttention(nn.Module):
    """Channel attention that returns the gated feature map.

    Average- and max-pooled channel descriptors go through one shared
    two-layer 1x1-conv MLP; the results are summed, squashed with a sigmoid and
    multiplied into the input. Unlike the Experiment 6 class of the same name,
    ``forward`` returns ``x * gate`` rather than the gate alone.

    ``conv1x1`` is a helper defined outside this cell.

    Args:
        channels: channels of the input feature map.
        reduction_ratio: divisor giving the MLP hidden width
            (``channels // reduction_ratio``, clamped to at least 1).
    """

    def __init__(self, channels, reduction_ratio):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        # Integer division yields 0 when channels < reduction_ratio, which would
        # create a zero-width bottleneck; keep at least one hidden channel.
        # The width is unchanged whenever channels >= reduction_ratio.
        mid_channels = max(1, channels // reduction_ratio)

        self.fc = nn.Sequential(
            conv1x1(channels, mid_channels, bias=True),
            nn.ReLU(inplace=True),
            conv1x1(mid_channels, channels, bias=True)
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` scaled channel-wise by the attention gate."""
        avg_out = self.fc(self.avg_pool(x))
        max_out = self.fc(self.max_pool(x))
        out = self.sigmoid(avg_out + max_out)
        return x * out

class SpatialAttention(nn.Module):
    """Spatial attention that returns the gated feature map.

    The channel-wise mean and max maps are stacked, reduced to one channel by
    ``conv3x3_block`` (helper defined outside this cell, no activation),
    squashed with a sigmoid and multiplied into the input.

    Args:
        kernel_size: accepted but not used; the convolution is always built
            with ``conv3x3_block``.
    """

    def __init__(self, kernel_size=7):
        super().__init__()
        self.conv = conv3x3_block(in_channels=2, out_channels=1, activation=None)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` scaled position-wise by the attention gate."""
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True)[0]
        stacked = torch.cat([channel_mean, channel_max], dim=1)
        gate = self.sigmoid(self.conv(stacked))
        return x * gate

class HybridBlock(nn.Module):
    """Inverted-bottleneck block: expand, depthwise conv, SE, optional CBAM, project.

    The input is widened to ``in_channels * expansion_factor`` channels with a
    1x1 block, filtered by a depthwise 3x3 block (which applies ``stride``),
    passed through ``SEBlock`` and, when enabled, ``CBAM``, then projected to
    ``out_channels`` without activation. The input is added back only when the
    block keeps both channel count and resolution.

    ``conv1x1_block``, ``dwconv3x3_block`` and ``SEBlock`` are defined outside
    this cell.

    Args:
        in_channels: channels of the input tensor.
        out_channels: channels of the output tensor.
        stride: stride of the depthwise convolution.
        expansion_factor: width multiplier for the hidden layers.
        use_cbam: whether to apply ``CBAM`` after the SE block.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 expansion_factor=4,
                 use_cbam=True):
        super().__init__()

        hidden_width = in_channels * expansion_factor
        self.use_cbam = use_cbam
        # The skip connection is only valid when the tensor shape is preserved.
        self.residual = (stride == 1) and (in_channels == out_channels)

        self.pw_expand = conv1x1_block(
            in_channels=in_channels,
            out_channels=hidden_width,
            activation="swish",
        )
        self.dw_conv = dwconv3x3_block(
            in_channels=hidden_width,
            out_channels=hidden_width,
            stride=stride,
            activation="swish",
        )
        self.se = SEBlock(
            channels=hidden_width,
            reduction=4,
            mid_activation="swish",
        )
        if self.use_cbam:
            self.cbam = CBAM(hidden_width)
        self.pw_project = conv1x1_block(
            in_channels=hidden_width,
            out_channels=out_channels,
            activation=None,
        )

    def forward(self, x):
        """Return the block output, with the input added when ``self.residual`` is set."""
        skip = x if self.residual else None

        out = self.pw_expand(x)
        out = self.dw_conv(out)
        out = self.se(out)
        if self.use_cbam:
            out = self.cbam(out)
        out = self.pw_project(out)

        if skip is None:
            return out
        return out + skip

class LightHybridNet(nn.Module):
    """Experiment 7 classifier: conv stem, ``HybridBlock`` stages, 1x1 head.

    Stage 0 keeps the resolution and holds one block; every later stage
    downsamples by 2 in its first block and adds a second stride-1 block.
    Only the last two stages enable CBAM. A final 1x1 block widens to
    ``final_block_channels`` before global average pooling, dropout and a
    linear classifier.

    Note: a later cell in this notebook defines another class with this name.

    Args:
        channels: output width of each stage (default list is not mutated).
        init_block_channels: width of the stem convolution.
        final_block_channels: width of the last 1x1 block / classifier input.
        num_classes: number of output logits.
    """

    def __init__(self,
                 channels=[16, 24, 48, 88, 168],
                 init_block_channels=24,
                 final_block_channels=1280,
                 num_classes=1000):
        super().__init__()

        self.features = nn.Sequential()
        self.features.add_module("init_block", conv3x3_block(
            in_channels=3,
            out_channels=init_block_channels,
            stride=2,
            activation="swish"))

        first_cbam_stage = len(channels) - 2
        previous_width = init_block_channels

        for stage_index, stage_width in enumerate(channels):
            is_first_stage = stage_index == 0
            with_cbam = stage_index >= first_cbam_stage

            stage = nn.Sequential()
            stage.add_module("unit1", HybridBlock(
                in_channels=previous_width,
                out_channels=stage_width,
                stride=1 if is_first_stage else 2,
                use_cbam=with_cbam))

            # Every stage after the first gets a second, shape-preserving block.
            if not is_first_stage:
                stage.add_module("unit2", HybridBlock(
                    in_channels=stage_width,
                    out_channels=stage_width,
                    stride=1,
                    use_cbam=with_cbam))

            self.features.add_module(f"stage{stage_index + 1}", stage)
            previous_width = stage_width

        self.features.add_module("final_block", conv1x1_block(
            in_channels=previous_width,
            out_channels=final_block_channels,
            activation="swish"))
        self.features.add_module("final_pool", nn.AdaptiveAvgPool2d(1))

        classifier = nn.Linear(
            in_features=final_block_channels,
            out_features=num_classes)
        self.output = nn.Sequential(nn.Dropout(0.2), classifier)

        self._init_params()

    def _init_params(self):
        """Initialise conv (Kaiming normal), batch norm (1/0) and linear (std 0.01) layers."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                init.kaiming_normal_(module.weight)
                if module.bias is not None:
                    init.constant_(module.bias, 0)
            elif isinstance(module, nn.BatchNorm2d):
                init.constant_(module.weight, 1)
                init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                init.normal_(module.weight, std=0.01)
                if module.bias is not None:
                    init.constant_(module.bias, 0)

    def forward(self, x):
        """Return class logits for a batch of images."""
        feature_map = self.features(x)
        flat = feature_map.view(feature_map.size(0), -1)
        return self.output(flat)

def create_model_7(num_classes=1000, **kwargs):
    """Build the Experiment 7 network.

    Resolves ``LightHybridNet`` at call time, so it builds whichever class
    currently holds that name in the notebook.

    Args:
        num_classes: number of output logits.
        **kwargs: forwarded unchanged to ``LightHybridNet``.
    """
    network = LightHybridNet(num_classes=num_classes, **kwargs)
    return network

In [17]:
# Build the Experiment 7 network with a two-class head and print its trainable parameter count
model = create_model_7(num_classes=2)
num_params = sum(param.numel() for param in model.parameters() if param.requires_grad)
print(num_params)

# Use CUDA when it is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Adam optimizer and a plateau scheduler (mode='max': tracks a metric that should increase)
# NOTE(review): `scheduler` is not passed to train_model below — confirm train_model uses it.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.1, patience=3
)

# Loss weights: reciprocal of each label's training count, ordered [label 0, label 1]
class_counts = train_new_df['label_encoded'].value_counts()
class_weights = torch.FloatTensor([1 / class_counts[label] for label in (0, 1)]).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Run training (train_model is defined elsewhere in the notebook)
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=35,
)

# Restore the checkpoint and switch to inference mode
model.load_state_dict(torch.load('best_model.pth', weights_only=True))
model.eval()

# Count correct argmax predictions over the test set
correct_test = 0
total_test = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, preds = torch.max(outputs, dim=1)
        correct_test = correct_test + (preds == labels).sum()
        total_test = total_test + labels.shape[0]

test_accuracy = correct_test.double() / total_test
print(f"Test Accuracy: {test_accuracy:.4f}")

1188828
Epoch 1/35
Train Loss: 0.5729, Train Acc: 0.6905, Train F1: 0.6912
Val Loss: 0.4035, Val Acc: 0.8056, Val F1: 0.8046
Epoch 2/35
Train Loss: 0.4952, Train Acc: 0.7450, Train F1: 0.7447
Val Loss: 0.3791, Val Acc: 0.8131, Val F1: 0.8135
Epoch 3/35
Train Loss: 0.4754, Train Acc: 0.7627, Train F1: 0.7627
Val Loss: 0.3688, Val Acc: 0.8182, Val F1: 0.8182
Epoch 4/35
Train Loss: 0.4609, Train Acc: 0.7707, Train F1: 0.7709
Val Loss: 0.3454, Val Acc: 0.8207, Val F1: 0.8207
Epoch 5/35
Train Loss: 0.4493, Train Acc: 0.7746, Train F1: 0.7747
Val Loss: 0.3359, Val Acc: 0.8409, Val F1: 0.8412
Epoch 6/35
Train Loss: 0.4327, Train Acc: 0.7873, Train F1: 0.7876
Val Loss: 0.3171, Val Acc: 0.8510, Val F1: 0.8513
Epoch 7/35
Train Loss: 0.4275, Train Acc: 0.7864, Train F1: 0.7867
Val Loss: 0.3209, Val Acc: 0.8561, Val F1: 0.8560
Epoch 8/35
Train Loss: 0.4138, Train Acc: 0.7964, Train F1: 0.7968
Val Loss: 0.3002, Val Acc: 0.8586, Val F1: 0.8589
Epoch 9/35
Train Loss: 0.3973, Train Acc: 0.8026, Train 

# Experiment 8 [ EfficientNet + SeResNet]

In [18]:
import torch
import torch.nn as nn
import math
from torch.nn import init

class AdaptiveSEBlock(nn.Module):
    """Squeeze-and-excitation gate whose reduction ratio depends on the width.

    The reduction ratio is ``channels // 8`` clamped to
    ``[min_reduction, max_reduction]``; the hidden width is
    ``channels // reduction`` but never below 8. The gate is: global average
    pool, 1x1 conv, SiLU, 1x1 conv, sigmoid.

    Args:
        channels: channels of the input feature map.
        min_reduction: lower bound for the reduction ratio.
        max_reduction: upper bound for the reduction ratio.
    """

    def __init__(self, channels, min_reduction=4, max_reduction=16):
        super().__init__()
        lower_bounded = max(min_reduction, channels // 8)
        reduction = min(lower_bounded, max_reduction)
        hidden_width = max(channels // reduction, 8)

        squeeze = nn.Conv2d(channels, hidden_width, 1)
        excite = nn.Conv2d(hidden_width, channels, 1)
        self.gate = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            squeeze,
            nn.SiLU(inplace=True),
            excite,
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return ``x`` scaled channel-wise by the gate."""
        scale = self.gate(x)
        return x * scale

class LightweightConvBlock(nn.Module):
    """Depthwise conv + batch norm + SiLU, then 1x1 conv + batch norm + SiLU.

    Padding is ``kernel_size // 2``. Both convolutions keep their default bias.

    Args:
        in_channels: channels of the input tensor.
        out_channels: channels produced by the 1x1 convolution.
        kernel_size: kernel size of the depthwise convolution.
        stride: stride of the depthwise convolution.
        groups: accepted but not used; the first convolution always uses
            ``groups=in_channels``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, groups=1):
        super().__init__()
        same_padding = kernel_size // 2

        depthwise = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size,
            stride,
            same_padding,
            groups=in_channels,
        )
        depthwise_norm = nn.BatchNorm2d(in_channels)
        pointwise = nn.Conv2d(in_channels, out_channels, 1)
        pointwise_norm = nn.BatchNorm2d(out_channels)

        self.conv = nn.Sequential(
            depthwise,
            depthwise_norm,
            nn.SiLU(inplace=True),
            pointwise,
            pointwise_norm,
            nn.SiLU(inplace=True),
        )

    def forward(self, x):
        """Apply the depthwise/pointwise stack to ``x``."""
        result = self.conv(x)
        return result

class ResidualBlock(nn.Module):
    """Shape-preserving residual block: conv block, SE gate, conv block, skip add.

    Args:
        channels: channels of both input and output.
        kernel_size: kernel size passed to both ``LightweightConvBlock`` modules.
    """

    def __init__(self, channels, kernel_size=3):
        super().__init__()
        self.conv1 = LightweightConvBlock(channels, channels, kernel_size)
        self.se = AdaptiveSEBlock(channels)
        self.conv2 = LightweightConvBlock(channels, channels, kernel_size)

    def forward(self, x):
        """Return ``conv2(se(conv1(x))) + x``."""
        branch = x
        for layer in (self.conv1, self.se, self.conv2):
            branch = layer(branch)
        return branch + x

class DownsampleBlock(nn.Module):
    """Stride-2 ``LightweightConvBlock`` followed by an ``AdaptiveSEBlock``.

    Args:
        in_channels: channels of the input tensor.
        out_channels: channels of the output tensor.
        kernel_size: kernel size of the depthwise convolution.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3):
        super().__init__()
        self.conv = LightweightConvBlock(in_channels, out_channels, kernel_size, stride=2)
        self.se = AdaptiveSEBlock(out_channels)

    def forward(self, x):
        """Return the SE-gated, downsampled features."""
        return self.se(self.conv(x))

class LightHybridNet(nn.Module):
    """Experiment 8 classifier: conv stem, three downsampling stages, MLP head.

    Each stage is one ``DownsampleBlock`` followed by two ``ResidualBlock``
    modules, with widths 2x, 4x and 8x ``base_channels``. Features are globally
    average-pooled, passed through dropout and classified by
    ``Linear -> SiLU -> Dropout(0.2) -> Linear``.

    Note: this class reuses the name of the Experiment 7 network defined
    earlier in the notebook; once this cell runs, ``LightHybridNet`` refers to
    this definition.

    Args:
        num_classes: number of output logits.
        input_channels: channels of the input images.
        base_channels: width of the stem; later widths are multiples of it.
    """

    def __init__(self, num_classes=1000, input_channels=3, base_channels=32):
        super().__init__()

        # Stem: 3x3 conv with stride 2, batch norm, SiLU.
        self.init_conv = nn.Sequential(
            nn.Conv2d(input_channels, base_channels, 3, stride=2, padding=1),
            nn.BatchNorm2d(base_channels),
            nn.SiLU(inplace=True),
        )

        # Stage widths double each time: base -> 2x -> 4x -> 8x.
        width_2x = base_channels * 2
        width_4x = base_channels * 4
        width_8x = base_channels * 8
        self.stage1 = self._make_stage(base_channels, width_2x)
        self.stage2 = self._make_stage(width_2x, width_4x)
        self.stage3 = self._make_stage(width_4x, width_8x)

        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(0.2)

        self.classifier = nn.Sequential(
            nn.Linear(width_8x, width_4x),
            nn.SiLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(width_4x, num_classes),
        )

        self._initialize_weights()

    def _make_stage(self, in_channels, out_channels):
        """Build one stage: a downsampling block and two residual blocks."""
        blocks = [
            DownsampleBlock(in_channels, out_channels),
            ResidualBlock(out_channels),
            ResidualBlock(out_channels),
        ]
        return nn.Sequential(*blocks)

    def _initialize_weights(self):
        """Initialise conv (Kaiming normal, fan_out), batch norm (1/0) and linear (std 0.01) layers."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, mean=0, std=0.01)
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return class logits for a batch of images."""
        features = self.init_conv(x)
        for stage in (self.stage1, self.stage2, self.stage3):
            features = stage(features)

        pooled = self.global_pool(features)
        flat = pooled.view(pooled.size(0), -1)
        return self.classifier(self.dropout(flat))

def create_model_8(num_classes=1000, **kwargs):
    """Build the Experiment 8 network.

    Resolves ``LightHybridNet`` at call time, so it builds whichever class
    currently holds that name in the notebook.

    Args:
        num_classes: number of output logits.
        **kwargs: forwarded unchanged to ``LightHybridNet``.
    """
    network = LightHybridNet(num_classes=num_classes, **kwargs)
    return network

In [19]:
# Build the Experiment 8 network with a two-class head and print its trainable parameter count
model = create_model_8(num_classes=2)
num_params = sum(param.numel() for param in model.parameters() if param.requires_grad)
print(num_params)

# Use CUDA when it is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Adam optimizer and a plateau scheduler (mode='max': tracks a metric that should increase)
# NOTE(review): `scheduler` is not passed to train_model below — confirm train_model uses it.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.1, patience=3
)

# Loss weights: reciprocal of each label's training count, ordered [label 0, label 1]
class_counts = train_new_df['label_encoded'].value_counts()
class_weights = torch.FloatTensor([1 / class_counts[label] for label in (0, 1)]).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Run training (train_model is defined elsewhere in the notebook)
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=35,
)

# Restore the checkpoint and switch to inference mode
model.load_state_dict(torch.load('best_model.pth', weights_only=True))
model.eval()

# Count correct argmax predictions over the test set
correct_test = 0
total_test = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, preds = torch.max(outputs, dim=1)
        correct_test = correct_test + (preds == labels).sum()
        total_test = total_test + labels.shape[0]

test_accuracy = correct_test.double() / total_test
print(f"Test Accuracy: {test_accuracy:.4f}")

487330
Epoch 1/35
Train Loss: 0.5628, Train Acc: 0.6923, Train F1: 0.6929
Val Loss: 0.3606, Val Acc: 0.8131, Val F1: 0.8120
Epoch 2/35
Train Loss: 0.4621, Train Acc: 0.7718, Train F1: 0.7716
Val Loss: 0.3336, Val Acc: 0.8333, Val F1: 0.8337
Epoch 3/35
Train Loss: 0.4366, Train Acc: 0.7888, Train F1: 0.7890
Val Loss: 0.3100, Val Acc: 0.8434, Val F1: 0.8437
Epoch 4/35
Train Loss: 0.4143, Train Acc: 0.7984, Train F1: 0.7987
Val Loss: 0.2920, Val Acc: 0.8611, Val F1: 0.8614
Epoch 5/35
Train Loss: 0.4017, Train Acc: 0.8033, Train F1: 0.8036
Val Loss: 0.2988, Val Acc: 0.8561, Val F1: 0.8560
Epoch 6/35
Train Loss: 0.3847, Train Acc: 0.8151, Train F1: 0.8154
Val Loss: 0.2978, Val Acc: 0.8662, Val F1: 0.8664
Epoch 7/35
Train Loss: 0.3820, Train Acc: 0.8104, Train F1: 0.8107
Val Loss: 0.2909, Val Acc: 0.8636, Val F1: 0.8636
Epoch 8/35
Train Loss: 0.3791, Train Acc: 0.8199, Train F1: 0.8202
Val Loss: 0.3150, Val Acc: 0.8561, Val F1: 0.8558
Epoch 9/35
Train Loss: 0.3669, Train Acc: 0.8251, Train F

# Experiment 9 [EfficientNet + SeResNet + SqueezeNet + CBAM]

In [20]:
import torch
import torch.nn as nn
import math
from torch.nn import functional as F

class ChannelAttention(nn.Module):
    """Channel attention gate used by ``CBAMBlock``.

    Average- and max-pooled channel descriptors go through one shared
    bias-free two-layer 1x1-conv MLP; the results are summed and squashed with
    a sigmoid. ``forward`` returns the gate only; the caller multiplies it into
    the feature map.

    Args:
        channels: channels of the input feature map.
        reduction_ratio: divisor giving the MLP hidden width
            (``channels // reduction_ratio``, clamped to at least 1).
    """

    def __init__(self, channels, reduction_ratio=16):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        # Integer division yields 0 when channels < reduction_ratio, which would
        # create a zero-width bottleneck; keep at least one hidden channel.
        # The width is unchanged whenever channels >= reduction_ratio.
        mid_channels = max(1, channels // reduction_ratio)

        self.fc = nn.Sequential(
            nn.Conv2d(channels, mid_channels, 1, bias=False),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, channels, 1, bias=False)
        )

    def forward(self, x):
        """Return the sigmoid channel gate computed from ``x``."""
        avg_out = self.fc(self.avg_pool(x))
        max_out = self.fc(self.max_pool(x))
        out = torch.sigmoid(avg_out + max_out)
        return out

class SpatialAttention(nn.Module):
    """Spatial-attention gate producing one sigmoid weight per location.

    The channel dimension (dim 1) is collapsed twice, by mean and by max. The
    two resulting single-channel maps are stacked and passed through one
    bias-free convolution followed by a sigmoid.

    Args:
        kernel_size: Size of the convolution kernel; padding is
            ``kernel_size // 2``.

    ``forward`` returns the gate only (one channel); the caller multiplies it
    with the feature map.
    """

    def __init__(self, kernel_size=7):
        super().__init__()
        half_kernel = kernel_size // 2
        self.conv = nn.Conv2d(
            2,
            1,
            kernel_size,
            padding=half_kernel,
            bias=False,
        )

    def forward(self, x):
        """Return the per-location gate for ``x``."""
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True).values
        pooled = torch.cat((channel_mean, channel_max), dim=1)
        return torch.sigmoid(self.conv(pooled))

class CBAMBlock(nn.Module):
    """Applies channel attention, then spatial attention, to a feature map.

    Args:
        channels: Channel count forwarded to ``ChannelAttention``.
        reduction_ratio: Bottleneck divisor forwarded to ``ChannelAttention``.
        kernel_size: Kernel size forwarded to ``SpatialAttention``.

    ``forward`` multiplies the input by the channel gate, then multiplies that
    result by the spatial gate computed from it. The input tensor itself is
    not modified in place.
    """

    def __init__(self, channels, reduction_ratio=16, kernel_size=7):
        super().__init__()
        self.channel_attention = ChannelAttention(channels, reduction_ratio)
        self.spatial_attention = SpatialAttention(kernel_size)

    def forward(self, x):
        """Return ``x`` re-weighted by both attention gates."""
        channel_refined = x * self.channel_attention(x)
        # The spatial gate is computed from the channel-refined map, not from x.
        spatial_gate = self.spatial_attention(channel_refined)
        return channel_refined * spatial_gate

class LightweightFireModule(nn.Module):
    """Squeeze/expand block with a CBAM gate on its output.

    ``forward`` passes the input through ``squeeze`` (1x1), feeds the result to
    two parallel branches (``expand1x1`` and ``expand3x3``), concatenates the
    branch outputs along dim 1 and applies ``CBAMBlock`` to the concatenation.

    ``conv1x1_block`` / ``conv3x3_block`` are helpers defined elsewhere in the
    notebook; this class assumes each returns a module that emits
    ``out_channels`` channels at the input's spatial size -- TODO confirm.

    Args:
        in_channels: Channels of the incoming feature map.
        squeeze_channels: Channels produced by the squeeze layer.
        expand_channels: Total channels after concatenating both expand
            branches. The 1x1 branch gets ``expand_channels // 2`` and the 3x3
            branch gets the remainder, so odd values also add up to
            ``expand_channels`` (the width CBAM is built for).

    Raises:
        ValueError: If ``expand_channels`` is smaller than 2, which would leave
            a branch without channels.
    """

    def __init__(self, in_channels, squeeze_channels, expand_channels):
        super(LightweightFireModule, self).__init__()
        if expand_channels < 2:
            raise ValueError(
                f"expand_channels must be >= 2, got {expand_channels}")

        # Previously both branches used expand_channels // 2, so an odd value
        # produced expand_channels - 1 channels and broke the CBAM block below.
        expand1x1_channels = expand_channels // 2
        expand3x3_channels = expand_channels - expand1x1_channels

        self.squeeze = conv1x1_block(
            in_channels=in_channels,
            out_channels=squeeze_channels,
            activation="relu")

        self.expand1x1 = conv1x1_block(
            in_channels=squeeze_channels,
            out_channels=expand1x1_channels,
            activation="relu")

        self.expand3x3 = conv3x3_block(
            in_channels=squeeze_channels,
            out_channels=expand3x3_channels,
            activation="relu")

        self.cbam = CBAMBlock(expand_channels)

    def forward(self, x):
        """Squeeze, expand through both branches, concatenate, then gate."""
        x = self.squeeze(x)
        x1 = self.expand1x1(x)
        x2 = self.expand3x3(x)
        out = torch.cat([x1, x2], dim=1)
        out = self.cbam(out)
        return out

class EfficientBlock(nn.Module):
    """Expand -> depthwise 3x3 -> SE -> project block with an optional skip.

    The building helpers (``conv1x1_block``, ``dwconv3x3_block``, ``SEBlock``)
    are defined elsewhere in the notebook; their exact contents (normalisation,
    activation handling) are not visible here -- confirm before relying on them.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by the projection layer.
        stride: Stride handed to the depthwise block.
        expansion_factor: Multiplier giving the intermediate width
            ``in_channels * expansion_factor``. When it is 1 the expansion
            layer is omitted (``self.expand`` is ``None``).

    The input is added back to the output only when ``stride == 1`` and
    ``in_channels == out_channels``.
    """

    def __init__(self, in_channels, out_channels, stride=1, expansion_factor=6):
        super().__init__()
        # The skip connection needs matching channel count and no striding.
        self.residual = (in_channels == out_channels) and (stride == 1)
        mid_channels = in_channels * expansion_factor

        if expansion_factor != 1:
            self.expand = conv1x1_block(
                in_channels=in_channels,
                out_channels=mid_channels,
                activation="swish")
        else:
            self.expand = None

        self.depthwise = dwconv3x3_block(
            in_channels=mid_channels,
            out_channels=mid_channels,
            stride=stride,
            activation="swish")

        self.se = SEBlock(
            channels=mid_channels,
            reduction=4,
            mid_activation="swish")

        # No activation after the projection.
        self.project = conv1x1_block(
            in_channels=mid_channels,
            out_channels=out_channels,
            activation=None)

    def forward(self, x):
        """Run the block; add the input back when the skip is enabled."""
        shortcut = x
        out = x if self.expand is None else self.expand(x)
        out = self.project(self.se(self.depthwise(out)))
        if not self.residual:
            return out
        return out + shortcut

class LightFusionNet(nn.Module):
    """Classifier used in Experiment 9.

    Layout, in execution order: a strided 3x3 stem, two
    ``LightweightFireModule`` blocks (stage1), two pairs of ``EfficientBlock``
    (stage2 and stage3, each opening with a stride-2 block), a 1x1 block
    widening 256 -> 512, global average pooling, dropout (p=0.2) and a linear
    classifier.

    Args:
        in_channels: Channels of the input image.
        num_classes: Number of output logits.
        init_channels: Output channels of the stem block.
    """

    def __init__(self, in_channels=3, num_classes=1000, init_channels=32):
        super().__init__()

        # Stem
        self.init_block = conv3x3_block(
            in_channels=in_channels,
            out_channels=init_channels,
            stride=2)

        # Stage 1: fire modules with CBAM
        fire_blocks = [
            LightweightFireModule(init_channels, 16, 64),
            LightweightFireModule(64, 16, 64),
        ]
        self.stage1 = nn.Sequential(*fire_blocks)

        # Stage 2: 64 -> 128, first block strided
        stage2_blocks = [
            EfficientBlock(64, 128, stride=2, expansion_factor=4),
            EfficientBlock(128, 128, expansion_factor=4),
        ]
        self.stage2 = nn.Sequential(*stage2_blocks)

        # Stage 3: 128 -> 256, first block strided
        stage3_blocks = [
            EfficientBlock(128, 256, stride=2, expansion_factor=6),
            EfficientBlock(256, 256, expansion_factor=6),
        ]
        self.stage3 = nn.Sequential(*stage3_blocks)

        # Head
        self.final_block = conv1x1_block(
            in_channels=256,
            out_channels=512,
            activation="swish")
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(0.2)
        self.output = nn.Linear(512, num_classes)

        self._init_params()

    def _init_params(self):
        """Re-initialise every Conv2d, BatchNorm2d and Linear submodule.

        Conv weights use Kaiming-uniform, BatchNorm starts at weight 1 / bias 0,
        Linear weights are drawn from N(0, 0.01); all biases are zeroed.
        """
        for layer in self.modules():
            if isinstance(layer, nn.Conv2d):
                init.kaiming_uniform_(layer.weight)
                if layer.bias is not None:
                    init.zeros_(layer.bias)
            elif isinstance(layer, nn.BatchNorm2d):
                init.ones_(layer.weight)
                init.zeros_(layer.bias)
            elif isinstance(layer, nn.Linear):
                init.normal_(layer.weight, 0, 0.01)
                init.zeros_(layer.bias)

    def forward(self, x):
        """Return class logits for a batch of images."""
        stem = self.init_block(x)
        feats = self.stage1(stem)
        # Skip around stage1 only when shapes agree. stage1 is built for 64
        # output channels, so with the default init_channels=32 this branch is
        # presumably never taken -- confirm whether that is intended.
        if feats.size() == stem.size():
            feats = feats + stem
        feats = self.stage3(self.stage2(feats))

        feats = self.global_pool(self.final_block(feats))
        feats = feats.view(feats.size(0), -1)
        return self.output(self.dropout(feats))

def create_model_9(num_classes=1000, **kwargs):
    """Factory for the Experiment 9 network.

    Args:
        num_classes: Number of output logits.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``LightFusionNet``.

    Returns:
        A newly constructed ``LightFusionNet``.
    """
    network = LightFusionNet(num_classes=num_classes, **kwargs)
    return network

In [21]:
# Experiment 9 driver: build LightFusionNet, train it, then score the saved
# checkpoint on the test set.

# Initialize model and report its trainable-parameter count
model = create_model_9(num_classes=2)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(num_params)

# Transfer to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Setup optimizer and scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
# NOTE(review): this scheduler is not passed to train_model() and is never
# stepped in this cell. Unless train_model reads it as a notebook global, the
# learning rate stays at 1e-4 for the whole run -- confirm.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.1, patience=3
)

# Calculate class weights as the inverse of each class count.
# Assumes 'label_encoded' contains exactly the labels 0 and 1 -- TODO confirm.
class_counts = train_new_df['label_encoded'].value_counts()
class_weights = torch.tensor(
    [1 / class_counts[0], 1 / class_counts[1]],
    dtype=torch.float32,
    device=device,
)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Train the model
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=35
)

# Load the best model (presumably written by train_model -- confirm).
# map_location keeps the load working when the checkpoint was saved from a
# different device than the one available now.
model.load_state_dict(
    torch.load('best_model.pth', map_location=device, weights_only=True)
)
model.eval()

# Evaluate on test set. The accumulator is a tensor from the start so its type
# does not depend on whether the loop body ever ran.
correct_test = torch.zeros((), dtype=torch.long, device=device)
total_test = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        preds = outputs.argmax(dim=1)
        correct_test += torch.sum(preds == labels)
        total_test += labels.size(0)

if total_test == 0:
    raise RuntimeError("test_loader yielded no samples; cannot compute test accuracy")

test_accuracy = correct_test.double() / total_test
print(f"Test Accuracy: {test_accuracy:.4f}")

3092646
Epoch 1/35
Train Loss: 0.5326, Train Acc: 0.7298, Train F1: 0.7300
Val Loss: 0.4001, Val Acc: 0.8131, Val F1: 0.8134
Epoch 2/35
Train Loss: 0.4913, Train Acc: 0.7513, Train F1: 0.7511
Val Loss: 0.3630, Val Acc: 0.8308, Val F1: 0.8310
Epoch 3/35
Train Loss: 0.4595, Train Acc: 0.7736, Train F1: 0.7736
Val Loss: 0.3246, Val Acc: 0.8409, Val F1: 0.8411
Epoch 4/35
Train Loss: 0.4282, Train Acc: 0.7892, Train F1: 0.7893
Val Loss: 0.2945, Val Acc: 0.8485, Val F1: 0.8487
Epoch 5/35
Train Loss: 0.4166, Train Acc: 0.7999, Train F1: 0.8002
Val Loss: 0.3048, Val Acc: 0.8662, Val F1: 0.8664
Epoch 6/35
Train Loss: 0.3993, Train Acc: 0.8079, Train F1: 0.8082
Val Loss: 0.2723, Val Acc: 0.8889, Val F1: 0.8888
Epoch 7/35
Train Loss: 0.3867, Train Acc: 0.8148, Train F1: 0.8151
Val Loss: 0.2804, Val Acc: 0.8586, Val F1: 0.8578
Epoch 8/35
Train Loss: 0.3833, Train Acc: 0.8170, Train F1: 0.8173
Val Loss: 0.2645, Val Acc: 0.8838, Val F1: 0.8841
Epoch 9/35
Train Loss: 0.3684, Train Acc: 0.8245, Train 