In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cifar-10/trainLabels.csv
/kaggle/input/cifar-10/sampleSubmission.csv
/kaggle/input/cifar-10/test.7z
/kaggle/input/cifar-10/train.7z


In [None]:
import os
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import numpy as np
from PIL import Image
import random
import pandas as pd
!pip install py7zr
import py7zr
from io import BytesIO
import torch.optim.lr_scheduler as lr_scheduler

# Define a basic residual block with dropout
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.dropout = nn.Dropout(p=0.3)  # Dropout to regularize

        self.skip_connection = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.skip_connection = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        identity = self.skip_connection(x)
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.dropout(out)  # Apply dropout
        out += identity
        out = self.relu(out)
        return out

# Define ResNet-34 for CIFAR-10
class ResNet34(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet34, self).__init__()
        self.in_channels = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride):
        strides = [stride] + [1] * (blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.avg_pool(out)
        out = torch.flatten(out, 1)
        out = self.fc(out)
        return out

# Function to instantiate the ResNet-34 model
def get_resnet34():
    return ResNet34(ResidualBlock, [3, 4, 6, 3])

# Custom CutOut transformation
class CutOut(object):
    def __init__(self, n_holes, length):
        self.n_holes = n_holes
        self.length = length

    def __call__(self, img):
        h, w = img.size(1), img.size(2)
        mask = np.ones((h, w), np.float32)

        for n in range(self.n_holes):
            y = random.randint(0, h)
            x = random.randint(0, w)

            y1 = np.clip(y - self.length // 2, 0, h)
            y2 = np.clip(y + self.length // 2, 0, h)
            x1 = np.clip(x - self.length // 2, 0, w)
            x2 = np.clip(x + self.length // 2, 0, w)

            mask[y1:y2, x1:x2] = 0.

        mask = torch.from_numpy(mask).expand_as(img)
        img = img * mask
        return img

# Transformation function using PyTorch's transforms
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),  # Horizontal flip
    transforms.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1)),  # Rotate, scale, and shift
    transforms.RandomCrop(32, padding=4),  # Random crop with padding
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
    CutOut(n_holes=1, length=8),  # Use CutOut after ToTensor for regularization
])

transform_test = transforms.Compose([
    transforms.Resize((32, 32)),  # Resize to 32x32
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

# Example of applying the transformations to the CIFAR-10 dataset
train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=100, shuffle=False)

# Define device (GPU/CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create model, define loss function and optimizer
model = get_resnet34().to(device)
criterion = nn.CrossEntropyLoss()

# Use AdamW optimizer with weight decay
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
# Calculate steps per epoch
steps_per_epoch = len(train_loader)  # Number of batches in the training set

# Cosine annealing scheduler with warmup

# Define the OneCycleLR scheduler
scheduler = lr_scheduler.OneCycleLR(
    optimizer, 
    max_lr=0.001,        # The maximum learning rate after warmup
    steps_per_epoch=steps_per_epoch,  # Total steps in one epoch (train dataset size / batch size)
    epochs=80,           # Total number of epochs
    pct_start=0.3,       # Warmup period (30% of the total steps)
    anneal_strategy='cos',  # Cosine annealing after warmup
    div_factor=25.0      # Initial learning rate will be max_lr / div_factor
)

# Training loop
def train(epoch):
    model.train()
    running_loss = 0.0
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Update the learning rate at each batch step
        scheduler.step()

        running_loss += loss.item()
        if batch_idx % 100 == 99:
            print(f"Epoch {epoch+1}, Batch {batch_idx+1}: Loss = {running_loss / 100:.4f}")
            running_loss = 0.0

def test(epoch):
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(test_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            test_loss += loss.item()

            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

        print(f"Epoch {epoch+1}: Test Loss = {test_loss / len(test_loader):.4f}, Accuracy = {100. * correct / total:.2f}%")

# Main training loop
for epoch in range(80):
    train(epoch)
    test(epoch)

# Run the test for submission
# Prediction process omitted for brevity; you can reuse your existing test set submission process.

# Create submission file with predictions for test images from .7z archive
test_filenames = []
test_images = []

with py7zr.SevenZipFile('/kaggle/input/cifar-10/test.7z', mode='r') as z:
    for name, file in z.readall().items():
        if name.endswith('.png'):
            img = Image.open(BytesIO(file.read()))
            test_images.append(transform_test(img))
            test_filenames.append(name)

test_images = torch.stack(test_images)
test_loader = DataLoader(test_images, batch_size=100, shuffle=False)

# Prediction and CSV creation
classes = ('airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

result = []
with torch.no_grad():
    model.eval()
    for inputs in tqdm(test_loader):
        inputs = inputs.to(device)
        outputs = model(inputs)
        _, predicted = outputs.max(1)
        result.extend(predicted.cpu().numpy())

# Create submission DataFrame
submission_df = pd.DataFrame({
    'id': [os.path.basename(f).replace('.png', '') for f in test_filenames],  # Remove .png from filenames
    'label': [classes[label] for label in result]
})

# Save submission file
submission_df.to_csv('/kaggle/working/submission.csv', index=False)

print("Submission file created successfully!")

Collecting py7zr
  Downloading py7zr-0.22.0-py3-none-any.whl.metadata (16 kB)
Collecting pycryptodomex>=3.16.0 (from py7zr)
  Downloading pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting pyzstd>=0.15.9 (from py7zr)
  Downloading pyzstd-0.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.4 kB)
Collecting pyppmd<1.2.0,>=1.1.0 (from py7zr)
  Downloading pyppmd-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.7 kB)
Collecting pybcj<1.1.0,>=1.0.0 (from py7zr)
  Downloading pybcj-1.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting multivolumefile>=0.2.3 (from py7zr)
  Downloading multivolumefile-0.2.3-py3-none-any.whl.metadata (6.3 kB)
Collecting inflate64<1.1.0,>=1.0.0 (from py7zr)
  Downloading inflate64-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Downloading py7zr-0.22.0-py3-none-any.whl (67 kB)
[2

100%|██████████| 170498071/170498071 [00:11<00:00, 14324935.05it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Epoch 1, Batch 100: Loss = 2.1488
Epoch 1: Test Loss = 2.8656, Accuracy = 24.68%
Epoch 2, Batch 100: Loss = 1.7243
Epoch 2: Test Loss = 2.1155, Accuracy = 35.03%
Epoch 3, Batch 100: Loss = 1.5692
Epoch 3: Test Loss = 1.7380, Accuracy = 43.46%
Epoch 4, Batch 100: Loss = 1.4308
Epoch 4: Test Loss = 1.4408, Accuracy = 52.41%
Epoch 5, Batch 100: Loss = 1.2982
Epoch 5: Test Loss = 1.2856, Accuracy = 56.45%
Epoch 6, Batch 100: Loss = 1.2103
Epoch 6: Test Loss = 1.1827, Accuracy = 60.42%
Epoch 7, Batch 100: Loss = 1.1042
Epoch 7: Test Loss = 1.1319, Accuracy = 62.62%
Epoch 8, Batch 100: Loss = 1.0405
Epoch 8: Test Loss = 0.9317, Accuracy = 67.84%
Epoch 9, Batch 100: Loss = 0.9763
Epoch 9: Test Loss = 1.1378, Accuracy = 64.79%
Epoch 10, Batch 100: Loss = 0.8954
Epoch 10: Test Loss = 0.8063, Accuracy = 72.42%
Epoch 11, Batch 100: Loss = 0.8460
Epoch 11: Test Loss = 0.7982, Accuracy = 73.11%
Epoch 12, Batch 

## Pre-written test code

In [3]:
!pip install onnx-tool

Collecting onnx-tool
  Downloading onnx_tool-0.9.0-py3-none-any.whl.metadata (9.6 kB)
Downloading onnx_tool-0.9.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnx-tool
Successfully installed onnx-tool-0.9.0


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import math
import re
import onnx_tool
import torch.onnx
import yaml
import os

###################################################################################################
##############################  This is just a crap code made by me. ############################## 
###################################################################################################


# ========== Modifiable Parameters ==========
num_classes = 10  # Number of classes for CIFAR-10
epochs = 50  # Number of training epochs
batch_size = 128  # Batch size for DataLoader
width_factor = 0.36  # Width scaling factor for the model layers
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Use CUDA if available, else CPU
lr_values = [0.1]  # List of learning rates to try
# ============================================

# Function to format numbers for filenames
def format_number_filename(num):
    if abs(num) >= 1_000_000:
        return f'{int(num / 1_000_000)}M'  # Round to the nearest million
    elif abs(num) >= 1_000:
        return f'{int(num / 1_000)}k'  # Round to the nearest thousand
    else:
        return str(num)

# Function to format large numbers for readability
def format_number(num):
    if abs(num) >= 1_000_000:
        return f'{num / 1_000_000:.1f}M'  # Format in millions
    elif abs(num) >= 1_000:
        return f'{num / 1_000:.1f}k'  # Format in thousands
    else:
        return str(num)

# Function to round numbers to the nearest significant digit
def round_significant(x, digits=2):
    if x == 0:
        return 0
    else:
        return round(x, -int(math.floor(math.log10(abs(x))) - (digits - 1)))

# Function to calculate FLOPs using ONNX
def calculate_flops_onnx(model):
    # Generate a dummy input for the model with CIFAR-10 dimensions (3x32x32)
    dummy_input = torch.randn(1, 3, 32, 32).to(device)
    
    # Paths to save the ONNX model and profile
    onnx_path = "tmp.onnx"
    profile_path = "profile.txt"
    
    # Export the PyTorch model to ONNX format
    torch.onnx.export(model,
                      dummy_input,
                      onnx_path,
                      export_params=True,
                      opset_version=12,
                      do_constant_folding=True,
                      input_names=['input'],
                      output_names=['output'],
                      dynamic_axes=None)
    
    # Profile the ONNX model to calculate the number of MACs
    onnx_tool.model_profile(onnx_path, save_profile=profile_path)
    
    # Read and parse the profile to extract total MACs
    with open(profile_path, 'r') as file:
        profile = file.read()
    
    # Use regex to find the total MACs in the profile
    match = re.search(r'Total\s+_\s+([\d,]+)\s+100%', profile)
    
    if match:
        total_macs = match.group(1)
        total_macs = int(total_macs.replace(',', ''))  # Remove commas for calculation
        total_macs = round_significant(total_macs)
        return total_macs
    else:
        return None

# ECABlock class that adds channel-wise attention to the model
class ECABlock(nn.Module):
    def __init__(self, channels, gamma=4, b=24):
        super(ECABlock, self).__init__()
        
        # Calculate kernel size based on input channel size
        t = int(abs((math.log(channels, 2) + b) / gamma))
        kernel_size = t if t % 2 else t + 1
        
        # Define average pooling and 1D convolution
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Apply global average pooling and convolution to calculate channel-wise attention
        y = self.avg_pool(x)
        y = y.squeeze(-1).transpose(-1, -2)
        y = self.conv(y)
        y = self.sigmoid(y)
        y = y.transpose(-1, -2).unsqueeze(-1)
        
        return x * y.expand_as(x)  # Element-wise multiplication for channel attention

# InvertedResidual block that can optionally use ECABlock for attention
class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio, use_eca=False):
        super(InvertedResidual, self).__init__()
        
        # Hidden dimension after expansion
        hidden_dim = int(inp * expand_ratio)
        
        # Check if residual connection is applicable
        self.use_res_connect = (stride == 1 and inp == oup)

        # Build layers: expansion, depthwise convolution, pointwise convolution
        layers = []
        if expand_ratio != 1:
            layers.extend([nn.Conv2d(inp, hidden_dim, 1, bias=False),
                           nn.BatchNorm2d(hidden_dim),
                           nn.GELU()])  # Use GELU activation
        
        # Add depthwise convolution and batch norm
        layers.extend([nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                       nn.BatchNorm2d(hidden_dim),
                       nn.GELU()])
        
        # Optionally add ECABlock for attention
        if use_eca:
            layers.append(ECABlock(hidden_dim))

        # Add final pointwise convolution
        layers.extend([nn.Conv2d(hidden_dim, oup, 1, bias=False),
                       nn.BatchNorm2d(oup)])

        self.conv = nn.Sequential(*layers)  # Define the sequential model

    def forward(self, x):
        # Forward pass through convolution layers
        out = self.conv(x)
        
        # Add residual connection if applicable
        if self.use_res_connect:
            return x + out
        else:
            return out


# Define the MobileNetECA architecture
#This class takes as input a  block_settings.yml which contains a list of different block settings, trains each block setting with a learning rate of 0.1, and saves the trained models.
#But  now  learning rate is set to 0.1, trained, and later on  re-trained with different learning rates.
class MobileNetECA(nn.Module):
    def __init__(self, num_classes=10, width_mult=0.2, block_settings=None):
        super(MobileNetECA, self).__init__()

        # Default block_settings if not provided
        if block_settings is None:
            block_settings = [
                [2, 24, 2, 1, True],  # Block 1
                [4, 24, 3, 2, True],  # Block 2
                [8, 36, 3, 2, True],  # Block 3
                [8, 44, 3, 1, True],  # Block 4
            ]

        
        # Calculate input and output channel sizes based on width factor
        input_channel = max(int(36 * width_mult), 8)
        last_channel = max(int(144 * width_mult), 8)

        # First convolution layer
        self.features = [nn.Sequential(
            nn.Conv2d(3, input_channel, 3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(input_channel),
            nn.GELU()
        )]

        # Add inverted residual blocks
        for idx, (t, c, n, s, use_eca) in enumerate(block_settings):
            output_channel = max(int(c * width_mult), 8)
            for i in range(n):
                stride = s if i == 0 else 1  # First layer in block may have stride > 1
                self.features.append(InvertedResidual(input_channel, output_channel, stride, expand_ratio=t, use_eca=use_eca))
                input_channel = output_channel

        # Final convolution layer
        self.features.append(nn.Sequential(
            nn.Conv2d(input_channel, last_channel, 1, bias=False),
            nn.BatchNorm2d(last_channel),
            nn.GELU(),
            nn.AdaptiveAvgPool2d(1)
        ))

        self.features = nn.Sequential(*self.features)  # Combine all layers

        # Final classifier layer
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Linear(last_channel, num_classes)

        # Initialize weights
        self._initialize_weights()

    def forward(self, x):
        # Forward pass through feature extractor and classifier
        x = self.features(x)
        x = self.pool(x).flatten(1)
        x = self.classifier(x)
        return x

    # Function to initialize model weights
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)

# Data augmentation and normalization for CIFAR-10 training dataset
train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(),  # Random horizontal flip
    transforms.RandomCrop(32, padding=4),  # Random crop with padding
    transforms.ToTensor(),  # Convert image to tensor
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))  # Normalize based on dataset statistics
])

# Normalization for test data
test_transforms = transforms.Compose([
    transforms.ToTensor(),  # Convert image to tensor
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))  # Normalize based on dataset statistics
])

# Load CIFAR-10 training and test datasets
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transforms)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)

test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transforms)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

# Load block_settings from external YAML file
with open('block_settings.yaml', 'r') as f:
    block_settings_dict = yaml.safe_load(f)



# Function to train and evaluate the model with each learning rate
for block_settings_name, block_settings in block_settings_dict.items():
    for lr in lr_values:
        # Create the MobileNetECA model with the provided block_settings and move it to the appropriate device
        model = MobileNetECA(num_classes=num_classes, width_mult=width_factor, block_settings=block_settings).to(device)
        
        # Calculate the number of parameters and FLOPs of the model
        params = sum(p.numel() for p in model.parameters())
        macs = calculate_flops_onnx(model)
        

        formatted_params = format_number(params)
        formatted_macs = format_number(macs)
        print(f"Total number of parameters for lr={lr}: {formatted_params}")
        
        # Set up the optimizer (SGD) and learning rate scheduler (Cosine Annealing)
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=3e-4)
        scheduler = CosineAnnealingLR(optimizer, T_max=epochs)

        # Loss function: Cross-Entropy Loss
        criterion = nn.CrossEntropyLoss()

        # Function to train for one epoch
        def train():
            model.train()  # Set model to training mode
            correct = 0
            total = 0

            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()  # Reset gradients
                outputs = model(inputs)  # Forward pass
                loss = criterion(outputs, targets)  # Calculate loss
                loss.backward()  # Backpropagation
                nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)  # Gradient clipping
                optimizer.step()  # Update weights

                _, predicted = outputs.max(1)  # Get predicted class
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

            accuracy = 100. * correct / total  # Calculate accuracy
            return accuracy

        # Function to validate on the test set
        def validate():
            model.eval()  # Set model to evaluation mode
            correct = 0
            total = 0

            with torch.no_grad():
                for inputs, targets in test_loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    outputs = model(inputs)  # Forward pass
                    loss = criterion(outputs, targets)  # Calculate loss

                    _, predicted = outputs.max(1)  # Get predicted class
                    total += targets.size(0)
                    correct += predicted.eq(targets).sum().item()

            accuracy = 100. * correct / total  # Calculate accuracy
            return accuracy

        # Calculate and print parameters and MACs
        print(f"------ Rounded Parameters for lr={lr} ------")
        params = sum(param.numel() for param in model.parameters())  # Total number of parameters
        params = round_significant(params)
        macs = calculate_flops_onnx(model)  # Calculate FLOPs using ONNX
        formatted_params = format_number(params)
        formatted_macs = format_number(macs)
        print(f"Params: {formatted_params}  MACS: {formatted_macs}")

        # Training loop for multiple epochs
        for epoch in range(epochs):
            acc_train = train()  # Train for one epoch
            acc_valid = validate()  # Validate on the test set
            scheduler.step()  # Update learning rate
            print(f'Epoch {epoch+1} - Training Accuracy: {acc_train:.2f}% - Validation Accuracy: {acc_valid:.2f}%')

        # Format parameters, MACs, accuracy, and learning rate for saving the model
        params_str = format_number_filename(params)
        macs_str = format_number_filename(macs)
        acc_str = f"{acc_valid:.1f}".replace('.', '_')  # Format accuracy like 84.4% -> 84_4
        lr_str = f"{lr:.2f}".replace('.', '_').rstrip('0').rstrip('_')  # Format learning rate

        # Add the block_settings name to the filename
        block_name_str = re.sub(r'\W+', '_', block_settings_name)

        # Save the trained model using TorchScript with a formatted filename
        model_save_dir = '/kaggle/working/'
        os.makedirs(model_save_dir, exist_ok=True)
        model_path = os.path.join(model_save_dir, f'{block_name_str}_{params_str}_{macs_str}_{acc_str}_{lr_str}.pt')
        scripted_model = torch.jit.script(model)
        scripted_model.save(model_path)
        print(f"Model saved as '{model_path}'")


Files already downloaded and verified
Files already downloaded and verified


FileNotFoundError: [Errno 2] No such file or directory: 'block_settings.yaml'

### 97.5

In [7]:
!pip install py7zr

Collecting py7zr
  Downloading py7zr-0.22.0-py3-none-any.whl.metadata (16 kB)
Collecting pycryptodomex>=3.16.0 (from py7zr)
  Downloading pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting pyzstd>=0.15.9 (from py7zr)
  Downloading pyzstd-0.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.4 kB)
Collecting pyppmd<1.2.0,>=1.1.0 (from py7zr)
  Downloading pyppmd-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.7 kB)
Collecting pybcj<1.1.0,>=1.0.0 (from py7zr)
  Downloading pybcj-1.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting multivolumefile>=0.2.3 (from py7zr)
  Downloading multivolumefile-0.2.3-py3-none-any.whl.metadata (6.3 kB)
Collecting inflate64<1.1.0,>=1.0.0 (from py7zr)
  Downloading inflate64-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Downloading py7zr-0.22.0-py3-none-any.whl (67 kB)
[2

In [2]:
import os
import re
import math
import random
import yaml
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import numpy as np
from PIL import Image
import py7zr
import pandas as pd
from io import BytesIO
import torch.optim.lr_scheduler as lr_scheduler

# 1. Print all the files in the input directory to verify paths
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/cifar-10/trainLabels.csv
/kaggle/input/cifar-10/sampleSubmission.csv
/kaggle/input/cifar-10/test.7z
/kaggle/input/cifar-10/train.7z


In [3]:
import py7zr
import os

# Extract train and test images
with py7zr.SevenZipFile('/kaggle/input/cifar-10/train.7z', mode='r') as z:
    z.extractall(path='/kaggle/working/train_images')
    
with py7zr.SevenZipFile('/kaggle/input/cifar-10/test.7z', mode='r') as z:
    z.extractall(path='/kaggle/working/test_images')

In [6]:
class CutOut(object):
    def __init__(self, n_holes, length):
        self.n_holes = n_holes
        self.length = length

    def __call__(self, img):
        h, w = img.size(1), img.size(2)
        mask = np.ones((h, w), np.float32)

        for n in range(self.n_holes):
            y = random.randint(0, h)
            x = random.randint(0, w)

            y1 = np.clip(y - self.length // 2, 0, h)
            y2 = np.clip(y + self.length // 2, 0, h)
            x1 = np.clip(x - self.length // 2, 0, w)
            x2 = np.clip(x + self.length // 2, 0, w)

            mask[y1:y2, x1:x2] = 0.

        mask = torch.from_numpy(mask).expand_as(img)
        img = img * mask
        return img

In [5]:
# import py7zr
# import os

# # Extract train and test images
# with py7zr.SevenZipFile('/kaggle/input/cifar-10/train.7z', mode='r') as z:
#     z.extractall(path='/kaggle/working/train_images')
    
# with py7zr.SevenZipFile('/kaggle/input/cifar-10/test.7z', mode='r') as z:
#     z.extractall(path='/kaggle/working/test_images')# Load the train labels
train_labels_df = pd.read_csv('/kaggle/input/cifar-10/trainLabels.csv')

# Map image filenames with labels
train_filenames = [os.path.join('/kaggle/working/train_images/train', f"{i}.png") for i in train_labels_df['id']]
train_labels = train_labels_df['label'].values

# CIFAR-10 classes (convert labels from string to index)
classes = ('airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
label_to_index = {label: idx for idx, label in enumerate(classes)}
train_labels = [label_to_index[label] for label in train_labels]

# Data Augmentation with CutOut (RandomErasing) for training data
# train_transform = transforms.Compose([
#     transforms.RandomHorizontalFlip(),
#     transforms.RandomCrop(32, padding=4),
#     transforms.ToTensor(),
#     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
#     RandomErasing(scale=(0.02, 0.33))
# ])

# # Define a simpler transform for test data
# test_transform = transforms.Compose([
#     transforms.ToTensor(),
#     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
# ])

train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1)),  # Rotate, scale, shift
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
    CutOut(n_holes=1, length=8),
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])


In [6]:
# Create a custom dataset class for the training data
class CIFAR10CustomDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path)
        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, label

# Create the dataset and dataloader for training data
train_dataset = CIFAR10CustomDataset(train_filenames, train_labels, transform=train_transform)
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True, num_workers=2)

In [7]:
# Define the Residual Block and ResNet Model (same as before)
# class ResidualBlock(nn.Module):
#     def __init__(self, in_channels, out_channels, stride=1, downsample=None):
#         super(ResidualBlock, self).__init__()
#         self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
#         self.bn1 = nn.BatchNorm2d(out_channels)
#         self.relu = nn.ReLU(inplace=True)
#         self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False)
#         self.bn2 = nn.BatchNorm2d(out_channels)
#         self.downsample = downsample

#     def forward(self, x):
#         residual = x
#         out = self.conv1(x)
#         out = self.bn1(out)
#         out = self.relu(out)
#         out = self.conv2(out)
#         out = self.bn2(out)
#         if self.downsample:
#             residual = self.downsample(x)
#         out += residual
#         out = self.relu(out)
#         return out

# class ResNet(nn.Module):
#     def __init__(self, block, layers, num_classes=10):
#         super(ResNet, self).__init__()
#         self.in_channels = 64
#         self.conv = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
#         self.bn = nn.BatchNorm2d(64)
#         self.relu = nn.ReLU(inplace=True)
#         self.layer1 = self.make_layer(block, 64, layers[0])
#         self.layer2 = self.make_layer(block, 128, layers[1], stride=2)
#         self.layer3 = self.make_layer(block, 256, layers[2], stride=2)
#         self.layer4 = self.make_layer(block, 512, layers[3], stride=2)
#         self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
#         self.fc = nn.Linear(512, num_classes)

#     def make_layer(self, block, out_channels, blocks, stride=1):
#         downsample = None
#         if stride != 1 or self.in_channels != out_channels:
#             downsample = nn.Sequential(
#                 nn.Conv2d(self.in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
#                 nn.BatchNorm2d(out_channels)
#             )
#         layers = []
#         layers.append(block(self.in_channels, out_channels, stride, downsample))
#         self.in_channels = out_channels
#         for _ in range(1, blocks):
#             layers.append(block(out_channels, out_channels))
#         return nn.Sequential(*layers)

#     def forward(self, x):
#         x = self.conv(x)
#         x = self.bn(x)
#         x = self.relu(x)
#         x = self.layer1(x)
#         x = self.layer2(x)
#         x = self.layer3(x)
#         x = self.layer4(x)
#         x = self.avg_pool(x)
#         x = torch.flatten(x, 1)
#         x = self.fc(x)
#         return x

class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=10, width_mult=1.0):
        super(ResNet, self).__init__()
        self.in_channels = int(64 * width_mult)  # Crucial: Use int()
        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self.make_layer(block, int(64 * width_mult), layers[0], stride=1)
        self.layer2 = self.make_layer(block, int(128 * width_mult), layers[1], stride=2)
        self.layer3 = self.make_layer(block, int(256 * width_mult), layers[2], stride=2)
        self.layer4 = self.make_layer(block, int(512 * width_mult), layers[3], stride=2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(int(512 * width_mult), num_classes)  # Correct output

    def make_layer(self, block, out_channels, blocks, stride=1):
        downsample = None
        if stride != 1 or self.in_channels != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )
        layers = []
        layers.append(block(self.in_channels, out_channels, stride, downsample))
        self.in_channels = out_channels
        for _ in range(1, blocks):
            layers.append(block(out_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avg_pool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

# Initialize model, loss function, optimizer, and learning rate scheduler
# model = ResNet(ResidualBlock, [2, 2, 2, 2]).cuda()
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ResNet(ResidualBlock, [2, 2, 2, 2], width_mult=0.5).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=3e-4)
scheduler = lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.1,  # Adjust max_lr
    steps_per_epoch=len(train_loader),
    epochs=50,
    pct_start=0.3,
    anneal_strategy='cos',
    div_factor=25.0
)

In [8]:
# Cosine Annealing with Warmup
def warmup_cosine_lr_scheduler(optimizer, warmup_iters, max_iters):
    def lr_lambda(current_iter):
        if current_iter < warmup_iters:
            return float(current_iter) / float(warmup_iters)  # Warmup phase
        else:
            # Cosine annealing phase
            return 0.5 * (1 + math.cos(float(current_iter - warmup_iters) / float(max_iters - warmup_iters) * math.pi))
    return LambdaLR(optimizer, lr_lambda)

# Training and Testing Functions (same as before)
def train(model, train_loader, criterion, optimizer, scheduler):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for images, labels in tqdm(train_loader):
        images, labels = images.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
    scheduler.step()
    print(f"Train Loss: {running_loss/len(train_loader)}, Train Accuracy: {100 * correct/total:.2f}%")

def test(model, test_loader, criterion):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in tqdm(test_loader):
            images, labels = images.cuda(), labels.cuda()
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
    print(f"Test Loss: {running_loss/len(test_loader)}, Test Accuracy: {100 * correct/total:.2f}%")
    return 100 * correct/total

# Main training loop
max_iters = 50
warmup_iters = 20  # Warmup phase epochs
# scheduler = warmup_cosine_lr_scheduler(optimizer, warmup_iters, max_iters)

best_acc = 0

for epoch in range(max_iters):
    print(f"Epoch {epoch+1}/{max_iters}")
    train(model, train_loader, criterion, optimizer, scheduler)
    acc = test(model, train_loader, criterion)
    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), "best_new_model.pth")

Epoch 1/50


 78%|███████▊  | 305/391 [00:22<00:06, 13.70it/s]


KeyboardInterrupt: 

### UPDATE POE WITH PREIVOUS code

In [9]:
import os
import random
import numpy as np
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import LambdaLR
from tqdm import tqdm

# Custom CutOut augmentation
class CutOut(object):
    def __init__(self, n_holes, length):
        self.n_holes = n_holes
        self.length = length

    def __call__(self, img):
        h, w = img.size(1), img.size(2)
        mask = np.ones((h, w), np.float32)

        for n in range(self.n_holes):
            y = random.randint(0, h - 1)
            x = random.randint(0, w - 1)

            y1 = np.clip(y - self.length // 2, 0, h)
            y2 = np.clip(y + self.length // 2, 0, h)
            x1 = np.clip(x - self.length // 2, 0, w)
            x2 = np.clip(x + self.length // 2, 0, w)

            mask[y1:y2, x1:x2] = 0.

        mask = torch.from_numpy(mask).expand_as(img)
        img = img * mask
        return img

# Data transformations
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1)),  # Rotate, scale, shift
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
    CutOut(n_holes=1, length=8),
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

# Load CIFAR-10 dataset using torchvision
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transform)

train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True, num_workers=4)
test_loader = DataLoader(dataset=test_dataset, batch_size=128, shuffle=False, num_workers=4)

# Define the Residual Block and ResNet classes (as defined in your original code)
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out

class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=10, width_mult=1.0):
        super(ResNet, self).__init__()
        self.in_channels = int(64 * width_mult)
        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self.make_layer(block, int(64 * width_mult), layers[0], stride=1)
        self.layer2 = self.make_layer(block, int(128 * width_mult), layers[1], stride=2)
        self.layer3 = self.make_layer(block, int(256 * width_mult), layers[2], stride=2)
        self.layer4 = self.make_layer(block, int(512 * width_mult), layers[3], stride=2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(int(512 * width_mult), num_classes)

    def make_layer(self, block, out_channels, blocks, stride=1):
        downsample = None
        if stride != 1 or self.in_channels != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )
        layers = []
        layers.append(block(self.in_channels, out_channels, stride, downsample))
        self.in_channels = out_channels
        for _ in range(1, blocks):
            layers.append(block(out_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avg_pool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

# Initialize model, loss function, optimizer, and learning rate scheduler
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ResNet(ResidualBlock, [2, 2, 2, 2], width_mult=0.5).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=3e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)

# Training and Testing Functions
def train(model, train_loader, criterion, optimizer, scheduler):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for images, labels in tqdm(train_loader):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
    scheduler.step()
    print(f"Train Loss: {running_loss/len(train_loader):.4f}, Train Accuracy: {100 * correct/total:.2f}%")

def test(model, test_loader, criterion):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in tqdm(test_loader):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
    print(f"Test Loss: {running_loss/len(test_loader):.4f}, Test Accuracy: {100 * correct/total:.2f}%")
    return 100 * correct / total

# Main training loop
best_acc = 0
for epoch in range(60):  # 50 epochs
    print(f"Epoch {epoch+1}/50")
    train(model, train_loader, criterion, optimizer, scheduler)
    acc = test(model, test_loader, criterion)
    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), "best_new_model60.pth")

Files already downloaded and verified
Files already downloaded and verified
Epoch 1/50


100%|██████████| 391/391 [00:19<00:00, 19.72it/s]


Train Loss: 1.7778, Train Accuracy: 33.98%


100%|██████████| 79/79 [00:01<00:00, 53.15it/s]


Test Loss: 1.6130, Test Accuracy: 40.39%
Epoch 2/50


100%|██████████| 391/391 [00:20<00:00, 19.36it/s]


Train Loss: 1.4185, Train Accuracy: 47.84%


100%|██████████| 79/79 [00:01<00:00, 54.26it/s]


Test Loss: 1.2092, Test Accuracy: 57.49%
Epoch 3/50


100%|██████████| 391/391 [00:20<00:00, 19.30it/s]


Train Loss: 1.1706, Train Accuracy: 58.14%


100%|██████████| 79/79 [00:01<00:00, 53.76it/s]


Test Loss: 1.0593, Test Accuracy: 61.23%
Epoch 4/50


100%|██████████| 391/391 [00:20<00:00, 18.94it/s]


Train Loss: 1.0230, Train Accuracy: 63.73%


100%|██████████| 79/79 [00:01<00:00, 54.28it/s]


Test Loss: 0.9470, Test Accuracy: 66.10%
Epoch 5/50


100%|██████████| 391/391 [00:21<00:00, 18.61it/s]


Train Loss: 0.9133, Train Accuracy: 67.52%


100%|██████████| 79/79 [00:01<00:00, 53.28it/s]


Test Loss: 0.8820, Test Accuracy: 68.76%
Epoch 6/50


100%|██████████| 391/391 [00:21<00:00, 18.27it/s]


Train Loss: 0.8280, Train Accuracy: 70.91%


100%|██████████| 79/79 [00:01<00:00, 46.26it/s]


Test Loss: 0.8081, Test Accuracy: 72.58%
Epoch 7/50


100%|██████████| 391/391 [00:21<00:00, 18.55it/s]


Train Loss: 0.7682, Train Accuracy: 72.95%


100%|██████████| 79/79 [00:01<00:00, 53.37it/s]


Test Loss: 0.7462, Test Accuracy: 74.50%
Epoch 8/50


100%|██████████| 391/391 [00:21<00:00, 18.56it/s]


Train Loss: 0.7256, Train Accuracy: 74.61%


100%|██████████| 79/79 [00:01<00:00, 54.48it/s]


Test Loss: 0.6368, Test Accuracy: 78.26%
Epoch 9/50


100%|██████████| 391/391 [00:21<00:00, 18.36it/s]


Train Loss: 0.6942, Train Accuracy: 75.69%


100%|██████████| 79/79 [00:01<00:00, 51.59it/s]


Test Loss: 0.6191, Test Accuracy: 78.58%
Epoch 10/50


100%|██████████| 391/391 [00:21<00:00, 18.45it/s]


Train Loss: 0.6606, Train Accuracy: 77.09%


100%|██████████| 79/79 [00:01<00:00, 52.98it/s]


Test Loss: 0.6570, Test Accuracy: 78.34%
Epoch 11/50


100%|██████████| 391/391 [00:21<00:00, 18.56it/s]


Train Loss: 0.6308, Train Accuracy: 77.95%


100%|██████████| 79/79 [00:01<00:00, 54.93it/s]


Test Loss: 0.5799, Test Accuracy: 80.54%
Epoch 12/50


100%|██████████| 391/391 [00:21<00:00, 18.49it/s]


Train Loss: 0.6151, Train Accuracy: 78.56%


100%|██████████| 79/79 [00:01<00:00, 53.10it/s]


Test Loss: 0.6471, Test Accuracy: 78.50%
Epoch 13/50


100%|██████████| 391/391 [00:21<00:00, 18.58it/s]


Train Loss: 0.5972, Train Accuracy: 79.32%


100%|██████████| 79/79 [00:01<00:00, 53.58it/s]


Test Loss: 0.5679, Test Accuracy: 81.17%
Epoch 14/50


100%|██████████| 391/391 [00:21<00:00, 18.40it/s]


Train Loss: 0.5757, Train Accuracy: 79.91%


100%|██████████| 79/79 [00:01<00:00, 54.14it/s]


Test Loss: 0.5475, Test Accuracy: 80.64%
Epoch 15/50


100%|██████████| 391/391 [00:21<00:00, 18.44it/s]


Train Loss: 0.5617, Train Accuracy: 80.50%


100%|██████████| 79/79 [00:01<00:00, 53.23it/s]


Test Loss: 0.5353, Test Accuracy: 81.84%
Epoch 16/50


100%|██████████| 391/391 [00:21<00:00, 18.38it/s]


Train Loss: 0.5461, Train Accuracy: 80.97%


100%|██████████| 79/79 [00:01<00:00, 53.00it/s]


Test Loss: 0.5263, Test Accuracy: 82.32%
Epoch 17/50


100%|██████████| 391/391 [00:21<00:00, 18.46it/s]


Train Loss: 0.5292, Train Accuracy: 81.50%


100%|██████████| 79/79 [00:01<00:00, 53.21it/s]


Test Loss: 0.4664, Test Accuracy: 84.12%
Epoch 18/50


100%|██████████| 391/391 [00:21<00:00, 18.46it/s]


Train Loss: 0.5165, Train Accuracy: 82.22%


100%|██████████| 79/79 [00:01<00:00, 54.17it/s]


Test Loss: 0.5014, Test Accuracy: 83.59%
Epoch 19/50


100%|██████████| 391/391 [00:21<00:00, 18.48it/s]


Train Loss: 0.5014, Train Accuracy: 82.61%


100%|██████████| 79/79 [00:01<00:00, 54.22it/s]


Test Loss: 0.5328, Test Accuracy: 82.66%
Epoch 20/50


100%|██████████| 391/391 [00:21<00:00, 18.58it/s]


Train Loss: 0.4936, Train Accuracy: 82.91%


100%|██████████| 79/79 [00:01<00:00, 52.21it/s]


Test Loss: 0.4956, Test Accuracy: 83.22%
Epoch 21/50


100%|██████████| 391/391 [00:21<00:00, 18.48it/s]


Train Loss: 0.4783, Train Accuracy: 83.54%


100%|██████████| 79/79 [00:01<00:00, 52.93it/s]


Test Loss: 0.4605, Test Accuracy: 84.53%
Epoch 22/50


100%|██████████| 391/391 [00:21<00:00, 18.49it/s]


Train Loss: 0.4688, Train Accuracy: 83.84%


100%|██████████| 79/79 [00:01<00:00, 53.99it/s]


Test Loss: 0.4544, Test Accuracy: 84.65%
Epoch 23/50


100%|██████████| 391/391 [00:21<00:00, 18.54it/s]


Train Loss: 0.4532, Train Accuracy: 84.26%


100%|██████████| 79/79 [00:01<00:00, 45.10it/s]


Test Loss: 0.5264, Test Accuracy: 82.95%
Epoch 24/50


100%|██████████| 391/391 [00:21<00:00, 18.39it/s]


Train Loss: 0.4392, Train Accuracy: 84.69%


100%|██████████| 79/79 [00:01<00:00, 53.73it/s]


Test Loss: 0.4119, Test Accuracy: 86.15%
Epoch 25/50


100%|██████████| 391/391 [00:21<00:00, 18.45it/s]


Train Loss: 0.4261, Train Accuracy: 84.94%


100%|██████████| 79/79 [00:01<00:00, 54.00it/s]


Test Loss: 0.4049, Test Accuracy: 86.33%
Epoch 26/50


100%|██████████| 391/391 [00:21<00:00, 18.47it/s]


Train Loss: 0.4159, Train Accuracy: 85.44%


100%|██████████| 79/79 [00:01<00:00, 53.62it/s]


Test Loss: 0.4556, Test Accuracy: 84.62%
Epoch 27/50


100%|██████████| 391/391 [00:21<00:00, 18.45it/s]


Train Loss: 0.4030, Train Accuracy: 86.02%


100%|██████████| 79/79 [00:01<00:00, 53.14it/s]


Test Loss: 0.4187, Test Accuracy: 86.14%
Epoch 28/50


100%|██████████| 391/391 [00:21<00:00, 18.40it/s]


Train Loss: 0.3920, Train Accuracy: 86.33%


100%|██████████| 79/79 [00:01<00:00, 54.52it/s]


Test Loss: 0.4096, Test Accuracy: 86.30%
Epoch 29/50


100%|██████████| 391/391 [00:21<00:00, 18.43it/s]


Train Loss: 0.3754, Train Accuracy: 86.89%


100%|██████████| 79/79 [00:01<00:00, 51.56it/s]


Test Loss: 0.4467, Test Accuracy: 85.17%
Epoch 30/50


100%|██████████| 391/391 [00:21<00:00, 18.53it/s]


Train Loss: 0.3643, Train Accuracy: 87.33%


100%|██████████| 79/79 [00:01<00:00, 54.66it/s]


Test Loss: 0.3379, Test Accuracy: 88.60%
Epoch 31/50


100%|██████████| 391/391 [00:21<00:00, 18.47it/s]


Train Loss: 0.3491, Train Accuracy: 87.81%


100%|██████████| 79/79 [00:01<00:00, 53.14it/s]


Test Loss: 0.3382, Test Accuracy: 88.77%
Epoch 32/50


100%|██████████| 391/391 [00:21<00:00, 18.47it/s]


Train Loss: 0.3347, Train Accuracy: 88.30%


100%|██████████| 79/79 [00:01<00:00, 54.93it/s]


Test Loss: 0.3308, Test Accuracy: 89.08%
Epoch 33/50


100%|██████████| 391/391 [00:21<00:00, 18.46it/s]


Train Loss: 0.3237, Train Accuracy: 88.68%


100%|██████████| 79/79 [00:01<00:00, 50.60it/s]


Test Loss: 0.3181, Test Accuracy: 89.41%
Epoch 34/50


100%|██████████| 391/391 [00:21<00:00, 18.50it/s]


Train Loss: 0.3125, Train Accuracy: 89.14%


100%|██████████| 79/79 [00:01<00:00, 54.28it/s]


Test Loss: 0.3286, Test Accuracy: 89.19%
Epoch 35/50


100%|██████████| 391/391 [00:21<00:00, 18.43it/s]


Train Loss: 0.2965, Train Accuracy: 89.59%


100%|██████████| 79/79 [00:01<00:00, 51.49it/s]


Test Loss: 0.3320, Test Accuracy: 88.89%
Epoch 36/50


100%|██████████| 391/391 [00:21<00:00, 18.51it/s]


Train Loss: 0.2827, Train Accuracy: 89.98%


100%|██████████| 79/79 [00:01<00:00, 53.43it/s]


Test Loss: 0.2843, Test Accuracy: 90.42%
Epoch 37/50


100%|██████████| 391/391 [00:21<00:00, 18.54it/s]


Train Loss: 0.2641, Train Accuracy: 90.79%


100%|██████████| 79/79 [00:01<00:00, 53.75it/s]


Test Loss: 0.2970, Test Accuracy: 90.14%
Epoch 38/50


100%|██████████| 391/391 [00:21<00:00, 18.50it/s]


Train Loss: 0.2486, Train Accuracy: 91.23%


100%|██████████| 79/79 [00:01<00:00, 54.49it/s]


Test Loss: 0.2862, Test Accuracy: 90.81%
Epoch 39/50


100%|██████████| 391/391 [00:21<00:00, 18.58it/s]


Train Loss: 0.2379, Train Accuracy: 91.56%


100%|██████████| 79/79 [00:01<00:00, 52.79it/s]


Test Loss: 0.2718, Test Accuracy: 90.97%
Epoch 40/50


100%|██████████| 391/391 [00:21<00:00, 18.57it/s]


Train Loss: 0.2221, Train Accuracy: 92.24%


100%|██████████| 79/79 [00:01<00:00, 52.19it/s]


Test Loss: 0.2629, Test Accuracy: 91.49%
Epoch 41/50


100%|██████████| 391/391 [00:21<00:00, 18.38it/s]


Train Loss: 0.2108, Train Accuracy: 92.55%


100%|██████████| 79/79 [00:01<00:00, 53.48it/s]


Test Loss: 0.2530, Test Accuracy: 91.76%
Epoch 42/50


100%|██████████| 391/391 [00:21<00:00, 18.38it/s]


Train Loss: 0.1959, Train Accuracy: 93.06%


100%|██████████| 79/79 [00:01<00:00, 53.80it/s]


Test Loss: 0.2485, Test Accuracy: 91.98%
Epoch 43/50


100%|██████████| 391/391 [00:21<00:00, 18.43it/s]


Train Loss: 0.1845, Train Accuracy: 93.46%


100%|██████████| 79/79 [00:01<00:00, 53.95it/s]


Test Loss: 0.2423, Test Accuracy: 92.30%
Epoch 44/50


100%|██████████| 391/391 [00:21<00:00, 18.58it/s]


Train Loss: 0.1708, Train Accuracy: 94.04%


100%|██████████| 79/79 [00:01<00:00, 53.51it/s]


Test Loss: 0.2413, Test Accuracy: 92.28%
Epoch 45/50


100%|██████████| 391/391 [00:21<00:00, 18.54it/s]


Train Loss: 0.1599, Train Accuracy: 94.42%


100%|██████████| 79/79 [00:01<00:00, 54.45it/s]


Test Loss: 0.2341, Test Accuracy: 92.67%
Epoch 46/50


100%|██████████| 391/391 [00:21<00:00, 18.53it/s]


Train Loss: 0.1553, Train Accuracy: 94.51%


100%|██████████| 79/79 [00:01<00:00, 54.54it/s]


Test Loss: 0.2299, Test Accuracy: 92.84%
Epoch 47/50


100%|██████████| 391/391 [00:21<00:00, 18.50it/s]


Train Loss: 0.1480, Train Accuracy: 94.74%


100%|██████████| 79/79 [00:01<00:00, 53.91it/s]


Test Loss: 0.2265, Test Accuracy: 92.81%
Epoch 48/50


100%|██████████| 391/391 [00:21<00:00, 18.45it/s]


Train Loss: 0.1401, Train Accuracy: 95.08%


100%|██████████| 79/79 [00:01<00:00, 54.40it/s]


Test Loss: 0.2268, Test Accuracy: 92.71%
Epoch 49/50


100%|██████████| 391/391 [00:21<00:00, 18.44it/s]


Train Loss: 0.1403, Train Accuracy: 95.21%


100%|██████████| 79/79 [00:01<00:00, 54.08it/s]


Test Loss: 0.2246, Test Accuracy: 92.80%
Epoch 50/50


100%|██████████| 391/391 [00:21<00:00, 18.48it/s]


Train Loss: 0.1408, Train Accuracy: 95.08%


100%|██████████| 79/79 [00:01<00:00, 42.89it/s]


Test Loss: 0.2249, Test Accuracy: 92.82%
Epoch 51/50


100%|██████████| 391/391 [00:21<00:00, 18.45it/s]


Train Loss: 0.1328, Train Accuracy: 95.35%


100%|██████████| 79/79 [00:01<00:00, 53.82it/s]


Test Loss: 0.2262, Test Accuracy: 92.77%
Epoch 52/50


100%|██████████| 391/391 [00:21<00:00, 18.42it/s]


Train Loss: 0.1374, Train Accuracy: 95.15%


100%|██████████| 79/79 [00:01<00:00, 53.01it/s]


Test Loss: 0.2235, Test Accuracy: 92.69%
Epoch 53/50


100%|██████████| 391/391 [00:21<00:00, 18.44it/s]


Train Loss: 0.1388, Train Accuracy: 95.15%


100%|██████████| 79/79 [00:01<00:00, 53.44it/s]


Test Loss: 0.2239, Test Accuracy: 92.81%
Epoch 54/50


100%|██████████| 391/391 [00:21<00:00, 18.49it/s]


Train Loss: 0.1391, Train Accuracy: 95.22%


100%|██████████| 79/79 [00:01<00:00, 54.24it/s]


Test Loss: 0.2257, Test Accuracy: 92.76%
Epoch 55/50


100%|██████████| 391/391 [00:21<00:00, 18.47it/s]


Train Loss: 0.1396, Train Accuracy: 95.13%


100%|██████████| 79/79 [00:01<00:00, 54.78it/s]


Test Loss: 0.2351, Test Accuracy: 92.68%
Epoch 56/50


100%|██████████| 391/391 [00:21<00:00, 18.46it/s]


Train Loss: 0.1452, Train Accuracy: 94.91%


100%|██████████| 79/79 [00:01<00:00, 53.74it/s]


Test Loss: 0.2308, Test Accuracy: 92.79%
Epoch 57/50


100%|██████████| 391/391 [00:21<00:00, 18.50it/s]


Train Loss: 0.1486, Train Accuracy: 94.74%


100%|██████████| 79/79 [00:01<00:00, 54.21it/s]


Test Loss: 0.2352, Test Accuracy: 92.43%
Epoch 58/50


100%|██████████| 391/391 [00:21<00:00, 18.47it/s]


Train Loss: 0.1519, Train Accuracy: 94.74%


100%|██████████| 79/79 [00:01<00:00, 53.29it/s]


Test Loss: 0.2450, Test Accuracy: 92.46%
Epoch 59/50


100%|██████████| 391/391 [00:21<00:00, 18.56it/s]


Train Loss: 0.1615, Train Accuracy: 94.27%


100%|██████████| 79/79 [00:01<00:00, 53.79it/s]


Test Loss: 0.2431, Test Accuracy: 92.50%
Epoch 60/50


100%|██████████| 391/391 [00:21<00:00, 18.47it/s]


Train Loss: 0.1724, Train Accuracy: 93.80%


100%|██████████| 79/79 [00:01<00:00, 49.48it/s]

Test Loss: 0.2568, Test Accuracy: 92.04%





In [10]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

Total number of parameters: 2797610


## 99

In [5]:
## poe
import os
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

# Set random seeds for reproducibility
random.seed(123)
np.random.seed(123)
tf.random.set_seed(1234)
os.environ['PYTHONHASHSEED'] = '0'

# Load CIFAR-10 dataset
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Define the new ResNet architecture
def build_model():
    input_layer = layers.Input(shape=(32, 32, 3))
    x = layers.Normalization()(input_layer)

    filters_l = [32, 64, 128, 128]
    
    # Initial convolution
    x = layers.Conv2D(filters=filters_l[0], kernel_size=(3, 3), kernel_initializer='he_uniform', activation='elu', padding='same')(x)

    for i, filters in enumerate(filters_l):
        res_con1 = x  
        x = layers.BatchNormalization()(x)
        x = layers.ELU()(x)
        x = layers.Conv2D(filters=filters, kernel_size=(3, 3), kernel_initializer='he_uniform', padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.ELU()(x)
        x = layers.Conv2D(filters=filters, kernel_size=(3, 3), kernel_initializer='he_uniform', padding='same')(x)
        x = layers.Add()([res_con1, x])

        # Additional residual connections
        for _ in range(2):
            res_con2 = x  
            x = layers.BatchNormalization()(x)
            x = layers.ELU()(x)
            x = layers.Conv2D(filters=filters, kernel_size=(3, 3), kernel_initializer='he_uniform', padding='same')(x)
            x = layers.BatchNormalization()(x)
            x = layers.ELU()(x)
            x = layers.Conv2D(filters=filters, kernel_size=(3, 3), kernel_initializer='he_uniform', padding='same')(x)
            x = layers.Add()([res_con1, res_con2, x])

        res_con = x
        downsample = i != len(filters_l) - 1
        
        if downsample:
            res_con = layers.BatchNormalization()(res_con)
            res_con = layers.ELU()(res_con)
            res_con = layers.Conv2D(filters=filters_l[i + 1], kernel_size=(3, 3), kernel_initializer='he_uniform', strides=2, padding='same')(res_con)

        x = layers.BatchNormalization()(x)
        x = layers.ELU()(x)
        x = layers.Conv2D(filters=filters, kernel_size=(3, 3), kernel_initializer='he_uniform', padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.ELU()(x)
        x = layers.Conv2D(filters=(filters_l[i + 1] if downsample else filters), kernel_size=(3, 3), kernel_initializer='he_uniform', strides=(2 if downsample else 1), padding='same')(x)
        x = layers.Add()([res_con, x])
    
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(10, activation='softmax')(x)
    
    model = models.Model(inputs=input_layer, outputs=x)
    model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
                  loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
                  metrics=['accuracy'])
    return model

# Initialize model
model = build_model()

# Training parameters
batch_size = 128
epochs = 100

# Training loop
best_acc = 0
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    model.fit(x_train, y_train, batch_size=batch_size, epochs=1, verbose=1, validation_data=(x_test, y_test))  # Train for one epoch at a time

    # Evaluate the model
    test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc * 100:.2f}%")

    # Save the best model
    if test_acc > best_acc:
        best_acc = test_acc
        model.save("test_model_poe3.h5")

Epoch 1/100
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 110ms/step - accuracy: 0.3719 - loss: 2.6425 - val_accuracy: 0.5438 - val_loss: 1.5640
Test Loss: 1.5640, Test Accuracy: 54.38%
Epoch 2/100
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 81ms/step - accuracy: 0.6238 - loss: 1.3693 - val_accuracy: 0.6649 - val_loss: 1.3124
Test Loss: 1.3124, Test Accuracy: 66.49%
Epoch 3/100
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 81ms/step - accuracy: 0.7219 - loss: 1.1715 - val_accuracy: 0.7058 - val_loss: 1.2193
Test Loss: 1.2193, Test Accuracy: 70.58%
Epoch 4/100
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 81ms/step - accuracy: 0.7822 - loss: 1.0519 - val_accuracy: 0.7176 - val_loss: 1.1991
Test Loss: 1.1991, Test Accuracy: 71.76%
Epoch 5/100
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 81ms/step - accuracy: 0.8334 - loss: 0.9581 - val_accuracy: 0.7113 - val_loss: 1.2138
Test Loss:

KeyboardInterrupt: 

In [2]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms
from tqdm import tqdm

# Set random seeds for reproducibility
random.seed(123)
np.random.seed(123)
torch.manual_seed(1234)
os.environ['PYTHONHASHSEED'] = '0'

# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# Define the new ResNet architecture in PyTorch
class ResNetModel(nn.Module):
    def __init__(self):
        super(ResNetModel, self).__init__()
        
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.relu = nn.ELU()
        
        self.layer1 = self._make_layer(32, 64)
        self.layer2 = self._make_layer(64, 128)
        self.layer3 = self._make_layer(128, 128)
        
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(128, 10)
        
    def _make_layer(self, in_channels, out_channels):
        layers = []
        
        # Initial convolution block
        layers.append(nn.BatchNorm2d(in_channels))
        layers.append(self.relu)
        layers.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
        
        layers.append(nn.BatchNorm2d(out_channels))
        layers.append(self.relu)
        layers.append(nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1))
        
        # Residual connection
        layers.append(nn.Identity())
        
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.relu(self.conv1(x))
        
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        
        x = self.avg_pool(x)
        x = torch.flatten(x, 1)
        
        x = self.fc(x)
        return x

# Initialize model and move it to the appropriate device (GPU or CPU)
model = ResNetModel().to(device)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training parameters
epochs = 50
best_acc = 0

# Training loop
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    
    # Training phase
    model.train()
    for data, target in tqdm(train_loader):
        data, target = data.to(device), target.to(device)  # Move data and target to device
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
    
    # Validation phase
    model.eval()
    correct = 0
    total = 0
    test_loss = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)  # Move data and target to device
            output = model(data)
            loss = criterion(output, target)
            test_loss += loss.item()
            
            _, predicted = torch.max(output, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
    
    test_acc = correct / total
    test_loss /= len(test_loader)
    
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc * 100:.2f}%")
    
    # Save the best model
    if test_acc > best_acc:
        best_acc = test_acc
        torch.save(model.state_dict(), "test_model_poe1gpt.pth")

Using device: cuda
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:05<00:00, 29319361.25it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Epoch 1/50


100%|██████████| 391/391 [00:32<00:00, 12.07it/s]


Test Loss: 1.6655, Test Accuracy: 46.38%
Epoch 2/50


100%|██████████| 391/391 [00:32<00:00, 12.18it/s]


Test Loss: 1.5202, Test Accuracy: 53.81%
Epoch 3/50


100%|██████████| 391/391 [00:33<00:00, 11.52it/s]


Test Loss: 1.4524, Test Accuracy: 56.92%
Epoch 4/50


100%|██████████| 391/391 [00:34<00:00, 11.37it/s]


Test Loss: 1.3862, Test Accuracy: 61.12%
Epoch 5/50


100%|██████████| 391/391 [00:33<00:00, 11.56it/s]


Test Loss: 1.3246, Test Accuracy: 64.14%
Epoch 6/50


100%|██████████| 391/391 [00:34<00:00, 11.40it/s]


Test Loss: 1.3015, Test Accuracy: 65.41%
Epoch 7/50


100%|██████████| 391/391 [00:34<00:00, 11.45it/s]


Test Loss: 1.2628, Test Accuracy: 67.38%
Epoch 8/50


100%|██████████| 391/391 [00:34<00:00, 11.44it/s]


Test Loss: 1.2246, Test Accuracy: 69.33%
Epoch 9/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.2234, Test Accuracy: 69.77%
Epoch 10/50


100%|██████████| 391/391 [00:34<00:00, 11.40it/s]


Test Loss: 1.1877, Test Accuracy: 71.22%
Epoch 11/50


100%|██████████| 391/391 [00:34<00:00, 11.41it/s]


Test Loss: 1.1624, Test Accuracy: 72.22%
Epoch 12/50


100%|██████████| 391/391 [00:34<00:00, 11.41it/s]


Test Loss: 1.1401, Test Accuracy: 73.04%
Epoch 13/50


100%|██████████| 391/391 [00:34<00:00, 11.40it/s]


Test Loss: 1.1284, Test Accuracy: 74.30%
Epoch 14/50


100%|██████████| 391/391 [00:34<00:00, 11.39it/s]


Test Loss: 1.1166, Test Accuracy: 75.01%
Epoch 15/50


100%|██████████| 391/391 [00:34<00:00, 11.39it/s]


Test Loss: 1.1053, Test Accuracy: 75.48%
Epoch 16/50


100%|██████████| 391/391 [00:34<00:00, 11.41it/s]


Test Loss: 1.0877, Test Accuracy: 76.65%
Epoch 17/50


100%|██████████| 391/391 [00:34<00:00, 11.43it/s]


Test Loss: 1.0877, Test Accuracy: 76.88%
Epoch 18/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.0857, Test Accuracy: 77.06%
Epoch 19/50


100%|██████████| 391/391 [00:34<00:00, 11.43it/s]


Test Loss: 1.0757, Test Accuracy: 77.58%
Epoch 20/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.0932, Test Accuracy: 76.56%
Epoch 21/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.1028, Test Accuracy: 75.84%
Epoch 22/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.0837, Test Accuracy: 76.57%
Epoch 23/50


100%|██████████| 391/391 [00:34<00:00, 11.41it/s]


Test Loss: 1.0466, Test Accuracy: 79.07%
Epoch 24/50


100%|██████████| 391/391 [00:34<00:00, 11.38it/s]


Test Loss: 1.0483, Test Accuracy: 78.73%
Epoch 25/50


100%|██████████| 391/391 [00:34<00:00, 11.40it/s]


Test Loss: 1.0794, Test Accuracy: 76.72%
Epoch 26/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.0983, Test Accuracy: 77.35%
Epoch 27/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.0413, Test Accuracy: 79.50%
Epoch 28/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.0023, Test Accuracy: 80.96%
Epoch 29/50


100%|██████████| 391/391 [00:34<00:00, 11.43it/s]


Test Loss: 1.0148, Test Accuracy: 79.92%
Epoch 30/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.2519, Test Accuracy: 69.94%
Epoch 31/50


100%|██████████| 391/391 [00:34<00:00, 11.41it/s]


Test Loss: 1.0220, Test Accuracy: 80.70%
Epoch 32/50


100%|██████████| 391/391 [00:34<00:00, 11.41it/s]


Test Loss: 1.0251, Test Accuracy: 80.42%
Epoch 33/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 0.9887, Test Accuracy: 81.81%
Epoch 34/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.0675, Test Accuracy: 78.52%
Epoch 35/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.0348, Test Accuracy: 80.19%
Epoch 36/50


100%|██████████| 391/391 [00:34<00:00, 11.39it/s]


Test Loss: 0.9829, Test Accuracy: 82.06%
Epoch 37/50


100%|██████████| 391/391 [00:34<00:00, 11.37it/s]


Test Loss: 1.0095, Test Accuracy: 80.86%
Epoch 38/50


100%|██████████| 391/391 [00:34<00:00, 11.38it/s]


Test Loss: 0.9932, Test Accuracy: 81.53%
Epoch 39/50


100%|██████████| 391/391 [00:34<00:00, 11.39it/s]


Test Loss: 1.1164, Test Accuracy: 76.98%
Epoch 40/50


100%|██████████| 391/391 [00:34<00:00, 11.41it/s]


Test Loss: 1.0231, Test Accuracy: 80.27%
Epoch 41/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.1045, Test Accuracy: 76.20%
Epoch 42/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.1401, Test Accuracy: 76.86%
Epoch 43/50


100%|██████████| 391/391 [00:34<00:00, 11.43it/s]


Test Loss: 1.1232, Test Accuracy: 76.29%
Epoch 44/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.1471, Test Accuracy: 75.25%
Epoch 45/50


100%|██████████| 391/391 [00:34<00:00, 11.43it/s]


Test Loss: 1.2763, Test Accuracy: 70.26%
Epoch 46/50


100%|██████████| 391/391 [00:34<00:00, 11.40it/s]


Test Loss: 1.0676, Test Accuracy: 78.93%
Epoch 47/50


100%|██████████| 391/391 [00:34<00:00, 11.41it/s]


Test Loss: 1.1841, Test Accuracy: 75.10%
Epoch 48/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.1968, Test Accuracy: 73.10%
Epoch 49/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.0837, Test Accuracy: 78.66%
Epoch 50/50


100%|██████████| 391/391 [00:34<00:00, 11.42it/s]


Test Loss: 1.1683, Test Accuracy: 75.73%


## CHAT MODEL CODES

In [6]:
!pip install torchprofile

  pid, fd = os.forkpty()


Collecting torchprofile
  Downloading torchprofile-0.0.4-py3-none-any.whl.metadata (303 bytes)
Downloading torchprofile-0.0.4-py3-none-any.whl (7.7 kB)
Installing collected packages: torchprofile
Successfully installed torchprofile-0.0.4


In [9]:
## Gemini

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from tqdm import tqdm
import torch.nn.functional as F
from torchprofile import profile_macs
import os


# Improved Lightweight CNN Architecture with Depthwise Separable Convolutions
class EfficientCIFAR10Net(nn.Module):
    def __init__(self):
        super(EfficientCIFAR10Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.dw_conv1 = nn.Conv2d(16, 16, kernel_size=3, padding=1, groups=16) # Depthwise separable
        self.bn2 = nn.BatchNorm2d(16)
        self.pool1 = nn.MaxPool2d(2)


        self.dw_conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1, groups=16) # Depthwise separable
        self.bn3 = nn.BatchNorm2d(32)
        self.pool2 = nn.MaxPool2d(2)

        self.dw_conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1, groups=32) # Depthwise separable
        self.bn4 = nn.BatchNorm2d(64)
        self.pool3 = nn.MaxPool2d(2)

        self.fc1 = nn.Linear(64 * 4 * 4, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.relu(self.bn2(self.dw_conv1(x)))
        x = self.pool1(x)
        x = self.relu(self.bn3(self.dw_conv2(x)))
        x = self.pool2(x)
        x = self.relu(self.bn4(self.dw_conv3(x)))
        x = self.pool3(x)
        x = x.view(-1, 64 * 4 * 4)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x


# Data Transformations (same as before)
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])


# Data Loaders (same as before)
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=2)



# Model, Optimizer, and Loss Function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EfficientCIFAR10Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=3e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)

# Training Loop (same as before)
num_epochs = 54
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, data in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")):
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    scheduler.step()
    print(f"Epoch {epoch+1}, Training Loss: {running_loss / len(train_loader)}")


# Testing (same as before)
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for data in tqdm(test_loader, desc="Testing"):
        images, labels = data[0].to(device), data[1].to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")


# Parameter and FLOPs calculation
num_params = sum(p.numel() for p in model.parameters())
macs = profile_macs(model, (torch.randn(1,3,32,32,).to(device),))
print(f"Number of Parameters: {num_params}")
print(f"Millions of MACs (FLOPs): {macs/1e6:.2f}M")

torch.save(model.state_dict(), 'efficient_cifar10_model_final22.pth')

Files already downloaded and verified
Files already downloaded and verified


Epoch 1/54: 100%|██████████| 391/391 [00:11<00:00, 35.02it/s]


Epoch 1, Training Loss: 1.5364239596954696


Epoch 2/54: 100%|██████████| 391/391 [00:11<00:00, 32.89it/s]


Epoch 2, Training Loss: 1.2560509928047199


Epoch 3/54: 100%|██████████| 391/391 [00:11<00:00, 33.95it/s]


Epoch 3, Training Loss: 1.1826129923086337


Epoch 4/54: 100%|██████████| 391/391 [00:11<00:00, 34.61it/s]


Epoch 4, Training Loss: 1.147909636235298


Epoch 5/54: 100%|██████████| 391/391 [00:11<00:00, 34.41it/s]


Epoch 5, Training Loss: 1.1073205099081445


Epoch 6/54: 100%|██████████| 391/391 [00:11<00:00, 35.03it/s]


Epoch 6, Training Loss: 1.091283948982463


Epoch 7/54: 100%|██████████| 391/391 [00:11<00:00, 34.28it/s]


Epoch 7, Training Loss: 1.0888570922110088


Epoch 8/54: 100%|██████████| 391/391 [00:11<00:00, 34.86it/s]


Epoch 8, Training Loss: 1.0523556711728617


Epoch 9/54: 100%|██████████| 391/391 [00:11<00:00, 35.33it/s]


Epoch 9, Training Loss: 1.0339647909564436


Epoch 10/54: 100%|██████████| 391/391 [00:11<00:00, 33.57it/s]


Epoch 10, Training Loss: 1.0061696351641585


Epoch 11/54: 100%|██████████| 391/391 [00:11<00:00, 35.43it/s]


Epoch 11, Training Loss: 1.0040201849644752


Epoch 12/54: 100%|██████████| 391/391 [00:11<00:00, 35.02it/s]


Epoch 12, Training Loss: 0.9842502801009761


Epoch 13/54: 100%|██████████| 391/391 [00:11<00:00, 33.67it/s]


Epoch 13, Training Loss: 0.9904604959670845


Epoch 14/54: 100%|██████████| 391/391 [00:10<00:00, 35.83it/s]


Epoch 14, Training Loss: 0.966610344779461


Epoch 15/54: 100%|██████████| 391/391 [00:10<00:00, 36.18it/s]


Epoch 15, Training Loss: 0.9556394321534335


Epoch 16/54: 100%|██████████| 391/391 [00:11<00:00, 35.04it/s]


Epoch 16, Training Loss: 0.9431104478628739


Epoch 17/54: 100%|██████████| 391/391 [00:10<00:00, 36.27it/s]


Epoch 17, Training Loss: 0.9378941734428601


Epoch 18/54: 100%|██████████| 391/391 [00:10<00:00, 36.01it/s]


Epoch 18, Training Loss: 0.9182587270541569


Epoch 19/54: 100%|██████████| 391/391 [00:11<00:00, 33.93it/s]


Epoch 19, Training Loss: 0.9079578424353734


Epoch 20/54: 100%|██████████| 391/391 [00:11<00:00, 35.42it/s]


Epoch 20, Training Loss: 0.8959687314070094


Epoch 21/54: 100%|██████████| 391/391 [00:10<00:00, 36.08it/s]


Epoch 21, Training Loss: 0.8879362975849825


Epoch 22/54: 100%|██████████| 391/391 [00:11<00:00, 34.05it/s]


Epoch 22, Training Loss: 0.8846084364234944


Epoch 23/54: 100%|██████████| 391/391 [00:11<00:00, 35.46it/s]


Epoch 23, Training Loss: 0.867362433534754


Epoch 24/54: 100%|██████████| 391/391 [00:11<00:00, 35.47it/s]


Epoch 24, Training Loss: 0.860125606169786


Epoch 25/54: 100%|██████████| 391/391 [00:10<00:00, 35.74it/s]


Epoch 25, Training Loss: 0.847225135854443


Epoch 26/54: 100%|██████████| 391/391 [00:10<00:00, 36.48it/s]


Epoch 26, Training Loss: 0.8362181098259929


Epoch 27/54: 100%|██████████| 391/391 [00:11<00:00, 34.69it/s]


Epoch 27, Training Loss: 0.8233318299893528


Epoch 28/54: 100%|██████████| 391/391 [00:10<00:00, 36.11it/s]


Epoch 28, Training Loss: 0.8185485031293787


Epoch 29/54: 100%|██████████| 391/391 [00:10<00:00, 36.00it/s]


Epoch 29, Training Loss: 0.7974282634227782


Epoch 30/54: 100%|██████████| 391/391 [00:11<00:00, 35.14it/s]


Epoch 30, Training Loss: 0.7959852116492093


Epoch 31/54: 100%|██████████| 391/391 [00:10<00:00, 35.68it/s]


Epoch 31, Training Loss: 0.786235325324261


Epoch 32/54: 100%|██████████| 391/391 [00:10<00:00, 36.38it/s]


Epoch 32, Training Loss: 0.7711078763922767


Epoch 33/54: 100%|██████████| 391/391 [00:11<00:00, 34.52it/s]


Epoch 33, Training Loss: 0.7697922616358608


Epoch 34/54: 100%|██████████| 391/391 [00:10<00:00, 35.91it/s]


Epoch 34, Training Loss: 0.7537462361480879


Epoch 35/54: 100%|██████████| 391/391 [00:10<00:00, 35.59it/s]


Epoch 35, Training Loss: 0.7477378136361651


Epoch 36/54: 100%|██████████| 391/391 [00:11<00:00, 34.77it/s]


Epoch 36, Training Loss: 0.7370004952716096


Epoch 37/54: 100%|██████████| 391/391 [00:10<00:00, 36.27it/s]


Epoch 37, Training Loss: 0.7245345413684845


Epoch 38/54: 100%|██████████| 391/391 [00:11<00:00, 35.52it/s]


Epoch 38, Training Loss: 0.7142928818150249


Epoch 39/54: 100%|██████████| 391/391 [00:11<00:00, 33.92it/s]


Epoch 39, Training Loss: 0.7021586065707


Epoch 40/54: 100%|██████████| 391/391 [00:11<00:00, 35.54it/s]


Epoch 40, Training Loss: 0.6916785901769653


Epoch 41/54: 100%|██████████| 391/391 [00:11<00:00, 35.30it/s]


Epoch 41, Training Loss: 0.6859340689828634


Epoch 42/54: 100%|██████████| 391/391 [00:11<00:00, 34.19it/s]


Epoch 42, Training Loss: 0.6779215898355255


Epoch 43/54: 100%|██████████| 391/391 [00:11<00:00, 34.90it/s]


Epoch 43, Training Loss: 0.6693788829361996


Epoch 44/54: 100%|██████████| 391/391 [00:11<00:00, 34.90it/s]


Epoch 44, Training Loss: 0.6609987685137697


Epoch 45/54: 100%|██████████| 391/391 [00:11<00:00, 34.25it/s]


Epoch 45, Training Loss: 0.6585582670805704


Epoch 46/54: 100%|██████████| 391/391 [00:11<00:00, 35.43it/s]


Epoch 46, Training Loss: 0.6530845629437195


Epoch 47/54: 100%|██████████| 391/391 [00:11<00:00, 34.60it/s]


Epoch 47, Training Loss: 0.6467831187388476


Epoch 48/54: 100%|██████████| 391/391 [00:11<00:00, 35.13it/s]


Epoch 48, Training Loss: 0.6455500016889304


Epoch 49/54: 100%|██████████| 391/391 [00:11<00:00, 35.42it/s]


Epoch 49, Training Loss: 0.6413988310205357


Epoch 50/54: 100%|██████████| 391/391 [00:11<00:00, 33.88it/s]


Epoch 50, Training Loss: 0.6356624158294609


Epoch 51/54: 100%|██████████| 391/391 [00:11<00:00, 34.96it/s]


Epoch 51, Training Loss: 0.6404678108137282


Epoch 52/54: 100%|██████████| 391/391 [00:11<00:00, 34.41it/s]


Epoch 52, Training Loss: 0.637955033641947


Epoch 53/54: 100%|██████████| 391/391 [00:11<00:00, 34.15it/s]


Epoch 53, Training Loss: 0.6411362423768738


Epoch 54/54: 100%|██████████| 391/391 [00:11<00:00, 35.12it/s]


Epoch 54, Training Loss: 0.6455451769901969


Testing: 100%|██████████| 79/79 [00:01<00:00, 53.05it/s]


Test Accuracy: 77.83%
Number of Parameters: 134314
Millions of MACs (FLOPs): 0.88M


In [7]:
## gpt

import torch
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from tqdm import tqdm


class LightCIFAR10Net(nn.Module):
    def __init__(self, num_classes=10):
        super(LightCIFAR10Net, self).__init__()
        self.stem = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.GELU()
        )

        # Efficient blocks with depthwise separable convolutions
        self.block1 = self._make_block(32, 64, 1)
        self.block2 = self._make_block(64, 128, 2)
        self.block3 = self._make_block(128, 256, 2)
        self.block4 = self._make_block(256, 512, 2)

        # Classifier head
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Sequential(
            nn.Linear(512, 256),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes)
        )

    def _make_block(self, in_channels, out_channels, stride):
        return nn.Sequential(
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=stride, padding=1, groups=in_channels, bias=False),
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.GELU()
        )

    def forward(self, x):
        x = self.stem(x)
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.block4(x)
        x = self.global_pool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LightCIFAR10Net().to(device)

# Data transforms
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

# Load CIFAR-10 Dataset
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transform)

train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True, num_workers=4)
test_loader = DataLoader(dataset=test_dataset, batch_size=128, shuffle=False, num_workers=4)

# Loss, Optimizer, Scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=3e-4)
scheduler = CosineAnnealingLR(optimizer, T_max=50)

# Train function
def train(model, train_loader, criterion, optimizer):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for images, labels in tqdm(train_loader, desc="Training"):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    accuracy = 100. * correct / total
    print(f"Train Loss: {running_loss/len(train_loader):.4f}, Train Accuracy: {accuracy:.2f}%")
    return accuracy

# Test function
def test(model, test_loader, criterion):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in tqdm(test_loader, desc="Testing"):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    accuracy = 100. * correct / total
    print(f"Test Loss: {running_loss/len(test_loader):.4f}, Test Accuracy: {accuracy:.2f}%")
    return accuracy

# Training loop
num_epochs = 54
best_acc = 0.0
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_acc = train(model, train_loader, criterion, optimizer)
    test_acc = test(model, test_loader, criterion)
    scheduler.step()

    if test_acc > best_acc:
        best_acc = test_acc
        torch.save(model.state_dict(), "best_cifar10_modelFinal.pth")
        print(f"Saved new best model with accuracy: {best_acc:.2f}%")


Files already downloaded and verified
Files already downloaded and verified
Epoch 1/54


Training: 100%|██████████| 391/391 [00:09<00:00, 42.76it/s]


Train Loss: 1.6288, Train Accuracy: 38.97%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.35it/s]


Test Loss: 1.4351, Test Accuracy: 48.50%
Saved new best model with accuracy: 48.50%
Epoch 2/54


Training: 100%|██████████| 391/391 [00:09<00:00, 41.01it/s]


Train Loss: 1.2106, Train Accuracy: 56.44%


Testing: 100%|██████████| 79/79 [00:01<00:00, 64.25it/s]


Test Loss: 1.3207, Test Accuracy: 54.07%
Saved new best model with accuracy: 54.07%
Epoch 3/54


Training: 100%|██████████| 391/391 [00:09<00:00, 43.36it/s]


Train Loss: 1.0373, Train Accuracy: 62.94%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.45it/s]


Test Loss: 1.0186, Test Accuracy: 63.95%
Saved new best model with accuracy: 63.95%
Epoch 4/54


Training: 100%|██████████| 391/391 [00:09<00:00, 42.87it/s]


Train Loss: 0.9380, Train Accuracy: 66.67%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.72it/s]


Test Loss: 1.2287, Test Accuracy: 57.73%
Epoch 5/54


Training: 100%|██████████| 391/391 [00:09<00:00, 41.31it/s]


Train Loss: 0.8762, Train Accuracy: 68.76%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.85it/s]


Test Loss: 0.9889, Test Accuracy: 65.25%
Saved new best model with accuracy: 65.25%
Epoch 6/54


Training: 100%|██████████| 391/391 [00:09<00:00, 43.39it/s]


Train Loss: 0.8205, Train Accuracy: 71.09%


Testing: 100%|██████████| 79/79 [00:01<00:00, 64.12it/s]


Test Loss: 0.8592, Test Accuracy: 70.07%
Saved new best model with accuracy: 70.07%
Epoch 7/54


Training: 100%|██████████| 391/391 [00:09<00:00, 43.22it/s]


Train Loss: 0.7793, Train Accuracy: 72.52%


Testing: 100%|██████████| 79/79 [00:01<00:00, 64.33it/s]


Test Loss: 0.8386, Test Accuracy: 70.66%
Saved new best model with accuracy: 70.66%
Epoch 8/54


Training: 100%|██████████| 391/391 [00:09<00:00, 41.17it/s]


Train Loss: 0.7300, Train Accuracy: 74.44%


Testing: 100%|██████████| 79/79 [00:01<00:00, 64.89it/s]


Test Loss: 0.7670, Test Accuracy: 73.17%
Saved new best model with accuracy: 73.17%
Epoch 9/54


Training: 100%|██████████| 391/391 [00:08<00:00, 43.50it/s]


Train Loss: 0.6994, Train Accuracy: 75.40%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.39it/s]


Test Loss: 0.7473, Test Accuracy: 74.13%
Saved new best model with accuracy: 74.13%
Epoch 10/54


Training: 100%|██████████| 391/391 [00:08<00:00, 43.59it/s]


Train Loss: 0.6684, Train Accuracy: 76.48%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.80it/s]


Test Loss: 0.7259, Test Accuracy: 74.41%
Saved new best model with accuracy: 74.41%
Epoch 11/54


Training: 100%|██████████| 391/391 [00:09<00:00, 41.34it/s]


Train Loss: 0.6443, Train Accuracy: 77.64%


Testing: 100%|██████████| 79/79 [00:01<00:00, 61.13it/s]


Test Loss: 0.6959, Test Accuracy: 75.74%
Saved new best model with accuracy: 75.74%
Epoch 12/54


Training: 100%|██████████| 391/391 [00:09<00:00, 43.27it/s]


Train Loss: 0.6278, Train Accuracy: 78.22%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.51it/s]


Test Loss: 0.6577, Test Accuracy: 77.41%
Saved new best model with accuracy: 77.41%
Epoch 13/54


Training: 100%|██████████| 391/391 [00:08<00:00, 44.03it/s]


Train Loss: 0.6045, Train Accuracy: 78.90%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.18it/s]


Test Loss: 0.7227, Test Accuracy: 74.45%
Epoch 14/54


Training: 100%|██████████| 391/391 [00:08<00:00, 43.49it/s]


Train Loss: 0.5928, Train Accuracy: 79.39%


Testing: 100%|██████████| 79/79 [00:01<00:00, 50.55it/s]


Test Loss: 0.6361, Test Accuracy: 78.08%
Saved new best model with accuracy: 78.08%
Epoch 15/54


Training: 100%|██████████| 391/391 [00:08<00:00, 44.05it/s]


Train Loss: 0.5814, Train Accuracy: 79.60%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.80it/s]


Test Loss: 0.6258, Test Accuracy: 77.73%
Epoch 16/54


Training: 100%|██████████| 391/391 [00:09<00:00, 43.16it/s]


Train Loss: 0.5667, Train Accuracy: 80.14%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.00it/s]


Test Loss: 0.6703, Test Accuracy: 76.91%
Epoch 17/54


Training: 100%|██████████| 391/391 [00:08<00:00, 43.49it/s]


Train Loss: 0.5571, Train Accuracy: 80.51%


Testing: 100%|██████████| 79/79 [00:01<00:00, 50.49it/s]


Test Loss: 0.6216, Test Accuracy: 78.81%
Saved new best model with accuracy: 78.81%
Epoch 18/54


Training: 100%|██████████| 391/391 [00:09<00:00, 42.28it/s]


Train Loss: 0.5358, Train Accuracy: 81.41%


Testing: 100%|██████████| 79/79 [00:01<00:00, 61.75it/s]


Test Loss: 0.6004, Test Accuracy: 79.39%
Saved new best model with accuracy: 79.39%
Epoch 19/54


Training: 100%|██████████| 391/391 [00:08<00:00, 43.68it/s]


Train Loss: 0.5264, Train Accuracy: 81.73%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.73it/s]


Test Loss: 0.6396, Test Accuracy: 77.53%
Epoch 20/54


Training: 100%|██████████| 391/391 [00:08<00:00, 43.46it/s]


Train Loss: 0.5141, Train Accuracy: 82.08%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.42it/s]


Test Loss: 0.6018, Test Accuracy: 79.29%
Epoch 21/54


Training: 100%|██████████| 391/391 [00:09<00:00, 41.30it/s]


Train Loss: 0.5057, Train Accuracy: 82.51%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.09it/s]


Test Loss: 0.5886, Test Accuracy: 79.53%
Saved new best model with accuracy: 79.53%
Epoch 22/54


Training: 100%|██████████| 391/391 [00:09<00:00, 43.43it/s]


Train Loss: 0.4938, Train Accuracy: 82.61%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.33it/s]


Test Loss: 0.5288, Test Accuracy: 81.69%
Saved new best model with accuracy: 81.69%
Epoch 23/54


Training: 100%|██████████| 391/391 [00:09<00:00, 43.32it/s]


Train Loss: 0.4814, Train Accuracy: 83.22%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.23it/s]


Test Loss: 0.5496, Test Accuracy: 81.47%
Epoch 24/54


Training: 100%|██████████| 391/391 [00:09<00:00, 40.87it/s]


Train Loss: 0.4707, Train Accuracy: 83.70%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.61it/s]


Test Loss: 0.5933, Test Accuracy: 79.39%
Epoch 25/54


Training: 100%|██████████| 391/391 [00:09<00:00, 43.42it/s]


Train Loss: 0.4640, Train Accuracy: 83.89%


Testing: 100%|██████████| 79/79 [00:01<00:00, 64.70it/s]


Test Loss: 0.5354, Test Accuracy: 81.52%
Epoch 26/54


Training: 100%|██████████| 391/391 [00:08<00:00, 43.66it/s]


Train Loss: 0.4481, Train Accuracy: 84.44%


Testing: 100%|██████████| 79/79 [00:01<00:00, 64.71it/s]


Test Loss: 0.5722, Test Accuracy: 80.71%
Epoch 27/54


Training: 100%|██████████| 391/391 [00:09<00:00, 40.63it/s]


Train Loss: 0.4350, Train Accuracy: 84.85%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.88it/s]


Test Loss: 0.5518, Test Accuracy: 81.20%
Epoch 28/54


Training: 100%|██████████| 391/391 [00:08<00:00, 43.68it/s]


Train Loss: 0.4258, Train Accuracy: 85.27%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.22it/s]


Test Loss: 0.5280, Test Accuracy: 82.37%
Saved new best model with accuracy: 82.37%
Epoch 29/54


Training: 100%|██████████| 391/391 [00:08<00:00, 43.55it/s]


Train Loss: 0.4113, Train Accuracy: 85.89%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.80it/s]


Test Loss: 0.4759, Test Accuracy: 84.21%
Saved new best model with accuracy: 84.21%
Epoch 30/54


Training: 100%|██████████| 391/391 [00:09<00:00, 41.21it/s]


Train Loss: 0.4013, Train Accuracy: 85.99%


Testing: 100%|██████████| 79/79 [00:01<00:00, 64.16it/s]


Test Loss: 0.5109, Test Accuracy: 82.27%
Epoch 31/54


Training: 100%|██████████| 391/391 [00:09<00:00, 43.43it/s]


Train Loss: 0.3944, Train Accuracy: 86.26%


Testing: 100%|██████████| 79/79 [00:01<00:00, 64.37it/s]


Test Loss: 0.5099, Test Accuracy: 82.90%
Epoch 32/54


Training: 100%|██████████| 391/391 [00:09<00:00, 41.17it/s]


Train Loss: 0.3798, Train Accuracy: 86.75%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.52it/s]


Test Loss: 0.4889, Test Accuracy: 83.25%
Epoch 33/54


Training: 100%|██████████| 391/391 [00:09<00:00, 40.36it/s]


Train Loss: 0.3684, Train Accuracy: 87.07%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.26it/s]


Test Loss: 0.4464, Test Accuracy: 85.01%
Saved new best model with accuracy: 85.01%
Epoch 34/54


Training: 100%|██████████| 391/391 [00:08<00:00, 43.45it/s]


Train Loss: 0.3565, Train Accuracy: 87.56%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.46it/s]


Test Loss: 0.5016, Test Accuracy: 83.35%
Epoch 35/54


Training: 100%|██████████| 391/391 [00:09<00:00, 43.02it/s]


Train Loss: 0.3423, Train Accuracy: 88.14%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.69it/s]


Test Loss: 0.4557, Test Accuracy: 84.66%
Epoch 36/54


Training: 100%|██████████| 391/391 [00:09<00:00, 41.31it/s]


Train Loss: 0.3331, Train Accuracy: 88.35%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.15it/s]


Test Loss: 0.4657, Test Accuracy: 84.54%
Epoch 37/54


Training: 100%|██████████| 391/391 [00:09<00:00, 42.77it/s]


Train Loss: 0.3216, Train Accuracy: 88.72%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.49it/s]


Test Loss: 0.4313, Test Accuracy: 85.54%
Saved new best model with accuracy: 85.54%
Epoch 38/54


Training: 100%|██████████| 391/391 [00:09<00:00, 43.16it/s]


Train Loss: 0.3086, Train Accuracy: 89.21%


Testing: 100%|██████████| 79/79 [00:01<00:00, 61.98it/s]


Test Loss: 0.4186, Test Accuracy: 85.78%
Saved new best model with accuracy: 85.78%
Epoch 39/54


Training: 100%|██████████| 391/391 [00:09<00:00, 41.48it/s]


Train Loss: 0.2997, Train Accuracy: 89.59%


Testing: 100%|██████████| 79/79 [00:01<00:00, 59.01it/s]


Test Loss: 0.4185, Test Accuracy: 85.89%
Saved new best model with accuracy: 85.89%
Epoch 40/54


Training: 100%|██████████| 391/391 [00:09<00:00, 43.17it/s]


Train Loss: 0.2854, Train Accuracy: 90.05%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.01it/s]


Test Loss: 0.3917, Test Accuracy: 86.77%
Saved new best model with accuracy: 86.77%
Epoch 41/54


Training: 100%|██████████| 391/391 [00:08<00:00, 43.44it/s]


Train Loss: 0.2745, Train Accuracy: 90.37%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.26it/s]


Test Loss: 0.3955, Test Accuracy: 86.59%
Epoch 42/54


Training: 100%|██████████| 391/391 [00:09<00:00, 41.24it/s]


Train Loss: 0.2614, Train Accuracy: 90.92%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.67it/s]


Test Loss: 0.3771, Test Accuracy: 87.24%
Saved new best model with accuracy: 87.24%
Epoch 43/54


Training: 100%|██████████| 391/391 [00:08<00:00, 43.48it/s]


Train Loss: 0.2519, Train Accuracy: 91.28%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.39it/s]


Test Loss: 0.3884, Test Accuracy: 87.28%
Saved new best model with accuracy: 87.28%
Epoch 44/54


Training: 100%|██████████| 391/391 [00:09<00:00, 43.28it/s]


Train Loss: 0.2417, Train Accuracy: 91.74%


Testing: 100%|██████████| 79/79 [00:01<00:00, 61.68it/s]


Test Loss: 0.3862, Test Accuracy: 87.18%
Epoch 45/54


Training: 100%|██████████| 391/391 [00:09<00:00, 41.04it/s]


Train Loss: 0.2317, Train Accuracy: 91.99%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.29it/s]


Test Loss: 0.3660, Test Accuracy: 87.50%
Saved new best model with accuracy: 87.50%
Epoch 46/54


Training: 100%|██████████| 391/391 [00:09<00:00, 43.02it/s]


Train Loss: 0.2309, Train Accuracy: 92.06%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.06it/s]


Test Loss: 0.3581, Test Accuracy: 87.87%
Saved new best model with accuracy: 87.87%
Epoch 47/54


Training: 100%|██████████| 391/391 [00:09<00:00, 42.78it/s]


Train Loss: 0.2192, Train Accuracy: 92.42%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.82it/s]


Test Loss: 0.3628, Test Accuracy: 87.62%
Epoch 48/54


Training: 100%|██████████| 391/391 [00:09<00:00, 41.12it/s]


Train Loss: 0.2150, Train Accuracy: 92.72%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.92it/s]


Test Loss: 0.3564, Test Accuracy: 87.85%
Epoch 49/54


Training: 100%|██████████| 391/391 [00:09<00:00, 43.21it/s]


Train Loss: 0.2119, Train Accuracy: 92.80%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.22it/s]


Test Loss: 0.3563, Test Accuracy: 87.91%
Saved new best model with accuracy: 87.91%
Epoch 50/54


Training: 100%|██████████| 391/391 [00:09<00:00, 43.10it/s]


Train Loss: 0.2111, Train Accuracy: 92.78%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.87it/s]


Test Loss: 0.3541, Test Accuracy: 87.81%
Epoch 51/54


Training: 100%|██████████| 391/391 [00:09<00:00, 40.95it/s]


Train Loss: 0.2112, Train Accuracy: 92.72%


Testing: 100%|██████████| 79/79 [00:01<00:00, 62.45it/s]


Test Loss: 0.3569, Test Accuracy: 87.95%
Saved new best model with accuracy: 87.95%
Epoch 52/54


Training: 100%|██████████| 391/391 [00:09<00:00, 42.54it/s]


Train Loss: 0.2112, Train Accuracy: 92.83%


Testing: 100%|██████████| 79/79 [00:01<00:00, 63.98it/s]


Test Loss: 0.3551, Test Accuracy: 87.82%
Epoch 53/54


Training: 100%|██████████| 391/391 [00:09<00:00, 42.97it/s]


Train Loss: 0.2117, Train Accuracy: 92.81%


Testing: 100%|██████████| 79/79 [00:01<00:00, 64.37it/s]


Test Loss: 0.3571, Test Accuracy: 87.80%
Epoch 54/54


Training: 100%|██████████| 391/391 [00:10<00:00, 38.65it/s]


Train Loss: 0.2121, Train Accuracy: 92.71%


Testing: 100%|██████████| 79/79 [00:01<00:00, 48.99it/s]

Test Loss: 0.3616, Test Accuracy: 87.77%





In [5]:
print(len(train_dataset))
print(len(test_dataset))

50000
10000


### TEST

In [9]:
# Test dataset setup (as before)
test_filenames = [os.path.join('/kaggle/working/test_images/test', f) for f in os.listdir('/kaggle/working/test_images/test')]
test_filenames.sort(key=lambda x: int(x.split('/')[-1].split('.')[0]))

class CIFAR10TestDataset(Dataset):
    def __init__(self, filenames, transform):
        self.filenames = filenames
        self.transform = transform

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        img = Image.open(self.filenames[idx])
        if self.transform:
            img = self.transform(img)
        return img

test_dataset = CIFAR10TestDataset(test_filenames, test_transform)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=2)

# Predict on the test data (same as before)
model.load_state_dict(torch.load('best_model.pth'))
model = model.cuda()

result = []
with torch.no_grad():
    model.eval()
    for images in tqdm(test_loader):
        images = images.cuda()
        outputs = model(images)
        preds = torch.argmax(outputs, 1)
        preds = preds.cpu().numpy()
        result.extend([classes[i] for i in preds])

# Save the predictions to CSV
sample = pd.read_csv('/kaggle/input/cifar-10/sampleSubmission.csv')
sample['label'] = result
sample.to_csv('./submissionfile.csv', index=False)

print("Predictions saved to submissionfile.csv")


  model.load_state_dict(torch.load('best_model.pth'))
100%|██████████| 2344/2344 [01:30<00:00, 25.93it/s]


Predictions saved to submissionfile.csv


In [11]:
import shutil

shutil.rmtree('/kaggle/working/train_images')
shutil.rmtree('/kaggle/working/test_images')

# Check if the file exists and delete it
file_path = '/kaggle/working/best_model.pth'
if os.path.exists(file_path):
    os.remove(file_path)
    print(f"{file_path} has been deleted.")
else:
    print(f"{file_path} does not exist.")

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/working/train_images'

### train-94, test-92

In [None]:
import os
import random
import numpy as np
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import LambdaLR
from tqdm import tqdm

# Custom CutOut augmentation
class CutOut(object):
    def __init__(self, n_holes, length):
        self.n_holes = n_holes
        self.length = length

    def __call__(self, img):
        h, w = img.size(1), img.size(2)
        mask = np.ones((h, w), np.float32)

        for n in range(self.n_holes):
            y = random.randint(0, h - 1)
            x = random.randint(0, w - 1)

            y1 = np.clip(y - self.length // 2, 0, h)
            y2 = np.clip(y + self.length // 2, 0, h)
            x1 = np.clip(x - self.length // 2, 0, w)
            x2 = np.clip(x + self.length // 2, 0, w)

            mask[y1:y2, x1:x2] = 0.

        mask = torch.from_numpy(mask).expand_as(img)
        img = img * mask
        return img

# Data transformations
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1)),  # Rotate, scale, shift
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
    CutOut(n_holes=1, length=8),
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

# Load CIFAR-10 dataset using torchvision
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transform)

train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True, num_workers=4)
test_loader = DataLoader(dataset=test_dataset, batch_size=128, shuffle=False, num_workers=4)

# Define the Residual Block and ResNet classes (as defined in your original code)
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out

class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=10, width_mult=1.0):
        super(ResNet, self).__init__()
        self.in_channels = int(64 * width_mult)
        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self.make_layer(block, int(64 * width_mult), layers[0], stride=1)
        self.layer2 = self.make_layer(block, int(128 * width_mult), layers[1], stride=2)
        self.layer3 = self.make_layer(block, int(256 * width_mult), layers[2], stride=2)
        self.layer4 = self.make_layer(block, int(512 * width_mult), layers[3], stride=2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(int(512 * width_mult), num_classes)

    def make_layer(self, block, out_channels, blocks, stride=1):
        downsample = None
        if stride != 1 or self.in_channels != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )
        layers = []
        layers.append(block(self.in_channels, out_channels, stride, downsample))
        self.in_channels = out_channels
        for _ in range(1, blocks):
            layers.append(block(out_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avg_pool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

# Initialize model, loss function, optimizer, and learning rate scheduler
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ResNet(ResidualBlock, [2, 2, 2, 2], width_mult=0.5).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=3e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)

# Training and Testing Functions
def train(model, train_loader, criterion, optimizer, scheduler):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for images, labels in tqdm(train_loader):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
    scheduler.step()
    print(f"Train Loss: {running_loss/len(train_loader):.4f}, Train Accuracy: {100 * correct/total:.2f}%")

def test(model, test_loader, criterion):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in tqdm(test_loader):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
    print(f"Test Loss: {running_loss/len(test_loader):.4f}, Test Accuracy: {100 * correct/total:.2f}%")
    return 100 * correct / total

# Main training loop
best_acc = 0
for epoch in range(60):  # 50 epochs
    print(f"Epoch {epoch+1}/50")
    train(model, train_loader, criterion, optimizer, scheduler)
    acc = test(model, test_loader, criterion)
    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), "best_new_model60.pth")

### test-94

In [8]:
import os
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import numpy as np
from PIL import Image
import random
import pandas as pd
!pip install py7zr
import py7zr
from io import BytesIO
import torch.optim.lr_scheduler as lr_scheduler

# Define a basic residual block with dropout
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.dropout = nn.Dropout(p=0.3)  # Dropout to regularize

        self.skip_connection = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.skip_connection = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        identity = self.skip_connection(x)
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.dropout(out)  # Apply dropout
        out += identity
        out = self.relu(out)
        return out

# Define ResNet-34 for CIFAR-10
class ResNet34(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet34, self).__init__()
        self.in_channels = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride):
        strides = [stride] + [1] * (blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.avg_pool(out)
        out = torch.flatten(out, 1)
        out = self.fc(out)
        return out

# Function to instantiate the ResNet-34 model
def get_resnet34():
    return ResNet34(ResidualBlock, [3, 4, 6, 3])

# Custom CutOut transformation
class CutOut(object):
    def __init__(self, n_holes, length):
        self.n_holes = n_holes
        self.length = length

    def __call__(self, img):
        h, w = img.size(1), img.size(2)
        mask = np.ones((h, w), np.float32)

        for n in range(self.n_holes):
            y = random.randint(0, h)
            x = random.randint(0, w)

            y1 = np.clip(y - self.length // 2, 0, h)
            y2 = np.clip(y + self.length // 2, 0, h)
            x1 = np.clip(x - self.length // 2, 0, w)
            x2 = np.clip(x + self.length // 2, 0, w)

            mask[y1:y2, x1:x2] = 0.

        mask = torch.from_numpy(mask).expand_as(img)
        img = img * mask
        return img

# Transformation function using PyTorch's transforms
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),  # Horizontal flip
    transforms.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1)),  # Rotate, scale, and shift
    transforms.RandomCrop(32, padding=4),  # Random crop with padding
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
    CutOut(n_holes=1, length=8),  # Use CutOut after ToTensor for regularization
])

transform_test = transforms.Compose([
    transforms.Resize((32, 32)),  # Resize to 32x32
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

# Example of applying the transformations to the CIFAR-10 dataset
train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=100, shuffle=False)

# Define device (GPU/CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create model, define loss function and optimizer
model = get_resnet34().to(device)
criterion = nn.CrossEntropyLoss()

# Use AdamW optimizer with weight decay
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
# Calculate steps per epoch
steps_per_epoch = len(train_loader)  # Number of batches in the training set

# Cosine annealing scheduler with warmup

# Define the OneCycleLR scheduler
scheduler = lr_scheduler.OneCycleLR(
    optimizer, 
    max_lr=0.001,        # The maximum learning rate after warmup
    steps_per_epoch=steps_per_epoch,  # Total steps in one epoch (train dataset size / batch size)
    epochs=80,           # Total number of epochs
    pct_start=0.3,       # Warmup period (30% of the total steps)
    anneal_strategy='cos',  # Cosine annealing after warmup
    div_factor=25.0      # Initial learning rate will be max_lr / div_factor
)

# Training loop
def train(epoch):
    model.train()
    running_loss = 0.0
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Update the learning rate at each batch step
        scheduler.step()

        running_loss += loss.item()
        if batch_idx % 100 == 99:
            print(f"Epoch {epoch+1}, Batch {batch_idx+1}: Loss = {running_loss / 100:.4f}")
            running_loss = 0.0

def test(epoch):
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(test_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            test_loss += loss.item()

            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

        print(f"Epoch {epoch+1}: Test Loss = {test_loss / len(test_loader):.4f}, Accuracy = {100. * correct / total:.2f}%")

# Main training loop
for epoch in range(5):
    train(epoch)
    test(epoch)

Files already downloaded and verified
Files already downloaded and verified
Epoch 1, Batch 100: Loss = 2.1271
Epoch 1: Test Loss = 3.4228, Accuracy = 19.50%
Epoch 2, Batch 100: Loss = 1.7461
Epoch 2: Test Loss = 1.8835, Accuracy = 36.52%
Epoch 3, Batch 100: Loss = 1.5620
Epoch 3: Test Loss = 1.7684, Accuracy = 43.58%
Epoch 4, Batch 100: Loss = 1.4393
Epoch 4: Test Loss = 1.6325, Accuracy = 48.06%
Epoch 5, Batch 100: Loss = 1.3019
Epoch 5: Test Loss = 1.1812, Accuracy = 60.49%


In [3]:
!pip install torchsummary
!pip install fvcore

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl.metadata (296 bytes)
Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1
Collecting fvcore
  Downloading fvcore-0.1.5.post20221221.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting yacs>=0.1.6 (from fvcore)
  Downloading yacs-0.1.8-py3-none-any.whl.metadata (639 bytes)
Collecting iopath>=0.1.7 (from fvcore)
  Downloading iopath-0.1.10.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting portalocker (from iopath>=0.1.7->fvcore)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading yacs-0.1.8-py3-none-any.whl (1

### FIRST SUBMISSION

In [11]:
import os
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import numpy as np
from fvcore.nn import FlopCountAnalysis, flop_count_table
from torchinfo import summary
from PIL import Image
import random

# Define a basic convolutional block with GELU activation
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ConvBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.act = nn.GELU()
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.dropout = nn.Dropout(p=0.3)

        self.skip_connection = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.skip_connection = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        identity = self.skip_connection(x)
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.act(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.dropout(out)
        out += identity
        out = self.act(out)
        return out

# Define the base architecture for CIFAR-10
class BaseArchitecture(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(BaseArchitecture, self).__init__()
        self.in_channels = 32

        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.act = nn.GELU()
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(256, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride):
        strides = [stride] + [1] * (blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.act(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.avg_pool(out)
        out = torch.flatten(out, 1)
        out = self.fc(out)
        return out

def get_base_architecture():
    return BaseArchitecture(ConvBlock, [2, 2, 2])

# Training configurations
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False, num_workers=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = get_base_architecture().to(device)

# Calculate FLOPs and Params
dummy_input = torch.randn(1, 3, 32, 32).to(device)
flops = FlopCountAnalysis(model, dummy_input)
print("FLOPs and Parameters BEFORE Training:")
print(flop_count_table(flops))
print(f"Total Parameters: {sum(p.numel() for p in model.parameters())}")

# Loss, optimizer, scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=3e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)

# Training loop
def train(epoch):
    model.train()
    running_loss = 0.0
    for inputs, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{50}"):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    scheduler.step()
    print(f"Training Loss: {running_loss / len(train_loader):.4f}")

# Test loop
def test():
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    print(f"Test Accuracy: {100. * correct / total:.2f}%")

# Train and save model
for epoch in range(50):
    train(epoch)
    test()

print("FLOPs and Parameters AFTER Training:")
print(flop_count_table(flops))
print(f"Total Parameters AFTER Training: {sum(p.numel() for p in model.parameters())}")

torch.save(model.state_dict(), "cifar_last_model.pth")
print("MODEL SAVED")

Files already downloaded and verified
Files already downloaded and verified
FLOPs and Parameters BEFORE Training:
| module                      | #parameters or shape   | #flops     |
|:----------------------------|:-----------------------|:-----------|
| model                       | 2.76M                  | 0.407G     |
|  conv1                      |  0.864K                |  0.885M    |
|   conv1.weight              |   (32, 3, 3, 3)        |            |
|  bn1                        |  64                    |  0.164M    |
|   bn1.weight                |   (32,)                |            |
|   bn1.bias                  |   (32,)                |            |
|  layer1                     |  0.132M                |  0.136G    |
|   layer1.0                  |   57.728K              |   59.703M  |
|    layer1.0.conv1           |    18.432K             |    18.874M |
|    layer1.0.bn1             |    0.128K              |    0.328M  |
|    layer1.0.conv2           |    36.864K    

Epoch 1/50: 100%|██████████| 391/391 [00:33<00:00, 11.53it/s]

Training Loss: 1.6032





Test Accuracy: 42.59%


Epoch 2/50: 100%|██████████| 391/391 [00:32<00:00, 12.01it/s]

Training Loss: 1.0943





Test Accuracy: 61.87%


Epoch 3/50: 100%|██████████| 391/391 [00:33<00:00, 11.80it/s]

Training Loss: 0.8885





Test Accuracy: 67.92%


Epoch 4/50: 100%|██████████| 391/391 [00:33<00:00, 11.78it/s]

Training Loss: 0.7398





Test Accuracy: 75.03%


Epoch 5/50: 100%|██████████| 391/391 [00:33<00:00, 11.83it/s]

Training Loss: 0.6293





Test Accuracy: 75.93%


Epoch 6/50: 100%|██████████| 391/391 [00:32<00:00, 11.87it/s]

Training Loss: 0.5667





Test Accuracy: 81.12%


Epoch 7/50: 100%|██████████| 391/391 [00:32<00:00, 11.89it/s]

Training Loss: 0.5227





Test Accuracy: 78.90%


Epoch 8/50: 100%|██████████| 391/391 [00:32<00:00, 11.86it/s]

Training Loss: 0.4844





Test Accuracy: 80.80%


Epoch 9/50: 100%|██████████| 391/391 [00:33<00:00, 11.83it/s]

Training Loss: 0.4605





Test Accuracy: 82.36%


Epoch 10/50: 100%|██████████| 391/391 [00:33<00:00, 11.79it/s]

Training Loss: 0.4329





Test Accuracy: 83.49%


Epoch 11/50: 100%|██████████| 391/391 [00:32<00:00, 11.85it/s]

Training Loss: 0.4150





Test Accuracy: 82.35%


Epoch 12/50: 100%|██████████| 391/391 [00:32<00:00, 11.88it/s]

Training Loss: 0.3949





Test Accuracy: 85.76%


Epoch 13/50: 100%|██████████| 391/391 [00:32<00:00, 11.87it/s]

Training Loss: 0.3783





Test Accuracy: 85.90%


Epoch 14/50: 100%|██████████| 391/391 [00:33<00:00, 11.82it/s]

Training Loss: 0.3661





Test Accuracy: 86.86%


Epoch 15/50: 100%|██████████| 391/391 [00:33<00:00, 11.81it/s]

Training Loss: 0.3498





Test Accuracy: 85.00%


Epoch 16/50: 100%|██████████| 391/391 [00:33<00:00, 11.82it/s]

Training Loss: 0.3339





Test Accuracy: 85.69%


Epoch 17/50: 100%|██████████| 391/391 [00:33<00:00, 11.84it/s]

Training Loss: 0.3217





Test Accuracy: 86.51%


Epoch 18/50: 100%|██████████| 391/391 [00:33<00:00, 11.84it/s]

Training Loss: 0.3124





Test Accuracy: 87.55%


Epoch 19/50: 100%|██████████| 391/391 [00:32<00:00, 11.85it/s]

Training Loss: 0.3010





Test Accuracy: 87.61%


Epoch 20/50: 100%|██████████| 391/391 [00:33<00:00, 11.83it/s]

Training Loss: 0.2866





Test Accuracy: 86.63%


Epoch 21/50: 100%|██████████| 391/391 [00:32<00:00, 11.85it/s]

Training Loss: 0.2736





Test Accuracy: 88.27%


Epoch 22/50: 100%|██████████| 391/391 [00:33<00:00, 11.84it/s]

Training Loss: 0.2683





Test Accuracy: 88.21%


Epoch 23/50: 100%|██████████| 391/391 [00:33<00:00, 11.83it/s]

Training Loss: 0.2531





Test Accuracy: 88.07%


Epoch 24/50: 100%|██████████| 391/391 [00:33<00:00, 11.84it/s]

Training Loss: 0.2414





Test Accuracy: 89.96%


Epoch 25/50: 100%|██████████| 391/391 [00:33<00:00, 11.84it/s]

Training Loss: 0.2287





Test Accuracy: 87.84%


Epoch 26/50: 100%|██████████| 391/391 [00:33<00:00, 11.83it/s]

Training Loss: 0.2204





Test Accuracy: 88.87%


Epoch 27/50: 100%|██████████| 391/391 [00:33<00:00, 11.82it/s]

Training Loss: 0.2098





Test Accuracy: 89.17%


Epoch 28/50: 100%|██████████| 391/391 [00:33<00:00, 11.83it/s]

Training Loss: 0.2001





Test Accuracy: 89.28%


Epoch 29/50: 100%|██████████| 391/391 [00:33<00:00, 11.84it/s]

Training Loss: 0.1837





Test Accuracy: 88.89%


Epoch 30/50: 100%|██████████| 391/391 [00:33<00:00, 11.81it/s]

Training Loss: 0.1730





Test Accuracy: 90.40%


Epoch 31/50: 100%|██████████| 391/391 [00:33<00:00, 11.82it/s]

Training Loss: 0.1635





Test Accuracy: 89.05%


Epoch 32/50: 100%|██████████| 391/391 [00:32<00:00, 11.93it/s]

Training Loss: 0.1511





Test Accuracy: 91.35%


Epoch 33/50: 100%|██████████| 391/391 [00:32<00:00, 11.89it/s]

Training Loss: 0.1407





Test Accuracy: 91.31%


Epoch 34/50: 100%|██████████| 391/391 [00:32<00:00, 11.89it/s]

Training Loss: 0.1270





Test Accuracy: 91.12%


Epoch 35/50: 100%|██████████| 391/391 [00:32<00:00, 11.86it/s]

Training Loss: 0.1193





Test Accuracy: 90.87%


Epoch 36/50: 100%|██████████| 391/391 [00:32<00:00, 11.87it/s]

Training Loss: 0.1060





Test Accuracy: 91.38%


Epoch 37/50: 100%|██████████| 391/391 [00:33<00:00, 11.83it/s]

Training Loss: 0.0929





Test Accuracy: 92.46%


Epoch 38/50: 100%|██████████| 391/391 [00:33<00:00, 11.83it/s]

Training Loss: 0.0804





Test Accuracy: 92.10%


Epoch 39/50: 100%|██████████| 391/391 [00:32<00:00, 11.89it/s]

Training Loss: 0.0732





Test Accuracy: 91.95%


Epoch 40/50: 100%|██████████| 391/391 [00:32<00:00, 11.88it/s]

Training Loss: 0.0622





Test Accuracy: 92.48%


Epoch 41/50: 100%|██████████| 391/391 [00:32<00:00, 11.89it/s]

Training Loss: 0.0533





Test Accuracy: 92.17%


Epoch 42/50: 100%|██████████| 391/391 [00:32<00:00, 11.86it/s]

Training Loss: 0.0465





Test Accuracy: 92.51%


Epoch 43/50: 100%|██████████| 391/391 [00:32<00:00, 11.87it/s]

Training Loss: 0.0420





Test Accuracy: 92.67%


Epoch 44/50: 100%|██████████| 391/391 [00:32<00:00, 11.85it/s]

Training Loss: 0.0382





Test Accuracy: 93.06%


Epoch 45/50: 100%|██████████| 391/391 [00:33<00:00, 11.84it/s]

Training Loss: 0.0329





Test Accuracy: 92.99%


Epoch 46/50: 100%|██████████| 391/391 [00:33<00:00, 11.84it/s]

Training Loss: 0.0314





Test Accuracy: 93.08%


Epoch 47/50: 100%|██████████| 391/391 [00:33<00:00, 11.81it/s]

Training Loss: 0.0284





Test Accuracy: 93.09%


Epoch 48/50: 100%|██████████| 391/391 [00:33<00:00, 11.83it/s]

Training Loss: 0.0282





Test Accuracy: 93.14%


Epoch 49/50: 100%|██████████| 391/391 [00:32<00:00, 11.88it/s]

Training Loss: 0.0277





Test Accuracy: 93.22%


Epoch 50/50: 100%|██████████| 391/391 [00:32<00:00, 11.88it/s]

Training Loss: 0.0267





Test Accuracy: 93.12%
FLOPs and Parameters AFTER Training:
| module                      | #parameters or shape   | #flops     |
|:----------------------------|:-----------------------|:-----------|
| model                       | 2.76M                  | 0.407G     |
|  conv1                      |  0.864K                |  0.885M    |
|   conv1.weight              |   (32, 3, 3, 3)        |            |
|  bn1                        |  64                    |  0.164M    |
|   bn1.weight                |   (32,)                |            |
|   bn1.bias                  |   (32,)                |            |
|  layer1                     |  0.132M                |  0.136G    |
|   layer1.0                  |   57.728K              |   59.703M  |
|    layer1.0.conv1           |    18.432K             |    18.874M |
|    layer1.0.bn1             |    0.128K              |    0.328M  |
|    layer1.0.conv2           |    36.864K             |    37.749M |
|    layer1.0.bn2             |

In [4]:
!pip install gdown

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0


In [5]:
import gdown
gdown.download(f"https://drive.google.com/uc?id=19y5_-tSLzIjhjvT86NdqL0OB0h2b7b-s")

Downloading...
From: https://drive.google.com/uc?id=19y5_-tSLzIjhjvT86NdqL0OB0h2b7b-s
To: /kaggle/working/b13.PNG
100%|██████████| 610k/610k [00:00<00:00, 104MB/s]


'b13.PNG'

In [24]:
import torch
from torchvision import transforms
from PIL import Image

# Load the trained model
model = get_base_architecture()
model.load_state_dict(torch.load("/kaggle/working/cifar_last_model.pth"))
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define the transform pipeline for single image
transform_test_single = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

# Class names for CIFAR-10
class_names = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck'
]

# Preprocess function
def preprocess_image(image_path):
    image = Image.open(image_path).convert('RGB')
    image = transform_test_single(image)
    image = image.unsqueeze(0)
    return image

# Predict function
def predict(image_path, model, class_names):
    image = preprocess_image(image_path).to(device)
    with torch.no_grad():
        outputs = model(image)
        _, predicted_class = outputs.max(1)
    return class_names[predicted_class.item()]

# Test on a new image
image_path = "/kaggle/working/b13.PNG"  # Replace with your image path
predicted_label = predict(image_path, model, class_names)
print(f"Predicted Class: {predicted_label}")

Predicted Class: cat


  model.load_state_dict(torch.load("/kaggle/working/cifar_last_model.pth"))


## just test

In [12]:
# Define a basic residual block with GELU activation
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ConvBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.act = nn.GELU()
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.dropout = nn.Dropout(p=0.3)

        self.skip_connection = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.skip_connection = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        identity = self.skip_connection(x)
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.act(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.dropout(out)
        out += identity
        out = self.act(out)
        return out

# Define the base architecture for CIFAR-10
class BaseArchitecture(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(BaseArchitecture, self).__init__()
        self.in_channels = 32

        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.act = nn.GELU()
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(256, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride):
        strides = [stride] + [1] * (blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.act(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.avg_pool(out)
        out = torch.flatten(out, 1)
        out = self.fc(out)
        return out

In [13]:
def get_base_architecture():
    return BaseArchitecture(ConvBlock, [2, 2, 2])

In [14]:
# Training configurations
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False, num_workers=2)

Files already downloaded and verified
Files already downloaded and verified


In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = get_base_architecture().to(device)

# Calculate FLOPs and Params
dummy_input = torch.randn(1, 3, 32, 32).to(device)
flops = FlopCountAnalysis(model, dummy_input)
print("FLOPs and Parameters BEFORE Training:")
print(flop_count_table(flops))
print(f"Total Parameters: {sum(p.numel() for p in model.parameters())}")

# Loss, optimizer, scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=3e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)

FLOPs and Parameters BEFORE Training:
| module                      | #parameters or shape   | #flops     |
|:----------------------------|:-----------------------|:-----------|
| model                       | 2.76M                  | 0.407G     |
|  conv1                      |  0.864K                |  0.885M    |
|   conv1.weight              |   (32, 3, 3, 3)        |            |
|  bn1                        |  64                    |  0.164M    |
|   bn1.weight                |   (32,)                |            |
|   bn1.bias                  |   (32,)                |            |
|  layer1                     |  0.132M                |  0.136G    |
|   layer1.0                  |   57.728K              |   59.703M  |
|    layer1.0.conv1           |    18.432K             |    18.874M |
|    layer1.0.bn1             |    0.128K              |    0.328M  |
|    layer1.0.conv2           |    36.864K             |    37.749M |
|    layer1.0.bn2             |    0.128K           

In [16]:
# Training loop
def train(epoch):
    model.train()
    running_loss = 0.0
    for inputs, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{50}"):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    scheduler.step()
    print(f"Training Loss: {running_loss / len(train_loader):.4f}")

# Test loop
def test():
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    print(f"Test Accuracy: {100. * correct / total:.2f}%")

# Train and save model
for epoch in range(1):
    train(epoch)
    test()

torch.save(model.state_dict(), "just_test.pth")
print("MODEL SAVED")

Epoch 1/50: 100%|██████████| 391/391 [00:34<00:00, 11.42it/s]

Training Loss: 1.6605





Test Accuracy: 49.42%
MODEL SAVED


### Update 93 accuracy model. reduce parameters

In [14]:
import os
from tqdm import tqdm
import torch 
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import numpy as np
from fvcore.nn import FlopCountAnalysis, flop_count_table
from torchinfo import summary
from PIL import Image
import random

# Define a basic convolutional block with GELU activation
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ConvBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.act = nn.GELU()
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.dropout = nn.Dropout(p=0.2)

        self.skip_connection = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.skip_connection = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        identity = self.skip_connection(x)
        out = self.conv1(x)
        out = self.bn1(out)  # Batch Normalization
        out = self.act(out)
        out = self.conv2(out)
        out = self.bn2(out)  # Batch Normalization
        out = self.dropout(out)
        out += identity
        out = self.act(out)
        return out

# Define the base architecture for CIFAR-10
class BaseArchitecture(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(BaseArchitecture, self).__init__()
        self.in_channels = 32

        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.act = nn.GELU()
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(256, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride):
        strides = [stride] + [1] * (blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.act(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.avg_pool(out)
        out = torch.flatten(out, 1)
        out = self.fc(out)
        return out

def get_base_architecture():
    return BaseArchitecture(ConvBlock, [2, 2, 2])

# Training configurations
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # Color jitter
    transforms.RandomRotation(10),  # Random rotation
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False, num_workers=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = get_base_architecture().to(device)

# Calculate FLOPs and Params
dummy_input = torch.randn(1, 3, 32, 32).to(device)
flops = FlopCountAnalysis(model, dummy_input)
print("FLOPs and Parameters BEFORE Training:")
print(flop_count_table(flops))
print(f"Total Parameters: {sum(p.numel() for p in model.parameters())}")

# Loss, optimizer, scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=3e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)

# Training loop
def train(epoch):
    model.train()
    running_loss = 0.0
    for inputs, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{50}"):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    scheduler.step()
    print(f"Training Loss: {running_loss / len(train_loader):.4f}")

# Test loop
def test():
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    print(f"Test Accuracy: {100. * correct / total:.2f}%")

# Train and save model
for epoch in range(65):
    train(epoch)
    test()

print("FLOPs and Parameters AFTER Training:")
print(flop_count_table(flops))
print(f"Total Parameters AFTER Training: {sum(p.numel() for p in model.parameters())}")

torch.save(model.state_dict(), "cifar_umode_traintransofom.pth")
print("MODEL SAVED")

Files already downloaded and verified
Files already downloaded and verified
FLOPs and Parameters BEFORE Training:
| module                      | #parameters or shape   | #flops     |
|:----------------------------|:-----------------------|:-----------|
| model                       | 2.76M                  | 0.407G     |
|  conv1                      |  0.864K                |  0.885M    |
|   conv1.weight              |   (32, 3, 3, 3)        |            |
|  bn1                        |  64                    |  0.164M    |
|   bn1.weight                |   (32,)                |            |
|   bn1.bias                  |   (32,)                |            |
|  layer1                     |  0.132M                |  0.136G    |
|   layer1.0                  |   57.728K              |   59.703M  |
|    layer1.0.conv1           |    18.432K             |    18.874M |
|    layer1.0.bn1             |    0.128K              |    0.328M  |
|    layer1.0.conv2           |    36.864K    

Epoch 1/50: 100%|██████████| 391/391 [00:32<00:00, 12.01it/s]

Training Loss: 1.6937





Test Accuracy: 51.23%


Epoch 2/50: 100%|██████████| 391/391 [00:32<00:00, 11.93it/s]

Training Loss: 1.2072





Test Accuracy: 57.63%


Epoch 3/50: 100%|██████████| 391/391 [00:32<00:00, 11.99it/s]

Training Loss: 1.0116





Test Accuracy: 65.53%


Epoch 4/50: 100%|██████████| 391/391 [00:33<00:00, 11.83it/s]

Training Loss: 0.8551





Test Accuracy: 72.00%


Epoch 5/50: 100%|██████████| 391/391 [00:32<00:00, 11.96it/s]

Training Loss: 0.7422





Test Accuracy: 75.34%


Epoch 6/50: 100%|██████████| 391/391 [00:32<00:00, 11.87it/s]

Training Loss: 0.6785





Test Accuracy: 75.19%


Epoch 7/50: 100%|██████████| 391/391 [00:32<00:00, 11.93it/s]

Training Loss: 0.6315





Test Accuracy: 73.06%


Epoch 8/50: 100%|██████████| 391/391 [00:32<00:00, 11.86it/s]

Training Loss: 0.5960





Test Accuracy: 82.04%


Epoch 9/50: 100%|██████████| 391/391 [00:33<00:00, 11.77it/s]

Training Loss: 0.5651





Test Accuracy: 82.33%


Epoch 10/50: 100%|██████████| 391/391 [00:33<00:00, 11.79it/s]

Training Loss: 0.5391





Test Accuracy: 81.36%


Epoch 11/50: 100%|██████████| 391/391 [00:32<00:00, 11.93it/s]

Training Loss: 0.5131





Test Accuracy: 83.79%


Epoch 12/50: 100%|██████████| 391/391 [00:32<00:00, 11.85it/s]

Training Loss: 0.4950





Test Accuracy: 83.72%


Epoch 13/50: 100%|██████████| 391/391 [00:32<00:00, 11.87it/s]

Training Loss: 0.4747





Test Accuracy: 84.44%


Epoch 14/50: 100%|██████████| 391/391 [00:32<00:00, 11.95it/s]

Training Loss: 0.4577





Test Accuracy: 84.83%


Epoch 15/50: 100%|██████████| 391/391 [00:33<00:00, 11.83it/s]

Training Loss: 0.4417





Test Accuracy: 86.41%


Epoch 16/50: 100%|██████████| 391/391 [00:32<00:00, 11.91it/s]

Training Loss: 0.4291





Test Accuracy: 86.24%


Epoch 17/50: 100%|██████████| 391/391 [00:32<00:00, 11.94it/s]

Training Loss: 0.4152





Test Accuracy: 83.89%


Epoch 18/50: 100%|██████████| 391/391 [00:33<00:00, 11.81it/s]

Training Loss: 0.3963





Test Accuracy: 87.33%


Epoch 19/50: 100%|██████████| 391/391 [00:33<00:00, 11.82it/s]

Training Loss: 0.3873





Test Accuracy: 85.94%


Epoch 20/50: 100%|██████████| 391/391 [00:32<00:00, 11.86it/s]

Training Loss: 0.3721





Test Accuracy: 87.23%


Epoch 21/50: 100%|██████████| 391/391 [00:32<00:00, 11.87it/s]

Training Loss: 0.3682





Test Accuracy: 85.55%


Epoch 22/50: 100%|██████████| 391/391 [00:32<00:00, 11.92it/s]

Training Loss: 0.3505





Test Accuracy: 88.08%


Epoch 23/50: 100%|██████████| 391/391 [00:33<00:00, 11.73it/s]

Training Loss: 0.3382





Test Accuracy: 87.61%


Epoch 24/50: 100%|██████████| 391/391 [00:32<00:00, 11.91it/s]

Training Loss: 0.3252





Test Accuracy: 89.19%


Epoch 25/50: 100%|██████████| 391/391 [00:33<00:00, 11.85it/s]

Training Loss: 0.3154





Test Accuracy: 89.21%


Epoch 26/50: 100%|██████████| 391/391 [00:32<00:00, 11.91it/s]

Training Loss: 0.2998





Test Accuracy: 89.67%


Epoch 27/50: 100%|██████████| 391/391 [00:33<00:00, 11.84it/s]

Training Loss: 0.2864





Test Accuracy: 89.07%


Epoch 28/50: 100%|██████████| 391/391 [00:32<00:00, 11.87it/s]

Training Loss: 0.2754





Test Accuracy: 90.70%


Epoch 29/50: 100%|██████████| 391/391 [00:33<00:00, 11.81it/s]

Training Loss: 0.2661





Test Accuracy: 90.52%


Epoch 30/50: 100%|██████████| 391/391 [00:32<00:00, 11.94it/s]

Training Loss: 0.2508





Test Accuracy: 91.00%


Epoch 31/50: 100%|██████████| 391/391 [00:32<00:00, 11.90it/s]

Training Loss: 0.2351





Test Accuracy: 90.71%


Epoch 32/50: 100%|██████████| 391/391 [00:33<00:00, 11.80it/s]

Training Loss: 0.2245





Test Accuracy: 91.17%


Epoch 33/50: 100%|██████████| 391/391 [00:32<00:00, 11.88it/s]

Training Loss: 0.2115





Test Accuracy: 90.70%


Epoch 34/50: 100%|██████████| 391/391 [00:32<00:00, 11.95it/s]

Training Loss: 0.2012





Test Accuracy: 91.62%


Epoch 35/50: 100%|██████████| 391/391 [00:33<00:00, 11.76it/s]

Training Loss: 0.1837





Test Accuracy: 91.08%


Epoch 36/50: 100%|██████████| 391/391 [00:32<00:00, 11.88it/s]

Training Loss: 0.1732





Test Accuracy: 91.21%


Epoch 37/50: 100%|██████████| 391/391 [00:32<00:00, 11.87it/s]

Training Loss: 0.1594





Test Accuracy: 92.02%


Epoch 38/50: 100%|██████████| 391/391 [00:32<00:00, 11.90it/s]

Training Loss: 0.1485





Test Accuracy: 92.01%


Epoch 39/50: 100%|██████████| 391/391 [00:33<00:00, 11.84it/s]

Training Loss: 0.1354





Test Accuracy: 92.27%


Epoch 40/50: 100%|██████████| 391/391 [00:33<00:00, 11.82it/s]

Training Loss: 0.1216





Test Accuracy: 92.48%


Epoch 41/50: 100%|██████████| 391/391 [00:33<00:00, 11.80it/s]

Training Loss: 0.1113





Test Accuracy: 92.96%


Epoch 42/50: 100%|██████████| 391/391 [00:32<00:00, 11.90it/s]

Training Loss: 0.0969





Test Accuracy: 92.92%


Epoch 43/50: 100%|██████████| 391/391 [00:32<00:00, 11.86it/s]

Training Loss: 0.0938





Test Accuracy: 93.32%


Epoch 44/50: 100%|██████████| 391/391 [00:32<00:00, 11.90it/s]

Training Loss: 0.0844





Test Accuracy: 93.30%


Epoch 45/50: 100%|██████████| 391/391 [00:33<00:00, 11.85it/s]

Training Loss: 0.0784





Test Accuracy: 93.46%


Epoch 46/50: 100%|██████████| 391/391 [00:33<00:00, 11.81it/s]

Training Loss: 0.0724





Test Accuracy: 93.40%


Epoch 47/50: 100%|██████████| 391/391 [00:33<00:00, 11.82it/s]

Training Loss: 0.0663





Test Accuracy: 93.48%


Epoch 48/50: 100%|██████████| 391/391 [00:33<00:00, 11.79it/s]

Training Loss: 0.0663





Test Accuracy: 93.59%


Epoch 49/50: 100%|██████████| 391/391 [00:32<00:00, 11.88it/s]

Training Loss: 0.0655





Test Accuracy: 93.66%


Epoch 50/50: 100%|██████████| 391/391 [00:33<00:00, 11.77it/s]

Training Loss: 0.0628





Test Accuracy: 93.57%


Epoch 51/50: 100%|██████████| 391/391 [00:33<00:00, 11.82it/s]

Training Loss: 0.0640





Test Accuracy: 93.62%


Epoch 52/50: 100%|██████████| 391/391 [00:32<00:00, 11.87it/s]

Training Loss: 0.0646





Test Accuracy: 93.59%


Epoch 53/50: 100%|██████████| 391/391 [00:33<00:00, 11.83it/s]

Training Loss: 0.0634





Test Accuracy: 93.77%


Epoch 54/50: 100%|██████████| 391/391 [00:33<00:00, 11.84it/s]

Training Loss: 0.0641





Test Accuracy: 93.56%


Epoch 55/50: 100%|██████████| 391/391 [00:32<00:00, 11.88it/s]

Training Loss: 0.0649





Test Accuracy: 93.49%


Epoch 56/50: 100%|██████████| 391/391 [00:32<00:00, 11.88it/s]

Training Loss: 0.0664





Test Accuracy: 93.40%


Epoch 57/50: 100%|██████████| 391/391 [00:32<00:00, 11.87it/s]

Training Loss: 0.0701





Test Accuracy: 93.48%


Epoch 58/50: 100%|██████████| 391/391 [00:33<00:00, 11.84it/s]

Training Loss: 0.0719





Test Accuracy: 93.22%


Epoch 59/50: 100%|██████████| 391/391 [00:32<00:00, 11.86it/s]

Training Loss: 0.0771





Test Accuracy: 92.90%


Epoch 60/50: 100%|██████████| 391/391 [00:32<00:00, 11.93it/s]

Training Loss: 0.0885





Test Accuracy: 92.76%
FLOPs and Parameters AFTER Training:
| module                      | #parameters or shape   | #flops     |
|:----------------------------|:-----------------------|:-----------|
| model                       | 2.76M                  | 0.407G     |
|  conv1                      |  0.864K                |  0.885M    |
|   conv1.weight              |   (32, 3, 3, 3)        |            |
|  bn1                        |  64                    |  0.164M    |
|   bn1.weight                |   (32,)                |            |
|   bn1.bias                  |   (32,)                |            |
|  layer1                     |  0.132M                |  0.136G    |
|   layer1.0                  |   57.728K              |   59.703M  |
|    layer1.0.conv1           |    18.432K             |    18.874M |
|    layer1.0.bn1             |    0.128K              |    0.328M  |
|    layer1.0.conv2           |    36.864K             |    37.749M |
|    layer1.0.bn2             |

In [16]:
## GPT

import os
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import numpy as np
from fvcore.nn import FlopCountAnalysis, flop_count_table
from torchinfo import summary
from PIL import Image
import random

# Define a basic convolutional block with GELU activation
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, reduction=16):
        super(ConvBlock, self).__init__()
        # Depthwise convolution
        self.dw_conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=stride, padding=1, groups=in_channels, bias=False)
        self.bn1 = nn.BatchNorm2d(in_channels)
        # Pointwise convolution
        self.pw_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.act = nn.GELU()
        
        # Squeeze-and-Excitation block
        self.se = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(out_channels, out_channels // reduction, kernel_size=1),
            nn.GELU(),
            nn.Conv2d(out_channels // reduction, out_channels, kernel_size=1),
            nn.Sigmoid()
        )

        # Residual connection
        self.skip_connection = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.skip_connection = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        identity = self.skip_connection(x)
        out = self.dw_conv(x)
        out = self.bn1(out)
        out = self.act(out)
        out = self.pw_conv(out)
        out = self.bn2(out)
        out = self.act(out)
        
        # Apply SE Attention
        se_weight = self.se(out)
        out = out * se_weight
        
        out += identity  # Residual connection
        out = self.act(out)
        return out

# Define the updated BaseArchitecture with increased depth
class BaseArchitecture(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(BaseArchitecture, self).__init__()
        self.in_channels = 32

        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.act = nn.GELU()
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(256, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride):
        strides = [stride] + [1] * (blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.act(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.avg_pool(out)
        out = torch.flatten(out, 1)
        out = self.fc(out)
        return out

def get_updated_architecture():
    return BaseArchitecture(ConvBlock, [3, 3, 3])  # Increased depth

# Training configurations
# transform_train = transforms.Compose([
#     transforms.RandomHorizontalFlip(),
#     transforms.RandomCrop(32, padding=4),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
# ])

transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # Color jitter
    transforms.RandomRotation(10),  # Random rotation
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False, num_workers=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = get_updated_architecture().to(device)


# Calculate FLOPs and Params
dummy_input = torch.randn(1, 3, 32, 32).to(device)
flops = FlopCountAnalysis(model, dummy_input)
print("FLOPs and Parameters AFTER Enhancements:")
print(flop_count_table(flops))
print(f"Total Parameters: {sum(p.numel() for p in model.parameters())}")

# Loss, optimizer, scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=3e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)

# Training loop
def train(epoch):
    model.train()
    running_loss = 0.0
    for inputs, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{50}"):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    scheduler.step()
    print(f"Training Loss: {running_loss / len(train_loader):.4f}")

# Test loop
def test():
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    print(f"Test Accuracy: {100. * correct / total:.2f}%")

# Train and save model
for epoch in range(70):
    train(epoch)
    test()

print("FLOPs and Parameters AFTER Training:")
print(flop_count_table(flops))
print(f"Total Parameters AFTER Training: {sum(p.numel() for p in model.parameters())}")

torch.save(model.state_dict(), "cifar10_gpt_train.pth")
print("MODEL SAVED")

Files already downloaded and verified
Files already downloaded and verified
FLOPs and Parameters AFTER Enhancements:
| module                      | #parameters or shape   | #flops     |
|:----------------------------|:-----------------------|:-----------|
| model                       | 0.311M                 | 45.5M      |
|  conv1                      |  0.864K                |  0.885M    |
|   conv1.weight              |   (32, 3, 3, 3)        |            |
|  bn1                        |  64                    |  0.164M    |
|   bn1.weight                |   (32,)                |            |
|   bn1.bias                  |   (32,)                |            |
|  layer1                     |  16.3K                 |  16.386M   |
|   layer1.0                  |   5.284K               |   5.374M   |
|    layer1.0.dw_conv         |    0.288K              |    0.295M  |
|    layer1.0.bn1             |    64                  |    0.164M  |
|    layer1.0.pw_conv         |    2.048K  

Epoch 1/50: 100%|██████████| 391/391 [00:29<00:00, 13.22it/s]

Training Loss: 1.5472





Test Accuracy: 57.63%


Epoch 2/50: 100%|██████████| 391/391 [00:29<00:00, 13.21it/s]

Training Loss: 1.0450





Test Accuracy: 69.49%


Epoch 3/50: 100%|██████████| 391/391 [00:30<00:00, 13.01it/s]

Training Loss: 0.8457





Test Accuracy: 72.26%


Epoch 4/50: 100%|██████████| 391/391 [00:29<00:00, 13.24it/s]

Training Loss: 0.7397





Test Accuracy: 76.11%


Epoch 5/50: 100%|██████████| 391/391 [00:29<00:00, 13.05it/s]

Training Loss: 0.6739





Test Accuracy: 78.47%


Epoch 6/50: 100%|██████████| 391/391 [00:30<00:00, 13.02it/s]

Training Loss: 0.6326





Test Accuracy: 79.67%


Epoch 7/50: 100%|██████████| 391/391 [00:29<00:00, 13.16it/s]

Training Loss: 0.5993





Test Accuracy: 80.97%


Epoch 8/50: 100%|██████████| 391/391 [00:29<00:00, 13.18it/s]

Training Loss: 0.5713





Test Accuracy: 78.76%


Epoch 9/50: 100%|██████████| 391/391 [00:29<00:00, 13.10it/s]

Training Loss: 0.5466





Test Accuracy: 82.32%


Epoch 10/50: 100%|██████████| 391/391 [00:29<00:00, 13.15it/s]

Training Loss: 0.5270





Test Accuracy: 77.41%


Epoch 11/50: 100%|██████████| 391/391 [00:29<00:00, 13.26it/s]

Training Loss: 0.5145





Test Accuracy: 82.33%


Epoch 12/50: 100%|██████████| 391/391 [00:29<00:00, 13.27it/s]

Training Loss: 0.4948





Test Accuracy: 83.28%


Epoch 13/50: 100%|██████████| 391/391 [00:29<00:00, 13.36it/s]

Training Loss: 0.4849





Test Accuracy: 83.53%


Epoch 14/50: 100%|██████████| 391/391 [00:29<00:00, 13.45it/s]

Training Loss: 0.4731





Test Accuracy: 84.09%


Epoch 15/50: 100%|██████████| 391/391 [00:29<00:00, 13.14it/s]

Training Loss: 0.4551





Test Accuracy: 83.80%


Epoch 16/50: 100%|██████████| 391/391 [00:29<00:00, 13.24it/s]

Training Loss: 0.4477





Test Accuracy: 83.26%


Epoch 17/50: 100%|██████████| 391/391 [00:30<00:00, 13.00it/s]

Training Loss: 0.4389





Test Accuracy: 84.08%


Epoch 18/50: 100%|██████████| 391/391 [00:29<00:00, 13.29it/s]

Training Loss: 0.4249





Test Accuracy: 83.74%


Epoch 19/50: 100%|██████████| 391/391 [00:30<00:00, 12.97it/s]

Training Loss: 0.4111





Test Accuracy: 85.39%


Epoch 20/50: 100%|██████████| 391/391 [00:30<00:00, 12.98it/s]

Training Loss: 0.4061





Test Accuracy: 86.29%


Epoch 21/50: 100%|██████████| 391/391 [00:29<00:00, 13.23it/s]

Training Loss: 0.3908





Test Accuracy: 86.35%


Epoch 22/50: 100%|██████████| 391/391 [00:29<00:00, 13.16it/s]

Training Loss: 0.3848





Test Accuracy: 87.89%


Epoch 23/50: 100%|██████████| 391/391 [00:29<00:00, 13.29it/s]

Training Loss: 0.3779





Test Accuracy: 85.91%


Epoch 24/50: 100%|██████████| 391/391 [00:29<00:00, 13.27it/s]

Training Loss: 0.3655





Test Accuracy: 87.10%


Epoch 25/50: 100%|██████████| 391/391 [00:29<00:00, 13.24it/s]

Training Loss: 0.3509





Test Accuracy: 86.26%


Epoch 26/50: 100%|██████████| 391/391 [00:29<00:00, 13.24it/s]

Training Loss: 0.3450





Test Accuracy: 86.18%


Epoch 27/50: 100%|██████████| 391/391 [00:29<00:00, 13.20it/s]

Training Loss: 0.3317





Test Accuracy: 87.90%


Epoch 28/50: 100%|██████████| 391/391 [00:29<00:00, 13.12it/s]

Training Loss: 0.3235





Test Accuracy: 86.30%


Epoch 29/50: 100%|██████████| 391/391 [00:30<00:00, 13.00it/s]

Training Loss: 0.3090





Test Accuracy: 87.34%


Epoch 30/50: 100%|██████████| 391/391 [00:30<00:00, 13.01it/s]

Training Loss: 0.2973





Test Accuracy: 89.12%


Epoch 31/50: 100%|██████████| 391/391 [00:30<00:00, 12.86it/s]

Training Loss: 0.2860





Test Accuracy: 89.06%


Epoch 32/50: 100%|██████████| 391/391 [00:30<00:00, 12.93it/s]

Training Loss: 0.2775





Test Accuracy: 89.28%


Epoch 33/50: 100%|██████████| 391/391 [00:29<00:00, 13.07it/s]

Training Loss: 0.2639





Test Accuracy: 89.08%


Epoch 34/50: 100%|██████████| 391/391 [00:30<00:00, 12.81it/s]

Training Loss: 0.2514





Test Accuracy: 89.71%


Epoch 35/50: 100%|██████████| 391/391 [00:30<00:00, 12.76it/s]

Training Loss: 0.2423





Test Accuracy: 89.37%


Epoch 36/50: 100%|██████████| 391/391 [00:30<00:00, 12.89it/s]

Training Loss: 0.2282





Test Accuracy: 90.47%


Epoch 37/50: 100%|██████████| 391/391 [00:30<00:00, 12.94it/s]

Training Loss: 0.2165





Test Accuracy: 90.88%


Epoch 38/50: 100%|██████████| 391/391 [00:30<00:00, 12.97it/s]

Training Loss: 0.2034





Test Accuracy: 91.16%


Epoch 39/50: 100%|██████████| 391/391 [00:30<00:00, 12.90it/s]

Training Loss: 0.1930





Test Accuracy: 90.77%


Epoch 40/50: 100%|██████████| 391/391 [00:30<00:00, 13.00it/s]

Training Loss: 0.1784





Test Accuracy: 90.89%


Epoch 41/50: 100%|██████████| 391/391 [00:29<00:00, 13.04it/s]

Training Loss: 0.1722





Test Accuracy: 91.53%


Epoch 42/50: 100%|██████████| 391/391 [00:30<00:00, 13.02it/s]

Training Loss: 0.1591





Test Accuracy: 91.63%


Epoch 43/50: 100%|██████████| 391/391 [00:30<00:00, 12.64it/s]

Training Loss: 0.1476





Test Accuracy: 91.94%


Epoch 44/50: 100%|██████████| 391/391 [00:30<00:00, 12.78it/s]

Training Loss: 0.1403





Test Accuracy: 92.10%


Epoch 45/50: 100%|██████████| 391/391 [00:31<00:00, 12.41it/s]

Training Loss: 0.1294





Test Accuracy: 91.95%


Epoch 46/50: 100%|██████████| 391/391 [00:30<00:00, 12.81it/s]

Training Loss: 0.1254





Test Accuracy: 91.93%


Epoch 47/50: 100%|██████████| 391/391 [00:30<00:00, 12.88it/s]

Training Loss: 0.1211





Test Accuracy: 92.22%


Epoch 48/50: 100%|██████████| 391/391 [00:30<00:00, 12.75it/s]

Training Loss: 0.1171





Test Accuracy: 92.12%


Epoch 49/50: 100%|██████████| 391/391 [00:29<00:00, 13.07it/s]

Training Loss: 0.1140





Test Accuracy: 92.17%


Epoch 50/50: 100%|██████████| 391/391 [00:30<00:00, 13.02it/s]

Training Loss: 0.1123





Test Accuracy: 92.28%


Epoch 51/50: 100%|██████████| 391/391 [00:29<00:00, 13.09it/s]

Training Loss: 0.1134





Test Accuracy: 92.29%


Epoch 52/50: 100%|██████████| 391/391 [00:29<00:00, 13.06it/s]

Training Loss: 0.1131





Test Accuracy: 92.30%


Epoch 53/50: 100%|██████████| 391/391 [00:30<00:00, 13.01it/s]

Training Loss: 0.1125





Test Accuracy: 92.23%


Epoch 54/50: 100%|██████████| 391/391 [00:30<00:00, 13.01it/s]

Training Loss: 0.1152





Test Accuracy: 92.08%


Epoch 55/50: 100%|██████████| 391/391 [00:30<00:00, 13.03it/s]

Training Loss: 0.1155





Test Accuracy: 92.20%


Epoch 56/50: 100%|██████████| 391/391 [00:30<00:00, 12.87it/s]

Training Loss: 0.1158





Test Accuracy: 91.95%


Epoch 57/50: 100%|██████████| 391/391 [00:30<00:00, 12.96it/s]

Training Loss: 0.1203





Test Accuracy: 92.10%


Epoch 58/50: 100%|██████████| 391/391 [00:30<00:00, 12.87it/s]

Training Loss: 0.1258





Test Accuracy: 91.97%


Epoch 59/50: 100%|██████████| 391/391 [00:29<00:00, 13.06it/s]

Training Loss: 0.1299





Test Accuracy: 91.74%


Epoch 60/50: 100%|██████████| 391/391 [00:30<00:00, 12.66it/s]

Training Loss: 0.1440





Test Accuracy: 91.55%


Epoch 61/50: 100%|██████████| 391/391 [00:30<00:00, 12.69it/s]

Training Loss: 0.1552





Test Accuracy: 90.81%


Epoch 62/50: 100%|██████████| 391/391 [00:30<00:00, 12.94it/s]

Training Loss: 0.1691





Test Accuracy: 90.56%


Epoch 63/50: 100%|██████████| 391/391 [00:29<00:00, 13.18it/s]

Training Loss: 0.1900





Test Accuracy: 90.14%


Epoch 64/50: 100%|██████████| 391/391 [00:29<00:00, 13.15it/s]

Training Loss: 0.2045





Test Accuracy: 90.67%


Epoch 65/50: 100%|██████████| 391/391 [00:30<00:00, 13.03it/s]

Training Loss: 0.2232





Test Accuracy: 89.46%


Epoch 66/50: 100%|██████████| 391/391 [00:29<00:00, 13.17it/s]

Training Loss: 0.2327





Test Accuracy: 88.52%


Epoch 67/50:  91%|█████████ | 354/391 [00:27<00:02, 12.82it/s]


KeyboardInterrupt: 

## DLA

In [36]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class Bottleneck(nn.Module):
    def __init__(self, in_planes, growth_rate):
        super(Bottleneck, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(4*growth_rate)
        self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)

    def forward(self, x):
        out = self.conv1(F.relu(self.bn1(x)))
        out = self.conv2(F.relu(self.bn2(out)))
        out = torch.cat([out,x], 1)
        return out


class Transition(nn.Module):
    def __init__(self, in_planes, out_planes):
        super(Transition, self).__init__()
        self.bn = nn.BatchNorm2d(in_planes)
        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)

    def forward(self, x):
        out = self.conv(F.relu(self.bn(x)))
        out = F.avg_pool2d(out, 2)
        return out

class LightConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(LightConvBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.act = nn.GELU()
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.dropout = nn.Dropout(p=0.2)

        self.skip_connection = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.skip_connection = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        identity = self.skip_connection(x)
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.act(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.dropout(out)
        out += identity
        out = self.act(out)
        return out

class DenseNet(nn.Module):
    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
        super(DenseNet, self).__init__()
        self.growth_rate = growth_rate
        self.in_channels = 16  # Reduced initial channels

        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.act = nn.ReLU()


        # Crucial fix: correct input channels for dense1!
        self.dense1 = self._make_dense_layers(block, self.in_channels, nblocks[0])
        self.in_channels += nblocks[0] * self.growth_rate


        self.trans1 = Transition(self.in_channels, int(self.in_channels * reduction))
        self.in_channels = int(self.in_channels * reduction)


        self.dense2 = self._make_dense_layers(block, self.in_channels, nblocks[1])  # Corrected number of blocks
        self.in_channels += nblocks[1] * self.growth_rate
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(self.in_channels, num_classes)


    def _make_dense_layers(self, block, in_planes, nblock):
        layers = []
        for i in range(nblock):
            layers.append(block(in_planes, self.growth_rate))
            in_planes += self.growth_rate
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.act(out)
        out = self.dense1(out)
        out = self.trans1(out)
        out = self.dense2(out)
        out = self.avg_pool(self.act(out))
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out

class ExtremelyLightDenseNet(nn.Module):
    def __init__(self, block, nblocks, growth_rate=16, reduction=0.5, num_classes=10):
        super(ExtremelyLightDenseNet, self).__init__()
        self.growth_rate = growth_rate
        self.in_channels = 16 # Use 16 for initial channels

        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.act = nn.ReLU()

        # Crucial fix: Correct input channels for dense1!
        self.dense1 = self._make_dense_layers(block, self.in_channels, nblocks[0])
        self.in_channels += nblocks[0] * self.growth_rate


        self.trans1 = Transition(self.in_channels, int(self.in_channels * reduction))
        self.in_channels = int(self.in_channels * reduction)

        self.dense2 = self._make_dense_layers(block, self.in_channels, nblocks[1])
        self.in_channels += nblocks[1] * self.growth_rate
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(self.in_channels, num_classes)


    def _make_dense_layers(self, block, in_planes, nblock):
        layers = []
        for i in range(nblock):
            layers.append(block(in_planes, self.growth_rate))
            in_planes += self.growth_rate
        return nn.Sequential(*layers)

    
    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.act(out)
        out = self.dense1(out)
        out = self.trans1(out)
        out = self.dense2(out)
        out = self.avg_pool(self.act(out))
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out


def DenseNet121():
    return DenseNet(Bottleneck, [6,12,24,16], growth_rate=32)

def DenseNet169():
    return DenseNet(Bottleneck, [6,12,32,32], growth_rate=32)

def DenseNet201():
    return DenseNet(Bottleneck, [6,12,48,32], growth_rate=32)

def DenseNet161():
    return DenseNet(Bottleneck, [6,12,36,24], growth_rate=48)

def densenet_cifar():
    return DenseNet(Bottleneck, [6,12,24,16], growth_rate=12)

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class Root(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1):
        super(Root, self).__init__()
        self.conv = nn.Conv2d(
            in_channels, out_channels, kernel_size,
            stride=1, padding=(kernel_size - 1) // 2, bias=False)
        self.bn = nn.BatchNorm2d(out_channels)

    def forward(self, xs):
        x = torch.cat(xs, 1)
        out = F.relu(self.bn(self.conv(x)))
        return out


class Tree(nn.Module):
    def __init__(self, block, in_channels, out_channels, level=1, stride=1):
        super(Tree, self).__init__()
        self.root = Root(2*out_channels, out_channels)
        if level == 1:
            self.left_tree = block(in_channels, out_channels, stride=stride)
            self.right_tree = block(out_channels, out_channels, stride=1)
        else:
            self.left_tree = Tree(block, in_channels,
                                  out_channels, level=level-1, stride=stride)
            self.right_tree = Tree(block, out_channels,
                                   out_channels, level=level-1, stride=1)

    def forward(self, x):
        out1 = self.left_tree(x)
        out2 = self.right_tree(out1)
        out = self.root([out1, out2])
        return out


class SimpleDLA(nn.Module):
    def __init__(self, block=BasicBlock, num_classes=10):
        super(SimpleDLA, self).__init__()
        self.base = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(16),
            nn.ReLU(True)
        )

        self.layer1 = nn.Sequential(  # No need to repeat these
            nn.Conv2d(16, 16, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(16),
            nn.ReLU(True)
        )

        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(True)
        )


        # CRITICAL: Correct the Tree usage and define a working _forward_
        self.layer3 = Tree(block, 32, 32, level=1, stride=1)
        self.layer4 = Tree(block, 32, 64, level=2, stride=2)
        self.layer5 = Tree(block, 64, 128, level=2, stride=2)
        self.layer6 = Tree(block, 128, 256, level=1, stride=1)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.base(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.layer6(x)
        x = self.avg_pool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

In [4]:
import os
import sys
import time
import math
import shutil
import torch.nn as nn
import torch.nn.init as init

def get_mean_and_std(dataset):
    '''Compute the mean and std value of dataset.'''
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2)
    mean = torch.zeros(3)
    std = torch.zeros(3)
    print('==> Computing mean and std..')
    for inputs, targets in dataloader:
        for i in range(3):
            mean[i] += inputs[:,i,:,:].mean()
            std[i] += inputs[:,i,:,:].std()
    mean.div_(len(dataset))
    std.div_(len(dataset))
    return mean, std

def init_params(net):
    '''Init layer parameters.'''
    for m in net.modules():
        if isinstance(m, nn.Conv2d):
            init.kaiming_normal(m.weight, mode='fan_out')
            if m.bias is not None:
                init.constant(m.bias, 0)
        elif isinstance(m, nn.BatchNorm2d):
            init.constant(m.weight, 1)
            init.constant(m.bias, 0)
        elif isinstance(m, nn.Linear):
            init.normal(m.weight, std=1e-3)
            if m.bias is not None:
                init.constant(m.bias, 0)

# Get terminal width using shutil
term_width, _ = shutil.get_terminal_size()
TOTAL_BAR_LENGTH = 65.
last_time = time.time()
begin_time = last_time

def progress_bar(current, total, msg=None):
    global last_time, begin_time
    if current == 0:
        begin_time = time.time()  # Reset for new bar.

    cur_len = int(TOTAL_BAR_LENGTH*current/total)
    rest_len = int(TOTAL_BAR_LENGTH - cur_len) - 1

    sys.stdout.write(' [')
    for i in range(cur_len):
        sys.stdout.write('=')
    sys.stdout.write('>')
    for i in range(rest_len):
        sys.stdout.write('.')
    sys.stdout.write(']')

    cur_time = time.time()
    step_time = cur_time - last_time
    last_time = cur_time
    tot_time = cur_time - begin_time

    L = []
    L.append('  Step: %s' % format_time(step_time))
    L.append(' | Tot: %s' % format_time(tot_time))
    if msg:
        L.append(' | ' + msg)

    msg = ''.join(L)
    sys.stdout.write(msg)
    for i in range(term_width-int(TOTAL_BAR_LENGTH)-len(msg)-3):
        sys.stdout.write(' ')

    # Go back to the center of the bar.
    for i in range(term_width-int(TOTAL_BAR_LENGTH/2)+2):
        sys.stdout.write('\b')
    sys.stdout.write(' %d/%d ' % (current+1, total))

    if current < total-1:
        sys.stdout.write('\r')
    else:
        sys.stdout.write('\n')
    sys.stdout.flush()

def format_time(seconds):
    days = int(seconds / 3600/24)
    seconds = seconds - days*3600*24
    hours = int(seconds / 3600)
    seconds = seconds - hours*3600
    minutes = int(seconds / 60)
    seconds = seconds - minutes*60
    secondsf = int(seconds)
    seconds = seconds - secondsf
    millis = int(seconds*1000)

    f = ''
    i = 1
    if days > 0:
        f += str(days) + 'D'
        i += 1
    if hours > 0 and i <= 2:
        f += str(hours) + 'h'
        i += 1
    if minutes > 0 and i <= 2:
        f += str(minutes) + 'm'
        i += 1
    if secondsf > 0 and i <= 2:
        f += str(secondsf) + 's'
        i += 1
    if millis > 0 and i <= 2:
        f += str(millis) + 'ms'
        i += 1
    if f == '':
        f = '0ms'
    return f

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn

import torchvision
import torchvision.transforms as transforms

import os
import argparse

# from models import *
# from utils import progress_bar


device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_acc = 0  # best test accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch

# Data
print('==> Preparing data..')
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=128, shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=100, shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')

# Model
print('==> Building model..')
net = SimpleDLA()
print(f"Total Parameters: {sum(p.numel() for p in net.parameters())}")
# net = DenseNet121()
# net = ExtremelyLightDenseNet(LightConvBlock, [2, 4]).to(device)
net = net.to(device)
if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01,
                      momentum=0.9, weight_decay=5e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)


# Training
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        progress_bar(batch_idx, len(trainloader), 'Loss: %.3f |Train Acc: %.3f%% (%d/%d)'
                     % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))


def test(epoch):
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            progress_bar(batch_idx, len(testloader), 'Loss: %.3f |Test Acc: %.3f%% (%d/%d)'
                         % (test_loss/(batch_idx+1), 100.*correct/total, correct, total))

    # Save checkpoint.
    acc = 100.*correct/total
    if acc > best_acc:
        print('Saving..')
        state = {
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('/kaggle/working/checkpoint2'):
            os.mkdir('/kaggle/working/checkpoint2')
        torch.save(state, '/kaggle/working/checkpoint2/ckpt1.pth')
        best_acc = acc


# for epoch in range(start_epoch, start_epoch+1):
for epoch in range(1):
    train(epoch)
    test(epoch)
    scheduler.step()

==> Preparing data..
Files already downloaded and verified
Files already downloaded and verified
==> Building model..
Total Parameters: 3800698

Epoch: 0
Saving..


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class Root(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1):
        super(Root, self).__init__()
        self.conv = nn.Conv2d(
            in_channels, out_channels, kernel_size,
            stride=1, padding=(kernel_size - 1) // 2, bias=False)
        self.bn = nn.BatchNorm2d(out_channels)

    def forward(self, xs):
        x = torch.cat(xs, 1)
        out = F.relu(self.bn(self.conv(x)))
        return out


class Tree(nn.Module):
    def __init__(self, block, in_channels, out_channels, level=1, stride=1):
        super(Tree, self).__init__()
        self.root = Root(2*out_channels, out_channels)
        if level == 1:
            self.left_tree = block(in_channels, out_channels, stride=stride)
            self.right_tree = block(out_channels, out_channels, stride=1)
        else:
            self.left_tree = Tree(block, in_channels,
                                  out_channels, level=level-1, stride=stride)
            self.right_tree = Tree(block, out_channels,
                                   out_channels, level=level-1, stride=1)

    def forward(self, x):
        out1 = self.left_tree(x)
        out2 = self.right_tree(out1)
        out = self.root([out1, out2])
        return out


class SimpleDLA(nn.Module):
    def __init__(self, block=BasicBlock, num_classes=10):
        super(SimpleDLA, self).__init__()
        self.base = nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(8),
            nn.ReLU(True)
        )

        self.layer1 = nn.Sequential(
            nn.Conv2d(8, 8, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(8),
            nn.ReLU(True)
        )

        self.layer2 = nn.Sequential(
            nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(16),
            nn.ReLU(True)
        )

        self.layer3 = Tree(block, 16, 16, level=1, stride=1)
        self.layer4 = Tree(block, 16, 32, level=2, stride=2)
        self.layer5 = Tree(block, 32, 64, level=2, stride=2)
        self.layer6 = Tree(block, 64, 128, level=1, stride=1)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.base(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.layer6(x)
        x = self.avg_pool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn

import torchvision
import torchvision.transforms as transforms

import os
import argparse

device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_acc = 0  # best test accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch

# Data
print('==> Preparing data..')
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=128, shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=100, shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')

# Model
print('==> Building model..')
net = SimpleDLA()
print(f"Total Parameters: {sum(p.numel() for p in net.parameters())}")
net = net.to(device)
if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01,
                      momentum=0.9, weight_decay=5e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)


# Training
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Train Acc: %.3f%% (%d/%d)'
                     % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))


def test(epoch):
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Test Acc: %.3f%% (%d/%d)'
                         % (test_loss/(batch_idx+1), 100.*correct/total, correct, total))

    # Save checkpoint.
    acc = 100.*correct/total
    if acc > best_acc:
        print('Saving..')
        state = {
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('/kaggle/working/checkpoint2'):
            os.mkdir('/kaggle/working/checkpoint2')
        torch.save(state, '/kaggle/working/checkpoint2/ckpt_new_today.pth')
        best_acc = acc


for epoch in range(130):
    train(epoch)
    test(epoch)
    scheduler.step()

==> Preparing data..
Files already downloaded and verified
Files already downloaded and verified
==> Building model..
Total Parameters: 952962

Epoch: 0
Saving..

Epoch: 1
Saving..

Epoch: 2
Saving..

Epoch: 3
Saving..

Epoch: 4
Saving..

Epoch: 5
Saving..

Epoch: 6
Saving..

Epoch: 7
Saving..

Epoch: 8
Saving..

Epoch: 9

Epoch: 10

Epoch: 11

Epoch: 12

Epoch: 13
Saving..

Epoch: 14
Saving..

Epoch: 15
Saving..

Epoch: 16

Epoch: 17

Epoch: 18

Epoch: 19
Saving..

Epoch: 20

Epoch: 21
Saving..

Epoch: 22

Epoch: 23
Saving..

Epoch: 24

Epoch: 25

Epoch: 26

Epoch: 27
Saving..

Epoch: 28

Epoch: 29

Epoch: 30

Epoch: 31

Epoch: 32
Saving..

Epoch: 33

Epoch: 34

Epoch: 35

Epoch: 36
Saving..

Epoch: 37

Epoch: 38

Epoch: 39

Epoch: 40

Epoch: 41

Epoch: 42
Saving..

Epoch: 43
Saving..

Epoch: 44
Saving..

Epoch: 45

Epoch: 46

Epoch: 47

Epoch: 48
Saving..

Epoch: 49

Epoch: 50
Saving..

Epoch: 51

Epoch: 52
Saving..

Epoch: 53

Epoch: 54

Epoch: 55

Epoch: 56

Epoch: 57

Epoch: 58

E

## 96

In [4]:
import os
import sys
import uuid
from math import ceil

import torch
from torch import nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as T

torch.backends.cudnn.benchmark = True

hyp = {
    'opt': {
        'train_epochs': 37.0,
        'batch_size': 1024,
        'lr': 9.0,               # learning rate per 1024 examples
        'momentum': 0.85,
        'weight_decay': 0.012,   # weight decay per 1024 examples (decoupled from learning rate)
        'bias_scaler': 64.0,     # scales up learning rate (but not weight decay) for BatchNorm biases
        'label_smoothing': 0.2,
        'whiten_bias_epochs': 3, # how many epochs to train the whitening layer bias before freezing
    },
    'aug': {
        'flip': True,
        'translate': 4,
        'cutout': 12,
    },
    'net': {
        'widths': {
            'block1': 128,
            'block2': 384,
            'block3': 512,
        },
        'scaling_factor': 1/9,
        'tta_level': 2,         # the level of test-time augmentation: 0=none, 1=mirror, 2=mirror+translate
    },
}

#############################################
#                DataLoader                 #
#############################################

CIFAR_MEAN = torch.tensor((0.4914, 0.4822, 0.4465))
CIFAR_STD = torch.tensor((0.2470, 0.2435, 0.2616))

def batch_flip_lr(inputs):
    flip_mask = (torch.rand(len(inputs), device=inputs.device) < 0.5).view(-1, 1, 1, 1)
    return torch.where(flip_mask, inputs.flip(-1), inputs)

def batch_crop(images, crop_size):
    r = (images.size(-1) - crop_size)//2
    shifts = torch.randint(-r, r+1, size=(len(images), 2), device=images.device)
    images_out = torch.empty((len(images), 3, crop_size, crop_size), device=images.device, dtype=images.dtype)
    # The two cropping methods in this if-else produce equivalent results, but the second is faster for r > 2.
    if r <= 2:
        for sy in range(-r, r+1):
            for sx in range(-r, r+1):
                mask = (shifts[:, 0] == sy) & (shifts[:, 1] == sx)
                images_out[mask] = images[mask, :, r+sy:r+sy+crop_size, r+sx:r+sx+crop_size]
    else:
        images_tmp = torch.empty((len(images), 3, crop_size, crop_size+2*r), device=images.device, dtype=images.dtype)
        for s in range(-r, r+1):
            mask = (shifts[:, 0] == s)
            images_tmp[mask] = images[mask, :, r+s:r+s+crop_size, :]
        for s in range(-r, r+1):
            mask = (shifts[:, 1] == s)
            images_out[mask] = images_tmp[mask, :, :, r+s:r+s+crop_size]
    return images_out

def make_random_square_masks(inputs, size):
    is_even = int(size % 2 == 0)
    n,c,h,w = inputs.shape

    # seed top-left corners of squares to cutout boxes from, in one dimension each
    corner_y = torch.randint(0, h-size+1, size=(n,), device=inputs.device)
    corner_x = torch.randint(0, w-size+1, size=(n,), device=inputs.device)

    # measure distance, using the center as a reference point
    corner_y_dists = torch.arange(h, device=inputs.device).view(1, 1, h, 1) - corner_y.view(-1, 1, 1, 1)
    corner_x_dists = torch.arange(w, device=inputs.device).view(1, 1, 1, w) - corner_x.view(-1, 1, 1, 1)

    mask_y = (corner_y_dists >= 0) * (corner_y_dists < size)
    mask_x = (corner_x_dists >= 0) * (corner_x_dists < size)

    final_mask = mask_y * mask_x

    return final_mask

def batch_cutout(inputs, size):
    cutout_masks = make_random_square_masks(inputs, size)
    return inputs.masked_fill(cutout_masks, 0)

class CifarLoader:

    def __init__(self, path, train=True, batch_size=500, aug=None, drop_last=None, shuffle=None, gpu=0):
        data_path = os.path.join(path, 'train.pt' if train else 'test.pt')
        if not os.path.exists(data_path):
            dset = torchvision.datasets.CIFAR10(path, download=True, train=train)
            images = torch.tensor(dset.data)
            labels = torch.tensor(dset.targets)
            torch.save({'images': images, 'labels': labels, 'classes': dset.classes}, data_path)

        data = torch.load(data_path, map_location=torch.device(gpu))
        self.images, self.labels, self.classes = data['images'], data['labels'], data['classes']
        # It's faster to load+process uint8 data than to load preprocessed fp16 data
        self.images = (self.images.half() / 255).permute(0, 3, 1, 2).to(memory_format=torch.channels_last)

        self.normalize = T.Normalize(CIFAR_MEAN, CIFAR_STD)
        self.proc_images = {} # Saved results of image processing to be done on the first epoch
        self.epoch = 0

        self.aug = aug or {}
        for k in self.aug.keys():
            assert k in ['flip', 'translate', 'cutout'], 'Unrecognized key: %s' % k

        self.batch_size = batch_size
        self.drop_last = train if drop_last is None else drop_last
        self.shuffle = train if shuffle is None else shuffle

    def __len__(self):
        return len(self.images)//self.batch_size if self.drop_last else ceil(len(self.images)/self.batch_size)

    def __iter__(self):

        if self.epoch == 0:
            images = self.proc_images['norm'] = self.normalize(self.images)
            # Pre-flip images in order to do every-other epoch flipping scheme
            if self.aug.get('flip', False):
                images = self.proc_images['flip'] = batch_flip_lr(images)
            # Pre-pad images to save time when doing random translation
            pad = self.aug.get('translate', 0)
            if pad > 0:
                self.proc_images['pad'] = F.pad(images, (pad,)*4, 'reflect')

        if self.aug.get('translate', 0) > 0:
            images = batch_crop(self.proc_images['pad'], self.images.shape[-2])
        elif self.aug.get('flip', False):
            images = self.proc_images['flip']
        else:
            images = self.proc_images['norm']
        # Flip all images together every other epoch. This increases diversity relative to random flipping
        if self.aug.get('flip', False):
            if self.epoch % 2 == 1:
                images = images.flip(-1)
        if self.aug.get('cutout', 0) > 0:
            images = batch_cutout(images, self.aug['cutout'])

        self.epoch += 1

        indices = (torch.randperm if self.shuffle else torch.arange)(len(images), device=images.device)
        for i in range(len(self)):
            idxs = indices[i*self.batch_size:(i+1)*self.batch_size]
            yield (images[idxs], self.labels[idxs])

#############################################
#            Network Components             #
#############################################

class Flatten(nn.Module):
    def forward(self, x):
        return x.view(x.size(0), -1)

class Mul(nn.Module):
    def __init__(self, scale):
        super().__init__()
        self.scale = scale
    def forward(self, x):
        return x * self.scale

class BatchNorm(nn.BatchNorm2d):
    def __init__(self, num_features, eps=1e-12,
                 weight=False, bias=True):
        super().__init__(num_features, eps=eps)
        self.weight.requires_grad = weight
        self.bias.requires_grad = bias
        # Note that PyTorch already initializes the weights to one and bias to zero

class Conv(nn.Conv2d):
    def __init__(self, in_channels, out_channels, kernel_size=3, padding='same', bias=False):
        super().__init__(in_channels, out_channels, kernel_size=kernel_size, padding=padding, bias=bias)

    def reset_parameters(self):
        super().reset_parameters()
        if self.bias is not None:
            self.bias.data.zero_()
        w = self.weight.data
        torch.nn.init.dirac_(w[:w.size(1)])

class ConvGroup(nn.Module):
    def __init__(self, channels_in, channels_out):
        super().__init__()
        self.conv1 = Conv(channels_in,  channels_out)
        self.pool = nn.MaxPool2d(2)
        self.norm1 = BatchNorm(channels_out)
        self.conv2 = Conv(channels_out, channels_out)
        self.norm2 = BatchNorm(channels_out)
        self.conv3 = Conv(channels_out, channels_out)
        self.norm3 = BatchNorm(channels_out)
        self.activ = nn.GELU()

    def forward(self, x):
        x = self.conv1(x)
        x = self.pool(x)
        x = self.norm1(x)
        x = self.activ(x)
        x0 = x
        x = self.conv2(x)
        x = self.norm2(x)
        x = self.activ(x)
        x = self.conv3(x)
        x = self.norm3(x)
        x = x + x0
        x = self.activ(x)
        return x

#############################################
#            Network Definition             #
#############################################

def make_net():
    widths = hyp['net']['widths']
    whiten_kernel_size = 2
    whiten_width = 2 * 3 * whiten_kernel_size**2
    net = nn.Sequential(
        Conv(3, whiten_width, whiten_kernel_size, padding=0, bias=True),
        nn.GELU(),
        ConvGroup(whiten_width,     widths['block1']),
        ConvGroup(widths['block1'], widths['block2']),
        ConvGroup(widths['block2'], widths['block3']),
        nn.MaxPool2d(3),
        Flatten(),
        nn.Linear(widths['block3'], 10, bias=False),
        Mul(hyp['net']['scaling_factor']),
    )
    net[0].weight.requires_grad = False
    net = net.half().cuda()
    net = net.to(memory_format=torch.channels_last)
    for mod in net.modules():
        if isinstance(mod, BatchNorm):
            mod.float()
    return net

#############################################
#       Whitening Conv Initialization       #
#############################################

def get_patches(x, patch_shape):
    c, (h, w) = x.shape[1], patch_shape
    return x.unfold(2,h,1).unfold(3,w,1).transpose(1,3).reshape(-1,c,h,w).float()

def get_whitening_parameters(patches):
    n,c,h,w = patches.shape
    patches_flat = patches.view(n, -1)
    est_patch_covariance = (patches_flat.T @ patches_flat) / n
    eigenvalues, eigenvectors = torch.linalg.eigh(est_patch_covariance, UPLO='U')
    return eigenvalues.flip(0).view(-1, 1, 1, 1), eigenvectors.T.reshape(c*h*w,c,h,w).flip(0)

def init_whitening_conv(layer, train_set, eps=5e-4):
    patches = get_patches(train_set, patch_shape=layer.weight.data.shape[2:])
    eigenvalues, eigenvectors = get_whitening_parameters(patches)
    eigenvectors_scaled = eigenvectors / torch.sqrt(eigenvalues + eps)
    layer.weight.data[:] = torch.cat((eigenvectors_scaled, -eigenvectors_scaled))

############################################
#                Lookahead                 #
############################################

class LookaheadState:
    def __init__(self, net):
        self.net_ema = {k: v.clone() for k, v in net.state_dict().items()}

    def update(self, net, decay):
        for ema_param, net_param in zip(self.net_ema.values(), net.state_dict().values()):
            if net_param.dtype in (torch.half, torch.float):
                ema_param.lerp_(net_param, 1-decay)
                net_param.copy_(ema_param)

############################################
#                 Logging                  #
############################################

def print_columns(columns_list, is_head=False, is_final_entry=False):
    print_string = ''
    for col in columns_list:
        print_string += '|  %s  ' % col
    print_string += '|'
    if is_head:
        print('-'*len(print_string))
    print(print_string)
    if is_head or is_final_entry:
        print('-'*len(print_string))

logging_columns_list = ['run   ', 'epoch', 'train_loss', 'train_acc', 'val_acc', 'tta_val_acc', 'total_time_seconds']
def print_training_details(variables, is_final_entry):
    formatted = []
    for col in logging_columns_list:
        var = variables.get(col.strip(), None)
        if type(var) in (int, str):
            res = str(var)
        elif type(var) is float:
            res = '{:0.4f}'.format(var)
        else:
            assert var is None
            res = ''
        formatted.append(res.rjust(len(col)))
    print_columns(formatted, is_final_entry=is_final_entry)

############################################
#               Evaluation                 #
############################################

def infer(model, loader, tta_level=0):

    # Test-time augmentation strategy (for tta_level=2):
    # 1. Flip/mirror the image left-to-right (50% of the time).
    # 2. Translate the image by one pixel either up-and-left or down-and-right (50% of the time,
    #    i.e. both happen 25% of the time).
    #
    # This creates 6 views per image (left/right times the two translations and no-translation),
    # which we evaluate and then weight according to the given probabilities.

    def infer_basic(inputs, net):
        return net(inputs).clone()

    def infer_mirror(inputs, net):
        return 0.5 * net(inputs) + 0.5 * net(inputs.flip(-1))

    def infer_mirror_translate(inputs, net):
        logits = infer_mirror(inputs, net)
        pad = 1
        padded_inputs = F.pad(inputs, (pad,)*4, 'reflect')
        inputs_translate_list = [
            padded_inputs[:, :, 0:32, 0:32],
            padded_inputs[:, :, 2:34, 2:34],
        ]
        logits_translate_list = [infer_mirror(inputs_translate, net)
                                 for inputs_translate in inputs_translate_list]
        logits_translate = torch.stack(logits_translate_list).mean(0)
        return 0.5 * logits + 0.5 * logits_translate

    model.eval()
    test_images = loader.normalize(loader.images)
    infer_fn = [infer_basic, infer_mirror, infer_mirror_translate][tta_level]
    with torch.no_grad():
        return torch.cat([infer_fn(inputs, model) for inputs in test_images.split(2000)])

def evaluate(model, loader, tta_level=0):
    logits = infer(model, loader, tta_level)
    return (logits.argmax(1) == loader.labels).float().mean().item()

############################################
#                Training                  #
############################################

def main(run):

    batch_size = hyp['opt']['batch_size']
    epochs = hyp['opt']['train_epochs']
    momentum = hyp['opt']['momentum']
    # Assuming gradients are constant in time, for Nesterov momentum, the below ratio is how much
    # larger the default steps will be than the underlying per-example gradients. We divide the
    # learning rate by this ratio in order to ensure steps are the same scale as gradients, regardless
    # of the choice of momentum.
    kilostep_scale = 1024 * (1 + 1 / (1 - momentum))
    lr = hyp['opt']['lr'] / kilostep_scale # un-decoupled learning rate for PyTorch SGD
    wd = hyp['opt']['weight_decay'] * batch_size / kilostep_scale
    lr_biases = lr * hyp['opt']['bias_scaler']

    loss_fn = nn.CrossEntropyLoss(label_smoothing=hyp['opt']['label_smoothing'], reduction='none')
    test_loader = CifarLoader('cifar10', train=False, batch_size=2000)
    train_loader = CifarLoader('cifar10', train=True, batch_size=batch_size, aug=hyp['aug'])
    if run == 'warmup':
        # The only purpose of the first run is to warmup, so we can use dummy data
        train_loader.labels = torch.randint(0, 10, size=(len(train_loader.labels),), device=train_loader.labels.device)
    total_train_steps = ceil(len(train_loader) * epochs)

    model = make_net()
    current_steps = 0

    norm_biases = [p for k, p in model.named_parameters() if 'norm' in k and p.requires_grad]
    other_params = [p for k, p in model.named_parameters() if 'norm' not in k and p.requires_grad]
    param_configs = [dict(params=norm_biases, lr=lr_biases, weight_decay=wd/lr_biases),
                     dict(params=other_params, lr=lr, weight_decay=wd/lr)]
    optimizer = torch.optim.SGD(param_configs, momentum=momentum, nesterov=True)

    def get_lr(step):
        warmup_steps = int(total_train_steps * 0.1)
        warmdown_steps = total_train_steps - warmup_steps
        if step < warmup_steps:
            frac = step / warmup_steps
            return 0.2 * (1 - frac) + 1.0 * frac
        else:
            frac = (total_train_steps - step) / warmdown_steps
            return frac
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, get_lr)

    alpha_schedule = 0.95**5 * (torch.arange(total_train_steps+1) / total_train_steps)**3
    lookahead_state = LookaheadState(model)

    # For accurately timing GPU code
    starter = torch.cuda.Event(enable_timing=True)
    ender = torch.cuda.Event(enable_timing=True)
    total_time_seconds = 0.0

    # Initialize the whitening layer using training images
    starter.record()
    train_images = train_loader.normalize(train_loader.images[:5000])
    init_whitening_conv(model[0], train_images)
    ender.record()
    torch.cuda.synchronize()
    total_time_seconds += 1e-3 * starter.elapsed_time(ender)

    for epoch in range(ceil(epochs)):

        model[0].bias.requires_grad = (epoch < hyp['opt']['whiten_bias_epochs'])

        ####################
        #     Training     #
        ####################

        starter.record()

        model.train()
        for inputs, labels in train_loader:

            outputs = model(inputs)
            loss = loss_fn(outputs, labels).sum()
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()
            scheduler.step()

            current_steps += 1

            if current_steps % 5 == 0:
                lookahead_state.update(model, decay=alpha_schedule[current_steps].item())

            if current_steps >= total_train_steps:
                if lookahead_state is not None:
                    lookahead_state.update(model, decay=1.0)
                break

        ender.record()
        torch.cuda.synchronize()
        total_time_seconds += 1e-3 * starter.elapsed_time(ender)

        ####################
        #    Evaluation    #
        ####################

        # Save the accuracy and loss from the last training batch of the epoch
        train_acc = (outputs.detach().argmax(1) == labels).float().mean().item()
        train_loss = loss.item() / batch_size
        val_acc = evaluate(model, test_loader, tta_level=0)
        print_training_details(locals(), is_final_entry=False)
        run = None # Only print the run number once

    ####################
    #  TTA Evaluation  #
    ####################

    starter.record()
    tta_val_acc = evaluate(model, test_loader, tta_level=hyp['net']['tta_level'])
    ender.record()
    torch.cuda.synchronize()
    total_time_seconds += 1e-3 * starter.elapsed_time(ender)

    epoch = 'eval'
    print_training_details(locals(), is_final_entry=True)

    return tta_val_acc

if __name__ == "__main__":
    with open(sys.argv[0]) as f:
        code = f.read()

    print_columns(logging_columns_list, is_head=True)
    #main('warmup')
    accs = torch.tensor([main(run) for run in range(25)])
    print('Mean: %.4f    Std: %.4f' % (accs.mean(), accs.std()))

    log = {'code': code, 'accs': accs}
    log_dir = os.path.join('logs', str(uuid.uuid4()))
    os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, 'log.pt')
    print(os.path.abspath(log_path))
    torch.save(log, os.path.join(log_dir, 'log.pt'))

------------------------------------------------------------------------------------------------------
|  run     |  epoch  |  train_loss  |  train_acc  |  val_acc  |  tta_val_acc  |  total_time_seconds  |
------------------------------------------------------------------------------------------------------
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to cifar10/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:02<00:00, 63600621.13it/s]


Extracting cifar10/cifar-10-python.tar.gz to cifar10


  data = torch.load(data_path, map_location=torch.device(gpu))


Files already downloaded and verified
|       0  |      0  |      1.6826  |     0.5381  |   0.5500  |               |             11.1038  |
|          |      1  |      1.3838  |     0.7256  |   0.6558  |               |             19.2859  |
|          |      2  |      1.3447  |     0.7520  |   0.7455  |               |             27.7728  |
|          |      3  |      1.2832  |     0.7842  |   0.7823  |               |             36.0952  |
|          |      4  |      1.2422  |     0.7988  |   0.8261  |               |             44.4373  |
|          |      5  |      1.2373  |     0.7930  |   0.7957  |               |             52.9464  |
|          |      6  |      1.1963  |     0.8184  |   0.8061  |               |             61.6216  |
|          |      7  |      1.2090  |     0.8252  |   0.8142  |               |             70.3833  |
|          |      8  |      1.2197  |     0.8193  |   0.8071  |               |             79.2256  |
|          |      9  |      1.1963 

KeyboardInterrupt: 

In [42]:
import os
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import numpy as np
from fvcore.nn import FlopCountAnalysis, flop_count_table
from torchinfo import summary
from PIL import Image

# Define the ResidualBlock as an updated architecture
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out

class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=10):
        super(ResNet, self).__init__()
        self.in_channels = 64
        self.conv = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self.make_layer(block, 64, layers[0])
        self.layer2 = self.make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self.make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self.make_layer(block, 512, layers[3], stride=2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def make_layer(self, block, out_channels, blocks, stride=1):
        downsample = None
        if stride != 1 or self.in_channels != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )
        layers = []
        layers.append(block(self.in_channels, out_channels, stride, downsample))
        self.in_channels = out_channels
        for _ in range(1, blocks):
            layers.append(block(out_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avg_pool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

# Training configurations
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False, num_workers=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the ResNet model
model = ResNet(ResidualBlock, [2, 2, 2, 2]).to(device)

# Calculate FLOPs and Params
dummy_input = torch.randn(1, 3, 32, 32).to(device)
flops = FlopCountAnalysis(model, dummy_input)
print("FLOPs and Parameters BEFORE Training:")
print(flop_count_table(flops))
print(f"Total Parameters: {sum(p.numel() for p in model.parameters())}")

# Loss, optimizer, scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)

# Training loop
def train(epoch):
    model.train()
    running_loss = 0.0
    for inputs, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{50}"):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Training Loss: {running_loss / len(train_loader):.4f}")

# Test loop
def test():
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    print(f"Test Accuracy: {100. * correct / total:.2f}%")

# Train and save model
for epoch in range(1):
    train(epoch)
    test()

print("FLOPs and Parameters AFTER Training:")
print(flop_count_table(flops))
print(f"Total Parameters AFTER Training: {sum(p.numel() for p in model.parameters())}")

torch.save(model.state_dict(), "cifar_resnet_model.pth")
print("MODEL SAVED")

Files already downloaded and verified
Files already downloaded and verified
FLOPs and Parameters BEFORE Training:
| module                 | #parameters or shape   | #flops     |
|:-----------------------|:-----------------------|:-----------|
| model                  | 11.174M                | 0.559G     |
|  conv                  |  1.728K                |  1.769M    |
|   conv.weight          |   (64, 3, 3, 3)        |            |
|  bn                    |  0.128K                |  0.328M    |
|   bn.weight            |   (64,)                |            |
|   bn.bias              |   (64,)                |            |
|  layer1                |  0.148M                |  0.152G    |
|   layer1.0             |   73.984K              |   76.153M  |
|    layer1.0.conv1      |    36.864K             |    37.749M |
|    layer1.0.bn1        |    0.128K              |    0.328M  |
|    layer1.0.conv2      |    36.864K             |    37.749M |
|    layer1.0.bn2        |    0.128K     

Epoch 1/50: 100%|██████████| 391/391 [00:42<00:00,  9.20it/s]

Training Loss: 1.4544





Test Accuracy: 57.75%
FLOPs and Parameters AFTER Training:
| module                 | #parameters or shape   | #flops     |
|:-----------------------|:-----------------------|:-----------|
| model                  | 11.174M                | 0.559G     |
|  conv                  |  1.728K                |  1.769M    |
|   conv.weight          |   (64, 3, 3, 3)        |            |
|  bn                    |  0.128K                |  0.328M    |
|   bn.weight            |   (64,)                |            |
|   bn.bias              |   (64,)                |            |
|  layer1                |  0.148M                |  0.152G    |
|   layer1.0             |   73.984K              |   76.153M  |
|    layer1.0.conv1      |    36.864K             |    37.749M |
|    layer1.0.bn1        |    0.128K              |    0.328M  |
|    layer1.0.conv2      |    36.864K             |    37.749M |
|    layer1.0.bn2        |    0.128K              |    0.328M  |
|   layer1.1             |   73

In [5]:
import os
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import numpy as np
from fvcore.nn import FlopCountAnalysis, flop_count_table

# Define a lightweight ResidualBlock
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out

class MidResNet(nn.Module):
    def __init__(self, block, layers, num_classes=10):
        super(MidResNet, self).__init__()
        self.in_channels = 32  # Increased from 16 to 32
        self.conv = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn = nn.BatchNorm2d(32)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self.make_layer(block, 64, layers[0])  # Increased channels
        self.layer2 = self.make_layer(block, 128, layers[1], stride=2)  # Increased channels
        self.layer3 = self.make_layer(block, 256, layers[2], stride=2)  # Increased channels
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(256, num_classes)  # Increased fully connected input size

    def make_layer(self, block, out_channels, blocks, stride=1):
        downsample = None
        if stride != 1 or self.in_channels != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )
        layers = []
        layers.append(block(self.in_channels, out_channels, stride, downsample))
        self.in_channels = out_channels
        for _ in range(1, blocks):
            layers.append(block(out_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.avg_pool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

# Training configurations
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False, num_workers=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the modified ResNet model
model = MidResNet(ResidualBlock, [3, 3, 3], num_classes=10).to(device)

# Calculate FLOPs and Params
dummy_input = torch.randn(1, 3, 32, 32).to(device)
flops = FlopCountAnalysis(model, dummy_input)
print("FLOPs and Parameters BEFORE Training:")
print(flop_count_table(flops))
print(f"Total Parameters: {sum(p.numel() for p in model.parameters())}")

# Loss, optimizer, scheduler
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)

# Training loop
def train(epoch):
    model.train()
    running_loss = 0.0
    for inputs, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{50}"):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Training Loss: {running_loss / len(train_loader):.4f}")

# Test loop
def test():
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    print(f"Test Accuracy: {100. * correct / total:.2f}%")

# Train and save model
for epoch in range(70):
    train(epoch)
    test()

print("FLOPs and Parameters AFTER Training:")
print(flop_count_table(flops))
print(f"Total Parameters AFTER Training: {sum(p.numel() for p in model.parameters())}")

torch.save(model.state_dict(), "cifar_resnet_mid_mode1.pth")
print("MODEL SAVED")

Files already downloaded and verified
Files already downloaded and verified
FLOPs and Parameters BEFORE Training:
| module                 | #parameters or shape   | #flops     |
|:-----------------------|:-----------------------|:-----------|
| model                  | 4.311M                 | 0.634G     |
|  conv                  |  0.864K                |  0.885M    |
|   conv.weight          |   (32, 3, 3, 3)        |            |
|  bn                    |  64                    |  0.164M    |
|   bn.weight            |   (32,)                |            |
|   bn.bias              |   (32,)                |            |
|  layer1                |  0.206M                |  0.212G    |
|   layer1.0             |   57.728K              |   59.703M  |
|    layer1.0.conv1      |    18.432K             |    18.874M |
|    layer1.0.bn1        |    0.128K              |    0.328M  |
|    layer1.0.conv2      |    36.864K             |    37.749M |
|    layer1.0.bn2        |    0.128K     

Epoch 1/50: 100%|██████████| 391/391 [00:45<00:00,  8.59it/s]

Training Loss: 1.4356





Test Accuracy: 52.32%


Epoch 2/50: 100%|██████████| 391/391 [00:52<00:00,  7.49it/s]

Training Loss: 0.9636





Test Accuracy: 67.77%


Epoch 3/50: 100%|██████████| 391/391 [00:51<00:00,  7.65it/s]

Training Loss: 0.7870





Test Accuracy: 71.46%


Epoch 4/50: 100%|██████████| 391/391 [00:51<00:00,  7.53it/s]

Training Loss: 0.6537





Test Accuracy: 76.06%


Epoch 5/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.5691





Test Accuracy: 77.64%


Epoch 6/50: 100%|██████████| 391/391 [00:51<00:00,  7.59it/s]

Training Loss: 0.5010





Test Accuracy: 79.62%


Epoch 7/50: 100%|██████████| 391/391 [00:51<00:00,  7.59it/s]

Training Loss: 0.4482





Test Accuracy: 80.76%


Epoch 8/50: 100%|██████████| 391/391 [00:51<00:00,  7.57it/s]

Training Loss: 0.4120





Test Accuracy: 82.50%


Epoch 9/50: 100%|██████████| 391/391 [00:51<00:00,  7.54it/s]

Training Loss: 0.3834





Test Accuracy: 82.37%


Epoch 10/50: 100%|██████████| 391/391 [00:51<00:00,  7.54it/s]

Training Loss: 0.3453





Test Accuracy: 82.15%


Epoch 11/50: 100%|██████████| 391/391 [00:51<00:00,  7.52it/s]

Training Loss: 0.3185





Test Accuracy: 83.63%


Epoch 12/50: 100%|██████████| 391/391 [00:51<00:00,  7.58it/s]

Training Loss: 0.2952





Test Accuracy: 84.52%


Epoch 13/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.2811





Test Accuracy: 87.33%


Epoch 14/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.2564





Test Accuracy: 86.65%


Epoch 15/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.2370





Test Accuracy: 86.99%


Epoch 16/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.2267





Test Accuracy: 85.34%


Epoch 17/50: 100%|██████████| 391/391 [00:51<00:00,  7.57it/s]

Training Loss: 0.2071





Test Accuracy: 88.33%


Epoch 18/50: 100%|██████████| 391/391 [00:52<00:00,  7.51it/s]

Training Loss: 0.1943





Test Accuracy: 89.14%


Epoch 19/50: 100%|██████████| 391/391 [00:51<00:00,  7.56it/s]

Training Loss: 0.1787





Test Accuracy: 88.62%


Epoch 20/50: 100%|██████████| 391/391 [00:51<00:00,  7.58it/s]

Training Loss: 0.1720





Test Accuracy: 89.03%


Epoch 21/50: 100%|██████████| 391/391 [00:51<00:00,  7.57it/s]

Training Loss: 0.1583





Test Accuracy: 90.10%


Epoch 22/50: 100%|██████████| 391/391 [00:51<00:00,  7.56it/s]

Training Loss: 0.1516





Test Accuracy: 89.48%


Epoch 23/50: 100%|██████████| 391/391 [00:51<00:00,  7.53it/s]

Training Loss: 0.1445





Test Accuracy: 87.90%


Epoch 24/50: 100%|██████████| 391/391 [00:51<00:00,  7.59it/s]

Training Loss: 0.1287





Test Accuracy: 88.67%


Epoch 25/50: 100%|██████████| 391/391 [00:51<00:00,  7.59it/s]

Training Loss: 0.1245





Test Accuracy: 89.19%


Epoch 26/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.1157





Test Accuracy: 89.40%


Epoch 27/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.1119





Test Accuracy: 90.61%


Epoch 28/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.1017





Test Accuracy: 89.91%


Epoch 29/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.1022





Test Accuracy: 89.44%


Epoch 30/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0887





Test Accuracy: 89.85%


Epoch 31/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0912





Test Accuracy: 90.10%


Epoch 32/50: 100%|██████████| 391/391 [00:51<00:00,  7.59it/s]

Training Loss: 0.0817





Test Accuracy: 89.88%


Epoch 33/50: 100%|██████████| 391/391 [00:51<00:00,  7.57it/s]

Training Loss: 0.0793





Test Accuracy: 90.56%


Epoch 34/50: 100%|██████████| 391/391 [00:51<00:00,  7.53it/s]

Training Loss: 0.0735





Test Accuracy: 90.02%


Epoch 35/50: 100%|██████████| 391/391 [00:51<00:00,  7.59it/s]

Training Loss: 0.0797





Test Accuracy: 90.96%


Epoch 36/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0683





Test Accuracy: 90.69%


Epoch 37/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0675





Test Accuracy: 91.44%


Epoch 38/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0630





Test Accuracy: 89.43%


Epoch 39/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0620





Test Accuracy: 91.21%


Epoch 40/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0586





Test Accuracy: 90.11%


Epoch 41/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0600





Test Accuracy: 90.45%


Epoch 42/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0523





Test Accuracy: 91.26%


Epoch 43/50: 100%|██████████| 391/391 [00:51<00:00,  7.59it/s]

Training Loss: 0.0533





Test Accuracy: 91.12%


Epoch 44/50: 100%|██████████| 391/391 [00:51<00:00,  7.55it/s]

Training Loss: 0.0443





Test Accuracy: 90.39%


Epoch 45/50: 100%|██████████| 391/391 [00:51<00:00,  7.57it/s]

Training Loss: 0.0537





Test Accuracy: 90.97%


Epoch 46/50: 100%|██████████| 391/391 [00:51<00:00,  7.59it/s]

Training Loss: 0.0449





Test Accuracy: 90.75%


Epoch 47/50: 100%|██████████| 391/391 [00:51<00:00,  7.58it/s]

Training Loss: 0.0469





Test Accuracy: 91.06%


Epoch 48/50: 100%|██████████| 391/391 [00:51<00:00,  7.59it/s]

Training Loss: 0.0453





Test Accuracy: 91.21%


Epoch 49/50: 100%|██████████| 391/391 [00:51<00:00,  7.59it/s]

Training Loss: 0.0419





Test Accuracy: 90.92%


Epoch 50/50: 100%|██████████| 391/391 [00:51<00:00,  7.57it/s]

Training Loss: 0.0445





Test Accuracy: 91.37%


Epoch 51/50: 100%|██████████| 391/391 [00:51<00:00,  7.59it/s]

Training Loss: 0.0434





Test Accuracy: 92.23%


Epoch 52/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0348





Test Accuracy: 91.14%


Epoch 53/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0441





Test Accuracy: 91.06%


Epoch 54/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0395





Test Accuracy: 91.26%


Epoch 55/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0333





Test Accuracy: 91.91%


Epoch 56/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0407





Test Accuracy: 92.02%


Epoch 57/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0357





Test Accuracy: 91.68%


Epoch 58/50: 100%|██████████| 391/391 [00:51<00:00,  7.59it/s]

Training Loss: 0.0345





Test Accuracy: 91.46%


Epoch 59/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0337





Test Accuracy: 91.76%


Epoch 60/50: 100%|██████████| 391/391 [00:51<00:00,  7.58it/s]

Training Loss: 0.0360





Test Accuracy: 91.64%


Epoch 61/50: 100%|██████████| 391/391 [00:51<00:00,  7.55it/s]

Training Loss: 0.0313





Test Accuracy: 91.63%


Epoch 62/50: 100%|██████████| 391/391 [00:51<00:00,  7.58it/s]

Training Loss: 0.0329





Test Accuracy: 90.59%


Epoch 63/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0302





Test Accuracy: 91.65%


Epoch 64/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0339





Test Accuracy: 92.36%


Epoch 65/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0278





Test Accuracy: 91.69%


Epoch 66/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0285





Test Accuracy: 91.64%


Epoch 67/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0334





Test Accuracy: 91.66%


Epoch 68/50: 100%|██████████| 391/391 [00:51<00:00,  7.59it/s]

Training Loss: 0.0289





Test Accuracy: 91.00%


Epoch 69/50: 100%|██████████| 391/391 [00:51<00:00,  7.60it/s]

Training Loss: 0.0290





Test Accuracy: 91.76%


Epoch 70/50: 100%|██████████| 391/391 [00:51<00:00,  7.58it/s]

Training Loss: 0.0245





Test Accuracy: 91.86%
FLOPs and Parameters AFTER Training:
| module                 | #parameters or shape   | #flops     |
|:-----------------------|:-----------------------|:-----------|
| model                  | 4.311M                 | 0.634G     |
|  conv                  |  0.864K                |  0.885M    |
|   conv.weight          |   (32, 3, 3, 3)        |            |
|  bn                    |  64                    |  0.164M    |
|   bn.weight            |   (32,)                |            |
|   bn.bias              |   (32,)                |            |
|  layer1                |  0.206M                |  0.212G    |
|   layer1.0             |   57.728K              |   59.703M  |
|    layer1.0.conv1      |    18.432K             |    18.874M |
|    layer1.0.bn1        |    0.128K              |    0.328M  |
|    layer1.0.conv2      |    36.864K             |    37.749M |
|    layer1.0.bn2        |    0.128K              |    0.328M  |
|    layer1.0.downsample |    2

## From kaggle

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from torchvision.transforms import RandomErasing
from tqdm import tqdm
import math
import os
import pandas as pd
from PIL import Image
import py7zr

# 1. Print all the files in the input directory to verify paths
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/cifar-10/trainLabels.csv
/kaggle/input/cifar-10/sampleSubmission.csv
/kaggle/input/cifar-10/test.7z
/kaggle/input/cifar-10/train.7z


In [9]:
import py7zr
import os

# Extract train and test images
with py7zr.SevenZipFile('/kaggle/input/cifar-10/train.7z', mode='r') as z:
    z.extractall(path='/kaggle/working/train_images')
    
with py7zr.SevenZipFile('/kaggle/input/cifar-10/test.7z', mode='r') as z:
    z.extractall(path='/kaggle/working/test_images')

In [10]:
train_labels_df = pd.read_csv('/kaggle/input/cifar-10/trainLabels.csv')

# Map image filenames with labels
train_filenames = [os.path.join('/kaggle/working/train_images/train', f"{i}.png") for i in train_labels_df['id']]
train_labels = train_labels_df['label'].values

# CIFAR-10 classes (convert labels from string to index)
classes = ('airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
label_to_index = {label: idx for idx, label in enumerate(classes)}
train_labels = [label_to_index[label] for label in train_labels]

# Data Augmentation with CutOut (RandomErasing) for training data
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    RandomErasing(scale=(0.02, 0.33))
])

# Define a simpler transform for test data
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

In [11]:
# Create a custom dataset class for the training data
class CIFAR10CustomDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path)
        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, label

# Create the dataset and dataloader for training data
train_dataset = CIFAR10CustomDataset(train_filenames, train_labels, transform=train_transform)
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True, num_workers=2)

In [12]:
# Define the Residual Block and ResNet Model (same as before)
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out

class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=10):
        super(ResNet, self).__init__()
        self.in_channels = 64
        self.conv = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self.make_layer(block, 64, layers[0])
        self.layer2 = self.make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self.make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self.make_layer(block, 512, layers[3], stride=2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def make_layer(self, block, out_channels, blocks, stride=1):
        downsample = None
        if stride != 1 or self.in_channels != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )
        layers = []
        layers.append(block(self.in_channels, out_channels, stride, downsample))
        self.in_channels = out_channels
        for _ in range(1, blocks):
            layers.append(block(out_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avg_pool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

# Initialize model, loss function, optimizer, and learning rate scheduler
model = ResNet(ResidualBlock, [2, 2, 2, 2]).cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)

In [None]:
# Cosine Annealing with Warmup
def warmup_cosine_lr_scheduler(optimizer, warmup_iters, max_iters):
    def lr_lambda(current_iter):
        if current_iter < warmup_iters:
            return float(current_iter) / float(warmup_iters)  # Warmup phase
        else:
            # Cosine annealing phase
            return 0.5 * (1 + math.cos(float(current_iter - warmup_iters) / float(max_iters - warmup_iters) * math.pi))
    return LambdaLR(optimizer, lr_lambda)

# Training and Testing Functions (same as before)
def train(model, train_loader, criterion, optimizer, scheduler):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for images, labels in tqdm(train_loader):
        images, labels = images.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
    scheduler.step()
    print(f"Train Loss: {running_loss/len(train_loader)}, Train Accuracy: {100 * correct/total:.2f}%")

def test(model, test_loader, criterion):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in tqdm(test_loader):
            images, labels = images.cuda(), labels.cuda()
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
    print(f"Test Loss: {running_loss/len(test_loader)}, Test Accuracy: {100 * correct/total:.2f}%")
    return 100 * correct/total

# Main training loop
max_iters = 45
warmup_iters = 20  # Warmup phase epochs
scheduler = warmup_cosine_lr_scheduler(optimizer, warmup_iters, max_iters)

best_acc = 0

for epoch in range(max_iters):
    print(f"Epoch {epoch+1}/{max_iters}")
    train(model, train_loader, criterion, optimizer, scheduler)
    acc = test(model, train_loader, criterion)
    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), "best_model_high_Test.pth")

Epoch 1/45


100%|██████████| 391/391 [00:47<00:00,  8.22it/s]


Train Loss: 2.4084013099865533, Train Accuracy: 9.68%


100%|██████████| 391/391 [00:21<00:00, 17.94it/s]


Test Loss: 2.4105548876935563, Test Accuracy: 9.68%
Epoch 2/45


100%|██████████| 391/391 [00:50<00:00,  7.79it/s]


Train Loss: 1.5593097719085185, Train Accuracy: 42.52%


100%|██████████| 391/391 [00:21<00:00, 18.39it/s]


Test Loss: 1.374968390025751, Test Accuracy: 49.99%
Epoch 3/45


100%|██████████| 391/391 [00:50<00:00,  7.72it/s]


Train Loss: 1.216722205624251, Train Accuracy: 56.04%


100%|██████████| 391/391 [00:21<00:00, 18.26it/s]


Test Loss: 1.2339170361723741, Test Accuracy: 56.34%
Epoch 4/45


100%|██████████| 391/391 [00:50<00:00,  7.73it/s]


Train Loss: 1.0054678285823149, Train Accuracy: 64.26%


100%|██████████| 391/391 [00:21<00:00, 18.50it/s]


Test Loss: 1.0142482837752613, Test Accuracy: 64.77%
Epoch 5/45


100%|██████████| 391/391 [00:50<00:00,  7.77it/s]


Train Loss: 0.861000851139693, Train Accuracy: 69.84%


100%|██████████| 391/391 [00:21<00:00, 18.20it/s]


Test Loss: 0.9403767094892614, Test Accuracy: 67.32%
Epoch 6/45


100%|██████████| 391/391 [00:50<00:00,  7.75it/s]


Train Loss: 0.7692277115171827, Train Accuracy: 72.97%


100%|██████████| 391/391 [00:21<00:00, 18.33it/s]


Test Loss: 0.9129568640228427, Test Accuracy: 68.35%
Epoch 7/45


100%|██████████| 391/391 [00:50<00:00,  7.73it/s]


Train Loss: 0.7033825817010592, Train Accuracy: 75.32%


100%|██████████| 391/391 [00:21<00:00, 18.14it/s]


Test Loss: 0.9822658943703108, Test Accuracy: 68.30%
Epoch 8/45


100%|██████████| 391/391 [00:50<00:00,  7.72it/s]


Train Loss: 0.6431815251517479, Train Accuracy: 77.63%


100%|██████████| 391/391 [00:20<00:00, 18.62it/s]


Test Loss: 0.6398160599381723, Test Accuracy: 77.65%
Epoch 9/45


100%|██████████| 391/391 [00:50<00:00,  7.72it/s]


Train Loss: 0.6104684033052391, Train Accuracy: 78.70%


100%|██████████| 391/391 [00:21<00:00, 17.92it/s]


Test Loss: 0.6827639638615386, Test Accuracy: 76.61%
Epoch 10/45


100%|██████████| 391/391 [00:50<00:00,  7.75it/s]


Train Loss: 0.5687352752746524, Train Accuracy: 80.34%


100%|██████████| 391/391 [00:21<00:00, 18.08it/s]


Test Loss: 0.8180861477656742, Test Accuracy: 73.04%
Epoch 11/45


100%|██████████| 391/391 [00:50<00:00,  7.73it/s]


Train Loss: 0.5424037920239636, Train Accuracy: 81.15%


100%|██████████| 391/391 [00:21<00:00, 18.10it/s]


Test Loss: 0.6769786496144121, Test Accuracy: 76.56%
Epoch 12/45


100%|██████████| 391/391 [00:50<00:00,  7.75it/s]


Train Loss: 0.5136195024870851, Train Accuracy: 82.32%


100%|██████████| 391/391 [00:20<00:00, 18.85it/s]


Test Loss: 0.5417249106690097, Test Accuracy: 80.76%
Epoch 13/45


100%|██████████| 391/391 [00:50<00:00,  7.74it/s]


Train Loss: 0.49686034500141585, Train Accuracy: 82.86%


100%|██████████| 391/391 [00:21<00:00, 18.31it/s]


Test Loss: 0.5893441117023264, Test Accuracy: 79.73%
Epoch 14/45


100%|██████████| 391/391 [00:50<00:00,  7.73it/s]


Train Loss: 0.46676159167990966, Train Accuracy: 83.69%


100%|██████████| 391/391 [00:21<00:00, 18.03it/s]


Test Loss: 0.7107227078026823, Test Accuracy: 76.98%
Epoch 15/45


100%|██████████| 391/391 [00:50<00:00,  7.75it/s]


Train Loss: 0.45325108752836046, Train Accuracy: 84.23%


100%|██████████| 391/391 [00:20<00:00, 18.66it/s]


Test Loss: 0.5209427507941985, Test Accuracy: 81.80%
Epoch 16/45


100%|██████████| 391/391 [00:50<00:00,  7.73it/s]


Train Loss: 0.43237027724075805, Train Accuracy: 84.97%


100%|██████████| 391/391 [00:20<00:00, 18.79it/s]


Test Loss: 0.49581391480572695, Test Accuracy: 82.66%
Epoch 17/45


100%|██████████| 391/391 [00:50<00:00,  7.75it/s]


Train Loss: 0.4268256726548495, Train Accuracy: 85.24%


100%|██████████| 391/391 [00:21<00:00, 18.57it/s]


Test Loss: 0.40680152017747045, Test Accuracy: 85.89%
Epoch 18/45


100%|██████████| 391/391 [00:50<00:00,  7.74it/s]


Train Loss: 0.406308168340522, Train Accuracy: 85.89%


100%|██████████| 391/391 [00:21<00:00, 17.95it/s]


Test Loss: 0.4424433704379879, Test Accuracy: 85.01%
Epoch 19/45


100%|██████████| 391/391 [00:50<00:00,  7.72it/s]


Train Loss: 0.3922664330667242, Train Accuracy: 86.24%


100%|██████████| 391/391 [00:21<00:00, 18.33it/s]


Test Loss: 0.4375018700766746, Test Accuracy: 84.80%
Epoch 20/45


100%|██████████| 391/391 [00:50<00:00,  7.74it/s]


Train Loss: 0.3759881593001163, Train Accuracy: 86.96%


100%|██████████| 391/391 [00:20<00:00, 18.68it/s]


Test Loss: 0.39439525236101713, Test Accuracy: 86.35%
Epoch 21/45


100%|██████████| 391/391 [00:50<00:00,  7.74it/s]


Train Loss: 0.37105606321026297, Train Accuracy: 87.12%


100%|██████████| 391/391 [00:20<00:00, 18.76it/s]


Test Loss: 0.36511855479091637, Test Accuracy: 87.14%
Epoch 22/45


100%|██████████| 391/391 [00:50<00:00,  7.78it/s]


Train Loss: 0.35070710749272493, Train Accuracy: 87.80%


100%|██████████| 391/391 [00:21<00:00, 18.61it/s]


Test Loss: 0.3692601552171171, Test Accuracy: 87.44%
Epoch 23/45


100%|██████████| 391/391 [00:50<00:00,  7.71it/s]


Train Loss: 0.32126259590353806, Train Accuracy: 88.65%


100%|██████████| 391/391 [00:20<00:00, 18.77it/s]


Test Loss: 0.3332406451635044, Test Accuracy: 88.43%
Epoch 24/45


100%|██████████| 391/391 [00:50<00:00,  7.76it/s]


Train Loss: 0.3104703553649775, Train Accuracy: 89.15%


100%|██████████| 391/391 [00:20<00:00, 19.03it/s]


Test Loss: 0.29164112730861624, Test Accuracy: 89.75%
Epoch 25/45


100%|██████████| 391/391 [00:50<00:00,  7.72it/s]


Train Loss: 0.29432198245202185, Train Accuracy: 89.64%


100%|██████████| 391/391 [00:20<00:00, 18.86it/s]


Test Loss: 0.31807556782689544, Test Accuracy: 88.71%
Epoch 26/45


100%|██████████| 391/391 [00:50<00:00,  7.74it/s]


Train Loss: 0.28125737516014165, Train Accuracy: 90.05%


100%|██████████| 391/391 [00:21<00:00, 18.44it/s]


Test Loss: 0.34943176375325685, Test Accuracy: 88.09%
Epoch 27/45


100%|██████████| 391/391 [00:50<00:00,  7.74it/s]


Train Loss: 0.26507108778599886, Train Accuracy: 90.61%


100%|██████████| 391/391 [00:21<00:00, 18.38it/s]


Test Loss: 0.3283120730077214, Test Accuracy: 88.69%
Epoch 28/45


100%|██████████| 391/391 [00:50<00:00,  7.75it/s]


Train Loss: 0.24751411145910276, Train Accuracy: 91.28%


100%|██████████| 391/391 [00:20<00:00, 18.76it/s]


Test Loss: 0.2373088863690186, Test Accuracy: 91.66%
Epoch 29/45


 45%|████▍     | 175/391 [00:22<00:27,  7.77it/s]