In [None]:
import torch
import torchvision
from torchvision import transforms, models
from torch.utils.data import DataLoader, Subset
import numpy as np
from sklearn.decomposition import PCA
import random
from torch import device
from torchvision.datasets import CIFAR10

In [14]:
# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the ImageNet-pretrained ResNet-18 model.
# `pretrained=True` is deprecated since torchvision 0.13 in favour of `weights=`;
# IMAGENET1K_V1 is the checkpoint the legacy flag loaded, so the extracted
# features are unchanged. Older torchvision releases that lack the weights enum
# fall back to the legacy flag.
if hasattr(models, "ResNet18_Weights"):
    resnet18 = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
else:
    resnet18 = models.resnet18(pretrained=True)

# Remove the last fully connected layer to use it as a feature extractor
resnet18 = torch.nn.Sequential(*list(resnet18.children())[:-1])

# Move the model to the correct device and set it to evaluation mode
# (evaluation mode makes BatchNorm use its stored running statistics)
resnet18 = resnet18.to(device)
resnet18.eval()



Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Con

In [None]:
# Preprocessing applied to every image before it reaches ResNet-18:
# 1. Resize the images to 224x224.
# 2. Convert them to tensors.
# 3. Normalize them using the mean and std of the ImageNet dataset.

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load the CIFAR-10 dataset after transformation.
# download=True fetches the archive only when it is missing from ./data, so the
# notebook also runs from a clean checkout; an existing copy is reused as-is.
train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, transform=transform, download=True)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, transform=transform, download=True)

In [None]:
# Function to get a subset of dataset with a specified number of images per class
def get_subset_by_class(dataset, num_images_per_class, num_classes=10):
    """Return a class-balanced ``Subset`` holding the first N samples of each class.

    Args:
        dataset: Indexable dataset yielding ``(sample, label)`` pairs whose labels
            are integers in ``range(num_classes)``.
        num_images_per_class: Maximum number of samples to keep per class.
        num_classes: Number of distinct labels (defaults to 10, as for CIFAR-10).

    Returns:
        A ``Subset`` of ``dataset`` whose indices are grouped by class (class 0
        first) and ascending within each class. A class with fewer than
        ``num_images_per_class`` samples contributes every sample it has.

    Raises:
        IndexError: If a label is ``>= num_classes``.
    """
    # Prefer the dataset's own label list when it exists: iterating the dataset
    # loads and transforms every image just to read its label. The shortcut is
    # skipped when a target_transform is set, because then the stored targets
    # may differ from the labels that __getitem__ returns.
    labels = None
    if getattr(dataset, "target_transform", None) is None:
        labels = getattr(dataset, "targets", None)
    if labels is None:
        labels = (label for _, label in dataset)

    class_indices = [[] for _ in range(num_classes)]

    # With a non-positive quota every class counts as already full, so nothing
    # is selected and the scan is skipped entirely.
    full_classes = num_classes if num_images_per_class <= 0 else 0
    if full_classes < num_classes:
        for idx, label in enumerate(labels):
            bucket = class_indices[int(label)]
            if len(bucket) < num_images_per_class:
                bucket.append(idx)
                if len(bucket) == num_images_per_class:
                    full_classes += 1
                    if full_classes == num_classes:
                        break  # every class has reached its quota

    # Flatten the list of lists and return as a Subset
    subset_indices = [idx for class_list in class_indices for idx in class_list]
    return Subset(dataset, subset_indices)

# Build class-balanced subsets: 500 images per class for training,
# 100 images per class for testing
train_subset = get_subset_by_class(
    dataset=train_dataset,
    num_images_per_class=500,
)
test_subset = get_subset_by_class(
    dataset=test_dataset,
    num_images_per_class=100,
)

# Report how many samples each subset ended up with
print(f"Total training subset size: {len(train_subset)}")
print(f"Total testing subset size: {len(test_subset)}")


Total training subset size: 5000
Total testing subset size: 1000


In [None]:
# Function to extract features from a DataLoader using ResNet-18
def extract_features(data_loader, model=None):
    """Run a feature extractor over ``data_loader`` and collect flattened outputs.

    Args:
        data_loader: Iterable of ``(images, labels)`` tensor batches.
        model: Module used to embed each batch. Defaults to the notebook-level
            ``resnet18`` feature extractor.

    Returns:
        ``(features, labels)`` as NumPy arrays. ``features`` has one row per
        sample with all non-batch dimensions flattened; ``labels`` is the
        concatenation of the label batches in iteration order.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    if model is None:
        model = resnet18

    # Send batches to whatever device the model's weights live on, so the function
    # does not depend on a global `device`. A module without parameters leaves the
    # batches where they are.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    feature_batches = []
    label_batches = []

    # Force evaluation mode for the pass (BatchNorm/Dropout behave differently
    # while training) and restore the caller's mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for images, label in data_loader:
                if target_device is not None:
                    images = images.to(target_device)
                output = model(images)
                # Collapse everything except the batch dimension into one axis.
                output = output.reshape(output.size(0), -1)
                feature_batches.append(output.cpu().numpy())
                label_batches.append(label.cpu().numpy())
    finally:
        model.train(was_training)

    if not feature_batches:
        raise ValueError("data_loader yielded no batches; nothing to extract")

    # Concatenate all per-batch arrays into single numpy arrays
    features = np.concatenate(feature_batches)
    labels = np.concatenate(label_batches)
    return features, labels

# Wrap both subsets in DataLoaders; shuffling is off so batches come in a fixed order
train_loader = DataLoader(dataset=train_subset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=test_subset, batch_size=32, shuffle=False)

# Run the feature extractor over the training subset, then the test subset
train_features, train_labels = extract_features(data_loader=train_loader)
test_features, test_labels = extract_features(data_loader=test_loader)

# Sanity-check the dimensions (expected: [5000, 512] for train, [1000, 512] for test)
print(f"Training features shape: {train_features.shape}")
print(f"Test features shape: {test_features.shape}")


Training features shape: (5000, 512)
Test features shape: (1000, 512)


In [None]:
# Apply PCA to reduce dimensions from 512 to 50.
# random_state pins the randomized SVD solver that scikit-learn may select for
# inputs of this size, so the saved features are identical across re-runs.
pca = PCA(n_components=50, random_state=42)

# Fit PCA on the training features only, then apply the same projection to the
# test set so no test-set statistics leak into the fitted components
train_features_pca = pca.fit_transform(train_features)
test_features_pca = pca.transform(test_features)

# Confirm the new shapes of the feature vectors
print(f"Training features after PCA shape: {train_features_pca.shape}")
print(f"Test features after PCA shape: {test_features_pca.shape}")


Training features after PCA shape: (5000, 50)
Test features after PCA shape: (1000, 50)


In [None]:


# Summarize the final feature matrices. The notebook's recorded output for this
# cell contains these three lines, but the statements printing them were missing
# from the code; they are restored so a re-run reproduces the stored output.
print("Final reduced feature vectors ready for classification models.")
print(f"Training set: {train_features_pca.shape}")
print(f"Test set: {test_features_pca.shape}")

# Saving the processed feature vectors and labels for use with classifiers
# (written as .npy files to the current working directory)
np.save('train_features_pca.npy', train_features_pca)
np.save('test_features_pca.npy', test_features_pca)
np.save('train_labels.npy', train_labels)
np.save('test_labels.npy', test_labels)

print("Feature vectors and labels saved successfully.")


Final reduced feature vectors ready for classification models.
Training set: (5000, 50)
Test set: (1000, 50)
Feature vectors and labels saved successfully.
