In [7]:
import torch
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import scipy.io as sio
from torch.utils.data import TensorDataset
import pickle

In [2]:
def set_device():
    """
    Select the PyTorch compute device, preferring CUDA, then MPS, then CPU.

    Returns:
        device (torch.device): The selected device for computation.
        config (dict): Details about the chosen backend. Always contains
            "backend", "device_name" and "torch_version"; the CUDA entry
            additionally carries "num_devices" and "version" (the value of
            torch.version.cuda).
    """
    # torch.backends.mps is only present in PyTorch builds that ship the MPS
    # backend; look it up defensively so other builds fall through to the CPU
    # branch instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)

    if torch.cuda.is_available():
        device = torch.device("cuda")
        config = {
            "backend": "CUDA",
            "device_name": torch.cuda.get_device_name(device),
            "num_devices": torch.cuda.device_count(),
            "version": torch.version.cuda,
            # Reported by every branch so callers can rely on the key.
            "torch_version": torch.__version__,
        }
    elif mps_backend is not None and mps_backend.is_available():
        device = torch.device("mps")
        config = {
            "backend": "MPS (Metal Performance Shaders)",
            "device_name": "Apple GPU",
            "torch_version": torch.__version__,
        }
    else:
        device = torch.device("cpu")
        config = {
            "backend": "CPU",
            "device_name": "Generic CPU",
            "torch_version": torch.__version__,
        }

    return device, config

In [3]:
# Pick the compute device once for the notebook; `config` holds the backend
# details returned alongside it by set_device().
device, config = set_device()

In [4]:
# Read the draft CUB dataset bundle (a MATLAB .mat file) into a dict of arrays.
dataset_mat_path = "/Users/lkk/Documents/BOUN CMPE/CMPE 537-Computer Vision/Term Project/Code/Datasets/CUB_Paper/produced/dataset_draft.mat"
dataset_info = sio.loadmat(dataset_mat_path)

In [5]:
# List the variables stored in the loaded .mat file.
dataset_info.keys()

dict_keys(['__header__', '__version__', '__globals__', 'image_files', 'features', 'labels', 'imageNumbers', 'labelNames', 'image_tensors_256', 'image_tensors_128', 'image_tensors_64', 'sentence_descriptions'])

In [6]:
# Pull out the 256x256 image array from the loaded dataset dict.
image_tensors_256 = dataset_info['image_tensors_256']

In [9]:
# Inspect the array's metadata (shape, strides, memory layout, dtype).
np.info(image_tensors_256)

class:  ndarray
shape:  (11788, 3, 256, 256)
strides:  (1, 11788, 35364, 9053184)
itemsize:  1
aligned:  True
contiguous:  False
fortran:  True
data pointer: 0x310000020
byteorder:  little
byteswap:  False
type: uint8


In [11]:
# Persist the 256x256 image array as a standalone pickle file.
image_pickle_path = "/Users/lkk/Documents/BOUN CMPE/CMPE 537-Computer Vision/Term Project/Code/Datasets/CUB_Paper/produced/image_tensors_256.pkl"
with open(image_pickle_path, mode='wb') as pickle_file:
    pickle.dump(image_tensors_256, pickle_file)

In [69]:
# Entries of the loaded dataset that every exported feature file shares.
base_keys = [
    "__header__",
    "__version__",
    "__globals__",
    "image_files",
    "features",
    "labels",
]
# Shallow subset of dataset_info restricted to the shared entries.
dataset_base = dict((name, dataset_info[name]) for name in base_keys)

In [23]:
# Copy the 256x256 image array into a torch tensor and report its
# shape, dtype and device (one per line).
image_tensors = torch.tensor(dataset_info['image_tensors_256'])
print(image_tensors.shape, image_tensors.dtype, image_tensors.device, sep="\n")

torch.Size([11788, 3, 256, 256])
torch.uint8
cpu


### CONVNEXT

In [24]:
from torchvision.models import ConvNeXt, ConvNeXt_Tiny_Weights

In [25]:
# Preprocessing pipeline bundled with the default ConvNeXt-Tiny weights.
convnext_weights = ConvNeXt_Tiny_Weights.DEFAULT
convnext_transform = convnext_weights.transforms()

In [26]:
# Apply the ConvNeXt preprocessing to every image, one image at a time,
# collecting the results in a list.
convnext_images = [convnext_transform(raw_image) for raw_image in image_tensors]

In [27]:
# Sanity check: number of preprocessed images, then shape / dtype / device
# of the first one (one value per line).
first_image = convnext_images[0]
print(
    len(convnext_images),
    first_image.shape,
    first_image.dtype,
    first_image.device,
    sep="\n",
)

11788
torch.Size([3, 224, 224])
torch.float32
cpu


In [28]:
# Collapse the per-image list into one batch tensor along a new leading axis.
# The name is rebound from a list to a tensor, so only stack while it still
# holds the list: re-running this cell would otherwise hand torch.stack a
# tensor instead of a sequence of tensors and fail.
if isinstance(convnext_images, (list, tuple)):
    convnext_images = torch.stack(convnext_images, dim=0)
print(convnext_images.shape)
print(convnext_images.dtype)
print(convnext_images.device)

torch.Size([11788, 3, 224, 224])
torch.float32
cpu


In [29]:
# Display the first preprocessed image as a spot check.
convnext_images[0]

tensor([[[0.8618, 0.8618, 0.8618,  ..., 0.8961, 0.9132, 0.9132],
         [0.8961, 0.8961, 0.8789,  ..., 0.9132, 0.9303, 0.9303],
         [0.8789, 0.8618, 0.8618,  ..., 0.9303, 0.9474, 0.9646],
         ...,
         [0.7419, 0.7419, 0.7591,  ..., 0.8276, 0.8789, 0.8961],
         [0.7933, 0.7933, 0.8104,  ..., 0.9132, 0.8961, 0.8618],
         [0.8447, 0.8447, 0.8447,  ..., 0.8789, 0.8104, 0.7419]],

        [[1.0805, 1.0805, 1.0805,  ..., 1.1506, 1.1681, 1.1681],
         [1.1155, 1.1155, 1.0980,  ..., 1.1681, 1.1856, 1.1856],
         [1.0980, 1.0805, 1.0805,  ..., 1.1856, 1.2031, 1.2206],
         ...,
         [0.9580, 0.9580, 0.9755,  ..., 1.0805, 1.1155, 1.1506],
         [1.0105, 1.0105, 1.0280,  ..., 1.1681, 1.1506, 1.1155],
         [1.0630, 1.0630, 1.0630,  ..., 1.1506, 1.0805, 1.0105]],

        [[1.2805, 1.2805, 1.2805,  ..., 1.3328, 1.3502, 1.3502],
         [1.3154, 1.3154, 1.2980,  ..., 1.3502, 1.3677, 1.3677],
         [1.2980, 1.2805, 1.2805,  ..., 1.3677, 1.3851, 1.

In [30]:
# Wrap the preprocessed images so they can be fed to the model in fixed-size,
# order-preserving batches.
convnext_dataset = TensorDataset(convnext_images)
convnext_dataloader = DataLoader(
    dataset=convnext_dataset,
    shuffle=False,
    batch_size=32,
)

In [40]:
# ConvNeXt-Tiny initialised with its default pretrained weights.
convnext_model = models.convnext_tiny(weights=ConvNeXt_Tiny_Weights.DEFAULT)

In [42]:
# Build the feature extractor from the model's top-level children: keep every
# child except the last (the classifier block), then append only the first
# layer of that classifier block (its LayerNorm2d).
model_children = list(convnext_model.children())
classifier_block = model_children[-1]
convnext_feature_extractor = torch.nn.Sequential(
    *model_children[:-1],
    classifier_block[0],
)

In [43]:
# Switch the extractor to evaluation mode. As the cell's last expression, the
# returned module is also displayed.
convnext_feature_extractor.eval()

Sequential(
  (0): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      (1): LayerNorm2d((96,), eps=1e-06, elementwise_affine=True)
    )
    (1): Sequential(
      (0): CNBlock(
        (block): Sequential(
          (0): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
          (1): Permute()
          (2): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
          (3): Linear(in_features=96, out_features=384, bias=True)
          (4): GELU(approximate='none')
          (5): Linear(in_features=384, out_features=96, bias=True)
          (6): Permute()
        )
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
      )
      (1): CNBlock(
        (block): Sequential(
          (0): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
          (1): Permute()
          (2): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
          (3): Linear(in_features=96, o

In [44]:
# Run every image batch through the extractor with gradient tracking disabled,
# keeping one output tensor per batch.
convnext_features = []
with torch.no_grad():
    # Each batch is a one-element sequence holding the image tensor.
    for (image_batch,) in convnext_dataloader:
        convnext_features.append(convnext_feature_extractor(image_batch))

In [55]:
# Join the per-batch outputs along the batch axis. The name is rebound from a
# list to a tensor, so only concatenate while it still holds the list; this
# keeps the cell safe to re-run.
if isinstance(convnext_features, (list, tuple)):
    convnext_features = torch.cat(convnext_features, dim=0)

In [58]:
# Collapse everything after the batch axis into one feature axis. A
# dimension-less squeeze would also drop the batch axis when only one sample
# is present; flatten(start_dim=1) always keeps it and is a no-op on an
# already 2-D tensor, so the cell is safe to re-run.
convnext_features = convnext_features.flatten(start_dim=1)

In [61]:
# Inspect the feature matrix shipped with the dataset (type, shape, dtype),
# one value per line.
reference_features = dataset_info["features"]
print(
    type(reference_features),
    reference_features.shape,
    reference_features.dtype,
    sep="\n",
)

<class 'numpy.ndarray'>
(2048, 11788)
float64


In [57]:
# Shape of the extracted features once all size-1 axes are removed.
squeezed_features = convnext_features.squeeze()
print(squeezed_features.shape)

torch.Size([11788, 768])


In [65]:
# Convert the extracted feature tensor to a NumPy array for export.
features = convnext_features.numpy()

In [68]:
# Inspect the exported array's metadata (shape, strides, memory layout, dtype).
np.info(features)

class:  ndarray
shape:  (768, 11788)
strides:  (8, 6144)
itemsize:  8
aligned:  True
contiguous:  False
fortran:  True
data pointer: 0x580828000
byteorder:  little
byteswap:  False
type: float64


In [67]:
# Export layout: float64 with the two axes swapped relative to the extracted
# tensor. Derived directly from convnext_features rather than from `features`
# itself, so re-running this cell cannot flip the orientation back.
features = convnext_features.numpy().astype(np.float64).transpose()

In [71]:
# Shallow copy of the shared dataset entries with "features" replaced by the
# ConvNeXt features.
convnext_dataset = {**dataset_base, "features": features}

In [73]:
# Confirm which entries will be written to the export file.
convnext_dataset.keys()

dict_keys(['__header__', '__version__', '__globals__', 'image_files', 'features', 'labels'])

In [74]:
# Write the ConvNeXt feature dataset out as a MATLAB .mat file.
convnext_mat_path = "/Users/lkk/Documents/BOUN CMPE/CMPE 537-Computer Vision/Term Project/Code/Datasets/CUB_Paper/convnext.mat"
sio.savemat(convnext_mat_path, convnext_dataset)

In [77]:
# Display the feature vector of the first image as a spot check.
convnext_features[0]

tensor([ 4.5726e-01,  1.4883e-01, -9.6176e-02, -6.8534e-01, -6.6053e-02,
        -4.7127e-01, -3.4878e-01,  5.2725e-02,  7.1465e-05,  2.9067e-01,
         1.1392e-01,  3.5608e-01, -3.4380e-01, -8.0767e-02,  4.4609e-01,
         4.4047e-01,  3.7574e-01, -2.0576e-01, -2.5488e-01,  6.8414e-01,
         7.9678e-02, -2.3107e-01,  3.5366e-01,  7.9906e-01, -1.1060e+00,
         3.9207e-02, -1.0121e-01,  1.9065e-01,  5.1355e-01,  3.7845e-01,
         2.2630e-01,  4.4053e-01,  1.1746e-01,  7.8322e-01,  4.1385e-01,
         4.3613e-01,  2.5520e-01, -2.1951e-01, -8.9570e-02, -1.5697e-01,
         3.3415e-01, -4.1315e-01, -4.0601e-02,  6.3488e-01,  3.5650e-01,
         3.6608e-01, -3.3946e-01, -5.0042e-02, -1.5461e-01, -3.8677e-01,
        -8.0369e-01, -1.1937e-02, -6.4231e-01, -1.1751e-01, -2.4048e-01,
        -1.2101e-01, -2.2544e-01, -7.0509e-03, -4.4976e-01, -4.9131e-01,
        -5.1578e-01, -2.4659e-01,  1.9713e-02, -3.3780e-01, -2.7405e-02,
        -3.4229e-01, -7.1477e-01,  9.9296e-01, -6.2

In [78]:
# Independent copy of the extracted features. Wrapping an existing tensor in
# torch.tensor(...) emits PyTorch's copy-construct UserWarning; detach().clone()
# produces the same detached copy without the warning.
conv_copy = convnext_features.detach().clone()

  conv_copy = torch.tensor(convnext_features)


In [80]:
# Summarise the copied features: shape, dtype, device and the first row,
# one value per line.
print(conv_copy.shape, conv_copy.dtype, conv_copy.device, conv_copy[0], sep="\n")

torch.Size([11788, 768])
torch.float32
cpu


In [84]:
# Move the copied features onto the device chosen by set_device().
conv_copy = conv_copy.to(device)

In [85]:
# Same summary after the transfer: shape, dtype, device and the first row,
# one value per line.
print(conv_copy.shape, conv_copy.dtype, conv_copy.device, conv_copy[0], sep="\n")

torch.Size([11788, 768])
torch.float32
mps:0
tensor([ 4.5726e-01,  1.4883e-01, -9.6176e-02, -6.8534e-01, -6.6053e-02,
        -4.7127e-01, -3.4878e-01,  5.2725e-02,  7.1465e-05,  2.9067e-01,
         1.1392e-01,  3.5608e-01, -3.4380e-01, -8.0767e-02,  4.4609e-01,
         4.4047e-01,  3.7574e-01, -2.0576e-01, -2.5488e-01,  6.8414e-01,
         7.9678e-02, -2.3107e-01,  3.5366e-01,  7.9906e-01, -1.1060e+00,
         3.9207e-02, -1.0121e-01,  1.9065e-01,  5.1355e-01,  3.7845e-01,
         2.2630e-01,  4.4053e-01,  1.1746e-01,  7.8322e-01,  4.1385e-01,
         4.3613e-01,  2.5520e-01, -2.1951e-01, -8.9570e-02, -1.5697e-01,
         3.3415e-01, -4.1315e-01, -4.0601e-02,  6.3488e-01,  3.5650e-01,
         3.6608e-01, -3.3946e-01, -5.0042e-02, -1.5461e-01, -3.8677e-01,
        -8.0369e-01, -1.1937e-02, -6.4231e-01, -1.1751e-01, -2.4048e-01,
        -1.2101e-01, -2.2544e-01, -7.0509e-03, -4.4976e-01, -4.9131e-01,
        -5.1578e-01, -2.4659e-01,  1.9713e-02, -3.3780e-01, -2.7405e-02,
      

In [86]:
def tensor_info(tensor):
    """
    Print a quick summary of a tensor: shape, dtype, device and a sample.

    Args:
        tensor (torch.Tensor): Tensor to summarise.

    The sample is ``tensor[0]``, the first entry along the leading axis.
    A zero-dimensional tensor cannot be indexed, so the scalar itself is
    printed; a tensor whose leading axis is empty prints a placeholder
    instead of raising IndexError.
    """
    print(tensor.shape)
    print(tensor.dtype)
    print(tensor.device)
    if tensor.dim() == 0:
        # Scalars have no first element to index.
        print(tensor)
    elif tensor.shape[0] == 0:
        print("<empty tensor>")
    else:
        print(tensor[0])

In [89]:
# Summarise the stacked, preprocessed image tensor.
tensor_info(convnext_images)

torch.Size([11788, 3, 224, 224])
torch.float32
cpu
tensor([[[0.8618, 0.8618, 0.8618,  ..., 0.8961, 0.9132, 0.9132],
         [0.8961, 0.8961, 0.8789,  ..., 0.9132, 0.9303, 0.9303],
         [0.8789, 0.8618, 0.8618,  ..., 0.9303, 0.9474, 0.9646],
         ...,
         [0.7419, 0.7419, 0.7591,  ..., 0.8276, 0.8789, 0.8961],
         [0.7933, 0.7933, 0.8104,  ..., 0.9132, 0.8961, 0.8618],
         [0.8447, 0.8447, 0.8447,  ..., 0.8789, 0.8104, 0.7419]],

        [[1.0805, 1.0805, 1.0805,  ..., 1.1506, 1.1681, 1.1681],
         [1.1155, 1.1155, 1.0980,  ..., 1.1681, 1.1856, 1.1856],
         [1.0980, 1.0805, 1.0805,  ..., 1.1856, 1.2031, 1.2206],
         ...,
         [0.9580, 0.9580, 0.9755,  ..., 1.0805, 1.1155, 1.1506],
         [1.0105, 1.0105, 1.0280,  ..., 1.1681, 1.1506, 1.1155],
         [1.0630, 1.0630, 1.0630,  ..., 1.1506, 1.0805, 1.0105]],

        [[1.2805, 1.2805, 1.2805,  ..., 1.3328, 1.3502, 1.3502],
         [1.3154, 1.3154, 1.2980,  ..., 1.3502, 1.3677, 1.3677],
       

In [92]:
# Move the preprocessed images to the selected device. Tensor.to is the
# idiomatic transfer; wrapping an existing tensor in torch.tensor(...) emits
# PyTorch's copy-construct UserWarning.
# NOTE(review): the recorded run of this cell failed with an MPS out-of-memory
# error while allocating 6.61 GB for the full image stack. If it does not fit,
# keep the images on the CPU and move one DataLoader batch at a time instead.
convnext_images_gpu = convnext_images.to(device)

  convnext_images_gpu = torch.tensor(convnext_images, device=device)


RuntimeError: MPS backend out of memory (MPS allocated: 14.23 GB, other allocations: 720.00 KB, max allowed: 18.13 GB). Tried to allocate 6.61 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).