In [1]:
from transformers import AutoImageProcessor, SwinForImageClassification
import torch
from PIL import Image
import pandas as pd
from sklearn.model_selection import train_test_split
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Read the test manifest, then attach the on-disk image path for every id.
test_df = pd.read_csv('../test.csv')
test_df['filename'] = test_df['id'].map(lambda image_id: f"../test/{image_id}.jpg")

class TestDataset(Dataset):
    """Dataset that yields one preprocessed image tensor per DataFrame row.

    The image path is read from the row's ``filename`` column. Each item is
    returned as a float32 tensor of shape (3, 224, 224) with pixel values
    scaled from [0, 255] to [-1, 1]. No label is returned.

    Args:
        dataframe: table with a ``filename`` column of image paths.
        is_train: stored on the instance; nothing in this class reads it.
    """

    def __init__(self, dataframe, is_train=True):
        self.dataframe, self.is_train = dataframe, is_train

    def __len__(self):
        """Number of rows (and therefore images) in the dataset."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load, resize and normalize the image at positional index ``idx``."""
        row = self.dataframe.iloc[idx]

        # Force three channels, then bring the image to 224x224.
        rgb = Image.open(row['filename']).convert('RGB').resize((224, 224))

        # 8-bit values 0..255 are mapped onto -1..1.
        pixels = np.array(rgb, dtype=np.float32)
        pixels = (pixels - 127.5) / 127.5

        # PIL yields height x width x channel; PyTorch wants channel first.
        chw = torch.from_numpy(pixels).permute(2, 0, 1)
        return chw

# Wrap the test frame in a dataset and serve it in batches of 32.
# shuffle=False keeps batches in row order, so the collected predictions
# line up with the rows of test_df.
test_dataset = TestDataset(dataframe=test_df, is_train=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
import torch.nn as nn
from transformers import ViTForImageClassification

class CustomViTModel(nn.Module):
    """Pre-trained ViT encoder followed by a custom linear classification head.

    Args:
        num_labels: number of output classes produced by the head.
    """

    def __init__(self, num_labels):
        super().__init__()

        # Load the full classification model, then keep only its encoder
        # (`.vit`); the checkpoint's own classifier layer is discarded.
        pretrained = ViTForImageClassification.from_pretrained(
            "google/vit-base-patch16-224",
            num_labels=num_labels,
            ignore_mismatched_sizes=True,
        )
        self.vit_backbone = pretrained.vit

        # Head: 768 -> 1024 -> num_labels. There is no activation between the
        # two linear layers, so the head as a whole is a linear map.
        self.custom_classifier = nn.Sequential(
            nn.Linear(768, 1024),
            nn.Linear(1024, num_labels),
        )

    def forward(self, x):
        """Return class logits for a batch of images ``x``."""
        hidden_states = self.vit_backbone(x).last_hidden_state
        # Position 0 of the sequence is used as the image-level feature
        # (the CLS token embedding).
        cls_embedding = hidden_states[:, 0, :]
        return self.custom_classifier(cls_embedding)


In [3]:
# Prefer the GPU, but fall back to the CPU so this cell also runs on a
# machine without CUDA instead of failing at load time.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Initialize the custom model with the correct number of labels
model = CustomViTModel(num_labels=6)

# Load the fine-tuned model weights.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs and avoids executing arbitrary pickled
# code from the checkpoint file.
state_dict = torch.load('../model/visionT.pth', map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.to(device)

# Inference mode: disables dropout and other training-only behavior.
model.eval()

# Predicted class index for every test image, in loader order.
predictions = []

with torch.no_grad():  # no gradients are needed for prediction
    for inputs in test_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)

        # Index of the largest logit along the class dimension
        _, predicted_labels = torch.max(outputs, 1)

        # Move back to the CPU before collecting into the Python list
        predictions.extend(predicted_labels.cpu().numpy())

# Add predictions to the test DataFrame (0-based class indices)
test_df['stable_height'] = predictions



Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load('../model/visionT.pth', map_location=device))


In [4]:
# Display the test DataFrame, including the `stable_height` column of
# predicted class indices added by the inference cell.
test_df

Unnamed: 0,id,filename,stable_height
0,95,../test/95.jpg,0
1,706,../test/706.jpg,1
2,2854,../test/2854.jpg,0
3,3093,../test/3093.jpg,1
4,4283,../test/4283.jpg,1
...,...,...,...
1915,998419,../test/998419.jpg,0
1916,998676,../test/998676.jpg,5
1917,998916,../test/998916.jpg,0
1918,999235,../test/999235.jpg,0


In [5]:
# Save predictions to a CSV file.
# The model outputs class indices starting at 0; the file stores them shifted
# by +1 (presumably the target labels start at 1 -- confirm against the
# training data). The shift is applied to a separate frame rather than to
# test_df in place, so re-running this cell cannot add 1 a second time.
submission = test_df[['id']].assign(stable_height=test_df['stable_height'] + 1)
submission.to_csv('../predictions/ViT.csv', index=False)