In [1]:
from transformers import AutoImageProcessor, ViTForImageClassification
import torch
from PIL import Image
import pandas as pd
from sklearn.model_selection import train_test_split
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader

# Set the image size and paths
imsize = 224
image_dir = '../train/'
metadata = pd.read_csv('../train.csv')
metadata['filename'] = metadata['id'].apply(lambda x: f"{image_dir}{x}.jpg")

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(metadata, test_size=0.2, random_state=42)

# Subtract 1 from each label in the 'stable_height' column to make labels zero-indexed
train_df['stable_height'] = train_df['stable_height'] - 1
val_df['stable_height'] = val_df['stable_height'] - 1


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import random
import numpy as np
random_seed = 25
random.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)

In [3]:
import torch
import torch.nn as nn
from transformers import ViTForImageClassification

class CustomViTModel(nn.Module):
    def __init__(self, num_labels):
        super(CustomViTModel, self).__init__()
        # Load the pre-trained Vision Transformer and extract the backbone
        self.vit_backbone = ViTForImageClassification.from_pretrained(
            "google/vit-base-patch16-224",
            num_labels=num_labels,
            ignore_mismatched_sizes=True
        ).vit
        
        # Add custom fully connected layers
        self.custom_classifier = nn.Sequential(
            nn.Linear(768, 1024),
            nn.Linear(1024, num_labels)
        )
    
    def forward(self, x):
        # Forward pass through the Vision Transformer backbone
        vit_outputs = self.vit_backbone(x)
        # Extract the last hidden state (we use the CLS token embedding as the feature)
        vit_features = vit_outputs.last_hidden_state[:, 0, :]  # CLS token embedding
        # Forward pass through the custom classifier
        output = self.custom_classifier(vit_features)
        return output


In [4]:
import numpy as np
from PIL import Image
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, dataframe, is_train=True):
        self.dataframe = dataframe
        self.is_train = is_train

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx]['filename']
        label = self.dataframe.iloc[idx]['stable_height']
        image = Image.open(img_path).convert('RGB')  # Load the image file
        
        # Resize the image manually
        image = image.resize((224, 224))
        
        image = np.array(image, dtype=np.float32)
        image = (image - 127.5) / 127.5  # Normalize the image
        
        # Convert the numpy array to a PyTorch tensor
        image = torch.from_numpy(image).permute(2, 0, 1)  # Change HWC to CHW format for PyTorch
        
        return image, label 



# Training and validating

In [5]:
# Create PyTorch datasets and dataloaders
train_dataset = CustomDataset(train_df, is_train=True)
val_dataset = CustomDataset(val_df, is_train=False)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [6]:
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

class EarlyStopping:
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pth', trace_func=print):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = float('inf')
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decreases.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

# Initialize model, optimizer, and loss function
model = CustomViTModel(6)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Initialize early stopping and reduce LR on plateau
early_stopping = EarlyStopping(patience=7, verbose=True, path='best_model.pth')
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, min_lr=1e-6, verbose=True)

num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Get current learning rate from optimizer
    current_lr = optimizer.param_groups[0]['lr']
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader)}, Learning Rate: {current_lr}')

    # Validation step
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_loss = val_loss / len(val_loader)
    print(f'Validation Loss: {val_loss}, Accuracy: {100 * correct / total:.2f}%')

    # Step the scheduler
    scheduler.step(val_loss)

    # Early stopping check
    early_stopping(val_loss, model)

    # Break if early stopping is triggered
    if early_stopping.early_stop:
        print("Early stopping")
        break

# Load the best model weights
model.load_state_dict(torch.load('best_model.pth'))


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/100], Loss: 1.5569626421978076, Learning Rate: 0.0001
Validation Loss: 1.4024630164106686, Accuracy: 41.28%
Validation loss decreased (inf --> 1.402463).  Saving model ...
Epoch [2/100], Loss: 1.2632704256102443, Learning Rate: 0.0001
Validation Loss: 1.229226807753245, Accuracy: 48.50%
Validation loss decreased (1.402463 --> 1.229227).  Saving model ...
Epoch [3/100], Loss: 0.9331069805969795, Learning Rate: 0.0001
Validation Loss: 1.2608084132273991, Accuracy: 49.93%
EarlyStopping counter: 1 out of 7
Epoch [4/100], Loss: 0.5865986347974589, Learning Rate: 0.0001
Validation Loss: 1.5306539634863536, Accuracy: 51.30%
EarlyStopping counter: 2 out of 7


KeyboardInterrupt: 

In [7]:
model_save_path = "../model/visionT.pth"
torch.save(model.state_dict(), model_save_path)