<a href="https://colab.research.google.com/github/MuhammadIrzam447/visionCodes/blob/master/Train_ViT_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fine-tune google/vit-base-patch16-224-in21k on the fused training dataset using SGD (lr=0.001)

In [None]:
# !gdown https://drive.google.com/uc?id=1wgl3QGXZ4m2aLg3T-1TDXQqSP31RuXgL

In [None]:
# !unzip "/content/hateful_train+test_unseen.zip"

In [None]:
!pip install transformers



In [None]:
import torch
import os
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torchvision.transforms import transforms
from transformers import ViTForImageClassification, ViTFeatureExtractor, AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score
from PIL import Image

In [None]:
# Locations of the training split and the held-out split used for validation
train_data_root, val_data_root = (
    "/content/hateful_ViT1/train",
    "/content/hateful_ViT1/test",
)

In [None]:
from transformers import ViTImageProcessor

# Read the normalisation statistics and input resolution from the checkpoint's
# own preprocessing config so the torchvision pipeline below stays in sync with it.
processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
image_mean, image_std = processor.image_mean, processor.image_std
size = processor.size["height"]

# Define transformations for the input images.
# Resize receives an explicit (height, width): a bare int only rescales the
# shorter edge and preserves the aspect ratio, so non-square images would yield
# tensors of differing shapes that cannot be stacked into a batch or fed to a
# model expecting a fixed square input.
transform = transforms.Compose([
    transforms.Resize((processor.size["height"], processor.size["width"])),
    transforms.ToTensor(),
    transforms.Normalize(mean=image_mean, std=image_std)
])


In [None]:
# Load the dataset using ImageFolder and apply transformations
train_dataset = ImageFolder(train_data_root, transform=transform)
val_dataset = ImageFolder(val_data_root, transform=transform)

In [None]:
# Create label2id and id2label dictionaries based on the class names in the dataset
label2id = {class_name: idx for class_name, idx in train_dataset.class_to_idx.items()}
id2label = {idx: class_name for class_name, idx in train_dataset.class_to_idx.items()}

In [None]:
# Initialize the feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

# Define batch size and number of workers (adjust based on your system's resources)
batch_size = 32
num_workers = 1

# Create DataLoader for the dataset
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)



In [None]:
# One target class per class folder discovered by ImageFolder
num_classes = len(train_dataset.class_to_idx)
print(num_classes)

2


In [None]:
# Load the pretrained backbone together with this dataset's label mappings
vit = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    id2label=id2label,
    label2id=label2id,
)
# Swap in a freshly initialised linear head with one output per class
vit.classifier = nn.Linear(in_features=vit.config.hidden_size, out_features=num_classes)

# Run on the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
vit.to(device)
print(vit)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=7

In [None]:
# Training hyper-parameters: epoch budget, loss, and optimiser
num_epochs = 20
criterion = nn.CrossEntropyLoss()
# Alternative optimiser that was tried: AdamW(vit.parameters(), lr=1e-5)
optimizer = optim.SGD(params=vit.parameters(), lr=0.001, momentum=0.9)

In [None]:
# Fine-tune the model for `num_epochs` epochs. After every epoch the weights are
# checkpointed and the model is evaluated on the validation loader; the reported
# metrics describe that epoch's validation pass only.

# Defined up front so the names exist even if the loop body never runs.
predicted_classes = []
actual_labels = []

# Checkpoints are written here after every epoch; the directory only needs to
# be created once, so this is done outside the loop.
save_dir = "/content/Train-ViT-01/"
os.makedirs(save_dir, exist_ok=True)  # Create the directory if it doesn't exist

for epoch in range(num_epochs):
    # ---------------- Training ----------------
    vit.train()
    train_loss = 0.0

    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = vit(images).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight by batch size so the final
        # division by the dataset size gives a true per-sample average.
        train_loss += loss.item() * images.size(0)

    # Calculate average loss for this epoch
    train_loss /= len(train_loader.dataset)

    # Save this epoch's weights as "<epoch>_model.pth"
    model_name = str(epoch+1) + "_model.pth"
    save_path = os.path.join(save_dir, model_name)  # Specify the complete path to the model file
    torch.save(vit.state_dict(), save_path)

    # ---------------- Validation ----------------
    vit.eval()
    val_loss = 0.0
    correct = 0

    # Reset the per-epoch buffers. Without this, predictions from earlier epochs
    # stay in the lists and every metric below becomes a running aggregate over
    # all epochs so far instead of a measurement of the current model.
    predicted_classes = []
    actual_labels = []
    probability_batches = []

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = vit(images).logits
            val_loss += criterion(outputs, labels).item() * images.size(0)

            probabilities = torch.softmax(outputs, dim=1)
            predicted = torch.argmax(probabilities, dim=1)

            correct += (predicted == labels).sum().item()

            predicted_classes.extend(predicted.cpu().numpy())
            actual_labels.extend(labels.cpu().numpy())
            # Keep the class probabilities: AUROC needs continuous scores
            probability_batches.append(probabilities.cpu())

    # Calculate average loss and accuracy for validation set
    val_loss /= len(val_loader.dataset)
    accuracy = correct / len(val_loader.dataset)

    print(f"Epoch {epoch+1}/{num_epochs} - Training Loss: {train_loss:.4f} - Validation Loss: {val_loss:.4f} - Accuracy: {accuracy:.4f}")

    # Compute evaluation metrics using the predicted_classes and actual_labels lists
    accuracy = accuracy_score(actual_labels, predicted_classes)
    precision = precision_score(actual_labels, predicted_classes, average='weighted')
    recall = recall_score(actual_labels, predicted_classes, average='weighted')
    f1 = f1_score(actual_labels, predicted_classes, average='weighted')

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print(classification_report(actual_labels, predicted_classes))
    cm = confusion_matrix(actual_labels, predicted_classes)
    print("Confusion Matrix:")
    print(cm)

    # AUROC is a ranking metric, so it is computed from the predicted
    # probabilities rather than the hard argmax labels (which collapse the ROC
    # curve to a single operating point).
    val_probabilities = torch.cat(probability_batches).numpy()
    if val_probabilities.shape[1] == 2:
        # Binary case: score is the probability of the class with index 1
        auroc = roc_auc_score(actual_labels, val_probabilities[:, 1])
    else:
        # Multi-class case: one-vs-rest over the full probability matrix
        auroc = roc_auc_score(actual_labels, val_probabilities, multi_class="ovr")
    print("AUROC:", auroc)



Epoch 1/20 - Training Loss: 0.6384 - Validation Loss: 0.6469 - Accuracy: 0.6355
Accuracy: 0.6355
Precision: 0.6191898055498133
Recall: 0.6355
F1-score: 0.5405230099961142
              precision    recall  f1-score   support

           0       0.64      0.96      0.77      1250
           1       0.59      0.09      0.16       750

    accuracy                           0.64      2000
   macro avg       0.61      0.53      0.47      2000
weighted avg       0.62      0.64      0.54      2000

Confusion Matrix:
[[1200   50]
 [ 679   71]]
AUROC: 0.5273333333333333
Epoch 2/20 - Training Loss: 0.5923 - Validation Loss: 0.6424 - Accuracy: 0.6460
Accuracy: 0.64075
Precision: 0.6252748433253531
Recall: 0.64075
F1-score: 0.5617393213623161
              precision    recall  f1-score   support

           0       0.65      0.94      0.77      2500
           1       0.59      0.14      0.22      1500

    accuracy                           0.64      4000
   macro avg       0.62      0.54      0