<a href="https://colab.research.google.com/github/MuhammadIrzam447/visionCodes/blob/master/Train_ViT_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# google/vit-base-patch16-224 on fused train dataset using Adam lr=1e-5

In [None]:
!gdown https://drive.google.com/uc?id=1wgl3QGXZ4m2aLg3T-1TDXQqSP31RuXgL

In [None]:
!unzip "/content/hateful_train+test_unseen.zip"

In [None]:
!pip install transformers

In [None]:
import torch
import os
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torchvision.transforms import transforms
from transformers import ViTForImageClassification, ViTFeatureExtractor, AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score
from PIL import Image

In [None]:
# Define the path to your training and validation data
train_data_root = "/content/hateful_ViT1/train"
val_data_root = "/content/hateful_ViT1/test"

In [None]:
from transformers import ViTImageProcessor

processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
image_mean, image_std = processor.image_mean, processor.image_std
size = processor.size["height"]

# Define transformations for the input images
transform = transforms.Compose([
    transforms.Resize(size),
    transforms.ToTensor(),
    transforms.Normalize(mean=image_mean, std=image_std)
])


In [None]:
# Load the dataset using ImageFolder and apply transformations
train_dataset = ImageFolder(train_data_root, transform=transform)
val_dataset = ImageFolder(val_data_root, transform=transform)

In [None]:
# Create label2id and id2label dictionaries based on the class names in the dataset
label2id = {class_name: idx for class_name, idx in train_dataset.class_to_idx.items()}
id2label = {idx: class_name for class_name, idx in train_dataset.class_to_idx.items()}

In [None]:
# Define batch size and number of workers (adjust based on your system's resources)
batch_size = 32

In [None]:
# Create DataLoader for the dataset
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
num_classes = len(train_dataset.classes)
print(num_classes)

In [None]:
vit = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224", id2label=id2label, label2id=label2id, num_labels=num_classes, ignore_mismatched_sizes=True)
vit.classifier = nn.Linear(vit.config.hidden_size, num_classes)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vit.to(device)
print(vit)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(vit.parameters(), lr=1e-5)
# optimizer = optim.SGD(vit.parameters(), lr=0.001, momentum=0.9)
num_epochs = 20

In [None]:
predicted_classes = []
actual_labels = []

for epoch in range(num_epochs):
    vit.train()
    train_loss = 0.0

    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        inputs = processor(images=images, return_tensors="pt")
        outputs = model(**inputs)
        loss = criterion(outputs.logits, labels)
        total_loss += loss.item()

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * images.size(0)

    # Calculate average loss for this epoch
    train_loss /= len(train_loader.dataset)

    save_dir = "/content/Train-ViT-02/"
    os.makedirs(save_dir, exist_ok=True)  # Create the directory if it doesn't exist

    model_name = str(epoch+1) + "_model.pth"
    save_path = os.path.join(save_dir, model_name)  # Specify the complete path to the model file
    torch.save(vit.state_dict(), save_path)

    # Validation
    vit.eval()
    val_loss = 0.0
    correct = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)

            # Forward pass
            inputs = processor(images=images, return_tensors="pt")
            outputs = model(**inputs)
            loss = criterion(outputs.logits, labels)
            total_val_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(outputs.logits, 1)
            correct_predictions += (predicted == labels).sum().item()

            predicted_classes.extend(predicted.cpu().numpy())
            actual_labels.extend(labels.cpu().numpy())

    # Calculate average loss and accuracy for validation set
    val_loss /= len(val_loader.dataset)
    accuracy = correct / len(val_loader.dataset)

    print(f"Epoch {epoch+1}/{num_epochs} - Training Loss: {train_loss:.4f} - Validation Loss: {val_loss:.4f} - Accuracy: {accuracy:.4f}")

    # Compute evaluation metrics using the predicted_classes and actual_labels lists
    accuracy = accuracy_score(actual_labels, predicted_classes)
    precision = precision_score(actual_labels, predicted_classes, average='weighted')
    recall = recall_score(actual_labels, predicted_classes, average='weighted')
    f1 = f1_score(actual_labels, predicted_classes, average='weighted')

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print(classification_report(actual_labels, predicted_classes))
    cm = confusion_matrix(actual_labels, predicted_classes)
    print("Confusion Matrix:")
    print(cm)
    auroc = roc_auc_score(actual_labels, predicted_classes)
    print("AUROC:", auroc)



# New Section

In [1]:
!pip install transformers
import torch
import os
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torchvision.transforms import transforms
from transformers import ViTForImageClassification, ViTFeatureExtractor, AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import ImageFolder



In [2]:
# Define the path to your training and validation data
train_data_root = "/content/hateful_ViT1/train"
val_data_root = "/content/hateful_ViT1/test"

In [3]:
class CustomImageDataset(Dataset):
    def __init__(self, root_dir):
        self.root_dir = root_dir
        self.image_folder = ImageFolder(root_dir, transform=None)

    def __len__(self):
        return len(self.image_folder)

    def __getitem__(self, idx):
        image_path, label = self.image_folder.imgs[idx]
        image = self.load_function(image_path)
        return image, label

    def load_function(self, path):
        # Load the image in RGB format using PIL
        image = Image.open(path).convert("RGB")
        # Resize the image to the required input size for the ViT model
        transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ])
        image = transform(image)
        return image

def custom_collate_fn(batch):
    images, labels = zip(*batch)
    images = torch.stack(images, dim=0)
    labels = torch.tensor(labels)
    return images, labels

In [4]:
train_dataset = CustomImageDataset(train_data_root)
val_dataset = CustomImageDataset(val_data_root)

In [5]:
batch_size = 32

In [6]:
# Create DataLoader for the dataset
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,  collate_fn=custom_collate_fn)

In [7]:
# Merge the class_to_idx dictionaries from both train and validation datasets
label2id = {class_name: idx for class_name, idx in train_dataset.image_folder.class_to_idx.items()}
id2label = {idx: class_name for class_name, idx in label2id.items()}


In [8]:
num_classes = len(train_dataset.image_folder.classes)
print(num_classes)

2


In [9]:
from transformers import ViTImageProcessor
processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
vit = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224", id2label=id2label, label2id=label2id, num_labels=num_classes, ignore_mismatched_sizes=True)
vit.classifier = nn.Linear(vit.config.hidden_size, num_classes)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vit.to(device)
print(vit)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=7

In [10]:
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(vit.parameters(), lr=1e-5)
# optimizer = optim.SGD(vit.parameters(), lr=0.001, momentum=0.9)
num_epochs = 20



In [None]:
predicted_classes = []
actual_labels = []

for epoch in range(num_epochs):
    vit.train()
    train_loss = 0.0

    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        inputs = processor(images=images, return_tensors="pt")
        inputs = inputs.to(device)
        outputs = vit(**inputs)
        loss = criterion(outputs.logits, labels)
        # total_loss += loss.item()

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * images.size(0)

    # Calculate average loss for this epoch
    train_loss /= len(train_loader.dataset)

    save_dir = "/content/Train-ViT-02/"
    os.makedirs(save_dir, exist_ok=True)  # Create the directory if it doesn't exist

    model_name = str(epoch+1) + "_model.pth"
    save_path = os.path.join(save_dir, model_name)  # Specify the complete path to the model file
    torch.save(vit.state_dict(), save_path)

    # Validation
    vit.eval()
    val_loss = 0.0
    correct = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)

            # Forward pass
            inputs = processor(images=images, return_tensors="pt")
            inputs = inputs.to(device)
            outputs = vit(**inputs)
            loss = criterion(outputs.logits, labels)
            val_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(outputs.logits, 1)
            correct += (predicted == labels).sum().item()

            predicted_classes.extend(predicted.cpu().numpy())
            actual_labels.extend(labels.cpu().numpy())

    # Calculate average loss and accuracy for validation set
    val_loss /= len(val_loader.dataset)
    accuracy = correct / len(val_loader.dataset)

    print(f"Epoch {epoch+1}/{num_epochs} - Training Loss: {train_loss:.4f} - Validation Loss: {val_loss:.4f} - Accuracy: {accuracy:.4f}")

    # Compute evaluation metrics using the predicted_classes and actual_labels lists
    accuracy = accuracy_score(actual_labels, predicted_classes)
    precision = precision_score(actual_labels, predicted_classes, average='weighted')
    recall = recall_score(actual_labels, predicted_classes, average='weighted')
    f1 = f1_score(actual_labels, predicted_classes, average='weighted')

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print(classification_report(actual_labels, predicted_classes))
    cm = confusion_matrix(actual_labels, predicted_classes)
    print("Confusion Matrix:")
    print(cm)
    auroc = roc_auc_score(actual_labels, predicted_classes)
    print("AUROC:", auroc)


Epoch 1/20 - Training Loss: 0.6593 - Validation Loss: 0.0209 - Accuracy: 0.6250
Accuracy: 0.625
Precision: 0.390625
Recall: 0.625
F1-score: 0.4807692307692308
              precision    recall  f1-score   support

           0       0.62      1.00      0.77      1250
           1       0.00      0.00      0.00       750

    accuracy                           0.62      2000
   macro avg       0.31      0.50      0.38      2000
weighted avg       0.39      0.62      0.48      2000

Confusion Matrix:
[[1250    0]
 [ 750    0]]
AUROC: 0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/20 - Training Loss: 0.6564 - Validation Loss: 0.0211 - Accuracy: 0.6250
Accuracy: 0.625
Precision: 0.390625
Recall: 0.625
F1-score: 0.4807692307692308
              precision    recall  f1-score   support

           0       0.62      1.00      0.77      2500
           1       0.00      0.00      0.00      1500

    accuracy                           0.62      4000
   macro avg       0.31      0.50      0.38      4000
weighted avg       0.39      0.62      0.48      4000

Confusion Matrix:
[[2500    0]
 [1500    0]]
AUROC: 0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3/20 - Training Loss: 0.6543 - Validation Loss: 0.0208 - Accuracy: 0.6250
Accuracy: 0.625
Precision: 0.390625
Recall: 0.625
F1-score: 0.4807692307692308
              precision    recall  f1-score   support

           0       0.62      1.00      0.77      3750
           1       0.00      0.00      0.00      2250

    accuracy                           0.62      6000
   macro avg       0.31      0.50      0.38      6000
weighted avg       0.39      0.62      0.48      6000

Confusion Matrix:
[[3750    0]
 [2250    0]]
AUROC: 0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
