In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import torch
from torch import nn
from torch.optim import Adam
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
from transformers import CLIPProcessor, CLIPModel
from torchvision.transforms.functional import to_pil_image #Import to_pil_image

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 1. Data Augmentation and Preprocessing
img_height, img_width = 224, 224
batch_size = 16

# Path to the dataset
dataset_path = '/content/drive/MyDrive/FYPDocuments/FYPEyeconAIDataset/underbody_sealer'
print(f"Dataset path: {dataset_path}")

# Transformations for data augmentation
data_transforms = transforms.Compose([
    transforms.Resize((img_height, img_width)),
    transforms.RandomRotation(20),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomResizedCrop((img_height, img_width), scale=(0.8, 1.0)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

# Load the dataset
full_dataset = datasets.ImageFolder(root=dataset_path, transform=data_transforms)

# Split dataset: 80% train + val, 20% test
test_size = int(0.2 * len(full_dataset))
train_val_size = len(full_dataset) - test_size
train_val_dataset, test_dataset = random_split(full_dataset, [train_val_size, test_size])

# Further split train_val into 80% train, 20% validation
val_size = int(0.2 * train_val_size)
train_size = train_val_size - val_size
train_dataset, val_dataset = random_split(train_val_dataset, [train_size, val_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# 2. Load Pre-Trained CLIP Model
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Modify CLIP for binary classification by adding a new classification layer
class CLIPClassifier(nn.Module):
    def __init__(self, clip_model):
        super(CLIPClassifier, self).__init__()
        self.clip = clip_model
        self.classifier = nn.Linear(clip_model.text_model.config.hidden_size, 2) # 2 classes

    def forward(self, inputs):
        # Get text embeddings for the labels
        # Assuming your labels are 'OK' and 'NG'
        texts = ["OK", "NG"]
        text_inputs = processor(text=texts, return_tensors="pt", padding=True).to(device)
        text_outputs = self.clip.get_text_features(**text_inputs)

        # Get image embeddings
        image_outputs = self.clip.get_image_features(**inputs)

        # Calculate cosine similarity between image and text embeddings
        logits_per_image = (image_outputs @ text_outputs.T) * self.clip.logit_scale

        return logits_per_image # Return the calculated logits

model = CLIPClassifier(model).to(device)

# 3. Define Loss and Optimizer
criterion = nn.CrossEntropyLoss()
# Access the text_model through the 'clip' attribute of your CLIPClassifier
optimizer = Adam(model.clip.text_model.parameters(), lr=1e-4)

# 4. Training Loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct, total = 0, 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

       # Convert images to PIL before passing to CLIPProcessor
        pil_images = [to_pil_image(image) for image in images]

        # Preprocess images with PIL images
        inputs = processor(images=pil_images, return_tensors="pt", padding=True).to(device)

        # Forward pass
        outputs = model(inputs) # Call the model's forward method, not the nested function
        loss = criterion(outputs, labels) # Calculate loss using the outputs


        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Track metrics
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    train_accuracy = 100 * correct / total
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}, Accuracy: {train_accuracy:.2f}%")

Dataset path: /content/drive/MyDrive/FYPDocuments/FYPEyeconAIDataset/underbody_sealer
Train dataset size: 119
Validation dataset size: 29
Test dataset size: 36


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Epoch [1/100], Loss: 17.3162, Accuracy: 57.14%
Epoch [2/100], Loss: 0.6548, Accuracy: 60.50%
Epoch [3/100], Loss: 0.6205, Accuracy: 66.39%
Epoch [4/100], Loss: 0.6068, Accuracy: 66.39%
Epoch [5/100], Loss: 0.5848, Accuracy: 66.39%
Epoch [6/100], Loss: 0.5467, Accuracy: 73.11%
Epoch [7/100], Loss: 0.5054, Accuracy: 80.67%
Epoch [8/100], Loss: 0.4947, Accuracy: 73.11%
Epoch [9/100], Loss: 0.4258, Accuracy: 79.83%
Epoch [10/100], Loss: 0.3506, Accuracy: 83.19%
Epoch [11/100], Loss: 0.4211, Accuracy: 78.99%
Epoch [12/100], Loss: 0.3052, Accuracy: 87.39%
Epoch [13/100], Loss: 0.2467, Accuracy: 89.92%
Epoch [14/100], Loss: 0.1960, Accuracy: 89.92%
Epoch [15/100], Loss: 0.2279, Accuracy: 90.76%
Epoch [16/100], Loss: 0.2431, Accuracy: 89.08%
Epoch [17/100], Loss: 0.1571, Accuracy: 93.28%
Epoch [18/100], Loss: 0.0964, Accuracy: 96.64%
Epoch [19/100], Loss: 0.2021, Accuracy: 93.28%
Epoch [20/100], Loss: 0.1211, Accuracy: 95.80%
Epoch [21/100], Loss: 0.1604, Accuracy: 92.44%
Epoch [22/100], Loss:

In [3]:
# 5. Validation Loop
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for images, labels in val_loader:
        images, labels = images.to(device), labels.to(device)

        # Convert images to PIL before passing to CLIPProcessor
        pil_images = [to_pil_image(image) for image in images]

        inputs = processor(images=pil_images, return_tensors="pt", padding=True).to(device)
        outputs = model(inputs) # Call the forward method directly
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

val_accuracy = 100 * correct / total
print(f"Validation Accuracy: {val_accuracy:.2f}%")

# 6. Test the Model
correct, total = 0, 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)

        # Convert images to PIL before passing to CLIPProcessor
        pil_images = [to_pil_image(image) for image in images]

        inputs = processor(images=pil_images, return_tensors="pt", padding=True).to(device)
        outputs = model(inputs) # Call the forward method directly
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_accuracy = 100 * correct / total
print(f"Test Accuracy: {test_accuracy:.2f}%")

# Save the model
torch.save(model.state_dict(), 'clip_sealant_detection_model.pth')
print("Model saved as clip_sealant_detection_model.pth")

Validation Accuracy: 89.66%
Test Accuracy: 91.67%
Model saved as clip_sealant_detection_model.pth
