In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/ece_project2/vila
!pwd

/content/drive/MyDrive/ece_project2/vila
/content/drive/MyDrive/ece_project2/vila


In [4]:
#!/bin/bash
# !kaggle datasets download agrigorev/clothing-dataset-full

Dataset URL: https://www.kaggle.com/datasets/agrigorev/clothing-dataset-full
License(s): CC0-1.0
Downloading clothing-dataset-full.zip to /content/drive/MyDrive/ece_project2/vila
100% 6.50G/6.50G [04:49<00:00, 25.9MB/s]
100% 6.50G/6.50G [04:49<00:00, 24.1MB/s]


In [None]:
# !unzip clothing-dataset-full.zip -d clothing-dataset-full

In [6]:
from huggingface_hub import login
import getpass

# Prompt the user to enter their Hugging Face Token
hf_token = getpass.getpass("Enter your Hugging Face Token: ")

# Use the token to log in to Hugging Face
login(token=hf_token)

print("Successfully authenticated with Hugging Face!")

Enter your Hugging Face Token: ··········
Successfully authenticated with Hugging Face!


### *Step* 1: Filter T-shirts and pants dataset

In [3]:
import os
import shutil
import pandas as pd

dest_dataset_dir = "./dataset"
os.makedirs(dest_dataset_dir, exist_ok=True)

tshirt_dir = os.path.join(dest_dataset_dir, "tshirt")
pants_dir = os.path.join(dest_dataset_dir, "pants")
os.makedirs(tshirt_dir, exist_ok=True)
os.makedirs(pants_dir, exist_ok=True)


In [4]:
csv_file = "./clothing-dataset-full/images.csv"

df = pd.read_csv(csv_file)
df.head()


Unnamed: 0,image,sender_id,label,kids
0,4285fab0-751a-4b74-8e9b-43af05deee22,124,Not sure,False
1,ea7b6656-3f84-4eb3-9099-23e623fc1018,148,T-Shirt,False
2,00627a3f-0477-401c-95eb-92642cbe078d,94,Not sure,False
3,ea2ffd4d-9b25-4ca8-9dc2-bd27f1cc59fa,43,T-Shirt,False
4,3b86d877-2b9e-4c8b-a6a2-1d87513309d0,189,Shoes,False


In [5]:
summary = df['label'].value_counts()
tshirt_count = summary.get('T-Shirt', 0)
pants_count = summary.get('Pants', 0)

print(f"Number of T-Shirt images: {tshirt_count}")
print(f"Number of Pants images: {pants_count}")

Number of T-Shirt images: 1011
Number of Pants images: 692


In [8]:
orig_dataset_dir = "./clothing-dataset-full/images_original"
tshirt_dir = os.path.join(dest_dataset_dir, "tshirt")
pants_dir = os.path.join(dest_dataset_dir, "pants")

filtered_df = df[df['label'].isin(["T-Shirt", "Pants"])]

for index, row in filtered_df.iterrows():
  img_name = f"{row['image']}.jpg"
  src_path = os.path.join(orig_dataset_dir, img_name)
  if row['label'] == "T-Shirt":
    shutil.copy(src_path, tshirt_dir)
  elif row['label'] == "Pants":
    shutil.copy(src_path, pants_dir)

### Step 2: Load and Preprocess the Dataset

In [4]:
from torchvision import transforms
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import ImageFolder

dest_dataset_dir = "./dataset"

transform = transforms.Compose([
    transforms.Resize((336, 336)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

dataset = ImageFolder(root=dest_dataset_dir, transform=transform)

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

### Step 3: Load Vila Model (ViTL/14)

In [4]:
# !git clone https://github.com/Efficient-Large-Model/VILA.git
# %cd VILA

/content/drive/MyDrive/ece_project2/vila/VILA


In [6]:
# !chmod a+x environment_setup.sh
# !./environment_setup.sh vila

# !python -W ignore llava/eval/run_vila.py \
#     --model-path Efficient-Large-Model/Llama-3-VILA1.5-8 \
#     --conv-mode llama_3 \
#     --query "<image>\n Please describe the traffic condition." \
#     --image-file "car.jpg"

In [5]:
import torch
from transformers import CLIPProcessor, CLIPModel
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e

In [6]:
vision_model = model.vision_model

print(vision_model)

CLIPVisionTransformer(
  (embeddings): CLIPVisionEmbeddings(
    (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (position_embedding): Embedding(257, 1024)
  )
  (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (encoder): CLIPEncoder(
    (layers): ModuleList(
      (0-23): 24 x CLIPEncoderLayer(
        (self_attn): CLIPSdpaAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): CLIPMLP(
          (activation_fn): QuickGELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, 

### Step 4: Modify Configuration

In [7]:
config = vision_model.config
print(config)

CLIPVisionConfig {
  "_attn_implementation_autoset": true,
  "attention_dropout": 0.0,
  "dropout": 0.0,
  "hidden_act": "quick_gelu",
  "hidden_size": 1024,
  "image_size": 224,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "model_type": "clip_vision_model",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 14,
  "projection_dim": 768,
  "transformers_version": "4.46.3"
}



In [8]:
from transformers import CLIPVisionConfig, CLIPVisionModel

config.image_size = 336
num_patches = (config.image_size // config.patch_size) **  2

updated_vision_model = CLIPVisionModel(config)
updated_vision_model.to(device)
print(updated_vision_model.config)

CLIPVisionConfig {
  "_attn_implementation_autoset": true,
  "attention_dropout": 0.0,
  "dropout": 0.0,
  "hidden_act": "quick_gelu",
  "hidden_size": 1024,
  "image_size": 336,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "model_type": "clip_vision_model",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 14,
  "projection_dim": 768,
  "transformers_version": "4.46.3"
}



### Step 5: Train the Original Model

In [9]:
import os
torch.save(updated_vision_model, "entire_model.pth")

model_size = os.path.getsize("entire_model.pth") / (1024 * 1024)
print(f"Entire Model Size: {model_size:.2f} MB")

Entire Model Size: 1157.98 MB


In [10]:
import time
import psutil

input_size = (3, 336, 336)

def extract_embeddings(model, dataloader, device):
    model.eval()
    all_embeddings = []
    all_labels = []

    batch_times = []

    start_time = time.time()
    if device != "cpu":
      torch.cuda.reset_peak_memory_stats(device)


    with torch.no_grad():
        for images, labels in tqdm(dataloader):
            batch_start_time = time.time()
            images = images.to(device)


            outputs = model(pixel_values=images)
            embeddings = outputs.last_hidden_state[:, 0, :]

            batch_times.append(time.time() - batch_start_time)

            all_embeddings.append(embeddings.cpu())
            all_labels.append(labels.cpu())

    total_time = time.time() - start_time
    print(f"Total Embedding Extraction Time: {total_time:.2f} seconds")

    avg_batch_time = sum(batch_times) / len(batch_times)
    print(f"Average Time per Batch: {avg_batch_time:.2f} seconds")

    if device != "cpu":
      peak_memory = torch.cuda.max_memory_allocated(device) / (1024 * 1024)  # Convert to MB
      print(f"Peak Memory Usage During Embedding Extraction: {peak_memory:.2f} MB")

    return torch.cat(all_embeddings), torch.cat(all_labels)



In [11]:
train_embeddings, train_labels = extract_embeddings(updated_vision_model, train_loader, device)
test_embeddings, test_labels = extract_embeddings(updated_vision_model, test_loader, device)

print(f"Train Embeddings Shape: {train_embeddings.shape}")
print(f"Test Embeddings Shape: {test_embeddings.shape}")

100%|██████████| 43/43 [13:43<00:00, 19.15s/it]


Total Embedding Extraction Time: 823.50 seconds
Average Time per Batch: 0.14 seconds
Peak Memory Usage During Embedding Extraction: 4426.27 MB


100%|██████████| 11/11 [03:16<00:00, 17.84s/it]

Total Embedding Extraction Time: 196.26 seconds
Average Time per Batch: 0.02 seconds
Peak Memory Usage During Embedding Extraction: 4065.65 MB
Train Embeddings Shape: torch.Size([1362, 1024])
Test Embeddings Shape: torch.Size([341, 1024])





In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def train_and_evaluate_classifier(train_embeddings, train_labels, test_embeddings, test_labels):

    train_embeddings_np = train_embeddings.numpy()
    train_labels_np = train_labels.numpy()
    test_embeddings_np = test_embeddings.numpy()
    test_labels_np = test_labels.numpy()


    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(train_embeddings_np, train_labels_np)


    predictions = classifier.predict(test_embeddings_np)
    accuracy = accuracy_score(test_labels_np, predictions)

    print(f"Test Accuracy: {accuracy:.4f}")


In [13]:
train_and_evaluate_classifier(train_embeddings, train_labels, test_embeddings, test_labels)

Test Accuracy: 0.7801


### Step 6:Pruning

#### Linear Layer Pruning

In [14]:
import torch.nn.utils.prune as prune

def apply_pruning(model, amount=0.3):
    pruned_model = model.__class__(model.config).to(next(model.parameters()).device)
    pruned_model.load_state_dict(model.state_dict())

    for name, module in pruned_model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=amount)

    for name, module in pruned_model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.remove(module, 'weight')

    return pruned_model

pruned_model = apply_pruning(updated_vision_model, amount=0.45)

torch.save(pruned_model.state_dict(), "pruned_vision_model.pth")
print(f"Pruned model saved. Size: {os.path.getsize('pruned_vision_model.pth') / 1e6:.2f} MB")


Pruned model saved. Size: 1214.19 MB


In [15]:
pruned_train_embeddings, pruned_train_labels = extract_embeddings(
    pruned_model, train_loader, device)

pruned_test_embeddings, pruned_test_labels = extract_embeddings(
    pruned_model, test_loader, device)

train_and_evaluate_classifier(pruned_train_embeddings, pruned_train_labels,
                              pruned_test_embeddings, pruned_test_labels)


100%|██████████| 43/43 [05:06<00:00,  7.12s/it]


Total Embedding Extraction Time: 306.30 seconds
Average Time per Batch: 0.02 seconds
Peak Memory Usage During Embedding Extraction: 5655.53 MB


100%|██████████| 11/11 [01:15<00:00,  6.90s/it]


Total Embedding Extraction Time: 75.87 seconds
Average Time per Batch: 0.02 seconds
Peak Memory Usage During Embedding Extraction: 5222.78 MB
Test Accuracy: 0.7771


In [16]:
def measure_inference_time(model, dataloader, device):
    model.eval()
    total_time = 0
    total_batches = len(dataloader)

    with torch.no_grad():
        for images, _ in dataloader:
            images = images.to(device)

            start_time = time.time()
            _ = model(pixel_values=images)
            end_time = time.time()

            total_time += (end_time - start_time)

    avg_time_per_batch = total_time / total_batches
    print(f"Total Inference Time: {total_time:.2f} seconds")
    print(f"Average Time per Batch: {avg_time_per_batch:.4f} seconds")

    return total_time, avg_time_per_batch


In [17]:
measure_inference_time(updated_vision_model, test_loader, device)
measure_inference_time(pruned_model, test_loader, device)

Total Inference Time: 0.16 seconds
Average Time per Batch: 0.0146 seconds
Total Inference Time: 0.15 seconds
Average Time per Batch: 0.0140 seconds


(0.15371060371398926, 0.013973691246726296)

In [20]:
import torch.nn.utils.prune as prune

def structured_prune_model(vision_model, prune_amount=0.3):
    for i, layer in enumerate(vision_model.encoder.layers):
        prune.ln_structured(layer.self_attn.k_proj, name="weight", amount=prune_amount, n=2, dim=0)
        prune.ln_structured(layer.self_attn.q_proj, name="weight", amount=prune_amount, n=2, dim=0)
        prune.ln_structured(layer.self_attn.v_proj, name="weight", amount=prune_amount, n=2, dim=0)
        prune.ln_structured(layer.self_attn.out_proj, name="weight", amount=prune_amount, n=2, dim=0)

        prune.ln_structured(layer.mlp.fc1, name="weight", amount=prune_amount, n=2, dim=0)
        prune.ln_structured(layer.mlp.fc2, name="weight", amount=prune_amount, n=2, dim=0)

        print(f"Layer {i}: Pruned {prune_amount * 100}% of output channels")

prune_amount = 0.3
structured_prune_model(updated_vision_model.vision_model, prune_amount)


Layer 0: Pruned 30.0% of output channels
Layer 1: Pruned 30.0% of output channels
Layer 2: Pruned 30.0% of output channels
Layer 3: Pruned 30.0% of output channels
Layer 4: Pruned 30.0% of output channels
Layer 5: Pruned 30.0% of output channels
Layer 6: Pruned 30.0% of output channels
Layer 7: Pruned 30.0% of output channels
Layer 8: Pruned 30.0% of output channels
Layer 9: Pruned 30.0% of output channels
Layer 10: Pruned 30.0% of output channels
Layer 11: Pruned 30.0% of output channels
Layer 12: Pruned 30.0% of output channels
Layer 13: Pruned 30.0% of output channels
Layer 14: Pruned 30.0% of output channels
Layer 15: Pruned 30.0% of output channels
Layer 16: Pruned 30.0% of output channels
Layer 17: Pruned 30.0% of output channels
Layer 18: Pruned 30.0% of output channels
Layer 19: Pruned 30.0% of output channels
Layer 20: Pruned 30.0% of output channels
Layer 21: Pruned 30.0% of output channels
Layer 22: Pruned 30.0% of output channels
Layer 23: Pruned 30.0% of output channels


#### Self-Attention Layer Pruning

In [21]:
def remove_pruning_masks(vision_model):
    for layer in vision_model.encoder.layers:
        prune.remove(layer.self_attn.k_proj, "weight")
        prune.remove(layer.self_attn.q_proj, "weight")
        prune.remove(layer.self_attn.v_proj, "weight")
        prune.remove(layer.self_attn.out_proj, "weight")
        prune.remove(layer.mlp.fc1, "weight")
        prune.remove(layer.mlp.fc2, "weight")
    print("Pruning masks removed. Model finalized.")

remove_pruning_masks(updated_vision_model.vision_model)


Pruning masks removed. Model finalized.


In [22]:
train_embeddings, train_labels = extract_embeddings(updated_vision_model, train_loader, device)
test_embeddings, test_labels = extract_embeddings(updated_vision_model, test_loader, device)

100%|██████████| 43/43 [05:03<00:00,  7.07s/it]


Total Embedding Extraction Time: 303.92 seconds
Average Time per Batch: 0.02 seconds
Peak Memory Usage During Embedding Extraction: 5336.38 MB


100%|██████████| 11/11 [01:14<00:00,  6.77s/it]

Total Embedding Extraction Time: 74.50 seconds
Average Time per Batch: 0.02 seconds
Peak Memory Usage During Embedding Extraction: 5336.38 MB





In [23]:
train_and_evaluate_classifier(train_embeddings, train_labels, test_embeddings, test_labels)

Test Accuracy: 0.7683


In [24]:
measure_inference_time(updated_vision_model, test_loader, device)

Total Inference Time: 0.15 seconds
Average Time per Batch: 0.0136 seconds


(0.1492633819580078, 0.013569398359818892)