In [None]:
# Mount Google Drive at /content/drive; the project folder entered in the next cell lives under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd drive/MyDrive/ece_project2/vila
!pwd

In [None]:
#!/bin/bash
# !kaggle datasets download agrigorev/clothing-dataset-full

In [None]:
# !unzip clothing-dataset-full.zip -d clothing-dataset-full

In [None]:
from huggingface_hub import login
import getpass

# Ask for the Hugging Face token interactively (getpass does not echo the input),
# then pass it to huggingface_hub.login.
hf_token = getpass.getpass(prompt="Enter your Hugging Face Token: ")
login(token=hf_token)

# Reached only when login() returned without raising.
print("Successfully authenticated with Hugging Face!")

### Step 1: Filter the T-shirt and pants images from the dataset

In [None]:
import os
import shutil
import pandas as pd

# Destination layout: ./dataset/tshirt and ./dataset/pants.
dest_dataset_dir = "./dataset"
tshirt_dir = os.path.join(dest_dataset_dir, "tshirt")
pants_dir = os.path.join(dest_dataset_dir, "pants")

# exist_ok=True keeps the cell safe to run more than once.
for directory in (dest_dataset_dir, tshirt_dir, pants_dir):
    os.makedirs(directory, exist_ok=True)


In [None]:
# Load the image metadata table; later cells read its 'image' and 'label' columns.
csv_file = "./clothing-dataset-full/images.csv"

df = pd.read_csv(csv_file)
# Last expression of the cell: shows the first rows as a quick sanity check.
df.head()


In [None]:
# Per-label image counts for the whole table.
summary = df['label'].value_counts()

# .get(..., 0) reports zero instead of raising if a label is absent from the CSV.
tshirt_count, pants_count = (summary.get(name, 0) for name in ('T-Shirt', 'Pants'))

print(f"Number of T-Shirt images: {tshirt_count}")
print(f"Number of Pants images: {pants_count}")

In [None]:
orig_dataset_dir = "./clothing-dataset-full/images_original"
tshirt_dir = os.path.join(dest_dataset_dir, "tshirt")
pants_dir = os.path.join(dest_dataset_dir, "pants")

# Keep only the two classes of interest.
filtered_df = df[df['label'].isin(["T-Shirt", "Pants"])]

# Each kept label maps to the folder its images are copied into.
label_to_dir = {"T-Shirt": tshirt_dir, "Pants": pants_dir}

# The CSV stores image ids without an extension; the files on disk are "<id>.jpg".
for image_id, label in zip(filtered_df['image'], filtered_df['label']):
    src_path = os.path.join(orig_dataset_dir, f"{image_id}.jpg")
    shutil.copy(src_path, label_to_dir[label])

### Step 2: Load and Preprocess the Dataset

In [None]:
from torchvision import transforms
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import ImageFolder

import torch  # needed here for the seeded split generator

dest_dataset_dir = "./dataset"
SPLIT_SEED = 42  # fixed so the train/test split is identical after a kernel restart

# Resize to the 336x336 input used by the vision model below and scale pixels to [-1, 1].
# NOTE(review): mean/std of 0.5 differ from the normalisation statistics shipped with
# the CLIP image processor — confirm this choice is intentional.
transform = transforms.Compose([
    transforms.Resize((336, 336)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

# ImageFolder derives the class label from each image's sub-folder name.
dataset = ImageFolder(root=dest_dataset_dir, transform=transform)

# 80/20 split; the remainder goes to the test set so the sizes always sum to len(dataset).
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    # Without an explicit generator the split depends on the global RNG state,
    # so results from different sessions are not comparable.
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

### Step 3: Load the VILA Vision Model (ViT-L/14)

In [None]:
# !git clone https://github.com/Efficient-Large-Model/VILA.git
# %cd VILA

In [None]:
# !chmod a+x environment_setup.sh
# !./environment_setup.sh vila

# !python -W ignore llava/eval/run_vila.py \
#     --model-path Efficient-Large-Model/Llama-3-VILA1.5-8 \
#     --conv-mode llama_3 \
#     --query "<image>\n Please describe the traffic condition." \
#     --image-file "car.jpg"

In [None]:
import torch
from transformers import CLIPProcessor, CLIPModel
from tqdm import tqdm

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the CLIP ViT-L/14 checkpoint; only its vision tower is used further down.
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
model.to(device)


In [None]:
# Keep a handle on the vision tower and print its module structure.
vision_model = model.vision_model

print(vision_model)

### Step 4: Modify Configuration

In [None]:
# Inspect the vision tower's configuration object.
# Note: this is a reference to the model's own config, not a copy.
config = vision_model.config
print(config)

In [None]:
from transformers import CLIPVisionConfig, CLIPVisionModel

# Load the 336-px CLIP ViT-L/14 vision tower together with its pretrained weights.
# Building it as CLIPVisionModel(config) after setting config.image_size = 336 would
#   (a) leave every weight randomly initialised, so the embeddings evaluated below
#       would not come from a trained encoder, and
#   (b) mutate the config object that is still attached to the 224-px `model` above.
updated_vision_model = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14-336")
updated_vision_model.to(device)

# Rebind `config` to the new model's own config instead of editing the shared one.
config = updated_vision_model.config
num_patches = (config.image_size // config.patch_size) ** 2
print(updated_vision_model.config)

### Step 5: Evaluate the Original (Unpruned) Model

In [None]:
import os
# Serialise the whole module object (not just its state_dict) and report the file size.
entire_model_path = "entire_model.pth"
torch.save(updated_vision_model, entire_model_path)

bytes_per_mb = 1024 * 1024
model_size = os.path.getsize(entire_model_path) / bytes_per_mb
print(f"Entire Model Size: {model_size:.2f} MB")

In [None]:
import time
import psutil

input_size = (3, 336, 336)

def extract_embeddings(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and collect one embedding per image.

    Each batch is passed as ``model(pixel_values=images)`` and token 0 of
    ``outputs.last_hidden_state`` is kept as the image embedding (presumably
    the class token of a CLIP vision tower -- confirm for other models).
    The total wall-clock time, the mean time per batch and, on CUDA, the peak
    allocated memory are printed.

    Args:
        model: module that accepts ``pixel_values`` and returns an object with
            a ``last_hidden_state`` attribute.
        dataloader: iterable of ``(images, labels)`` tensor batches.
        device: ``"cpu"`` / ``"cuda"`` string or ``torch.device`` to run on.

    Returns:
        ``(embeddings, labels)``, each concatenated over all batches on the CPU.

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    # Comparing against the string "cpu" misclassifies torch.device("cpu") as a GPU,
    # so decide from the normalised device type instead.
    use_cuda = torch.device(device).type == "cuda"

    model.eval()
    all_embeddings = []
    all_labels = []
    batch_times = []

    start_time = time.perf_counter()
    if use_cuda:
        torch.cuda.reset_peak_memory_stats(device)

    with torch.no_grad():
        for images, labels in tqdm(dataloader):
            batch_start_time = time.perf_counter()
            images = images.to(device)

            outputs = model(pixel_values=images)
            embeddings = outputs.last_hidden_state[:, 0, :]

            # CUDA kernels are launched asynchronously; wait for them so the
            # per-batch time covers the actual computation, not just the launch.
            if use_cuda:
                torch.cuda.synchronize(device)
            batch_times.append(time.perf_counter() - batch_start_time)

            # Move results off the device right away to keep GPU memory flat.
            all_embeddings.append(embeddings.cpu())
            all_labels.append(labels.cpu())

    total_time = time.perf_counter() - start_time
    print(f"Total Embedding Extraction Time: {total_time:.2f} seconds")

    if not batch_times:
        raise ValueError("dataloader yielded no batches; nothing to embed")

    avg_batch_time = sum(batch_times) / len(batch_times)
    print(f"Average Time per Batch: {avg_batch_time:.2f} seconds")

    if use_cuda:
        peak_memory = torch.cuda.max_memory_allocated(device) / (1024 * 1024)  # Convert to MB
        print(f"Peak Memory Usage During Embedding Extraction: {peak_memory:.2f} MB")

    return torch.cat(all_embeddings), torch.cat(all_labels)



In [None]:
# Embed both splits with updated_vision_model (not yet pruned at this point in the notebook).
train_embeddings, train_labels = extract_embeddings(updated_vision_model, train_loader, device)
test_embeddings, test_labels = extract_embeddings(updated_vision_model, test_loader, device)

# Sanity check: first dimension is the number of images in each split.
print(f"Train Embeddings Shape: {train_embeddings.shape}")
print(f"Test Embeddings Shape: {test_embeddings.shape}")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def train_and_evaluate_classifier(train_embeddings, train_labels, test_embeddings, test_labels):
    """Fit a logistic-regression classifier on embeddings and report test accuracy.

    Args:
        train_embeddings, train_labels: training features and targets; each must
            expose ``.numpy()`` (e.g. CPU torch tensors).
        test_embeddings, test_labels: held-out features and targets, same types.

    Returns:
        The test accuracy as computed by ``accuracy_score``. It is also printed,
        as before; returning it lets callers compare runs (e.g. baseline vs.
        pruned) in code.
    """
    # scikit-learn works on NumPy arrays, so convert the inputs first.
    train_embeddings_np = train_embeddings.numpy()
    train_labels_np = train_labels.numpy()
    test_embeddings_np = test_embeddings.numpy()
    test_labels_np = test_labels.numpy()

    # max_iter is raised above the library default to give the solver more
    # iterations on high-dimensional embeddings.
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(train_embeddings_np, train_labels_np)

    predictions = classifier.predict(test_embeddings_np)
    accuracy = accuracy_score(test_labels_np, predictions)

    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy


In [None]:
# Accuracy of the classifier trained on the embeddings extracted above.
train_and_evaluate_classifier(train_embeddings, train_labels, test_embeddings, test_labels)

### Step 6: Pruning

#### Linear Layer Pruning

In [None]:
import torch.nn.utils.prune as prune

def apply_pruning(model, amount=0.3):
    """Return a copy of ``model`` whose ``nn.Linear`` weights are L1-pruned.

    A fresh instance of the model's class is built from ``model.config`` on the
    same device, the source weights are loaded into it, and every ``nn.Linear``
    module gets unstructured L1 pruning of ``amount`` applied to ``weight``.
    The pruning re-parametrisation is then removed, so the zeros are stored
    directly in ``weight``. The input ``model`` is not modified.
    """
    target_device = next(model.parameters()).device
    pruned_model = type(model)(model.config).to(target_device)
    pruned_model.load_state_dict(model.state_dict())

    # Collect the target layers once, then run both passes over the same list.
    linear_layers = [
        module for module in pruned_model.modules()
        if isinstance(module, torch.nn.Linear)
    ]

    for layer in linear_layers:
        prune.l1_unstructured(layer, name='weight', amount=amount)

    for layer in linear_layers:
        prune.remove(layer, 'weight')

    return pruned_model

pruned_model = apply_pruning(updated_vision_model, amount=0.45)

torch.save(pruned_model.state_dict(), "pruned_vision_model.pth")

# Report the size with the same divisor (1024 * 1024) as the "Entire Model Size"
# cell so the two figures are directly comparable; dividing by 1e6 here made
# this file look about 4.9% larger purely because of the unit.
# NOTE(review): the zeroed weights are still saved as dense tensors, so little or
# no reduction in file size should be expected from unstructured pruning — confirm.
pruned_size_mb = os.path.getsize('pruned_vision_model.pth') / (1024 * 1024)
print(f"Pruned model saved. Size: {pruned_size_mb:.2f} MB")


In [None]:
# Repeat embedding extraction and classifier evaluation with pruned_model.
pruned_train_embeddings, pruned_train_labels = extract_embeddings(
    pruned_model, train_loader, device)

pruned_test_embeddings, pruned_test_labels = extract_embeddings(
    pruned_model, test_loader, device)

# The classifier is re-fit on the pruned model's embeddings.
train_and_evaluate_classifier(pruned_train_embeddings, pruned_train_labels,
                              pruned_test_embeddings, pruned_test_labels)


In [None]:
def measure_inference_time(model, dataloader, device):
    """Time the forward pass of ``model`` over every batch in ``dataloader``.

    Only the call ``model(pixel_values=images)`` is timed; moving the batch to
    ``device`` happens before the clock starts.

    Args:
        model: module accepting ``pixel_values``.
        dataloader: iterable of ``(images, labels)`` batches; it does not need
            to support ``len()``.
        device: ``"cpu"`` / ``"cuda"`` string or ``torch.device`` to run on.

    Returns:
        ``(total_time, avg_time_per_batch)`` in seconds; ``(0.0, 0.0)`` when the
        dataloader yields no batches.
    """
    use_cuda = torch.device(device).type == "cuda"

    model.eval()
    total_time = 0.0
    total_batches = 0

    with torch.no_grad():
        for images, _ in dataloader:
            images = images.to(device)

            # CUDA work is asynchronous: drain the pending copy before starting
            # the clock and wait for the kernels before stopping it, otherwise
            # only the launch overhead is measured.
            if use_cuda:
                torch.cuda.synchronize(device)
            start_time = time.perf_counter()
            _ = model(pixel_values=images)
            if use_cuda:
                torch.cuda.synchronize(device)
            end_time = time.perf_counter()

            total_time += (end_time - start_time)
            total_batches += 1

    # Count batches while iterating so an empty loader cannot divide by zero.
    avg_time_per_batch = total_time / total_batches if total_batches else 0.0
    print(f"Total Inference Time: {total_time:.2f} seconds")
    print(f"Average Time per Batch: {avg_time_per_batch:.4f} seconds")

    return total_time, avg_time_per_batch


In [None]:
# Time updated_vision_model and the unstructured-pruned copy on the same test batches.
measure_inference_time(updated_vision_model, test_loader, device)
measure_inference_time(pruned_model, test_loader, device)

In [None]:
import torch.nn.utils.prune as prune

def structured_prune_model(vision_model, prune_amount=0.3):
    """Apply L2-norm structured pruning to every encoder layer, in place.

    For each layer in ``vision_model.encoder.layers`` the ``weight`` of the four
    attention projections (k, q, v, out) and of the two MLP linears (fc1, fc2)
    is pruned along ``dim=0`` by ``prune_amount``. One progress line is printed
    per layer. The pruning re-parametrisation is left attached to the modules.
    """
    for i, layer in enumerate(vision_model.encoder.layers):
        attention = layer.self_attn
        feed_forward = layer.mlp

        # Same order as the explicit calls: attention projections first, then the MLP.
        targets = (
            attention.k_proj,
            attention.q_proj,
            attention.v_proj,
            attention.out_proj,
            feed_forward.fc1,
            feed_forward.fc2,
        )
        for linear in targets:
            prune.ln_structured(linear, name="weight", amount=prune_amount, n=2, dim=0)

        print(f"Layer {i}: Pruned {prune_amount * 100}% of output channels")

# Prune the encoder layers of updated_vision_model by prune_amount.
# NOTE(review): this modifies updated_vision_model itself rather than a copy, so
# the unpruned model is no longer available to later cells — confirm that is intended.
prune_amount = 0.3
structured_prune_model(updated_vision_model.vision_model, prune_amount)


#### Self-Attention Layer Pruning

In [None]:
def remove_pruning_masks(vision_model):
    """Make the pruning of every encoder layer permanent.

    For each layer in ``vision_model.encoder.layers`` the pruning
    re-parametrisation on ``weight`` is removed from the four attention
    projections (k, q, v, out) and the two MLP linears (fc1, fc2).

    Modules that carry no pruning on ``weight`` are skipped, so the function
    can be called again (e.g. when the cell is re-run) without raising.
    """
    for layer in vision_model.encoder.layers:
        targets = (
            layer.self_attn.k_proj,
            layer.self_attn.q_proj,
            layer.self_attn.v_proj,
            layer.self_attn.out_proj,
            layer.mlp.fc1,
            layer.mlp.fc2,
        )
        for linear in targets:
            # torch.nn.utils.prune keeps the unpruned tensor as `weight_orig` while a
            # mask is attached; prune.remove raises ValueError when nothing is attached.
            if hasattr(linear, "weight_orig"):
                prune.remove(linear, "weight")
    print("Pruning masks removed. Model finalized.")

# Finalise the pruning applied to updated_vision_model's encoder layers.
remove_pruning_masks(updated_vision_model.vision_model)


In [None]:
# Re-extract embeddings from updated_vision_model after the pruning cells above.
# Note: this overwrites the train_/test_ embedding variables computed earlier.
train_embeddings, train_labels = extract_embeddings(updated_vision_model, train_loader, device)
test_embeddings, test_labels = extract_embeddings(updated_vision_model, test_loader, device)

In [None]:
# Classifier accuracy on the embeddings re-extracted in the previous cell.
train_and_evaluate_classifier(train_embeddings, train_labels, test_embeddings, test_labels)

In [None]:
# Forward-pass timing of updated_vision_model in its current (pruned) state.
measure_inference_time(updated_vision_model, test_loader, device)