In [None]:
from google.colab import files
# Ask for a file upload through Colab; the result is indexed by file name
# ('clothing-dataset.zip') when the archive is unpacked.
uploaded = files.upload()

In [None]:
import zipfile
import io
# Unpack the uploaded archive into the current working directory.
# The context manager closes the archive handle even if extraction fails.
with zipfile.ZipFile(io.BytesIO(uploaded['clothing-dataset.zip']), "r") as zf:
    zf.extractall()

In [None]:
import os
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.image  as mpimg
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing import image
import random

In [None]:
# Read the image metadata table, then key it by the `image` column.
data = pd.read_csv('./clothing-dataset/images.csv')
data = data.set_index('image')
data.tail()

In [None]:
# Count images per label and keep the 11 most frequent labels.
label_counts = data.groupby('label').size().reset_index().sort_values(0, ascending=False)
top_labels = pd.DataFrame(label_counts[:11]['label'])

# Exclude the 'Not sure' label. `.copy()` makes the filtered frame independent,
# so adding a column below does not raise SettingWithCopyWarning.
top_labels = top_labels[top_labels.label != 'Not sure'].copy()

# Numeric class ids follow the alphabetical order of the remaining label names.
top_labels_list = sorted(top_labels['label'])
label_to_num = {label: num for num, label in enumerate(top_labels_list)}
top_labels['label_num'] = top_labels['label'].map(label_to_num)
top_labels

In [None]:
# Keep only the rows whose label appears in `top_labels` (default merge on the
# shared columns), then store the original text label as `label_str` and
# overwrite `label` with the numeric class id.
data_filtered = (
    data.reset_index()
    .merge(top_labels)
    .set_index('image')
    .assign(
        label_str=lambda frame: frame['label'],
        label=lambda frame: frame['label_num'],
    )
)

In [None]:
# Load every image that has a label in `data_filtered` as a 32x32 pixel list,
# paired with its numeric label and its id.
images_dir = './clothing-dataset/images'
labeled_index = set(data_filtered.index)

labeled_data = []
for item in os.listdir(images_dir):
    # Labels are looked up by file name without its extension; splitext also
    # copes with extensions that are not exactly four characters long.
    image_id = os.path.splitext(item)[0]
    if image_id not in labeled_index:
        # No label for this file: skip it before paying the decoding cost.
        continue

    img = image.load_img(os.path.join(images_dir, item), target_size=(32, 32))
    labeled_data.append({
        'img': image.img_to_array(img).tolist(),
        'label': data_filtered.loc[image_id, 'label'],
        'index': image_id,
    })

In [None]:
# Shuffle the image ids with a fixed seed so the train/val/test split is the
# same after every kernel restart. A dedicated Random instance is used so the
# global `random` state is left untouched.
ind = data_filtered.index.tolist()
random.Random(42).shuffle(ind)

In [None]:
# Split fractions: 60% train, 20% validation; whatever is left becomes test.
p_train = 0.6
p_val = 0.2

n = len(data_filtered)
n_train = int(p_train * n)
n_val = int(p_val * n)

# Consecutive slices of the id list.
val_end = n_train + n_val
train_ind = ind[:n_train]
val_ind = ind[n_train:val_end]
test_ind = ind[val_end:]

In [None]:
# Route every loaded image into its split.
# Set membership is O(1); testing against the id lists made this loop quadratic.
train_id_set = set(train_ind)
val_id_set = set(val_ind)
test_id_set = set(test_ind)

train_img, val_img, test_img = [], [], []
train_label, val_label, test_label = [], [], []
test_ids = []  # ids of the test images, in the same order as test_img

for img in labeled_data:
    if img['index'] in train_id_set:
        train_img.append(img['img'])
        train_label.append(img['label'])
    elif img['index'] in val_id_set:
        val_img.append(img['img'])
        val_label.append(img['label'])
    elif img['index'] in test_id_set:
        test_img.append(img['img'])
        test_label.append(img['label'])
        test_ids.append(img['index'])

In [None]:
!pip install transformers --quiet

In [None]:
from datasets import Dataset

In [None]:
# Wrap each split's pixel lists and labels in a Dataset.
train_ds, val_ds, test_ds = (
    Dataset.from_dict({'img': split_images, 'label': split_labels})
    for split_images, split_labels in (
        (train_img, train_label),
        (val_img, val_label),
        (test_img, test_label),
    )
)

In [None]:
from transformers import ViTFeatureExtractor

# Feature extractor loaded from the same checkpoint name that the model's ViT backbone uses.
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

In [None]:
def preprocess_images(examples):
    """Add a `pixel_values` column to a batch of examples.

    Every entry of `examples['img']` is converted to a uint8 array and its last
    axis is moved to the front; the resulting list is passed through
    `feature_extractor`. The batch is updated in place and returned.
    """
    channel_first = [
        np.moveaxis(np.array(pixels, dtype=np.uint8), source=-1, destination=0)
        for pixels in examples['img']
    ]
    examples['pixel_values'] = feature_extractor(images=channel_first)['pixel_values']
    return examples

In [None]:
from datasets import Features, ClassLabel, Array3D

# Both `img` and `pixel_values` are 3-D, so the schema is declared explicitly.
features = Features({
    'label': ClassLabel(names=top_labels_list),
    'img': Array3D(dtype="int64", shape=(3, 32, 32)),
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
})

# Apply the same batched preprocessing to each split, in train/val/test order.
preprocessed_train_ds, preprocessed_val_ds, preprocessed_test_ds = (
    split.map(preprocess_images, batched=True, features=features)
    for split in (train_ds, val_ds, test_ds)
)

In [None]:
from transformers import ViTModel
from transformers.modeling_outputs import SequenceClassifierOutput
import torch.nn as nn

class ViTForImageClassification(nn.Module):
    """ViT backbone followed by dropout and a single linear classification layer.

    Args:
        num_labels: number of output classes.
        vector_length: not used by this variant; kept so the constructor
            signature stays unchanged.
    """

    def __init__(self, num_labels=10, vector_length=1000):
        # Zero-argument super() does not depend on the module-level class name,
        # which is rebound by a later cell in this notebook.
        super().__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.dropout = nn.Dropout(0.1)
        self.last_layer = nn.Linear(self.vit.config.hidden_size, num_labels)
        self.num_labels = num_labels

    def forward(self, pixel_values, labels=None):
        """Classify a batch of images.

        Args:
            pixel_values: image batch passed straight to the ViT backbone.
            labels: optional target class ids. When given, a cross-entropy
                loss is computed; when omitted, `loss` is None.

        Returns:
            SequenceClassifierOutput carrying loss, logits and the backbone's
            hidden_states / attentions.
        """
        outputs = self.vit(pixel_values=pixel_values)
        # Only the first token of the final hidden state feeds the classifier.
        output = self.dropout(outputs.last_hidden_state[:, 0])
        logits = self.last_layer(output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [None]:
from transformers import TrainingArguments, Trainer

# Metric used to select the best checkpoint at the end of training.
metric_name = "accuracy"

args = TrainingArguments(
    "test-clothing",
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=10,
    per_device_eval_batch_size=4,
    # evaluate and checkpoint once per epoch, then restore the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    logging_dir='logs',
)

In [None]:
from transformers import default_data_collator

# Collator handed to the Trainer below; the library default is used unchanged.
data_collator = default_data_collator

In [None]:
# Build the classifier with its default constructor arguments (num_labels=10).
model = ViTForImageClassification()

In [None]:
from datasets import load_metric
import numpy as np

metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Score a (predictions, labels) pair with the loaded accuracy metric.

    The index of the largest value along axis 1 of the predictions is taken as
    the predicted class.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
# Wire the model, training configuration, datasets, collator and metric together.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=preprocessed_train_ds,
    eval_dataset=preprocessed_val_ds,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the settings held in `args`.
trainer.train()

In [None]:
class SaveOutput:
    """Forward-hook callable that records what a module receives on each call."""

    def __init__(self):
        # Start with an empty record list.
        self.clear()

    def __call__(self, module, module_in, module_out):
        # Despite the class name, it is the hook's *input* argument that is kept.
        self.outputs += [module_in]

    def clear(self):
        """Drop everything recorded so far by rebinding `outputs` to a new list."""
        self.outputs = []
        
save_output = SaveOutput()

# Hook the classification head directly. Matching modules by their printed
# repr ('Linear(in_features=768, out_features=10, bias=True)') silently
# registered nothing whenever the hidden size or the number of labels differed.
hook_handles = [model.last_layer.register_forward_hook(save_output)]

# Nothing has been recorded yet at this point.
len(save_output.outputs)

In [None]:
# Predict on the test split; the forward hook registered above records the
# inputs of the hooked layer for every batch as a side effect.
outputs = trainer.predict(preprocessed_test_ds)

In [None]:
# Show the metrics returned by the prediction run on the test split.
print(outputs.metrics)

In [None]:
# Map each test image id to the vector that was fed into the hooked layer.
outputs_vectors = save_output.outputs
data_img_ids = test_ids
batch_size = 4  # no longer used for indexing; retained in case later cells reference it

# Every recorded entry is the hook's input tuple; element 0 is the batch tensor.
# Flattening batch by batch keeps ids and vectors aligned for any evaluation
# batch size instead of relying on a hard-coded value.
per_image_vectors = [vector for batch in outputs_vectors for vector in batch[0]]
assert len(per_image_vectors) >= len(data_img_ids), "fewer recorded vectors than test ids"

vectors_first_in = {
    img_id: vector.tolist()
    for img_id, vector in zip(data_img_ids, per_image_vectors)
}

In [None]:
import json
# Persist the id -> vector mapping as JSON. The dictionary built above is named
# `vectors_first_in`, and the file name matches the one requested for download
# in the following cell.
with open('vectors_test.json', 'w') as fp:
    json.dump(vectors_first_in, fp)

In [None]:
# Download the saved vectors file through Colab.
files.download('vectors_test.json') 

In [None]:
import torch
# Save the model's state dict to disk, then download that file through Colab.
model_path = '/content/model'
torch.save(model.state_dict(), model_path)
files.download(model_path)

In [None]:
class ViTForImageClassification(nn.Module):
    """ViT backbone with an intermediate linear layer before the output layer.

    This definition reuses the name of the classifier defined earlier in the
    notebook, so it shadows that class for every cell that runs afterwards.

    Args:
        num_labels: number of output classes.
        vector_length: width of the intermediate linear layer.
    """

    def __init__(self, num_labels=10, vector_length=1000):
        super().__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.dropout = nn.Dropout(0.1)
        self.pre_last_layer = nn.Linear(self.vit.config.hidden_size, vector_length)
        self.last_layer = nn.Linear(vector_length, num_labels)
        self.num_labels = num_labels

    def forward(self, pixel_values, labels=None):
        """Classify a batch of images.

        Args:
            pixel_values: image batch passed straight to the ViT backbone.
            labels: optional target class ids. When given, a cross-entropy
                loss is computed; when omitted, `loss` is None.

        Returns:
            SequenceClassifierOutput carrying loss, logits and the backbone's
            hidden_states / attentions.
        """
        outputs = self.vit(pixel_values=pixel_values)
        # Only the first token of the final hidden state feeds the head.
        output = self.dropout(outputs.last_hidden_state[:, 0])
        pre_last_output = self.pre_last_layer(output)
        logits = self.last_layer(pre_last_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions)

In [None]:
# Fresh instance built from the class defined directly above; its parameters
# are compared against `model` in the cells below.
model_pretrained = ViTForImageClassification()

In [None]:
from scipy import spatial

In [None]:
def changed_weight(layer):
    """Report whether parameter `layer` differs between `model` and `model_pretrained`.

    Args:
        layer: parameter name as yielded by `named_parameters()`.

    Returns:
        'no change' when both models hold element-wise identical tensors for
        `layer`; 'change' otherwise, including when the shapes differ.

    Raises:
        KeyError: if either model has no parameter called `layer`.
    """
    tuned_params = dict(model.named_parameters())
    reference_params = dict(model_pretrained.named_parameters())

    missing_from = [
        owner
        for owner, params in (('model', tuned_params), ('model_pretrained', reference_params))
        if layer not in params
    ]
    if missing_from:
        raise KeyError(f"parameter {layer!r} not found in: {', '.join(missing_from)}")

    # Compare on the CPU so the two models may live on different devices.
    tuned = tuned_params[layer].detach().cpu()
    reference = reference_params[layer].detach().cpu()

    # Exact element-wise comparison. The cosine distance used previously only
    # accepts 1-D vectors, so it raised on weight matrices and on parameters
    # whose shapes differ; Tensor.equal returns False on a shape mismatch.
    return 'no change' if tuned.equal(reference) else 'change'

In [None]:
# Print one line per parameter of `model` with the verdict from changed_weight.
for name, param in model.named_parameters():
    status = changed_weight(name)
    print(f"{name}: {status}")