In [1]:
from transformers import CLIPProcessor, CLIPModel

In [2]:
from sklearn.model_selection import train_test_split
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from torch import nn
import torch
import torch.optim as optim
import scipy.io
from torch.utils.data import Dataset,DataLoader
from sklearn.metrics import accuracy_score, f1_score
from tqdm.notebook import tqdm as tq

In [3]:
# Load the pretrained CLIP weights for the "openai/clip-vit-base-patch32" checkpoint;
# this `model` object is reused by the zero-shot cells further down.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

In [4]:
# Matching processor for the same checkpoint; it prepares both the text prompts and the images.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

对一张网络上的图片进行分类

In [5]:
from PIL import Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Use a timeout so a stalled connection cannot hang the notebook, and raise on HTTP
# errors here instead of letting PIL fail later while decoding an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
# CLIP is given a list of candidate text descriptions and scores each one against
# the image; the best-matching description is the predicted class.
inputs = processor(
    text=["a photo of a cat", "a photo of a dog"], images=[image], return_tensors="pt", padding=True
)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # image-text similarity scores
probs = logits_per_image.softmax(dim=1)  # softmax over the candidate texts gives label probabilities

In [6]:
# Show the per-prompt probabilities computed in the previous cell.
probs

tensor([[0.9949, 0.0051]], grad_fn=<SoftmaxBackward>)

构建dataset

In [5]:
class StanfordCars(Dataset):
  """Stanford Cars images preprocessed for CLIP, plus one text prompt per class.

  Each item is ``(img, label)`` where ``img`` is the CLIP processor output for one
  image (its ``pixel_values`` has the leading batch dimension removed) and ``label``
  is the zero-based class index. ``processLabels()`` must be called once before
  indexing so that the tokenized class prompts are available in ``textInput``.
  """

  def __init__(self,metaPath,imgDir,labelMeta,model_name="openai/clip-vit-base-patch32",cuda=False):
    """
    metaPath: path to the annotation .mat file

    imgDir: directory where the images are stored

    labelMeta: path to the .mat file holding the class names

    model_name: name of the CLIP checkpoint; needed because the processor of that
    particular model is used to preprocess inputs.

    cuda: if True, tensors produced by this dataset are moved to the GPU

    Attributes set here:

    text: list of prompts like "This is photo of {class name} car" (filled by processLabels)

    textInput: tokenized prompts to pass to the CLIP model (filled by processLabels)
    """
    super(StanfordCars,self).__init__()
    self.metaPath = metaPath
    self.labelMeta = labelMeta
    self.path = imgDir
    train_data = scipy.io.loadmat(self.metaPath)
    class_data = scipy.io.loadmat(self.labelMeta)
    # class names
    self.classes = class_data['class_names'][0]
    # annotation rows, i.e. file names and their labels
    self.data = train_data['annotations'][0]
    # used to preprocess both text and images
    self.processor = CLIPProcessor.from_pretrained(model_name)
    self.text = []
    self.textInput = None
    self.cuda = cuda

  def processLabels(self):
    """
    Build one text prompt per class and tokenize them all once.

    The same prompts are used for every image, so they are processed a single time
    and later merged with each batch's pixel_values before calling the CLIP model.
    Safe to call repeatedly: the prompt list is rebuilt rather than appended to.
    """
    # Rebuild from scratch so re-running this (e.g. re-executing a notebook cell)
    # does not duplicate every prompt and silently double the number of "classes".
    self.text = [f'This is photo of {i[0]} car' for i in self.classes]
    # tokenize the prompts
    self.textInput = self.processor(text=self.text,return_tensors="pt", padding=True)

    if(self.cuda):
      # snapshot the keys because the mapping is updated while looping
      for k in list(self.textInput.keys()):
        self.textInput[k] = self.textInput[k].cuda()

  def __len__(self):
    """Number of annotation rows currently held in ``self.data``."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return ``(img, label)`` for annotation row ``idx``."""
    # guard against indexing before processLabels() has been run
    assert self.textInput is not None,'run the processLabels method'

    # The row has six fields; only the last two (label, file name) are used here.
    _,_,_,_,label,fname = self.data[idx]

    label = label.item() - 1 # labels start at 1 in the metadata file
    pth = self.path+'/'+fname.item()
    # convert() returns a new in-memory image, so the file handle can be closed right away
    with Image.open(pth) as raw:
      img = raw.convert('RGB')
    # apply CLIP's image preprocessing
    img = self.processor(images=img,return_tensors="pt")
    # the processor returns a batch of one; drop that leading batch dimension
    img['pixel_values'] = img['pixel_values'].squeeze(0)

    if(self.cuda):
      img['pixel_values'] = img['pixel_values'].cuda()

    return (img,label)

In [6]:
# Build the full dataset from the training annotations; cuda=True makes the dataset
# return GPU tensors, so this cell requires a CUDA device.
dataset = StanfordCars(metaPath='content/devkit/cars_train_annos.mat',imgDir='content/cars_train',labelMeta='content/devkit/cars_meta.mat',cuda=True)

In [7]:
# Tokenize the class prompts once; __getitem__ asserts that this has been done.
dataset.processLabels()

In [8]:
def train_eval_split(dataset,per,seed):
  """
  Split one StanfordCars dataset into a train dataset and an eval dataset.

  dataset: full dataset object; its ``data`` is replaced in place by the train part

  per: fraction (or count) of rows that goes to the eval split, passed to
  train_test_split as ``test_size``

  seed: random seed for the split

  Returns ``(dataset, evalDataset)``: the same input object now holding only the
  train rows, and a new dataset holding the eval rows.
  """
  train_data,test_data = train_test_split(dataset.data,test_size = per,random_state=seed)
  # Build the eval dataset from the same files and device setting as the input
  # dataset rather than hard-coded paths, so both halves always match.
  evalDataset = StanfordCars(metaPath=dataset.metaPath,imgDir=dataset.path,labelMeta=dataset.labelMeta,cuda=dataset.cuda)
  evalDataset.processLabels()
  evalDataset.data = test_data
  # Mutate the input only after the eval dataset has been built successfully.
  dataset.data = train_data
  return (dataset,evalDataset)

In [9]:
# Hold out 5% of the rows for evaluation, with a fixed seed (3) for the split.
# Note: train_eval_split replaces dataset.data in place, so re-running this cell splits the already-reduced data again.
trainData,evalData = train_eval_split(dataset,0.05,3)

In [10]:
# Number of rows left in the training split.
len(trainData)

7736

In [11]:
# Training batches are reshuffled every epoch.
trainLoader = DataLoader(trainData,batch_size=64,shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order keeps runs comparable.
evalLoader = DataLoader(evalData,batch_size=8,shuffle=False)

In [14]:
predictions = []
truth = []
# `model` is the pretrained CLIP loaded near the top of the notebook
model.cuda()
model.eval()
# Inference only: disable autograd so no graph/activations are kept for each batch.
with torch.no_grad():
  for inputs,label in tq(evalLoader):
    # add the attention mask and input_ids of the class prompts to the image pixel values
    for k in evalData.textInput.keys():
      inputs[k] = evalData.textInput[k]
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)
    # index of the best-matching prompt is the predicted class
    preds = torch.argmax(probs, dim=1)
    preds = preds.cpu()
    predictions.extend(preds.tolist())
    truth.extend(label.tolist())

  0%|          | 0/51 [00:00<?, ?it/s]

Zero-shot 设置下，CLIP 在 Stanford Cars 数据集上的分类效果

In [16]:
# Accuracy of the zero-shot predictions collected above.
acc = accuracy_score(truth,predictions)
print(acc)

0.5857843137254902


In [19]:
# Weighted F1 score of the zero-shot predictions.
score = f1_score(truth,predictions,average='weighted')
print(score)

0.5627612410558148


在冻结的 CLIP 之上训练分类头（类似 linear probe）后，CLIP 在 Stanford Cars 数据集上的分类效果

In [20]:
class FineTuneCLIP(nn.Module):
  """CLIP followed by a small MLP classification head.

  The head consumes CLIP's ``logits_per_image``, so the text prompts passed in must
  number exactly ``out_shape`` (the first Linear layer expects ``out_shape`` inputs),
  and it produces ``out_shape`` class scores per image.

  out_shape: number of text prompts / output classes

  model_name: CLIP checkpoint to load

  freeze: if True, CLIP's parameters are frozen and only the head is trainable
  """
  def __init__(self,out_shape=196,model_name="openai/clip-vit-base-patch32",freeze=True):
    super(FineTuneCLIP,self).__init__()
    self.CLIP = CLIPModel.from_pretrained(model_name)
    # Freeze the CLIP backbone so only the layers below receive gradients
    if(freeze):
      for parameter in self.CLIP.parameters():
        parameter.requires_grad=False
    # Extra layers stacked on top of the CLIP logits
    self.fc1 = nn.Sequential(
        nn.Linear(out_shape,out_shape*5),
        nn.BatchNorm1d(out_shape*5),
        nn.ReLU(),
        nn.Dropout(0.25)
    )

    self.fc2 =  nn.Sequential(
        nn.Linear(out_shape*5,out_shape*5),
        nn.BatchNorm1d(out_shape*5),
        nn.ReLU(),
        nn.Linear(out_shape*5,out_shape*5),
        nn.BatchNorm1d(out_shape*5),
        nn.ReLU(),
        nn.Dropout(0.3)
    )


    self.fc3 = nn.Sequential(
        nn.Linear(out_shape*5,out_shape),
        nn.BatchNorm1d(out_shape),
    )

  def forward(self,x,y=None):
    """
    x: dict of CLIP inputs (input_ids, attention_mask, pixel_values), unpacked into CLIP

    y: unused; optional so the model can be called as ``model(inputs)``, which is how
    every caller in this notebook invokes it (a required ``y`` raised TypeError)

    Returns the class scores produced by the head, one row per image.
    """
    out = self.CLIP(**x)
    # image-to-prompt similarity scores are the features fed to the head
    out = out.logits_per_image
    out = self.fc1(out)
    out = self.fc2(out)
    out = self.fc3(out)
    return out

In [21]:
def train(model,train_loader,eval_loader,epochs,criterion,optimizer):
  """
  Train the model and evaluate it after every epoch.

  model: model to train; it is moved to the GPU and called as ``model(inputs)``

  train_loader: DataLoader of training data; its dataset must expose ``textInput``
  (the tokenized class prompts)

  eval_loader: DataLoader of validation data

  epochs: number of epochs

  criterion: loss function

  optimizer: optimizer for learning

  Returns a dict with:
    "accuracy": validation accuracy of the last epoch (None if epochs == 0)
    "train_loss": per-batch training losses over all epochs
    "train_accuracy": per-batch training accuracies over all epochs
    "val_loss": per-batch validation losses of the last epoch ([] if epochs == 0)
  """
  model = model.cuda()
  loss_list=[]
  accuracy_list=[]
  size = len(train_loader)
  # Defaults so the return value is well-defined even when no epoch runs.
  accuracy = None
  val_loss_list = []
  # The tokenized class prompts ('input_ids', 'attention_mask') are the same for
  # every batch. Take them from the loader's own dataset rather than a notebook
  # global so the function works with whatever loader it is given.
  text_inputs = train_loader.dataset.textInput
  for epoch in range(epochs):
    model.train()
    # tqdm progress bar over the training batches
    train_tq = tq(train_loader)
    for steps,(inputs,labels) in enumerate(train_tq,start=1):
      # `inputs` arrives holding only 'pixel_values'; merge in the prompt tensors so
      # the dict has 'input_ids', 'attention_mask' and 'pixel_values' for CLIP.
      for k in text_inputs.keys():
        inputs[k] = text_inputs[k]
      optimizer.zero_grad()
      outputs = model(inputs)
      # predictions
      preds =  torch.argmax(outputs, dim=1)
      # loss (labels are moved to wherever the model produced its outputs)
      loss = criterion(outputs, labels.to(outputs.device))
      # accuracy of this batch
      acc = torch.sum(preds.cpu() == labels.cpu()).item()
      acc = acc/len(preds)
      accuracy_list.append(acc)
      loss_list.append(loss.item())
      # backprop
      loss.backward()
      optimizer.step()
      # update the progress bar text
      train_tq.set_description(f'TRAIN :: steps: {steps}/{size} accuray : {acc*100:.3f} loss: {loss.item():.4f} preds:{preds[0].item()} label:{labels[0].item()}')
    # validation accuracy after this epoch (overwrites the previous epoch's values)
    accuracy,val_loss_list = evaluate(model,eval_loader,criterion)


  return {
      "accuracy":accuracy,
      "train_loss":loss_list,
      "train_accuracy":accuracy_list,
      "val_loss":val_loss_list,
  }

In [22]:
def evaluate(model,eval_loader,criterion):
  """
  Compute validation accuracy and losses for one pass over ``eval_loader``.

  model: model to evaluate; called as ``model(inputs)`` and switched to eval mode

  eval_loader: DataLoader of validation data; its dataset must expose ``textInput``

  criterion: loss function

  Returns ``(accuracy, val_loss_list)`` where accuracy is the mean of the per-batch
  accuracies (NaN if the loader yields no batches) and val_loss_list holds the
  per-batch losses.
  """
  eval_size = len(eval_loader)
  val_acc_list = []
  val_loss_list = []
  # Use the prompts of the loader's own dataset instead of a notebook global.
  text_inputs = eval_loader.dataset.textInput
  eval_tq = tq(eval_loader)
  model.eval()
  # No gradients are needed for validation; skip building the autograd graph.
  with torch.no_grad():
    for esteps,(inputs,labels) in enumerate(eval_tq,start=1):
      # add the prompt tensors to the dict that holds the image pixel values
      for k in text_inputs.keys():
        inputs[k] = text_inputs[k]

      outputs = model(inputs)
      preds =  torch.argmax(outputs, dim=1)
      val_loss = criterion(outputs, labels.to(outputs.device))
      val_acc = torch.sum(preds.cpu() == labels.cpu()).item()
      val_acc = val_acc/len(preds)
      val_loss_list.append(val_loss.item())
      val_acc_list.append(val_acc)

      eval_tq.set_description(f'EVAL :=: steps: {esteps}/{eval_size} accuray : {val_acc*100:.3f} loss: {val_loss.item():.4f}')

  # An empty loader has no defined accuracy; report NaN instead of dividing by zero.
  if not val_acc_list:
    return (float('nan'),val_loss_list)
  accuracy = sum(val_acc_list)/len(val_acc_list)
  return (accuracy,val_loss_list)

In [23]:
# Instantiate with the defaults: 196 outputs, the clip-vit-base-patch32 checkpoint, CLIP frozen.
fineCLIP = FineTuneCLIP()

In [24]:
# Kept for reference; which parameters are trainable is actually decided by
# FineTuneCLIP(freeze=...), and the loop below simply reads requires_grad.
feature_extract = True
print("Params to learn:")
# Collect the trainable parameters unconditionally so `params_to_update` always
# exists for the optimizer cell (it was undefined when feature_extract was False).
params_to_update = []
for name,param in fineCLIP.named_parameters():
    if param.requires_grad:
        params_to_update.append(param)
        print("\t",name)

Params to learn:
	 fc1.0.weight
	 fc1.0.bias
	 fc1.1.weight
	 fc1.1.bias
	 fc2.0.weight
	 fc2.0.bias
	 fc2.1.weight
	 fc2.1.bias
	 fc2.3.weight
	 fc2.3.bias
	 fc2.4.weight
	 fc2.4.bias
	 fc3.0.weight
	 fc3.0.bias
	 fc3.1.weight
	 fc3.1.bias


In [25]:
# Classification loss applied to the head's output scores.
criterion = nn.CrossEntropyLoss()
# Adam over the trainable parameters gathered in `params_to_update`.
optimizer = optim.Adam(params_to_update, lr=2e-4)

In [26]:
# Keyword arguments for train(), bundled so the call cell stays short.
kwargs = dict(
    model=fineCLIP,
    train_loader=trainLoader,
    eval_loader=evalLoader,
    epochs=6,
    criterion=criterion,
    optimizer=optimizer,
)

In [27]:
# Run training; `res` holds the last validation accuracy plus the loss/accuracy histories.
res=train(**kwargs)

  0%|          | 0/115 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

In [30]:
predictions = []
truth = []
fineCLIP.eval()
# Inference only: disable autograd so no graph is built through the trainable head.
with torch.no_grad():
  for inputs,label in tq(evalLoader):
    # add the attention mask and input_ids of the class prompts to the image pixel
    # values (taken from the eval dataset, as in the zero-shot loop)
    for k in evalData.textInput.keys():
      inputs[k] = evalData.textInput[k]
    outputs = fineCLIP(inputs)
    probs = outputs.softmax(dim=1)
    preds = torch.argmax(probs, dim=1)
    preds = preds.cpu()
    predictions.extend(preds.tolist())
    truth.extend(label.tolist())

  0%|          | 0/49 [00:00<?, ?it/s]

In [31]:
# Accuracy of the fine-tuned model's predictions collected above.
acc = accuracy_score(truth,predictions)
print(acc)

0.7312661498708011


In [32]:
# Weighted F1 score of the fine-tuned model's predictions.
score = f1_score(truth,predictions,average='weighted')
print(score)

0.7179183865230375


In [17]:
# Load the class-name metadata directly to inspect it.
class_data = scipy.io.loadmat('content/devkit/cars_meta.mat')

In [18]:
# Display the class names; each entry is a one-element array holding the name string.
class_data['class_names'][0]

array([array(['AM General Hummer SUV 2000'], dtype='<U26'),
       array(['Acura RL Sedan 2012'], dtype='<U19'),
       array(['Acura TL Sedan 2012'], dtype='<U19'),
       array(['Acura TL Type-S 2008'], dtype='<U20'),
       array(['Acura TSX Sedan 2012'], dtype='<U20'),
       array(['Acura Integra Type R 2001'], dtype='<U25'),
       array(['Acura ZDX Hatchback 2012'], dtype='<U24'),
       array(['Aston Martin V8 Vantage Convertible 2012'], dtype='<U40'),
       array(['Aston Martin V8 Vantage Coupe 2012'], dtype='<U34'),
       array(['Aston Martin Virage Convertible 2012'], dtype='<U36'),
       array(['Aston Martin Virage Coupe 2012'], dtype='<U30'),
       array(['Audi RS 4 Convertible 2008'], dtype='<U26'),
       array(['Audi A5 Coupe 2012'], dtype='<U18'),
       array(['Audi TTS Coupe 2012'], dtype='<U19'),
       array(['Audi R8 Coupe 2012'], dtype='<U18'),
       array(['Audi V8 Sedan 1994'], dtype='<U18'),
       array(['Audi 100 Sedan 1994'], dtype='<U19'),
       arra

In [4]:
# NOTE(review): this repeats the processor load from the earlier In [4] cell verbatim; it looks like a leftover and is probably safe to delete.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")