In [3]:
! pip install tensorboard
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git
! pip install pytorch_lightning

[0mCollecting tensorboard
  Downloading tensorboard-2.11.2-py3-none-any.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m105.9 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting markdown>=2.6.8
  Downloading Markdown-3.4.1-py3-none-any.whl (93 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.3/93.3 kB[0m [31m251.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting grpcio>=1.24.3
  Downloading grpcio-1.51.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m247.0 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard-plugin-wit>=1.6.0
  Downloading tensorboard_plugin_wit-1.8.1-py3-none-any.whl (781 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m781.3/781.3 kB[0m [31m309.2 MB/s[0m eta [36m0:00:00[0m
Collecting werkzeug>=1.0.1
  Downloading Werkzeug-2.2.3-py3-none-any.whl (233 kB)
[2K     

In [2]:
from pathlib import Path
from random import randint, choice

import PIL
import argparse
import clip
import torch
import json
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms as T
from pytorch_lightning import LightningDataModule



# DataLoader

In [3]:
class TextImageDataset(Dataset):
    def __init__(self,
                 data: str,
                 shuffle=False,
                 custom_tokenizer=False
                 ):
        """Create a text image dataset from a json file with img/text pair, img should be preprocessed already by CLIP "ViT-B/32" Preprocessor
        Args:
            data (str): Path of the json file for the input pair. key being preprocessed image file location, value being the corresponding description
            shuffle (bool, optional): Whether or not to have shuffling behavior during sampling. Defaults to False.
            custom_tokenizer (bool, optional): Whether or not there is a custom tokenizer. Defaults to False.
        """
        super().__init__()
        self.shuffle = shuffle
        
        
        with open(data, 'r') as f:
            self.img_to_text = json.load(f)
        self.imgs = list(self.img_to_text.keys())
            
        self.custom_tokenizer = custom_tokenizer

    def __len__(self):
        return len(self.imgs)
    
    def fix_img(self, img):
        return img.convert('RGB') if img.mode != 'RGB' else img

    def random_sample(self):
        return self.__getitem__(randint(0, self.__len__() - 1))

    def sequential_sample(self, ind):
        if ind >= self.__len__() - 1:
            return self.__getitem__(0)
        return self.__getitem__(ind + 1)

    def skip_sample(self, ind):
        if self.shuffle:
            return self.random_sample()
        return self.sequential_sample(ind=ind)

    def __getitem__(self, ind):

        image_file = self.imgs[ind]
        description = self.img_to_text[image_file]

        try:
            tokenized_text = description if self.custom_tokenizer else clip.tokenize(description, truncate=True)[0]
        except:
            print(f"An exception occurred trying to load contract description {image_file}.")
            print(f"Skipping index {ind}")
            return self.skip_sample(ind)
        
        try:
            image_tensor = torch.load(image_file)
        except (PIL.UnidentifiedImageError, OSError) as corrupt_image_exceptions:
            print(f"An exception occurred trying to load file {image_file}.")
            print(f"Skipping index {ind}")
            return self.skip_sample(ind)

        # Success
        return image_tensor, tokenized_text, image_file.split('/')[-1]


In [4]:
class TextImageDataModule(LightningDataModule):
    def __init__(self,
                 data: str,
                 batch_size: int,
                 num_workers=0,
                 shuffle=False,
                 custom_tokenizer=None,
                 eval=False
                 ):
        """Create a text image data module from directories with congruent text and image names.
        Args:
            data (str): Json file containing images and text pairs
            batch_size (int): The batch size of each dataloader.
            num_workers (int, optional): The number of workers in the DataLoader. Defaults to 0.
            shuffle (bool, optional): Whether or not to have shuffling behavior during sampling. Defaults to False.
            custom_tokenizer (transformers.AutoTokenizer, optional): The tokenizer to use on the text. Defaults to None.
            eval (bool, optional): Eval mode or not
        """
        super().__init__()
        self.data = data
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.shuffle = shuffle
        self.custom_tokenizer = custom_tokenizer
        if eval:
            self.drop_last = False
        else:
            self.drop_last = True
        
    
    # Used later for scirpting
    @staticmethod
    def add_argparse_args(parent_parser):
        parser = argparse.ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument('--data', type=str, required=True, help='json file of the text/vision pair')
        parser.add_argument('--batch_size', type=int, help='size of the batch')
        parser.add_argument('--num_workers', type=int, default=0, help='number of workers for the dataloaders')
        parser.add_argument('--shuffle', type=bool, default=False, help='whether to use shuffling during sampling')
        return parser
    
    def setup(self, stage=None):
        self.dataset = TextImageDataset(self.data, shuffle=self.shuffle, custom_tokenizer=not self.custom_tokenizer is None)
    
    def train_dataloader(self):
        return DataLoader(self.dataset, batch_size=self.batch_size, shuffle=self.shuffle, num_workers=self.num_workers, drop_last=self.drop_last, collate_fn=self.dl_collate_fn) # 
    def dl_collate_fn(self, batch):
        if self.custom_tokenizer is None:
            return torch.stack([row[0] for row in batch]), torch.stack([row[1] for row in batch]), [row[2] for row in batch]
        else:
            return torch.stack([row[0] for row in batch]), self.custom_tokenizer([row[1] for row in batch], padding=True, truncation=True, return_tensors="pt"), [row[2] for row in batch]

# Trainer

In [5]:
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime


device = "cuda:0" if torch.cuda.is_available() else "cpu" # If using GPU then use mixed precision training.
model, preprocess = clip.load("ViT-B/32",device=device,jit=False) #Must set jit=False for training
#https://github.com/openai/CLIP/issues/57
def convert_models_to_fp32(model): 
    for p in model.parameters(): 
        p.data = p.data.float() 
        p.grad.data = p.grad.data.float() 



def train_one_epoch(epoch_index, tb_writer, steps, loss_img, loss_txt):
    running_loss = 0.
    last_loss = 0.

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, batch in enumerate(train_loader):
        # Every data instance is an input + label pair
        images,texts,_ = batch 
        optimizer.zero_grad()

        images = images.to(device)
        texts = texts.to(device)
        logits_per_image, logits_per_text = model(image=images, text=texts)

        ground_truth = torch.arange(len(images),dtype=torch.long,device=device)

        total_loss = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2
        total_loss.backward()
        

        if device == "cpu":
             optimizer.step()
        else : 
            convert_models_to_fp32(model)
            optimizer.step()
            clip.model.convert_weights(model)
            
        # Gather data and report
        running_loss += total_loss.item()
        if i % steps == steps-1:
            last_loss = running_loss / steps # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(train_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss

In [6]:
# Resume Model
model.load_state_dict(torch.load('./model_checkpoint/model_lr_1e-06_bs_64_20230302_213906_33'))

<All keys matched successfully>

In [8]:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
epoch_number = 34
steps_to_report = 50

SAVED_PATH = './model_checkpoint'
EPOCH = 20
BATCH_SIZE = 128
LEARNING_RATE = 1e-8

TRAIN_JSON = './data/train_50000.json'
TEST_JSON = './data/test_50000.json'

TrainDataModule = TextImageDataModule(TRAIN_JSON, BATCH_SIZE, num_workers=2, shuffle=True)
TrainDataModule.setup()
train_loader = TrainDataModule.train_dataloader()

TestDataModule = TextImageDataModule(TEST_JSON, BATCH_SIZE, num_workers=2)
TestDataModule.setup()
validation_loader = TestDataModule.train_dataloader()


loss_img = torch.nn.CrossEntropyLoss()
loss_txt = torch.nn.CrossEntropyLoss()
## 


best_vloss = 1_000_000.


# https://github.com/openai/CLIP/issues/150
# As Suggested, turn on eval mode even in training
# model.eval()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, betas=(0.9,0.98), eps=1e-6,weight_decay=0.02) #Params used from paper, the lr is smaller, safer for fine tuning to new dataset

for epoch in range(EPOCH):
    print('EPOCH {}:'.format(epoch_number + 1))
    
    model.train()
    avg_loss = train_one_epoch(epoch_number, writer, steps_to_report, loss_img, loss_txt)
    model.eval()

    running_vloss = 0.0
    with torch.no_grad():
        for i, vbatch in enumerate(validation_loader):
            images,texts,_ = vbatch 

            images = images.to(device)
            texts = texts.to(device)
            logits_per_image, logits_per_text = model(image=images, text=texts)
            ground_truth = torch.arange(len(images),dtype=torch.long,device=device)

            vloss = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2
            
            running_vloss += vloss

    avg_vloss = running_vloss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = './model_checkpoint/model_lr_{}_bs_{}_{}_{}'.format(LEARNING_RATE, BATCH_SIZE, timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1



EPOCH 35:
  batch 50 loss: 4.2980859375
  batch 100 loss: 4.283828125
  batch 150 loss: 4.272265625
  batch 200 loss: 4.264375
  batch 250 loss: 4.27109375
  batch 300 loss: 4.276875
LOSS train 4.276875 valid 4.87109375
EPOCH 36:
  batch 50 loss: 4.26734375
  batch 100 loss: 4.264140625
  batch 150 loss: 4.292265625
  batch 200 loss: 4.2857421875
  batch 250 loss: 4.2488671875
  batch 300 loss: 4.28109375
LOSS train 4.28109375 valid 4.87109375
EPOCH 37:
  batch 50 loss: 4.3000390625
  batch 100 loss: 4.27421875
  batch 150 loss: 4.28140625
  batch 200 loss: 4.25203125
  batch 250 loss: 4.26859375
  batch 300 loss: 4.295
LOSS train 4.295 valid 4.87109375
EPOCH 38:
  batch 50 loss: 4.2498828125
  batch 100 loss: 4.2833203125
  batch 150 loss: 4.269921875
  batch 200 loss: 4.3032421875
  batch 250 loss: 4.31109375
  batch 300 loss: 4.270703125
LOSS train 4.270703125 valid 4.87109375
EPOCH 39:
  batch 50 loss: 4.28765625
  batch 100 loss: 4.2790625
  batch 150 loss: 4.2853125
  batch 200 l

In [28]:
%tensorboard --logdir logs

UsageError: Line magic function `%tensorboard` not found.


In [30]:
pip install tbparse

[0mCollecting tbparse
  Downloading tbparse-0.0.7-py3-none-any.whl (17 kB)
Installing collected packages: tbparse
Successfully installed tbparse-0.0.7
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [31]:
from tbparse import SummaryReader
log_dir = "runs/fashion_trainer_20230301_191159"
reader = SummaryReader(log_dir)
df = reader.scalars
print(df)

    step                           tag     value
0     50                    Loss/train  2.975039
1    108                    Loss/train  1.253066
2    166                    Loss/train  0.480698
3    224                    Loss/train  0.285620
4    282                    Loss/train  0.220945
5      1  Training vs. Validation Loss  2.975039
6      1  Training vs. Validation Loss  2.560547
7      2  Training vs. Validation Loss  1.253066
8      2  Training vs. Validation Loss  3.060547
9      3  Training vs. Validation Loss  0.480698
10     3  Training vs. Validation Loss  3.791016
11     4  Training vs. Validation Loss  0.285620
12     4  Training vs. Validation Loss  4.070312
13     5  Training vs. Validation Loss  0.220945
14     5  Training vs. Validation Loss  4.765625


In [None]:
print