# Fine-tune BLIP using Hugging Face `transformers` and `datasets` 🤗

This tutorial is largely based on the [GiT tutorial](https://colab.research.google.com/drive/1HLxgrG7xZJ9FvXckNG61J72FkyrbqKAA?usp=sharing) on how to fine-tune GiT on a custom image captioning dataset. Here we will use a dummy dataset of [football players](https://huggingface.co/datasets/ybelkada/football-dataset) ⚽ that is hosted on the Hub. The images have been manually selected together with the captions. 
Check the 🤗 [documentation](https://huggingface.co/docs/datasets/image_dataset) on how to create and upload your own image-text dataset.

In [1]:
# install requirements
import sys

!pip3 install transformers==4.15.0 timm==0.4.12 fairscale==0.4.4
# !git clone https://github.com/salesforce/BLIP
!pip install pycocotools pycocoevalcap

Collecting transformers==4.15.0
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting timm==0.4.12
  Downloading timm-0.4.12-py3-none-any.whl (376 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m377.0/377.0 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fairscale==0.4.4
  Downloading fairscale-0.4.4.tar.gz (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.4/235.4 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.m

In [2]:
# ! wget https://github.com/salesforce/BLIP/archive/refs/heads/main.zip
# ! unzip main.zip
# ! cp -r BLIP-main/* /kaggle/working/

In [3]:
# %cd /kaggle/working
# ! git clone https://github.com/MahmoudQaid/Blip
# %cd Blip

In [4]:
from tqdm.notebook import tqdm

# train_caption.py

In [5]:
def u():
    %cd /kaggle/working
    ! rm -r Blip
    ! git clone https://github.com/MahmoudQaid/Blip
    %cd Blip
u()

/kaggle/working
rm: cannot remove 'Blip': No such file or directory
Cloning into 'Blip'...
remote: Enumerating objects: 151, done.[K
remote: Counting objects: 100% (151/151), done.[K
remote: Compressing objects: 100% (124/124), done.[K
remote: Total 151 (delta 84), reused 70 (delta 25), pack-reused 0[K
Receiving objects: 100% (151/151), 6.95 MiB | 8.05 MiB/s, done.
Resolving deltas: 100% (84/84), done.
/kaggle/working/Blip


In [6]:
from collections import defaultdict
import json,string,re

# Function words that the caption-cleaning helpers delete when `pattern` is passed to them.
masked_word = ['is', 'at', 'for', 'am', 'are', 'and', 'or', 'but', 'of']
# Whole-word alternation over masked_word: \b(is|at|...|of)\b
pattern = r'\b({})\b'.format('|'.join(masked_word))

def load_doc_karpathy(json_file_path,pattern=None,lower=False):
    """Load a Karpathy-split annotation file and group cleaned captions by split.

    Each caption is optionally lower-cased, has ASCII punctuation removed,
    optionally has every match of ``pattern`` deleted, and finally has runs of
    whitespace collapsed to single spaces.

    Args:
        json_file_path: Path to a JSON file with a top-level ``'images'`` list.
            Every entry must provide ``'split'``, ``'filename'`` and
            ``'sentences'`` (a list of dicts holding a ``'raw'`` caption);
            ``'imgid'`` is read for ``'train'`` entries only.
        pattern: Optional regex (string or compiled) whose matches are removed
            from each caption. ``None`` skips this step.
        lower: When true, captions are lower-cased before cleaning.

    Returns:
        A ``defaultdict(list)`` keyed by split name.
        ``'train'`` holds one record per caption:
        ``{'caption': str, 'image': filename, 'image_id': imgid}``.
        Every other split holds one record per image:
        ``{'caption': [str, ...], 'image': filename}``.
    """
    with open(json_file_path,'r') as file:
        data=json.load(file)

    # Loop-invariant helpers: build them once instead of once per caption.
    strip_punctuation=str.maketrans('','',string.punctuation)
    regex=re.compile(pattern) if pattern is not None else None

    dict_data=defaultdict(list)
    for example in data['images']:
        split=example['split']
        is_train=split=='train'
        captions=[]

        for sentence in example['sentences']:
            cap=sentence['raw']
            if lower:
                cap=cap.lower()
            cap=cap.translate(strip_punctuation)
            if regex is not None:
                cap=regex.sub('',cap)
            # Removing words/punctuation can leave double spaces; normalise them.
            cap=' '.join(cap.split())

            if is_train:
                # Training data is flattened to one record per (image, caption) pair.
                dict_data[split].append({'caption':cap,
                                         'image': example['filename'],
                                         'image_id': example['imgid']})
            else:
                captions.append(cap)

        if not is_train:
            # Evaluation splits keep all reference captions of an image together.
            dict_data[split].append({'caption':captions,
                                     'image': example['filename']})

    return dict_data


KARPATHY_DATA=load_doc_karpathy(json_file_path='/kaggle/input/karpathy-splits/dataset_flickr8k.json',pattern=pattern,lower=True)   


In [7]:
def prepare_ref_caps_for_evaluate(list_dict,output_json_path):
    """Write reference captions to ``output_json_path`` and return the same structure.

    Args:
        list_dict: Iterable of records with an ``'image'`` identifier and a
            ``'caption'`` entry. ``'caption'`` is normally a list of strings; a
            single string is treated as a one-element list rather than being
            iterated character by character.
        output_json_path: Destination file; overwritten if it exists.

    Returns:
        ``{'annotations': [{'image_id', 'caption', 'id'}, ...],
        'images': [{'id'}, ...]}`` where annotation ids count up from 0 across
        all images, in input order.
    """
    references={'annotations':[],
                'images':[]}
    next_id=0
    for record in list_dict:
        captions=record['caption']
        if isinstance(captions,str):
            captions=[captions]
        for cap in captions:
            references['annotations'].append({'image_id':record['image'],'caption':cap,'id':next_id})
            next_id+=1
        references['images'].append({'id':record['image']})

    # Close the file deterministically so the JSON is fully flushed before it is read back.
    with open(output_json_path,'w') as out_file:
        json.dump(references,out_file)

    return references
    


In [8]:
len(KARPATHY_DATA['train'])

30000

In [9]:
# New=defaultdict(list)
# for i in KARPATHY_DATA:
#     for j,d in enumerate(KARPATHY_DATA[i]):
#         if j ==10:
#             break
#         New[i].append(d)
# # d
# KARPATHY_DATA=dict(New)


In [10]:
# ! mkdir annotation
# json.dump(KARPATHY_DATA['val'],open('annotation/coco_karpathy_val_gt.json','w'))
# json.dump(KARPATHY_DATA['test'],open('annotation/coco_karpathy_test_gt.json','w'))
# json.dump(KARPATHY_DATA['train'],open('annotation/coco_karpathy_train_gt.json','w'))

In [11]:
# ! pip install vit_pytorch
# import torch
# from vit_pytorch.vit import ViT

# v = ViT(
#     image_size = 256,
#     patch_size = 32,
#     num_classes = 1000,
#     dim = 1024,
#     depth = 6,
#     heads = 16,
#     mlp_dim = 2048,
#     dropout = 0.1,
#     emb_dropout = 0.1
# )

# # forward pass now returns predictions and the representation of the final layer
# predictions = v(torch.randn(1, 3, 256, 256))
# predictions.shape

In [12]:
model=None

In [16]:
'''
 * Copyright (c) 2022, salesforce.com, inc.
 * All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
 * By Junnan Li
'''
import argparse
import os
import ruamel_yaml as yaml
import numpy as np
import random
import time
import datetime
import json
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.distributed as dist
from torch.utils.data import DataLoader

from models.blip import blip_decoder
import utils
from utils import cosine_lr_schedule
from data import create_dataset, create_sampler, create_loader
from data.utils import save_result, coco_caption_eval


# dist.init_process_group(backend='nccl',world_size=1, init_method='env://',rank=0)


def train(model, data_loader, optimizer, epoch, device):
    """Run one training epoch and return the averaged meters as formatted strings.

    Args:
        model: Called as ``model(image, caption)``; the result is used as the loss.
        data_loader: Yields 3-tuples whose first two items are the image batch
            and the captions; the third item is ignored.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch index, used only in the progress header.
        device: Device the image batch is moved to.

    Returns:
        ``{meter_name: "<global average, 3 decimals>"}`` for every meter.
    """
    model.train()

    logger = utils.MetricLogger(delimiter="  ")
    for meter_name, value_fmt in (('lr', '{value:.6f}'), ('loss', '{value:.4f}')):
        logger.add_meter(meter_name, utils.SmoothedValue(window_size=1, fmt=value_fmt))
    header = 'Train Caption Epoch: [{}]'.format(epoch)
    print_freq = 50

    for image, caption, _ in logger.log_every(data_loader, print_freq, header):
        image = image.to(device)

        loss = model(image, caption)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        logger.update(loss=loss.item())
        logger.update(lr=optimizer.param_groups[0]["lr"])

    # Combine the meters of all processes before reporting.
    logger.synchronize_between_processes()
    print("Averaged stats:", logger.global_avg())

    formatted = {}
    for name, meter in logger.meters.items():
        formatted[name] = "{:.3f}".format(meter.global_avg)
    return formatted


@torch.no_grad()
def evaluate(model, data_loader, device, config):
    """Generate one caption per image in ``data_loader`` with gradients disabled.

    Args:
        model: Object exposing ``eval()`` and ``generate(image, ...)``.
        data_loader: Yields ``(image_batch, image_ids)`` pairs.
        device: Device the image batch is moved to.
        config: Mapping providing ``'num_beams'``, ``'max_length'`` and
            ``'min_length'`` for generation.

    Returns:
        List of ``{"image_id": ..., "caption": ...}`` dicts in loader order.
    """
    model.eval()

    logger = utils.MetricLogger(delimiter="  ")
    header = 'Caption generation:'
    print_freq = 10

    result = []
    for image, image_id in logger.log_every(data_loader, print_freq, header):
        captions = model.generate(
            image.to(device),
            sample=False,
            num_beams=config['num_beams'],
            max_length=config['max_length'],
            min_length=config['min_length'],
        )
        # Ids are stored exactly as the loader yields them (no .item() conversion).
        result.extend(
            {"image_id": img_id, "caption": caption}
            for caption, img_id in zip(captions, image_id)
        )

    return result


def main(args, config,model=None):
    """Train and/or evaluate the BLIP caption decoder on the splits in ``KARPATHY_DATA``.

    Per epoch (up to ``config['max_epoch']``): unless ``args.evaluate`` is set,
    the learning rate is updated with ``cosine_lr_schedule`` and one training
    epoch runs; captions are then generated for the val and test loaders,
    saved with ``save_result`` and scored with ``coco_caption_eval``. In
    training mode the checkpoint with the best val ``CIDEr + Bleu_4`` is
    written to ``checkpoint_best.pth`` and stats are appended to ``log.txt``;
    in evaluate mode stats go to ``evaluate.txt`` and the loop stops after the
    first pass.

    Args:
        args: Object with attributes ``device``, ``seed``, ``distributed``,
            ``evaluate``, ``result_dir``, ``output_dir`` (and ``gpu`` when
            ``distributed`` is true).
        config: Mapping with the keys read below: ``batch_size``,
            ``load_checkpoint``, ``checkpoint``, ``pretrained``, ``image_size``,
            ``vit``, ``vit_grad_ckpt``, ``vit_ckpt_layer``, ``prompt``,
            ``init_lr``, ``min_lr``, ``weight_decay``, ``max_epoch``,
            ``ann_root``, plus the generation keys used by ``evaluate``.
        model: Optional already-built model. It is used only when
            ``config['load_checkpoint']`` is false; otherwise a new model is
            built and the checkpoint weights are loaded into it.

    Returns:
        The model that was trained/evaluated (the DistributedDataParallel
        wrapper when ``args.distributed`` is true).
    """
    utils.init_distributed_mode(args)    
    
    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    cudnn.benchmark = True

    #### Dataset #### 
    print("Creating captioning dataset")
    # Annotations come from the in-memory KARPATHY_DATA dict via annot_dict.
    train_dataset, val_dataset, test_dataset = create_dataset('caption_coco', config,annot_dict=KARPATHY_DATA)  
#     train_dataset, val_dataset, test_dataset = create_dataset('caption_coco', config)  

    if args.distributed:
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()            
        samplers = create_sampler([train_dataset,val_dataset,test_dataset], [True,False,False], num_tasks, global_rank)         
    else:
        samplers = [None, None, None]
    
    train_loader, val_loader, test_loader = create_loader([train_dataset, val_dataset, test_dataset],samplers,
                                                          batch_size=[config['batch_size']]*3,num_workers=[4,4,4],
                                                          is_trains=[True, False, False], collate_fns=[None,None,None])         

    #### Model #### 
    print("Creating model")
    if config['load_checkpoint']:
        # Resume: build the decoder without pretrained weights, then load the saved state.
        model = blip_decoder(pretrained='', image_size=config['image_size'], vit=config['vit'], 
                               vit_grad_ckpt=config['vit_grad_ckpt'], vit_ckpt_layer=config['vit_ckpt_layer'], 
                               prompt=config['prompt'])
        # NOTE(review): no map_location is passed — confirm the checkpoint's saved device matches args.device.
        checkpoint=torch.load(config['checkpoint'])
        model.load_state_dict(checkpoint['model'])
    elif model is None:
        model = blip_decoder(pretrained=config['pretrained'], image_size=config['image_size'], vit=config['vit'], 
                               vit_grad_ckpt=config['vit_grad_ckpt'], vit_ckpt_layer=config['vit_ckpt_layer'], 
                               prompt=config['prompt'])
        # Visual-encoder parameters get requires_grad=False; this happens only for a freshly built model.
        for param in model.visual_encoder.parameters():
            param.requires_grad=False

    model = model.to(device)   
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module    
    
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=config['init_lr'], weight_decay=config['weight_decay'])
    if config['load_checkpoint']:
        optimizer.load_state_dict(checkpoint['optimizer'])
    # Best val CIDEr + Bleu_4 seen so far and the epoch it came from.
    best = 0
    best_epoch = 0

    print("Start training")
    start_time = time.time()    
    for epoch in range(0, config['max_epoch']):
        if not args.evaluate:        
            if args.distributed:
                train_loader.sampler.set_epoch(epoch)
                
            cosine_lr_schedule(optimizer, epoch, config['max_epoch'], config['init_lr'], config['min_lr'])
            train_stats = train(model, train_loader, optimizer, epoch, device) 
        val_result = evaluate(model_without_ddp, val_loader, device, config)
        val_result_file = save_result(val_result, args.result_dir, 'val_epoch%d'%epoch, remove_duplicate='image_id')        
    
        test_result = evaluate(model_without_ddp, test_loader, device, config)  
        test_result_file = save_result(test_result, args.result_dir, 'test_epoch%d'%epoch, remove_duplicate='image_id')  
        
        if utils.is_main_process():
            # The reference-caption files are rewritten from KARPATHY_DATA on every epoch before scoring.
            prepare_ref_caps_for_evaluate(KARPATHY_DATA['val'],config['ann_root']+'/coco_karpathy_val_gt.json')
            prepare_ref_caps_for_evaluate(KARPATHY_DATA['test'],config['ann_root']+'/coco_karpathy_test_gt.json')
            coco_val = coco_caption_eval(config['ann_root'],val_result_file,'val')
            coco_test = coco_caption_eval(config['ann_root'],test_result_file,'test')
            if args.evaluate:            
                log_stats = {**{f'val_{k}': v for k, v in coco_val.eval.items()},
                             **{f'test_{k}': v for k, v in coco_test.eval.items()},                       
                            }
                with open(os.path.join(args.output_dir, "evaluate.txt"),"a") as f:
                    f.write(json.dumps(log_stats) + "\n")                   
            else:             
                save_obj = {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'config': config,
                    'epoch': epoch,
                }

                # Model selection: keep the checkpoint with the highest val CIDEr + Bleu_4.
                if coco_val.eval['CIDEr'] + coco_val.eval['Bleu_4'] > best:
                    best = coco_val.eval['CIDEr'] + coco_val.eval['Bleu_4']
                    best_epoch = epoch                
                    torch.save(save_obj, os.path.join(args.output_dir, 'checkpoint_best.pth')) 
                    
                log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                             **{f'val_{k}': v for k, v in coco_val.eval.items()},
                             **{f'test_{k}': v for k, v in coco_test.eval.items()},                       
                             'epoch': epoch,
                             'best_epoch': best_epoch,
                            }
                with open(os.path.join(args.output_dir, "log.txt"),"a") as f:
                    f.write(json.dumps(log_stats) + "\n")     
                    
        # Evaluate-only mode needs a single pass over val/test.
        if args.evaluate: 
            break
#         dist.barrier()     

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str)) 
    return model
class Args(dict):
    """Dict whose items can also be read and written as attributes.

    ``obj.name`` reads ``obj['name']`` and ``obj.name = value`` stores
    ``obj['name'] = value``.
    """
    __setattr__ = dict.__setitem__

    def __getattr__(self, name):
        # Called only after normal attribute lookup fails. A missing key must
        # surface as AttributeError (not KeyError) so that hasattr() and
        # getattr(obj, name, default) behave as they do for ordinary objects.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

# Run settings as an attribute-accessible dict, built in one step.
args = Args(
    config='./configs/caption_coco.yaml',
    output_dir='output/Caption_coco',
    evaluate=False,
    device='cuda',
    seed=42,
    world_size=1,
    dist_url='env://',
    distributed=False,
)

if __name__ == '__main__':
    # `args` is the Args object defined above; it replaces the argparse CLI of
    # the original script. The inline dict below likewise replaces loading
    # args.config from YAML.
    config={
     'image_root': '/kaggle/input/flickr8k/Images',
#      'image_root':'/kaggle/input/coco2014',
     'ann_root': 'annotations',
     'coco_gt_root': 'annotation/coco_gt',
#      'pretrained':''
     'pretrained': 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth', # vit: base
#      'pretrained': 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth',  # vit: large
     'vit': 'base',
#      'vit': 'large',
     'vit_grad_ckpt': False,
     'vit_ckpt_layer': 0,
     'batch_size': 16,
     'init_lr': 1e-05,
     'image_size': 384,
     'max_length': 20,
     'min_length': 5,
     'num_beams': 3,
     'prompt': 'a picture of ',
     'weight_decay': 0.05,
     'min_lr': 0,
     'max_epoch': 1,
     'checkpoint':'/kaggle/input/blip-temp/Blip/output/Caption_coco/checkpoint_best.pth',
     'load_checkpoint':False
    }
    args.result_dir = os.path.join(args.output_dir, 'result')

    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    Path(args.result_dir).mkdir(parents=True, exist_ok=True)
    Path(config['ann_root']).mkdir(parents=True, exist_ok=True)

    # Record the exact configuration next to the run outputs. The context
    # manager closes the file so config.yaml is flushed before training starts.
    with open(os.path.join(args.output_dir, 'config.yaml'), 'w') as config_file:
        yaml.dump(config, config_file)

    # `model` is passed in so a model left in the kernel by an earlier run of
    # this cell is reused instead of rebuilt.
    model=main(args,config,model)

Not using distributed mode
Creating captioning dataset
Creating model
load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth
Start training
Train Caption Epoch: [0]  [   0/1875]  eta: 0:53:20  lr: 0.000010  loss: 4.0479  time: 1.7068  data: 1.1439  max mem: 13352
Train Caption Epoch: [0]  [  50/1875]  eta: 0:16:06  lr: 0.000010  loss: 3.2966  time: 0.5105  data: 0.0002  max mem: 13352
Train Caption Epoch: [0]  [ 100/1875]  eta: 0:15:15  lr: 0.000010  loss: 3.5283  time: 0.5021  data: 0.0002  max mem: 13352
Train Caption Epoch: [0]  [ 150/1875]  eta: 0:14:42  lr: 0.000010  loss: 3.2327  time: 0.5001  data: 0.0002  max mem: 13352
Train Caption Epoch: [0]  [ 200/1875]  eta: 0:14:13  lr: 0.000010  loss: 3.0106  time: 0.5010  data: 0.0002  max mem: 13352
Train Caption Epoch: [0]  [ 250/1875]  eta: 0:13:46  lr: 0.000010  loss: 3.1243  time: 0.5130  data: 0.0002  max mem: 13352
Train Caption Epoch: [0]  [ 300/1875]  et

PTBTokenizer tokenized 55487 tokens at 177154.81 tokens per second.
PTBTokenizer tokenized 9904 tokens at 45610.27 tokens per second.


setting up scorers...
Downloading stanford-corenlp-3.6.0 for SPICE ...
Progress: 384.5M / 384.5M (100.0%)
Extracting stanford-corenlp-3.6.0 ...
Done.
computing Bleu score...
{'testlen': 8905, 'reflen': 8929, 'guess': [8905, 7905, 6905, 5905], 'correct': [6972, 4004, 1986, 874]}
ratio: 0.9973121290176954
Bleu_1: 0.781
Bleu_2: 0.628
Bleu_3: 0.484
Bleu_4: 0.359
computing METEOR score...
METEOR: 0.284
computing Rouge score...
ROUGE_L: 0.601
computing CIDEr score...
CIDEr: 1.105
computing SPICE score...


Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [1.2 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [2.1 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [1.1 sec].
Loading classif

SPICE evaluation took: 3.836 min
SPICE: 0.220
Bleu_1: 0.781
Bleu_2: 0.628
Bleu_3: 0.484
Bleu_4: 0.359
METEOR: 0.284
ROUGE_L: 0.601
CIDEr: 1.105
SPICE: 0.220
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 55218 tokens at 179894.08 tokens per second.
PTBTokenizer tokenized 9915 tokens at 51629.67 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 8916, 'reflen': 8895, 'guess': [8916, 7916, 6916, 5916], 'correct': [7044, 4218, 2211, 1054]}
ratio: 1.0023608768970205
Bleu_1: 0.790
Bleu_2: 0.649
Bleu_3: 0.512
Bleu_4: 0.394
computing METEOR score...
METEOR: 0.298
computing Rouge score...
ROUGE_L: 0.613
computing CIDEr score...
CIDEr: 1.175
computing SPICE score...


Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.9 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.7 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.9 sec].
Loading classif

SPICE evaluation took: 3.765 min
SPICE: 0.226
Bleu_1: 0.790
Bleu_2: 0.649
Bleu_3: 0.512
Bleu_4: 0.394
METEOR: 0.298
ROUGE_L: 0.613
CIDEr: 1.175
SPICE: 0.226
Training time 0:30:28


In [None]:
# from torchsummary import summary
# summary(model,)

In [None]:
assert(False)

In [None]:
from glob import glob
glob('output/Caption_coco/result/*')

In [None]:
# chpt['model'].keys()

In [None]:
# visual_dict.keys()

In [None]:
# import torch 
from torchvision.datasets.utils import download_url
# Download the BLIP "large" captioning checkpoint into /kaggle/working and load it.
download_url('https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth','/kaggle/working')
chpt=torch.load('/kaggle/working/model_large_caption.pth')
# Collect the checkpoint entries whose first dotted component is 'visual_encoder',
# with that prefix stripped from the key.
visual_dict={}
for i in chpt['model']:
    if i.split('.')[0]=='visual_encoder':
        visual_dict[i[len('visual_encoder.'):]]=chpt['model'][i]
# NOTE(review): the training config in this notebook sets 'vit': 'base' while this
# checkpoint is the "large" one — presumably the tensor shapes will not match; confirm.
model.visual_encoder.load_state_dict(visual_dict)

In [None]:
# download_url('https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth','/kaggle/working')
# chpt2=torch.load('/kaggle/working/model_base_caption_capfilt_large.pth')

In [None]:
# # len(chpt2['model'].keys()),len(chpt['model'].keys())
# for i,key in enumerate(list(chpt['model'].keys())):
#     if key not in list(chpt2['model'].keys()):
#         print(i," : ",key)

In [None]:
assert(False)

In [None]:
KARPATHY_DATA['val'][0],KARPATHY_DATA['train'][0]

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def eval_train_split():
    """Caption every training image once with the current model and save the results.

    Reads the notebook globals ``KARPATHY_DATA``, ``config``, ``model`` and
    ``device``. The generated captions are written to
    ``/kaggle/working/train_result.json`` and the first one is printed.
    """
    # One record per training image (its first caption), in first-seen order.
    # A set keeps the duplicate check O(1); scanning a list made this quadratic.
    seen_images=set()
    train_kar=[]
    for record in KARPATHY_DATA['train']:
        image_name=record['image']
        if image_name in seen_images:
            continue
        seen_images.add(image_name)
        train_kar.append({'image':image_name,'caption':[record['caption']]})

    # The de-duplicated training images are passed in the 'val' slot —
    # presumably so they are loaded the way evaluation images are; confirm
    # against create_dataset.
    train_dataset, val_dataset, test_dataset = create_dataset(
        'caption_coco', config,
        annot_dict={'val':train_kar,'train':KARPATHY_DATA['train'],'test':KARPATHY_DATA['val']})

    samplers = [None, None, None]

    # Only the middle loader (the training images in the 'val' slot) is used.
    _, val_loader, _ = create_loader([train_dataset, val_dataset, test_dataset],samplers,
                                     batch_size=[config['batch_size']]*3,num_workers=[4,4,4],
                                     is_trains=[True, False, False], collate_fns=[None,None,None])
    train_result=evaluate(model, val_loader, device, config)

    # Close the file deterministically; a later cell reads it back.
    with open('/kaggle/working/train_result.json','w') as result_file:
        json.dump(train_result,result_file)
    print(train_result[0])
    
    
eval_train_split()


Caption generation:  [  0/375]  eta: 0:21:12    time: 3.3924  data: 0.9143  max mem: 13352
Caption generation:  [ 10/375]  eta: 0:15:29    time: 2.5462  data: 0.0835  max mem: 13352
Caption generation:  [ 20/375]  eta: 0:14:54    time: 2.4770  data: 0.0004  max mem: 13352
Caption generation:  [ 30/375]  eta: 0:14:24    time: 2.4858  data: 0.0003  max mem: 13352
Caption generation:  [ 40/375]  eta: 0:13:57    time: 2.4772  data: 0.0003  max mem: 13352
Caption generation:  [ 50/375]  eta: 0:13:30    time: 2.4759  data: 0.0003  max mem: 13352
Caption generation:  [ 60/375]  eta: 0:13:05    time: 2.4816  data: 0.0002  max mem: 13352
Caption generation:  [ 70/375]  eta: 0:12:40    time: 2.4916  data: 0.0003  max mem: 13352
Caption generation:  [ 80/375]  eta: 0:12:15    time: 2.4943  data: 0.0003  max mem: 13352
Caption generation:  [ 90/375]  eta: 0:11:49    time: 2.4715  data: 0.0003  max mem: 13352
Caption generation:  [100/375]  eta: 0:11:23    time: 2.4508  data: 0.0003  max mem: 13352

# Try model generation

In [None]:
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
image_size = 384
def load_demo_image(image_size,device,image_path=None):
    """Open an image, display a 1/5-scale preview, and return it as a normalised batch.

    Args:
        image_size: Target height and width (pixels) of the square resize.
        device: Device the resulting tensor is moved to.
        image_path: Path of the image to open. When omitted, the
            notebook-global ``img_url`` is used, so existing two-argument
            calls keep working.

    Returns:
        The image as an RGB tensor, resized, normalised, with a leading batch
        dimension of 1 added, on ``device``.
    """
    source = img_url if image_path is None else image_path
    raw_image = Image.open(source).convert('RGB')

    w,h = raw_image.size
    # Preview at 1/5 scale; clamp to 1 px so tiny images never request a zero-sized resize.
    display(raw_image.resize((max(1,w//5),max(1,h//5))))

    transform = transforms.Compose([
        transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        # Per-channel mean / std applied after ToTensor.
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
        ])
    image = transform(raw_image).unsqueeze(0).to(device)
    return image



In [None]:
# beam search
img_url="/kaggle/input/flickr8k/Images/"+KARPATHY_DATA['test'][8]['image']
image=load_demo_image(image_size,device)
caption = model.generate(image, sample=False, num_beams=3, max_length=20, min_length=5) 
# nucleus sampling
# caption = model.generate(image, sample=True, top_p=0.9, max_length=20, min_length=5) 
print('caption: '+caption[0])



In [None]:
assert(False)

# HappyTT 

In [19]:
from collections import defaultdict
import json,string,re

# Function words that the caption-cleaning helpers delete when `pattern` is passed to them.
masked_word = ['is', 'at', 'for', 'am', 'are', 'and', 'or', 'but', 'of']
# Whole-word alternation over masked_word: \b(is|at|...|of)\b
pattern = r'\b({})\b'.format('|'.join(masked_word))

def prepare_tt_kar_text(json_file_path,pattern=None,lower=False):
    """Load a Karpathy-split annotation file as ``split -> filename -> [captions]``.

    Each caption is optionally lower-cased, has ASCII punctuation removed,
    optionally has every match of ``pattern`` deleted, and finally has runs of
    whitespace collapsed to single spaces.

    Args:
        json_file_path: Path to a JSON file with a top-level ``'images'`` list
            whose entries provide ``'split'``, ``'filename'`` and
            ``'sentences'`` (a list of dicts holding a ``'raw'`` caption).
        pattern: Optional regex (string or compiled) whose matches are removed
            from each caption. ``None`` skips this step.
        lower: When true, captions are lower-cased before cleaning.

    Returns:
        A two-level ``defaultdict``: ``result[split][filename]`` is the list of
        cleaned captions of that image, in file order. Because both levels are
        defaultdicts, looking up a missing split or filename silently creates
        an empty entry.
    """
    with open(json_file_path,'r') as file:
        data=json.load(file)

    # Loop-invariant helpers: build them once instead of once per caption.
    strip_punctuation=str.maketrans('','',string.punctuation)
    regex=re.compile(pattern) if pattern is not None else None

    dict_data=defaultdict(lambda: defaultdict(list))
    for example in data['images']:
        for sentence in example['sentences']:
            cap=sentence['raw']
            if lower:
                cap=cap.lower()
            cap=cap.translate(strip_punctuation)
            if regex is not None:
                cap=regex.sub('',cap)
            # Removing words/punctuation can leave double spaces; normalise them.
            cap=' '.join(cap.split())

            dict_data[example['split']][example['filename']].append(cap)

    return dict_data
kar_with_pattern=prepare_tt_kar_text('/kaggle/input/karpathy-splits/dataset_flickr8k.json',pattern=pattern,lower=True)
raw_kar=prepare_tt_kar_text('/kaggle/input/karpathy-splits/dataset_flickr8k.json',pattern=None,lower=False)

In [None]:
# train_result=json.load(open('/kaggle/input/blip-mask-results-1e/train_result.json','r'))
# val_result=json.load(open('/kaggle/input/blip-mask-results-1e/val_result.json','r'))
# test_result=json.load(open('/kaggle/input/blip-mask-results-1e/test_result.json','r'))

# len(train_result),len(val_result),len(test_result),train_result[0],val_result[0],test_result[0]

In [20]:
for i in raw_kar['train']:
    print(raw_kar['train'][i])
    print()
    print(kar_with_pattern['train'][i])
    break

['A black dog is running after a white dog in the snow', 'Black dog chasing brown dog through snow', 'Two dogs chase each other across the snowy ground', 'Two dogs play together in the snow', 'Two dogs running through a low lying body of water']

['a black dog running after a white dog in the snow', 'black dog chasing brown dog through snow', 'two dogs chase each other across the snowy ground', 'two dogs play together in the snow', 'two dogs running through a low lying body water']


In [21]:
import csv
# Generated captions per split, loaded from the JSON files written earlier in
# this notebook. Each file is opened in a `with` block so its handle is closed
# right after parsing instead of being left to the garbage collector.
captioning_results={}
for split_name, result_path in (
        ('train','/kaggle/working/train_result.json'),
        ('val','output/Caption_coco/result/val_epoch0.json'),
        ('test','output/Caption_coco/result/test_epoch0.json')):
    with open(result_path,'r') as result_file:
        captioning_results[split_name]=json.load(result_file)
def generate_csv(csv_path, splits=('train',),input_mode=None):
    """Write an ``input,target`` CSV of grammar-correction pairs.

    Reads the notebook globals ``kar_with_pattern``, ``raw_kar`` and
    ``captioning_results``. Every input is prefixed with ``'grammar: '`` and
    every target is a raw reference caption.

    Args:
        csv_path: Output CSV path; overwritten if it exists.
        splits: Split names to include, processed in order.
        input_mode: Mapping with optional boolean keys ``'masked'`` and
            ``'pred'``; a missing key (or ``None`` for the whole mapping)
            means ``True``.
            ``'masked'``: pair each pattern-cleaned caption with the raw
            caption at the same position for the same image.
            ``'pred'``: pair each generated caption with every raw reference
            caption of its image.
    """
    # Defaults are merged in here rather than used as mutable default arguments,
    # and so that a partial mapping such as {'pred': True} does not raise KeyError.
    mode={'pred':True,'masked':True}
    if input_mode is not None:
        mode.update(input_mode)

    with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        for sp in splits:
            if mode['masked']:
                for img, masked_caps in kar_with_pattern[sp].items():
                    raw_caps=raw_kar[sp][img]
                    for i, masked_cap in enumerate(masked_caps):
                        writer.writerow(['grammar: '+masked_cap, raw_caps[i]])
            if mode['pred']:
                for res in captioning_results[sp]:
                    for cap in raw_kar[sp][res['image_id']]:
                        writer.writerow(['grammar: '+res['caption'], cap])


In [22]:

generate_csv("train.csv", splits=['train'],input_mode={'pred':True,'masked':True})
generate_csv("eval.csv", splits=['val'],input_mode={'pred':True,'masked':False})
generate_csv("test.csv", splits=['test'],input_mode={'pred':True,'masked':False})

In [None]:
# import pandas as pd
# df=pd.read_csv('train.csv')
# len(df)

In [None]:
# df.head(20)

In [23]:
!pip install happytransformer

from happytransformer import HappyTextToText, TTSettings

happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
# happy_tt = HappyTextToText("T5", "t5-base")


args = TTSettings(num_beams=5, min_length=1)

# Add the prefix "grammar: " before each input 
result = happy_tt.generate_text("grammar: This sentences has has bads grammar.", args=args)
print(result.text) # This sentence has bad grammar.

Collecting happytransformer
  Downloading happytransformer-2.4.1-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: happytransformer
Successfully installed happytransformer-2.4.1
[0m

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

This sentence has bad grammar.


In [24]:
from happytransformer import TTTrainArgs
# Fine-tune `happy_tt` on the rows of train.csv using a batch size of 50.
# Every other training option is left at the library default; the recorded
# run log for this cell reports 3 epochs over 60000 examples (3600 steps).
args2 = TTTrainArgs(batch_size=50)
# Updates `happy_tt` in place; later cells use the fine-tuned model.
happy_tt.train("train.csv", args=args2)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-4c2273126b3d9aec/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-4c2273126b3d9aec/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?ba/s]

***** Running training *****
  Num examples = 60000
  Num Epochs = 3
  Instantaneous batch size per device = 50
  Total train batch size (w. parallel, distributed & accumulation) = 50
  Gradient Accumulation steps = 1
  Total optimization steps = 3600


Step,Training Loss
500,1.3818
1000,1.2577
1500,1.1992
2000,1.1698
2500,1.1813
3000,1.1472
3500,1.1337




Training completed. Do not forget to share your model on huggingface.co/models =)




# Saving the fine-tuned model

In [25]:
# Persist the current `happy_tt` model to the Kaggle working directory.
# Per the recorded output this writes the config, the model weights
# (pytorch_model.bin) and the tokenizer files into that folder.
happy_tt.save("/kaggle/working/happy_tt/")

Configuration saved in /kaggle/working/happy_tt/config.json
Model weights saved in /kaggle/working/happy_tt/pytorch_model.bin
tokenizer config file saved in /kaggle/working/happy_tt/tokenizer_config.json
Special tokens file saved in /kaggle/working/happy_tt/special_tokens_map.json
Copy vocab file to /kaggle/working/happy_tt/spiece.model


In [None]:
# Evaluate `happy_tt` on eval.csv and print the resulting loss.
# NOTE(review): the result is named `before_training`, but this cell sits after
# the training and save cells. Run top to bottom, it reports the loss of the
# already fine-tuned model, not a pre-training baseline. Confirm the intended
# position of this cell (it has no recorded execution count).
before_training=happy_tt.eval('eval.csv')
print(before_training.loss)

In [26]:
# Run the grammar-correction model over every predicted test caption and build
# the two COCO-style structures used for caption evaluation:
#   preds2 - one {'image_id', 'caption'} entry per image (corrected prediction)
#   annot  - ground truth: one 'images' record per image and one 'annotations'
#            record per reference caption in raw_kar['test']
preds2 = []
annot = {'annotations': [], 'images': []}
next_ann_id = 0  # running counter: every reference caption gets its own annotation id
for res in tqdm(captioning_results['test']):
    img_id = res['image_id']
    # The model expects the "grammar: " prefix on every input.
    out_cap = happy_tt.generate_text("grammar: " + res['caption'], args=args).text
    preds2.append({'image_id': img_id, 'caption': out_cap})
    # Register the image once per prediction, not once per reference caption
    # (the previous version appended a duplicate image record for every caption).
    annot['images'].append({'id': img_id})
    for cap in raw_kar['test'][img_id]:
        # Annotation ids must be unique; sharing one id across all captions of
        # an image makes id-keyed lookups keep only the last caption.
        annot['annotations'].append({'image_id': img_id, 'caption': cap, 'id': next_ann_id})
        next_ann_id += 1
        

  0%|          | 0/1000 [00:00<?, ?it/s]

In [27]:
# Write the ground-truth annotations and the corrected predictions to disk,
# then score the predictions with coco_caption_eval.
# The files are written inside `with` blocks so they are flushed and closed
# before the evaluator reads them back (the bare open(...) handles were never
# explicitly closed).
with open('/kaggle/working/coco_karpathy_test_gt.json', 'w') as gt_file:
    json.dump(annot, gt_file)
with open('/kaggle/working/coco_karpathy_test_result.json', 'w') as result_file:
    json.dump(preds2, result_file)
# NOTE(review): the recorded run of this cell printed BLEU/METEOR/ROUGE_L/CIDEr
# and was interrupted (KeyboardInterrupt) while SPICE was starting up.
coco_val = coco_caption_eval('/kaggle/working','/kaggle/working/coco_karpathy_test_result.json','test')

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
tokenization...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


PTBTokenizer tokenized 59208 tokens at 141402.80 tokens per second.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


PTBTokenizer tokenized 10760 tokens at 54938.75 tokens per second.


setting up scorers...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
computing Bleu score...
{'testlen': 9761, 'reflen': 9749, 'guess': [9761, 8761, 7761, 6761], 'correct': [7669, 4580, 2410, 1169]}
ratio: 1.0012308954763565
Bleu_1: 0.786
Bleu_2: 0.641
Bleu_3: 0.503
Bleu_4: 0.385
computing METEOR score...
METEOR: 0.294
computing Rouge score...
ROUGE_L: 0.599
computing CIDEr score...
CIDEr: 1.101
computing SPICE score...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [1.2 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.8 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [1.1 sec].
Loading classif

KeyboardInterrupt: 

In [None]:
# Always raises AssertionError. Presumably a deliberate stop so that "Run All"
# halts here instead of continuing into the cell below — TODO confirm intent.
assert(False)

In [None]:
# Repeats the dump-and-evaluate step: write the ground-truth annotations and
# the corrected predictions to disk, then score them with coco_caption_eval.
# The files are written inside `with` blocks so they are flushed and closed
# before the evaluator reads them back (the bare open(...) handles were never
# explicitly closed).
with open('/kaggle/working/coco_karpathy_test_gt.json', 'w') as gt_file:
    json.dump(annot, gt_file)
with open('/kaggle/working/coco_karpathy_test_result.json', 'w') as result_file:
    json.dump(preds2, result_file)
coco_val = coco_caption_eval('/kaggle/working','/kaggle/working/coco_karpathy_test_result.json','test')
