<a href="https://colab.research.google.com/github/KimDaeUng/t2i_bert_finetuning/blob/master/Get_BERT_CUB_Captions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get BERT embedding from CUB Captions

In [None]:
!pip install nlp
!pip install transformers

Collecting nlp
[?25l  Downloading https://files.pythonhosted.org/packages/09/e3/bcdc59f3434b224040c1047769c47b82705feca2b89ebbc28311e3764782/nlp-0.4.0-py3-none-any.whl (1.7MB)
[K     |▏                               | 10kB 24.0MB/s eta 0:00:01[K     |▍                               | 20kB 6.1MB/s eta 0:00:01[K     |▋                               | 30kB 6.9MB/s eta 0:00:01[K     |▉                               | 40kB 7.9MB/s eta 0:00:01[K     |█                               | 51kB 7.2MB/s eta 0:00:01[K     |█▏                              | 61kB 8.0MB/s eta 0:00:01[K     |█▍                              | 71kB 9.0MB/s eta 0:00:01[K     |█▋                              | 81kB 9.1MB/s eta 0:00:01[K     |█▉                              | 92kB 10.0MB/s eta 0:00:01[K     |██                              | 102kB 9.3MB/s eta 0:00:01[K     |██▏                             | 112kB 9.3MB/s eta 0:00:01[K     |██▍                             | 122kB 9.3MB/s eta 0:00:01

In [None]:
import os
from glob import glob
import torch

## Prepare dataset

In [None]:
# Attach Google Drive to the Colab filesystem; every path used below lives under it.
from os import path
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from nlp import load_dataset

In [None]:
# Text file on the mounted Drive that holds the CUB captions.
path_cub_whole_captions = "/content/drive/My Drive/Colab Notebooks/t2i/data/whole_captions.txt"

# Build the dataset with the generic 'text' loader, pointing it at the caption file.
whole_dataset = load_dataset(
    'text',
    data_files=path_cub_whole_captions,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1520.0, style=ProgressStyle(description…




Using custom data configuration default


Downloading and preparing dataset text/default-6f7df17d102c4e7c (download: Unknown size, generated: Unknown size, post-processed: Unknown sizetotal: Unknown size) to /root/.cache/huggingface/datasets/text/default-6f7df17d102c4e7c/0.0.0/3a79870d85f1982d6a2af884fde86a71c771747b4b161fd302d28ad22adf985b...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-6f7df17d102c4e7c/0.0.0/3a79870d85f1982d6a2af884fde86a71c771747b4b161fd302d28ad22adf985b. Subsequent calls will reuse this data.


In [None]:
# Peek at the first five captions.
# Slice the rows first so only five examples are read, instead of loading the whole text column.
whole_dataset['train'][:5]['text']

['the medium sized bird has a dark grey color, a black downward curved beak, and long wings.\n',
 'the bird is dark grey brown with a thick curved bill and a flat shaped tail.\n',
 'bird has brown body feathers, white breast feathers and black beak\n',
 'this bird has a dark brown overall body color, with a small white patch around the base of the bill.\n',
 'the bird has very long and large brown wings, as well as a black body and a long black beak.\n']

In [None]:
# Set BERT tokenizer
from transformers import BertTokenizer

# Drive folder passed to from_pretrained as the download cache directory.
path_model = "/content/drive/My Drive/Colab Notebooks/t2i/BERT_CUB"

tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    cache_dir=path_model,
    do_lower_case=True,
)

In [None]:
# Number of captions in the train split.
# len() on the split returns its row count without loading the text column into a list.
len_dataset = len(whole_dataset['train'])
len_dataset

117880

In [None]:
# Preprocessing : Tokenizing and padding
def tokenize_captions(examples):
    """Tokenize one batch of captions handed over by ``Dataset.map``.

    ``padding=True`` pads every caption to the longest one in ``examples``
    (i.e. per map batch, not across the whole dataset).  ``truncation=True``
    is what makes ``max_length=80`` take effect; without it the limit is
    ignored and over-long captions pass through untouched.
    ``return_length=True`` adds a ``length`` column to the result.
    """
    return tokenizer(
        examples['text'],
        padding=True,
        truncation=True,
        max_length=80,
        return_length=True,
    )


encoded_pad_dataset_len = whole_dataset['train'].map(tokenize_captions, batched=True)

HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))




In [None]:
# Sanity check: the original caption text is still present after the map step.
first_three_rows = encoded_pad_dataset_len[:3]
first_three_rows['text']

['the medium sized bird has a dark grey color, a black downward curved beak, and long wings.\n',
 'the bird is dark grey brown with a thick curved bill and a flat shaped tail.\n',
 'bird has brown body feathers, white breast feathers and black beak\n']

In [None]:
from tqdm import tqdm
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)

In [None]:
class SimpleCustomBatch:
    """Batch container built from a list of tokenized examples.

    Each example is a mapping with ``input_ids``, ``token_type_ids``,
    ``attention_mask`` and ``length`` entries.  The values are gathered
    across the examples into ``torch.long`` tensors exposed as ``inp``,
    ``seg``, ``att`` and ``len``.
    """

    def __init__(self, data):
        def column(key):
            # Collect one field from every example into a single long tensor.
            return torch.tensor([example[key] for example in data], dtype=torch.long)

        self.inp = column('input_ids')
        self.seg = column('token_type_ids')
        self.att = column('attention_mask')
        self.len = column('length')

    def pin_memory(self):
        """Pin the ``inp``, ``seg`` and ``att`` tensors and return ``self``.

        ``len`` is left unpinned.
        """
        for attr_name in ('inp', 'seg', 'att'):
            pinned = getattr(self, attr_name).pin_memory()
            setattr(self, attr_name, pinned)
        return self

def collate_wrapper(batch):
    """Collate function: wrap the list of examples in a ``SimpleCustomBatch``."""
    custom_batch = SimpleCustomBatch(batch)
    return custom_batch

In [None]:
# Serve the tokenized dataset ten examples at a time; collate_wrapper turns each
# group into a SimpleCustomBatch, and pin_memory=True is passed to the loader.
data_loader = DataLoader(
    encoded_pad_dataset_len,
    collate_fn=collate_wrapper,
    batch_size=10,
    pin_memory=True,
)

In [None]:
# Collect the caption *.txt files in sorted order; each embedding tensor file (*.pt)
# is later saved under the same name with the extension swapped.
caption_text_path = "/content/drive/My Drive/Colab Notebooks/t2i/data/text_c10/*/*.txt"
caption_text_list = sorted(glob(caption_text_path))

## Save Embedding files

In [None]:
# Load pretrained model
from transformers import BertModel

# Weights are read from path_model; output_hidden_states=True asks the model to
# return the hidden states of its layers alongside the usual outputs.
model = BertModel.from_pretrained(
    path_model,
    output_hidden_states=True,
)
model.to('cuda')

In [None]:
model.eval()
# Wrap the loader itself (not the enumerate object) so tqdm can read
# len(data_loader) and display the total number of batches.
for i, batch in enumerate(tqdm(data_loader)):
    # The i-th batch is stored next to the i-th caption file, with the
    # extension replaced by ".pt".
    save_path = os.path.splitext(caption_text_list[i])[0] + ".pt"
    if os.path.isfile(save_path):
        # Existing files are kept as-is; delete them first to regenerate.
        continue
    with torch.no_grad():
        # Pass the tensors by keyword: BertModel.forward takes attention_mask
        # before token_type_ids, so a positional (inp, seg, att) call would feed
        # the segment ids in as the mask and the mask in as segment ids.
        hidden = model(input_ids=batch.inp.cuda(),
                       attention_mask=batch.att.cuda(),
                       token_type_ids=batch.seg.cuda())[2]
        # Index 2 is the tuple of per-layer hidden states (the model was loaded
        # with output_hidden_states=True); use the second-to-last layer.
        token = hidden[-2].to("cpu")
        # Sentence vector = mean over real tokens only. Padding positions are
        # zeroed out by the attention mask so they do not dilute the average.
        mask = batch.att.unsqueeze(-1).to(token.dtype)
        sentence = (token * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        bat = {"token": token, "sentence": sentence, "length": batch.len}
        torch.save(bat, save_path)

11788it [06:24, 30.64it/s]


## Zip the files
- This takes a long time; it is usually better to download the files directly from Google Drive.

In [None]:

!zip -r '/content/drive/My Drive/Colab Notebooks/t2i/data/text_c10_BERT_embedding.zip' '/content/drive/My Drive/Colab Notebooks/t2i/data/text_c10/'

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  adding: content/drive/My Drive/Colab Notebooks/t2i/data/text_c10/147.Least_Tern/Least_Tern_0131_153983.pt (deflated 7%)
  adding: content/drive/My Drive/Colab Notebooks/t2i/data/text_c10/147.Least_Tern/Least_Tern_0132_154149.pt (deflated 7%)
  adding: content/drive/My Drive/Colab Notebooks/t2i/data/text_c10/147.Least_Tern/Least_Tern_0133_153816.pt (deflated 7%)
  adding: content/drive/My Drive/Colab Notebooks/t2i/data/text_c10/189.Red_bellied_Woodpecker/ (stored 0%)
  adding: content/drive/My Drive/Colab Notebooks/t2i/data/text_c10/189.Red_bellied_Woodpecker/Red_Bellied_Woodpecker_0057_182154.txt (deflated 62%)
  adding: content/drive/My Drive/Colab Notebooks/t2i/data/text_c10/189.Red_bellied_Woodpecker/Red_Bellied_Woodpecker_0066_182253.txt (deflated 60%)
  adding: content/drive/My Drive/Colab Notebooks/t2i/data/text_c10/189.Red_bellied_Woodpecker/Red_Bellied_Woodpecker_0032_181587.txt (deflated 60%)
  adding: content/drive/My Drive/

In [None]:
# compress the *.pt files only
!find '/content/drive/My Drive/Colab Notebooks/t2i/data/text_c10/' -name '*.pt'
!tar cvf '/content/drive/My Drive/Colab Notebooks/t2i/data/text_c10_pt.tar' `find '/content/drive/My Drive/Colab Notebooks/t2i/data/text_c10/' -name '*.pt'`

## References
- [huggingface examples - language modeling](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb#scrollTo=VmaHZXzmkNtJ)  
- [BERT Fine-Tuning Tutorial with PyTorch, Chris McCormick and Nick Ryan](https://mccormickml.com/2019/07/22/BERT-fine-tuning/#31-bert-tokenizer)