# Installing CLIP

In [1]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git
!pip install torchtext

Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 KB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /private/var/folders/3n/qb9qrmdn45q_rjlp5x6g7k5c0000gn/T/pip-req-build-2tfatky2
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /private/var/folders/3n/qb9qrmdn45q_rjlp5x6g7k5c0000gn/T/pip-req-build-2tfatky2
  Resolved https://github.com/openai/CLIP.git to commit a9b1bf5920416aaeaec965c25dd9e8f98c864f16
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25ldone
[?25h  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369400 sha256=529e605f37c7a5d0875538caca5f390df4baf684001e48c4a0ddd9a

In [4]:
from PIL import Image
import torch
from torch import nn, optim
import glob
import os
import pandas as pd
import json
import numpy as np
import clip
from torch.utils.data import Dataset, DataLoader, BatchSampler
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import random
from matplotlib.pyplot import imshow
import torchtext
import nltk, re, string, collections
from nltk.util import ngrams
import collections
%matplotlib inline
BATCH_SIZE = 128
EPOCH = 5

# Preparing Model and Data

In [39]:
data = pd.read_csv("train_val_data/BTC_train.csv", lineterminator='\n', nrows = 10000).tweet
IMG_ROOT = "librosa-images"

img_paths = glob.glob(os.path.join(IMG_ROOT, "*.png"))

d = {}
for img_path in tqdm(img_paths):
    row = int(img_path.split('/')[1].split('.')[0])
    d[img_path] = [data[row]]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [40]:
d

{'librosa-images/3975.png': ['Core Inflation Rate YoY  https://t.co/YiTKXbSJWQ 🙋 Bet with $BTC via  https://t.co/4h0cyuWSHk √'],
 'librosa-images/8820.png': ['@CryptoGodJohn @mickyMafiaTrade $BTC 40K soon 👀'],
 'librosa-images/9280.png': ['@SpaceX @elonmusk @elonmusk still fight dude.. u can reach the orb moon again, keep going $BTC #BTC #BTCUSD #celousdt $celo $ATA'],
 'librosa-images/348.png': ['$BTC simple plan  https://t.co/GTC8f4IBGg'],
 'librosa-images/1804.png': ['#Bitcoin / $BTC  tbh, something like this would make a lot of sense imo.  It might even makes to much sense to happen...  https://t.co/Yjrqm65MLI'],
 'librosa-images/4968.png': ['$BTC volatility soon ⬆️ ⬇️  https://t.co/QUlFXgc64v'],
 'librosa-images/9294.png': ['@elonmusk please share your Bitcoin wallet address to track your movements. Please cash out as $Btc community will block your ‘next’ buyback'],
 'librosa-images/1810.png': ['$BTCUSD 🚨  BTC/USD Forex Signal: Descending Triangle Signals Breakout  https://t.co/A8

## Splitting 20% for Validation

In [41]:
train_img_paths, test_img_paths = train_test_split(img_paths, test_size=0.2, random_state=42)
d_train = {k: d[k] for k in train_img_paths}
d_test = {k: d[k] for k in test_img_paths}
len(d_train), len(d_test)

(8000, 2000)

## Loading Pre-trained CLIP Model and Preprocessor

In [43]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)

## MemeDataset

In [44]:
class MemeDataset(Dataset):
    def __init__(self, data, preprocess):
        self.preprocess = preprocess
        self.img_paths = []
        self.captions = []
        for img_path, captions in data.items():
            for cap in captions:
                self.img_paths.append(img_path)
                self.captions.append(cap)
        self.processed_cache = {}
        for img_path in data:
            self.processed_cache[img_path] = self.preprocess(Image.open(img_path))
        self.img_paths_set = list(data.keys())
        self.path2label = {path: self.img_paths_set.index(path) for path in self.img_paths_set}
        
    def __len__(self):
        return len(self.captions)

    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        image = self.processed_cache[img_path]
        caption = self.captions[idx]
        label = self.path2label[img_path]
        return image, caption, label

train_dataset = MemeDataset(d_train, preprocess)
test_dataset = MemeDataset(d_test, preprocess)
len(train_dataset), len(test_dataset), train_dataset[0]

(8000,
 2000,
 (tensor([[[-1.7923, -1.7923, -1.7923,  ...,  0.8647,  0.7917,  0.7333],
           [-1.7923, -1.7923, -1.7923,  ...,  0.8792,  0.8063,  0.7479],
           [-1.7923, -1.7923, -1.7923,  ...,  0.9376,  0.8647,  0.8063],
           ...,
           [-1.7923, -1.7923, -1.7923,  ..., -1.7923, -1.7923, -1.7923],
           [-1.7923, -1.7923, -1.7923,  ..., -1.7923, -1.7923, -1.7923],
           [-1.7923, -1.7923, -1.7923,  ..., -1.7923, -1.7923, -1.7923]],
  
          [[-1.7521, -1.7521, -1.7521,  ..., -0.9267, -0.9567, -0.9717],
           [-1.7521, -1.7521, -1.7521,  ..., -0.9267, -0.9567, -0.9717],
           [-1.7521, -1.7521, -1.7521,  ..., -0.9117, -0.9417, -0.9567],
           ...,
           [-1.7521, -1.7521, -1.7521,  ..., -1.7521, -1.7521, -1.7521],
           [-1.7521, -1.7521, -1.7521,  ..., -1.7521, -1.7521, -1.7521],
           [-1.7521, -1.7521, -1.7521,  ..., -1.7521, -1.7521, -1.7521]],
  
          [[-1.4802, -1.4802, -1.4802,  ...,  0.0555,  0.0129, -0.0298

In [45]:
i = 0
for k,v in train_dataset.path2label.items():
    i+=1
    print(k,v)
    if i == 10:
        break

librosa-images/3204.png 0
librosa-images/6765.png 1
librosa-images/1234.png 2
librosa-images/687.png 3
librosa-images/3313.png 4
librosa-images/297.png 5
librosa-images/6382.png 6
librosa-images/7851.png 7
librosa-images/249.png 8
librosa-images/5118.png 9


## BalancedBatchSampler (ensures no same class per batch)

In [46]:
# https://github.com/pytorch/pytorch/blob/e5742494f6080c8e6f43c37689fc18a7c4b39dfd/torch/utils/data/dataloader.py#L145
class BalancedBatchSampler(BatchSampler):
    """
    BatchSampler - from a MNIST-like dataset, samples n_classes and within these classes samples n_samples.
    Returns batches of size n_classes * n_samples
    """

    def __init__(self, labels, n_classes, n_samples):
        self.labels = labels
        self.labels_set = list(set(self.labels.numpy()))
        self.label_to_indices = {label: np.where(self.labels.numpy() == label)[0]
                                 for label in self.labels_set}
        for l in self.labels_set:
            np.random.shuffle(self.label_to_indices[l])
        self.used_label_indices_count = {label: 0 for label in self.labels_set}
        self.count = 0
        self.n_classes = n_classes
        self.n_samples = n_samples
        self.n_dataset = len(self.labels)
        self.batch_size = self.n_samples * self.n_classes

    def __iter__(self):
        self.count = 0
        while self.count + self.batch_size < self.n_dataset:
            classes = np.random.choice(self.labels_set, self.n_classes, replace=False)
            indices = []
            for class_ in classes:
                indices.extend(self.label_to_indices[class_][
                               self.used_label_indices_count[class_]:self.used_label_indices_count[
                                                                         class_] + self.n_samples])
                self.used_label_indices_count[class_] += self.n_samples
                if self.used_label_indices_count[class_] + self.n_samples > len(self.label_to_indices[class_]):
                    np.random.shuffle(self.label_to_indices[class_])
                    self.used_label_indices_count[class_] = 0
            yield indices
            self.count += self.n_classes * self.n_samples

    def __len__(self):
        return self.n_dataset // self.batch_size
    
train_labels = torch.tensor([item[2] for item in train_dataset])
train_sampler = BalancedBatchSampler(train_labels, BATCH_SIZE, 1)
train_dataloader = DataLoader(train_dataset, batch_sampler=train_sampler)

test_labels = torch.tensor([item[2] for item in test_dataset])
test_sampler = BalancedBatchSampler(test_labels, BATCH_SIZE, 1)
test_dataloader = DataLoader(test_dataset, batch_sampler=test_sampler)

# train_dataloader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True)
# test_dataloader = DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle=False)
# len(train_dataset), len(test_dataset), train_dataset[0]

In [47]:
for i, item in enumerate(train_sampler):
#     print(item)
#     print(len(item))
    labels = []
    for idx in item:
        label = train_dataset[idx][2]
        labels.append(label)
    break
len(labels), len(set(labels))

(128, 128)

In [48]:
for batch in train_dataloader:
    imgs, txts, labels = batch
    print(imgs.shape)
    print(len(txts))
    print(labels)
    print(labels.shape)
    print(torch.unique(labels).shape)
    break

torch.Size([128, 3, 224, 224])
128
tensor([4399, 1889, 7107, 2167, 4128, 2918, 6501,  561,  497, 6399,  274,  742,
        5147,  372, 7035, 6846, 1023, 6425, 3099, 1888, 4287, 4069, 1640, 6118,
        4907, 7489, 7849, 3908,  857, 5243, 4240,  703, 2584, 1952, 2954, 4679,
        2879, 7283, 2760, 2008,  770, 7025, 2351, 2382, 1262, 2559, 3098,  835,
        5756, 3246, 5532, 5732, 5592, 3856,  677, 7764, 3777, 4784, 3795,  354,
        5864, 2033, 3244, 1471, 5877, 6938, 4865, 6726, 2407, 6250, 4722, 5163,
        2626, 4412, 3064, 1022, 5352,  999, 3115, 7564, 4067, 1962, 5249, 5345,
        6830, 2130, 3997, 1540, 2629, 5151, 3561, 4088, 4261,  734, 1911, 1568,
        5494, 4247, 7661, 2457, 1750, 2371, 5211, 4831, 6754, 7480, 5828, 5024,
        5411, 7933, 6852, 4385, 3152, 5055, 6724, 5825, 4011, 6232, 5347, 6066,
        7087, 1916, 7400, 1367, 4993, 2728, 3840, 2859])
torch.Size([128])
torch.Size([128])


# Training

In [49]:
#https://github.com/openai/CLIP/issues/57
def convert_models_to_fp32(model): 
    for p in model.parameters(): 
        p.data = p.data.float() 
        p.grad.data = p.grad.data.float() 

if device == "cpu":
    model.float()

loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=5e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.2)
optimizer = optim.Adam(model.parameters(), lr=1e-5)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(train_dataloader)*EPOCH)

In [50]:
best_te_loss = 1e5
best_ep = -1
for epoch in range(EPOCH):
    print(f"running epoch {epoch}, best test loss {best_te_loss} after epoch {best_ep}")
    step = 0
    tr_loss = 0
    model.train()
    pbar = tqdm(train_dataloader, leave=False)
    for batch in pbar:
        step += 1
        optimizer.zero_grad()

        images, texts, _ = batch
        images = images.to(device)
        texts = clip.tokenize(texts).to(device)
#         print(images.shape, texts.shape)
        logits_per_image, logits_per_text = model(images, texts)
        ground_truth = torch.arange(BATCH_SIZE).to(device)

        total_loss = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2
        total_loss.backward()
        tr_loss += total_loss.item()
        if device == "cpu":
            optimizer.step()
            scheduler.step()
        else:
            convert_models_to_fp32(model)
            optimizer.step()
            scheduler.step()
            clip.model.convert_weights(model)
        pbar.set_description(f"train batchCE: {total_loss.item()}", refresh=True)
    tr_loss /= step
    
    step = 0
    te_loss = 0
    with torch.no_grad():
        model.eval()
        test_pbar = tqdm(test_dataloader, leave=False)
        for batch in test_pbar:
            step += 1
            images, texts, _ = batch
            images = images.to(device)
            texts = clip.tokenize(texts).to(device)
            logits_per_image, logits_per_text = model(images, texts)
            ground_truth = torch.arange(BATCH_SIZE).to(device)

            total_loss = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2
            te_loss += total_loss.item()
            test_pbar.set_description(f"test batchCE: {total_loss.item()}", refresh=True)
        te_loss /= step
        
    if te_loss < best_te_loss:
        best_te_loss = te_loss
        best_ep = epoch
        torch.save(model.state_dict(), "best_model.pt")
    print(f"epoch {epoch}, tr_loss {tr_loss}, te_loss {te_loss}")
torch.save(model.state_dict(), "last_model.pt")

running epoch 0, best test loss 100000.0 after epoch -1


  0%|          | 0/62 [00:00<?, ?it/s]

RuntimeError: Input Sep 16, 2021: The current Mayer Multiple is 1.05 with a $BTC price of $USD 47,992.34 and a 200 day moving average of $45,882.79 USD. The @TIPMayerMultple has historically been higher 63.49% of the time with an average of 1.44. Learn more at:  https://t.co/9n0xlTWuNP  https://t.co/bQ6z91Ghj0 is too long for context length 77

# Evaluating Precision on Validation Set

In [None]:
model.load_state_dict(torch.load("../input/clipfinetuneweights/best_model.pt"))
NUM_NEG = 127
NUM_TEST = 1000

In [None]:
n_correct = 0
for i in tqdm(range(NUM_TEST)):
    empty = True
    while empty:
        img_path = random.choice(list(d_test.keys()))
        image = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
        name = img_path.split('/')[-1].split('.')[0]
        caps = d_test[img_path]
        if len(caps) > 0:
            pos_txt = random.choice(caps)
        #         pos_txt = ' '.join(pos_txt)
            empty = False
#     print(pos_txt)
    neg_i = 0
    neg_txts = []
    while neg_i < NUM_NEG:
        img_path = random.choice(list(d_test.keys()))
        neg_name = img_path.split('/')[-1].split('.')[0]
        if neg_name == name:
            continue
        caps = d_test[img_path]
        if len(caps) == 0:
            continue
        neg_txt = random.choice(caps)
        if neg_txt in neg_txts:
            continue
        neg_txts.append(neg_txt)
        neg_i += 1
#     print(name)
#     print(f"Positive caption: {pos_txt}")
#     print(f"Negative caption: {neg_txts}")
    text = clip.tokenize([pos_txt]+neg_txts).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)

        logits_per_image, logits_per_text = model(image, text)
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()

#     print("Label probs:", probs)
#     print(np.argmax(probs))
    if np.argmax(probs) == 0:
        n_correct +=1
print(f"Test precision {n_correct/NUM_TEST}")

# Evaluating BLEU and Word Diversity using Naive Sampling

## Sampling Captions for Validation Images According to CLIP Text-Image Proximity

In [None]:
def sample1Caption(img_path, corpus, model, num_cand):
    image = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
    i = 0
    txts = []
    while i < num_cand:
        txt = random.choice(corpus)
        if txt in txts:
            continue
        if len(txt.split())<5 or len(txt)>72:
            continue
        txts.append(txt)
        i += 1
    #     print(name)
    #     print(f"Positive caption: {pos_txt}")
    #     print(f"Negative caption: {neg_txts}")
    text = clip.tokenize(txts).to(device)

    with torch.no_grad():
        logits_per_image, logits_per_text = model(image, text)
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()

    #     print("Label probs:", probs)
    #     print(np.argmax(probs))
    #     imshow(np.asarray(Image.open(img_path)))
    return txts[np.argmax(probs)]

In [None]:
model.load_state_dict(torch.load("../input/clipfinetuneweights/best_model.pt"))
corpus = []
for txtlist in d_train.values():
    corpus += txtlist
len(corpus), corpus[0]

In [None]:
captions = {}
for img_path in tqdm(d_test.keys()):
    caption = sample1Caption(img_path, corpus, model, 1000)
    captions[img_path] = caption

## BLEU Score

In [None]:
for get_bleu in range(1,4):
    bleu_x_lst = []
    bleu_y_lst = []
    for p, caps in d_test.items():
        if not caps:
            continue
        bleu_x_lst.append(captions[p].split())
        splittedcaps = [x.split() for x in caps]
        bleu_y_lst.append(splittedcaps)
    BLEU = torchtext.data.metrics.bleu_score(bleu_x_lst, bleu_y_lst, max_n=get_bleu, weights=[1/get_bleu]*get_bleu)
    print(f"{get_bleu}-gram BLEU score: {BLEU}")

## Word Diversity

In [None]:
sentences = list(captions.values())
BigramCtr = collections.Counter()
UnigramCtr = collections.Counter()
for sentence in sentences:
    BigramCtr.update(nltk.ngrams(sentence, 2))
    UnigramCtr.update(nltk.ngrams(sentence, 1))
# print("Unigram count:",len(BigramCtr)/len(sentences))
# print("Bigram count:",len(UnigramCtr)/len(sentences))
print("Unigram count:",len(BigramCtr))
print("Bigram count:",len(UnigramCtr))

# Case Analysis on Seen and Unseen Images

In [None]:
seen_path = random.choice(list(d_train.keys()))
pred_cap_seen = sample1Caption(seen_path, corpus, model, 1000)
gt_cap_seen = d_train[seen_path][:5]
imshow(Image.open(seen_path))
print(f"Some ground truth captions for this seen image: {gt_cap_seen}")
print(f"Caption sampled by fintuned CLIP for this seen image: {pred_cap_seen}")

In [None]:
unseen_path = random.choice(list(d_test.keys()))
pred_cap_unseen = sample1Caption(unseen_path, corpus, model, 1000)
imshow(Image.open(unseen_path))
gt_cap_unseen = d_test[unseen_path][:5]
print(f"Some ground truth captions for this unseen image: {gt_cap_unseen}")
print(f"Caption sampled by fintuned CLIP for this unseen image: {pred_cap_unseen}")