In [None]:

import glob
import os
import random

image_path='/home/jason/data/coco/images/'
mode='train2014/'
image_list=[]
image_list.extend(glob.glob(os.path.join(image_path,mode, '*.jpg')))
image_list.sort()
print(len(image_list))

text_path='/home/jason/data/coco/text/'
label_list = []
label_list.extend(glob.glob(os.path.join(text_path,mode, '*.txt')))
label_list.sort()
print(len(label_list))

with open(label_list[0], "r") as f:
    data = f.readlines()
    label = random.choice(data)
    print(label)

In [None]:
#https://github.com/openai/CLIP/issues/57
# old fine-tuned based clip

import os
import torch
import glob
from PIL import Image
import random
import clip
from tqdm.notebook import tqdm
import numpy as np
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim
import numpy as np
from tqdm.notebook import tqdm

EPOCH =10
BATCH_SIZE =32

device = "cuda:2" if torch.cuda.is_available() else "cpu" # If using GPU then use mixed precision training.
model, preprocess = clip.load("ViT-B/32",device=device,jit=False) #Must set jit=False for training

class cocodtrain(torch.utils.data.Dataset):
    def __init__(self, image_path='/home/jason/data/coco/images', text_path='/home/jason/data/coco/text', mode='train2014'):

        self.image_list = []
        self.image_list.extend(glob.glob(os.path.join(image_path, mode, '*.jpg')))
        self.image_list.sort()

        self.label_list = []
        self.label_list.extend(glob.glob(os.path.join(text_path, mode, '*.txt')))
        self.label_list.sort()

    def __len__(self):
        return len(self.image_list)

    def __getitem__(self, index):
        image = Image.open(self.image_list[index]).convert("RGB")
        image = image.resize((224,224), Image.BILINEAR)
        image = preprocess(image)
        #image = np.asarray(image)

        with open(self.label_list[index], "r") as f:
            data = f.readlines()
            label = random.choice(data)
            
        return image, label
trainset = cocodtrain('/home/jason/data/coco/images','/home/jason/data/coco/text','train2014')
trainloader = torch.utils.data.DataLoader(
                    trainset, 
                    batch_size=BATCH_SIZE,
                    shuffle=True, 
                    num_workers=16,
                    drop_last=True)

def convert_models_to_fp32(model): 
    for p in model.parameters(): 
        p.data = p.data.float() 
        p.grad.data = p.grad.data.float() 

#device = "cuda:3" if torch.cuda.is_available() else "cpu" # If using GPU then use mixed precision training.
#model, preprocess = clip.load("ViT-B/32",device=device,jit=False) #Must set jit=False for training

#clip.model.convert_weights(model) # Actually this line is unnecessary since clip by default already on float16

loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.2) #Params used from paper, the lr is smaller, more safe for fine tuning to new dataset

for epoch in range(EPOCH):
    print('epoch:', epoch)
    for batch in tqdm(trainloader):
        optimizer.zero_grad()
        list_image,list_txt = batch #list_images is list of image in numpy array(np.uint8), or list of PIL images
        # print(list_image.size()) #torch.Size([32, 3, 224, 224])
        # print(len(list_txt)) #32
      
        images = torch.tensor(np.stack(list_image)).to(device)
        texts = clip.tokenize(list_txt).to(device) #torch.Size([32, 77])
         # print(texts.size()) #torch.Size([32, 77])
        logits_per_image, logits_per_text = model(images, texts)
        #print(logits_per_image.size(),logits_per_text.size()) #torch.Size([32, 32])  torch.Size([32, 32]) 
        ground_truth = torch.arange(BATCH_SIZE,dtype=torch.long,device=device)
        #print('ground.size()',ground_truth.size()) #torch.Size([32])

        total_loss = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2
        print('total_loss',total_loss)
        total_loss.backward()
      
        convert_models_to_fp32(model)
        optimizer.step()
        clip.model.convert_weights(model)
    
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': total_loss,
    }, f"model_checkpoint/model_10.pt") #just change to your preferred folder/filename      

In [None]:
model, preprocess = clip.load("ViT-B/32",device=device,jit=False) #Must set jit=False for training
checkpoint = torch.load("model_checkpoint/model_10.pt")
#print(model.input_resolution)
# Use these 3 lines if you use default model setting(not training setting) of the clip. For example, if you set context_length to 100 since your string is very long during training, then assign 100 to checkpoint['model_state_dict']["context_length"] 
# checkpoint['model_state_dict']["input_resolution"] = model.input_resolution #default is 224
# checkpoint['model_state_dict']["context_length"] = model.context_length # default is 77
# checkpoint['model_state_dict']["vocab_size"] = model.vocab_size 

model.load_state_dict(checkpoint['model_state_dict'])

In [26]:
# new torch based clip
import os
import torch
import glob
from PIL import Image
import random
import clip
from tqdm.notebook import tqdm
import numpy as np
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim
import numpy as np
from tqdm.notebook import tqdm


from clip.model import Gauss_model
from tqdm.notebook import tqdm

device = "cuda:3" if torch.cuda.is_available() else "cpu" # If using GPU then use mixed precision training.
model_g = Gauss_model().to(device)
_, preprocess = clip.load("ViT-B/32",device=device,jit=False) #Must set jit=False for training


EPOCH =10
BATCH_SIZE =32

class cocodtrain(torch.utils.data.Dataset):
    def __init__(self, image_path='/home/jason/data/coco/images', text_path='/home/jason/data/coco/text', mode='train2014'):

        self.image_list = []
        self.image_list.extend(glob.glob(os.path.join(image_path, mode, '*.jpg')))
        self.image_list.sort()

        self.label_list = []
        self.label_list.extend(glob.glob(os.path.join(text_path, mode, '*.txt')))
        self.label_list.sort()

    def __len__(self):
        return len(self.image_list)

    def __getitem__(self, index):
        image = Image.open(self.image_list[index]).convert("RGB")
        image = image.resize((224,224), Image.BILINEAR)
        image = preprocess(image)
        #image = np.asarray(image)

        with open(self.label_list[index], "r") as f:
            data = f.readlines()
            label = random.choice(data)
            
        return image, label

trainset = cocodtrain('/home/jason/data/coco/images','/home/jason/data/coco/text','train2014')
trainloader = torch.utils.data.DataLoader(
                    trainset, 
                    batch_size=BATCH_SIZE,
                    shuffle=True, 
                    num_workers=0,
                    drop_last=True)
loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_g.parameters(), lr=5e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.2) #Params used from paper, the lr is smaller, more safe for fine tuning to new dataset

n = BATCH_SIZE
Kl_matric = torch.ones([n,n])

for epoch in range(EPOCH):
    print('epoch:', epoch)
    for batch in tqdm(trainloader):
        
        list_image,list_txt = batch #list_images is list of image in numpy array(np.uint8), or list of PIL images
        # print(list_image.size()) #torch.Size([32, 3, 224, 224])
        #print(len(list_txt))
        images = torch.tensor(np.stack(list_image),requires_grad=True).to(device)
        #print('image size:',images.size()) #image size: torch.Size([32, 3, 224, 224])
        texts = clip.tokenize(list_txt).to(device) #torch.Size([32, 77])
        #  print(texts.size()) #torch.Size([32, 77])
        image_u,image_std,text_u,text_std= model_g(images, texts)
        #print(image_u.size()) #torch.Size([32, 512])

        for i in range(n):
            for j in range(n):
                (mu1, Sigma1) =  image_u[i], image_std[i]
                (mu2, Sigma2) = text_u[j], text_std[j]

                p_distribution = torch.distributions.MultivariateNormal(mu1, torch.diag_embed(Sigma1))
                q_distribution = torch.distributions.MultivariateNormal(mu2, torch.diag_embed(Sigma2))
                Kl_matric[i,j]  = torch.distributions.kl_divergence(p_distribution, q_distribution)
        
        # print(Kl_matric)
        #Kl_matric.requires_grad=True
        logits_per_image = Kl_matric.to(device)
        logits_per_text = logits_per_image.t()
        print(logits_per_image.size(),logits_per_text.size())

        ground_truth = torch.arange(BATCH_SIZE,dtype=torch.long,device=device)
        print(ground_truth.size())
        total_loss_ = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2
        
        optimizer.zero_grad()
        total_loss_.backward(retain_graph=True)
        print('total loss:', total_loss_)
      
        #convert_models_to_fp32(model)
        optimizer.step()
        #clip.model.convert_weights(model)


epoch: 0


  0%|          | 0/2586 [00:00<?, ?it/s]

RuntimeError: cholesky_cuda: U(1,1) is zero, singular U.

In [28]:
#new numpy based clip
import os
import torch
import glob
from PIL import Image
import random
import clip
from tqdm.notebook import tqdm
import numpy as np
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim
import numpy as np
from tqdm.notebook import tqdm


from clip.model import Gauss_model
from tqdm.notebook import tqdm

device = "cuda:3" if torch.cuda.is_available() else "cpu" # If using GPU then use mixed precision training.
model_g = Gauss_model().to(device)
_, preprocess = clip.load("ViT-B/32",device=device,jit=False) #Must set jit=False for training


EPOCH =10
BATCH_SIZE =32

class cocodtrain(torch.utils.data.Dataset):
    def __init__(self, image_path='/home/jason/data/coco/images', text_path='/home/jason/data/coco/text', mode='train2014'):

        self.image_list = []
        self.image_list.extend(glob.glob(os.path.join(image_path, mode, '*.jpg')))
        self.image_list.sort()

        self.label_list = []
        self.label_list.extend(glob.glob(os.path.join(text_path, mode, '*.txt')))
        self.label_list.sort()

    def __len__(self):
        return len(self.image_list)

    def __getitem__(self, index):
        image = Image.open(self.image_list[index]).convert("RGB")
        image = image.resize((224,224), Image.BILINEAR)
        image = preprocess(image)
        #image = np.asarray(image)

        with open(self.label_list[index], "r") as f:
            data = f.readlines()
            label = random.choice(data)
            
        return image, label

trainset = cocodtrain('/home/jason/data/coco/images','/home/jason/data/coco/text','train2014')
trainloader = torch.utils.data.DataLoader(
                    trainset, 
                    batch_size=BATCH_SIZE,
                    shuffle=True, 
                    num_workers=0,
                    drop_last=True)
loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_g.parameters(), lr=5e-6,betas=(0.9,0.98),eps=1e-6,weight_decay=0.2) #Params used from paper, the lr is smaller, more safe for fine tuning to new dataset


def loss_multivariate_normal_kl2(mu_1, sigmasquare_1, mu_2, sigmasquare_2):
    b, dim = mu_1.shape
    kl = 0
    kkll = torch.ones(b,b)
    for x in range(b):
        for y in range(b):
            c_mu_1 = mu_1[x]
            c_Sigma_1 = torch.diag(sigmasquare_1[x])

            c_mu_2 = mu_2[y]
            c_Sigma_2 = torch.diag(sigmasquare_2[y])

            p1 = torch.prod(sigmasquare_2)
            p2 = torch.prod(sigmasquare_1)
            if p1 == 0 or p2 ==0:
                first = 0
            else:
                first = p1.log() - p2.log()

            #first = c_Sigma_2.det().log() - c_Sigma_1.det().log()
            second = -dim
            third = torch.matmul(c_Sigma_2.inverse(), c_Sigma_1).trace()
            fourth = torch.matmul(torch.matmul((c_mu_2 - c_mu_1).T, c_Sigma_2.inverse()), c_mu_2 - c_mu_1)
            kl = 0.5 * (first + second + third + fourth)
            kkll[x,y] = kl
    return kkll

n = BATCH_SIZE
Kl_matric = np.ones([n,n])

for epoch in range(EPOCH):
    print('epoch:', epoch)
    for batch in tqdm(trainloader):
        optimizer.zero_grad()
        list_image,list_txt = batch #list_images is list of image in numpy array(np.uint8), or list of PIL images
        # print(list_image.size()) #torch.Size([32, 3, 224, 224])
        #print(len(list_txt))
        images = torch.tensor(np.stack(list_image),requires_grad=True).to(device)
        #print('image size:',images.size()) #image size: torch.Size([32, 3, 224, 224])
        texts = clip.tokenize(list_txt).to(device) #torch.Size([32, 77])
        #  print(texts.size()) #torch.Size([32, 77])
        image_u,image_std,text_u,text_std= model_g(images, texts)
        #print(image_u.size()) #torch.Size([32, 512])

        p_q_kl = loss_multivariate_normal_kl2(image_u,image_std,text_u,text_std)
        #print(p_q_kl.size())
        # print(Kl_matric.requires_grad)
        logits_per_image = torch.tensor(p_q_kl,requires_grad=True).to(device)
        logits_per_text = logits_per_image.t()
        #print(logits_per_image.size(),logits_per_text.size())

        ground_truth = torch.arange(BATCH_SIZE,dtype=torch.long,device=device)
        #print(ground_truth.size())
        total_loss = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2
        total_loss.backward()
        print('total loss:', total_loss)
      
        #convert_models_to_fp32(model)
        optimizer.step()
        #clip.model.convert_weights(model)

torch.save({
    'epoch': epoch,
    'model_state_dict': model_g.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': total_loss,
    }, f"model_checkpoint/model_10.pt") #just change to your preferred folder/filename   

epoch: 0


  0%|          | 0/2586 [00:00<?, ?it/s]

  logits_per_image = torch.tensor(p_q_kl,requires_grad=True).to(device)


total loss: tensor(89338.2500, device='cuda:3', grad_fn=<DivBackward0>)
total loss: tensor(13404.0459, device='cuda:3', grad_fn=<DivBackward0>)
total loss: tensor(1953.9993, device='cuda:3', grad_fn=<DivBackward0>)
total loss: tensor(2303.8918, device='cuda:3', grad_fn=<DivBackward0>)
total loss: tensor(6578.2305, device='cuda:3', grad_fn=<DivBackward0>)
total loss: tensor(2087.3604, device='cuda:3', grad_fn=<DivBackward0>)
total loss: tensor(8010.4668, device='cuda:3', grad_fn=<DivBackward0>)
total loss: tensor(65556.3047, device='cuda:3', grad_fn=<DivBackward0>)
total loss: tensor(3617.5542, device='cuda:3', grad_fn=<DivBackward0>)


KeyboardInterrupt: 

: 

In [20]:
import os
import torch
import numpy

n = 1 #batch size

image_u = torch.tensor([0.2, 0.1, 0.5, 0.4])
image_std= torch.tensor([0.14, 0.52, 0.2, 0.4])

text_u = torch.tensor([0.3, 0.6, -0.5, -0.8])
text_std = torch.tensor([0.24, 0.02, 0.31, 0.51])

def multivar_continue_KL_divergence2(mu1, Sigma1,mu2, Sigma2):
    # print(q[0].shape,q[1].shape)
    # print(p[0].shape,p[1].shape)
    a = torch.log(torch.det(Sigma2)/torch.det(Sigma1)) 
    print('a:',a)
    b =torch.matmul(torch.inverse(Sigma2), Sigma1).trace()
    print('b:',b) 
    c = torch.matmul(torch.matmul((mu2 - mu1).t(), torch.inverse(Sigma2)), (mu2 - mu1))
    print('c:',c)
    n = Sigma1.size()[0]
    print('n',n)
    return 0.5 * (a - n + b + c)


# p = (mu1, Sigma1) = torch.transpose(image_u,-1,0), torch.diag_embed(image_std)
p = (mu1, Sigma1) = image_u.t(), torch.diag_embed(image_std)

# q = (mu2, Sigma2) = torch.transpose(text_u,-1,0), torch.diag_embed(text_std)
q = (mu2, Sigma2) = text_u.t(), torch.diag_embed(text_std)

print(multivar_continue_KL_divergence2(mu1, Sigma1,mu2, Sigma2))  # 20.28295597572157
# print(multivar_continue_KL_divergence(q, p))  # 5.883921991346153

a: tensor(-2.0379)
b: tensor(28.0128)
c: tensor(18.5910)
n 4
tensor(20.2830)


In [2]:
import torch
import numpy as np
n = 4 #batch size

mu1 = torch.rand(n, 512)
print(mu1.shape)
Sigma1=torch.rand(n, 512)

mu2 = torch.rand(n,512)
Sigma2=torch.rand(n,512)

# mu1 = torch.tensor([[0.2, 0.1, 0.5, 0.4],[0.2, 0.1, 0.5, 0.4]])#.unsqueeze(0)
# print(mu1.shape)
# Sigma1= torch.tensor([[0.14, 0.52, 0.2, 0.4],[0.14, 0.52, 0.2, 0.4]])#.unsqueeze(0)

# mu2 = torch.tensor([[0.3, 0.6, -0.5, -0.8],[0.3, 0.6, -0.5, -0.8]])#.unsqueeze(0)
# Sigma2 = torch.tensor([[0.24, 0.02, 0.31, 0.51],[0.24, 0.02, 0.31, 0.51]])#.unsqueeze(0)


#Method 2
def loss_multivariate_normal_kl2(mu_1, sigmasquare_1, mu_2, sigmasquare_2):
    b, dim = mu_1.shape
    kl = 0
    kkll = torch.ones(b,b)
    for x in range(b):
        for y in range(b):
            c_mu_1 = mu_1[x]
            c_Sigma_1 = torch.diag(sigmasquare_1[x])

            c_mu_2 = mu_2[y]
            c_Sigma_2 = torch.diag(sigmasquare_2[y])

            p1 = torch.prod(sigmasquare_2)
            p2 = torch.prod(sigmasquare_1)
            if p1 == 0 or p2 ==0:
                first = 0
            else:
                first = p1.log() - p2.log()

            #first = c_Sigma_2.det().log() - c_Sigma_1.det().log()
            second = -dim
            third = torch.matmul(c_Sigma_2.inverse(), c_Sigma_1).trace()
            fourth = torch.matmul(torch.matmul((c_mu_2 - c_mu_1).T, c_Sigma_2.inverse()), c_mu_2 - c_mu_1)
            kl = 0.5 * (first + second + third + fourth)
            kkll[x,y] = kl
    return kkll

#Method 3
def loss_multivariate_normal_kl3(mu_1, sigmasquare_1, mu_2, sigmasquare_2):
    b, dim = mu_1.shape
    kl = 0

    c_mu_1=mu_1.unsqueeze(1)
    Sigma1=sigmasquare_1.unsqueeze(1)

    c_mu_2=mu_2.unsqueeze(0)
    Sigma2=sigmasquare_2.unsqueeze(0)

    # for x in range(b):
    #     for y in range(b):

    c_Sigma_1 = torch.diag(sigmasquare_1).unsqueeze(1)
    c_Sigma_2 = torch.diag(sigmasquare_2).unsqueeze(0)

    p1 = torch.prod(Sigma1)
    p2 = torch.prod(Sigma2)
    if p1 == 0 or p2 ==0:
        first = 0
    else:
        first = p1.log() - p2.log()

    #first = c_Sigma_2.det().log() - c_Sigma_1.det().log()
    second = -dim
    third = torch.matmul(c_Sigma_2.inverse(), c_Sigma_1).trace()
    fourth = torch.matmul(torch.matmul((c_mu_2 - c_mu_1).T, c_Sigma_2.inverse()), c_mu_2 - c_mu_1)
    kl = 0.5 * (first + second + third + fourth)

    return kl



print(loss_multivariate_normal_kl2(mu1,Sigma1,mu2,Sigma2))

mu1=mu1.unsqueeze(1)
Sigma1=Sigma1.unsqueeze(1)

mu2=mu2.unsqueeze(0)
Sigma2=Sigma2.unsqueeze(0)

#Method 2
p_distribution = torch.distributions.MultivariateNormal(mu1, torch.diag_embed(Sigma1))
q_distribution = torch.distributions.MultivariateNormal(mu2, torch.diag_embed(Sigma2))
p_q_kl = torch.distributions.kl_divergence(p_distribution, q_distribution)#.mean()
print(p_q_kl)



torch.Size([4, 512])
tensor([[1340.9336,  938.8438, 5507.3130,  706.9279],
        [1225.6526,  877.7920, 3227.4597,  717.4406],
        [1719.1436, 1162.5123, 4957.9834,  649.7206],
        [1188.8816,  983.1750, 3340.1199,  670.3173]])
tensor([[1331.6353,  961.6362, 5491.2939,  726.7874],
        [1219.8347,  904.0649, 3214.9216,  740.7805],
        [1693.1235, 1168.5833, 4925.2441,  652.8585],
        [1180.8872, 1007.2716, 3325.4053,  691.4807]])


In [23]:
import torch
import numpy as np
n = 4 #batch size


mu1 = torch.tensor([0.2, 0.1, 0.5, 0.4]).unsqueeze(0)
Sigma1= torch.tensor([0.14, 0.52, 0.2, 0.4]).unsqueeze(0)

mu2 = torch.tensor([0.3, 0.6, -0.5, -0.8]).unsqueeze(0)
Sigma2 = torch.tensor([0.24, 0.02, 0.31, 0.51]).unsqueeze(0)

#Method 1 
def loss_multivariate_normal_kl(mu_1, sigmasquare_1, mu_2, sigmasquare_2):
    b, dim = mu_1.shape
    print(b,dim)
    kl = 0

    for bidx in range(b):
        c_mu_1 = mu_1[bidx]
        c_Sigma_1 = torch.diag(sigmasquare_1[bidx])

        c_mu_2 = mu_2[bidx]
        c_Sigma_2 = torch.diag(sigmasquare_2[bidx])

        first = c_Sigma_2.det().log() - c_Sigma_1.det().log()
        second = -dim
        third = torch.matmul(c_Sigma_2.inverse(), c_Sigma_1).trace()
        fourth = torch.matmul(torch.matmul((c_mu_2 - c_mu_1).T, c_Sigma_2.inverse()), c_mu_2 - c_mu_1)
        kl = kl + 0.5 * (first + second + third + fourth)

    kl = kl / b
    return kl
print(loss_multivariate_normal_kl(mu1,Sigma1,mu2,Sigma2))

#Method 2
p_distribution = torch.distributions.MultivariateNormal(mu1, torch.diag_embed(Sigma1))
q_distribution = torch.distributions.MultivariateNormal(mu2, torch.diag_embed(Sigma2))
p_q_kl = torch.distributions.kl_divergence(p_distribution, q_distribution)#.mean()
print(p_q_kl)



1 4
tensor(20.2830)
tensor([20.2830])
