In [1]:
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import types
import torch
import utils
import text_network
import TeacherNet
import ImageNet
import torch.optim as optim
import time
import sys
from torch.autograd import Variable


from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


Torch Version:  1.3.0+cu92


In [2]:
train_img = torch.load("../hci-intermodal-reasoning/cached_data/train_img")
train_cap = torch.load("../hci-intermodal-reasoning/cached_data/train_cap")
train_mask = torch.load("../hci-intermodal-reasoning/cached_data/train_mask")

val_img = torch.load("../hci-intermodal-reasoning/cached_data/val_img")
val_cap = torch.load("../hci-intermodal-reasoning/cached_data/val_cap")
val_mask = torch.load("../hci-intermodal-reasoning/cached_data/val_mask")

print("Loaded train data", train_img.size(), train_cap.size(), train_mask.size())
print("Loaded val data", val_img.size(), val_cap.size(), val_mask.size())



Loaded train data torch.Size([10000, 3, 224, 224]) torch.Size([10000, 52]) torch.Size([10000, 52])
Loaded val data torch.Size([5000, 3, 224, 224]) torch.Size([5000, 43]) torch.Size([5000, 43])


In [3]:
DELTA = 0.1
BATCH_SIZE = 8
NB_EPOCHS = 10
LOGGER = utils.Logger()

# VOCAB_SIZE = len(train_mask)
# EMBED_DIM = train_cap.size()[1]
# NUN_CLASS = len(train_cap)
# model = TextNet.Text_Net(VOCAB_SIZE, EMBED_DIM, NUN_CLASS)


In [4]:
train_data = TensorDataset(train_img, train_cap, train_mask)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE, num_workers=2)
valid_data = TensorDataset(val_img, val_cap, val_mask)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=BATCH_SIZE * 2, num_workers=2)

device = torch.device("cuda:1")
text_net = text_network.TextNet()
#text_net.to(device)
#text_net = TextNet.Text_Net(VOCAB_SIZE, EMBED_DIM, NUN_CLASS)


model_name = "alexnet"
feature_extract = True
vision_net = ImageNet.Image_Net()
vision_net = vision_net.initialize_model(model_name, feature_extract, use_pretrained=True)
vision_net.to(device)


teacher_net = TeacherNet.Teacher_Net()
ranking_loss = TeacherNet.RankingLossFunc(DELTA)
teacher_net.to(device)
ranking_loss.to(device)

embeds = Variable(torch.Tensor((BATCH_SIZE, 16, 300)), requires_grad=True)

In [5]:
# optimizer
params_to_update_share = []
params_to_update_img = vision_net.parameters()
params_to_update_txt = []


params_to_update = list(params_to_update_share) + list(params_to_update_img) + list(params_to_update_txt)
optimizer = optim.Adam(params_to_update, lr=0.0001)

print("Start to train")
start_time = time.time()
for epoch in range(NB_EPOCHS):
    running_loss = 0.0
    running_corrects = 0.0
    total_samples = 0

    for step, batch in enumerate(train_dataloader):
        img, cap, mask = tuple(t.to(device) for t in batch)
        
        
        with torch.set_grad_enabled(False):
            img_feature = vision_net.forward(img)
            txt_feature = text_net.forward(cap,mask)

        with torch.set_grad_enabled(True):
            img_vec = teacher_net.forward(img_feature)
            txt_vec = teacher_net.forward(txt_feature)
    
            loss = ranking_loss(img_vec, txt_vec)
            preds = teacher_net.predict(img_vec, txt_vec)
            optimizer.step()
            optimizer.zero_grad()
    
        running_loss += loss.item() * BATCH_SIZE
        running_corrects += sum([(i == preds[i]) for i in range(len(preds))])
        total_samples += len(preds)
        #print(running_loss)
        
    print("Epoch %d: train loss = %f" % (epoch, running_loss))
    print(("          train acc = %f (%d/%d)" % (float(running_corrects/total_samples), running_corrects, total_samples)))

print("Training done in %f mins" % ((time.time()-start_time)/60))

Start to train
Epoch 0: train loss = 55999.996223
          train acc = 0.133700 (1337/10000)
Epoch 1: train loss = 55999.970718
          train acc = 0.128200 (1282/10000)
Epoch 2: train loss = 55999.965599
          train acc = 0.132800 (1328/10000)
Epoch 3: train loss = 55999.947254
          train acc = 0.130300 (1303/10000)
Epoch 4: train loss = 55999.971916
          train acc = 0.128300 (1283/10000)
Epoch 5: train loss = 55999.986710
          train acc = 0.127600 (1276/10000)
Epoch 6: train loss = 55999.982712
          train acc = 0.132200 (1322/10000)
Epoch 7: train loss = 55999.986256
          train acc = 0.129500 (1295/10000)
Epoch 8: train loss = 55999.974442
          train acc = 0.131700 (1317/10000)
Epoch 9: train loss = 55999.979477
          train acc = 0.129600 (1296/10000)
Training done in 29.363260 mins
