In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.getcwd() 
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import warnings
warnings.filterwarnings("ignore")
from T5 import *
from datasets import load_dataset
from transformers import T5Tokenizer
from MT_hyperparams import *
import torch.backends.cudnn as cudnn
from utils import *
from attention_params import *
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, SubsetRandomSampler
from torch.autograd import Variable
from losses import *
from architect import *
import logging
import sys
import time

In [3]:
# Timestamp that gives every run its own log file (strftime defaults to local time).
now = time.strftime("%Y-%m-%d-%H_%M_%S")
log_format = '%(asctime)s |\t  %(message)s'
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
    format=log_format, datefmt='%m/%d %I:%M:%S %p')

# logging.FileHandler does not create missing directories, so make sure the
# log directory exists before opening the file (fresh checkouts have none).
log_dir = "./log/"
os.makedirs(log_dir, exist_ok=True)
fh = logging.FileHandler(os.path.join(log_dir, now+'.txt'))
fh.setFormatter(logging.Formatter(log_format))
logging.getLogger().addHandler(fh)

# English-French parallel corpus; log its structure and one sample pair.
dataset = load_dataset('opus_euconst','en-fr')
logging.info(dataset)
logging.info(dataset['train'][5])

# Setting the seeds
# NOTE(review): cudnn.benchmark = True lets cuDNN pick kernels by timing them,
# which may make runs non-deterministic despite the seeds below - confirm this
# trade-off is intended.
np.random.seed(seed_)
torch.cuda.set_device(0)
cudnn.benchmark = True
torch.manual_seed(seed_)
cudnn.enabled=True
torch.cuda.manual_seed(seed_)

12/25 05:15:37 PM |	  Reusing dataset opus_euconst (/home/li/.cache/huggingface/datasets/opus_euconst/en-fr/1.0.0/d1e611a011f28fdda67a97024820e0a3813b4e4decca194d9a20b3207a39b908)


  0%|          | 0/1 [00:00<?, ?it/s]

12/25 05:15:37 PM |	  DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 10104
    })
})
12/25 05:15:37 PM |	  {'translation': {'en': 'CONSIDERING that Article IV-437(2)(e) of the Constitution provides that the Treaty of 16 April 2003 concerning the accessions referred to above shall be repealed;  ', 'fr': "CONSIDÉRANT que l'article\xa0IV-437, paragraphe\xa02, point\xa0e), de la Constitution prévoit l'abrogation du traité du 16\xa0avril 2003 relatif aux adhésions visées ci-dessus;  "}}


In [4]:
# Load the tokenizer.
import random
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# reduction='none' keeps one loss value per target position; positions whose
# target is the pad token are ignored.
criterion = torch.nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id, reduction='none')

# Split sizes: 3/4 train, 1/8 validation, 1/8 test. Integer division means a
# few rows between the validation and test slices may end up unused.
L = len(dataset['train'])
L_t = L//4*3
L_v = L//8
L_test = L//8
# NOTE(review): this rebinds `dataset`, so re-running the cell shuffles the
# already shuffled data again and produces a different split.
dataset = dataset.shuffle(seed=seed_)

# Materialise the 'translation' column once instead of once per split.
translations = dataset['train']['translation']
train = translations[:L_t]
valid = translations[L_t:L_t+L_v]
# Slice from L - L_test rather than -L_test: with L_test == 0 (fewer than 8
# rows) `[-0:]` would select the whole dataset instead of nothing.
test = translations[L - L_test:]
def preprocess(dat, prefix='translate English to French:'):
    """Prepend the task prefix to the English side of every pair, in place.

    Args:
        dat: iterable of dicts that each hold an 'en' string; the dicts are
            mutated, nothing is copied.
        prefix: text put directly in front of ``t['en']``. The default is the
            prefix this notebook has always used.

    Returns:
        None.

    NOTE(review): the default prefix has no trailing space, so the sentence is
    glued straight onto the colon. T5 examples are usually written as
    'translate English to French: ' - confirm the omission is intentional.
    Calling this twice on the same data prepends the prefix twice.
    """
    for t in dat:
        t['en'] = prefix + t['en']
# Add the task prefix to all three splits (in place), then log the split
# sizes and one training pair as a sanity check.
for split in (train, valid, test):
    preprocess(split)

logging.info("train len: %d", len(train))
logging.info("valid len: %d", len(valid))
logging.info("test len: %d", len(test))
logging.info(train[5])

12/25 05:15:39 PM |	  Loading cached shuffled indices for dataset at /home/li/.cache/huggingface/datasets/opus_euconst/en-fr/1.0.0/d1e611a011f28fdda67a97024820e0a3813b4e4decca194d9a20b3207a39b908/cache-774986f0005795ce.arrow
12/25 05:15:40 PM |	  train len: 7578
12/25 05:15:40 PM |	  valid len: 1263
12/25 05:15:40 PM |	  test len: 1263
12/25 05:15:40 PM |	  {'en': 'translate English to French:, on the basis of Article\xa02, and shall report thereon at least once a year.  ', 'fr': "L'Agence européenne de défense contribue à l'évaluation régulière des contributions des États membres participants en matière de capacités, en particulier des contributions fournies suivant les critères qui seront établis, entre autres, sur la base de l'article\xa02, et en fait rapport au moins une fois par an.  "}


In [5]:
# Options shared by all three loaders.
loader_kwargs = dict(pin_memory=True, num_workers=0)

# Training set: built by get_train_Dataset, drawn in random order, 2 examples per batch.
train_data = get_train_Dataset(train, tokenizer)
train_dataloader = DataLoader(
    train_data,
    batch_size=2,
    sampler=RandomSampler(train_data),
    **loader_kwargs,
)

# Validation set: built by get_aux_dataset, also drawn in random order, 2 examples per batch.
valid_data = get_aux_dataset(valid, tokenizer)
valid_dataloader = DataLoader(
    valid_data,
    batch_size=2,
    sampler=RandomSampler(valid_data),
    **loader_kwargs,
)

# Test set: no sampler is passed, so batches of 5 come in dataset order.
test_data = get_aux_dataset(test, tokenizer)
test_dataloader = DataLoader(test_data, batch_size=5, **loader_kwargs)

In [6]:

# One attention parameter set sized by the number of training examples, on the GPU.
A = attention_params(len(train)).cuda()

# TODO: model loaded from saved model
# Model W with its SGD optimizer and cosine learning-rate schedule.
model_w = T5(criterion=criterion, tokenizer=tokenizer, name='model_w_in_main').cuda()
w_optimizer = torch.optim.SGD(
    model_w.parameters(),
    lr=w_lr,
    momentum=momentum,
    weight_decay=decay,
)
scheduler_w = torch.optim.lr_scheduler.CosineAnnealingLR(
    w_optimizer,
    T_max=float(epochs),
    eta_min=learning_rate_min,
)

# Model V: same construction, with its own learning rate.
model_v = T5(criterion=criterion, tokenizer=tokenizer, name='model_v_in_main').cuda()
v_optimizer = torch.optim.SGD(
    model_v.parameters(),
    lr=v_lr,
    momentum=momentum,
    weight_decay=decay,
)
scheduler_v = torch.optim.lr_scheduler.CosineAnnealingLR(
    v_optimizer,
    T_max=float(epochs),
    eta_min=learning_rate_min,
)

# The architect is handed both models and the attention parameters.
architect = Architect(model_w, model_v, A)

In [7]:
# Smoke test: run two hand-written sentences through model_v.generate and
# decode the result. The task prefix is the same one preprocess() adds.
x = [
    'translate English to French:' + sentence
    for sentence in ('my name is kevin',
                     'it is my nameit is my nameit is my name 321312')
]
y = tokenize(x, tokenizer, max_length=summary_length)
# Only the first element returned by tokenize is fed to the model
# (presumably the input ids - TODO confirm). Note: `input` shadows the builtin.
input = y[0].cuda()
output = model_v.generate(input)
tokenizer.batch_decode(output)

['<pad> mon nom est kevin</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>',
 "<pad> c'est mon nomci est ma dénomination 321312</s>"]

In [8]:
def my_test(test_dataloader,model):
    """Log the loss of ``model`` on the first batch of ``test_dataloader``.

    Only the first batch is scored, so this is a quick progress check rather
    than a full evaluation. The model is put in eval mode for the measurement
    and no autograd graph is built. Afterwards the train/eval flag of every
    sub-module is put back exactly as it was, so calling this in the middle of
    training does not leave the model in eval mode.

    Args:
        test_dataloader: iterable of batches; ``batch[0..3]`` are moved to the
            GPU and passed to ``my_loss`` as x, x_attn, y, y_attn.
        model: module providing ``eval()``, ``modules()`` and a ``name``
            attribute (used in the log line).

    Returns:
        None. The loss is only written to the log.
    """
    # Remember each sub-module's mode; a single flag is not enough when the
    # wrapper and its children were in different modes before the call.
    saved_modes = [(module, module.training) for module in model.modules()]
    model.eval()
    try:
        with torch.no_grad():
            for batch in test_dataloader:
                x, x_attn, y, y_attn = (batch[i].cuda() for i in range(4))

                ls = my_loss(x,x_attn,y,y_attn,model)
                logging.info('%s test loss : %f',model.name,ls)
                break
    finally:
        for module, was_training in saved_modes:
            module.training = was_training
        

In [9]:
def my_train(epoch, train_dataloader, valid_dataloader, w_model, v_model, architect, A, w_optimizer, v_optimizer, lr_w, lr_v, ):
    """Run one pass over ``train_dataloader``, updating both models per batch.

    For every training batch, while ``epoch <= stop_epoch``:
      1. ``architect.step`` is called with the training batch and one
         validation batch (presumably this is what updates ``A`` - TODO confirm).
      2. ``w_model`` takes one SGD step on ``CTG_loss``.
      3. ``v_model`` takes one SGD step on ``calc_loss_aug``.
      4. Both models are scored on the first test batch via ``my_test``.
    ``A.alpha`` is logged after every batch regardless of the epoch check.

    Args:
        epoch: current epoch index, compared against the global ``stop_epoch``.
        train_dataloader: yields batches whose items 0-6 are input_w,
            input_w_attn, output_w, output_w_attn, input_v, input_v_attn, attn_idx.
        valid_dataloader: a fresh iterator is opened each step and its first
            batch (items 0-3) is used.
        w_model, v_model: the two models being trained.
        architect: object whose ``step`` method is called once per batch.
        A: attention parameters; passed to ``CTG_loss`` and logged.
        w_optimizer, v_optimizer: optimizers for ``w_model`` / ``v_model``.
        lr_w, lr_v: learning rates forwarded to ``architect.step``.

    Returns:
        None.

    Notes:
        Reads the notebook-level globals ``stop_epoch``, ``grad_clip`` and
        ``test_dataloader``; the latter is not a parameter.
        NOTE(review): this function never switches the models to train mode -
        confirm the mode they arrive in is the intended one.
    """
    for step, batch in enumerate(train_dataloader):
        logging.info("Step count: %d",step)

        (input_w, input_w_attn, output_w, output_w_attn,
         input_v, input_v_attn, attn_idx) = (batch[i].cuda() for i in range(7))

        # iter() restarts the validation loader each step, so this is always
        # the first batch of a new iteration order.
        valid_batch = next(iter(valid_dataloader))
        (valid_input_v, valid_input_v_attn,
         valid_out_v, valid_out_v_attn) = (valid_batch[i].cuda() for i in range(4))

        if epoch <= stop_epoch:
            architect.step(input_w,  output_w,input_w_attn, output_w_attn, w_optimizer, input_v, input_v_attn,valid_input_v, valid_input_v_attn, valid_out_v, 
                valid_out_v_attn, v_optimizer, attn_idx, lr_w, lr_v)

            # Update W on the training loss.
            w_optimizer.zero_grad()
            loss_w = CTG_loss(input_w, input_w_attn, output_w, output_w_attn, attn_idx, A, w_model)
            logging.info(f"loss_w (train):{loss_w}")
            loss_w.backward()
            # clip_grad_norm_ is the in-place successor of the deprecated clip_grad_norm.
            nn.utils.clip_grad_norm_(w_model.parameters(), grad_clip)
            w_optimizer.step()

            # Update V on the loss computed from both models.
            v_optimizer.zero_grad()
            v_loss = calc_loss_aug(input_v, input_v_attn, w_model, v_model)
            logging.info(f"v_loss (train):{v_loss}")
            v_loss.backward()
            nn.utils.clip_grad_norm_(v_model.parameters(), grad_clip)
            v_optimizer.step()

            # Quick check on the first test batch (global test_dataloader).
            my_test(test_dataloader,w_model)
            my_test(test_dataloader,v_model)

        logging.info(str(("Attention Weights A : ", A.alpha)))


In [10]:
# Run one pass over the training loader, starting from the configured epoch.
my_train(
    begin_epoch,
    train_dataloader,
    valid_dataloader,
    model_w,
    model_v,
    architect,
    A,
    w_optimizer,
    v_optimizer,
    w_lr,
    v_lr,
)
    

12/25 05:15:56 PM |	  Step count: 0
12/25 05:16:14 PM |	  loss_w (train):3.4845939808292314e-05
12/25 05:16:17 PM |	  v_loss (train):333.8587341308594
12/25 05:16:18 PM |	  model_w_in_main test loss : 0.837824
12/25 05:16:18 PM |	  model_v_in_main test loss : 0.831462
12/25 05:16:18 PM |	  ('Attention Weights A : ', Parameter containing:
tensor([-0.0099, -0.0099, -0.0099,  ..., -0.0099, -0.0099, -0.0099],
       device='cuda:0', requires_grad=True))
12/25 05:16:18 PM |	  Step count: 1
12/25 05:16:27 PM |	  loss_w (train):6.766241131117567e-05
12/25 05:16:29 PM |	  v_loss (train):111.9421157836914
12/25 05:16:29 PM |	  model_w_in_main test loss : 0.837778
12/25 05:16:29 PM |	  model_v_in_main test loss : 0.868355
12/25 05:16:29 PM |	  ('Attention Weights A : ', Parameter containing:
tensor([-0.0182, -0.0182, -0.0182,  ..., -0.0182, -0.0182, -0.0182],
       device='cuda:0', requires_grad=True))
12/25 05:16:29 PM |	  Step count: 2
12/25 05:16:46 PM |	  loss_w (train):1.554307709739078e-0

KeyboardInterrupt: 

In [None]:
# Cleanup cell: run Python's garbage collector first, then ask PyTorch to
# release its cached GPU memory.
import gc

# Collect unreachable Python objects.
gc.collect()

# Release cached CUDA blocks that PyTorch is holding but no tensor is using.
torch.cuda.empty_cache()

In [None]:
# Scratch cell for inspecting token ids and saved logits.
# In a notebook only the value of the last expression (model_v.embedding) is
# displayed; the earlier bare expressions are evaluated and discarded.
tokenizer.decode([0,  6206,  6667,    27,     1])
tokenizer.decode([13959,  1566,    12,  2379,    10, 17608,   994,    27,     1,     0])
logging.info("vocab size : %d",model_v.vocab_size)
# NOTE(review): logits.pt and target_ids.pt are not written by any cell shown
# in this notebook - presumably dumped from elsewhere (TODO confirm).
# torch.load may unpickle arbitrary objects, so only load trusted files.
logit = torch.load('logits.pt')
target = torch.load('target_ids.pt')
tokenizer.decode(target[0])
logit.shape
# torch.max over the last dimension returns (values, indices); maxx holds the
# indices, with the reduced dimension kept as size 1.
_,maxx = torch.max(logit,dim=-1,keepdim=True)
maxx.shape
# Drop the kept dimension again before decoding the first sequence.
tokenizer.decode(maxx[0].squeeze(-1))

model_v.embedding