In [1]:

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, SubsetRandomSampler
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset,load_metric
import torch
import logging
import numpy as np
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
import sys
import time
from transformers.optimization import Adafactor
import os
import gc


In [3]:

# ---------------------------------------------------------------------------
# Run configuration, logging and seeding.
# ---------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# local_test == 0 selects the large configuration below; any other value
# selects the small one (fewer examples, smaller batches, more frequent
# reporting).
local_test = 1

# Values shared by both configurations.
max_length = 512
seed = 2
lr = 1e-4

if local_test == 0:
    test_step = 10000
    report_step = 1000
    bs = 64
    train_num = 500000
    valid_num = 2000
else:
    test_step = 1000
    report_step = 100
    bs = 4
    train_num = 5000
    valid_num = 200

# Timestamp used to name this run's log file (and the saved model later on).
now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))

log_format = '%(asctime)s |\t  %(message)s'
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
    format=log_format, datefmt='%m/%d %I:%M:%S %p')

# FileHandler does not create missing parent directories, so make sure the
# log directory exists before opening the file.
os.makedirs("./log/", exist_ok=True)
fh = logging.FileHandler(os.path.join("./log/", now+'.txt'), 'w', encoding="UTF-8")
# No datefmt is passed here, so the file uses logging's default timestamp
# format rather than the one given to basicConfig above.
fh.setFormatter(logging.Formatter(log_format))
logging.getLogger().addHandler(fh)


# Setting the seeds
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
cudnn.enabled = True
cudnn.benchmark = True

In [5]:
class AvgrageMeter(object):
    """Running weighted average of a scalar.

    Attributes:
        avg: current average, ``sum / cnt``.
        sum: accumulated ``val * n`` over all updates since the last reset.
        cnt: accumulated ``n`` over all updates since the last reset.
    """

    def __init__(self):
        # A fresh meter starts in the same state as one that was just reset.
        self.reset()

    def reset(self):
        """Zero the accumulated sum, the count and the average."""
        self.avg = self.sum = self.cnt = 0

    def update(self, val, n=1):
        """Fold in ``val`` with weight ``n`` and refresh the average."""
        weighted = val * n
        self.sum += weighted
        self.cnt += n
        self.avg = self.sum / self.cnt

def tokenize(text_data, tokenizer, max_length, padding = True):
    """Run ``tokenizer`` over ``text_data`` and return ids plus attention mask.

    The tokenizer is called with ``return_tensors='pt'``, truncation enabled,
    and the given ``padding`` and ``max_length``.

    Args:
        text_data: text passed as the first positional argument to the tokenizer.
        tokenizer: callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries.
        max_length: forwarded to the tokenizer as ``max_length``.
        padding: forwarded to the tokenizer as ``padding``.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    batch = tokenizer(
        text_data,
        return_tensors='pt',
        padding=padding,
        truncation=True,
        max_length=max_length,
    )
    return batch['input_ids'], batch['attention_mask']
def get_Dataset(dataset, tokenizer):
    """Tokenize translation pairs and pack them into a ``TensorDataset``.

    Args:
        dataset: iterable of mappings with a ``'de'`` (source) and an ``'en'``
            (target) entry. It is iterated twice.
        tokenizer: tokenizer handed to ``tokenize``.

    Returns:
        ``TensorDataset`` whose items are
        ``(source_ids, source_mask, target_ids, target_mask)``.

    Uses the module-level ``max_length`` as the truncation length.
    """
    sources = [pair['de'] for pair in dataset]
    targets = [pair['en'] for pair in dataset]

    src_ids, src_mask = tokenize(sources, tokenizer, max_length=max_length)
    tgt_ids, tgt_mask = tokenize(targets, tokenizer, max_length=max_length)

    return TensorDataset(src_ids, src_mask, tgt_ids, tgt_mask)

In [6]:
# Model checkpoint, tokenizer and optimizer used for the whole run.
model = T5ForConditionalGeneration.from_pretrained("Onlydrinkwater/T5-small-de-en")
model = model.to(device)
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Adafactor with a fixed, externally supplied learning rate: relative-step
# scheduling, parameter scaling and warmup initialisation are all switched off.
optimizer = Adafactor(
    model.parameters(),
    lr=lr,
    eps=(1e-30, 0.001),
    clip_threshold=1,
    beta1=0,
    scale_parameter=False,
    relative_step=False,
    warmup_init=False,
)

In [7]:

dataset = load_dataset('wmt16','de-en')
# Slice the rows first and only then pick the 'translation' column, so that
# just the requested examples are materialised instead of the whole column.
# Both subsets are taken from the 'train' split: the first `train_num` rows
# for training and the following `valid_num` rows for validation.
train = dataset['train'][:train_num]['translation']
valid = dataset['train'][train_num:(train_num+valid_num)]['translation']

def preprocess(dat, prefix="translate German to English: "):
    """Prepend the T5 task prefix to every source sentence, in place.

    Args:
        dat: iterable of dicts with a ``'de'`` entry; each dict is mutated.
        prefix: task prompt put in front of ``t['de']`` (needed for T5).

    Returns:
        None. The input dicts are modified in place.

    The call is idempotent: an entry that already starts with ``prefix`` is
    left alone, so re-running the notebook cell on the same data does not
    stack the prefix a second time.
    """
    for t in dat:
        if not t['de'].startswith(prefix):
            t['de'] = prefix + t['de']  #needed for T5
# Add the task prefix to the source side of both subsets (mutates the dicts).
preprocess(train)
preprocess(valid)

train_data = get_Dataset(train, tokenizer)
# Shuffle the training examples each epoch; a sequential sampler would feed
# the corpus in file order every time. Ordering still follows the torch seed.
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data),
                        batch_size=bs, pin_memory=True, num_workers=4)
valid_data = get_Dataset(valid, tokenizer)
# Validation keeps a fixed order so the logged sample predictions are
# comparable between epochs.
valid_dataloader = DataLoader(valid_data, sampler=SequentialSampler(valid_data),
                        batch_size=bs, pin_memory=True, num_workers=4)

04/21 10:00:48 AM |	  Reusing dataset wmt16 (C:\Users\kevin\.cache\huggingface\datasets\wmt16\de-en\1.0.0\0d9fb3e814712c785176ad8cdb9f465fbe6479000ee6546725db30ad8a8b5f8a)


100%|██████████| 3/3 [00:00<00:00, 30.63it/s]


In [8]:
def my_train(_dataloader,model,optimizer):
    """Run one pass over ``_dataloader``, updating ``model`` with ``optimizer``.

    Args:
        _dataloader: iterable of batches where ``batch[0]`` is the source ids,
            ``batch[1]`` the source attention mask and ``batch[2]`` the target
            ids (``batch[3]`` is not used).
        model: model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        optimizer: optimizer stepped once per batch.

    Returns:
        None. The average loss is logged every ``report_step`` steps and the
        running average is reset after each report.

    Relies on the module-level ``device``, ``tokenizer`` and ``report_step``.
    """
    # my_test() switches the model to eval(); make the mode explicit here so
    # training never depends on which function happened to run last.
    model.train()
    objs = AvgrageMeter()
    for step, batch in enumerate(_dataloader):
        optimizer.zero_grad()
        train_x = batch[0].to(device, non_blocking=False)
        train_x_attn = batch[1].to(device, non_blocking=False)
        train_y = batch[2].to(device, non_blocking=False)
        # Label positions holding the pad token are set to -100 before the
        # labels are handed to the model.
        train_y[train_y == tokenizer.pad_token_id] = -100
        loss = model(input_ids=train_x, attention_mask=train_x_attn, labels=train_y).loss
        loss.backward()
        optimizer.step()
        # Weight by the real batch size: the last batch can be smaller than bs.
        objs.update(loss.item(), train_x.size(0))
        if step % report_step == 0 and step != 0:
            logging.info(f'step:{step}\t,avgloss:{objs.avg}')
            objs.reset()

In [11]:
import copy
@torch.no_grad()
def my_test(_dataloader,model,epoch):
    """Evaluate ``model`` on ``_dataloader`` and log loss, sacreBLEU and BLEU.

    Args:
        _dataloader: iterable of batches where ``batch[0]`` is the source ids,
            ``batch[1]`` the source attention mask and ``batch[2]`` the target
            ids (``batch[3]`` is not used).
        model: model supporting the forward call with ``labels`` and
            ``generate``.
        epoch: accepted for the caller's convenience; not used in the body.

    Returns:
        None. Results are written through ``logging``.

    The model is put into eval mode for the duration of the call and is always
    switched back to train mode afterwards, including when evaluation is
    interrupted or raises.

    Relies on the module-level ``device``, ``tokenizer`` and ``max_length``.
    """
    model.eval()
    try:
        metric_sacrebleu = load_metric('sacrebleu')
        metric_bleu = load_metric('bleu')
        loss_sum = 0.0
        num_batches = 0

        for step, batch in enumerate(_dataloader):
            src_ids = batch[0].to(device, non_blocking=False)
            src_mask = batch[1].to(device, non_blocking=False)
            tgt_ids = batch[2].to(device, non_blocking=False)

            # Work on a copy so tgt_ids keeps its pad ids for decoding below.
            labels = tgt_ids.clone()
            labels[labels == tokenizer.pad_token_id] = -100
            ls = model(input_ids=src_ids, attention_mask=src_mask, labels=labels).loss
            loss_sum += ls.item()
            num_batches += 1

            # The inputs are padded, so pass the mask explicitly instead of
            # leaving it to generate() to work out which positions are padding.
            pre = model.generate(src_ids, attention_mask=src_mask, num_beams = 5, early_stopping = True, max_length = max_length, length_penalty =0.6, repetition_penalty = 0.8)
            x_decoded = tokenizer.batch_decode(src_ids,skip_special_tokens=True)
            pred_decoded = tokenizer.batch_decode(pre,skip_special_tokens=True)
            label_decoded = tokenizer.batch_decode(tgt_ids,skip_special_tokens=True)

            # sacrebleu is fed raw strings, bleu whitespace-split tokens; both
            # receive one reference per prediction.
            pred_str = list(pred_decoded)
            label_str = [[x] for x in label_decoded]
            pred_list = [x.split() for x in pred_decoded]
            label_list = [[x.split()] for x in label_decoded]
            metric_sacrebleu.add_batch(predictions=pred_str, references=label_str)
            metric_bleu.add_batch(predictions=pred_list, references=label_list)
            if step % 100 == 0:
                logging.info(f'x_decoded[:2]:{x_decoded[:2]}')
                logging.info(f'pred_decoded[:2]:{pred_decoded[:2]}')
                logging.info(f'label_decoded[:2]:{label_decoded[:2]}')

        if num_batches == 0:
            # Nothing was evaluated: there is no mean loss and no metric input.
            logging.warning('my_test: dataloader yielded no batches, nothing to evaluate')
            return

        sacrebleu_score = metric_sacrebleu.compute()
        bleu_score = metric_bleu.compute()
        logging.info('sacreBLEU : %f',sacrebleu_score['score'])#TODO:bleu may be wrong cuz max length
        logging.info('BLEU : %f',bleu_score['bleu'])
        logging.info('test loss : %f',loss_sum/num_batches)
    finally:
        # Release cached memory and hand the model back in train mode even if
        # evaluation was interrupted part-way.
        gc.collect()
        torch.cuda.empty_cache()
        model.train()
    
    
    # logging.info(f"GPU mem after test:{getGPUMem(device)}%")
        

In [12]:

# torch.save does not create missing directories; make sure the target exists
# up front so the first checkpoint is not lost after a full epoch of training.
os.makedirs('./model/', exist_ok=True)

# Baseline evaluation before any training (epoch index -1).
my_test(valid_dataloader,model,-1)
for epoch in range(10):

    logging.info(f"\n\n  ----------------epoch:{epoch}----------------")
    my_train(train_dataloader,model,optimizer )
    my_test(valid_dataloader,model,epoch)
    # The file name only depends on the run timestamp, so each epoch
    # overwrites the checkpoint written by the previous one.
    torch.save(model,'./model/'+now+'model.pt')




04/21 10:06:49 AM |	  x_decoded[:2]:['translate German to English: Dies ist die Intention, die hinter der mündlichen Anfrage steht, und wir müssen dann im Parlament im Rahmen der Berichte in den ordentlichen Verfahren über die Ausschüsse und die Plenarberatungen zu einem ausgewogenen und wirklich zukunftsweisenden Vorschlag kommen.', 'translate German to English: Frau Präsidentin, meine Damen und Herren! Im Namen der Fraktion der Liberalen und Demokratischen Partei Europas möchte ich mein Erstaunen angesichts der jetzigen Debatte im Zusammenhang mit den internationalen Kapitalströmen und ihrer etwaigen Besteuerung bekunden, was, wenn das auch im Wortlaut nicht erwähnt wird, - wie in der Einführung - ein Versuch der Wiederbelebung der vor einigen Jahren von Herrn Tobin vorgeschlagenen Steuer zu sein scheint, der sich unsere Fraktion in der vergangenen Wahlperiode klar und kategorisch mit einer Reihe von Argumenten, die von unserem Vorsitzenden, Herrn Cox, deutlich und zusammenhängend da

KeyboardInterrupt: 