In [None]:
import os
# Fail fast with a readable hint when no TPU runtime is attached. Using .get()
# means a missing COLAB_TPU_ADDR triggers the assertion message below instead
# of an opaque KeyError.
assert os.environ.get('COLAB_TPU_ADDR'), 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

In [None]:
# installing required libraries and downloading data from source

!pip3 install transformers -q
!pip3 install rouge -q
!pip3 install sentencepiece
!wget https://opiniondigest.s3-us-west-2.amazonaws.com/data/yelp-default-data.zip -q
!unzip yelp-default-data.zip
!rm yelp-default-data.zip
!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.7-cp36-cp36m-linux_x86_64.whl -q

[K     |████████████████████████████████| 1.5MB 7.0MB/s 
[K     |████████████████████████████████| 890kB 21.7MB/s 
[K     |████████████████████████████████| 2.9MB 27.9MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
Archive:  yelp-default-data.zip
   creating: data/
   creating: data/yelp-default/
  inflating: data/yelp-default/train.csv  
  inflating: data/yelp-default/test_gold_8_15_all_all_300_8.csv  
  inflating: data/yelp-default/summaries_0-200_cleaned_fixed_business_ids.csv  
  inflating: data/yelp-default/test.csv  
  inflating: data/yelp-default/yelp.jsonl  
  inflating: data/yelp-default/dev.csv  
  inflating: data/yelp-default/test_gold.csv  
[K     |████████████████████████████████| 133.6MB 30kB/s 
[K     |████████████████████████████████| 61kB 3.4MB/s 
[?25h

In [None]:
import numpy as np
import pandas as pd
from transformers import T5Tokenizer, EncoderDecoderModel, T5Model, T5ForConditionalGeneration
import torch

In [None]:
# reading data and making required changes to the files

# Load the raw train/dev splits, give both frames the same five column names,
# and write local copies that map_fn re-reads in every worker process.
_COLUMN_NAMES = ['eid', 'rid', 'text', 'extraction', 'phrases']

train = pd.read_csv("/content/data/yelp-default/train.csv")
val = pd.read_csv("/content/data/yelp-default/dev.csv")

for _frame, _csv_path in ((train, './train.csv'), (val, './val.csv')):
    _frame.columns = _COLUMN_NAMES
    _frame.to_csv(_csv_path)

In [None]:
# config class contains all the required configurations for the training of models

class config:
    """Shared settings, read as plain class attributes (e.g. ``config.MAX_LEN``)."""

    # --- tokenization ---
    # Pretrained t5-small tokenizer, loaded when this class body executes.
    TOKENIZER = T5Tokenizer.from_pretrained('t5-small')
    # Passed to the tokenizer as max_length by the dataset class.
    MAX_LEN = 128
    # Only referenced by the disabled encoder-decoder variant of get_model.
    MODEL_LIST = ['bert-base-uncased']

    # --- training hyper-parameters ---
    # NOTE: map_fn takes its values from the `flags` dict, not from here.
    LR = 1e-2
    EPOCHS = 100
    BATCH_SIZE = 16
    NO_OF_WORKERS = 1
    SHUFFLE = False



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




In [None]:
# dataset class contains data preparation functions 

class dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each review with a prompt built from its phrases.

    Every item tokenizes the ``text`` column and the prompt
    ``'summarize: ' + comma-joined phrases`` to ``config.MAX_LEN`` tokens.

    Args:
        data: ``pandas.DataFrame`` with ``text`` and ``phrases`` columns, where
            ``phrases`` holds phrases separated by ``' [SEP] '``.
        shuffle_phrases: when True (the default, matching the original
            behavior) the phrases are randomly permuted on every access using
            NumPy's global RNG. Pass False to get a deterministic prompt,
            e.g. for evaluation.
    """

    def __init__(self, data, shuffle_phrases=True):
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN
        self.data = data
        self.shuffle_phrases = shuffle_phrases

    def _encode(self, sequence):
        """Tokenize one string, truncated/padded to ``self.max_len`` tokens."""
        return self.tokenizer.encode_plus(
            sequence,
            truncation=True,
            max_length=self.max_len,
            add_special_tokens=True,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt',
        )

    def get_target(self, data):
        """Build one example from a row.

        Args:
            data: a row (mapping) providing ``'text'`` and ``'phrases'``.

        Returns:
            dict with the raw strings (``review``, ``phrases``) and, for both the
            review and the phrase prompt (``p_`` prefix), the ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` tensors.
        """
        text = data['text']

        phrase_list = data['phrases'].split(' [SEP] ')
        if self.shuffle_phrases:
            # Random order acts as augmentation: the same row yields a
            # differently ordered prompt each time it is fetched.
            phrase_list = np.random.permutation(phrase_list)
        phrases = 'summarize: ' + ','.join(phrase_list)

        encoded_text = self._encode(text)
        encoded_phrases = self._encode(phrases)

        # [0] selects the single encoded sequence from each returned tensor.
        return {
            "review": text,
            "input_ids": encoded_text.input_ids[0],
            "token_type_ids": encoded_text.token_type_ids[0],
            "attention_mask": encoded_text.attention_mask[0],
            "phrases": phrases,
            "p_input_ids": encoded_phrases.input_ids[0],
            "p_token_type_ids": encoded_phrases.token_type_ids[0],
            "p_attention_mask": encoded_phrases.attention_mask[0],
        }

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.data)

    def __getitem__(self, index):
        """Return the example built from the row at positional ``index``."""
        return self.get_target(self.data.iloc[index])

In [None]:
# function to define models 

def get_model(model=0):
    """Return a freshly loaded pretrained ``t5-small`` conditional-generation model.

    Args:
        model: currently unused; the disabled encoder-decoder variant used it
            as an index into ``config.MODEL_LIST``.
    """
    checkpoint_name = 't5-small'
    t5 = T5ForConditionalGeneration.from_pretrained(checkpoint_name)
    return t5

In [None]:
# function to train on TPUs

from tqdm import tqdm
import transformers as tr
from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel, T5Config, T5ForConditionalGeneration
import torch.optim as optim
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.distributed.parallel_loader as pl
import time

def _batch_loss(model, batch, pad_token_id):
    """Return the seq2seq loss for one batch.

    The phrase prompt (``p_input_ids`` / ``p_attention_mask``) is the encoder
    input and the tokenized review (``input_ids``) is the target.
    """
    review_ids = batch['input_ids']
    # Padded target positions are set to -100 so the loss ignores them; without
    # this the model is also trained (and scored) on predicting padding.
    lm_labels = review_ids.masked_fill(review_ids == pad_token_id, -100)
    output = model(input_ids=batch['p_input_ids'],
                   attention_mask=batch['p_attention_mask'],
                   labels=lm_labels)
    return output[0]


def map_fn(index, flags):
    """Per-process entry point for ``xmp.spawn``: fine-tune, evaluate, save.

    Each process reads ``./train.csv`` and ``./val.csv``, trains on its own
    shard (via ``DistributedSampler``), reports the mean training and
    validation loss, and saves the model state dict.

    Args:
        index: process index supplied by ``xmp.spawn``; used only in log lines.
        flags: dict with keys ``seed``, ``batch_size``, ``num_workers``,
            ``lr`` and ``num_epochs``.

    Note:
        The model is loaded from and saved back to the same checkpoint path,
        so a run overwrites the weights it started from.
    """
    torch.manual_seed(flags['seed'])

    device = xm.xla_device()
    print("Process", index ,"is using", xm.xla_real_devices([str(device)])[0])

    train_dataset = dataset(pd.read_csv('./train.csv'))
    val_dataset = dataset(pd.read_csv('./val.csv'))

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)

    val_sampler = torch.utils.data.distributed.DistributedSampler(
        val_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=False)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=flags['batch_size'],
        sampler=train_sampler,
        num_workers=flags['num_workers'],
        drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=flags['batch_size'],
        sampler=val_sampler,
        shuffle=False,
        num_workers=flags['num_workers'],
        drop_last=True)

    # Resume from the previously saved weights using the t5-small architecture.
    checkpoint_path = '/content/drive/MyDrive/t5_small_10epoch_ALL'
    model_config = T5Config.from_pretrained('t5-small')
    model = T5ForConditionalGeneration.from_pretrained(checkpoint_path, config=model_config).to(device).train()
    pad_token_id = model_config.pad_token_id

    optimizer = optim.Adam(model.parameters(), lr = flags['lr'])

    train_start = time.time()
    print("training started")
    for epoch in range(flags['num_epochs']):
        # Without set_epoch the sampler reuses the same shuffled order every epoch.
        train_sampler.set_epoch(epoch)
        para_train_loader = pl.ParallelLoader(train_loader, [device]).per_device_loader(device)
        epoch_loss = 0
        num_batches = 0
        for batch in tqdm(para_train_loader):
            loss = _batch_loss(model, batch, pad_token_id)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            xm.optimizer_step(optimizer)

            epoch_loss += loss.item()
            num_batches += 1

        # max(..., 1) keeps the report well-defined if the loader yielded nothing.
        print("Mean epoch loss:", (epoch_loss / max(num_batches, 1)))

    elapsed_train_time = time.time() - train_start
    print("Process", index, "finished training. Train time was:", elapsed_train_time)

    model.eval()
    eval_start = time.time()
    with torch.no_grad():
        val_loss = 0
        val_batches = 0

        para_val_loader = pl.ParallelLoader(val_loader, [device]).per_device_loader(device)
        for batch in tqdm(para_val_loader):
            val_loss += _batch_loss(model, batch, pad_token_id).item()
            val_batches += 1

        mean_val_loss = val_loss / max(val_batches, 1)
        print("Mean val loss:", mean_val_loss)

    xm.save(model.state_dict(), checkpoint_path)
    elapsed_eval_time = time.time() - eval_start
    print("Process", index, "finished evaluation. Evaluation time was:", elapsed_eval_time)
    print("Process", index, "Mean eval_loss", mean_val_loss)
    print('MODEL SAVED!')

In [None]:
%%time
# Hyper-parameters handed to every worker process; map_fn reads them by key.
flags = {
    'lr': 1e-3,
    'batch_size': 16,
    'num_workers': 8,
    'num_epochs': 1,
    'seed': 8888,
}

# Fork 8 worker processes, each running map_fn(index, flags).
xmp.spawn(map_fn, args=(flags,), nprocs=8, start_method='fork')

Process 0 is using TPU:0
Process 2 is using TPU:2
Process 1 is using TPU:1
Process 5 is using TPU:5
Process 6 is using TPU:6
Process 3 is using TPU:3
Process 7 is using TPU:7
Process 4 is using TPU:4
training started


  0%|          | 0/4875 [00:00<?, ?it/s]

training started


  0%|          | 0/4875 [00:00<?, ?it/s]

training started


  0%|          | 0/4875 [00:00<?, ?it/s]

training started


  0%|          | 0/4875 [00:00<?, ?it/s]

training started
training started


  0%|          | 0/4875 [00:00<?, ?it/s]

training started


  0%|          | 0/4875 [00:00<?, ?it/s]

training started


100%|██████████| 4875/4875 [22:01<00:00,  3.69it/s]


Mean epoch loss: 1.6336855308581621


100%|██████████| 4875/4875 [22:01<00:00,  3.69it/s]

Process 6 finished training. Train time was: 1321.999601840973





Mean epoch loss: 1.6246947194612944
Process 5 finished training. Train time was: 1321.998363494873


100%|██████████| 4875/4875 [22:12<00:00,  3.66it/s]


Mean epoch loss: 1.6252446906016422


100%|██████████| 4875/4875 [22:01<00:00,  4.03it/s]

Process 2 finished training. Train time was: 1332.1820726394653


100%|██████████| 4875/4875 [22:16<00:00,  3.65it/s]


Mean epoch loss: 1.626094518783765


100%|██████████| 4875/4875 [22:02<00:00,  3.69it/s]

Process 0 finished training. Train time was: 1336.4975697994232





Mean epoch loss: 1.6299827803709568


100%|██████████| 4875/4875 [21:59<00:00,  3.99it/s]

Process 3 finished training. Train time was: 1322.503359079361


100%|██████████| 4875/4875 [22:00<00:00,  3.69it/s]


Mean epoch loss: 1.6306831300442035
Process 4 finished training. Train time was: 1320.9594633579254


100%|██████████| 4875/4875 [22:01<00:00,  3.69it/s]


Mean epoch loss: 1.6235088771428818
Process 7 finished training. Train time was: 1322.1467015743256


100%|██████████| 4875/4875 [22:00<00:00,  3.69it/s]


Mean epoch loss: 1.6298897025768573
Process 1 finished training. Train time was: 1320.1336843967438


100%|██████████| 603/603 [00:56<00:00, 10.59it/s]


Mean val loss: 1.6203955665947392


100%|██████████| 603/603 [00:56<00:00, 10.62it/s]


Mean val loss: 1.6106466928722452


100%|██████████| 603/603 [00:56<00:00, 10.61it/s]


Mean val loss: 1.614912199539134


100%|██████████| 603/603 [00:57<00:00, 10.57it/s]


Mean val loss: 1.5915811197279301


100%|██████████| 603/603 [00:57<00:00, 10.53it/s]


Mean val loss: 1.606997630094019


100%|██████████| 603/603 [00:57<00:00, 10.56it/s]


Mean val loss: 1.622126676549959


100%|██████████| 603/603 [00:57<00:00, 10.55it/s]


Mean val loss: 1.6230337538727084


100%|██████████| 603/603 [00:57<00:00, 10.53it/s]


Mean val loss: 1.6118681824227075
Process 7 finished evaluation. Evaluation time was: 60.936707735061646
Process 5 finished evaluation. Evaluation time was: 61.10967707633972
Process 2 finished evaluation. Evaluation time was: 61.10675263404846
Process 6 finished evaluation. Evaluation time was: 61.11290907859802
Process 1 finished evaluation. Evaluation time was: 60.93226218223572
Process 3 finished evaluation. Evaluation time was: 61.05033016204834
Process 4 finished evaluation. Evaluation time was: 61.01758551597595
Process 7 Mean eval_loss 1.6230337538727084
Process 5 Mean eval_loss 1.614912199539134
Process 2 Mean eval_loss 1.6118681824227075
Process 6 Mean eval_loss 1.6203955665947392
Process 4 Mean eval_loss 1.622126676549959
Process 1 Mean eval_loss 1.6106466928722452
Process 3 Mean eval_loss 1.606997630094019
MODEL SAVED!
MODEL SAVED!
MODEL SAVED!
MODEL SAVED!
MODEL SAVED!
MODEL SAVED!
MODEL SAVED!
Process 0 finished evaluation. Evaluation time was: 61.09617590904236
Process 0

In [None]:
# train_dataset=dataset(pd.read_csv('./train.csv'))
# for data in train_dataset:
#     print(data)
#     break

In [None]:
# config.TOKENIZER.decode([6])

','

    BERT2BERT 60k 10 epochs 5e-5 16 -> 1.7     -> 27 9.9 25.9
    ROBERTA2ROBERTA 60k 10 epochs 5e-5 16 -> 

In [None]:
!pip install sumeval
import transformers as tr
# from rouge import Rouge
from tqdm import tqdm
# rouge = Rouge(metrics = ['rouge-1','rouge-2', 'rouge-l'])
from sumeval.metrics.rouge import RougeCalculator
rouge = RougeCalculator(stopwords = False, lang = "en")

def evaluateSummary(hypothesis, reference):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L over paired summaries.

    Args:
        hypothesis: sequence of generated summaries.
        reference: sequence of reference texts; ``reference[i]`` is scored
            against ``hypothesis[i]``.

    Returns:
        dict with keys ``'Rouge-1'``, ``'Rouge-2'`` and ``'Rouge-L'`` mapping
        to the mean score over all pairs. All values are 0.0 when
        ``hypothesis`` is empty (previously this raised ZeroDivisionError).
    """
    size = len(hypothesis)
    if size == 0:
        return {'Rouge-1': 0.0, 'Rouge-2': 0.0, 'Rouge-L': 0.0}

    rouge_1 = 0
    rouge_2 = 0
    rouge_l = 0

    for i in range(size):
        rouge_1 += rouge.rouge_n(hypothesis[i], reference[i], n=1)
        rouge_2 += rouge.rouge_n(hypothesis[i], reference[i], n=2)
        rouge_l += rouge.rouge_l(hypothesis[i], reference[i])

    avg_score = {
        'Rouge-1': rouge_1 / size,
        'Rouge-2': rouge_2 / size,
        'Rouge-L': rouge_l / size,
    }
    return avg_score

Collecting sumeval
[?25l  Downloading https://files.pythonhosted.org/packages/e6/87/bfc0f9397b9421305863edfdd2dbea637e47204976cb5473535c856338f4/sumeval-0.2.2.tar.gz (80kB)
[K     |████                            | 10kB 18.4MB/s eta 0:00:01[K     |████████▏                       | 20kB 14.4MB/s eta 0:00:01[K     |████████████▏                   | 30kB 10.4MB/s eta 0:00:01[K     |████████████████▎               | 40kB 9.1MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 5.5MB/s eta 0:00:01[K     |████████████████████████▍       | 61kB 5.9MB/s eta 0:00:01[K     |████████████████████████████▌   | 71kB 6.2MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 4.3MB/s 
Collecting sacrebleu>=1.3.2
[?25l  Downloading https://files.pythonhosted.org/packages/a3/c4/8e948f601a4f9609e8b2b58f31966cb13cf17b940b82aa3e767f01c42c52/sacrebleu-1.4.14-py3-none-any.whl (64kB)
[K     |████████████████████████████████| 71kB 8.1MB/s 
[?25hCollecting portalocker
  Downl

In [None]:
# import torch_xla
# import torch_xla.core.xla_model as xm

# dev=xm.xla_device()
# dev='cuda'
import transformers
from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel, RobertaConfig, T5Config, T5Tokenizer, T5ForConditionalGeneration
import torch
import pandas as pd
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build the t5-small architecture and load the weights written by the training
# cell (same checkpoint path), then switch to inference mode.
model_config = T5Config.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained(
    '/content/drive/MyDrive/t5_small_10epoch_ALL',
    config=model_config,
)
model = model.to(device)
model.eval()

tok = T5Tokenizer.from_pretrained('t5-small')

# Displayed as the cell result: total parameter count.
model.num_parameters()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1197.0, style=ProgressStyle(description…




60506624

In [None]:
model.state_dict().keys()

odict_keys(['shared.weight', 'encoder.embed_tokens.weight', 'encoder.block.0.layer.0.SelfAttention.q.weight', 'encoder.block.0.layer.0.SelfAttention.k.weight', 'encoder.block.0.layer.0.SelfAttention.v.weight', 'encoder.block.0.layer.0.SelfAttention.o.weight', 'encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight', 'encoder.block.0.layer.0.layer_norm.weight', 'encoder.block.0.layer.1.DenseReluDense.wi.weight', 'encoder.block.0.layer.1.DenseReluDense.wo.weight', 'encoder.block.0.layer.1.layer_norm.weight', 'encoder.block.1.layer.0.SelfAttention.q.weight', 'encoder.block.1.layer.0.SelfAttention.k.weight', 'encoder.block.1.layer.0.SelfAttention.v.weight', 'encoder.block.1.layer.0.SelfAttention.o.weight', 'encoder.block.1.layer.0.layer_norm.weight', 'encoder.block.1.layer.1.DenseReluDense.wi.weight', 'encoder.block.1.layer.1.DenseReluDense.wo.weight', 'encoder.block.1.layer.1.layer_norm.weight', 'encoder.block.2.layer.0.SelfAttention.q.weight', 'encoder.block.2.layer.0.SelfA

In [None]:
# Two hand-picked examples. Both assign the same names, so only the second
# ("train") pair is in effect once this cell has run.
# val
phrases='pretty good food [SEP] VERY noisy wine list [SEP] Really delicious seasonal fare'
text='''Very good   food indeed! Second time there. Love the food, pretty good wine list, but VERY noisy. So I guess it depends how much you want to shout or listen ... Worth trying, but with lots of energy and will to talk loud. Really delicious seasonal fare.'''
# train
# NOTE(review): these phrases describe a store/pharmacy while `text` below is a
# movie-theater review, so the pair looks mismatched. Confirm before using
# `text` as the ROUGE reference for summaries generated from `phrases`.
phrases='Great customer </s></s> convenient parking </s></s> easy location </s></s> good service </s></s> really nice pharmacy tech </s></s> nicer location </s></s> never negative shopping experience </s></s> Great products </s></s> great prices </s></s> great store </s></s> super helpful guest </s></s> could orders </s></s> best experience </s></s> sweetest disposition </s></s> knowledgeable products'
text='''If you are looking for something to do, take a break from walking the strip, or take a break from gambling, come watch a movie here! The theater is really old, a little difficult to locate, not visible from the strip (on the side of Denny's), and the theaters are super small, but hey it gives you something to do other than the usual things in Vegas. Prices are movies are definitely a bit pricey. Staff at the front seem to be friendly. Not much more to say about a movie theater. :) Enjoy.'''
# Turn the ' </s></s> '-separated phrase string into the comma-joined
# 'summarize: ' prompt, then tokenize it to a fixed length of 128.
phrases = 'summarize: ' + ','.join(phrases.split(' </s></s> '))

encoded_phrases = tok.encode_plus(
    phrases,
    truncation=True,
    padding='max_length',
    max_length=128,
    add_special_tokens=True,
    return_tensors='pt',
    return_attention_mask=True,
    return_token_type_ids=True,
)

# Move the two tensors that generate() consumes onto the inference device.
input_ids, att_mask = (
    encoded_phrases[key].to(device) for key in ('input_ids', 'attention_mask')
)

In [None]:
%%time
# Greedy decoding (num_beams is not set).
# T5Tokenizer defines an EOS token but no CLS/SEP tokens, so the previous
# bos/eos arguments (tok.cls_token_id / tok.sep_token_id) were None. Pass the
# real EOS id and let the model config supply the decoder start token.
output_greed = model.generate(
    input_ids = input_ids,
    attention_mask = att_mask,
    min_length = 10,
    max_length = 128,
    early_stopping = True,
    pad_token_id = tok.pad_token_id,
    eos_token_id = tok.eos_token_id,
    )

greedy_sent = tok.decode(output_greed[0], skip_special_tokens=True)
print(greedy_sent)

This is a great store with great prices. Great products, great prices, and the pharmacy tech was really nice. The location is easy to get to, convenient parking, and the guest was super helpful. The owner was the sweetest disposition and made sure we had the best experience possible. I would recommend this to anyone looking for a nicer location. Great customer service.
CPU times: user 23.4 s, sys: 26.8 ms, total: 23.4 s
Wall time: 1.2 s


In [None]:
%%time
# Beam search with 5 beams.
# T5Tokenizer defines an EOS token but no CLS/SEP tokens, so the previous
# bos/eos arguments (tok.cls_token_id / tok.sep_token_id) were None. Pass the
# real EOS id and let the model config supply the decoder start token.
output_beam = model.generate(
    input_ids=input_ids,
    attention_mask=att_mask,
    min_length=10,
    max_length=128,
    early_stopping=True,
    pad_token_id=tok.pad_token_id,
    eos_token_id=tok.eos_token_id,
    num_beams=5,
)

beam5_sent = tok.decode(output_beam[0], skip_special_tokens=True)
print(beam5_sent)

This is a great store. Great products, great prices, great prices. The pharmacy tech was really nice and had the sweetest disposition. The location is easy to get to and easy to get to and easy to get to. The guest at the front desk was super helpful with my orders and couldn't have been nicer. Great customer service and good service. This is the best experience I have ever had at a pharmacy.
CPU times: user 49.4 s, sys: 271 ms, total: 49.7 s
Wall time: 2.52 s


In [None]:
%%time
# Beam search with 4 beams.
# T5Tokenizer defines an EOS token but no CLS/SEP tokens, so the previous
# bos/eos arguments (tok.cls_token_id / tok.sep_token_id) were None. Pass the
# real EOS id and let the model config supply the decoder start token.
output_beam = model.generate(
    input_ids=input_ids,
    attention_mask=att_mask,
    min_length=10,
    max_length=128,
    early_stopping=True,
    pad_token_id=tok.pad_token_id,
    eos_token_id=tok.eos_token_id,
    num_beams=4,
)

beamn_sent = tok.decode(output_beam[0], skip_special_tokens=True)
print(beamn_sent)

This is a great store! Great products, great prices, great prices. The pharmacy tech was really nice and had the sweetest disposition. The location is easy to get to and easy to get to and easy to get to. The guest at the front desk was super helpful with my orders and couldn't have asked for a better experience. I will definitely be back!
CPU times: user 48.6 s, sys: 476 ms, total: 49.1 s
Wall time: 2.48 s


In [None]:
%%time
evaluateSummary([greedy_sent],[text]),evaluateSummary([beam5_sent],[text]),evaluateSummary([beamn_sent],[text])

CPU times: user 8.18 ms, sys: 0 ns, total: 8.18 ms
Wall time: 8.13 ms


({'Rouge-1': 0.22929936305732482,
  'Rouge-2': 0.025806451612903226,
  'Rouge-L': 0.15286624203821655},
 {'Rouge-1': 0.23668639053254437,
  'Rouge-2': 0.02395209580838324,
  'Rouge-L': 0.14201183431952663},
 {'Rouge-1': 0.2767295597484277,
  'Rouge-2': 0.025477707006369428,
  'Rouge-L': 0.1509433962264151})

In [None]:
def eval_rogue(model, dev, data):
    """Generate greedy and beam-search summaries for ``data`` and score them.

    Args:
        model: model exposing ``generate``; it is moved to ``dev`` and put in
            eval mode.
        dev: device that the model and the input tensors are moved to.
        data: DataFrame accepted by ``dataset`` (needs ``text`` and ``phrases``
            columns). It is modified in place: ``'greedy'`` and ``'beam'``
            columns with the generated summaries are added.

    Returns:
        ``{'greedy': scores, 'beam': scores}`` where each ``scores`` is the
        dict returned by ``evaluateSummary`` with the ``text`` column as the
        reference.

    NOTE(review): ``dataset.get_target`` shuffles the phrases with
    ``np.random``, so repeated calls can build different prompts and therefore
    report different scores.
    """
    model.to(dev)
    model.eval()

    test_loader = torch.utils.data.DataLoader(dataset(data), batch_size=32)

    # Decoding options shared by both strategies. T5Tokenizer defines an EOS
    # token but no CLS/SEP tokens, so the EOS id is passed directly rather
    # than the (None) cls/sep ids used before.
    generate_kwargs = dict(
        min_length=10,
        max_length=50,
        early_stopping=True,
        pad_token_id=tok.pad_token_id,
        eos_token_id=tok.eos_token_id,
    )

    def decode_batch(sequences):
        """Decode every generated id sequence of a batch to plain text."""
        return [tok.decode(output, skip_special_tokens=True) for output in sequences]

    orig, greedy, beam = [], [], []
    for row in tqdm(test_loader):
        orig += row['review']
        input_ids = row['p_input_ids'].to(dev)
        att_mask = row['p_attention_mask'].to(dev)

        output_greed = model.generate(
            input_ids=input_ids,
            attention_mask=att_mask,
            **generate_kwargs)
        greedy += decode_batch(output_greed)

        output_beam = model.generate(
            input_ids=input_ids,
            attention_mask=att_mask,
            num_beams=4,
            **generate_kwargs)
        beam += decode_batch(output_beam)

    # Keep the generated text next to its source row for later inspection.
    data['greedy'] = greedy
    data['beam'] = beam
    print(len(orig), len(beam))

    return {'greedy':evaluateSummary(greedy, orig), 'beam':evaluateSummary(beam, orig)}

In [None]:
# Load the gold test split and name its columns: ids, count, `text`, the
# eight review_* columns, then extraction and phrases.
test_set = pd.read_csv("/content/data/yelp-default/test_gold_8_15_all_all_300_8.csv")
test_set.columns = (
    ['eid', 'rids', 'n', 'text']
    + [f'review_{i}' for i in range(8)]
    + ['extraction', 'phrases']
)

# Generates summaries for every row and adds them to test_set as new columns.
scores = eval_rogue(model, device, test_set)

100%|██████████| 7/7 [01:36<00:00, 13.78s/it]


200 200


In [None]:
scores
# {'beam': {'Rouge-1': 0.23833223304206289,
#   'Rouge-2': 0.03479748240427835,
#   'Rouge-L': 0.15303731099861098},
#  'greedy': {'Rouge-1': 0.240540043067736,
#   'Rouge-2': 0.03369968209532295,
#   'Rouge-L': 0.15454299725336362}}

# {'beam': {'Rouge-1': 0.2392596713943409,
#   'Rouge-2': 0.035799747079077696,
#   'Rouge-L': 0.15874841295479636},
#  'greedy': {'Rouge-1': 0.2369498751709391,
#   'Rouge-2': 0.03521518570567792,
#   'Rouge-L': 0.1629997963483346}}

{'beam': {'Rouge-1': 0.23731161836100853,
  'Rouge-2': 0.033559542929670234,
  'Rouge-L': 0.1559495411401135},
 'greedy': {'Rouge-1': 0.23818624673866842,
  'Rouge-2': 0.03621578068887971,
  'Rouge-L': 0.1577551971461499}}

In [None]:
test_set

Unnamed: 0,eid,rids,n,text,review_0,review_1,review_2,review_3,review_4,review_5,review_6,review_7,extraction,phrases,greedy,beam
0,fjufqwFSQrUhLqoYGTklHQ,15034726,8,"Fresh food, high quality food, delicious and M...",I tried to order steak kebob but they made bee...,Very delicious food in love with cucumber drin...,Woww! My order: Chicken Schwarma with a side...,I was thinking this would be more of a sit dow...,Parsley Modern Mediterranean is wonderful. Ver...,The food always taste fresh and leaves me very...,Now this place is really good i always drive p...,This is Chipotle for Mediterranean food. And i...,"good,Price,value-for-money,positive;okay,Taste...",good Price [SEP] okay Taste [SEP] Very delicio...,This is my favorite Mediterranean place in the...,This place is really good. Very delicious. Ver...
1,2JsLzYF8rUalwpm5LDEcog,91281413101511,8,The food is great here. It can be a little exp...,Food very good. Small unassuming atmosphere wi...,Fantastic place with phenomenal food. Very uni...,"just excellent. friendly staff, good food, fun...",This is my second time visiting --- and the fo...,"Very cool decor and atmosphere, but extremely ...",Over priced specials. Not good. The clam chowd...,Very good! Many items made from scratch and fi...,"The Falls' hidden gem--kooky decor, decadent f...","good,food,food -> quality,positive;excellent,s...",good food [SEP] excellent seafood [SEP] unassu...,"Fantastic place! Family atmosphere, friendly s...","Fantastic place. Family atmosphere, friendly s..."
2,6C_8Mh4lmLc_QEs3hHleBg,1822162321191720,8,Mexican food that reminds me of what my mom us...,"We call this "" Red Chicken. It's consistently ...",I like this place. The food here is like what ...,"It could be a lot better, but with no competit...",One of the best deals and delicious beef and c...,This is as close as the chicken that you will ...,My bro and I have found this place by chance. ...,"Nice simple Mexican food at a great price, lov...","To my homesick Indianz from Arizona, if you wa...","basic charbroiled,chicken,food -> quality,nega...",basic charbroiled chicken [SEP] love chicken [...,Nice simple Mexican food at a great price. The...,Nice simple Mexican food at a great price. I l...
3,5VXsbrqyJx0a4iaa43RNFA,2624252927302831,8,This is an ok breakfast joint. The food is ki...,Great hip breakfast in Gilbert. Food and waitr...,I think Over Easy is a very nice breakfast pla...,Not good at all. The young staff basically ign...,We love the service and food. The staff is alw...,Visited for the first time this morning and ev...,Had the biscuits and gravy...they were OK. ( B...,I want to give a 2 stars because the service s...,I really never post any reviews but this place...,"very nice,breakfast place,food -> quality,posi...",very nice breakfast place [SEP] great food [SE...,Great hip breakfast. I love the service and th...,This is a very nice breakfast place. The staff...
4,5iHctUjkQTGwEvOaBkwMRQ,3836343932353337,8,"This place has a nice buffet, which in itself ...",Awesome place for nice coffee and great delici...,This place is my new favorite! Their coffee is...,I was looking for just a place to get breakfas...,Great lunch spot downtown. Food tastes good. T...,I ate here this week and it was good place for...,We found this place walking toward another bre...,"This review is for the salad bar, I feel like ...",Let`s start with the good stuff first. Clean p...,"favorite,restaurant for lunch,food -> quality,...",favorite restaurant for lunch [SEP] soo good c...,This is my favorite restaurant for lunch. The ...,This is my favorite restaurant for lunch. The ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,4iACB0ppsvwA2mFvLJoNbA,15631566156215651564156115601567,8,This place is a pretty sweet tattoo parlor! i...,Brandon did a great job on my first tattoo. Wa...,Yooooo this place is dope!! We got there an ho...,"From Heidi: Justin is the bomb, he provided ex...",Kevin did a amazing job on the cover up i need...,Thank you Nick Giordono for the sick tattoo! I...,came here on a Saturday after calling to inqui...,Went in for a small tattoo on my wrist and had...,My twin sister and I just got our first tattoo...,"good,experience overall,restaurant -> atmosphe...",good experience overall [SEP] dope place [SEP]...,This place is dope! Terry is a fantastic busin...,This place is dope. Terry is a fantastic busin...
196,0uwQIGlKICIYuRtayVnh7g,15731571157415701572156915681575,8,This place is kind of weird. The food menu is ...,"Pointless to order on line, staff unapologet...",I've come here a few times before and never ha...,Was curious about this place. Pasta was hot an...,If chipotle had a stupid Italian cousin it wou...,"Disappointed. Plus, we were waiting in line an...",This is the worst piada i have been to. They w...,Nice place for good food when you're in a hurr...,"I used to love Piada, but this particular loca...","rude,staff,staff,negative;out of several ingre...",rude staff [SEP] out of several ingredients st...,"Nice place, rude staff, long lines, awkward fo...","Nice place, rude staff, long lines, awkward fo..."
197,4yAvytbVKHqSYAo3mkI9OA,15801583157615821577157815791581,8,"This place has a excellent price point, and th...",I came accidentally to this place and guess wh...,Good food. Nice people. The beef tikka is defi...,"Food is very good, service and atmosphere real...",Food is ok and definitely not PERSIAN. If you ...,"Best kabob in town, I wish I could give more s...","Perfect in every way! The spices, the tenderne...","Yum! Hubs brought home "" to go "" order. Super ...",Wow ...... This place is the bomb. Have been h...,"very good,Food,food -> quality,positive;best,l...",very good Food [SEP] best lunches [SEP] delici...,I really appreciate this restaurant. Nice peop...,One of the best lunches I've had in a long tim...
198,4c19YWOjPmbFUK4-V2GEvg,15841588158715851590158615911589,8,"This place is just ""meh."" I wish the menu had...",I've only eaten appetizers here but they are a...,I really love the idea of this place (menus ta...,As a teacher you would think I would love this...,"Went here yesterday, my girlfriend ordered a b...","It's "" i eat "" I guess. Food is nothing to WRI...","Food is decent, and the drinks are ok. Not a h...","Trivia on tuesday was fun, and the half off ap...",The bald bartender is the worst I've ever seen...,"Huge,portions-,food-quantity,positive;good-,dr...",Huge portions- [SEP] good- drinks [SEP] very c...,This is a very cool option for pub food. Huge ...,This is a very cool option for pub food. Huge ...


In [None]:
!zip op_Roberta60k_10.zip my_model

  adding: my_model (deflated 11%)


In [None]:
# Import PyDrive and associated libraries.
# This only needs to be done once in a notebook.
# !pip install httplib2==0.15.0 -q
# !pip install pydrive --upgrade -q
# !pip install google-api-python-client==1.6 -q

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


In [None]:
# Create a Drive file from the local zip archive and upload it.
uploaded = drive.CreateFile()
uploaded.SetContentFile('op_Roberta60k_10.zip')
uploaded.Upload()

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Stand-alone example with the stock t5-base checkpoint.
# NOTE: this rebinds `model` and `input_ids`, replacing the fine-tuned model
# and the prompt tensor defined in earlier cells.
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base', return_dict=True)
# Training-style forward pass: sentinel-masked input with its target as labels.
input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
outputs = model(input_ids=input_ids, labels=labels)
loss = outputs.loss
logits = outputs.logits
# Inference: `input_ids` and `outputs` are reused for a summarization prompt.
input_ids = tokenizer("summarize: studies have shown that owning a dog is good for you ", return_tensors="pt").input_ids  # Batch size 1
outputs = model.generate(input_ids)

In [None]:
tokenizer.decode(outputs[0])

'owning a dog is good for you, according to studies. a dog is'

In [None]:
encoded_text = tokenizer.encode_plus(
            train.iloc[0]['phrases'],
            truncation=True,
            max_length=128,
            add_special_tokens=True,
            padding=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt',
        )

In [None]:
tokenizer.decode(encoded_text.input_ids[0])

'really old theater [SEP] super small theaters [SEP] bit pricey movies [SEP] friendly Staff'

In [None]:
dict(outputs).keys()

dict_keys(['last_hidden_state', 'past_key_values', 'encoder_last_hidden_state'])

In [None]:
train.iloc[0]['phrases']

'really old theater [SEP] super small theaters [SEP] bit pricey movies [SEP] friendly Staff'