<a href="https://colab.research.google.com/github/MinhongW/text_generation/blob/main/fine_tune_t5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [93]:
import pandas as pd
import json
import math
import platform
import sys
import tensorflow as tf

In [None]:
! pip install datasets transformers rouge-score nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
print(f"Python Platform: {platform.platform()}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
gpu = len(tf.config.list_physical_devices('GPU'))>0
print("GPU is", "available" if gpu else "NOT AVAILABLE")

Python Platform: Linux-5.10.147+-x86_64-with-glibc2.31

Python 3.9.16 (main, Dec  7 2022, 01:11:51) 
[GCC 9.4.0]
Pandas 1.5.3
GPU is available


In [None]:
!git clone https://github.com/MinhongW/text_generation.git

fatal: destination path 'text_generation' already exists and is not an empty directory.


In [None]:
#!cd text_generation

In [None]:
t1 = open('text_generation/data/table_train.json')
t2 = open('text_generation/data/table_desc_train.json')
t3 = open('text_generation/data/paper_train.json')

v1 = open('text_generation/data/table_val.json')
v2 = open('text_generation/data/table_desc_val.json')
v3 = open('text_generation/data/paper_val.json')

te1 = open('text_generation/data/table_test.json')
te2 = open('text_generation/data/table_desc_test.json')
te3 = open('text_generation/data/paper_test.json')

In [None]:
# from google.colab import files
# uploaded = files.upload()

In [None]:
# t1 = open('table_train.json')
# t2 = open('table_desc_train.json')
# t3 = open('paper_train.json')

# v1 = open('table_val.json')
# v2 = open('table_desc_val.json')
# v3 = open('paper_val.json')

# te1 = open('table_test.json')
# te2 = open('table_desc_test.json')
# te3 = open('paper_test.json')

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
tables_train = json.load(t1)
descs_train = json.load(t2)
papers_train = json.load(t3)

tables_val = json.load(v1)
descs_val = json.load(v2)
papers_val = json.load(v3)

tables_test = json.load(te1)
descs_test = json.load(te2)
papers_test = json.load(te3)

In [None]:
# double check the order in table file and desc file

for i in range(len(tables_train)):
    table_id1 = tables_train[i]['table_id_paper']
    table_id2 = descs_train[i]['table_id_paper']
    if table_id1 != table_id2:
               print('oops')

# Naive representation

Simply flatten T into a sequence ignoring its table structure by concatenating captions, headers, metrics and targeted cell values.

In [None]:
def naive_representation(tables, descs):
    """
    Input_text is generated by naive representation of the tables.
    Each table is simply flattened into a sequence ignoring its table structure
    by concatenating captions, headers, metrics and targeted cell values.
    Target_text is the description of the corresponding table.
    Returns a df contains input_text and target_text
    
    """
    
    data = {'input_text':[],
           'target_text':[]}
    
    for i in range(len(tables)):
        table = tables[i]
        caption = table['table_id'] + ' ' + table['caption']
        row_names = ' '.join(' '.join(x) for x in table['row_headers'])
        col_names = ' '.join(' '.join(x) for x in table['column_headers'])
        metrics = ' '.join(table['metrics_type'])
        values = ' '.join(' '.join(x) for x in table['contents'])        
        tmp = [caption, row_names, col_names, metrics, values]
        text = ' '.join(tmp)
        
        desc = descs[i]['description']        
        
        data['input_text'].append(text)
        data['target_text'].append(desc)
    
    df = pd.DataFrame(data)      
    
    return df

In [None]:
df_train = naive_representation(tables_train, descs_train)
df_val = naive_representation(tables_val, descs_val)
df_test = naive_representation(tables_test, descs_test)

In [None]:
df_train.head()

Unnamed: 0,input_text,target_text
0,table_2 Comparison of different position featu...,Table 2 summarizes the performances of propose...
1,table_3 Pearson correlation values between hum...,Table 3 presents the correlation results for t...
2,table_4 Comparison between rationale models (m...,Results. Table 4 presents the results of our r...
3,table_2 Spearman’s rank correlation results on...,Table 2 shows the results of our contextdepend...
4,table_4 Examples of attention weights in diffe...,"From Table 4, we can find that in the first ho..."


In [None]:
#df_train['target_text'][3]

In [None]:
from datasets import Dataset
ds_train = Dataset.from_pandas(df_train)
ds_val = Dataset.from_pandas(df_val)
ds_test = Dataset.from_pandas(df_test)

In [None]:
df_test.shape

(135, 2)

# T5 MODEL

In [None]:
# import torch
# from transformers import T5Tokenizer, T5ForConditionalGeneration, Adafactor

In [None]:
import torch
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("Running on the GPU")
else:
    device = torch.device("cpu")
    print("Running on the CPU")

Running on the GPU


In [None]:
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


## Training with trainer API

In [99]:
from transformers import AutoTokenizer
#from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import T5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, load_metric

In [100]:
# !pip install sentencepiece

In [101]:
model_checkpoint = "t5-small"

# AutoTokenizer will automatically select T5tokenizer as we specify a T5 model architecture here
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
def tokenize_function(examples):
    model_inputs = tokenizer(examples["input_text"], padding="max_length", truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
      labels = tokenizer(examples["target_text"], padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


ds_train_tokenized = ds_train.map(tokenize_function, batched=True)
ds_val_tokenized = ds_val.map(tokenize_function, batched=True)
ds_test_tokenized = ds_test.map(tokenize_function, batched=True)

Map:   0%|          | 0/1084 [00:00<?, ? examples/s]



Map:   0%|          | 0/136 [00:00<?, ? examples/s]

Map:   0%|          | 0/135 [00:00<?, ? examples/s]

In [None]:
ds_train_tokenized

Dataset({
    features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1084
})

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [None]:
batch_size = 4
model_name = "t5-test"
model_dir = f"model/{model_name}"

In [None]:
# model_init = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

In [None]:
# # Freeze the first 2 layers of the encoder and the last 2 layers of the decoder
# for name, param in model_init.named_parameters():
#     if 'encoder.block.0' in name or 'encoder.block.1' in name or 'decoder.block.10' in name or 'decoder.block.11' in name:
#         param.requires_grad = False

In [None]:
# just for building the structure, change it later
args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1"
    #report_to="tensorboard",
)

In [None]:
#!pip install evaluate

In [None]:
import nltk
nltk.download('punkt')
import string

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import numpy as np

metric = load_metric("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels]
    
    # Compute ROUGE scores
    # Return rouge1, rouge2, rougeL, rougeLsum
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# # Function that returns an untrained model to be trained
# def model_init():
#     # Freeze the first 2 layers of the encoder and the last 2 layers of the decoder
#     model_init = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
#     for name, param in model_init.named_parameters():
#         if 'encoder.block.0' in name or 'encoder.block.1' in name or 'decoder.block.10' in name or 'decoder.block.11' in name:
#             param.requires_grad = False
#     return model_init

# Function that returns an untrained model to be trained
def model_init():
    return T5ForConditionalGeneration.from_pretrained(model_checkpoint)

model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
for name, param in model.named_parameters():
    if 'encoder.block.0' in name or 'encoder.block.1' in name or 'decoder.block.10' in name or 'decoder.block.11' in name:
        param.requires_grad = False


# use seq2seqtrainer (sub class of trainer)
# cause we need to predict with generate and evaluate with rouge
trainer = Seq2SeqTrainer(
    #model_init=model_init, # model_init has to be callable
    model=model,
    args=args,
    train_dataset=ds_train_tokenized,
    eval_dataset=ds_val_tokenized,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# seq2seqtrainer will automatically use gpu

trainer.train()



Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,5.3298,1.656298,0.0,0.0,0.0,0.0,0.0
200,1.6003,1.500811,0.007,0.0,0.007,0.007,0.1397
300,1.6226,1.453208,4.4173,1.7006,3.5594,4.1621,6.4265
400,1.4936,1.433815,14.8585,5.9473,12.1666,14.0984,17.1838
500,1.4463,1.419248,16.586,6.6139,13.7548,15.6958,18.0221
600,1.3866,1.408521,16.8236,6.8893,13.9563,15.9733,18.5809
700,1.4518,1.399142,17.2403,6.934,14.0928,16.2368,18.8603
800,1.4613,1.391177,17.2841,7.0346,14.0509,16.2747,18.8603
900,1.4038,1.384614,17.4418,7.1696,14.1343,16.489,19.0
1000,1.4039,1.378844,17.734,7.2477,14.3725,16.706,19.0


TrainOutput(global_step=2710, training_loss=1.5501806231002526, metrics={'train_runtime': 872.9542, 'train_samples_per_second': 12.418, 'train_steps_per_second': 3.104, 'total_flos': 1467105127956480.0, 'train_loss': 1.5501806231002526, 'epoch': 10.0})

In [None]:
trainer.save_model()

In [None]:
model_dir = f"model/{model_name}"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)

max_input_length = 512

In [None]:
df_test['input_text'][0]

'table_5 Link prediction results on the test-I, test-II, and test-all sets of FB122 and WN18 (filtered setting). FB122 TransE FB122 TransH FB122 TransR FB122 KALE-Trip FB122 KALE-Pre FB122 KALE-Joint WN18 TransE WN18 TransH WN18 TransR WN18 KALE-Trip WN18 KALE-Pre WN18 KALE-Joint Test-I MRR Test-I MED Test-I HITS@3 (%) Test-I HITS@5 (%) Test-I HITS@10 (%) Test-II MRR Test-II MED Test-II HITS@3 (%) Test-II HITS@5 (%) Test-II HITS@10 (%) Test-ALL MRR Test-ALL MED Test-ALL HITS@3 (%) Test-ALL HITS@5 (%) Test-ALL HITS@10 (%) MRR MED HITS@3 (%) HITS@5 (%) HITS@10 (%) MRR MED HITS@3 (%) HITS@5 (%) HITS@10 (%) MRR MED HITS@3 (%) HITS@5 (%) HITS@10 (%) 0.296 13.0 36.0 41.5 48.1 0.630 2.0 77.5 82.8 88.4 0.480 2.0 58.9 64.2 70.2 0.280 15.0 33.6 39.1 46.4 0.606 2.0 70.1 75.4 82.0 0.460 3.0 53.7 59.1 66.0 0.283 16.0 33.4 39.2 46.0 0.499 2.0 57.0 63.2 70.1 0.401 5.0 46.4 52.4 59.3 0.299 10.0 36.6 42.9 50.2 0.650 2.0 79.0 83.4 88.7 0.492 2.0 59.9 65.2 71.4 0.291 11.0 35.8 41.9 49.8 0.713 1.0 82.9 86

In [None]:
input_text = df_test['input_text'][0]


input_ids = tokenizer.encode(input_text, truncation=True, return_tensors='pt')

# inputs = tokenizer(text, max_length=max_input_length, truncation=True, return_tensors="pt")
# output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=10, max_length=64)
# decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
# predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]
output = model.generate(input_ids=input_ids, max_length=512)
tokenizer.decode(output[0])

'<pad> Table 5 shows the results of the test-I, test-II, and test-all sets of FB122 and WN18 (filtered setting). The results of the test-II and test-all sets of FB122 and WN18 (filtered setting) are compared to the test-all sets of FB122 and WN18 (filtered setting). The results of the test-all sets are based on the test-I, test-II, and test-all sets. The results of the test-II is a significant improvement in the performance of the test-II. The test-II and test-all sets. The test-all sets are a good example of the test-all sets of FB122 and WN18 (filtered setting) are a significant improvement. The results are not a significant improvement in the test-all sets.</s>'

In [None]:
ds_test_tokenized

Dataset({
    features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 135
})

## Evluate the model on the test set

In [None]:
# import torch

# model.to(device)

# # prepare dataloader
# ds_test_tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask'])
# dataloader = torch.utils.data.DataLoader(ds_test_tokenized, batch_size=4)

# dataloader.to(device)

# # generate text for each batch
# all_predictions = []
# for i,batch in enumerate(dataloader):
#   predictions = model.generate(**batch, max_length=512)
#   all_predictions.append(predictions)

# # flatten predictions
# all_predictions_flattened = [pred for preds in all_predictions for pred in preds]

# # tokenize and pad titles
# all_target_texts = tokenizer(ds_test_tokenized["target_text"], max_length=512,
#                              truncation=True, padding="max_length")["input_ids"]

# # compute metrics
# predictions_labels = [all_predictions_flattened, all_target_texts]
# compute_metrics(predictions_labels)

In [None]:
from torch.utils.data import DataLoader

model.to(device)

# define the collate function
def collate_fn(inputs):
    # tokenize the inputs and return a dictionary with input_ids and their lengths
    input_ids = tokenizer.batch_encode_plus(inputs, padding=True, truncation=True, return_tensors='pt')
    input_ids = input_ids['input_ids'].to(device)
    input_lengths = torch.sum(input_ids != tokenizer.pad_token_id, dim=1)
    return {'input_ids': input_ids, 'input_lengths': input_lengths}

# create the DataLoader
test_dataset = DataLoader(ds_test['input_text'], batch_size=4, collate_fn=collate_fn)

generated_outputs = []

model.eval()
with torch.no_grad():
    for batch in test_dataset:
        # move the batch to the GPU
        batch = {k: v.to(device) for k, v in batch.items()}
        # pass the inputs through the model to generate output
        generated_ids = model.generate(
            input_ids=batch['input_ids'],
            attention_mask=batch['input_ids'].ne(tokenizer.pad_token_id),
            max_length=512,
            num_beams=4,   # double check here
            early_stopping=True
        )
        # convert generated ids to text
        generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        # append generated texts to the output list
        generated_outputs.extend(generated_texts)

In [None]:
#df_test.head()

In [None]:
target_texts = df_test['target_text'].to_list() # a list of target strings

# Rouge expects a newline after each sentence
generated_outputs = ["\n".join(nltk.sent_tokenize(output.strip()))
                    for output in generated_outputs]
target_texts = ["\n".join(nltk.sent_tokenize(label.strip())) 
                for label in target_texts]


scores = metric.compute(predictions=generated_outputs, references=target_texts,
                    use_stemmer=True)
scores = {key: value.mid.fmeasure * 100 for key, value in scores.items()}
#scores = {key: round(value.mid.fmeasure, 4) * 100 for key, value in scores.items()}

In [None]:
scores

{'rouge1': 26.038172337801946,
 'rouge2': 8.941688459290196,
 'rougeL': 19.86547792066658,
 'rougeLsum': 22.640253511185882}

In [None]:
# # define the target texts
# target_texts = df_test['target_text'].to_list() # a list of target strings

# scores = metric.compute(predictions=generated_outputs, references=target_texts,
#                     use_stemmer=True) # double check use_stemmer
# # extract the F1 score
# #f1_score = scores[0]['rouge-1']['f']
# # add the F1 score to the list of Rouge scores
# #rouge_scores.append(f1_score)
# scores = {key: value.mid.fmeasure * 100 for key, value in scores.items()}

## Old code for fine-tuning T5 model

In [None]:
# Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# need to double check here, do we need to use prefix for t5
model_checkpoint = 't5-small'
if model_checkpoint in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""

In [None]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
#moving the model to GPU
model.to(dev)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [None]:
batch_size=4
num_of_batches=math.ceil(len(df_train)/batch_size)
num_of_batches

271

In [None]:
# Initiating the Adafactor optimizer with recommended T5 settings
optimizer = Adafactor(model.parameters(),lr=1e-3,
                      eps=(1e-30, 1e-3),
                      clip_threshold=1.0,
                      decay_rate=-0.8,
                      beta1=None,
                      weight_decay=0.0,
                      relative_step=False,
                      scale_parameter=False,
                      warmup_init=False)

In [None]:
# Html based progress bar

from IPython.display import HTML, display
def progress(loss,value, max=100):
    return HTML(""" Batch loss :{loss}
      <progress    
    value='{value}'max='{max}',style='width: 100%'>{value}
      </progress>
             """.format(loss=loss,value=value, max=max))

In [None]:
#Sets the module in training mode

num_of_epochs = 15

model.train()

loss_per_10_steps = []
for epoch in range(1,num_of_epochs+1):
    print('Running epoch: {}'.format(epoch))
  
    running_loss=0

    out = display(progress(1, num_of_batches+1), display_id=True)
    for i in range(num_of_batches):
        inputbatch = []
        labelbatch = []
        new_df = df_train[i*batch_size:i*batch_size+batch_size]
        for indx,row in new_df.iterrows():
            inputs = row['input_text']+'</s>' 
            labels = row['target_text']+'</s>'   
            inputbatch.append(inputs)
            labelbatch.append(labels)
        inputbatch = tokenizer.batch_encode_plus(inputbatch,padding=True,truncation=True,return_tensors='pt')["input_ids"]
        labelbatch = tokenizer.batch_encode_plus(labelbatch,padding=True,truncation=True,return_tensors="pt")["input_ids"]
        inputbatch = inputbatch.to(dev)
        labelbatch = labelbatch.to(dev)

        # clear out the gradients of all Variables 
        optimizer.zero_grad()

        # Forward propogation
        outputs = model(input_ids=inputbatch, labels=labelbatch)
        loss = outputs.loss
        loss_num=loss.item()
        logits = outputs.logits
        running_loss+=loss_num
        if i%10 ==0:      
            loss_per_10_steps.append(loss_num)
        out.update(progress(loss_num,i, num_of_batches+1))

        # calculating the gradients
        loss.backward()

        #updating the params
        optimizer.step()

    running_loss=running_loss/int(num_of_batches)
    print('Epoch: {} , Running loss: {}'.format(epoch,running_loss))


Running epoch: 1




Epoch: 1 , Running loss: 1.0054126112443496
Running epoch: 2


Epoch: 2 , Running loss: 0.9616369956973734
Running epoch: 3


Epoch: 3 , Running loss: 0.9205098451283585
Running epoch: 4


Epoch: 4 , Running loss: 0.8800978352662822
Running epoch: 5


Epoch: 5 , Running loss: 0.8392817216935633
Running epoch: 6


Epoch: 6 , Running loss: 0.8062150707653968
Running epoch: 7


Epoch: 7 , Running loss: 0.7733110055822288
Running epoch: 8


Epoch: 8 , Running loss: 0.7404188756793187
Running epoch: 9


Epoch: 9 , Running loss: 0.7065555720531632
Running epoch: 10


Epoch: 10 , Running loss: 0.6784806173995852
Running epoch: 11


Epoch: 11 , Running loss: 0.6477029707928865
Running epoch: 12


Epoch: 12 , Running loss: 0.6241181062816253
Running epoch: 13


Epoch: 13 , Running loss: 0.5957367826541852
Running epoch: 14


Epoch: 14 , Running loss: 0.5730258386953291
Running epoch: 15


Epoch: 15 , Running loss: 0.5488903985472183


In [None]:
# torch.save(model.state_dict(),'pytorch_model.bin')

In [None]:
df_test.head()

Unnamed: 0,input_text,target_text
0,"table_5 Link prediction results on the test-I,...",Results. Table 5 show the results in the filte...
1,table_3 Comparison on validation perplexity. B...,"In Figure 3, we present the validation perplex..."
2,table_4 Performance on maximally covered datas...,5.1 Maximum coverage comparison. Table 4 shows...
3,table_4 Open-ended and multiple-choice (MC) re...,4.4 Comparison to State-of-the-Art. Table 4 co...
4,table_1 Corpus size (length in token) and syst...,Results. From the whole corpus of 300 sentence...


In [None]:
input_text = df_test['input_text'][18]
input_text

'table_5 Performance on monolingual word similarity computation with seed lexicon size 6000. Method BiLex Method CLSP-WR Method CLSP-SE Chinese (source) WS-240 Chinese (source) WS-297 English (target) WS-353 English (target) SL-999 accuracy accuracy accuracy accuracy 60.36 62.17 60.46 27.22 61.27 65.25 60.46 27.22 60.84 65.62 62.47 28.79'

In [None]:

model.to(dev)

input_ids = tokenizer.encode(input_text, truncation=True, return_tensors='pt')
input_ids = input_ids.to(dev)
output = model.generate(input_ids=input_ids, max_length=512)
tokenizer.decode(output[0])

'<pad> Table 5 shows the performance of a CLSP-SE model on a monolingual word similarity. We can see that CLSP-WR performs well on a language model with Japanese (source) and Chinese (source) on the Chinese dataset (see the SL-999 dataset). It is also noticeable that on SL-999 (English) and SL-999 ( English), CLSP-SE performs better than T-Ls (WS-297) in SL-999 (WS-297 ( English).</s>'

## Playground

In [None]:
import transformers
from datasets import load_dataset, load_metric

In [None]:
dataset = load_dataset("yelp_review_full")

Downloading builder script:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.55k [00:00<?, ?B/s]

Downloading and preparing dataset yelp_review_full/yelp_review_full to /root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf...


Downloading data:   0%|          | 0.00/196M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset yelp_review_full downloaded and prepared to /root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})