<a href="https://colab.research.google.com/github/MinhongW/text_generation/blob/main/build_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import json

In [2]:
!pip install transformers datasets SentencePiece rouge-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!git clone https://github.com/MinhongW/text_generation.git

fatal: destination path 'text_generation' already exists and is not an empty directory.


In [4]:
t1 = open('text_generation/data/table_train.json')
t2 = open('text_generation/data/table_desc_train.json')
t3 = open('text_generation/data/paper_train.json')

v1 = open('text_generation/data/table_val.json')
v2 = open('text_generation/data/table_desc_val.json')
v3 = open('text_generation/data/paper_val.json')

te1 = open('text_generation/data/table_test.json')
te2 = open('text_generation/data/table_desc_test.json')
te3 = open('text_generation/data/paper_test.json')

In [5]:
tables_train = json.load(t1)
descs_train = json.load(t2)
papers_train = json.load(t3)

tables_val = json.load(v1)
descs_val = json.load(v2)
papers_val = json.load(v3)

tables_test = json.load(te1)
descs_test = json.load(te2)
papers_test = json.load(te3)

In [6]:
def naive_representation(tables, descs):
    """
    Input_text is generated by naive representation of the tables.
    Each table is simply flattened into a sequence ignoring its table structure
    by concatenating captions, headers, metrics and targeted cell values.
    Target_text is the description of the corresponding table.
    Returns a df contains input_text and target_text
    
    """
    
    data = {'input_text':[],
           'target_text':[]}
    
    for i in range(len(tables)):
        table = tables[i]
        caption = table['table_id'] + ' ' + table['caption']
        row_names = ' '.join(' '.join(x) for x in table['row_headers'])
        col_names = ' '.join(' '.join(x) for x in table['column_headers'])
        metrics = ' '.join(table['metrics_type'])
        values = ' '.join(' '.join(x) for x in table['contents'])        
        tmp = [caption, row_names, col_names, metrics, values]
        text = ' '.join(tmp)
        
        desc = descs[i]['description']        
        
        data['input_text'].append(text)
        data['target_text'].append(desc)
    
    df = pd.DataFrame(data)      
    
    return df

In [7]:
df_train = naive_representation(tables_train, descs_train)
df_val = naive_representation(tables_val, descs_val)
df_test = naive_representation(tables_test, descs_test)

In [8]:
df_train.head()

Unnamed: 0,input_text,target_text
0,table_2 Comparison of different position featu...,Table 2 summarizes the performances of propose...
1,table_3 Pearson correlation values between hum...,Table 3 presents the correlation results for t...
2,table_4 Comparison between rationale models (m...,Results. Table 4 presents the results of our r...
3,table_2 Spearman’s rank correlation results on...,Table 2 shows the results of our contextdepend...
4,table_4 Examples of attention weights in diffe...,"From Table 4, we can find that in the first ho..."


# Build RNN model by pulling out the last layer of T5 model as embeddings

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer, T5Model


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [62]:
# Define hyperparameters
batch_size = 32
#embedding_dim = 128
hidden_dim = 256
num_layers = 2
lr = 1e-3
num_epochs = 10

# Define custom dataset and dataloader
class TextDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        input_text = self.df.iloc[index]["input_text"]
        target_text = self.df.iloc[index]["target_text"]
        input_tokens = self.tokenizer.encode(input_text, add_special_tokens=False, padding='max_length', truncation=True)
        target_tokens = self.tokenizer.encode(target_text, add_special_tokens=False, padding='max_length', truncation=True)
        return torch.tensor(input_tokens), torch.tensor(target_tokens)

def collate_fn(batch):
    input_batch = [item[0] for item in batch]
    target_batch = [item[1] for item in batch]
    input_padded = pad_sequence(input_batch, batch_first=True, padding_value=0)
    target_padded = pad_sequence(target_batch, batch_first=True, padding_value=0)
    return input_padded, target_padded

# Prepare data
tokenizer = AutoTokenizer.from_pretrained('t5-small')
model_t5 = T5Model.from_pretrained('t5-small')
train_dataset = TextDataset(df_train, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataset = TextDataset(df_val, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_dataset = TextDataset(df_test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)


# Freeze T5 parameters
for param in model_t5.parameters():
    param.requires_grad = False

t5_embedding = model_t5.get_input_embeddings()


# Define model architecture
class LSTMGenerator(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_layers):
        super(LSTMGenerator, self).__init__()
        #self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding = t5_embedding
        #self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.lstm = nn.LSTM(t5_embedding.embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x):
        x = self.embedding(x)
        output, _ = self.lstm(x)
        output = self.fc(output)
        return output

#vocab_size = tokenizer.vocab_size
vocab_size = tokenizer.vocab_size
model = LSTMGenerator(t5_embedding.embedding_dim, hidden_dim, vocab_size, num_layers)
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)     

In [11]:
# def calculate_rouge(all_preds, all_targets):
#     rouge_scores = []
#     for i in range(len(all_preds)):
#         pred = all_preds[i]
#         target = all_targets[i]
#         pred_str = tokenizer.decode(pred, skip_special_tokens=True)
#         target_str = tokenizer.decode(target, skip_special_tokens=True)
#         rouge_scores.append(rouge.compute(predictions=[pred_str], references=[target_str])["rouge2"].fmeasure)

#     return sum(rouge_scores) / len(rouge_scores)

In [51]:
!pip install evaluate nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [26]:
# import evaluate

# rouge_metric = evaluate.load('rouge')

# def calculate_rouge(all_preds, all_targets):
#     rouge_scores = []
#     for i in range(len(all_preds)):
#         pred = all_preds[i]
#         target = all_targets[i]
#         pred_str = tokenizer.decode(pred, skip_special_tokens=True)
#         target_str = tokenizer.decode(target, skip_special_tokens=True)
#         results = rouge_metric.compute(predictions=pred_str, references=target_str, use_stemmer=True)
#         # Extract ROUGE f1 scores
#         result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
#         rouge_scores.append(scores)

#     return sum(rouge_scores) / len(rouge_scores)


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [63]:
import evaluate
import numpy as np

import nltk
nltk.download('punkt')
import string
     

rouge_metric = evaluate.load('rouge')

def calculate_rouge(all_preds, all_targets):
    rouge_scores = []
    decoded_preds = tokenizer.batch_decode(all_preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(all_targets, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels]

    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract ROUGE f1 scores
    print(result)
    #result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    result = {key: value * 100 for key, value in result.items()} # changed load_metric to evaluate, hence there is a change here as well

    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in all_preds]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [64]:
# Train model
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, batch in enumerate(train_loader):
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        #outputs = outputs[:, :-1, :].contiguous().view(-1, vocab_size)
        #targets = targets[:, 1:].contiguous().view(-1)
        #loss = criterion(outputs, targets)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1).to(device))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 10 == 9:    # print every 10 batches
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}/{len(train_loader)}], Loss: {running_loss/100:.4f}")
            running_loss = 0.0

    # run validation after each epoch
    model.eval() # set model to evaluation mode
    with torch.no_grad():
        val_loss = 0.0
        all_preds = []
        all_targets = []
        for batch in val_loader:
            inputs, targets = batch
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1).to(device))
            val_loss += loss.item()
            preds = torch.argmax(outputs, dim=2)
            all_preds.extend(preds.tolist())
            all_targets.extend(targets.tolist())

        val_loss /= len(val_loader)
        rouge_scores = calculate_rouge(all_preds, all_targets)
        print(rouge_scores)
        #print(f"Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}, Validation Rouge: {rouge_score:.4f}")

Epoch [1/10], Batch [10/34], Loss: 0.8145
Epoch [1/10], Batch [20/34], Loss: 0.3643
Epoch [1/10], Batch [30/34], Loss: 0.3261
{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0, 'gen_len': 1.0}
Epoch [2/10], Batch [10/34], Loss: 0.2958
Epoch [2/10], Batch [20/34], Loss: 0.3091
Epoch [2/10], Batch [30/34], Loss: 0.2990
{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0, 'gen_len': 1.0}
Epoch [3/10], Batch [10/34], Loss: 0.2938
Epoch [3/10], Batch [20/34], Loss: 0.2910
Epoch [3/10], Batch [30/34], Loss: 0.2839
{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0, 'gen_len': 1.0}
Epoch [4/10], Batch [10/34], Loss: 0.2867
Epoch [4/10], Batch [20/34], Loss: 0.2869
Epoch [4/10], Batch [30/34], Loss: 0.2832
{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}


## Evaluate model on test set

In [66]:
model.eval() # set model to evaluation mode
with torch.no_grad():
    test_loss = 0.0
    all_preds = []
    all_targets = []
    for batch in test_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1).to(device))
        test_loss += loss.item()
        preds = torch.argmax(outputs, dim=2)
        all_preds.extend(preds.tolist())
        all_targets.extend(targets.tolist())

    test_loss /= len(test_loader)
    rouge_scores = calculate_rouge(all_preds, all_targets)
    print(rouge_scores)

{'rouge1': 0.10152991205489142, 'rouge2': 0.00013811668379121495, 'rougeL': 0.09617968215066196, 'rougeLsum': 0.09939267483683162}
{'rouge1': 10.153, 'rouge2': 0.0138, 'rougeL': 9.618, 'rougeLsum': 9.9393, 'gen_len': 1.0}


In [98]:
preds

tensor([[4398, 4398, 4398,  ...,    0,    0,    0],
        [4398, 4398, 4398,  ...,    0,    0,    0],
        [4398, 4398, 4398,  ...,    0,    0,    0],
        ...,
        [4398, 4398, 4398,  ...,    0,    0,    0],
        [4398, 4398, 4398,  ...,    0,    0,    0],
        [4398, 4398, 4398,  ...,    0,    0,    0]], device='cuda:0')

In [69]:
# print out some examples
input_text = df_test['input_text'][0]
input_text

'table_5 Link prediction results on the test-I, test-II, and test-all sets of FB122 and WN18 (filtered setting). FB122 TransE FB122 TransH FB122 TransR FB122 KALE-Trip FB122 KALE-Pre FB122 KALE-Joint WN18 TransE WN18 TransH WN18 TransR WN18 KALE-Trip WN18 KALE-Pre WN18 KALE-Joint Test-I MRR Test-I MED Test-I HITS@3 (%) Test-I HITS@5 (%) Test-I HITS@10 (%) Test-II MRR Test-II MED Test-II HITS@3 (%) Test-II HITS@5 (%) Test-II HITS@10 (%) Test-ALL MRR Test-ALL MED Test-ALL HITS@3 (%) Test-ALL HITS@5 (%) Test-ALL HITS@10 (%) MRR MED HITS@3 (%) HITS@5 (%) HITS@10 (%) MRR MED HITS@3 (%) HITS@5 (%) HITS@10 (%) MRR MED HITS@3 (%) HITS@5 (%) HITS@10 (%) 0.296 13.0 36.0 41.5 48.1 0.630 2.0 77.5 82.8 88.4 0.480 2.0 58.9 64.2 70.2 0.280 15.0 33.6 39.1 46.4 0.606 2.0 70.1 75.4 82.0 0.460 3.0 53.7 59.1 66.0 0.283 16.0 33.4 39.2 46.0 0.499 2.0 57.0 63.2 70.1 0.401 5.0 46.4 52.4 59.3 0.299 10.0 36.6 42.9 50.2 0.650 2.0 79.0 83.4 88.7 0.492 2.0 59.9 65.2 71.4 0.291 11.0 35.8 41.9 49.8 0.713 1.0 82.9 86

In [78]:
input_tokens = tokenizer.encode(input_text, add_special_tokens=False, padding='max_length', truncation=True)
input = torch.tensor(input_tokens)

In [80]:
input = input.to(device)

In [76]:
model.to(device)

LSTMGenerator(
  (embedding): Embedding(32128, 512)
  (lstm): LSTM(512, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=32100, bias=True)
)

In [81]:
output = model(input)

In [93]:
output

tensor([[ 3.3844, -3.1741,  1.6518,  ..., -3.7198, -3.4341, -3.4878],
        [ 4.3015, -4.3983,  2.4005,  ..., -5.0663, -5.0490, -4.8750],
        [ 4.9788, -4.9176,  2.7413,  ..., -5.5665, -5.7839, -5.5590],
        ...,
        [11.2103, -6.7511,  2.8730,  ..., -7.6216, -7.7668, -8.1272],
        [11.3172, -6.7299,  2.8132,  ..., -7.6593, -7.8201, -8.1341],
        [11.4348, -6.8158,  2.8811,  ..., -7.7128, -7.8758, -8.2143]],
       device='cuda:0', grad_fn=<AddmmBackward0>)

In [95]:
pred = torch.argmax(output, dim=1)
pred

tensor([4398, 4398, 4398,    8,    8,    8,    8,    8,    8,    8,    8,    8,
           8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,
           8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,
           8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,
           8,    8,    8,    8,    0,    8,    8,    8,    8,    8,    8,    8,
           8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,
           8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,
           8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,
           8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,    8,
           8,    8,    8,    8,    8,    8,    8,    8,    8,    0,    8,    8,
           8,    8,    8,    8,    8,    8,    8,    0,    0,    0,    0,    0,
           8,    8,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   

In [96]:
tokenizer.decode(pred, skip_special_tokens=True)


'Table Table Table the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the'

In [91]:
a = torch.randn(4, 4)
a
torch.argmax(a)

tensor(9)

In [92]:
a

tensor([[ 0.5970,  0.8134,  0.3729, -0.1163],
        [ 0.3832,  0.3426,  0.4931, -0.8252],
        [ 0.9373,  1.0971,  0.3978,  0.3802],
        [-0.3165,  0.0450, -0.4167, -1.5057]])