<a href="https://colab.research.google.com/github/MinhongW/text_generation/blob/main/build_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import json

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [4]:
!git clone https://github.com/MinhongW/text_generation.git

Cloning into 'text_generation'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 34 (delta 9), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (34/34), 1.58 MiB | 3.55 MiB/s, done.


In [5]:
t1 = open('text_generation/data/table_train.json')
t2 = open('text_generation/data/table_desc_train.json')
t3 = open('text_generation/data/paper_train.json')

v1 = open('text_generation/data/table_val.json')
v2 = open('text_generation/data/table_desc_val.json')
v3 = open('text_generation/data/paper_val.json')

te1 = open('text_generation/data/table_test.json')
te2 = open('text_generation/data/table_desc_test.json')
te3 = open('text_generation/data/paper_test.json')

In [6]:
tables_train = json.load(t1)
descs_train = json.load(t2)
papers_train = json.load(t3)

tables_val = json.load(v1)
descs_val = json.load(v2)
papers_val = json.load(v3)

tables_test = json.load(te1)
descs_test = json.load(te2)
papers_test = json.load(te3)

In [7]:
def naive_representation(tables, descs):
    """
    Input_text is generated by naive representation of the tables.
    Each table is simply flattened into a sequence ignoring its table structure
    by concatenating captions, headers, metrics and targeted cell values.
    Target_text is the description of the corresponding table.
    Returns a df contains input_text and target_text
    
    """
    
    data = {'input_text':[],
           'target_text':[]}
    
    for i in range(len(tables)):
        table = tables[i]
        caption = table['table_id'] + ' ' + table['caption']
        row_names = ' '.join(' '.join(x) for x in table['row_headers'])
        col_names = ' '.join(' '.join(x) for x in table['column_headers'])
        metrics = ' '.join(table['metrics_type'])
        values = ' '.join(' '.join(x) for x in table['contents'])        
        tmp = [caption, row_names, col_names, metrics, values]
        text = ' '.join(tmp)
        
        desc = descs[i]['description']        
        
        data['input_text'].append(text)
        data['target_text'].append(desc)
    
    df = pd.DataFrame(data)      
    
    return df

In [8]:
df_train = naive_representation(tables_train, descs_train)
df_val = naive_representation(tables_val, descs_val)
df_test = naive_representation(tables_test, descs_test)

In [9]:
df_train.head()

Unnamed: 0,input_text,target_text
0,table_2 Comparison of different position featu...,Table 2 summarizes the performances of propose...
1,table_3 Pearson correlation values between hum...,Table 3 presents the correlation results for t...
2,table_4 Comparison between rationale models (m...,Results. Table 4 presents the results of our r...
3,table_2 Spearman’s rank correlation results on...,Table 2 shows the results of our contextdepend...
4,table_4 Examples of attention weights in diffe...,"From Table 4, we can find that in the first ho..."


In [10]:
!pip install SentencePiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting SentencePiece
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SentencePiece
Successfully installed SentencePiece-0.1.98


# Build RNN model by pulling out the last layer of T5 model as embeddings

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer, T5Model


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [17]:

# Define hyperparameters
batch_size = 32
embedding_dim = 128
hidden_dim = 256
num_layers = 2
lr = 1e-3
num_epochs = 2

# Define custom dataset and dataloader
class TextDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        input_text = self.df.iloc[index]["input_text"]
        target_text = self.df.iloc[index]["target_text"]
        input_tokens = self.tokenizer.encode(input_text, add_special_tokens=False, padding='max_length', truncation=True)
        target_tokens = self.tokenizer.encode(target_text, add_special_tokens=False, padding='max_length', truncation=True)
        return torch.tensor(input_tokens), torch.tensor(target_tokens)

def collate_fn(batch):
    input_batch = [item[0] for item in batch]
    target_batch = [item[1] for item in batch]
    input_padded = pad_sequence(input_batch, batch_first=True, padding_value=0)
    target_padded = pad_sequence(target_batch, batch_first=True, padding_value=0)
    return input_padded, target_padded

# Prepare data
tokenizer = AutoTokenizer.from_pretrained('t5-small')
model_t5 = T5Model.from_pretrained('t5-small')
train_dataset = TextDataset(df_train, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)


# Freeze T5 parameters
for param in model_t5.parameters():
    param.requires_grad = False

t5_embedding = model_t5.get_input_embeddings()


# Define model architecture
class LSTMGenerator(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_layers):
        super(LSTMGenerator, self).__init__()
        #self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding = t5_embedding
        #self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.lstm = nn.LSTM(t5_embedding.embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x):
        x = self.embedding(x)
        output, _ = self.lstm(x)
        output = self.fc(output)
        return output

#vocab_size = tokenizer.vocab_size
vocab_size = tokenizer.vocab_size
model = LSTMGenerator(embedding_dim, hidden_dim, vocab_size, num_layers)
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# Train model
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, batch in enumerate(train_loader):
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        #outputs = outputs[:, :-1, :].contiguous().view(-1, vocab_size)
        #targets = targets[:, 1:].contiguous().view(-1)
        #loss = criterion(outputs, targets)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1).to(device))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}/{len(train_loader)}], Loss: {running_loss/100:.4f}")
        running_loss = 0.0


Epoch [1/2], Batch [1/34], Loss: 0.1036
Epoch [1/2], Batch [2/34], Loss: 0.1003
Epoch [1/2], Batch [3/34], Loss: 0.0967
Epoch [1/2], Batch [4/34], Loss: 0.0918
Epoch [1/2], Batch [5/34], Loss: 0.0848
Epoch [1/2], Batch [6/34], Loss: 0.0806
Epoch [1/2], Batch [7/34], Loss: 0.0711
Epoch [1/2], Batch [8/34], Loss: 0.0628
Epoch [1/2], Batch [9/34], Loss: 0.0566
Epoch [1/2], Batch [10/34], Loss: 0.0519
Epoch [1/2], Batch [11/34], Loss: 0.0500
Epoch [1/2], Batch [12/34], Loss: 0.0412
Epoch [1/2], Batch [13/34], Loss: 0.0354
Epoch [1/2], Batch [14/34], Loss: 0.0331
Epoch [1/2], Batch [15/34], Loss: 0.0366
Epoch [1/2], Batch [16/34], Loss: 0.0312
Epoch [1/2], Batch [17/34], Loss: 0.0347
Epoch [1/2], Batch [18/34], Loss: 0.0297
Epoch [1/2], Batch [19/34], Loss: 0.0328
Epoch [1/2], Batch [20/34], Loss: 0.0371
Epoch [1/2], Batch [21/34], Loss: 0.0385
Epoch [1/2], Batch [22/34], Loss: 0.0287
Epoch [1/2], Batch [23/34], Loss: 0.0307
Epoch [1/2], Batch [24/34], Loss: 0.0296
Epoch [1/2], Batch [25/34