## Implement Seq2Seq Model from scratch
- maps an input to an output sequence of different length

In [55]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader , Dataset
import numpy as np
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Custom Dataset

In [56]:
ds = load_dataset("FBK-MT/MCIF", "long_fixedprompt")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating test split: 100%|██████████| 362/362 [00:00<00:00, 39574.06 examples/s]


In [57]:
ds

DatasetDict({
    test: Dataset({
        features: ['id', 'prompt_en', 'prompt_de', 'prompt_it', 'prompt_zh', 'metadata', 'audio', 'video', 'text'],
        num_rows: 362
    })
})

In [61]:
for row in ds['test']:
    print(row)

{'id': '0', 'prompt_en': 'Answer the following question concisely given the English content: What are the main data sources for language models?', 'prompt_de': 'Beantworte die folgende Frage kurz und bündig basierend auf dem englischen Inhalt: Was sind die wichtigsten Datenquellen für Sprachmodelle?', 'prompt_it': 'Rispondi in modo conciso alla seguente domanda dato il contenuto inglese: Quali sono le principali fonti di dati per i modelli linguistici?', 'prompt_zh': '根据所给的英文内容，简要回答以下问题： 语言模型的主要数据来源是什么？', 'metadata': {'qa_origin': 'Transcript', 'qa_type': 'AV'}, 'audio': 'MCIF_DATA/LONG_AUDIOS/ICWfTnUMio.wav', 'video': 'MCIF_DATA/LONG_VIDEOS/ICWfTnUMio.mp4', 'text': 'Hi, I\'m Shangbin, PhD student in the University of Washington. Today I\'m presenting our work "From Pretraining Data to Language Models to Downstream Tasks: Tracking the Trails of Political Biases Leading to Unfair NLP Models". So language models are trained on large scale web crawl data. Political news media are well cov

In [69]:
english = []
french = []


for row in ds['test']:
    if row['prompt_en'] and row['prompt_zh']:
        english.append(row['prompt_en'])
        french.append(row['prompt_zh'])

print(f"English sentences: {len(english)}")
print(f"French sentences: {len(french)}")

for i, (eng, fr) in enumerate(zip(english, french), 1):
    print(f"{i:2d}. English: {eng:<30} French: {fr}")

English sentences: 320
French sentences: 320
 1. English: Answer the following question concisely given the English content: What are the main data sources for language models? French: 根据所给的英文内容，简要回答以下问题： 语言模型的主要数据来源是什么？
 2. English: Answer the following question concisely given the English content: What are the affiliations of the authors of the paper? French: 根据所给的英文内容，简要回答以下问题： 这篇论文的作者所属机构是什么？
 3. English: Summarize the English content in an abstract of approximately 200 words. French: 用400个字左右概括所给的英语内容。
 4. English: Answer the following question concisely given the English content: What is the name of the speaker? French: 根据所给的英文内容，简要回答以下问题： 演讲者的名字是什么？
 5. English: Answer the following question concisely given the English content: Which model did they use to obtain the 82%-87% accuracy? French: 根据所给的英文内容，简要回答以下问题： 他们使用哪种模型获得 82%-87% 的准确率？
 6. English: Summarize the English content in an abstract of approximately 200 words. French: 用400个字左右概括所给的英语内容。
 7. English: Answer the followin

## Vocabulary and tokenization

In [70]:
def build_vocab(sentences):
    vocab = {'<pad>':0,'<sos>':1,'<eos>':2,"unk":3}
    for sentence in sentences:
        for word in sentence.split():
            if word not in vocab:
                vocab[word] = len(vocab)
    return vocab

english_vocab = build_vocab(english)
french_vocab = build_vocab(french)

In [71]:
def tokenize(sentences,vocab,max_len):
    tokenized = []

    for sentence in sentences:
        tokens = [vocab['<sos>']] + [vocab.get(word,vocab['unk']) for word in sentence.split()] + [vocab['<eos>']]
        tokens += [vocab['<pad>']] * (max_len - len(tokens))
        tokenized.append(tokens)
    return np.array(tokenized)

max_len_english = max(len(sentence.split()) for sentence in english) + 2
max_len_french = max(len(sentence.split()) for sentence in french) + 2

english_data = tokenize(english,english_vocab,max_len_english)
french_data = tokenize(french,french_vocab,max_len_french)

## Model

In [72]:
class TranslationDataset(Dataset):
    def __init__(self,src_data,target_data):
        self.src_data = src_data
        self.target_data = target_data

    def __len__(self):
        return len(self.src_data)
    
    def __getitem__(self,idx):
        return torch.tensor(self.src_data[idx]),torch.tensor(self.target_data[idx])
    

dataset = TranslationDataset(english_data,french_data)
dataloader = DataLoader(dataset,batch_size=4,shuffle=True)

In [None]:
class Encoder(nn.Module):
    def __init__(self,input_dim,embed_dim,hidden_dim,num_layers):
        super(Encoder,self).__init__()
        self.embedding = nn.Embedding(input_dim,embed_dim)
        self.lstm = nn.LSTM(embed_dim,hidden_dim,num_layers,batch_first=True)

    def forward(self,x):
        embedded = self.embedding(x)
        outputs , (hidden,cell) = self.lstm(embedded)
        return hidden , cell
    
class Decoder(nn.Module):
    def __init__(self,output_dim,embed_dim,hidden_dim,num_layers):
        super(Decoder,self).__init__()
        self.embedding = nn.Embedding(output_dim,embed_dim)
        self.lstm = nn.LSTM(embed_dim,hidden_dim,num_layers,batch_first=True)
        self.fc = nn.Linear(hidden_dim,output_dim)

    def forward(self,x,hidden,cell):
        x = x.unsqueeze(1)
        embedded = self.embedding(x)
        outputs , (hidden,cell) = self.lstm(embedded,(hidden,cell))
        predictions = self.fc(outputs.squeeze(1))
        return predictions , hidden , cell
    

class Seq2Seq(nn.Module):
    def __init__(self,encoder,decoder,device):
        super(Seq2Seq,self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self,src,target,teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        target_length  = target.size(1)
        target_vocab_size = self.decoder.fc.out_features


        outputs = torch.zeros(batch_size,target_length,target_vocab_size).to(self.device)

        hidden, cell = self.encoder(src)

        input = target[:,0]

        for t in range(1,target_length):
            output , hidden , cell = self.decoder(input,hidden,cell)
            outputs[:,t,:] = output
            top = output.argmax(1)
            input = target[:,t] if torch.rand(1).item() < teacher_forcing_ratio else top

        return outputs
    

In [74]:
device = torch.device('cuda') if torch.cuda.is_available else 'cpu'
print(device)


input_dim = len(english_vocab)
output_dim = len(french_vocab)
embed_dim = 64
hidden_dim = 128
num_layers = 2

encoder = Encoder(input_dim,embed_dim,hidden_dim,num_layers)
decoder = Decoder(output_dim,embed_dim,hidden_dim,num_layers)
model = Seq2Seq(encoder,decoder,device).to(device)

optimizer = optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=french_vocab['<pad>'])

cuda


In [84]:
def train(model,dataloader,optimizer,criterion,device,num_epoch=20):
    model.train()
    for epoch in range(num_epoch):
        epoch_loss = 0

        for source , target in dataloader:
            source , target = source.to(device) , target.to(device)

            optimizer.zero_grad()
            output = model(source,target)

            output = output[:,1:].reshape(-1,output.shape[2])
            target = target[:,1:].reshape(-1)

            loss = criterion(output,target)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f'Epoch {epoch+1}/{num_epoch}, Loss: {epoch_loss/len(dataloader):.4f}')


train(model,dataloader,optimizer,criterion,device)

Epoch 1/20, Loss: 1.1313
Epoch 2/20, Loss: 1.1027
Epoch 3/20, Loss: 1.0816
Epoch 4/20, Loss: 1.0966
Epoch 5/20, Loss: 1.0467
Epoch 6/20, Loss: 1.0120
Epoch 7/20, Loss: 1.0161
Epoch 8/20, Loss: 0.9946
Epoch 9/20, Loss: 0.9706
Epoch 10/20, Loss: 0.9272
Epoch 11/20, Loss: 0.9267
Epoch 12/20, Loss: 0.9034
Epoch 13/20, Loss: 0.8697
Epoch 14/20, Loss: 0.8413
Epoch 15/20, Loss: 0.8340
Epoch 16/20, Loss: 0.8126
Epoch 17/20, Loss: 0.7943
Epoch 18/20, Loss: 0.7667
Epoch 19/20, Loss: 0.7473
Epoch 20/20, Loss: 0.7352


In [85]:
base_loss = -np.log(1/len(french_vocab))
base_loss

np.float64(5.5053315359323625)

In [86]:
def translate_sentence(model,sentence,english_vocab,french_vocab,max_len_fr,device):
    model.eval()

    tokens = [english_vocab['<sos>']]+ [english_vocab.get(word,english_vocab['unk']) for word in sentence.split()] + [english_vocab['<eos>']]
    source = torch.tensor(tokens).unsqueeze(0).to(device)


    with torch.no_grad():
        hidden , cell = model.encoder(source)

    target_vocab = {v:k for k, v in french_vocab.items()}
    target_indices = [french_vocab['<sos>']]

    for _ in range(max_len_fr):
        target_tensor = torch.tensor([target_indices[-1]]).to(device)
        output , hidden , cell = model.decoder(target_tensor,hidden,cell)
        pred = output.argmax(1).item()

        target_indices.append(pred)
        if pred == french_vocab['<eos>']:
            break
    
    translate_sentence = [target_vocab[i] for i in target_indices[1:-1]]

    return " ".join(translate_sentence)

In [87]:
for i, (eng, fr) in enumerate(zip(english, french), 1):
    translation = translate_sentence(model,eng,english_vocab,french_vocab,500,device)
    print(i,"#"*100)
    print('Original:',eng)
    print('Ground Truth:',fr)
    print('Prediction:',translation)

    print("\n")

1 ####################################################################################################
Original: Answer the following question concisely given the English content: What are the main data sources for language models?
Ground Truth: 根据所给的英文内容，简要回答以下问题： 语言模型的主要数据来源是什么？
Prediction: 根据所给的英文内容，简要回答以下问题： 在之前的研究中，当人类受试者被给予相同的人格化提示，研究结果是什么？


2 ####################################################################################################
Original: Answer the following question concisely given the English content: What are the affiliations of the authors of the paper?
Ground Truth: 根据所给的英文内容，简要回答以下问题： 这篇论文的作者所属机构是什么？
Prediction: 根据所给的英文内容，简要回答以下问题： 这篇论文的作者所属机构是什么？


3 ####################################################################################################
Original: Summarize the English content in an abstract of approximately 200 words.
Ground Truth: 用400个字左右概括所给的英语内容。
Prediction: <sos> MPP 和网站的简化过程是否有所不同？


4 #####################################################