In [None]:
!pip install torchtext==0.6.0

In [None]:
!pip install evaluate sacrebleu

In [None]:
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys
import evaluate


def translate_sentence(model, sentence, fix_vocab, bug_vocab, device, max_length=100):
    """Greedily decode ``sentence`` with ``model`` and return the target tokens.

    Args:
        model: Callable taking ``(source_tensor, target_tensor)`` and returning
            scores indexed as (target position, batch, vocabulary entry).
        sentence: Source text; it is split on whitespace.
        fix_vocab: Source vocabulary exposing ``stoi``.
        bug_vocab: Target vocabulary exposing ``itos`` and item lookup
            (``bug_vocab[token]``).
        device: Device the index tensors are moved to.
        max_length: Maximum number of decoding steps.

    Returns:
        The decoded target tokens as strings, without the initial start
        token. The stop token is the last element when the model emitted it
        within ``max_length`` steps; otherwise the list simply ends after
        ``max_length`` tokens.
    """
    src_stoi = fix_vocab.stoi

    def _src_index(token):
        # The callers in this file build their vocabularies with lower=True,
        # so a token that is unknown as written (the upper-case sequence
        # markers below, or mixed-case identifiers in raw text) is retried in
        # lower case before falling back to the vocabulary's unknown handling.
        return src_stoi[token if token in src_stoi else token.lower()]

    tokens = ["<SOS>", *sentence.split(), "<EOS>"]

    # Go through each fix token and convert to an index
    text_to_indices = [_src_index(token) for token in tokens]

    # Column vector: one sentence, i.e. a batch of size one.
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_index = bug_vocab["<eos>"]
    mutant_tokens_so_far = [bug_vocab["<sos>"]]
    for _ in range(max_length):
        trg_tensor = torch.LongTensor(mutant_tokens_so_far).unsqueeze(1).to(device)

        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)

        # Only the prediction for the last position is new; keep its argmax.
        best_guess = output.argmax(2)[-1, :].item()
        mutant_tokens_so_far.append(best_guess)

        if best_guess == eos_index:
            break

    # Skip the start token that seeded the decoder.
    return [bug_vocab.itos[idx] for idx in mutant_tokens_so_far[1:]]


def bleu(data, model, fix_vocab, bug_vocab, device):
    """Score ``model`` on ``data`` with chrF and log every example.

    Despite its name, this function computes the chrF metric (loaded through
    ``evaluate``), not BLEU. For each example the source, target, prediction
    and score are appended to ``output.txt`` and the score is printed.

    Args:
        data: Iterable of examples whose attributes ``f`` and ``b`` hold the
            source and target token lists.
        model: Model handed to ``translate_sentence``; it is switched to
            evaluation mode for the duration of the call and its previous
            mode is restored afterwards.
        fix_vocab: Source vocabulary.
        bug_vocab: Target vocabulary.
        device: Device used for decoding.

    Returns:
        The SUM of the per-example chrF scores (0 for empty ``data``). Divide
        by the number of examples to obtain the mean.
    """
    # Loading the metric is costly and does not depend on the example.
    chrf = evaluate.load("chrf")
    # String form of whatever index the decoder stops on.
    stop_token = bug_vocab.itos[bug_vocab["<eos>"]]
    total_score = 0

    # Dropout must be inactive while decoding, otherwise scores are noisy.
    was_training = model.training
    model.eval()
    try:
        with open("output.txt", "a", encoding="utf-8") as output_file:
            for index, example in enumerate(data):
                src = vars(example)["f"]
                trg = vars(example)["b"]

                source_text = ' '.join(src)
                target_text = ' '.join(trg)

                decoded = translate_sentence(model, source_text, fix_vocab, bug_vocab, device)
                # Drop the stop token only when decoding actually ended on
                # it; a sequence cut off at max_length keeps its last token.
                if decoded and decoded[-1] == stop_token:
                    decoded = decoded[:-1]
                prediction_text = ' '.join(decoded)

                results = chrf.compute(
                    predictions=[prediction_text],
                    references=[[target_text]],
                )
                score = results["score"]

                # Print the output
                print("Round:", index)
                print("Score:", score)
                print()

                output_file.write("round : " + str(index) + "\n")
                output_file.write("source\n")
                output_file.write("===================\n")
                output_file.write(source_text + "\n")
                output_file.write("target\n")
                output_file.write("===================\n")
                output_file.write(target_text + "\n")
                output_file.write("prediction\n")
                output_file.write("===================\n")
                output_file.write(prediction_text + "\n")
                output_file.write("score\n")
                output_file.write("===================\n")
                output_file.write(str(score) + "\n")

                total_score = total_score + score
    finally:
        model.train(was_training)

    return total_score

def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Serialise ``state`` to ``filename`` with ``torch.save``.

    Args:
        state: Object to persist (the training script passes a dict holding
            the model and optimizer state dicts).
        filename: Destination path.
    """
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)


def load_checkpoint(checkpoint, model, optimizer):
    """Restore ``model`` and ``optimizer`` from an already-loaded checkpoint.

    Args:
        checkpoint: Mapping with the keys ``"state_dict"`` (model weights)
            and ``"optimizer"`` (optimizer state).
        model: Object whose ``load_state_dict`` receives the weights.
        optimizer: Object whose ``load_state_dict`` receives the optimizer
            state.
    """
    print("=> Loading checkpoint")
    # Model first, then optimizer.
    for receiver, key in ((model, "state_dict"), (optimizer, "optimizer")):
        receiver.load_state_dict(checkpoint[key])

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
#from tranutils import translate_sentence, bleu, save_checkpoint, load_checkpoint
from torch.utils.tensorboard import SummaryWriter
from torchtext.data import Field, BucketIterator, TabularDataset
import matplotlib.pyplot as plt
import numpy as np

#tokenize
def tokenize(text):
    """Split ``text`` on whitespace (the TSV columns are presumably pre-tokenised)."""
    return text.split()


# Both fields lower-case their tokens, so the vocabularies built from them
# contain lower-case entries only.
fix = Field(sequential=True, use_vocab=True, tokenize=tokenize, lower=True)

bug = Field(sequential=True, use_vocab=True, tokenize=tokenize, lower=True)

# TSV column name -> (attribute name on each example, field that processes it)
fields = {'fix': ('f', fix), 'bug': ('b', bug)}

#load file
train_data, valid_data, test_data = TabularDataset.splits(
    path='/kaggle/input/transformernpdata/normalData',
    train='train.tsv',
    validation='validate.tsv',
    test='test.tsv',
    format='tsv',
    fields=fields,
)

#build vocab
fix.build_vocab(train_data, max_size=10000, min_freq=1)
bug.build_vocab(train_data, max_size=10000, min_freq=1)

fix_vocab = fix.vocab
bug_vocab = bug.vocab

# The batch iterators are created once, in the training configuration
# further down; building a throw-away pair here served no purpose.

class Transformer(nn.Module):
    """Encoder-decoder around ``nn.Transformer`` with learned position embeddings.

    ``forward`` takes index tensors whose shape unpacks as
    (sequence length, batch size) and returns one score per target
    vocabulary entry for every target position.
    """

    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        """Build the embeddings, the transformer core and the output layer.

        Args:
            embedding_size: Width of token/position embeddings and model width.
            src_vocab_size: Number of source vocabulary entries.
            trg_vocab_size: Number of target vocabulary entries.
            src_pad_idx: Source index treated as padding by ``make_src_mask``.
            num_heads: Attention heads.
            num_encoder_layers: Encoder depth.
            num_decoder_layers: Decoder depth.
            forward_expansion: Passed to ``nn.Transformer`` as
                ``dim_feedforward``.
            dropout: Dropout probability (transformer core and embeddings).
            max_len: Number of rows in the position embedding tables.
            device: Device that masks and position ids are placed on.
        """
        super().__init__()

        # Sub-modules are created in a fixed order so that seeded
        # initialisation and state-dict keys stay stable.
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        # NOTE(review): forward_expansion is used as the feed-forward width
        # itself, not as a multiplier of embedding_size. Changing that would
        # alter parameter shapes and invalidate existing checkpoints.
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=forward_expansion,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def _position_ids(self, seq_length, batch_size):
        """Return ids 0..seq_length-1 repeated across ``batch_size`` columns."""
        ids = torch.arange(seq_length, device=self.device)
        return ids.unsqueeze(1).expand(seq_length, batch_size)

    def make_src_mask(self, src):
        """Return a boolean (batch, src_len) mask that is True at padding."""
        is_padding = src.transpose(0, 1) == self.src_pad_idx
        return is_padding.to(self.device)

    def forward(self, src, trg):
        """Run the model on source indices ``src`` and target indices ``trg``."""
        src_len, _ = src.shape
        # The batch size used for both position grids is the target's.
        trg_len, batch_size = trg.shape

        src_tokens = self.src_word_embedding(src)
        src_places = self.src_position_embedding(self._position_ids(src_len, batch_size))
        embed_src = self.dropout(src_tokens + src_places)

        trg_tokens = self.trg_word_embedding(trg)
        trg_places = self.trg_position_embedding(self._position_ids(trg_len, batch_size))
        embed_trg = self.dropout(trg_tokens + trg_places)

        # Padding is hidden from the encoder; the square mask keeps each
        # target position from attending to later ones.
        padding_mask = self.make_src_mask(src)
        causal_mask = self.transformer.generate_square_subsequent_mask(trg_len)
        causal_mask = causal_mask.to(self.device)

        hidden = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=padding_mask,
            tgt_mask=causal_mask,
        )
        return self.fc_out(hidden)


# Training

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

load_model = True
save_model = False

# Training hyperparameters
num_epochs = 0
learning_rate = 3e-4
batch_size = 32

# Loss history as a 2 x n array: row 0 holds epoch numbers, row 1 the mean
# training loss of that epoch. To continue an earlier run, load the saved
# history instead:
#   data = np.load("losses.npy"); last_epoch_num = data[0][-1]
data = np.array([[], []])

mean_losses = []
epoch_counts = []
last_epoch_num = 0

# Model hyperparameters
src_vocab_size = len(fix.vocab)
trg_vocab_size = len(bug.vocab)
embedding_size = 512
num_heads = 8

#TODO: CHANGE NUMBER OF LAYERS
num_encoder_layers = 5
num_decoder_layers = 5
dropout = 0.10
max_len = 100
forward_expansion = 4
# The source side of the model is the "fix" field, so its padding index
# must come from the fix vocabulary.
src_pad_idx = fix.vocab.stoi["<pad>"]

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.f),
    device=device,
)

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

# Padding positions in the target do not contribute to the loss.
pad_idx = bug.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

#TODO: REPLACE WITH MODEL CHECKPOINT PATH
if load_model:
    # map_location lets a checkpoint saved on a GPU be restored on a
    # CPU-only machine (and vice versa).
    load_checkpoint(
        torch.load(
            "/PATH/TO/TRAINING_MODEL_CHECKPOINT_.PTH.TAR FILE",
            map_location=device,
        ),
        model,
        optimizer,
    )

#sample input for simple evaluation before every training epoch
sentence = "public static void main ( java.lang.String [ ] args ) throws java.io.IOException { java.lang.System.out.println ( STRING_1 ) ; TYPE_1 node = new TYPE_1 ( INT_1 ) ; node . METHOD_1 ( ) ; }"


for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    # Snapshot the weights as they stand before this epoch's updates.
    if save_model:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        save_checkpoint(checkpoint)

    model.eval()
    translated_sentence = translate_sentence(
        model, sentence, fix_vocab, bug_vocab, device, max_length=100
    )

    #simple evaluation before every training epoch
    print(f"Translated example sentence: \n {translated_sentence}")
    model.train()
    losses = []

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and get to cuda
        inp_data = batch.f.to(device)
        target = batch.b.to(device)

        # Forward prop: the decoder sees every target row except the last
        # and is scored against the target shifted by one row.
        output = model(inp_data, target[:-1, :])

        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()

        loss = criterion(output, target)
        loss_value = loss.item()
        losses.append(loss_value)

        # Back prop
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Log the plain float rather than the tensor attached to the graph.
        writer.add_scalar("Training loss", loss_value, global_step=step)
        step += 1

    mean_loss = sum(losses) / len(losses)
    print(f"train loss: {mean_loss}")
    mean_losses.append(mean_loss)
    epoch_counts.append(last_epoch_num + epoch + 1)

    scheduler.step(mean_loss)

# The in-loop checkpoint is written before each epoch trains, so without
# this final save the weights produced by the last epoch would be lost.
if save_model and num_epochs > 0:
    save_checkpoint(
        {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
    )

# Append this run's (epoch, mean loss) columns to the history and persist it.
data = np.append(data, [epoch_counts, mean_losses], axis=1)
np.save('losses.npy', data)

print("Average Training Loss: ", mean_losses)



# Plot the data
plt.plot(data[0], data[1])
plt.xlabel('Epochs')
plt.ylabel('Training Loss')
plt.title('Training Loss vs. Epochs')
# Save before showing: once show() returns the figure may already have been
# released, in which case savefig would write an empty image.
plt.savefig("training_losses.png")
plt.show()

# running on entire test data takes a while
# Make sure dropout is off while scoring.
model.eval()
score = bleu(test_data, model, fix_vocab, bug_vocab, device)
# bleu() returns the sum of the per-example scores; divide to get the mean.
num_examples = len(test_data)
average_score = score / num_examples if num_examples else 0.0
print(f"Average CHRF score: {average_score}")