# Load dataset

In [1]:
import requests

def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"

    session = requests.Session()

    response = session.get(URL, params = { 'id' : id }, stream = True)
    token = get_confirm_token(response)

    if token:
        params = { 'id' : id, 'confirm' : token }
        response = session.get(URL, params = params, stream = True)

    save_response_content(response, destination)    

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value

    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768

    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)

In [2]:
file_id = '1affGFPxQ1RmV73Vk7tDNaDSbUWP4YDHP'
destination = 'texas.json'
download_file_from_google_drive(file_id, destination)

In [8]:
import json
with open('texas.json', 'r') as fp:
    texas = json.load(fp)

In [10]:
texas[0].keys()

dict_keys(['C1', 'C2', 'M1', 'M2'])

In [11]:
# Filter lenght
import numpy as np

comment_len_bound = 3
code_len_bound = 5

mask = np.ones(len(texas), dtype=bool)

for i, data in enumerate(texas):
  C1 = data['C1']
  C2 = data['C2']
  M1 = data['M1']
  M2 = data['M2']
  if len(C1) < comment_len_bound or len(C2) < comment_len_bound or len(M1) < code_len_bound or len(M2) < code_len_bound:
    mask[i] = False
texas = np.array(texas)[mask]

In [12]:
len(texas)

2926

# Label Dataset

In [13]:
def create_dataset(data):
  dataset = []
  for sample in data:
    C1 = sample['C1']
    C2 = sample['C2']
    M1 = sample['M1']
    M2 = sample['M2']
    # New code and old comment --> inconsistency 
    dataset.append({'C': C1, 'M': M2, 'Y': 'INCONS'})
    # New code and new comment --> consistency
    dataset.append({'C': C2, 'M': M2, 'Y': 'CONS'})
  return dataset

In [14]:
dataset = create_dataset(texas)
dataset[0]

{'C': ['crash',
  'report',
  'data',
  'read',
  'from',
  'the',
  'supplied',
  'input',
  'stream'],
 'M': ['non',
  'public',
  'crash',
  'report',
  'data',
  'load',
  'non',
  'file',
  'file',
  'throws',
  'ioexception',
  'jsonexception',
  'final',
  'input',
  'stream',
  'in',
  'new',
  'buffered',
  'input',
  'stream',
  'new',
  'file',
  'input',
  'stream',
  'file',
  'acraconstants',
  'default',
  'buffer',
  'size',
  'in',
  'bytes',
  'try',
  'return',
  'json',
  'utils',
  'to',
  'crash',
  'report',
  'data',
  'new',
  'jsonobject',
  'ioutils',
  'stream',
  'to',
  'string',
  'in',
  'finally',
  'ioutils',
  'safe',
  'close',
  'in'],
 'Y': 'INCONS'}

In [15]:
import json
from pathlib import Path
Path("data").mkdir(parents=True, exist_ok=True)
with open('data/dataset.json', 'w') as f:
  for data in dataset:
    f.write(json.dumps(data) + '\n')

In [16]:
with open('data/dataset.json', 'r') as f:
  print(f.read()[:500])

{"C": ["crash", "report", "data", "read", "from", "the", "supplied", "input", "stream"], "M": ["non", "public", "crash", "report", "data", "load", "non", "file", "file", "throws", "ioexception", "jsonexception", "final", "input", "stream", "in", "new", "buffered", "input", "stream", "new", "file", "input", "stream", "file", "acraconstants", "default", "buffer", "size", "in", "bytes", "try", "return", "json", "utils", "to", "crash", "report", "data", "new", "jsonobject", "ioutils", "stream", "to"


# NBOW2

In [17]:
import torch
from torchtext import data
from torchtext import datasets

CODE = data.Field(tokenize=lambda x: x)
COMMENT = data.Field(tokenize=lambda x: x)
LABEL = data.LabelField(dtype = torch.float)
fields = {'C': ('c', COMMENT), 'M': ('m', CODE), 'Y': ('y', LABEL)}

In [18]:
my_data = data.TabularDataset(
                            path = 'data/dataset.json',
                            format = 'json',
                            fields = fields
)

In [19]:
print(vars(my_data[0]))

{'c': ['crash', 'report', 'data', 'read', 'from', 'the', 'supplied', 'input', 'stream'], 'm': ['non', 'public', 'crash', 'report', 'data', 'load', 'non', 'file', 'file', 'throws', 'ioexception', 'jsonexception', 'final', 'input', 'stream', 'in', 'new', 'buffered', 'input', 'stream', 'new', 'file', 'input', 'stream', 'file', 'acraconstants', 'default', 'buffer', 'size', 'in', 'bytes', 'try', 'return', 'json', 'utils', 'to', 'crash', 'report', 'data', 'new', 'jsonobject', 'ioutils', 'stream', 'to', 'string', 'in', 'finally', 'ioutils', 'safe', 'close', 'in'], 'y': 'INCONS'}


In [20]:
import random
SEED = 1234
train_data, test_data = my_data.split(random_state = random.seed(SEED))
train_data, val_data = train_data.split(split_ratio=0.8, random_state = random.seed(SEED))

In [21]:
MAX_VOCAB_SIZE = 25_000

COMMENT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE)

CODE.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE)

LABEL.build_vocab(train_data)

In [22]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(val_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 3277
Number of validation examples: 819
Number of testing examples: 1756


In [23]:
print(f"Unique tokens in COMMENT vocabulary: {len(COMMENT.vocab)}")
print(f"Unique tokens in CODE vocabulary: {len(CODE.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in COMMENT vocabulary: 3009
Unique tokens in CODE vocabulary: 4369
Unique tokens in LABEL vocabulary: 2


In [24]:
print(CODE.vocab.freqs.most_common(20))

[('get', 6038), ('return', 5407), ('if', 3955), ('string', 3701), ('str', 3201), ('num', 3067), ('new', 3052), ('public', 2743), ('int', 2016), ('final', 1855), ('name', 1633), ('list', 1585), ('value', 1579), ('type', 1543), ('is', 1367), ('to', 1263), ('class', 1232), ('length', 1230), ('result', 1201), ('exception', 1184)]


In [25]:
print(COMMENT.vocab.freqs.most_common(20))

[('the', 3535), ('of', 1238), ('a', 1169), ('if', 1127), ('is', 744), ('or', 661), ('null', 576), ('to', 473), ('this', 469), ('for', 396), ('an', 365), ('in', 352), ('true', 350), ('that', 319), ('value', 313), ('given', 262), ('list', 258), ('not', 248), ('string', 243), ('with', 232)]


In [26]:
print(COMMENT.vocab.itos[:10])

['<unk>', '<pad>', 'the', 'of', 'a', 'if', 'is', 'or', 'null', 'to']


In [27]:
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f6adb4b8840>, {'INCONS': 0, 'CONS': 1})


In [28]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_key=lambda x: len(x.c) + len(x.m), # the BucketIterator needs to be told what function it should use to group the data.
    sort_within_batch=False,
    device = device)

In [30]:
import torch.nn as nn
import torch.nn.functional as F

class NBOW2(nn.Module):
    def __init__(self, comment_vocab_size, code_vocab_size, embedding_dim, output_dim, comment_pad_idx, code_pad_idx):
        
        super().__init__()
        
        self.embedding1 = nn.Embedding(comment_vocab_size, embedding_dim, padding_idx=comment_pad_idx)
        self.embedding2 = nn.Embedding(code_vocab_size, embedding_dim, padding_idx=code_pad_idx)
        
        self.fc = nn.Linear(2 * embedding_dim, output_dim)
        
        self.a1 = torch.nn.Parameter(torch.zeros(embedding_dim, 1))
        self.a2 = torch.nn.Parameter(torch.zeros(embedding_dim, 1))
        
        self.sigmoid = nn.Sigmoid()
        
        
    def forward(self, comment, code):
        
        #comment = [comment len, batch size]
        #code = [code len, batch size]
        
        embedded1 = self.embedding1(comment)
        embedded2 = self.embedding2(code)

        #embedded = [sent len, batch size, emb dim]
        
        embedded1 = embedded1.permute(1, 0, 2)
        embedded2 = embedded2.permute(1, 0, 2)
  
        #embedded = [batch size, sent len, emb dim]
        
        # Get the scalar word importance weights for each word w
        # a: [emb dim, 1]
        # aw: [batch size, sent len, 1]
        
        aw1 = torch.matmul(embedded1, self.a1)
        aw1 = self.sigmoid(aw1)

        aw2 = torch.matmul(embedded2, self.a2)
        aw2 = self.sigmoid(aw2)
        
        # w_emb: [batch size, sent len, emb dim]
        weighted_embed1 = embedded1 * aw1
        weighted_embed2 = embedded2 * aw2

        pooled1 = F.avg_pool2d(weighted_embed1, (weighted_embed1.shape[1], 1)).squeeze(1) 
        pooled2 = F.avg_pool2d(weighted_embed2, (weighted_embed2.shape[1], 1)).squeeze(1)
        #pooled = [batch size, embedding_dim]

        code_comment = torch.cat((pooled1, pooled2), dim=1)
                
        return self.fc(code_comment)
      
    def init_weights(self):
        initrange = 0.5

        self.a1.data.uniform_(0.0, 1.0)
        self.a2.data.uniform_(0.0, 1.0)

        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

In [31]:
CODE_INPUT_DIM = len(CODE.vocab)
COMMENT_INPUT_DIM = len(COMMENT.vocab)
EMBEDDING_DIM = 100
OUTPUT_DIM = 1
COMMENT_PAD_IDX = COMMENT.vocab.stoi[COMMENT.pad_token]
CODE_PAD_IDX = CODE.vocab.stoi[CODE.pad_token]
model = NBOW2(COMMENT_INPUT_DIM, CODE_INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, COMMENT_PAD_IDX, CODE_PAD_IDX)
model.init_weights()

In [32]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 738,201 trainable parameters


In [33]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)

a1
a2
embedding1.weight
embedding2.weight
fc.weight
fc.bias


In [34]:
COMMENT_UNK_IDX = COMMENT.vocab.stoi[COMMENT.unk_token]
CODE_UNK_IDX = CODE.vocab.stoi[CODE.unk_token]

model.embedding1.weight.data[COMMENT_UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding1.weight.data[COMMENT_PAD_IDX] = torch.zeros(EMBEDDING_DIM)


model.embedding2.weight.data[CODE_UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding2.weight.data[CODE_PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [35]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [36]:
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [37]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [38]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.c, batch.m).squeeze(1)
        
        loss = criterion(predictions, batch.y)
        
        acc = binary_accuracy(predictions, batch.y)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [39]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.c, batch.m).squeeze(1)
            
            loss = criterion(predictions, batch.y)
            
            acc = binary_accuracy(predictions, batch.y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [41]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [42]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 0.698 | Train Acc: 50.19%
	 Val. Loss: 0.725 |  Val. Acc: 48.00%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 0.691 | Train Acc: 51.75%
	 Val. Loss: 0.727 |  Val. Acc: 46.20%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 0.689 | Train Acc: 53.17%
	 Val. Loss: 0.731 |  Val. Acc: 45.21%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.685 | Train Acc: 55.21%
	 Val. Loss: 0.736 |  Val. Acc: 46.53%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.683 | Train Acc: 56.18%
	 Val. Loss: 0.742 |  Val. Acc: 44.91%


# NBOW2 + Pretrained Embeddings

In [43]:
import torch
from torchtext import data
from torchtext import datasets

CODE = data.Field(tokenize=lambda x: x, preprocessing=lambda x: [w.replace('▁', '') for w in x])
COMMENT = data.Field(tokenize=lambda x: x, preprocessing=lambda x: [w.replace('▁', '') for w in x])
LABEL = data.LabelField(dtype = torch.float)
fields = {'C': ('c', COMMENT), 'M': ('m', CODE), 'Y': ('y', LABEL)}

In [44]:
my_data = data.TabularDataset(
                            path = 'data/dataset.json',
                            format = 'json',
                            fields = fields
)

In [45]:
print(vars(my_data[0]))

{'c': ['crash', 'report', 'data', 'read', 'from', 'the', 'supplied', 'input', 'stream'], 'm': ['non', 'public', 'crash', 'report', 'data', 'load', 'non', 'file', 'file', 'throws', 'ioexception', 'jsonexception', 'final', 'input', 'stream', 'in', 'new', 'buffered', 'input', 'stream', 'new', 'file', 'input', 'stream', 'file', 'acraconstants', 'default', 'buffer', 'size', 'in', 'bytes', 'try', 'return', 'json', 'utils', 'to', 'crash', 'report', 'data', 'new', 'jsonobject', 'ioutils', 'stream', 'to', 'string', 'in', 'finally', 'ioutils', 'safe', 'close', 'in'], 'y': 'INCONS'}


In [46]:
import random
SEED = 1234
train_data, test_data = my_data.split(random_state = random.seed(SEED))
train_data, val_data = train_data.split(split_ratio=0.8, random_state = random.seed(SEED))

In [47]:
MAX_VOCAB_SIZE = 25_000

COMMENT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE,  
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

CODE.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE,  
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:28, 2.22MB/s]                           
100%|█████████▉| 398484/400000 [00:16<00:00, 24368.43it/s]

In [48]:
import torchtext.vocab

glove = torchtext.vocab.GloVe(name = '6B', dim = 100)

print(f'There are {len(glove.itos)} words in the vocabulary')

There are 400000 words in the vocabulary


In [49]:
count_intersect = 0
for w in COMMENT.vocab.itos:
  if w in glove.stoi:
    count_intersect += 1

print(count_intersect / len(COMMENT.vocab.itos))

0.9016284479893653


In [50]:
count_intersect = 0
for w in CODE.vocab.itos:
  if w in glove.stoi:
    count_intersect += 1

print(count_intersect / len(CODE.vocab.itos))

0.7896543831540398


In [51]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(val_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 3277
Number of validation examples: 819
Number of testing examples: 1756


In [52]:
print(f"Unique tokens in COMMENT vocabulary: {len(COMMENT.vocab)}")
print(f"Unique tokens in CODE vocabulary: {len(CODE.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in COMMENT vocabulary: 3009
Unique tokens in CODE vocabulary: 4369
Unique tokens in LABEL vocabulary: 2


In [53]:
print(CODE.vocab.freqs.most_common(20))

[('get', 6038), ('return', 5407), ('if', 3955), ('string', 3701), ('str', 3201), ('num', 3067), ('new', 3052), ('public', 2743), ('int', 2016), ('final', 1855), ('name', 1633), ('list', 1585), ('value', 1579), ('type', 1543), ('is', 1367), ('to', 1263), ('class', 1232), ('length', 1230), ('result', 1201), ('exception', 1184)]


In [54]:
print(COMMENT.vocab.freqs.most_common(20))

[('the', 3535), ('of', 1238), ('a', 1169), ('if', 1127), ('is', 744), ('or', 661), ('null', 576), ('to', 473), ('this', 469), ('for', 396), ('an', 365), ('in', 352), ('true', 350), ('that', 319), ('value', 313), ('given', 262), ('list', 258), ('not', 248), ('string', 243), ('with', 232)]


In [55]:
print(COMMENT.vocab.itos[:10])

['<unk>', '<pad>', 'the', 'of', 'a', 'if', 'is', 'or', 'null', 'to']


In [56]:
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f6adb4b8840>, {'INCONS': 0, 'CONS': 1})


In [57]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_key=lambda x: len(x.c) + len(x.m), # the BucketIterator needs to be told what function it should use to group the data.
    sort_within_batch=False,
    device = device)

In [58]:
import torch.nn as nn
import torch.nn.functional as F

class NBOW2(nn.Module):
    def __init__(self, comment_vocab_size, code_vocab_size, embedding_dim, output_dim, 
                 comment_pad_idx, code_pad_idx, dropout):
        
        super().__init__()
        
        self.embedding1 = nn.Embedding(comment_vocab_size, embedding_dim, padding_idx=comment_pad_idx)
        self.embedding2 = nn.Embedding(code_vocab_size, embedding_dim, padding_idx=code_pad_idx)
        
        self.fc = nn.Linear(2 * embedding_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)

        self.a1 = torch.nn.Parameter(torch.zeros(embedding_dim, 1))
        self.a2 = torch.nn.Parameter(torch.zeros(embedding_dim, 1))
        
        self.sigmoid = nn.Sigmoid()
        
        
    def forward(self, comment, code):
        
        #comment = [comment len, batch size]
        #code = [code len, batch size]
        
        embedded1 = self.embedding1(comment)
        embedded2 = self.embedding2(code)

        #embedded = [sent len, batch size, emb dim]
        
        embedded1 = embedded1.permute(1, 0, 2)
        embedded2 = embedded2.permute(1, 0, 2)
  
        #embedded = [batch size, sent len, emb dim]
        
        # Get the scalar word importance weights for each word w
        # a: [emb dim, 1]
        # aw: [batch size, sent len, 1]
        
        aw1 = torch.matmul(embedded1, self.a1)
        aw1 = self.sigmoid(aw1)

        aw2 = torch.matmul(embedded2, self.a2)
        aw2 = self.sigmoid(aw2)
        
        # w_emb: [batch size, sent len, emb dim]
        weighted_embed1 = embedded1 * aw1
        weighted_embed2 = embedded2 * aw2

        pooled1 = F.avg_pool2d(weighted_embed1, (weighted_embed1.shape[1], 1)).squeeze(1) 
        pooled2 = F.avg_pool2d(weighted_embed2, (weighted_embed2.shape[1], 1)).squeeze(1)
        #pooled = [batch size, embedding_dim]

        code_comment = self.dropout(torch.cat((pooled1, pooled2), dim=1))
                
        return self.fc(code_comment)
      
    def init_weights(self):
        initrange = 0.5

        self.a1.data.uniform_(0.0, 1.0)
        self.a2.data.uniform_(0.0, 1.0)

        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

In [59]:
CODE_INPUT_DIM = len(CODE.vocab)
COMMENT_INPUT_DIM = len(COMMENT.vocab)
EMBEDDING_DIM = 100
OUTPUT_DIM = 1
COMMENT_PAD_IDX = COMMENT.vocab.stoi[COMMENT.pad_token]
CODE_PAD_IDX = CODE.vocab.stoi[CODE.pad_token]
DROPOUT = 0.5

model = NBOW2(COMMENT_INPUT_DIM, CODE_INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, COMMENT_PAD_IDX, CODE_PAD_IDX, DROPOUT)
model.init_weights()

In [60]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 738,201 trainable parameters


In [61]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)

a1
a2
embedding1.weight
embedding2.weight
fc.weight
fc.bias


In [62]:
pretrained_embeddings = COMMENT.vocab.vectors

model.embedding1.weight.data.copy_(pretrained_embeddings)

pretrained_embeddings = CODE.vocab.vectors

model.embedding2.weight.data.copy_(pretrained_embeddings)

tensor([[-0.0199,  0.2426, -0.7804,  ..., -0.6855, -0.7572,  1.1121],
        [-0.8491, -0.9379, -1.0158,  ..., -1.1613,  0.1143,  0.6031],
        [ 0.1443,  0.4395,  0.5832,  ...,  0.5013,  0.4954,  0.4992],
        ...,
        [ 0.4221,  0.6307, -0.1291,  ...,  0.1045,  0.1854, -0.4767],
        [-0.0274, -0.3202,  0.4358,  ...,  0.3812,  0.2655,  0.5815],
        [-0.1788, -0.4827, -0.8316,  ..., -1.4720, -1.3727,  1.4800]])

In [63]:
COMMENT_UNK_IDX = COMMENT.vocab.stoi[COMMENT.unk_token]
CODE_UNK_IDX = CODE.vocab.stoi[CODE.unk_token]

model.embedding1.weight.data[COMMENT_UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding1.weight.data[COMMENT_PAD_IDX] = torch.zeros(EMBEDDING_DIM)


model.embedding2.weight.data[CODE_UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding2.weight.data[CODE_PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [64]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [65]:
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [66]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [67]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.c, batch.m).squeeze(1)
        
        loss = criterion(predictions, batch.y)
        
        acc = binary_accuracy(predictions, batch.y)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [68]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.c, batch.m).squeeze(1)
            
            loss = criterion(predictions, batch.y)
            
            acc = binary_accuracy(predictions, batch.y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [69]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [70]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 0.694 | Train Acc: 49.93%
	 Val. Loss: 0.700 |  Val. Acc: 47.79%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 0.690 | Train Acc: 53.29%
	 Val. Loss: 0.703 |  Val. Acc: 48.03%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 0.691 | Train Acc: 52.84%
	 Val. Loss: 0.706 |  Val. Acc: 48.36%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.688 | Train Acc: 54.40%
	 Val. Loss: 0.709 |  Val. Acc: 47.10%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.688 | Train Acc: 55.27%
	 Val. Loss: 0.712 |  Val. Acc: 46.20%


In [71]:
model.load_state_dict(torch.load('tut3-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.702 | Test Acc: 49.31%


# LSTM

In [72]:
import torch
from torchtext import data
from torchtext import datasets

CODE = data.Field(tokenize=lambda x: x, preprocessing=lambda x: [w.replace('▁', '') for w in x], include_lengths = True)
COMMENT = data.Field(tokenize=lambda x: x, preprocessing=lambda x: [w.replace('▁', '') for w in x], include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)
fields = {'C': ('c', COMMENT), 'M': ('m', CODE), 'Y': ('y', LABEL)}

In [73]:
my_data = data.TabularDataset(
                            path = 'data/dataset.json',
                            format = 'json',
                            fields = fields
)

In [74]:
print(vars(my_data[0]))

{'c': ['crash', 'report', 'data', 'read', 'from', 'the', 'supplied', 'input', 'stream'], 'm': ['non', 'public', 'crash', 'report', 'data', 'load', 'non', 'file', 'file', 'throws', 'ioexception', 'jsonexception', 'final', 'input', 'stream', 'in', 'new', 'buffered', 'input', 'stream', 'new', 'file', 'input', 'stream', 'file', 'acraconstants', 'default', 'buffer', 'size', 'in', 'bytes', 'try', 'return', 'json', 'utils', 'to', 'crash', 'report', 'data', 'new', 'jsonobject', 'ioutils', 'stream', 'to', 'string', 'in', 'finally', 'ioutils', 'safe', 'close', 'in'], 'y': 'INCONS'}


In [75]:
import random
SEED = 1234
train_data, test_data = my_data.split(random_state = random.seed(SEED))
train_data, val_data = train_data.split(split_ratio=0.8, random_state = random.seed(SEED))

In [76]:
MAX_VOCAB_SIZE = 25_000

COMMENT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE,  
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

CODE.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE,  
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

In [77]:
import torchtext.vocab

glove = torchtext.vocab.GloVe(name = '6B', dim = 100)

print(f'There are {len(glove.itos)} words in the vocabulary')

There are 400000 words in the vocabulary


In [78]:
count_intersect = 0
for w in COMMENT.vocab.itos:
  if w in glove.stoi:
    count_intersect += 1

print(count_intersect / len(COMMENT.vocab.itos))

0.9016284479893653


In [79]:
count_intersect = 0
for w in CODE.vocab.itos:
  if w in glove.stoi:
    count_intersect += 1

print(count_intersect / len(CODE.vocab.itos))

0.7896543831540398


In [80]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(val_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 3277
Number of validation examples: 819
Number of testing examples: 1756


In [81]:
print(f"Unique tokens in COMMENT vocabulary: {len(COMMENT.vocab)}")
print(f"Unique tokens in CODE vocabulary: {len(CODE.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in COMMENT vocabulary: 3009
Unique tokens in CODE vocabulary: 4369
Unique tokens in LABEL vocabulary: 2


In [82]:
print(CODE.vocab.freqs.most_common(20))

[('get', 6038), ('return', 5407), ('if', 3955), ('string', 3701), ('str', 3201), ('num', 3067), ('new', 3052), ('public', 2743), ('int', 2016), ('final', 1855), ('name', 1633), ('list', 1585), ('value', 1579), ('type', 1543), ('is', 1367), ('to', 1263), ('class', 1232), ('length', 1230), ('result', 1201), ('exception', 1184)]


In [83]:
print(COMMENT.vocab.freqs.most_common(20))

[('the', 3535), ('of', 1238), ('a', 1169), ('if', 1127), ('is', 744), ('or', 661), ('null', 576), ('to', 473), ('this', 469), ('for', 396), ('an', 365), ('in', 352), ('true', 350), ('that', 319), ('value', 313), ('given', 262), ('list', 258), ('not', 248), ('string', 243), ('with', 232)]


In [84]:
print(COMMENT.vocab.itos[:10])

['<unk>', '<pad>', 'the', 'of', 'a', 'if', 'is', 'or', 'null', 'to']


In [85]:
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f6adb4b8840>, {'INCONS': 0, 'CONS': 1})


In [86]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data), 
    batch_size = BATCH_SIZE,
    sort=False,
    device = device)

In [87]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, comment_vocab_size, code_vocab_size, 
                 embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, comment_pad_idx, code_pad_idx, dropout):
        
        super().__init__()
        
        self.embedding1 = nn.Embedding(comment_vocab_size, embedding_dim, padding_idx = comment_pad_idx)
        self.embedding2 = nn.Embedding(code_vocab_size, embedding_dim, padding_idx = code_pad_idx)
        
        self.rnn1 = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.rnn2 = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        


        self.fc = nn.Linear(hidden_dim * 2 * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, comment, comment_lengths, code, code_lengths):
        
        #text = [sent len, batch size]
        
        embedded1 = self.dropout(self.embedding1(comment))
        embedded2 = self.dropout(self.embedding2(code))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        packed_embedded1 = nn.utils.rnn.pack_padded_sequence(embedded1, comment_lengths.cpu(), enforce_sorted=False)
        packed_output1, (hidden1, cell1) = self.rnn1(packed_embedded1)

        packed_embedded2 = nn.utils.rnn.pack_padded_sequence(embedded2, code_lengths.cpu(), enforce_sorted=False)
        packed_output2, (hidden2, cell2) = self.rnn2(packed_embedded2)

        
        #unpack sequence
        output1, output_lengths1 = nn.utils.rnn.pad_packed_sequence(packed_output1)
        output2, output_lengths2 = nn.utils.rnn.pad_packed_sequence(packed_output1)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden1 = self.dropout(torch.cat((hidden1[-2,:,:], hidden1[-1,:,:]), dim = 1))
        hidden2 = self.dropout(torch.cat((hidden2[-2,:,:], hidden2[-1,:,:]), dim = 1))

        hidden = self.dropout(torch.cat((hidden1, hidden2), dim=1))
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden)

In [88]:
CODE_INPUT_DIM = len(CODE.vocab)
COMMENT_INPUT_DIM = len(COMMENT.vocab)
EMBEDDING_DIM = 100
OUTPUT_DIM = 1
COMMENT_PAD_IDX = COMMENT.vocab.stoi[COMMENT.pad_token]
CODE_PAD_IDX = CODE.vocab.stoi[CODE.pad_token]
DROPOUT = 0.5
BIDIRECTIONAL = True
N_LAYERS = 2
HIDDEN_DIM = 256

model = RNN(COMMENT_INPUT_DIM, CODE_INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, COMMENT_PAD_IDX, CODE_PAD_IDX, 
            DROPOUT)

In [89]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 5,359,113 trainable parameters


In [90]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)

embedding1.weight
embedding2.weight
rnn1.weight_ih_l0
rnn1.weight_hh_l0
rnn1.bias_ih_l0
rnn1.bias_hh_l0
rnn1.weight_ih_l0_reverse
rnn1.weight_hh_l0_reverse
rnn1.bias_ih_l0_reverse
rnn1.bias_hh_l0_reverse
rnn1.weight_ih_l1
rnn1.weight_hh_l1
rnn1.bias_ih_l1
rnn1.bias_hh_l1
rnn1.weight_ih_l1_reverse
rnn1.weight_hh_l1_reverse
rnn1.bias_ih_l1_reverse
rnn1.bias_hh_l1_reverse
rnn2.weight_ih_l0
rnn2.weight_hh_l0
rnn2.bias_ih_l0
rnn2.bias_hh_l0
rnn2.weight_ih_l0_reverse
rnn2.weight_hh_l0_reverse
rnn2.bias_ih_l0_reverse
rnn2.bias_hh_l0_reverse
rnn2.weight_ih_l1
rnn2.weight_hh_l1
rnn2.bias_ih_l1
rnn2.bias_hh_l1
rnn2.weight_ih_l1_reverse
rnn2.weight_hh_l1_reverse
rnn2.bias_ih_l1_reverse
rnn2.bias_hh_l1_reverse
fc.weight
fc.bias


In [91]:
pretrained_embeddings = COMMENT.vocab.vectors

model.embedding1.weight.data.copy_(pretrained_embeddings)

pretrained_embeddings = CODE.vocab.vectors

model.embedding2.weight.data.copy_(pretrained_embeddings)

tensor([[-0.3300,  0.1821, -1.5894,  ..., -0.0754,  1.2962,  1.0890],
        [ 1.1888,  0.4801,  2.0308,  ...,  1.4999, -1.8085,  0.8142],
        [ 0.1443,  0.4395,  0.5832,  ...,  0.5013,  0.4954,  0.4992],
        ...,
        [ 0.4221,  0.6307, -0.1291,  ...,  0.1045,  0.1854, -0.4767],
        [-0.0274, -0.3202,  0.4358,  ...,  0.3812,  0.2655,  0.5815],
        [ 0.6292,  1.3534,  0.4736,  ..., -0.1559,  1.4951, -0.5672]])

In [92]:
COMMENT_UNK_IDX = COMMENT.vocab.stoi[COMMENT.unk_token]
CODE_UNK_IDX = CODE.vocab.stoi[CODE.unk_token]

model.embedding1.weight.data[COMMENT_UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding1.weight.data[COMMENT_PAD_IDX] = torch.zeros(EMBEDDING_DIM)


model.embedding2.weight.data[CODE_UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding2.weight.data[CODE_PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [93]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [94]:
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [95]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [96]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        comment, comment_lens = batch.c
        code, code_lens = batch.m
        predictions = model(comment, comment_lens, code, code_lens).squeeze(1)
            
        loss = criterion(predictions, batch.y)
        
        acc = binary_accuracy(predictions, batch.y)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [97]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            comment, comment_lens = batch.c
            code, code_lens = batch.m
            predictions = model(comment, comment_lens, code, code_lens).squeeze(1)
            
            loss = criterion(predictions, batch.y)
            
            acc = binary_accuracy(predictions, batch.y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [98]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [99]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 7s
	Train Loss: 0.697 | Train Acc: 51.24%
	 Val. Loss: 0.693 |  Val. Acc: 50.26%
Epoch: 02 | Epoch Time: 0m 6s
	Train Loss: 0.693 | Train Acc: 52.47%
	 Val. Loss: 0.694 |  Val. Acc: 50.74%
Epoch: 03 | Epoch Time: 0m 6s
	Train Loss: 0.690 | Train Acc: 52.95%
	 Val. Loss: 0.698 |  Val. Acc: 50.65%
Epoch: 04 | Epoch Time: 0m 6s
	Train Loss: 0.678 | Train Acc: 56.18%
	 Val. Loss: 0.697 |  Val. Acc: 49.80%
Epoch: 05 | Epoch Time: 0m 6s
	Train Loss: 0.666 | Train Acc: 59.90%
	 Val. Loss: 0.711 |  Val. Acc: 51.58%


In [100]:
model.load_state_dict(torch.load('tut3-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.693 | Test Acc: 50.73%
