In [344]:
import torch
from torchtext import data
from torch.utils.data import DataLoader
from torchtext.data import TabularDataset, BucketIterator
import pandas as pd
import numpy as np
import os
from transformers import AutoTokenizer
from dotenv import load_dotenv, find_dotenv

In [238]:
load_dotenv(find_dotenv())

True

In [239]:
project_root = os.environ.get("PROJECT_ROOT")
data_dir = os.environ.get("DATA_DIR")
dataset_fullpath = os.path.join(project_root, data_dir, "output")
small_dataset_output = os.path.join(dataset_fullpath, "model_output", "small_patches")
large_dataset_output = os.path.join(dataset_fullpath, "model_output", "large_patches")

In [240]:
large_patches_df = pd.read_csv(os.path.join(dataset_fullpath, "large-patches.csv"))
small_patches_df = pd.read_csv(os.path.join(dataset_fullpath, "small-patches.csv")) 

# Processing small patches dataset

In [241]:
torch.manual_seed(42)

<torch._C.Generator at 0x7fb8500a4270>

In [286]:

def csv_train_test_split(save_path, df):

    df = df.sample(frac=1).reset_index(drop=True)  

    df['numerical_label'] = df['label'].replace({'correct': 0, 'overfitting': 1})

    total_rows = len(df)
    first = int(0.8 * total_rows)
    second = int(0.9 * total_rows)

    train_df = df.iloc[:first]
    val_df = df.iloc[first:second]
    test_df = df.iloc[second:]

    train_df.to_csv(os.path.join(save_path, "train.csv"), index=False)
    val_df.to_csv(os.path.join(save_path, "val.csv"), index=False)
    test_df.to_csv(os.path.join(save_path, "test.csv"), index=False)

    return train_df, val_df, test_df

In [303]:
train_df, val_df, test_df = csv_train_test_split(small_dataset_output, small_patches_df)

  df['numerical_label'] = df['label'].replace({'correct': 0, 'overfitting': 1})


Using codebert pretrained tokenizer

In [292]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/codebert-base')
pad_index = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
unk_index = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

Building the fields 

In [420]:
MAX_SEQ_LEN = 1024

In [421]:
# buggy # patch # label
BUGGY_CODE = data.Field(use_vocab =False, tokenize = tokenizer.encode, pad_token=pad_index, unk_token=unk_index, fix_length=MAX_SEQ_LEN)
PATCH_CODE = data.Field(use_vocab =False, tokenize= tokenizer.encode, pad_token=pad_index, unk_token=unk_index, fix_length=MAX_SEQ_LEN)
LABEL = data.Field(use_vocab=False, sequential=False)

In [422]:
fields = [("dataset", None),
        ("tool", None),
        ("buggy", BUGGY_CODE),
        ("patch", PATCH_CODE),
        ("label", None),
        ("numerical_label", LABEL)]

In [423]:
fields

[('dataset', None),
 ('tool', None),
 ('buggy', <torchtext.data.field.Field at 0x7fb6f3efcd30>),
 ('patch', <torchtext.data.field.Field at 0x7fb6f3efd5a0>),
 ('label', None),
 ('numerical_label', <torchtext.data.field.Field at 0x7fb6f3efc100>)]

In [424]:
train = data.TabularDataset(
        path= os.path.join(small_dataset_output, "train.csv"),
        format="csv",
        skip_header=True,
        fields=fields
        )
val =  data.TabularDataset(
        path= os.path.join(small_dataset_output, "val.csv"),
        format="csv",
        skip_header=True,
        fields=fields
        )
test = data.TabularDataset(
        path= os.path.join(small_dataset_output, "test.csv"),
        format="csv",
        skip_header=True,
        fields=fields
        )

In [425]:

print(f"Number of training examples: {len(train.examples)}")
print(f"Number of validation examples: {len(val.examples)}")
print(f"Number of testing examples: {len(test.examples)}")

Number of training examples: 946
Number of validation examples: 118
Number of testing examples: 119


In [426]:
print(vars(train.examples[0]))

{'buggy': [0, 26516, 1627, 5457, 15471, 1627, 4, 6460, 48398, 47006, 35524, 860, 25522, 21375, 39525, 48587, 1026, 7933, 5457, 36, 45093, 39525, 48587, 43, 10566, 48587, 4, 6460, 39525, 14699, 49483, 1640, 10672, 31774, 6, 86, 31774, 6, 38879, 4397, 26602, 6184, 5457, 1026, 7933, 4, 560, 48689, 47006, 1437, 2], 'patch': [0, 1594, 1640, 26516, 1627, 328, 5214, 15755, 48512, 762, 5214, 4651, 34587, 1640, 5282, 6, 26516, 1627, 4397, 35524, 1437, 2], 'numerical_label': '1'}


In [427]:
device = "cuda:1" if torch.cuda.is_available() else "cpu"

In [428]:
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, val, test), 
    batch_size = BATCH_SIZE, 
    device = device)

In [429]:
for i, batch in enumerate(train_iterator):
    print("i:",i)
    one_bug_sample = batch.buggy
    one_patch_sample = batch.patch
    print(batch.buggy)
    print(batch.buggy.size())
    print(batch.patch)
    print(batch.patch.size())
    print(batch.numerical_label.size())
    break

i: 0
tensor([[    0,     0,     0,  ...,     0,     0,     0],
        [ 1594, 24303, 14582,  ...,  1990,  1594, 24303],
        [   36,   898,    10,  ...,    36,    36,   449],
        ...,
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1]], device='cuda:1')
torch.Size([1024, 64])
tensor([[    0,     0,     0,  ...,     0,     0,     0],
        [30921,     2,  1437,  ...,  1594,  1594,  1594],
        [23796,     1,     2,  ...,    36,    36,    36],
        ...,
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1]], device='cuda:1')
torch.Size([1024, 64])
torch.Size([64])


In [430]:
vocab_size = len(tokenizer)

print("Vocabulary Size:", vocab_size) 

Vocabulary Size: 50265


# Building the model architecture

In [431]:
import torch.nn as nn
import math
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [432]:
import torch.nn as nn
import math

class CodeEncoder(nn.Module):
  def __init__(self, vocab_size, embedding_dim, num_heads, num_layers, dropout=0.1):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.pos_encoder = PositionalEncoding(d_model=embedding_dim)
    self.embedding_dim = embedding_dim

    self.transformer_encoder = nn.TransformerEncoder(
        nn.TransformerEncoderLayer(
            d_model=embedding_dim, nhead=num_heads, dim_feedforward=4 * embedding_dim, dropout=dropout
        ),
        num_layers=num_layers
    )

  def forward(self, code_input):
    embedded_code = self.embedding(code_input) * math.sqrt(self.embedding_dim) 
    encoded_code = self.pos_encoder(embedded_code)
    output = self.transformer_encoder(encoded_code)  
  
    return output

In [433]:
code_encoder = CodeEncoder(vocab_size, embedding_dim=300, num_heads=4, num_layers=4)
code_encoder.to(device)

CodeEncoder(
  (embedding): Embedding(50265, 300)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
        )
        (linear1): Linear(in_features=300, out_features=1200, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=1200, out_features=300, bias=True)
        (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
)

In [434]:
print(one_bug_sample.shape)
print(one_patch_sample.shape)

torch.Size([1024, 64])
torch.Size([1024, 64])


In [435]:
output = code_encoder(one_bug_sample)

In [436]:
output.shape

torch.Size([1024, 64, 300])

In [437]:
output2 = code_encoder(one_patch_sample)

In [438]:
output2.shape

torch.Size([1024, 64, 300])

In [466]:
one_bug = one_bug_sample[:, 2]

In [468]:
tokenizer.convert_ids_to_tokens(one_bug)

['<s>',
 'double',
 'Ġa',
 '2',
 'Ġ=',
 'Ġ(',
 'work',
 '[',
 'np',
 'Ġ-',
 'Ġ8',
 ']',
 'Ġ/',
 'Ġb',
 '2',
 ')',
 'Ġ*',
 'Ġ(',
 '1',
 'Ġ+',
 'Ġwork',
 '[',
 'np',
 'Ġ-',
 'Ġ4',
 ']',
 'Ġ/',
 'Ġb',
 '1',
 ');',
 'Ġif',
 'Ġ(',
 'end',
 'Ġ-',
 'Ġstart',
 'Ġ>',
 'Ġ2',
 ')',
 'Ġ{',
 'Ġb',
 '2',
 'Ġ=',
 'Ġwork',
 '[',
 'nn',
 'Ġ-',
 'Ġ13',
 ']',
 'Ġ/',
 'Ġwork',
 '[',
 'nn',
 'Ġ-',
 'Ġ15',
 '];',
 'Ġa',
 '2',
 'Ġ=',
 'Ġa',
 '2',
 'Ġ+',
 'Ġb',
 '2',
 ';',
 'Ġfor',
 'Ġ(',
 'int',
 'Ġi',
 '4',
 'Ġ=',
 'Ġn',
 'n',
 'Ġ-',
 'Ġ17',
 ';',
 'Ġi',
 '4',
 'Ġ>=',
 'Ġ4',
 'Ġ*',
 'Ġstart',
 'Ġ+',
 'Ġ2',
 'Ġ+',
 'Ġping',
 'P',
 'ong',
 ';',
 'Ġi',
 '4',
 'Ġ-=',
 'Ġ4',
 ')',
 'Ġ{',
 'Ġif',
 'Ġ(',
 'b',
 '2',
 'Ġ==',
 'Ġ0',
 '.',
 '0',
 ')',
 'Ġ{',
 'Ġbreak',
 ';',
 'Ġ}',
 'Ġb',
 '1',
 'Ġ=',
 'Ġb',
 '2',
 ';',
 'Ġif',
 'Ġ(',
 'work',
 '[',
 'i',
 '4',
 ']',
 'Ġ>',
 'Ġwork',
 '[',
 'i',
 '4',
 'Ġ-',
 'Ġ2',
 '])',
 'Ġ{',
 'Ġreturn',
 ';',
 'Ġ}',
 'Ġb',
 '2',
 'Ġ=',
 'Ġb',
 '2',
 'Ġ*',
 'Ġ(',
 'work',
 '[',
 '

In [465]:
one_bug = one_bug_sample[:, 2]