In [1]:
import sys
sys.path.append('..')

In [2]:
import os

# Process-level switches, set here ahead of the torch/transformers imports in the next cell.
os.environ.update({
    'TOKENIZERS_PARALLELISM': 'true',
    'CUDA_VISIBLE_DEVICES': '2',
})

In [3]:
import re
import os
import torch
import json
import numpy as np
import pandas as pd
import seaborn as sns
import transformers
import catalyst
import difflib
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from transformers import T5Tokenizer, T5TokenizerFast, T5ForConditionalGeneration
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from typing import Callable, Union, Tuple
from catalyst.loggers.wandb import WandbLogger
from tqdm.notebook import tqdm
from collections import Counter
from torch import nn
from catalyst import dl
from catalyst.callbacks.periodic_loader import PeriodicLoaderCallback
from langdetect import detect
from easse.sari import corpus_sari
from rouge import Rouge

In [4]:
import random
from dataclasses import dataclass

@dataclass
class Config:
    """Empty settings container; experiment options are attached as attributes later."""
    pass

CONFIG = Config()
CONFIG.seed = 1337

# Seed every random number generator the notebook touches.
random.seed(CONFIG.seed)
# Only affects Python processes started after this point (hash seed is fixed at interpreter start).
os.environ['PYTHONHASHSEED'] = str(CONFIG.seed)
np.random.seed(CONFIG.seed)
torch.manual_seed(CONFIG.seed)
# manual_seed_all seeds every CUDA device, not just the current one.
torch.cuda.manual_seed_all(CONFIG.seed)
torch.backends.cudnn.deterministic = True
# benchmark mode auto-selects kernels per run and can pick different algorithms,
# which undermines the deterministic setting above, so it is switched off.
torch.backends.cudnn.benchmark = False

## Model

In [5]:
import torch
from transformers import CodeGenForCausalLM, AutoTokenizer
from transformers import RobertaTokenizer, T5ForConditionalGeneration

# Truncation limits (in tokens) for prompts and targets, and the checkpoint to start from.
CONFIG.src_max_len = 512
CONFIG.tgt_max_len = 512
CONFIG.pretrained = 'Salesforce/codet5-small'

tokenizer = RobertaTokenizer.from_pretrained(CONFIG.pretrained)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained(CONFIG.pretrained).to(device)

# Read the separator through the tokenizer's public attribute instead of
# digging into its private __dict__/init_kwargs.
SEP_TOKEN = str(tokenizer.sep_token)

## Read data

In [6]:
import pandas as pd

# Load the dataset from disk, then display its (rows, columns) shape.
df_new_disk = pd.read_json('data/final_data.json')
df_new_disk.shape

(74455, 10)

In [7]:
# Preview the first two records to check the loaded columns.
df_new_disk.head(2)

Unnamed: 0,code_before,code_after,commit_msg,com_py2imports_before,com_py2imports_after,diff,com_hash,imports,lib8funcs,docs
0,"def _misc(self, func, opts, args):\n ...","def _misc(self, func, opts, args):\n ...",http://code.google.com/p/pytyrant/issues/detai...,"{'0_pytyrant.py': ['import socket', 'import st...","{'0_pytyrant.py': ['import socket', 'import st...","@@ -493,8 +493,10 @@ class Tyrant(object):\n ...",32618ec0a7a05bad587064e20adfb80cb7bd1860,"[import UserDict, import struct, import socket]","[{'UserDict': '*', 'struct': '*', 'socket': '*...",[]
1,"def _tDouble(code, key, integ, fract):\n re...","def _tDouble(code, key, integ, fract):\n re...",fix adddouble typo,"{'0_pytyrant.py': ['import math', 'import sock...","{'0_pytyrant.py': ['import math', 'import sock...","@@ -147,7 +147,7 @@ def _t3F(code, func, opts,...",8b5d9f765a0fbbd76cc7b7ff8d2593436c6a2360,"[import struct, import UserDict, import math, ...","[{'struct': '*', 'UserDict': '*', 'math': '*',...",[Functions to convert between Python values an...


In [8]:
# 'com_py2imports_after' is iterated per row; its first item becomes the row's file name.
first_file_names = df_new_disk['com_py2imports_after'].apply(lambda files: [*files][0])
df_new_disk['file_name'] = first_file_names

In [9]:
def generate_prompts(row):
    """Build the model input text for one dataset record.

    Args:
        row: mapping-like record providing 'commit_msg' and 'code_before'.

    Returns:
        The commit message, the separator token and the pre-edit code joined
        by single spaces, terminated with a newline.
    """
    # The former unused 'file_name' lookup was dropped: it made the function fail
    # on rows without that column although the value was never used.
    commit_msg = row['commit_msg']
    code_before = row['code_before']
    return f'{commit_msg} {SEP_TOKEN} {code_before}\n'

In [10]:
# Row-wise application: every record gets its own prompt string.
df_new_disk['prompt'] = df_new_disk.apply(generate_prompts, axis=1)

In [11]:
# Print one generated prompt as a sanity check of the "<message> <sep> <code>" layout.
print(df_new_disk['prompt'].iloc[100])

Fix RBF features. </s>     def __init__(self, nrows = 5, ncols = 5, walls=[(1,1),(1,2),(1,3),(2,1),(2,2),(2,3),(3,1),(3,2),(3,3)], endstates = [0]):
        self.nrows = nrows
        self.ncols = ncols

        self.walls = walls
        grid = [self.coords(i) for i in range(self.nrows * self.ncols)]
        grid = [s for s in grid if not s in self.walls]
        self.states = dict([(i,s) for (i,s) in enumerate(grid)])
        self.rstates = dict([(s,i) for (i,s) in enumerate(grid)]) # reverse lookup by grid coords

        self.nstates = len(self.states)
        self.nactions = 8
        self.endstates = endstates

        MDP.__init__(self, nstates = self.nstates, nactions = self.nactions)



In [12]:
# Row positions of the test split, stored as a NumPy .npy file.
X_test_idx = np.load('data/test_idxs.npy')

In [13]:
# Take an explicit copy: a column is added to X_test later, and writing into a
# positional slice of df_new_disk triggers pandas' SettingWithCopy behaviour.
X_test = df_new_disk.iloc[X_test_idx].copy()

In [14]:
# Size of the test split as (rows, columns).
X_test.shape

(7446, 12)

## Dataset

In [15]:
from typing import Union, Callable, Tuple
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast

class EditDataset(torch.utils.data.Dataset):
    """Dataset of pre-tokenized (prompt, edited code) pairs taken from a DataFrame.

    The 'prompt' column is the source text and 'code_after' the target text.
    Both are tokenized once, at construction time, with truncation to the
    configured maximum lengths; padding is deferred to `collate_fn_new`.
    """

    def __init__(self, 
                 dataset: pd.DataFrame, 
                 tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
                 config):
        """Tokenize every source and target text of `dataset`.

        Args:
            dataset: frame providing the 'prompt' and 'code_after' columns.
            tokenizer: tokenizer called on each text individually.
            config: object exposing `src_max_len` and `tgt_max_len`.
        """
        self.db = dataset
        self.tokenizer = tokenizer

        prompts = self.db['prompt'].values
        targets = self.db['code_after'].values

        def encode_all(texts, limit_of):
            # No attention mask is requested here.
            encoded = []
            for text in texts:
                encoded.append(tokenizer(text,
                                         max_length=limit_of(),
                                         truncation=True,
                                         return_attention_mask=False))
            return encoded

        self.src_text_tokenized = encode_all(prompts, lambda: config.src_max_len)
        self.tgt_text_tokenized = encode_all(targets, lambda: config.tgt_max_len)

    def __len__(self):
        """Number of rows in the backing frame."""
        return len(self.db)

    def __getitem__(self, 
                    idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the tokenizer outputs (source, target) stored at position `idx`."""
        return self.src_text_tokenized[idx], self.tgt_text_tokenized[idx]

    @staticmethod
    def collate_fn_new(samples: Tuple[torch.Tensor, torch.Tensor], 
                   tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], 
                   config: Config) -> Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
        """Pad a list of (source, target) items into batch tensors.

        Args:
            samples: items as produced by `__getitem__`.
            tokenizer: tokenizer whose `pad` method performs the padding.
            config: object exposing `src_max_len` and `tgt_max_len`.

        Returns:
            `((src_ids, tgt_ids), tgt_ids)`: the padded 'input_ids' of sources
            and targets as model input, and the target ids again as the label slot.
        """
        sources = [pair[0] for pair in samples]
        targets = [pair[1] for pair in samples]

        def pad_ids(encodings, limit):
            # Pad to the longest item of this batch and keep only the token ids.
            padded = tokenizer.pad(encodings,
                                   padding='longest',
                                   max_length=limit,
                                   return_attention_mask=False,
                                   return_tensors='pt')
            return padded['input_ids']

        src_batch = pad_ids(sources, config.src_max_len)
        tgt_batch = pad_ids(targets, config.tgt_max_len)
        return (src_batch, tgt_batch), tgt_batch

In [16]:
# Tokenizes every prompt/target of the test split up front (done inside EditDataset.__init__).
test_ds = EditDataset(X_test, tokenizer, CONFIG)

## Model loading and inference

In [17]:
CONFIG.batch_size = 16


def _collate_batch(samples):
    """Pad one batch using the notebook-level tokenizer and CONFIG."""
    return EditDataset.collate_fn_new(samples, tokenizer, CONFIG)


# shuffle=False keeps batches in dataset order, so outputs line up with the rows of X_test.
valid_full_loader = torch.utils.data.DataLoader(
    test_ds,
    batch_size=CONFIG.batch_size,
    collate_fn=_collate_batch,
    num_workers=4,
    shuffle=False,
)
loaders = {'valid_full': valid_full_loader}

In [18]:
class EditModel(nn.Module):
    """Wrapper whose forward pass returns the wrapped seq2seq model's loss.

    The wrapped model is stored as `self.pretrained`, so checkpoints saved from
    this wrapper carry parameter names under that attribute.
    """

    # Token id treated as padding in both sources and targets.
    PAD_TOKEN_ID = 0
    # Label value that the Hugging Face loss skips.
    IGNORE_INDEX = -100

    def __init__(self, 
                 pretrained: 'transformers.modeling_utils.PreTrainedModel', 
                 config: 'Config'):
        """
        Args:
            pretrained: model called with `input_ids`, `attention_mask` and `labels`.
            config: accepted for interface compatibility; not used here.
        """
        super(EditModel, self).__init__()
        self.pretrained = pretrained

    def forward(self, 
                x: Tuple[torch.Tensor, torch.Tensor]):
        """Compute the loss for one batch.

        Args:
            x: `(src, tgt)` tensors of token ids, padded with `PAD_TOKEN_ID`.

        Returns:
            The `.loss` attribute of the wrapped model's output.
        """
        src, tgt = x

        # The previous line `tgt[tgt == 0] == -100` was a comparison whose result was
        # discarded, so padded positions still contributed to the loss.
        # masked_fill builds a new tensor, leaving the caller's `tgt` untouched.
        labels = tgt.masked_fill(tgt == self.PAD_TOKEN_ID, self.IGNORE_INDEX)

        loss = self.pretrained(
            input_ids=src,
            attention_mask=(src != self.PAD_TOKEN_ID).float(),
            labels=labels,
        ).loss
        return loss
    
    
class Criterion(nn.Module):
    """Pass-through criterion: the prediction handed in is returned as the loss."""

    def __init__(self):
        super().__init__()

    def forward(self, pred, tgt):
        # `tgt` is part of the call signature but plays no role in the result.
        loss = pred
        return loss

In [19]:
# Shell listing of the local checkpoint directories (IPython `!` escape).
!ls ./models/Salesforce

'codet5-small p(x_t+1 | x_t, commit_msg)'


In [20]:
CONFIG.pattern_path = './models/Salesforce/codet5-small p(x_t+1 | x_t, commit_msg)'
# Fall back to CPU when no GPU is available, matching the device selection used
# for the base model earlier in the notebook.
CONFIG.device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The checkpoint was saved from the EditModel wrapper, so the weights are loaded
# into the wrapper first.
model_edit = EditModel(T5ForConditionalGeneration.from_pretrained(CONFIG.pretrained), CONFIG)
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
model_edit.load_state_dict(
    torch.load(f'{CONFIG.pattern_path}/checkpoints/model.best.pth', 
               map_location=CONFIG.device))
# Unwrap: only the inner model is needed from here on.
model_edit = model_edit.pretrained
model_edit.to(CONFIG.device)
model_edit.eval()
print('Success')

Success


In [21]:
from tqdm.auto import tqdm

device = CONFIG.device
CONFIG.beam_size = 1
# Number of sequences generate() returns per input; the regrouping below must use
# this value (not the beam width), otherwise rows of different samples get mixed
# as soon as beam_size > 1.
NUM_RETURN_SEQUENCES = 1

predictions = []
with torch.no_grad():
    for batch in tqdm(loaders['valid_full'], total=len(loaders['valid_full'])):
        (src_, tgt_), _ = batch
        src_ = src_.to(device)

        generated = model_edit.generate(src_,
                                        # Same padding convention as EditModel.forward (id 0 = padding).
                                        attention_mask=(src_ != 0).long(),
                                        num_beams=CONFIG.beam_size,
                                        num_return_sequences=NUM_RETURN_SEQUENCES,
                                        max_length=CONFIG.tgt_max_len)
        generated = generated.cpu()

        # Regroup the flat output into (sample, returned sequence, token).
        pred = generated.view(-1, NUM_RETURN_SEQUENCES, generated.shape[1])
        for sample_candidates in pred:
            # One list of decoded candidates per input sample.
            predictions.append([
                tokenizer.decode(candidate, skip_special_tokens=True)
                for candidate in sample_candidates
            ])

  0%|          | 0/466 [00:00<?, ?it/s]

In [31]:
# First generated candidate for test sample 5.
print(predictions[5][0])

    def on_mode(self, c, e):
        self.write_event("mode", e,
                         {"%modes%" : e.arguments()[0],
                          "%person%" : e.arguments()[1],
                          "%giver%" : nm_to_n(e.source()),
                         })


In [32]:
# Reference (post-edit) code for the same test sample, for a side-by-side comparison.
print(X_test['code_after'].iloc[5])

    def on_mode(self, c, e):
        self.write_event("mode", e,
                         {"%modes%" : e.arguments()[0],
                          "%person%" : e.arguments()[1] if len(e.arguments()) > 1 else "",
                          "%giver%" : nm_to_n(e.source()),
                         })


In [33]:
# assign() builds a new frame instead of writing a column into a slice of
# df_new_disk, and re-running the cell stays idempotent.
X_test = X_test.assign(predicted_text=[el[0] for el in predictions])

In [34]:
# Exact-match count and rate of the predictions against the reference code.
n_exact_pred = np.sum(X_test['predicted_text'] == X_test['code_after'])
print(n_exact_pred)
print(n_exact_pred / X_test.shape[0])

1073
0.14410421702927748


In [35]:
# Copy baseline: how often the unedited input already equals the reference code.
n_exact_copy = np.sum(X_test['code_before'] == X_test['code_after'])
print(n_exact_copy)
print(n_exact_copy / X_test.shape[0])

499
0.06701584743486436
