In [63]:
import json
import pickle
import sys
from collections import Counter
from collections import deque
from nltk import ngrams

import astor
import matplotlib.pyplot as plt
import pandas as pd
import yaml
from torchtext.data.metrics import bleu_score
from tqdm import tqdm

from asdl.ast_operation import make_iterlists, seq2ast, Grammar, GrammarRule
from asdl.grammar import ReduceAction
from config.config import init_arg_parser
from dataset.utils import tokenize_for_bleu_eval

# CoNaLa

In [11]:
conala_path_train = 'dataset/data_conala/conala-corpus/conala-train.json'
conala_path_test = 'dataset/data_conala/conala-corpus/conala-test.json'

conala_preprocessing = json.load(open(conala_path_train))
conala_test_preprocessing = json.load(open(conala_path_test))

train_set_conala = pd.read_csv('dataset/data_conala/train/conala-train.csv')
dev_set_conala = pd.read_csv('dataset/data_conala/train/conala-val.csv')
test_set_conala = pd.read_csv('dataset/data_conala/test/conala-test.csv')

# pydf_conala_preprocess = pd.concat([train_set_conala, dev_set_conala, test_set_conala])
pydf_conala_preprocess = pd.concat([train_set_conala, dev_set_conala])

In [15]:
replace_number = 0

for x in pydf_conala_preprocess.values:
    replace_number += len(eval(x[4]))

print('number of replacement CoNaLa:', replace_number)

number of replacement CoNaLa: 55812


In [60]:
number_token_input_actions = 0
total_token_input_actions = []
nl_tokens = 0

variables = ['var_0', 'var_1', 'var_2', 'var_3', 'str_0', 'str_1', 'str_2', 'str_3', 'str_4']

for x in pydf_conala_preprocess.values:
    nl = eval(x[0])
    nl_tokens += len(eval(x[0]))
    nl = [b for b in nl if b not in variables]
    actions = eval(x[1])
    actions = [b for b in actions if b not in variables]
    token_input_actions = list(set(nl) & set(actions))
    total_token_input_actions += token_input_actions
    number_token_input_actions += len(token_input_actions)

print('nombre tokens nl total', nl_tokens)
print('nombre de tokens dans input et actions (sans les var0, str0 + sans duplicates...)', len(list(dict.fromkeys(total_token_input_actions))))
print('nombre de tokens dans input et actions (sans les var0, str0...)', number_token_input_actions)

nombre tokens nl total 33327
nombre de tokens dans input et actions (sans les var0, str0 + sans duplicates...) 877
nombre de tokens dans input et actions (sans les var0, str0...) 3726


Sur CoNaLa, tokenizer BERT fait perdre 200 mots présents dans input et output

In [49]:
number_token_input_output = 0
total_token_input_code = []
nl_tokens = 0

for i, example in enumerate(conala_preprocessing):
    try:
        intent = example['rewritten_intent'].split()
    except:
        intent = example['intent'].split()
    nl_tokens += len(intent)
    snippet = tokenize_for_bleu_eval(example['snippet'])
    
    number_token_input_output += len(set(intent) & set(snippet))
    total_token_input_code += list(set(intent) & set(snippet))

#for i, example in enumerate(conala_test_preprocessing):
#    try:
#        intent = example['rewritten_intent'].split()
#    except:
#        intent = example['intent'].split()
#    nl_tokens += len(intent)
#    snippet = tokenize_for_bleu_eval(example['snippet'])
#    number_token_input_output += len(set(intent) & set(snippet))
    
print('nombre tokens nl total', nl_tokens)
print('nombre de tokens dans input et code (sans les var0, str0 + sans duplicates...)', len(list(dict.fromkeys(total_token_input_code))))
print('nombre de tokens dans input et code', number_token_input_output)

nombre tokens nl total 24593
nombre de tokens dans input et code (sans les var0, str0 + sans duplicates...) 441
nombre de tokens dans input et code 2024


In [45]:
intent = []
code = []

for value in conala_preprocessing:
    try:
        intent.append(value['rewritten_intent'].split())
    except:
        intent.append(value['intent'].split())
    code.append([tokenize_for_bleu_eval(value['snippet'])])
    
for value in conala_test_preprocessing:
    try:
        intent.append(value['rewritten_intent'].split())
    except:
        intent.append(value['intent'].split())
    code.append([tokenize_for_bleu_eval(value['snippet'])])

BLEU = bleu_score(intent, code)
print(BLEU * 100)

0.2882628605719756


In [10]:
intent = []
code = []

for value in conala_preprocessing:
    try:
        intent.append([value['rewritten_intent'].split()])
    except:
        intent.append([value['intent'].split()])
    code.append(tokenize_for_bleu_eval(value['snippet']))
    
for value in conala_test_preprocessing:
    try:
        intent.append([value['rewritten_intent'].split()])
    except:
        intent.append([value['intent'].split()])
    code.append(tokenize_for_bleu_eval(value['snippet']))

BLEU = bleu_score(code, intent)
print(BLEU * 100)

0.32475339248776436


# Django

In [50]:
annot_file = './dataset/data_django/all.anno'
code_file = './dataset/data_django/all.code'

train_set_django = pd.read_csv('./dataset/data_django/train.csv')
dev_set_django = pd.read_csv('./dataset/data_django/dev.csv')
#test_set_django = pd.read_csv('./dataset/data_django/test.csv')

#pydf_django_preprocess = pd.concat([train_set_django, dev_set_django, test_set_django])
pydf_django_preprocess = pd.concat([train_set_django, dev_set_django])

In [62]:
number_token_input_output = 0
nl_tokens = 0
total_token_input_code = []

for idx, (src_query, tgt_code) in enumerate(zip(open(annot_file), open(code_file))):
    try:
        src_query = src_query.strip()
        src_query = tokenize_for_bleu_eval(src_query)
        # src_query = src_query.split()
        nl_tokens += len(src_query)
        tgt_code = tgt_code.strip()
        tgt_code = tgt_code.split()
        number_token_input_output += len(set(src_query) & set(tgt_code))
        total_token_input_code += list(set(src_query) & set(tgt_code))
    except: 
        pass
    
print(nl_tokens)
print(len(total_token_input_code))
print('nombre de tokens dans input et code', number_token_input_output)

272851
72674
nombre de tokens dans input et code 72674


In [56]:
number_token_input_actions = 0
nl_tokens = 0
variables = ['str_0', 'str_1', 'str_2', 'str_3', 'str_4', 'str_5', 'str_6', 'str_7', 'var_0','var_1','var_2','var_3','var_4','var_5', 'var_6', 'var_7']

for x in pydf_django_preprocess.values:
    nl = eval(x[0])
    nl_tokens += len(nl)
    nl = [b for b in nl if b not in variables]
    actions = eval(x[2])
    actions = [b for b in actions if b not in variables]
    number_token_input_actions += len(set(nl) & set(actions))
    total_token_input_code += list(set(nl) & set(actions))

print(nl_tokens)
print(len(list(dict.fromkeys(total_token_input_code))))
print('nombre de tokens dans input et actions (sans les var0, str0...)', number_token_input_actions)

256304
5382
nombre de tokens dans input et actions (sans les var0, str0...) 42043


In [92]:
src_query = []
tgt_code = []

for idx, (query, code) in enumerate(zip(open(annot_file), open(code_file))):
    try:
        src_query.append(tokenize_for_bleu_eval(query.strip()))
        tgt_code.append([tokenize_for_bleu_eval(code.strip())])
        # tgt_code.append([code.strip().split()])
        # print('query', tokenize_for_bleu_eval(query.strip()))
        # print('code', tokenize_for_bleu_eval(code.strip()))
    except: 
        pass
    
BLEU = bleu_score(src_query, tgt_code)
print(BLEU * 100)

19.419631361961365


In [85]:
src_query = []
tgt_code = []

for idx, (query, code) in enumerate(zip(open(annot_file), open(code_file))):
    try:
        src_query.append([tokenize_for_bleu_eval(query.strip())])
        tgt_code.append(tokenize_for_bleu_eval(code.strip()))
    except: 
        pass
    
BLEU = bleu_score(tgt_code, src_query)
print(BLEU * 100)

18.86678050452676


# CodeSearchNet

In [33]:
train_path = './dataset/data_github/python/final/jsonl/train/'
dev_path = './dataset/data_github/python/final/jsonl/valid/'
test_path = './dataset/data_github/python/final/jsonl/test/'

train_set_csn = pd.read_csv(train_path + 'train.csv')
dev_set_csn = pd.read_csv(dev_path + 'valid.csv')
test_set_csn = pd.read_csv(test_path + 'test.csv')

pydf_csn_preprocess = pd.concat([train_set_csn, dev_set_csn, test_set_csn])

In [48]:
number_token_input_actions = 0
number_token_input_output = 0

variables = ['var_0', 'var_1', 'var_2', 'var_3', 'str_0', 'str_1', 'str_2', 'str_3', 'str_4']

for x in pydf_csn_preprocess.values:
    nl = eval(x[0])
    nl = [b for b in nl if b not in variables]
    actions = eval(x[6])
    actions = [b for b in actions if b not in variables]
    code = eval(x[1])
    number_token_input_output += len(set(nl) & set(code))
    number_token_input_actions += len(set(nl) & set(actions))
    

print('nombre de tokens dans input et code', number_token_input_output)
print('nombre de tokens dans input et actions (sans les var0, str0...)', number_token_input_actions)

nombre de tokens dans input et code 22450
nombre de tokens dans input et actions (sans les var0, str0...) 7622
