In [1]:
import json
import pandas as pd
from tqdm import tqdm
import sys

def read_data(file_name):
    items = []
    for i in open(file_name,'r').readlines():
        items.append(json.loads(i))
    return pd.DataFrame(items)
def save_data(df, o_name, suffix = 'json'):
    df = df.astype(object)
    with open(f"{o_name}.{suffix}",'w+') as t:
        for i in tqdm(range(len(df))):
            item = df.iloc[i,:].to_dict()
            t.write(json.dumps(item)+'\n')
def save_dict(d, o_name):
    with open(f"{o_name}.json",'w+') as o:
        o.write(json.dumps(d))

In [2]:
tag = 'train'
train = read_data(f'./{tag}.json')
len(train), len(train.problem_id.unique())

(80906, 480)

In [3]:
tag = 'valid'
valid = read_data(f'./{tag}.json')
len(valid), len(valid.problem_id.unique())

(4742, 565)

In [4]:
tag = 'test'
test = read_data(f'./{tag}.json')
len(test), len(test.problem_id.unique())

(3583, 120)

In [5]:
def edit_distance(tokens1, tokens2):
    m, n = len(tokens1), len(tokens2)
    
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j
    
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if tokens1[i - 1] == tokens2[j - 1]:  
                dp[i][j] = dp[i - 1][j - 1]
            else:  
                dp[i][j] = min(dp[i - 1][j],     
                               dp[i][j - 1],     
                               dp[i - 1][j - 1]  
                              ) + 1

    return dp[m][n]

tokens1 = ["int", "main", "(", ")", "{", "return", "0", ";", "}"]
tokens2 = ["int", "main", "(", "int", "argc", ")", "{", "return", "1", ";", "}"]

distance = edit_distance(tokens1, tokens2)
distance

3

In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("/share/liangqingyuan/Models/deepseek-coder-1.3b-base")

In [12]:
edit_distance_list = []
rel_edit_dis = []
pos_token_list = []
neg_token_list = []
problem_list = []
unique_nl = []
for _, item in tqdm(test.iterrows(), total=len(test)):
    pos = item["pos"]
    neg = item["neg"]
    pos_tokens = tokenizer.tokenize(pos)
    neg_tokens = tokenizer.tokenize(neg)
    pos_token_list.append(len(pos_tokens))
    neg_token_list.append(len(neg_tokens))
    distance = edit_distance(pos_tokens, neg_tokens)
    edit_distance_list.append(distance)
    rel_edit_dis.append(distance / ((len(pos_tokens)+ len(neg_tokens)/2)))
    if item['nl'] not in unique_nl:
        unique_nl.append(item['nl'])
        problem_list.append(len(tokenizer.tokenize(item['nl'])))
print('RED: ', sum(rel_edit_dis)/len(rel_edit_dis))
print('ED: ', sum(edit_distance_list)/len(edit_distance_list))
print('POS tokens: ', sum(pos_token_list)/len(pos_token_list))
print('NEG tokens: ', sum(neg_token_list)/len(neg_token_list))
print('PROBLEM tokens: ', sum(problem_list)/len(problem_list))

100%|██████████| 3583/3583 [00:32<00:00, 111.50it/s]

RED:  0.12154382399194737
ED:  19.8601730393525
POS tokens:  133.35835891710857
NEG tokens:  129.2416969020374
PROBLEM tokens:  178.68333333333334





In [13]:
len(unique_nl)

120

In [14]:
edit_distance_list = []
rel_edit_dis = []
pos_token_list = []
neg_token_list = []
problem_list = []
unique_nl = []
for _, item in tqdm(train.iterrows(), total=len(train)):
    pos = item["pos"]
    neg = item["neg"]
    pos_tokens = tokenizer.tokenize(pos)
    neg_tokens = tokenizer.tokenize(neg)
    pos_token_list.append(len(pos_tokens))
    neg_token_list.append(len(neg_tokens))
    distance = edit_distance(pos_tokens, neg_tokens)
    edit_distance_list.append(distance)
    rel_edit_dis.append(distance / ((len(pos_tokens)+ len(neg_tokens)/2)))
    if item['nl'] not in unique_nl:
        unique_nl.append(item['nl'])
        problem_list.append(len(tokenizer.tokenize(item['nl'])))
print('RED: ', sum(rel_edit_dis)/len(rel_edit_dis))
print('ED: ', sum(edit_distance_list)/len(edit_distance_list))
print('POS tokens: ', sum(pos_token_list)/len(pos_token_list))
print('NEG tokens: ', sum(neg_token_list)/len(neg_token_list))
print('PROBLEM tokens: ', sum(problem_list)/len(problem_list))

100%|██████████| 80906/80906 [09:04<00:00, 148.66it/s]

RED:  0.11091876696767959
ED:  15.51968951622871
POS tokens:  112.82112575087139
NEG tokens:  109.64739327120363
PROBLEM tokens:  187.6375





In [15]:
edit_distance_list = []
rel_edit_dis = []
pos_token_list = []
neg_token_list = []
problem_list = []
unique_nl = []
for _, item in tqdm(valid.iterrows(), total=len(valid)):
    pos = item["pos"]
    neg = item["neg"]
    pos_tokens = tokenizer.tokenize(pos)
    neg_tokens = tokenizer.tokenize(neg)
    pos_token_list.append(len(pos_tokens))
    neg_token_list.append(len(neg_tokens))
    distance = edit_distance(pos_tokens, neg_tokens)
    edit_distance_list.append(distance)
    rel_edit_dis.append(distance / ((len(pos_tokens)+ len(neg_tokens)/2)))
    if item['nl'] not in unique_nl:
        unique_nl.append(item['nl'])
        problem_list.append(len(tokenizer.tokenize(item['nl'])))
print('RED: ', sum(rel_edit_dis)/len(rel_edit_dis))
print('ED: ', sum(edit_distance_list)/len(edit_distance_list))
print('POS tokens: ', sum(pos_token_list)/len(pos_token_list))
print('NEG tokens: ', sum(neg_token_list)/len(neg_token_list))
print('PROBLEM tokens: ', sum(problem_list)/len(problem_list))

100%|██████████| 4742/4742 [01:29<00:00, 53.05it/s] 

RED:  0.09951244163072674
ED:  22.783002952340784
POS tokens:  202.30514550822437
NEG tokens:  198.5183466891607
PROBLEM tokens:  273.7362831858407



