In [1]:
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
from transformers import RobertaTokenizer
import numpy as np
import matplotlib.pyplot as plt
from parser import DFG_java,DFG_csharp,DFG_python
from parser import (remove_comments_and_docstrings,
                   tree_to_token_index,
                   index_to_code_token,
                   tree_to_variable_index, 
                   detokenize_code, tree_to_token_nodes)
import sys
if 'tree_sitter' in sys.modules:
    del sys.modules['tree_sitter']
    
from tree_sitter import Language, Parser
import os
import pickle

def read_examples(split='train'):
    if (split=='valid') or (split=='dev'):
        split='validation'
    
    # Simplest possible load - no extra parameters
    dataset = load_dataset('code_x_glue_cc_code_to_code_trans', split=split)
    
    examples = []
    for example in dataset:
        examples.append({
            'id': example['id'],
            'java': example['java'],
            'cs': example['cs']
        })
    
    print(f'Loaded {len(examples)} examples from {split} split')
    return pd.DataFrame(examples)

def get_tokenizer_chars(tokenizer):
    tokenizer_chars = []
    for i in range(tokenizer.vocab_size):
        token = tokenizer.decode(i)
        if len(token)==1:
            tokenizer_chars.append(token)
    tokenizer_chars = [c for c in tokenizer_chars if c!='�']
    return tokenizer_chars


def tokenize_codes_texts(texts, batch_size=1024):
    tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
    tokenizer_chars = get_tokenizer_chars(tokenizer)
    texts = [''.join(filter(lambda c:c in tokenizer_chars, text)) for text in texts]
    N = len(texts)
    tokenized_texts = []
    for start in range(0, len(texts),batch_size):
        tokenized_texts += tokenizer(texts[start:start+batch_size]).input_ids
    return tokenized_texts


def length_stats(s, title="Stats"):
    # Check if s contains lists/strings or numbers
    first_val = s.iloc[0] if len(s) > 0 else None
    
    if isinstance(first_val, (list, str)):
        # For lists/strings, get lengths
        lens = s.apply(len)
    else:
        # For numeric data, use values directly
        lens = s
    
    print(title)
    print('mean=%.1f, median=%.1f, std=%.1f, max=%.1f, min=%.1f' %
          (lens.mean(), lens.median(), lens.std(), lens.max(), lens.min()))
    
    # Plotting disabled
    
    
def get_code_tokens_ranges(data, code_col, code_tokens_col):
    tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
    pbar = data.itertuples()
    ranges = []
    
    for row in pbar:
        code_tokens = [tokenizer.decode(ct) for ct in getattr(row, code_tokens_col)][1:-1] # 1:-1 to remove <s> and </s>
        code2 = ''.join(code_tokens) # may miss some spaces / special chars that are in row.code_col
        code = getattr(row, code_col)
        
        # map each position in code2 to a position in code
        code2_to_code = []
        j=0
        for i in range(len(code2)):
            while code2[i]!=code[j]:
                j += 1
            code2_to_code.append(j)
            
        # map each code token to a range in code
        code2_idx = 0
        curr_ranges = []
        for ct in code_tokens:
            s,e = code2_idx, code2_idx+len(ct)
            code2_idx = e
            curr_ranges.append((min(code2_to_code[s:e]),1+max(code2_to_code[s:e])))
        ranges.append([None]+curr_ranges+[None]) # first and last for <s> and </s>
        
    data[code_tokens_col+'_ranges'] = ranges
    
    
def extract_structure(code, parser):
    # ast
    tree = parser[0].parse(bytes(code,'utf8'))    
    root_node = tree.root_node  
    ast_token_nodes = tree_to_token_nodes(root_node) # leaves
    
    # dfg
    tokens_index = [(node.start_point, node.end_point) for node in ast_token_nodes]
    code=code.split('\n')
    code_tokens=[index_to_code_token(x,code) for x in tokens_index] 
    index_to_code={index:(idx,code_) for idx,(index,code_) in enumerate(zip(tokens_index,code_tokens))}
    try:
        DFG,_ = parser[1](root_node,index_to_code,{}) 
    except:
        DFG = []
    for d in DFG:
        assert (d[2]=='comesFrom' or d[2]=='computedFrom')
    DFG = [(d[1], d[4]) for d in DFG if (len(d[4])>0)] # left comes from right
    return code_tokens, ast_token_nodes, DFG


def format_node_ranges(code, nodes):
    line_lens = [len(line)+1 for line in code.split('\n')]
    line_starts = [0] + list(np.cumsum(line_lens))
    return [(line_starts[node.start_point[0]]+node.start_point[1],
             line_starts[node.end_point[0]]+node.end_point[1]) for node in nodes]
    
    
def add_structure(data, lang):
    LANGUAGE = Language('parser/my-languages2.so', 'c_sharp' if lang=='cs' else lang)
    parser = Parser()
    parser.set_language(LANGUAGE) 
    dfg_function={'python':DFG_python, 'java':DFG_java, 'cs':DFG_csharp, 'c_sharp':DFG_csharp}
    parser = [parser, dfg_function[lang]]
        
    ast_leaf_tokens, ast_leaves, ast_leaf_ranges, dfg_edges = [], [], [], []
    for code in data[lang]:
        curr_code_tokens, curr_ast_leaves, curr_dfg_edges = extract_structure(code, parser)
        ast_leaf_tokens.append(curr_code_tokens)
        ast_leaves.append(curr_ast_leaves)
        ast_leaf_ranges.append(format_node_ranges(code, curr_ast_leaves))
        dfg_edges.append(curr_dfg_edges)
        
    data[lang+'_ast_leaves'] = ast_leaves # list of leaf nodes
    data[lang+'_dfg_edges'] = dfg_edges # list of "left leaf node index comes from right leaf nodes indices"
    data[lang+'_ast_leaf_tokens'] = ast_leaf_tokens # list of code substrings corresponding to each leaf
    data[lang+'_ast_leaf_ranges'] = ast_leaf_ranges # list of (start,end) in code for each leaf node
    
    print ('# '+lang+' samples with failed/empty DFG:', (data[lang+'_dfg_edges'].apply(len)==0).sum())
    
    
def overlap(s1,e1,s2,e2):
    return s1<=s2<e1 or s2<=s1<e2
    
    
def get_leaf_code_token_indices(data, lang):
    ast_leaf_token_idxs = []
    pbar = data.itertuples() if len(data)<=100000 else tqdm(data.itertuples())
    for row in pbar:
        ast_leaf_token_idxs.append([])
        code_tokens_last_idx = len(getattr(row, lang+'_tokens'))-1
        code_tokens_ranges = getattr(row, lang+'_tokens_ranges')
        for s,e in getattr(row, lang+'_ast_leaf_ranges'):
            if s==e: # there are leaves with start_point=end_point
                ast_leaf_token_idxs[-1].append([])
                continue
            j = 1
            while not(overlap(s,e,code_tokens_ranges[j][0],code_tokens_ranges[j][1])):
                j += 1
                if j==code_tokens_last_idx: # can't find code tokens for this leaf
                    break
            if j==code_tokens_last_idx: # can't find code tokens for this leaf
                ast_leaf_token_idxs[-1].append([])
                continue
            curr_leaf_token_idxs = []
            while overlap(s,e,code_tokens_ranges[j][0],code_tokens_ranges[j][1]):
                curr_leaf_token_idxs.append(j)
                j += 1
                if j==code_tokens_last_idx:
                    break
            ast_leaf_token_idxs[-1].append(curr_leaf_token_idxs)
    data[lang+'_ast_leaf_code_token_idxs'] = ast_leaf_token_idxs
    print ('Average # leaves with no matching code tokens:', 
           data[lang+'_ast_leaf_code_token_idxs'].apply(lambda x:sum([1 for xi in x if xi==[]])).mean())
    

def get_lr_path(leaf):
    path = [leaf]
    while path[-1].parent is not None:
        path.append(path[-1].parent)
    return path


def get_ll_sim(p1, p2): 
    common = 1
    for i in range(2, min(len(p1), len(p2))+1):
        if p1[-i]==p2[-i]:
            common += 1
        else:
            break
    return common*common / (len(p1)*len(p2))   


def get_ast_lr_paths_and_ll_sim(data, lang):
    sims = []
    lr_paths = []
    for i,row in tqdm(enumerate(data.itertuples())):
        ast_leaves = getattr(row, lang+'_ast_leaves')
        L = min(len(ast_leaves), 512)
        curr_paths = [get_lr_path(leaf) for leaf in ast_leaves]
        curr_sims = np.ones((L,L))
        for i in range(L-1):
            for j in range(i+1,L):
                curr_sims[i,j] = curr_sims[j,i] = get_ll_sim(curr_paths[i], curr_paths[j])
        sims.append(curr_sims)
        lr_paths.append([[node.type for node in path] for path in curr_paths])
    data.drop(columns=[lang+'_ast_leaves'], inplace=True)
    data[lang+'_ll_sims'] = sims
    data[lang+'_lr_paths_types'] = lr_paths


def process_dfg_edges(data, lang):
    dfg_node_code_token_idxs = []
    dfg_edges = []
    pbar = data.itertuples()  
    for row in pbar:
        curr_dfg_edges = getattr(row, lang+'_dfg_edges')
        if len(curr_dfg_edges)>0:
            nodes = sorted(list(set(np.concatenate([[left]+right for left,right in curr_dfg_edges]))))
        else:
            nodes = []
        node_to_idx = {k:i for i,k in enumerate(nodes)}
        ast_leaf_code_token_idxs = getattr(row, lang+'_ast_leaf_code_token_idxs')
        dfg_node_code_token_idxs.append( [ast_leaf_code_token_idxs[i] for i in nodes] )
        dfg_edges.append( [(node_to_idx[left], [node_to_idx[r] for r in right]) for left,right in curr_dfg_edges] )
    data[lang+'_dfg_edges'] = dfg_edges
    data[lang+'_dfg_node_code_token_idxs'] = dfg_node_code_token_idxs
    
    
def some_more_stats(data, lang):
    node_types = set(np.concatenate(list(data[lang+'_lr_paths_types'].apply(lambda ll:np.concatenate(ll)))))
    print ('# node types:', len(node_types))
    if 'ERROR' in node_types:
        num_error_nodes = data[lang+'_lr_paths_types'].apply(lambda paths:
                        np.mean(['ERROR' in path for path in paths]))
        print ('Distrubution of fraction of leaf-root paths with ERROR node in one code')
        length_stats(num_error_nodes)
    print ('Distrubution of AST depth')
    length_stats(data[lang+'_lr_paths_types'].apply(lambda paths:max([len(p) for p in paths])))          
    print ('Distrubution of # ast leaves per code')
    length_stats(data[lang+'_ast_leaf_code_token_idxs'].apply(len))
    print ('Distrubution of # dfg nodes per code')
    length_stats(data[lang+'_dfg_node_code_token_idxs'].apply(len))
    print ('Distrubution of # dfg edges per code')
    def num_dfg_edges(s):
        if s==[]:
            return 0
        return sum([len(rights) for _,rights in s])
    length_stats(data[lang+'_dfg_edges'].apply(num_dfg_edges))
    return node_types


## CodeXGLUE translation

In [2]:
import datasets
#datasets.disable_caching()

save_dir = 'data/codexglue_translation/'
os.makedirs(save_dir, exist_ok=True)
all_node_types = set()
data_by_split = {}
for split in ['train', 'validation', 'test']:
    print ('\n\nSplit='+split)
    data = read_examples(split) # id, java, cs
    for lang in ['java', 'cs']:
        add_structure(data, lang) # lang_ -> ast_leaves, dfg_edges, ast_leaf_tokens, ast_leaf_ranges
        data[lang+'_tokens'] = tokenize_codes_texts(list(data[lang]))
        length_stats(data[lang+'_tokens'], 'Distribution of #'+lang+'_tokens')
        get_code_tokens_ranges(data, lang, lang+'_tokens') # list of (start,end) one for each code_token
        data.drop(columns=[lang], inplace=True)
        get_leaf_code_token_indices(data, lang)
        data.drop(columns=[lang+c for c in ['_ast_leaf_tokens', '_ast_leaf_ranges', '_tokens_ranges']], 
                  inplace=True)
        get_ast_lr_paths_and_ll_sim(data, lang)
        process_dfg_edges(data, lang)
        more_node_types = some_more_stats(data, lang)
        all_node_types.update(more_node_types)
    data_by_split[split] = data
    
# Map node types to indices.
all_node_types = sorted(list(all_node_types))
node_type_to_ind = {t:i for i,t in enumerate(all_node_types)}
pickle.dump(all_node_types, open(save_dir+'all_node_types.pkl', 'wb'))

# Convert node types on paths to indices.
for split in ['train', 'validation', 'test']:
    for lang in ['java', 'cs']:
        data_by_split[split][lang+'_lr_paths_types'] = data_by_split[split][lang+'_lr_paths_types'].apply(
                                            lambda ll: [[node_type_to_ind[t] for t in path] for path in ll])
        
# Save data. Not converting array cols to strings, storing with pickle.
pickle.dump(data_by_split, open(save_dir+'preprocessed_data_by_split.pkl','wb'))



Split=train


Reusing dataset code_x_glue_cc_code_to_code_trans (/Users/cris/.cache/huggingface/datasets/code_x_glue_cc_code_to_code_trans/default/0.0.0/86dd57d2b1e88c6e589646133b76f2fef9d56c82e933d7f276e8a5b60ab18c34)


Loaded 10300 examples from train split
# java samples with failed/empty DFG: 1457




Distribution of #java_tokens
mean=48.4, median=33.0, std=44.2, max=394.0, min=11.0




Average # leaves with no matching code tokens: 0.459126213592233


10300it [00:06, 1490.33it/s]


# node types: 177
Distrubution of fraction of leaf-root paths with ERROR node in one code
Stats
mean=0.0, median=0.0, std=0.1, max=1.0, min=0.0
Distrubution of AST depth
Stats
mean=8.6, median=8.0, std=2.2, max=39.0, min=5.0
Distrubution of # ast leaves per code
Stats
mean=38.8, median=22.0, std=38.9, max=314.0, min=10.0
Distrubution of # dfg nodes per code
Stats
mean=8.1, median=5.0, std=10.9, max=117.0, min=0.0
Distrubution of # dfg edges per code
Stats
mean=7.2, median=4.0, std=12.0, max=172.0, min=0.0




# cs samples with failed/empty DFG: 1192




Distribution of #cs_tokens
mean=59.4, median=55.0, std=44.8, max=407.0, min=12.0




Average # leaves with no matching code tokens: 0.47126213592233007


10300it [00:09, 1094.16it/s]


# node types: 210
Distrubution of fraction of leaf-root paths with ERROR node in one code
Stats
mean=0.0, median=0.0, std=0.1, max=1.0, min=0.0
Distrubution of AST depth
Stats
mean=10.8, median=11.0, std=2.7, max=28.0, min=4.0
Distrubution of # ast leaves per code
Stats
mean=46.2, median=45.0, std=38.7, max=320.0, min=10.0
Distrubution of # dfg nodes per code
Stats
mean=11.7, median=10.0, std=12.0, max=119.0, min=0.0
Distrubution of # dfg edges per code
Stats
mean=10.4, median=7.0, std=17.0, max=1024.0, min=0.0


Split=validation


Reusing dataset code_x_glue_cc_code_to_code_trans (/Users/cris/.cache/huggingface/datasets/code_x_glue_cc_code_to_code_trans/default/0.0.0/86dd57d2b1e88c6e589646133b76f2fef9d56c82e933d7f276e8a5b60ab18c34)


Loaded 500 examples from validation split
# java samples with failed/empty DFG: 70
Distribution of #java_tokens
mean=51.7, median=34.0, std=49.4, max=318.0, min=12.0




Average # leaves with no matching code tokens: 0.494


500it [00:00, 1290.55it/s]


# node types: 145
Distrubution of fraction of leaf-root paths with ERROR node in one code
Stats
mean=0.0, median=0.0, std=0.1, max=1.0, min=0.0
Distrubution of AST depth
Stats
mean=8.7, median=8.0, std=2.3, max=21.0, min=5.0
Distrubution of # ast leaves per code
Stats
mean=42.3, median=22.0, std=43.1, max=261.0, min=10.0
Distrubution of # dfg nodes per code
Stats
mean=9.2, median=5.0, std=12.5, max=79.0, min=0.0
Distrubution of # dfg edges per code
Stats
mean=8.6, median=4.0, std=15.5, max=176.0, min=0.0
# cs samples with failed/empty DFG: 51
Distribution of #cs_tokens
mean=63.8, median=62.0, std=49.4, max=324.0, min=13.0




Average # leaves with no matching code tokens: 0.586


500it [00:00, 927.69it/s] 


# node types: 150
Distrubution of fraction of leaf-root paths with ERROR node in one code
Stats
mean=0.0, median=0.0, std=0.1, max=1.0, min=0.0
Distrubution of AST depth
Stats
mean=11.1, median=11.0, std=2.8, max=24.0, min=6.0
Distrubution of # ast leaves per code
Stats
mean=50.7, median=45.0, std=43.0, max=282.0, min=10.0
Distrubution of # dfg nodes per code
Stats
mean=13.2, median=14.0, std=13.8, max=91.0, min=0.0
Distrubution of # dfg edges per code
Stats
mean=12.0, median=12.0, std=15.8, max=119.0, min=0.0


Split=test


Reusing dataset code_x_glue_cc_code_to_code_trans (/Users/cris/.cache/huggingface/datasets/code_x_glue_cc_code_to_code_trans/default/0.0.0/86dd57d2b1e88c6e589646133b76f2fef9d56c82e933d7f276e8a5b60ab18c34)


Loaded 1000 examples from test split
# java samples with failed/empty DFG: 160
Distribution of #java_tokens
mean=47.6, median=33.0, std=41.6, max=347.0, min=12.0




Average # leaves with no matching code tokens: 0.482


1000it [00:00, 1551.46it/s]


# node types: 153
Distrubution of fraction of leaf-root paths with ERROR node in one code
Stats
mean=0.0, median=0.0, std=0.1, max=1.0, min=0.0
Distrubution of AST depth
Stats
mean=8.6, median=8.0, std=2.1, max=19.0, min=5.0
Distrubution of # ast leaves per code
Stats
mean=38.5, median=22.0, std=38.0, max=306.0, min=10.0
Distrubution of # dfg nodes per code
Stats
mean=8.0, median=5.0, std=10.5, max=73.0, min=0.0
Distrubution of # dfg edges per code
Stats
mean=7.0, median=4.0, std=11.2, max=90.0, min=0.0
# cs samples with failed/empty DFG: 116




Distribution of #cs_tokens
mean=58.4, median=55.0, std=41.5, max=332.0, min=12.0




Average # leaves with no matching code tokens: 0.496


1000it [00:00, 1183.04it/s]


# node types: 174
Distrubution of fraction of leaf-root paths with ERROR node in one code
Stats
mean=0.0, median=0.0, std=0.1, max=1.0, min=0.0
Distrubution of AST depth
Stats
mean=10.7, median=11.0, std=2.7, max=22.0, min=6.0
Distrubution of # ast leaves per code
Stats
mean=45.8, median=45.0, std=37.4, max=270.0, min=10.0
Distrubution of # dfg nodes per code
Stats
mean=11.5, median=9.5, std=11.3, max=82.0, min=0.0
Distrubution of # dfg edges per code
Stats
mean=10.1, median=7.0, std=11.9, max=90.0, min=0.0


In [3]:
import sys
print(f"Python executable: {sys.executable}")
print(f"Python version: {sys.version}")

import tree_sitter
print(f"tree-sitter location: {tree_sitter.__file__}")

# Try to get version
try:
    from tree_sitter import LANGUAGE_VERSION
    print(f"tree-sitter LANGUAGE_VERSION: {LANGUAGE_VERSION}")
except:
    pass

Python executable: /Users/cris/Desktop/StructCoder/.venv/bin/python
Python version: 3.10.17 (main, May 22 2025, 01:38:43) [Clang 20.1.4 ]
tree-sitter location: /Users/cris/Desktop/StructCoder/.venv/lib/python3.10/site-packages/tree_sitter/__init__.py


In [4]:
import os
from pathlib import Path

parser_dir = Path('parser')

# List all .so files
print("Parser files:")
for f in parser_dir.glob('*.so*'):
    print(f"  {f.name}: {f.stat().st_size / 1024 / 1024:.1f} MB, modified: {f.stat().st_mtime}")
    
    # Check if it's a symlink
    if f.is_symlink():
        print(f"    → symlink to: {os.readlink(f)}")

# Test loading
from tree_sitter import Language
try:
    lang = Language('parser/my-languages2.so', 'java')
    print("\n✅ Successfully loaded parser!")
except Exception as e:
    print(f"\n❌ Error loading parser: {e}")

Parser files:
  my-languages.so.x86_backup: 6.6 MB, modified: 1764786776.5274994
  my-languages2.so.x86_backup: 6.9 MB, modified: 1764786776.5358791
  my-languages2.so: 11.4 MB, modified: 1769189794.7306406

✅ Successfully loaded parser!




In [5]:
from datasets import load_dataset
dataset_splits = load_dataset('code_x_glue_cc_code_to_code_trans')
for split in ['train', 'validation', 'test']:
    dataset = dataset_splits[split]
    lines = {'java':[], 'cs':[]}
    for eg in dataset:
        lines['java'].append(eg['java'])
        lines['cs'].append(eg['cs'])
    for lang in ['java','cs']:
        with open('data/codexglue_translation/'+split+'_'+lang+'.txt', 'w') as f:
            lines[lang][-1] = lines[lang][-1][:-1] # to remove last empty line
            f.writelines(lines[lang])        

Reusing dataset code_x_glue_cc_code_to_code_trans (/Users/cris/.cache/huggingface/datasets/code_x_glue_cc_code_to_code_trans/default/0.0.0/86dd57d2b1e88c6e589646133b76f2fef9d56c82e933d7f276e8a5b60ab18c34)


  0%|          | 0/3 [00:00<?, ?it/s]

## CodeXGLUE generation

In [7]:
import json 

def read_examples(split):
    # Map split names to HuggingFace naming convention
    if split in ['valid', 'validation']:
        split='validation'  # HuggingFace uses 'validation', not 'dev'
    
    # Load from HuggingFace instead of local files
    dataset = load_dataset('google/code_x_glue_tc_text_to_code', split=split)
    
    examples = []
    for idx, example in enumerate(dataset):
        nl = example['nl'].strip()
        code = example['code'].strip()
        assert (code == remove_comments_and_docstrings(code, 'java'))
        assert ('\n' not in code)
        examples.append([idx, nl, code])
    
    print(f'Loaded {len(examples)} examples from {split} split')
    return pd.DataFrame(examples, columns=['id', 'nl', 'java'])

save_dir = 'data/codexglue_generation/'
os.makedirs(save_dir, exist_ok=True)
all_node_types = set()
data_by_split = {}
lang = 'java'
for split in ['train', 'validation', 'test']:
    print ('\n\nSplit='+split)
    data = read_examples(split) # id, nl, java
    data['nl_tokens'] = tokenize_codes_texts(list(data['nl']))
    length_stats(data['nl_tokens'], 'Distribution of #text_tokens')
    add_structure(data, 'java') # lang_ -> ast_leaves, dfg_edges, ast_leaf_tokens, ast_leaf_ranges
    data[lang+'_tokens'] = tokenize_codes_texts(list(data[lang]))
    length_stats(data[lang+'_tokens'], 'Distribution of #'+lang+'_tokens')
    get_code_tokens_ranges(data, lang, lang+'_tokens') # list of (start,end) one for each code_token
    with open(save_dir+split+'_'+lang+'.txt', 'w') as f:
        lines = [code+'\n' for code in data[lang]]
        lines[-1] = lines[-1][:-1] # to remove last empty line
        f.writelines(lines) 
    data.drop(columns=['nl', lang], inplace=True)
    get_leaf_code_token_indices(data, lang)
    data.drop(columns=[lang+c for c in ['_ast_leaf_tokens', '_ast_leaf_ranges', '_tokens_ranges']], inplace=True)
    get_ast_lr_paths_and_ll_sim(data, lang)
    process_dfg_edges(data, lang)
    more_node_types = some_more_stats(data, lang)
    all_node_types.update(more_node_types)
    data_by_split[split] = data
    
# Map node types to indices.
all_node_types = sorted(list(all_node_types))
node_type_to_ind = {t:i for i,t in enumerate(all_node_types)}
pickle.dump(all_node_types, open(save_dir+'all_node_types.pkl', 'wb'))

# Convert node types on paths to indices.
for split in ['train', 'validation', 'test']:
    data_by_split[split][lang+'_lr_paths_types'] = data_by_split[split][lang+'_lr_paths_types'].apply(
                                            lambda ll: [[node_type_to_ind[t] for t in path] for path in ll])
        
# Save data. Not converting array cols to strings, storing with pickle.
pickle.dump(data_by_split, open(save_dir+'preprocessed_data_by_split.pkl','wb'))



Split=train


Using custom data configuration google--code_x_glue_tc_text_to_code-d024203af31b2592
Reusing dataset parquet (/Users/cris/.cache/huggingface/datasets/parquet/google--code_x_glue_tc_text_to_code-d024203af31b2592/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


Loaded 100000 examples from train split


Token indices sequence length is longer than the specified maximum sequence length for this model (609 > 512). Running this sequence through the model will result in indexing errors


Distribution of #text_tokens
mean=215.4, median=176.0, std=154.0, max=2248.0, min=20.0




# java samples with failed/empty DFG: 35480




Distribution of #java_tokens
mean=35.4, median=25.0, std=25.7, max=266.0, min=8.0




Average # leaves with no matching code tokens: 0.00645


100000it [00:27, 3629.00it/s]


# node types: 176
Distrubution of fraction of leaf-root paths with ERROR node in one code
Stats
mean=0.0, median=0.0, std=0.0, max=1.0, min=0.0
Distrubution of AST depth
Stats
mean=7.9, median=7.0, std=2.3, max=31.0, min=4.0
Distrubution of # ast leaves per code
Stats
mean=26.6, median=18.0, std=19.8, max=162.0, min=6.0
Distrubution of # dfg nodes per code
Stats
mean=4.2, median=3.0, std=5.5, max=55.0, min=0.0
Distrubution of # dfg edges per code
Stats
mean=3.4, median=2.0, std=5.4, max=95.0, min=0.0


Split=validation


Using custom data configuration google--code_x_glue_tc_text_to_code-d024203af31b2592
Reusing dataset parquet (/Users/cris/.cache/huggingface/datasets/parquet/google--code_x_glue_tc_text_to_code-d024203af31b2592/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


Loaded 2000 examples from validation split


Token indices sequence length is longer than the specified maximum sequence length for this model (604 > 512). Running this sequence through the model will result in indexing errors


Distribution of #text_tokens
mean=187.1, median=153.0, std=128.5, max=1128.0, min=24.0
# java samples with failed/empty DFG: 611




Distribution of #java_tokens
mean=41.1, median=33.0, std=28.6, max=192.0, min=8.0




Average # leaves with no matching code tokens: 0.017


2000it [00:00, 2858.82it/s]


# node types: 153
Distrubution of fraction of leaf-root paths with ERROR node in one code
Stats
mean=0.0, median=0.0, std=0.0, max=0.1, min=0.0
Distrubution of AST depth
Stats
mean=8.4, median=8.0, std=2.4, max=20.0, min=4.0
Distrubution of # ast leaves per code
Stats
mean=31.0, median=23.0, std=22.4, max=153.0, min=6.0
Distrubution of # dfg nodes per code
Stats
mean=5.3, median=3.0, std=6.6, max=61.0, min=0.0
Distrubution of # dfg edges per code
Stats
mean=4.5, median=2.0, std=7.5, max=136.0, min=0.0


Split=test


Using custom data configuration google--code_x_glue_tc_text_to_code-d024203af31b2592
Reusing dataset parquet (/Users/cris/.cache/huggingface/datasets/parquet/google--code_x_glue_tc_text_to_code-d024203af31b2592/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


Loaded 2000 examples from test split


Token indices sequence length is longer than the specified maximum sequence length for this model (646 > 512). Running this sequence through the model will result in indexing errors


Distribution of #text_tokens
mean=203.6, median=169.0, std=142.4, max=896.0, min=23.0
# java samples with failed/empty DFG: 2000




Distribution of #java_tokens
mean=2.0, median=2.0, std=0.0, max=2.0, min=2.0
Average # leaves with no matching code tokens: 1.0


2000it [00:00, 489702.74it/s]

# node types: 1
Distrubution of AST depth
Stats
mean=1.0, median=1.0, std=0.0, max=1.0, min=1.0
Distrubution of # ast leaves per code
Stats
mean=1.0, median=1.0, std=0.0, max=1.0, min=1.0
Distrubution of # dfg nodes per code
Stats
mean=0.0, median=0.0, std=0.0, max=0.0, min=0.0
Distrubution of # dfg edges per code
Stats
mean=0.0, median=0.0, std=0.0, max=0.0, min=0.0





## APPS generation

In [8]:
def read_examples(split):
    examples = []
    dataset = load_dataset("codeparrot/apps")[split]
    for eg in dataset:
        if len(eg['starter_code'])>0:
            form = 'Use call-based format'
        else:
            form = 'Use cell-based format'
        if split=='train':
            sols = []
            for sol in eg['solutions'][2:-2].split('", "'):
                # sol = reindent_code(sol)
                sol = sol.replace('\\n', '\n')
                sols.append(sol)
            examples.append([eg['problem_id'], eg['question']+'\n'+eg['starter_code']+'\n'+form, sols])
        elif split=='test':
            examples.append([eg['problem_id'], eg['question']+'\n'+eg['starter_code']+'\n'+form, None])
    print ('No. of pairs', len(examples))     
    return pd.DataFrame(examples, columns=['id', 'nl', 'python'])

def get_leaf_code_token_indices_fast(data, lang):
    ast_leaf_token_idxs = []
    pbar = data.itertuples() if len(data)<=100000 else tqdm(data.itertuples())
    for row in pbar:
        jj = 1
        ast_leaf_token_idxs.append([])
        code_tokens_last_idx = len(getattr(row, lang+'_tokens'))-1
        code_tokens_ranges = getattr(row, lang+'_tokens_ranges')
        for s,e in getattr(row, lang+'_ast_leaf_ranges'):
            if s==e or jj==code_tokens_last_idx: # there are leaves with start_point=end_point
                ast_leaf_token_idxs[-1].append([])
                continue
            j = jj
            while not(overlap(s,e,code_tokens_ranges[j][0],code_tokens_ranges[j][1])):
                j += 1
                if j==code_tokens_last_idx: # can't find code tokens for this leaf
                    break
            if j==code_tokens_last_idx: # can't find code tokens for this leaf
                ast_leaf_token_idxs[-1].append([])
                continue
            curr_leaf_token_idxs = []
            while overlap(s,e,code_tokens_ranges[j][0],code_tokens_ranges[j][1]):
                curr_leaf_token_idxs.append(j)
                j += 1
                if j==code_tokens_last_idx:
                    break
            jj = j
            ast_leaf_token_idxs[-1].append(curr_leaf_token_idxs)
    data[lang+'_ast_leaf_code_token_idxs'] = ast_leaf_token_idxs
    print ('Average # leaves with no matching code tokens:', 
           data[lang+'_ast_leaf_code_token_idxs'].apply(lambda x:sum([1 for xi in x if xi==[]])).mean())



save_dir = 'data/apps_generation/'
os.makedirs(save_dir, exist_ok=True)
all_node_types = set()
data_by_split = {}
lang = 'python'
for split in ['train', 'test']:
    print ('\n\nSplit='+split)
    data = read_examples(split) # id, nl, python
    data['nl_tokens'] = tokenize_codes_texts(list(data['nl']))
    length_stats(data['nl_tokens'], 'Distribution of #text_tokens')
    if split!='test':
        data = pd.DataFrame([[row.id, row.nl, sol, row.nl_tokens] for row in data.itertuples() for sol in row.python], 
                            columns=data.columns)
        print ('No. of pairs', len(data))  
        add_structure(data, lang) # lang_ -> ast_leaves, dfg_edges, ast_leaf_tokens, ast_leaf_ranges
        
        # Keep a max of 5K leaves to speed up following code.
        for col in [lang+'_'+c for c in ['ast_leaves', 'ast_leaf_tokens', 'ast_leaf_ranges']]:
            data[col] = data[col].apply(lambda x:x[:5000])
        def f(edges):
            filtered_edges = []
            for left,rights in edges:
                if left<5000:
                    rights = [r for r in rights if r<5000]
                    if len(rights)>0:
                        filtered_edges.append((left, rights))
            return filtered_edges
        data['python_dfg_edges'] = data['python_dfg_edges'].apply(f)
            
        data[lang+'_tokens'] = tokenize_codes_texts(list(data[lang]))
        length_stats(data[lang+'_tokens'], 'Distribution of #'+lang+'_tokens')
        get_code_tokens_ranges(data, lang, lang+'_tokens') # list of (start,end) one for each code_token
        data.drop(columns=['nl', lang], inplace=True)
        get_leaf_code_token_indices_fast(data, lang)
        data.drop(columns=[lang+c for c in ['_ast_leaf_tokens', '_ast_leaf_ranges', '_tokens_ranges']], inplace=True)
        get_ast_lr_paths_and_ll_sim(data, lang)
        process_dfg_edges(data, lang)
        more_node_types = some_more_stats(data, lang)
        all_node_types.update(more_node_types)
    data_by_split[split] = data
    
# Map node types to indices.
all_node_types = sorted(list(all_node_types))
node_type_to_ind = {t:i for i,t in enumerate(all_node_types)}
pickle.dump(all_node_types, open(save_dir+'all_node_types.pkl', 'wb'))

# Convert node types on paths to indices.
for split in ['train']:
    data_by_split[split][lang+'_lr_paths_types'] = data_by_split[split][lang+'_lr_paths_types'].apply(
                                            lambda ll: [[node_type_to_ind[t] for t in path] for path in ll])
        
# Save data. Not converting array cols to strings, storing with pickle.
pickle.dump(data_by_split, open(save_dir+'preprocessed_data_by_split.pkl','wb'))
print ('Done.')



Split=train


Downloading: 0.00B [00:00, ?B/s]

No config specified, defaulting to: apps_code/all


Downloading and preparing dataset apps_code/all to /Users/cris/.cache/huggingface/datasets/codeparrot___apps_code/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/107M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29G [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset apps_code downloaded and prepared to /Users/cris/.cache/huggingface/datasets/codeparrot___apps_code/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

No. of pairs 5000


Token indices sequence length is longer than the specified maximum sequence length for this model (753 > 512). Running this sequence through the model will result in indexing errors


Distribution of #text_tokens
mean=412.0, median=352.0, std=292.7, max=7203.0, min=28.0
No. of pairs 117232




# python samples with failed/empty DFG: 696


Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors


Distribution of #python_tokens
mean=200.7, median=128.0, std=1749.6, max=358184.0, min=3.0


117232it [02:58, 655.68it/s] 


Average # leaves with no matching code tokens: 10.979246280878941


117232it [12:35, 155.18it/s] 


# node types: 187
Distrubution of fraction of leaf-root paths with ERROR node in one code
Stats
mean=0.1, median=0.0, std=0.2, max=1.0, min=0.0
Distrubution of AST depth
Stats
mean=13.8, median=14.0, std=4.0, max=400.0, min=3.0
Distrubution of # ast leaves per code
Stats
mean=127.5, median=95.0, std=151.5, max=5000.0, min=1.0
Distrubution of # dfg nodes per code
Stats
mean=45.3, median=33.0, std=59.5, max=3391.0, min=0.0
Distrubution of # dfg edges per code
Stats
mean=58.6, median=37.0, std=581.3, max=196872.0, min=0.0


Split=test


No config specified, defaulting to: apps_code/all
Reusing dataset apps_code (/Users/cris/.cache/huggingface/datasets/codeparrot___apps_code/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5)


  0%|          | 0/2 [00:00<?, ?it/s]

No. of pairs 5000


Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors


Distribution of #text_tokens
mean=527.1, median=504.0, std=223.1, max=2003.0, min=56.0
Done.
