In [1]:
import pandas as pd
import os

In [2]:
import json


# Mono Project
For each project, create a folder with the following structure, as expected by the NeuralCodeSum repository:
folder
    - train
    - dev
    - test

In [3]:
# Projects to convert; each name is used as the prefix of that project's
# input JSON files and of its output folder.
project_names = [
    'hibernate-orm',
    'intellij-community',
    'liferay-portal',
    'gradle',
    'hadoop-common',
    'presto',
    'wildfly',
    'spring-framework',
    'cassandra',
    'elasticsearch',
]

In [5]:
from pathlib import Path

def filter_tags(list_tokens):
    """Return a new list with the four markup tag tokens dropped.

    Every token equal to '<SENTENCE_START>', '<SENTENCE_END/>', '<id>' or
    '</id>' is removed; all other tokens are kept in their original order.
    """
    markup_tags = ('<SENTENCE_START>', '<SENTENCE_END/>', '<id>', '</id>')
    kept = []
    for token in list_tokens:
        if token in markup_tags:
            continue
        kept.append(token)
    return kept

def write_line(file, token_list, separator, remove_tags):
    """Join the tokens with `separator` and write them to `file` as one line.

    file: writable text file object.
    token_list: tokens to write.
    separator: text placed between consecutive tokens.
    remove_tags: when true, the tokens go through `filter_tags` first.
    """
    tokens = filter_tags(token_list) if remove_tags else token_list
    glue = f'{separator}'
    file.write(glue.join(tokens) + '\n')

def save_df_line_by_line(df, 
                         path_model_input, path_model_output,
                         folder_path):
    """Write a dataframe to two parallel, line-aligned text files.

    For every row of `df`, one line is written to each file, so line N of
    the input file and line N of the output file come from the same row.

    df: dataframe with a 'tokens' column and a 'name' column, each holding
        a list of string tokens per row.
    path_model_input: file name (relative to `folder_path`) that receives
        the 'tokens' of each row, tag-filtered and joined with
        '&*separator*&'.
    path_model_output: file name (relative to `folder_path`) that receives
        the 'name' tokens of each row, joined with a single space.
    folder_path: existing directory in which both files are created.

    Both files are opened in write mode, so existing content is replaced.
    """
    path_model_input = os.path.join(folder_path, path_model_input)
    path_model_output = os.path.join(folder_path, path_model_output)
    # Explicit UTF-8: without it the encoding depends on the platform locale
    # and non-ASCII tokens can fail to encode or be written inconsistently.
    with open(path_model_input, 'w', encoding='utf-8') as input_file, \
            open(path_model_output, 'w', encoding='utf-8') as output_file:
        for _, content in df.iterrows():
            write_line(file=input_file,
                       token_list=content['tokens'],
                       separator='&*separator*&',
                       remove_tags=True)
            write_line(file=output_file,
                       token_list=content['name'],
                       separator=' ',
                       remove_tags=False)
    
# Share of each project's train records that is written to dev instead of train.
PERC_VALIDATION = 0.1

for prefix_project in project_names:
    print(prefix_project)
    # One output root per project; the split folders are created inside it.
    folder_name = f'fake{prefix_project}_transformer'
    Path(folder_name).mkdir(parents=True, exist_ok=True)

    # The two output file names are the same in every split folder.
    out_path_input = 'code_body.original_subtoken'
    out_path_to_predict = 'method_name.original'

    for kind in ['train', 'test']:
        sub_folder_name = os.path.join(folder_name, kind)
        Path(sub_folder_name).mkdir(parents=True, exist_ok=True)

        to_convert_path = f'{prefix_project}_{kind}_methodnaming.json'
        df = pd.read_json(to_convert_path, orient='records')

        if kind == 'test':
            # Test records are written out as they are.
            save_df_line_by_line(
                df, out_path_input, out_path_to_predict, sub_folder_name)
            continue

        # Train records: the leading (1 - PERC_VALIDATION) share of the rows
        # stays in train, the remaining trailing rows become dev.
        split_point = int(len(df) * (1 - PERC_VALIDATION))
        df_train, df_val = df.iloc[:split_point], df.iloc[split_point:]
        save_df_line_by_line(
            df_train, out_path_input, out_path_to_predict, sub_folder_name)

        # The dev folder sits next to train and receives the held-out rows.
        sub_folder_name = os.path.join(folder_name, 'dev')
        Path(sub_folder_name).mkdir(parents=True, exist_ok=True)
        save_df_line_by_line(
            df_val, out_path_input, out_path_to_predict, sub_folder_name)

hibernate-orm
intellij-community
liferay-portal
gradle
hadoop-common
presto
wildfly
spring-framework
cassandra
elasticsearch


## STOP ---------------------------------

# Miscellanea
Ignore everything from this point onwards.

In [2]:
# Source JSON files for this ad-hoc conversion.
all_train_path = 'all_train_methodnaming.json'
# NOTE(review): named "dev_train" but points at a *_test_* file, and it is not
# read by any other visible cell — confirm whether it is still needed.
dev_train_path = 'libgdx_test_methodnaming.json'

# JSON file to convert, followed by the two output file names.
to_convert_path = all_train_path
out_path_input = 'allamanis_train_for_transformers_code.original_subtoken'
# NOTE(review): the name says "javadoc", but the writer cell at the end of the
# notebook stores the `name` tokens in this file — confirm the naming.
out_path_to_predict = 'allamanis_train_for_transformers_javadoc.original'

In [None]:
# Alternative setup: convert the intellij-community test file.
to_convert_path = 'intellij-community_test_methodnaming.json'
out_path_input = 'intellij-community_test_for_transformers_code.original_subtoken'
# NOTE(review): this name uses the "allamanis_test" prefix while the file
# above uses "intellij-community_test" — confirm which prefix is intended.
out_path_to_predict = 'allamanis_test_for_transformers_javadoc.original'

In [33]:
# Load the JSON records at `to_convert_path` into a dataframe.
df = pd.read_json(to_convert_path, orient='records')

In [34]:
# Preview the first rows of the loaded dataframe.
df.head()

Unnamed: 0,filename,name,tokens
0,/ReportConstantReferences_after.java:test2,"[test, 2]","[<SENTENCE_START>, {, }, <SENTENCE_END/>]"
1,/ReportConstantReferences_after.java:println,[println],"[<SENTENCE_START>, {, }, <SENTENCE_END/>]"
2,/ReportConstantReferences_after.java:test,[test],"[<SENTENCE_START>, {, if, (, <id>, foo, </id>,..."
3,/ReportConstantReferences_after.java:testDontR...,"[test, dont, replace, qualifier, with, null]","[<SENTENCE_START>, {, if, (, <id>, bar, </id>,..."
4,/Thinlet.java:setColors,"[set, colors]","[<SENTENCE_START>, {, <id>, c, bg, </id>, =, n..."


# Unique Train - Cross Project

In [25]:
# Train conversion: source JSON and the two output file names.
# NOTE(review): the next two cells assign the same three names, so only the
# most recently executed of these cells takes effect.
to_convert_path = 'all_train_methodnaming.json'
out_path_input = 'allamanis_train_for_transformers_code.original_subtoken'
out_path_to_predict = 'allamanis_train_for_transformers_javadoc.original'

In [28]:
# Dev conversion: source JSON and the two output file names.
# NOTE(review): the source is the libgdx *train* file while the outputs are
# labelled "train ... dev" — presumably libgdx serves as the dev set; confirm.
to_convert_path = 'libgdx_train_methodnaming.json'
out_path_input = 'allamanis_train_for_transformers_dev_code.original_subtoken'
out_path_to_predict = 'allamanis_train_for_transformers_dev_javadoc.original'

In [32]:
# Test conversion: source JSON and the two output file names.
# These assignments overwrite the values set by the two preceding cells.
to_convert_path = 'intellij-community_test_methodnaming.json'
out_path_input = 'allamanis_test_for_transformers_code.original_subtoken'
out_path_to_predict = 'allamanis_test_for_transformers_javadoc.original'

In [None]:
# Load the JSON records at `to_convert_path`, i.e. whichever path the most
# recently executed configuration cell assigned.
df = pd.read_json(to_convert_path, orient='records')

In [None]:

def filter_tags(list_tokens):
    """Drop the markup tag tokens and return the remaining tokens as a list.

    Tokens equal to '<SENTENCE_START>', '<SENTENCE_END/>', '<id>' or '</id>'
    are discarded; the order of the other tokens is preserved.
    """
    tags_to_drop = ['<SENTENCE_START>', '<SENTENCE_END/>', '<id>', '</id>']
    return list(filter(lambda token: token not in tags_to_drop, list_tokens))

def write_line(file, token_list, separator, remove_tags):
    """Write the tokens to `file` as a single line, joined by `separator`.

    When `remove_tags` is true the tokens are first passed through
    `filter_tags`; otherwise they are written unchanged. A newline is
    appended after the joined tokens.
    """
    tokens = filter_tags(token_list) if remove_tags else token_list
    line = f'{separator}'.join(tokens)
    file.write(line + '\n')

    
    
# Write one line per dataframe row to two parallel files: the tag-filtered
# 'tokens' go to `out_path_input` and the 'name' tokens to
# `out_path_to_predict`, so line N of both files comes from the same row.
# Explicit UTF-8: without it the encoding depends on the platform locale and
# non-ASCII tokens can fail to encode or be written inconsistently.
with open(out_path_input, 'w', encoding='utf-8') as input_file, \
        open(out_path_to_predict, 'w', encoding='utf-8') as output_file:
    for _, content in df.iterrows():
        write_line(file=input_file,
                   token_list=content['tokens'],
                   separator='&*separator*&',
                   remove_tags=True)
        write_line(file=output_file,
                   token_list=content['name'],
                   separator=' ',
                   remove_tags=False)
    