## Calculate code properties among 10% of original data and save

In [1]:
import os
import random
from datasets import Dataset
from utils.utils import get_code_style_score, get_code_modularity_score, read_jsonl_to_dict, write_dict_to_jsonl


def compute_code_score(example):
    code = example['code']
    try:
        score_modularity = get_code_modularity_score(code)
    except Exception:
        score_modularity = -1.0

    example['score_modularity'] = score_modularity
    return example


def check_code_score(example):
    return example['score_modularity'] >= 0


dataset = read_jsonl_to_dict(os.path.join(os.getcwd(), 'data', 'my_code_contests_train.jsonl'))
demonstration = []

# aggregate demonstration code
# keys for dataset: dict_keys(['name', 'description', 'public_tests', 'private_tests', 'generated_tests', 'source', 'difficulty', 'solutions', 'incorrect_solutions', 'cf_contest_id', 'cf_index', 'cf_points', 'cf_rating', 'cf_tags', 'is_description_translated', 'untranslated_description', 'time_limit', 'memory_limit_bytes', 'input_file', 'output_file'])
# keys for solutions: dict_keys(['cc', 'modules', 'passed', 'solution'])
for data in dataset:
    for i in range(len(data['solutions']['solution'])):
        if data['solutions']['passed'][i]:
            demonstration.append(
                {
                    'description': data['description'],
                    'code': data['solutions']['solution'][i],
                    # more information?
                }
            )

# calculate MoS
random.seed(42)
demonstration = random.sample(demonstration, len(demonstration) // 10) # 10% of total data
demonstration = Dataset.from_list(demonstration)
demonstration = demonstration.map(compute_code_score, num_proc=16)
demonstration = demonstration.filter(check_code_score, num_proc=16)

# save
write_dict_to_jsonl(list(demonstration), os.path.join(os.getcwd(), 'data', 'demonstration_with_new_modularity.jsonl'))

  from .autonotebook import tqdm as notebook_tqdm
Map (num_proc=16): 100%|██████████| 126447/126447 [01:02<00:00, 2013.54 examples/s]
Filter (num_proc=16): 100%|██████████| 126447/126447 [00:06<00:00, 19519.37 examples/s]


## Get 500 demonstrations

In [19]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl, get_code_style_score, get_code_modularity_score, get_average_length_of_variables


random.seed(27) # for reproducibility
num_sample = 10 # number of samples to be sampled from each bin

# load demonstration pool
file_name = 'demonstration_with_new_modularity'
path = f'/data/kdy20401/Workspace/Proj-Code-Generation/MC/data/{file_name}.jsonl'
demonstration = read_jsonl_to_dict(path)
print(f'number of codes in demonstration pool: {len(demonstration)}')

modularity = [] # score_modularity
for data in demonstration:
    modularity.append(data['score_modularity'])

modularity_df = pd.DataFrame({'modularity': np.array(modularity)})

# bins: 0~0.1, 0.1~0.2, ..., 0.9~1.0
num_bin = 10
bins = np.linspace(0, 1, num_bin + 1)

# find the grid cell to which each data point belongs
# include_lowest=True makes 0 style or modularity value included in the first bin
modularity_df['modularity_bin'] = pd.cut(modularity_df['modularity'], bins=bins, labels=False, include_lowest=True)

# sample data points from each bin
# if the number of data points in the bin is less than num_sample, duplication can occur
modularity_sampled_points = modularity_df.groupby(['modularity_bin']).apply(lambda x: x.sample(num_sample, replace=True if len(x) < num_sample else False))

# style_sampled_points.index => (style_bin, code_index)
# (deduplicated) index of sampled data points 
modularity_index = list(set([e[1] for e in modularity_sampled_points.index]))

#  the number of samples is less than expected
# assert len(style_index) == num_bin * num_sample and len(modularity_index) == num_bin * num_sample
assert len(modularity_index) == num_bin * num_sample
        
selected_demonstration_by_modularity = [demonstration[i] for i in modularity_index]

# save each demonstration which has high coverage of style or modularity
write_dict_to_jsonl(selected_demonstration_by_modularity, os.path.join(os.getcwd(), 'data', 'modularity_demonstration_with_new_modularity.jsonl'))

# # for visualization
# plt.scatter(modularity_sampled_points['modularity'], np.array([0.5] * len(modularity_sampled_points)), color='red', label='Sampled Data')
# plt.xlabel('MoS')
# plt.ylabel('temp')
# plt.legend()
# plt.show()    

number of codes in demonstration pool: 125659


  modularity_sampled_points = modularity_df.groupby(['modularity_bin']).apply(lambda x: x.sample(num_sample, replace=True if len(x) < num_sample else False))


## calculate corr between mos and function calls

In [1]:
from utils.utils import count_num_module_calls

base_directory = os.getcwd()

demonstration_dataset = read_jsonl_to_dict(
    
    os.path.join(
        base_directory,
        "data",
        'modularity_demonstration_with_new_modularity.jsonl',
    )  
)

import matplotlib.pyplot as plt
from scipy import stats

mos, function_call = [], []
for data in demonstration_dataset:
    mos.append(data['score_modularity'])
    function_call.append(count_num_module_calls(data['code']))
    
pearsonr_stat = stats.pearsonr(mos, function_call)
pearsonr, pearsonr_p = pearsonr_stat.correlation, pearsonr_stat.pvalue
spearmanr_stat = stats.spearmanr(mos, function_call)
spearmanr, spearmanr_p = spearmanr_stat.correlation, spearmanr_stat.pvalue

plt.scatter(mos, function_call, color='red', label='Sampled Data')
plt.xlabel('MoS')
plt.ylabel('number of function calls')
plt.legend()
plt.show()

print(f'pearsonr: {round(pearsonr, 2)}, pearsonr_p: {round(pearsonr_p, 2)}')
print(f'spearmanr: {round(spearmanr, 2)}, spearmanr_p: {round(spearmanr_p, 2)}')