In [11]:
# Divide existing solution codes into monolithic and modular codes according to
# configurable criteria (e.g., average cc and the number of modules used).
def divide_into_monolithic_and_modular_codes(dataset, cc_limit=10, min_num_module=3):
    """Split each problem's passing solutions into monolithic and modular codes.

    Only solutions whose ``passed`` flag is truthy are considered. Such a
    solution is

    * *monolithic* when its module list is empty and its cc value is at least
      ``cc_limit``;
    * *modular* when it has at least ``min_num_module`` modules,
      ``count_module_written(code, module)`` returns at least 2 for every one
      of them, and its cc value is below ``cc_limit``.

    Solutions matching neither rule are discarded. Problems that end up with
    no monolithic code are left out of the result; a problem with monolithic
    code but no modular code is kept.

    Args:
        dataset: Iterable of problem dicts providing the keys ``name``,
            ``description``, ``public_tests``, ``private_tests`` and
            ``solutions``. ``solutions`` must hold four parallel sequences
            under ``passed``, ``cc``, ``solution`` and ``modules``.
        cc_limit: Threshold compared against each solution's cc value
            (presumably cyclomatic complexity -- TODO confirm with the code
            that produces the dataset).
        min_num_module: Minimum number of modules a solution needs in order
            to be considered modular.

    Returns:
        A list with one dict per retained problem, in input order, with the
        keys ``problem_name``, ``problem_description``, ``public_tests``,
        ``private_tests``, ``monolithic_codes`` (``monolithic_code`` and
        ``monolithic_code_cc`` lists) and ``modular_codes`` (``modular_code``
        and ``modular_code_cc`` lists).

    Raises:
        AssertionError: If the four per-solution sequences of a problem do
            not all have the same length.
    """
    # Project-local helper; imported lazily so that defining this function
    # does not require the package to be importable.
    from utils.utils import count_module_written

    new_dataset = []

    for data in dataset:
        solutions = data['solutions']
        passed = solutions['passed']
        cc = solutions['cc']
        solution = solutions['solution']
        module_list = solutions['modules']

        assert len(passed) == len(cc) == len(solution) == len(module_list), \
            "passed / cc / solution / modules must be parallel sequences"

        # Classify every passing solution in a single pass.
        monolithic_code_index = []
        modular_code_index = []
        for i, (code, modules) in enumerate(zip(solution, module_list)):
            # Skip solutions that do not pass the test cases.
            if not passed[i]:
                continue

            # 1. Monolithic: no modules at all and a cc at or above the limit.
            if len(modules) == 0 and cc[i] >= cc_limit:
                monolithic_code_index.append(i)

            # 2. Modular: enough modules, each with a count of at least 2
            #    (the original note reads this as "all modules must be used";
            #    exact counting semantics live in utils.utils -- verify there),
            #    and a cc below the limit.
            if len(modules) < min_num_module:
                continue
            module_use_count = [count_module_written(code, module) for module in modules]
            if all(count >= 2 for count in module_use_count) and cc[i] < cc_limit:
                modular_code_index.append(i)

        # 3. At least one monolithic code must exist per problem; having no
        #    modular code is acceptable. Deciding here replaces the former
        #    index-list filter, whose list membership test was quadratic.
        if not monolithic_code_index:
            continue

        # Key insertion order is kept stable because the records are
        # serialized by the caller.
        new_dataset.append({
            'problem_name': data['name'],
            'problem_description': data['description'],
            'public_tests': data['public_tests'],
            'private_tests': data['private_tests'],
            'monolithic_codes': {
                'monolithic_code': [solution[i] for i in monolithic_code_index],
                'monolithic_code_cc': [cc[i] for i in monolithic_code_index],
            },
            'modular_codes': {
                'modular_code': [solution[i] for i in modular_code_index],
                'modular_code_cc': [cc[i] for i in modular_code_index],
            },
        })

    return new_dataset

### Load my CodeContests dataset and split each problem's passing solutions into monolithic and modular codes (problems without any monolithic code are dropped)

In [1]:
from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl
import os

# Read the raw training split from ./data under the current working directory.
train_path = os.path.join(os.getcwd(), 'data', 'my_code_contests_train.jsonl')
train_dataset = read_jsonl_to_dict(train_path)

# Bucket each problem's solutions into monolithic / modular codes (default thresholds).
_train_dataset = divide_into_monolithic_and_modular_codes(train_dataset)

# Persist the divided dataset alongside the input file.
divided_train_path = os.path.join(os.getcwd(), 'data', 'my_code_contests_divided_train.jsonl')
write_dict_to_jsonl(_train_dataset, divided_train_path)


  from .autonotebook import tqdm as notebook_tqdm
