### Split my_code_contests_{split}.jsonl into low-MoS and high-MoS datasets

In [1]:
import os
import gaoya
import random
from tqdm import tqdm
from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl


split = 'valid'
low_mos_range = [0, 0]
high_mos_range = [0.7, 1]


def _select_by_mos(problems, mos_range):
    """Keep, for every problem, only the solutions whose MoS lies in ``mos_range``.

    Args:
        problems: list of problem dicts; each has a ``'solutions'`` dict with
            parallel ``'solution'`` (code) and ``'modularity'`` (MoS) lists.
        mos_range: ``[lower, upper]`` bounds, both inclusive.

    Returns:
        A new list of problem dicts. Problems that have no solution at all are
        dropped; problems whose solutions all fall outside the range are kept
        with empty ``'solution'``/``'modularity'`` lists. The input records are
        not modified, so the same loaded dataset can be filtered repeatedly.
    """
    lower, upper = mos_range
    selected = []
    for data in problems:
        solutions = data['solutions']
        # after preprocessing, there can be no correct python solution in the problem
        if not solutions['solution']:
            continue

        kept = [
            (code, mos)
            for code, mos in zip(solutions['solution'], solutions['modularity'])
            if lower <= mos <= upper
        ]
        # Build fresh dicts instead of overwriting the loaded record; overriding
        # existing keys keeps the original key order in the written JSON.
        selected.append({
            **data,
            'solutions': {
                **solutions,
                'solution': [code for code, _ in kept],
                'modularity': [mos for _, mos in kept],
            },
        })
    return selected


# load once; filtering does not mutate the loaded records
file = f'/data/kdy20401/Workspace/Proj-Code-Generation/MC/data/ft/my_code_contests_{split}.jsonl'
dataset = read_jsonl_to_dict(file)

# make code dataset with low MoS code
low_dataset = _select_by_mos(dataset, low_mos_range)  # low_dataset = [data1, data2,,,]

# make code dataset with high MoS code
high_dataset = _select_by_mos(dataset, high_mos_range)


# find common problem between the two datasets and save
# (a problem only counts if it still has at least one code in the MoS range)
problem1 = {data['name'] for data in low_dataset if data['solutions']['solution']}
problem2 = {data['name'] for data in high_dataset if data['solutions']['solution']}

common_problem = problem1.intersection(problem2)
print(len(problem1), len(problem2), len(common_problem))

low_dataset = [data for data in low_dataset if data['name'] in common_problem]
high_dataset = [data for data in high_dataset if data['name'] in common_problem]


# save
write_dict_to_jsonl(low_dataset, os.path.join(os.getcwd(), 'data/ft', f'my_code_contests_{split}_low.jsonl'))
write_dict_to_jsonl(high_dataset, os.path.join(os.getcwd(), 'data/ft', f'my_code_contests_{split}_high.jsonl'))

  from .autonotebook import tqdm as notebook_tqdm


78 78 75


In [2]:
# report problem / code counts for the low- and high-MoS files of one split

split = 'valid'

for degree, banner in (('low', 'LOW MOD DATASET:'), ('high', 'HIGH MOD DATASET:')):
    file = f'/data/kdy20401/Workspace/Proj-Code-Generation/MC/data/ft/my_code_contests_{split}_{degree}.jsonl'
    dataset = read_jsonl_to_dict(file)
    print(banner)
    print(f'total number of problem: {len(dataset)}')

    # each record holds all codes of one problem under solutions.solution
    num_code = sum(len(data['solutions']['solution']) for data in dataset)
    print(f'total number of code: {num_code}')
    print(f'average number of codes per problem: {num_code / len(dataset)}')


LOW MOD DATASET:
total number of problem: 75
total number of code: 3919
average number of codes per problem: 52.25333333333333
HIGH MOD DATASET:
total number of problem: 75
total number of code: 1229
average number of codes per problem: 16.386666666666667


<!-- ### deduplicate two dataset (+ flatten) -->

### Deduplicate the two datasets (+ flatten, same number of codes in each)

In [20]:
import os
import gaoya
import random
from tqdm import tqdm
from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl


def deduplicate_codes(codes, max_num_codes=25):
    """Pick up to ``max_num_codes`` mutually dissimilar codes in random order.

    Codes are visited in a shuffled order. A code is kept only when the
    MinHash index returns no match for it among the codes kept so far; kept
    codes are then inserted into the index under their original position.

    Args:
        codes: list of source-code strings (all solutions of one problem).
        max_num_codes: stop as soon as this many codes have been kept.

    Returns:
        A pair ``(kept_codes, kept_indices)`` where ``kept_codes[k]`` is
        ``codes[kept_indices[k]]``.
    """
    minhash_index = gaoya.minhash.MinHashStringIndex(
        hash_size=64,  # 64 bit integer for hash value
        jaccard_threshold=0.5,  # similarity threshold for considering two documents as similar
        num_bands=60,  # same as paper
        band_size=5,  # same as paper
        num_hashes=60*5,  # number of hash values for each document
        analyzer='word',  # determine how to split the text into tokens (word or char)
        lowercase=True,  # convert all text to lowercase
        ngram_range=(3,4),  # use 3,4 grams
    )

    # visit the codes in random order so the kept subset is not biased
    # towards the beginning of the list
    visit_order = list(range(len(codes)))
    random.shuffle(visit_order)

    kept = []  # (original position, code) pairs, in visiting order
    for position in visit_order:
        candidate = codes[position]
        has_no_match = len(minhash_index.query(candidate)) == 0
        if has_no_match:
            minhash_index.insert_document(position, candidate)
            kept.append((position, candidate))

        if len(kept) == max_num_codes:
            break

    kept_codes = [code for _, code in kept]
    kept_indices = [position for position, _ in kept]
    return kept_codes, kept_indices


split = 'train'

degree = 'low'
file = f'/data/kdy20401/Workspace/Proj-Code-Generation/MC/data/ft/my_code_contests_{split}_{degree}.jsonl'
low_dataset = read_jsonl_to_dict(file)

degree = 'high'
file = f'/data/kdy20401/Workspace/Proj-Code-Generation/MC/data/ft/my_code_contests_{split}_{degree}.jsonl'
high_dataset = read_jsonl_to_dict(file)


# problem-level fields copied unchanged into every flattened record
_PROBLEM_FIELDS = (
    'name',
    'description',
    'public_tests',
    'private_tests',
    'source',
    'difficulty',
    'cf_contest_id',
    'cf_index',
    'cf_points',
    'cf_rating',
    'cf_tags',
)


def _flatten_problems(problems):
    """Expand problem records into one record per (problem, code) pair.

    Each output dict holds the problem-level fields followed by ``'code'`` and
    ``'modularity'`` for one solution of that problem.
    """
    rows = []
    for data in problems:
        solutions = data['solutions']
        for code, modularity in zip(solutions['solution'], solutions['modularity']):
            row = {field: data[field] for field in _PROBLEM_FIELDS}
            row['code'] = code
            row['modularity'] = modularity
            rows.append(row)
    return rows


# The two files are paired position by position below, so they must describe
# the same problems in the same order; fail loudly instead of silently
# pairing codes of different problems.
if len(low_dataset) != len(high_dataset):
    raise ValueError(
        f'low/high datasets differ in size: {len(low_dataset)} vs {len(high_dataset)}'
    )

low_dedup_dataset, high_dedup_dataset = [], []
for low_data, high_data in zip(low_dataset, high_dataset):
    if low_data['name'] != high_data['name']:
        raise ValueError(
            f"low/high datasets are not aligned: {low_data['name']!r} vs {high_data['name']!r}"
        )

    # code
    low_mod_code = low_data['solutions']['solution']
    high_mod_code = high_data['solutions']['solution']

    # MoS
    low_mod_score = low_data['solutions']['modularity']
    high_mod_score = high_data['solutions']['modularity']

    # deduplication per problem (maximum number of codes per problem: 25)
    _, low_indices = deduplicate_codes(low_mod_code, 25)
    _, high_indices = deduplicate_codes(high_mod_code, 25)

    # sampling same number of codes from each problem.
    # Sample *indices* rather than the code strings so that every kept code
    # stays paired with its own MoS value (sampling the code list alone leaves
    # the modularity list with a different length and order).
    num_min = min(len(low_indices), len(high_indices))
    low_indices = random.sample(low_indices, num_min)
    high_indices = random.sample(high_indices, num_min)

    # save
    low_data['solutions']['solution'] = [low_mod_code[i] for i in low_indices]
    low_data['solutions']['modularity'] = [low_mod_score[i] for i in low_indices]

    high_data['solutions']['solution'] = [high_mod_code[i] for i in high_indices]
    high_data['solutions']['modularity'] = [high_mod_score[i] for i in high_indices]

    low_dedup_dataset.append(low_data)
    high_dedup_dataset.append(high_data)


# flatten low_dataset
flattened_dataset = _flatten_problems(low_dedup_dataset)
print(f'low mod dataset: {len(flattened_dataset)}')
write_dict_to_jsonl(flattened_dataset, os.path.join(os.getcwd(), 'data/ft', f'my_code_contests_{split}_low_deduplicated.jsonl'))


# flatten high_dataset
flattened_dataset = _flatten_problems(high_dedup_dataset)
print(f'high mod dataset: {len(flattened_dataset)}')
write_dict_to_jsonl(flattened_dataset, os.path.join(os.getcwd(), 'data/ft', f'my_code_contests_{split}_high_deduplicated.jsonl'))

low mod dataset: 61012
high mod dataset: 61012


In [21]:
# Check after deduplication


from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl


split = 'train'

# the deduplicated files are flat (one record per code), so the number of
# problems is the number of distinct names
for degree, banner in (('low', 'LOW MOD DATASET:'), ('high', 'HIGH MOD DATASET:')):
    file = f'/data/kdy20401/Workspace/Proj-Code-Generation/MC/data/ft/my_code_contests_{split}_{degree}_deduplicated.jsonl'
    dataset = read_jsonl_to_dict(file)

    print(banner)
    unique_problem = {data['name'] for data in dataset}
    print(f'total number of problem: {len(unique_problem)}')
    print(f'total number of code: {len(dataset)}')
    print(f'average number of code per problem: {len(dataset) / len(unique_problem)}')


LOW MOD DATASET:
total number of problem: 5042
total number of code: 61012
average number of code per problem: 12.100753669178896
HIGH MOD DATASET:
total number of problem: 5042
total number of code: 61012
average number of code per problem: 12.100753669178896


### final preprocessing before ft

In [23]:
from datasets import load_dataset
import os


# which MoS subset to prepare for fine-tuning
degree = 'high'

train_file = f'data/ft/my_code_contests_train_{degree}_deduplicated.jsonl'
validation_file = f'data/ft/my_code_contests_valid_{degree}_deduplicated.jsonl'

# map each split name to the JSONL file it is loaded from
data_files = {
    "train": train_file,
    "validation": validation_file,
}

raw_datasets = load_dataset('json', data_files=data_files, token=None)

def preprocess_func(example):
    """Attach the fine-tuning prompt/answer string to one example.

    Reads ``example['description']`` and ``example['code']``, strips
    surrounding whitespace from both, and stores
    ``'Q: <instruction>\\n<description>\\nA: ```<code>```'`` under
    ``example['text']``. The same (mutated) ``example`` object is returned.
    """
    instruction = "".join([
        "Write a python code to solve the following coding problem ",
        "that obeys the constraints and passes the example test cases. ",
        "The output code needs to read from and write to standard IO. ",
        "Please wrap your code answer using ```:",
    ])

    problem = example['description'].strip()
    solution = example['code'].strip()

    # the code is wrapped in ``` fences with no newline after the opening
    # fence or before the closing one
    example['text'] = f'Q: {instruction}\n{problem}\nA: ```{solution}```'
    return example

# add the 'text' field to every example, then export each split as JSONL
output_dir = os.path.join(os.getcwd(), 'data/ft_final')
train_output = os.path.join(output_dir, f'my_code_contests_train_{degree}.jsonl')
valid_output = os.path.join(output_dir, f'my_code_contests_valid_{degree}.jsonl')

new_datasets = raw_datasets.map(preprocess_func)
new_datasets['train'].to_json(train_output)
new_datasets['validation'].to_json(valid_output)

Generating train split: 61012 examples [00:10, 5990.66 examples/s]
Generating validation split: 1039 examples [00:00, 12465.73 examples/s]
Map: 100%|██████████| 61012/61012 [00:18<00:00, 3308.62 examples/s]
Map: 100%|██████████| 1039/1039 [00:00<00:00, 4611.29 examples/s]
Creating json from Arrow format: 100%|██████████| 62/62 [00:10<00:00,  5.85ba/s]
Creating json from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 18.49ba/s]


14942441