In [None]:
from datasets import load_dataset

# Hugging Face dataset that holds the per-function records used in this notebook.
DATASET_ID = "JanDkff/TinyFuncData-docstring"

# Only the "train" split is loaded; the bare expression shows its summary.
data = load_dataset(DATASET_ID, split="train")
data

In [None]:
# Header template shared by the reconstruct_* helpers: a "<comment> <language>"
# tag line, the docstring block, then the "<name><params>" signature line.
# NOTE(review): the name shadows the builtin `format()`; it is kept because
# `reconstruct_func` reads this same global.
format = """{comment} {language}
{docstring}
{name}{params}
"""


def _parse_param_list(raw):
    """Turn a stringified list such as "['a', 'b=(1, 2)']" into ['a', 'b=(1, 2)']."""
    import ast  # local import: keeps this cell independent of other cells' imports

    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]
    # Not a valid Python literal: fall back to the naive parse (split on commas,
    # drop the first and last character of every piece).
    return [piece.strip()[1:-1] for piece in raw[1:-1].split(",")]


def reconstruct_python_func(example):
    """Rebuild the header and body of a Python function record.

    Args:
        example: mapping with the keys 'docstring', 'name', 'params', 'body'
            and 'file_id'. 'params' is either a stringified list
            ("['a', 'b']") or an already formatted parameter string.

    Returns:
        A ``(head, body, file_id)`` tuple. ``head`` is the filled-in ``format``
        template with every docstring line turned into a ``# `` comment;
        ``body`` is the original body without its first line.
    """
    # Every docstring line becomes its own "# ..." comment line.
    docstring = "".join(f"# {line}\n" for line in example['docstring'].split("\n"))

    args = example['params']
    if not args:
        # Empty/missing parameter string: emit an empty parameter list instead
        # of failing on args[0].
        params = "():"
    elif args.startswith("["):
        # Parsing the list literal keeps parameters that contain commas
        # (e.g. "b=(1, 2)") in one piece.
        params = "(" + ", ".join(_parse_param_list(args)) + "):"
    else:
        params = args + ":"

    # The first line of the stored body is dropped
    # (presumably the original signature line -- TODO confirm against the dataset).
    body = "\n".join(example['body'].split("\n")[1:])

    head = format.format(
        comment="#",
        language="Python",
        docstring=docstring,
        name=example['name'],
        params=params,
    )
    return head, body, example["file_id"]


def reconstruct_func(example):
    """Rebuild ``(head, body, file_id)`` for a function record of any language.

    Python records are delegated to ``reconstruct_python_func``. For every
    other language the header is the ``format`` template filled with the
    record's own docstring, name and params; the body is returned unchanged.
    Shell and Ruby get ``#`` as the comment marker, all other languages ``//``.
    """
    language = example["language"]
    if language == "Python":
        return reconstruct_python_func(example)

    comment = "#" if language in ("Shell", "Ruby") else "//"

    # The docstring text is used as-is; each line is re-terminated with "\n".
    doc_block = "".join(f"{line}\n" for line in example["docstring"].split("\n"))

    head = format.format(
        comment=comment,
        language=language,
        docstring=doc_block,
        name=example['name'],
        params=example['params'],
    )
    return head, example['body'], example["file_id"]

In [None]:
import pandas as pd

# Materialise the loaded dataset split as a pandas DataFrame so it can be
# grouped and sampled with pandas below.
df = pd.DataFrame(data)

In [None]:
# One random row per language. GroupBy.sample is used instead of
# groupby(...).apply(lambda x: x.sample(1)): apply's handling of the grouping
# column is deprecated since pandas 2.2, and once that column is excluded the
# reset_index(drop=True) would discard 'language' altogether.
one_example_per_language = df.groupby('language').sample(n=1).reset_index(drop=True)

In [None]:
# Rebuild the (head, body, file_id) tuple for each sampled row.
array = list(map(reconstruct_func, one_example_per_language.to_dict(orient="records")))

# Print every tuple followed by a separator line for visual inspection.
for a in array:
    print(a, "##########################################################", sep="\n")

In [None]:
# Bodies longer than this many characters are skipped.
MAX_BODY_CHARS = 1000
# Characters removed before testing whether a body is effectively empty:
# braces, colons and newlines.
_EMPTY_BODY_CHARS = str.maketrans("", "", "{}:\n")

funcs = []

for example in data:
    body_text = example['body']
    if len(body_text) > MAX_BODY_CHARS:
        continue
    # Skip bodies that contain nothing but block punctuation and whitespace.
    if not body_text.translate(_EMPTY_BODY_CHARS).strip():
        continue

    # `file_id` (not `id`) so the builtin id() is not shadowed for the rest of
    # the notebook.
    head, body, file_id = reconstruct_func(example)
    funcs.append({"language":example['language'],"head":head,"body":body,"file_id":file_id})

In [None]:
import pandas as pd

# Collect the filtered records (language / head / body / file_id) into a frame.
# NOTE(review): this rebinds `df`, replacing the raw-dataset frame that an
# earlier cell stored under the same name.
df = pd.DataFrame(funcs)
df.head()

In [None]:
# Write the filtered functions to a parquet file (relative path, so it lands in
# the current working directory).
df.to_parquet("filtered_funcs.parquet")

In [None]:
# Show the body of one randomly drawn row as a quick sanity check.
random_row = df.sample(1).iloc[0]
print(random_row["body"])

In [None]:
def stratified_sample(df, frac, random_state=None):
    """Sample the same fraction of rows from every ``language`` group.

    Args:
        df: DataFrame with a 'language' column.
        frac: fraction of each group to keep (passed to ``DataFrame.sample``).
        random_state: seed or random state handed to every per-group
            ``sample`` call.

    Returns:
        A new DataFrame with the sampled rows of all groups (groups in sorted
        key order), all original columns, and a fresh 0..n-1 index.
    """
    # Iterate the groups explicitly instead of using groupby(...).apply(...):
    # apply operating on the grouping column is deprecated since pandas 2.2,
    # and once that column is excluded, reset_index(drop=True) would drop
    # 'language' from the result. Iteration yields full sub-frames and keeps
    # the per-group seeding unchanged.
    parts = [
        group.sample(frac=frac, random_state=random_state)
        for _, group in df.groupby('language')
    ]
    if not parts:
        # pd.concat rejects an empty list; return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)
    sampled_df = pd.concat(parts).reset_index(drop=True)
    return sampled_df

seed = 42

# Draw 5% of every language with a fixed seed.
stratified_df = stratified_sample(df, frac=0.05, random_state=seed)

# Later cells iterate over and index into `stratified_dfs`; it used to stay
# empty (the loop that filled it was commented out), which made
# `stratified_dfs[0]` raise IndexError on a fresh run.
stratified_dfs = [stratified_df]

In [None]:
# Number of distinct languages in the full frame ...
print(df['language'].nunique())

# ... and in every stratified sample collected in `stratified_dfs`.
for x in stratified_dfs:
    print(x['language'].nunique())

In [None]:
# Save the stratified sample; the seed is embedded in the file name.
stratified_df.to_parquet(f"5percent_stratified_{seed}.parquet")

In [None]:
# Spot-check one body from the stratified sample. `stratified_df` is read
# directly because the `stratified_dfs` list is never filled in this notebook,
# and the row position is clamped so a sample with fewer than 60001 rows does
# not raise.
row = min(60000, len(stratified_df) - 1)
if row >= 0:
    print(stratified_df['body'].iloc[row])

In [None]:
# Rows to draw from each language: total row count divided by 200, rounded.
# NOTE(review): 1/200 per language presumably targets ~5% overall for about
# ten languages -- confirm against the actual number of languages.
samples_per_group = round(len(df)/200)

def sample_fixed_per_group(df, n_samples, random_state=None):
    """Sample up to ``n_samples`` rows from every ``language`` group.

    Args:
        df: DataFrame with a 'language' column.
        n_samples: rows to draw per group. Groups with fewer rows contribute
            all of their rows instead of raising.
        random_state: seed or random state handed to every per-group
            ``sample`` call.

    Returns:
        A new DataFrame with the sampled rows of all groups (groups in sorted
        key order), all original columns, and a fresh 0..n-1 index.
    """
    # Explicit iteration instead of groupby(...).apply(...): apply operating on
    # the grouping column is deprecated since pandas 2.2, and once that column
    # is excluded, reset_index(drop=True) would drop 'language' from the result.
    parts = [
        group.sample(
            # Cap at the group size: sample(n=...) without replacement raises
            # ValueError when n exceeds the number of rows.
            n=n_samples if n_samples is None else min(n_samples, len(group)),
            random_state=random_state,
        )
        for _, group in df.groupby("language")
    ]
    if not parts:
        # pd.concat rejects an empty list; return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(parts).reset_index(drop=True)

# NOTE(review): `fixed_dfs` is created but never filled; the commented-out loop
# below suggests several seeds (seed + i) were once planned -- confirm whether
# it is still needed.
fixed_dfs = []
#for i in range(5):
#    index = seed+i
# Draw the same number of rows (samples_per_group) from every language, using
# the shared seed.
fixed_df = sample_fixed_per_group(df, n_samples=samples_per_group,random_state=seed)

In [None]:

# Save the fixed-size-per-language sample; the seed is embedded in the file name.
fixed_df.to_parquet(f"5percent_fixed_{seed}.parquet")