In [1]:
import polars as pl
from PredCST.utils.dataset_creation import create_dataset
from typing import List, Tuple, Optional
import cynde.functional as cf
import math

In [2]:
import os

In [3]:
import polars as pl
import math
import time

def create_dataset_tf(df: pl.DataFrame, ds_size, target_node, node_list):
    start = time.time()
    
    df_ = df.drop(['type', 'code', 'type',
        'cst_tree',
        'file_name',
        'modules',
        'version',
        'license',
        'code_token_len',
        'cst_tree_token_len',
        'code_text-embedding-3-small_embedding',
        'code_text-embedding-3-large_embedding',
        'cst_tree_text-embedding-3-small_embedding'])
    
    print(f"Time to drop columns: {time.time()-start}")
    second_start = time.time()
    for column in df_.columns:
        if isinstance(df_[column].dtype,pl.Int64) and column != 'index':
            df_ = df_.with_columns((pl.col(column) > 0).alias(column))
    print(f"Time to change int64 to boolean: {time.time()-second_start}")
    fifty_percent = ds_size/2
    third_start = time.time()
    target_rows = df_.filter(pl.col(target_node)==True)
    if target_rows.shape[0] >fifty_percent:
        target_rows = target_rows.sample(fifty_percent)

    other = df_.filter(pl.col(target_node)==False)
    
    node_list = node_list.filter(pl.col("column") != target_node)
    k_other = node_list.select(pl.col("column")).unique()
    print(f"Time to filter target and other: {time.time()-third_start}")
    k = len(k_other)
    per_sample = math.floor(fifty_percent/k)
    fourth_start = time.time()
    without = []
    for o_node in k_other['column'].to_list():
        temp = other.filter(pl.col(o_node)==True)
        try:
            temp = temp.sample(per_sample)
            without.append(temp)
        except:
            without.append(temp)
            continue
    print(f"Time to filter other nodes in for loop: {time.time()-fourth_start}")
    train = pl.concat([target_rows] + without)
    test = df_.join(train, on="index", how="anti")
    train_indexes = train['index'].to_list()
    test_indexes = test['index'].to_list()
    print(f"Time to join train and test: {time.time()-fourth_start}")
    fifth_start = time.time()
    train = df.filter(pl.col("index").is_in(train_indexes))
    test = df.filter(pl.col("index").is_in(test_indexes))
    print(f"Time to filter train and test: {time.time()-fifth_start}")
    

    return train, test

In [4]:
data_url = r"C:\Users\Tommaso\Documents\Dev\PredCST\python_3_12_1_standard_lib_all_with_counts.parquet"

In [5]:
df = pl.read_parquet(data_url)
df = df.with_row_index()


In [6]:
node_cols = df.columns[13:]
embeddings_col = [col for col in df.columns if "embedding" in col]
index = df.columns[0]

In [7]:
def booleanize_nodes(df: pl.DataFrame, node_columns : List[str]) -> pl.DataFrame:
    expression= [pl.col(col) > 0 for col in node_columns]
    return df.with_columns(expression)

def divide_df_by_target(df: pl.DataFrame,target:str) -> Tuple[pl.DataFrame, pl.DataFrame]:
    if target not in df.columns:
        raise ValueError(f"Target {target} not in columns")
    print(f"Total rows: {df.shape[0]}")
    target_rows = df.filter(pl.col(target)==True)
    other = df.filter(pl.col(target)==False)
    print(f"Target {target} rows: {target_rows.shape[0]}")
    print(f"Other rows: {other.shape[0]}")
    return target_rows, other

def remove_empty_nodes(df: pl.DataFrame, node_columns: List[str]) -> pl.DataFrame:
    zero_cols = [col for col in node_columns if df[col].sum() == 0]
    return df.drop(zero_cols), zero_cols

def resample_dataset(df:pl.DataFrame,df_target: pl.DataFrame, df_other:pl.DataFrame, size: int = math.inf) -> pl.DataFrame:
    size_target,size_other = [df_target.shape[0], df_other.shape[0]]
    min_size = min(size_target, size_other,size)
    df_index = sample_dataset_index(df_target, df_other, min_size)
    # df_target.vstack(df_other).filter(pl.col("index").is_in(df_index["index"]))
    return df.filter(pl.col("cv_index").is_in(df_index["cv_index"]))

def sample_dataset_index(df_target: pl.DataFrame, df_other:pl.DataFrame, size: int = math.inf) -> pl.DataFrame:
    size_target,size_other = [df_target.shape[0], df_other.shape[0]]
    min_size = min(size_target, size_other,size)
    return df_target.select(pl.col("cv_index")).sample(min_size).vstack(df_other.select(pl.col("cv_index")).sample(min_size))

def prepare_boolean_dataset(df: pl.DataFrame, node_columns: List[str],type:str = "function") ->Tuple[pl.DataFrame, List[str]]:
    if type not in ["module", "class", "function"]:
        raise ValueError(f"Type {type} not in ['module', 'class', 'function']")
    df_no_emb = df.drop(embeddings_col)
    df_ = df_no_emb.filter(pl.col("type") == type)
    df_fb = booleanize_nodes(df_, node_cols)
    df_f_ne, empty_cols = remove_empty_nodes(df_fb, node_cols)
    return df_f_ne, empty_cols
   
def get_dataset_target(bool_ne_df: pl.DataFrame, target: str) -> pl.DataFrame: 
    df_target , df_other = divide_df_by_target(bool_ne_df, target)
    return resample_dataset(bool_ne_df, df_target, df_other)

def vanilla_resample_kfold(df:pl.DataFrame,target_group:List[str],k:int, size:int = math.inf):
    target = target_group[0]
    df =cf.check_add_cv_index(df)
    df_only_target = df.select(pl.col(["cv_index",target]))
    df_target, df_other = divide_df_by_target(df_only_target, target)
    df_resampled = resample_dataset(df_only_target, df_target, df_other, size)
    index_df =cf.stratified_kfold(df_resampled, target_group,k,pre_name=f"resampled_by_{target}_")
    return index_df
    


    

In [8]:
df_no_emb = df.drop(embeddings_col)

In [9]:
df_m = df_no_emb.filter(pl.col("type") == "module")
df_c = df_no_emb.filter(pl.col("type") == "class")
df_f = df_no_emb.filter(pl.col("type") == "function")


In [10]:
df_fb = booleanize_nodes(df_f, node_cols)
df_f_ne, empty_cols = remove_empty_nodes(df_fb, node_cols)
df_f_ne.shape

(64403, 63)

In [11]:
out = cf.nested_cv(df_f_ne, ("resample","stratified"),["If"],5,["If"],5,5,5)

cv_type: resample
cv_type: stratified
Total rows: 64403
Target If rows: 15654
Other rows: 48749
Total rows: 64403
Target If rows: 15654
Other rows: 48749
Total rows: 64403
Target If rows: 15654
Other rows: 48749
Total rows: 64403
Target If rows: 15654
Other rows: 48749
Total rows: 64403
Target If rows: 15654
Other rows: 48749


In [12]:
out.null_count()

cv_index,If,outer_resample_If_replica_0_fold_0,outer_resample_If_replica_0_fold_1,outer_resample_If_replica_0_fold_2,outer_resample_If_replica_0_fold_3,outer_resample_If_replica_0_fold_4,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_0_fold_0,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_0_fold_1,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_0_fold_2,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_0_fold_3,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_0_fold_4,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_1_fold_0,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_1_fold_1,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_1_fold_2,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_1_fold_3,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_1_fold_4,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_2_fold_0,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_2_fold_1,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_2_fold_2,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_2_fold_3,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_2_fold_4,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_3_fold_0,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_3_fold_1,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_3_fold_2,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_3_fold_3,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_3_fold_4,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_4_fold_0,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_4_fold_1,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_4_fold_2,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_4_fold_3,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_4_fold_4,outer_resample_If_replica_0_fold_1_inner_stratified_If_replica_0_fold_0,outer_resample_If_replica_0_fold_1_inner_stratified_If_replica_0_fold_1,outer_resample_If_replica_0_fold_1_inner_stratified_If_replica_0_fold_2,outer_resample_If_replica_0_fold_1_inner_stratified_If_replica_0_fold_3,outer_resample_If_replica_0_fold_1_inner_stratified_If_replica_0_fold_4,…,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_2_fold_3,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_2_fold_4,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_3_fold_0,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_3_fold_1,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_3_fold_2,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_3_fold_3,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_3_fold_4,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_4_fold_0,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_4_fold_1,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_4_fold_2,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_4_fold_3,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_4_fold_4,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_0_fold_0,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_0_fold_1,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_0_fold_2,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_0_fold_3,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_0_fold_4,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_1_fold_0,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_1_fold_1,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_1_fold_2,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_1_fold_3,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_1_fold_4,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_2_fold_0,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_2_fold_1,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_2_fold_2,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_2_fold_3,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_2_fold_4,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_3_fold_0,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_3_fold_1,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_3_fold_2,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_3_fold_3,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_3_fold_4,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_4_fold_0,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_4_fold_1,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_4_fold_2,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_4_fold_3,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_4_fold_4
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,…,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,…,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095


In [13]:
out.null_count()

cv_index,If,outer_resample_If_replica_0_fold_0,outer_resample_If_replica_0_fold_1,outer_resample_If_replica_0_fold_2,outer_resample_If_replica_0_fold_3,outer_resample_If_replica_0_fold_4,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_0_fold_0,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_0_fold_1,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_0_fold_2,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_0_fold_3,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_0_fold_4,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_1_fold_0,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_1_fold_1,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_1_fold_2,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_1_fold_3,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_1_fold_4,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_2_fold_0,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_2_fold_1,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_2_fold_2,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_2_fold_3,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_2_fold_4,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_3_fold_0,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_3_fold_1,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_3_fold_2,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_3_fold_3,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_3_fold_4,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_4_fold_0,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_4_fold_1,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_4_fold_2,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_4_fold_3,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_4_fold_4,outer_resample_If_replica_0_fold_1_inner_stratified_If_replica_0_fold_0,outer_resample_If_replica_0_fold_1_inner_stratified_If_replica_0_fold_1,outer_resample_If_replica_0_fold_1_inner_stratified_If_replica_0_fold_2,outer_resample_If_replica_0_fold_1_inner_stratified_If_replica_0_fold_3,outer_resample_If_replica_0_fold_1_inner_stratified_If_replica_0_fold_4,…,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_2_fold_3,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_2_fold_4,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_3_fold_0,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_3_fold_1,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_3_fold_2,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_3_fold_3,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_3_fold_4,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_4_fold_0,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_4_fold_1,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_4_fold_2,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_4_fold_3,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_4_fold_4,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_0_fold_0,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_0_fold_1,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_0_fold_2,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_0_fold_3,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_0_fold_4,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_1_fold_0,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_1_fold_1,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_1_fold_2,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_1_fold_3,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_1_fold_4,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_2_fold_0,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_2_fold_1,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_2_fold_2,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_2_fold_3,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_2_fold_4,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_3_fold_0,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_3_fold_1,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_3_fold_2,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_3_fold_3,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_3_fold_4,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_4_fold_0,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_4_fold_1,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_4_fold_2,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_4_fold_3,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_4_fold_4
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,…,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,…,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095


In [14]:
out["outer_resample_If_replica_0_fold_0"].value_counts()

outer_resample_If_replica_0_fold_0,count
str,u32
,33095
"""dev""",25048
"""test""",6260


In [15]:
out.null_count()

cv_index,If,outer_resample_If_replica_0_fold_0,outer_resample_If_replica_0_fold_1,outer_resample_If_replica_0_fold_2,outer_resample_If_replica_0_fold_3,outer_resample_If_replica_0_fold_4,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_0_fold_0,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_0_fold_1,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_0_fold_2,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_0_fold_3,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_0_fold_4,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_1_fold_0,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_1_fold_1,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_1_fold_2,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_1_fold_3,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_1_fold_4,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_2_fold_0,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_2_fold_1,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_2_fold_2,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_2_fold_3,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_2_fold_4,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_3_fold_0,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_3_fold_1,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_3_fold_2,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_3_fold_3,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_3_fold_4,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_4_fold_0,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_4_fold_1,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_4_fold_2,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_4_fold_3,outer_resample_If_replica_0_fold_0_inner_stratified_If_replica_4_fold_4,outer_resample_If_replica_0_fold_1_inner_stratified_If_replica_0_fold_0,outer_resample_If_replica_0_fold_1_inner_stratified_If_replica_0_fold_1,outer_resample_If_replica_0_fold_1_inner_stratified_If_replica_0_fold_2,outer_resample_If_replica_0_fold_1_inner_stratified_If_replica_0_fold_3,outer_resample_If_replica_0_fold_1_inner_stratified_If_replica_0_fold_4,…,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_2_fold_3,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_2_fold_4,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_3_fold_0,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_3_fold_1,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_3_fold_2,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_3_fold_3,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_3_fold_4,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_4_fold_0,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_4_fold_1,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_4_fold_2,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_4_fold_3,outer_resample_If_replica_4_fold_3_inner_stratified_If_replica_4_fold_4,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_0_fold_0,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_0_fold_1,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_0_fold_2,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_0_fold_3,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_0_fold_4,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_1_fold_0,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_1_fold_1,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_1_fold_2,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_1_fold_3,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_1_fold_4,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_2_fold_0,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_2_fold_1,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_2_fold_2,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_2_fold_3,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_2_fold_4,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_3_fold_0,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_3_fold_1,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_3_fold_2,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_3_fold_3,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_3_fold_4,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_4_fold_0,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_4_fold_1,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_4_fold_2,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_4_fold_3,outer_resample_If_replica_4_fold_4_inner_stratified_If_replica_4_fold_4
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,…,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,…,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095,33095


In [16]:
vanilla_resample_kfold(df_f_ne,["If"],5)

Total rows: 64403
Target If rows: 15654
Other rows: 48749


cv_index,resampled_by_If_fold_0,resampled_by_If_fold_1,resampled_by_If_fold_2,resampled_by_If_fold_3,resampled_by_If_fold_4
u32,str,str,str,str,str
0,"""train""","""train""","""train""","""test""","""train"""
1,"""train""","""test""","""train""","""train""","""train"""
2,"""train""","""train""","""train""","""train""","""test"""
4,"""train""","""train""","""test""","""train""","""train"""
5,"""test""","""train""","""train""","""train""","""train"""
…,…,…,…,…,…
64397,"""test""","""train""","""train""","""train""","""train"""
64399,"""test""","""train""","""train""","""train""","""train"""
64400,"""test""","""train""","""train""","""train""","""train"""
64401,"""train""","""test""","""train""","""train""","""train"""


In [17]:
df_fb = booleanize_nodes(df_f, node_cols)
df_f_ne, empty_cols = remove_empty_nodes(df_fb, node_cols)
new_node_cols = list(set(node_cols) - set(empty_cols))
df_target , df_other = divide_df_by_target(df_f_ne, "If")

Total rows: 64403
Target If rows: 15654
Other rows: 48749


In [18]:
times = time.time()
df_out_one = sample_dataset(df_f_ne,df_target, df_other)
print(f"Time to sample: {time.time()-times}")
times = time.time()
# out_index = sample_dataset_two(df_target, df_other)
# df_out_two = df_f_ne.filter(pl.col("index").is_in(out_index["index"]))
# print(f"Time to sample two: {time.time()-times}")

NameError: name 'sample_dataset' is not defined

In [None]:
df_out_one.shape, df_out_two.shape

NameError: name 'df_out_two' is not defined

In [None]:
out_index

index
u32
3385
3387
3388
3389
3391
…
9544
66766
45319
31474


In [None]:
index

'index'

In [None]:
index["index"]

TypeError: string indices must be integers

In [None]:
df_f_ne.filter(pl.col("index").is_in(index["index"]))

TypeError: string indices must be integers

In [None]:
df_target

index,type,code,cst_tree,file_name,modules,version,license,code_token_len,cst_tree_token_len,FunctionCall,Argument,Import,If,For,While,Try,With,Lambda,Global,Nonlocal,ListComprehension,DictComprehension,SetComprehension,GeneratorExpression,Yield,Await,Return,Break,Continue,Raise,Assert,Pass,BitInvert,Not,And,Or,Add,BitAnd,BitOr,BitXor,Divide,FloorDivide,LeftShift,MatrixMultiply,Modulo,Multiply,Power,RightShift,Subtract,Equal,GreaterThan,GreaterThanEqual,In,Is,LessThan,LessThanEqual,NotEqual,IsNot,NotIn,Colon,Comma,Dot
u32,str,str,str,str,str,str,str,i64,i64,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
3385,"""function""",""" def _runtest…","""FunctionDef(  …","""./repo_list/st…","""test/libregrte…","""3.12.1""","""Python Softwar…",435,11549,true,true,true,true,false,false,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true,false,false,true,true
3387,"""function""",""" def show_wind…","""FunctionDef(  …","""./repo_list/st…","""idlelib/autoco…","""3.12.1""","""Python Softwar…",767,22098,true,true,false,true,true,false,true,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true,false,false,true,true,true,false,false,false,false,false,false,false,false,false,false,false,true,true,false,false,false,false,false,false,true,false,false,true,true,true
3388,"""function""",""" # # User visi…","""FunctionDef(  …","""./repo_list/st…","""aifc.py""","""3.12.1""","""Python Softwar…",43,1138,true,true,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true
3389,"""function""",""" def testGetSe…","""FunctionDef(  …","""./repo_list/st…","""test/test_sock…","""3.12.1""","""Python Softwar…",447,10100,true,true,false,true,true,false,true,false,false,false,false,false,false,false,false,false,false,false,true,false,true,false,true,false,true,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true,false,false,false,false,true,false,false,true,true
3391,"""function""",""" def test_noni…","""FunctionDef(  …","""./repo_list/st…","""test/test_grp.…","""3.12.1""","""Python Softwar…",74,2904,true,true,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
67779,"""function""",""" def display_…","""FunctionDef(  …","""./repo_list/st…","""test/libregrte…","""3.12.1""","""Python Softwar…",614,18610,true,true,false,true,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,true,true,false,false,true,true
67780,"""function""",""" def find(self…","""FunctionDef(  …","""./repo_list/st…","""site-packages/…","""3.12.1""","""Python Softwar…",983,24087,true,true,false,true,true,true,false,false,false,false,false,false,false,false,false,false,false,true,false,false,true,false,false,false,true,true,true,false,false,true,false,false,false,false,false,true,false,false,false,false,false,false,false,true,true,false,false,true,false,true,false,true,true
67783,"""function""",""" def _add_to_c…","""FunctionDef(  …","""./repo_list/st…","""site-packages/…","""3.12.1""","""Python Softwar…",241,9085,true,true,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true
67784,"""function""",""" def test_pare…","""FunctionDef(  …","""./repo_list/st…","""idlelib/idle_t…","""3.12.1""","""Python Softwar…",342,10686,true,true,false,true,true,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,true


In [None]:
df_other.select(pl.col(new_node_cols).sum())

Not,FloorDivide,DictComprehension,Return,RightShift,GeneratorExpression,MatrixMultiply,LessThan,If,While,Argument,Equal,Global,Power,Continue,Lambda,Add,LessThanEqual,Subtract,Multiply,Dot,Pass,NotEqual,Or,SetComprehension,Import,Divide,In,Nonlocal,BitAnd,Assert,GreaterThan,ListComprehension,With,Break,BitOr,BitXor,NotIn,Try,And,Modulo,Await,GreaterThanEqual,For,Raise,LeftShift,Comma,IsNot,Colon,BitInvert,Yield,FunctionCall,Is
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
370,207,80,12585,54,502,9,283,0,358,45993,1002,110,436,49,1031,3696,178,1007,2044,42476,3380,249,418,75,971,789,443,432,183,490,294,906,7577,63,421,63,84,2975,375,1751,1014,225,4866,2037,215,38145,180,2290,39,1073,41793,457


In [None]:
df_target.select(pl.col(new_node_cols).sum())

Not,FloorDivide,DictComprehension,Return,RightShift,GeneratorExpression,MatrixMultiply,LessThan,If,While,Argument,Equal,Global,Power,Continue,Lambda,Add,LessThanEqual,Subtract,Multiply,Dot,Pass,NotEqual,Or,SetComprehension,Import,Divide,In,Nonlocal,BitAnd,Assert,GreaterThan,ListComprehension,With,Break,BitOr,BitXor,NotIn,Try,And,Modulo,Await,GreaterThanEqual,For,Raise,LeftShift,Comma,IsNot,Colon,BitInvert,Yield,FunctionCall,Is
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
4886,290,114,9179,86,621,0,1330,15654,1263,15180,4758,159,232,948,386,3488,652,1518,1155,14925,1081,1920,2155,41,740,288,2340,159,323,751,1346,779,1738,1065,298,41,1130,3625,3348,2435,251,792,4643,4170,138,14165,2751,2458,59,616,15165,3462


The algorithm should work:

1) Filter two frame with and without target
2) populate a new without target frame with a stratified representation of the other nodes

['FunctionCall',
 'Argument',
 'Import',
 'If',
 'BaseCompoundStatement',
 'For',
 'While',
 'Try',
 'With',
 'Lambda',
 'Global',
 'Nonlocal',
 'ListComprehension',
 'DictComprehension',
 'SetComprehension',
 'GeneratorExpression',
 'Yield',
 'Await',
 'Return',
 'Break',
 'Continue',
 'Raise',
 'Assert',
 'Pass',
 'BitInvert',
 'UnaryMinus',
 'Not',
 'UnaryPlus',
 'And',
 'Or',
 'Add',
 'BitAnd',
 'BitOr',
 'BitXor',
 'Divide',
 'FloorDivide',
 'LeftShift',
 'MatrixMultiply',
 'Modulo',
 'Multiply',
 'Power',
 'RightShift',
 'Subtract',
 'Equal',
 'GreaterThan',
 'GreaterThanEqual',
 'In',
 'Is',
 'LessThan',
 'LessThanEqual',
 'NotEqual',
 'IsNot',
 'NotIn',
 'AddAssign',
 'SubtractAssign',
 'MultiplyAssign',
 'DivideAssign',
 'ModuloAssign',
 'AndAssign',
 'OrAssign',
 'XorAssign',
 'LeftShiftAssign',
 'RightShiftAssign',
 'PowerAssign',
 'FloorDivideAssign',
 'MatrixMultiplyAssign',
 'Colon',
 'Comma',
 'Dot']

In [None]:
node_list = pl.DataFrame({"column":df.columns[13:]})
node_list

column
str
"""FunctionCall"""
"""Argument"""
"""Import"""
"""If"""
"""BaseCompoundSt…"
…
"""FloorDivideAss…"
"""MatrixMultiply…"
"""Colon"""
"""Comma"""


In [None]:
out = create_dataset_tf(df, 1000, "Break", node_list)

Time to drop columns: 0.0010013580322265625
Time to change int64 to boolean: 0.019043922424316406
Time to filter target and other: 0.07357668876647949
Time to filter other nodes in for loop: 0.022034168243408203
Time to join train and test: 0.027542591094970703
Time to filter train and test: 2.9780807495117188
