In [10]:
import polars as pl
from PredCST.utils.dataset_creation import create_dataset
from typing import List, Tuple, Optional
import cynde.functional as cf
import math
import time
import numpy as np

In [2]:
import os

In [16]:
def booleanize_nodes(df: pl.DataFrame, node_columns : List[str]) -> pl.DataFrame:
    """Replace each listed column with a boolean "value > 0" column.

    Args:
        df: Frame containing every column named in ``node_columns``.
        node_columns: Names of the columns to convert.

    Returns:
        A new frame in which each listed column holds the result of
        ``pl.col(name) > 0``; columns not listed are passed through unchanged.
    """
    positive_flags = []
    for name in node_columns:
        # The comparison expression keeps the source column's name, so
        # with_columns overwrites that column rather than appending a new one.
        positive_flags.append(pl.col(name) > 0)
    return df.with_columns(positive_flags)

def remove_empty_nodes(df: pl.DataFrame, node_columns: List[str]) -> Tuple[pl.DataFrame, List[str]]:
    """Drop the listed columns whose values sum to zero.

    Args:
        df: Frame containing every column named in ``node_columns``.
        node_columns: Names of the candidate columns to inspect.

    Returns:
        A 2-tuple ``(frame, dropped)``: ``frame`` is ``df`` without the
        zero-sum columns, and ``dropped`` lists the names that were removed
        (in the order they appear in ``node_columns``).
    """
    # A column is "empty" when its sum is exactly 0 (for boolean columns this
    # means no True value at all).
    zero_cols = [col for col in node_columns if df[col].sum() == 0]
    return df.drop(zero_cols), zero_cols
# Location of the parquet dataset. NOTE(review): this is a hard-coded,
# machine-specific absolute path; consider reading it from configuration.
data_url = r"C:\Users\Tommaso\Documents\Dev\PredCST\python_3_12_1_standard_lib_all_with_counts.parquet"
# Alternative location of the same file on another machine:
# data_url = "/Users/tommasofurlanello/Documents/Dev/PredCST/python_3_12_1_standard_lib_all_with_counts.parquet"
start_time = time.time()
# Both values are None when the corresponding environment variable is unset.
cynde_dir = os.getenv('CYNDE_DIR')
mount_dir = os.getenv('MODAL_MOUNT')
df = pl.read_parquet(data_url)
# Add a row-index column to the frame.
df = df.with_row_index()
print(f"Time to read the dataset: {time.time() - start_time} seconds")
start_time = time.time()
# Every column from position 13 onward is treated as a node column.
# NOTE(review): assumes the first 13 columns (counting the row index added
# above) are not node columns — confirm against the dataset schema.
node_cols = df.columns[13:]
# Keep only rows whose type is "function" and whose small embedding is present.
df_f = df.filter(pl.col("type") == "function").filter(pl.col("code_text-embedding-3-small_embedding").is_not_null())

# Turn the node columns into "> 0" flags, then drop those that are never set.
df_fb = booleanize_nodes(df_f, node_cols)
df_f_ne, empty_cols = remove_empty_nodes(df_fb, node_cols)

print(f"Time to preprocess the dataset: {time.time() - start_time} seconds")

# Classifier name -> list of hyper-parameter settings to evaluate.
models_dict = {"RandomForest": [{"n_estimators": 50, "max_depth": 10}]}
# One entry per feature set; each uses a single embedding column.
inputs =[{"numerical":["code_text-embedding-3-small_embedding"]},
        {"numerical":["code_text-embedding-3-large_embedding"]}]

# Presumably ensures the frame carries the "cv_index" column that later cells
# join on — TODO confirm against cynde.functional.check_add_cv_index.
df_f_ne = cf.check_add_cv_index(df_f_ne)
# Name of the node column used as the prediction target.
target = "If"

Time to read the dataset: 17.883113145828247 seconds
Time to preprocess the dataset: 5.290783166885376 seconds


In [17]:
# Settings for the nested cross-validation run in the cells below.
# `mount_dir` and `inputs` are already bound by the setup cell, so they are
# not re-assigned here (the former `x = x` lines were no-ops).
df = df_f_ne
# The two entries are passed together to cf.nested_cv; presumably one strategy
# per CV level — TODO confirm the order expected by cynde.
cv_type = ("resample", "stratified")
models = models_dict
# Grouping columns and k / r values for the outer and inner levels
# (presumably fold counts and repetition counts — TODO confirm).
group_outer = [target]
k_outer = 5
group_inner = [target]
k_inner = 5
r_outer = 1
r_inner = 1
skip_class = False
target_column = target

In [18]:
# Build the nested-CV frame, attach it to the data, and create an empty
# results table with the schema defined by cynde.
start_time = time.time()
df = cf.check_add_cv_index(df)
pred_df = cf.nested_cv(
    df,
    cv_type,
    group_outer,
    k_outer,
    group_inner,
    k_inner,
    r_outer,
    r_inner,
    return_joined=False,
)
# Left join keeps every row of `df`, matched on the shared "cv_index" column.
cv_df = df.join(pred_df, on="cv_index", how="left")
results_df = pl.DataFrame(schema=cf.RESULTS_SCHEMA)
print("results schema: ", results_df.schema)
print(f"results shape: {results_df.shape}")

cv_type: resample
cv_type: stratified
Total rows: 59445
Target If rows: 13359
Other rows: 46086
results schema:  OrderedDict([('classifier', String), ('classifier_hp', String), ('fold_name', String), ('pred_name', String), ('input_features_name', String), ('accuracy_train', Float64), ('accuracy_val', Float64), ('accuracy_test', Float64), ('mcc_train', Float64), ('mcc_val', Float64), ('mcc_test', Float64), ('train_index', List(UInt32)), ('val_index', List(UInt32)), ('test_index', List(UInt32)), ('train_time', String), ('pred_time', String), ('eval_time', String), ('total_cls_time', String), ('k_outer', Int64), ('k_inner', Int64), ('r_outer', Int64), ('r_inner', Int64)])
results shape: (0, 22)


In [19]:
# Sanity check: count missing values in the large-embedding column (the
# earlier filter only required the small embedding to be present).
df.get_column("code_text-embedding-3-large_embedding").null_count()

0

In [20]:
# Turn the frame into per-input feature arrays plus a label array.
# The timer started here is stopped in the cell that saves the arrays.
preprocess_start_time = time.time()
feature_arrays, labels, _ = cf.preprocess_dataset(
    df,
    inputs,
    target_column=target_column,
)


Feature array shape for code_text-embedding-3-small_embedding: (59445, 1536)
Feature array shape for code_text-embedding-3-large_embedding: (59445, 3072)


In [21]:
# Persist every feature array and the labels as .npy files in the mount folder.
# `mount_dir` is read from the MODAL_MOUNT environment variable and is None
# when that variable is unset; fail with a clear message instead of the
# TypeError that os.path.join would raise.
if not mount_dir:
    raise RuntimeError("MODAL_MOUNT is not set; cannot determine where to save the arrays")
# np.save does not create missing parent directories.
os.makedirs(mount_dir, exist_ok=True)
print(f"Saving arrays to {mount_dir}")
for feature_name, feature_array in feature_arrays.items():
    np.save(os.path.join(mount_dir, feature_name + ".npy"), feature_array)
np.save(os.path.join(mount_dir, "labels.npy"), labels)
# Stop the timer started just before cf.preprocess_dataset was called.
preprocess_end_time = time.time()
print(f"Preprocessing completed in {preprocess_end_time - preprocess_start_time} seconds")

Saving arrays to C:\Users\Tommaso\Documents\Dev\Cynde\cynde_mount
Preprocessing completed in 32.64471960067749 seconds
