In [1]:
from cynde.functional.distributed_cv import train_nested_cv_from_np_modal, cv_stub, preprocess_np_modal
import cynde.functional as cf
import os
import polars as pl
from typing import List
import time
from cynde.functional.predict.types import StratifiedConfig,Feature,FeatureSet,NumericalFeature, CategoricalFeature,EmbeddingFeature, InputConfig, ClassifierConfig, LogisticRegressionConfig, RandomForestClassifierConfig, HistGradientBoostingClassifierConfig, CVConfig
from cynde.functional.predict.preprocess import convert_utf8_to_enum, check_add_cv_index, preprocess_inputs,map_list_to_cols,load_preprocessed_features
from cynde.functional.predict.cv import stratified_combinatorial
from cynde.functional.predict.classify import create_pipeline

from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline


def load_minihermes_data(data_path: str = r"C:\Users\Tommaso\Documents\Dev\Cynde\cache\OpenHermes-2.5_embedded.parquet") -> pl.DataFrame:
    """Read the parquet file at ``data_path`` into a polars DataFrame.

    Args:
        data_path: Location of the parquet file. The default points at a
            machine-local cache file named ``OpenHermes-2.5_embedded.parquet``.

    Returns:
        The file contents as returned by ``pl.read_parquet``.
    """
    frame = pl.read_parquet(data_path)
    return frame

# Load the data, then run the two preparation helpers in order; each one
# rebinds `df`, so this cell must run top to bottom on a fresh frame.
df = load_minihermes_data()
# Per the printed log, columns with a high ratio of unique values are skipped
# rather than converted to Enum (presumably `threshold` is that ratio cutoff — confirm).
df = convert_utf8_to_enum(df, threshold=0.2)
# The printed column list contains 'cv_index' after this call
# (presumably added here when missing — confirm against the helper).
df = check_add_cv_index(df,strict=False)
print(df.columns)

# Raw (dict) feature-set definitions: one embedding column each.
feature_set_small_data = {"embeddings":[{"column_name":"conversations_text-embedding-3-small_embeddings",
                                         "name":"feature set for the smaller oai embeddings"}]}
# NOTE(review): the printed config shows this feature set resolving to
# embedder='text-embedding-3-small' and embedding_size=1536, i.e. the same values
# as the small set — confirm these match the text-embedding-3-large column.
feature_set_large_data = {"embeddings":[{"column_name":"conversations_text-embedding-3-large_embeddings",
                                         "name":"feature set for the larger oai embeddings"}]}

# NOTE(review): `save_folder` is an absolute, machine-local path.
input_config_data = {"feature_sets":[feature_set_small_data,feature_set_large_data],
                        "target_column":"target",
                        "save_folder":"C:/Users/Tommaso/Documents/Dev/Cynde/cynde_mount/"}

# Validate the raw dict into an InputConfig; the frame is supplied as validation
# context (presumably so validators can inspect its columns — confirm).
input_config = InputConfig.model_validate(input_config_data,context={"df":df})
print("Input config:")
print(input_config)





C:\Users\Tommaso\Documents\Dev\Cynde
Column 'conversations' has a high ratio of unique values (1.00). Skipping conversion to Enum.
['cv_index', 'conversations', 'source', 'target', 'conversations_text-embedding-3-small_embeddings', 'conversations_text-embedding-3-large_embeddings']
Input config:
feature_sets=[FeatureSet(numerical=[], embeddings=[EmbeddingFeature(column_name='conversations_text-embedding-3-small_embeddings', name='feature set for the smaller oai embeddings', description=None, scaler_type=<ScalerType.STANDARD_SCALER: 'StandardScaler'>, embedder='text-embedding-3-small', embedding_size=1536)], categorical=[]), FeatureSet(numerical=[], embeddings=[EmbeddingFeature(column_name='conversations_text-embedding-3-large_embeddings', name='feature set for the larger oai embeddings', description=None, scaler_type=<ScalerType.STANDARD_SCALER: 'StandardScaler'>, embedder='text-embedding-3-small', embedding_size=1536)], categorical=[])] target_column='target' save_folder='C:/Users/Tom

In [2]:
# Show the column names used by the first feature set.
input_config.feature_sets[0].column_names()

['conversations_text-embedding-3-small_embeddings']

In [3]:
# Run the preprocessing step; it logs the selected columns once per feature set.
preprocess_inputs(df, input_config)


selected columns: ['cv_index', 'target', 'conversations_text-embedding-3-small_embeddings']
selected columns: ['cv_index', 'target', 'conversations_text-embedding-3-large_embeddings']


NameError: name 'a' is not defined

In [None]:
# Run the preprocessing step for the configured feature sets before
# setting up the models.
preprocess_inputs(df, input_config)

# Two random forests that differ only in the number of trees.
classifiers_config = ClassifierConfig(
    classifiers=[
        RandomForestClassifierConfig(n_estimators=n_trees)
        for n_trees in (100, 500)
    ]
)
print("Classifiers config:")
print(classifiers_config)

# Both cross-validation levels use the same grouping on the target column.
groups = ["target"]
cv_config = CVConfig(
    inner=StratifiedConfig(groups=groups, k=5),
    inner_replicas=1,
    outer=StratifiedConfig(groups=groups, k=5),
    outer_replicas=1,
)
print("CV config:")
print(cv_config)
# The distributed part starts here.

selected columns: ['cv_index', 'target', 'conversations_text-embedding-3-small_embeddings']
selected columns: ['cv_index', 'target', 'conversations_text-embedding-3-large_embeddings']
Classifiers config:
classifiers=[RandomForestClassifierConfig(classifier_name=<ClassifierName.RANDOM_FOREST: 'RandomForestClassifier'>, n_jobs=-1, n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None, monotonic_cst=None), RandomForestClassifierConfig(classifier_name=<ClassifierName.RANDOM_FOREST: 'RandomForestClassifier'>, n_jobs=-1, n_estimators=500, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, o

In [None]:
from sklearn.ensemble import RandomForestClassifier,HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, PowerTransformer, QuantileTransformer, Normalizer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import polars as pl

from cynde.functional.predict.types import FeatureSet,InputConfig,ClassifierConfig,BaseClassifierConfig, LogisticRegressionConfig, RandomForestClassifierConfig, HistGradientBoostingClassifierConfig, CVConfig

def create_pipeline(df: pl.DataFrame, feature_set: FeatureSet, classifier_config: BaseClassifierConfig) -> Pipeline:
    """Build an unfitted sklearn Pipeline (preprocessor + classifier) for one feature set.

    The preprocessor is a ColumnTransformer with up to one scaler for the
    numerical columns, one scaler for all expanded embedding columns, and one
    transformer per categorical feature.

    Args:
        df: Frame holding the feature columns. It is only inspected for the
            dtype (and, for ``pl.Categorical``, the observed values) of the
            categorical features. TODO: the schema alone may be enough, in
            which case the whole frame need not be passed.
        feature_set: Declares the numerical, embedding and categorical features.
            Embedding features are expected to have been expanded beforehand
            into columns named ``{column_name}_{i}``.
        classifier_config: A LogisticRegressionConfig, RandomForestClassifierConfig
            or HistGradientBoostingClassifierConfig; every field except
            ``classifier_name`` is forwarded to the sklearn estimator.

    Returns:
        A Pipeline with steps ``"preprocessor"`` and ``"classifier"`` whose
        transform output is set to polars.

    Raises:
        ValueError: If a categorical column has an unsupported dtype, or the
            classifier configuration type is not supported.
    """
    transformers = []

    numerical_features = [feature.column_name for feature in feature_set.numerical]
    if numerical_features:
        # Assumes all numerical features use the same scaler as the first one.
        scaler = feature_set.numerical[0].get_scaler()
        transformers.append(("numerical", scaler, numerical_features))

    if feature_set.embeddings:
        # Embeddings are stored as list[float] in polars but consumed by sklearn
        # as one float column per dimension, named ``{column_name}_{i}``.
        # Each feature is expanded with its OWN embedding_size so that feature
        # sets mixing embeddings of different widths get the right columns.
        embedding_features = [
            f"{feature.column_name}_{i}"
            for feature in feature_set.embeddings
            for i in range(feature.embedding_size)
        ]
        # Assumes all embedding features use the same scaler as the first one.
        scaler = feature_set.embeddings[0].get_scaler()
        transformers.append(("embedding", scaler, embedding_features))

    for feature in feature_set.categorical:
        column_dtype = df[feature.column_name].dtype
        if feature.one_hot_encoding:
            if column_dtype == pl.Categorical:
                # Categorical has no fixed category list: use the observed values.
                categories = [df[feature.column_name].unique().to_list()]
            elif column_dtype == pl.Enum:
                # Enum carries its full category list on the dtype itself.
                categories = [column_dtype.categories]
            else:
                raise ValueError(f"Column '{feature.column_name}' must be of type pl.Categorical or pl.Enum for one-hot encoding.")
            one_hot_encoder = OneHotEncoder(categories=categories, handle_unknown='error', sparse_output=False)
            transformers.append((f"categorical_{feature.column_name}", one_hot_encoder, [feature.column_name]))
        else:
            # Without one-hot encoding the column is passed through untouched,
            # so it must already be a float column.
            if column_dtype not in [pl.Float32, pl.Float64]:
                raise ValueError(f"Column '{feature.column_name}' must be of type pl.Float32 or pl.Float64 for physical representation.")
            transformers.append((f"categorical_{feature.column_name}", "passthrough", [feature.column_name]))

    preprocessor = ColumnTransformer(transformers)

    # Pick the estimator class first so an unsupported config still raises
    # ValueError before any attempt to serialise it.
    if isinstance(classifier_config, LogisticRegressionConfig):
        estimator_cls = LogisticRegression
    elif isinstance(classifier_config, RandomForestClassifierConfig):
        estimator_cls = RandomForestClassifier
    elif isinstance(classifier_config, HistGradientBoostingClassifierConfig):
        estimator_cls = HistGradientBoostingClassifier
    else:
        raise ValueError(f"Unsupported classifier: {classifier_config.classifier_name}")

    # model_dump() is the pydantic v2 replacement for the deprecated .dict().
    classifier = estimator_cls(**classifier_config.model_dump(exclude={"classifier_name"}))

    pipeline = Pipeline([("preprocessor", preprocessor), ("classifier", classifier)])
    pipeline.set_output(transform="polars")
    return pipeline

In [None]:
# NOTE(review): same call as in the earlier cells — presumably redundant; confirm
# whether preprocessing needs to be re-run here.
preprocess_inputs(df, input_config)

selected columns: ['cv_index', 'target', 'conversations_text-embedding-3-small_embeddings']
selected columns: ['cv_index', 'target', 'conversations_text-embedding-3-large_embeddings']


In [None]:
# NOTE(review): `a` is never defined, so this cell raises NameError. Presumably a
# deliberate "stop here" marker for Run All — remove it before sharing the notebook.
a

NameError: name 'a' is not defined

In [None]:
# Column name of the first embedding feature in the first feature set.
emb_col_name = input_config.feature_sets[0].embeddings[0].column_name

In [None]:
# Presumably expands the list-valued embedding column into one column per
# dimension named `{emb_col_name}_{i}` — confirm against map_list_to_cols.
# NOTE(review): rebinding `df` makes this cell order-dependent; re-running it may
# not be idempotent.
df = map_list_to_cols(df, emb_col_name)


In [None]:
# Count the columns whose name contains the embedding column name.
sum(emb_col_name in col for col in df.columns)

1536

In [None]:
# Build a pipeline for the first feature set and the first classifier config.
pipe = create_pipeline(df, input_config.feature_sets[0], classifiers_config.classifiers[0])

In [None]:
# Fit on the full frame, using the "target" column as labels.
# NOTE(review): no train/test split here — presumably a smoke test of the pipeline.
pipe.fit(df, df["target"])

In [None]:
# Build one unfitted pipeline per (classifier config, feature set) pair.
models = []
for classifiers in classifiers_config.classifiers:  # each item is a single classifier config
    for feature_set in input_config.feature_sets:
        pipeline = create_pipeline(df, feature_set, classifiers)
        models.append(pipeline)
print(models)

[Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('embedding', StandardScaler(),
                                                  ['conversations_text-embedding-3-small_embeddings_0',
                                                   'conversations_text-embedding-3-small_embeddings_1',
                                                   'conversations_text-embedding-3-small_embeddings_2',
                                                   'conversations_text-embedding-3-small_embeddings_3',
                                                   'conversations_text-embedding-3-small_embedding...
                                                   'conversations_text-embedding-3-small_embeddings_24',
                                                   'conversations_text-embedding-3-small_embeddings_25',
                                                   'conversations_text-embedding-3-small_embeddings_26',
                                                   '