# Enable the Intel Extension for Scikit-learn (sklearnex) patch.
# NOTE(review): per the sklearnex documentation the patch only applies to
# sklearn modules imported afterwards, so keep these two lines ahead of any
# `sklearn` import in the notebook.
from sklearnex import patch_sklearn
patch_sklearn()

# IO

In [None]:
from PineBioML import IO

In [None]:
# Folder locations shared by the cells below. Both strings end with "/" so a
# file name can be appended by plain concatenation.
input_folder, output_folder = "./input/", "./output/"
# NOTE(review): presumably a prefix for exported file names; it is not
# referenced in the visible cells. The trailing space is part of the value.
export_title = "example_basic "

In [None]:
# Read the sample groups into features `x` and labels `y`.
# Labels follow the position in the path list: all samples from the 1st file
# get y = 0 and all samples from the 2nd file get y = 1.
x, y = IO.read_multiple_groups(
    file_path_list=[
        input_folder + group_file
        for group_file in ("example_group1.csv", "example_group2.tsv")
    ],
    index_col=0,
)

In [None]:
# Quick sanity check: summary statistics of the labels that were just loaded.
y.describe()

In [None]:
# Label value passed as `target_label` to Pine and classification_summary.
# NOTE(review): presumably the class treated as "positive" when scoring — confirm.
target_label = 1.0

# Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for validation. The rows are shuffled before
# splitting and the fixed seed keeps the split reproducible between runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=142,
    shuffle=True,
)

# Experiment

In [None]:
from PineBioML.model.utils import Pine, sklearn_esitimator_wrapper

from PineBioML.preprocessing import Normalizer, Pass
from PineBioML.preprocessing.impute import knn_imputer, simple_imputer
from PineBioML.preprocessing.utils import feature_extension
from PineBioML.selection.classification import essemble_selector
from PineBioML.model.supervised import Classification
from sklearn.linear_model import LogisticRegression


# Define the experiment pipeline: an ordered list of (stage name, candidates)
# pairs, where `candidates` maps a label to the operator tried at that stage.
# NOTE(review): stage names and labels are assumed to be free-text identifiers
# that Pine only reports back in its results — confirm nothing downstream
# matches on the former spellings "Standarization" / "LgisticRegression".
experiment = [
    ("MissingValueProcessing", {
        "mean": simple_imputer(threshold=1., strategy="mean"),
    }),
    ("Standardization", {
        "PowerTransformer": Normalizer(method="PowerTransformer"),
        "StandardScaler": Normalizer(method="StandardScaler"),
    }),
    ("Selection", {
        "ensemble": essemble_selector(RF_trees=256, z_importance_threshold=1),
        # NOTE(review): Pass() is presumably a no-op step, giving a
        # "no feature selection" branch for comparison — confirm.
        "None": Pass(),
    }),
    ("Modeling", {
        # penalty=None asks scikit-learn for an unregularised logistic regression.
        "LogisticRegression": sklearn_esitimator_wrapper(
            LogisticRegression(penalty=None)
        ),
        "RandomForest": Classification.RandomForest_tuner(
            target="mcc", validate_penalty=True
        ),
        "XGBoost": Classification.XGBoost_tuner(validate_penalty=True),
    }),
]

In [None]:
# Build the Pine runner from the pipeline definition and run it, passing the
# training split first and the validation split second. `result` is sorted
# and written to CSV in the cells below.
pine_automl = Pine(experiment, target_label = target_label, cv_result= True)
result = pine_automl.do_experiment(x_train, y_train, x_valid, y_valid)

In [None]:
# Show the details Pine recorded for the experiment that was just run.
pine_automl.experiment_detail()

In [None]:
# Display the results ordered by the "test_auc" column.
# NOTE(review): assuming `result` is a pandas DataFrame, sort_values sorts in
# ascending order by default, so the highest test_auc is in the last row.
result.sort_values(by="test_auc")

In [None]:
# Save the result table as CSV under the configured output folder, so that
# changing `output_folder` in the Path cell also redirects this file.
import os

# to_csv does not create missing directories; make sure the folder exists.
os.makedirs(output_folder, exist_ok=True)
result.to_csv(output_folder + "example_Pine_result.csv")

# Pick a model and save it

In [None]:
# Retrieve the pipeline registered under id 3 in the experiment.
# NOTE(review): the id presumably refers to a row of the result table — confirm.
# Notice that the returned model is not fitted yet.
model = pine_automl.recall_model(id = 3)

In [None]:
# Fit the recalled model on the training split, then predict class
# probabilities for the validation split (used by the summary below).
model.fit(x_train, y_train)
y_valid_prob = model.predict_proba(x_valid)

### Summary

In [None]:
from PineBioML.report.utils import classification_summary

# Summarise validation performance from the true labels and the predicted
# probabilities, using the same target_label that was given to Pine.
classification_summary(y_valid, y_valid_prob, target_label=target_label)

### Save the model

In [None]:
# Store the fitted model in the "models/" sub-folder of the configured output
# folder, so that changing `output_folder` also redirects the saved model.
# NOTE(review): "mode0" may be a typo for "model0"; it is left unchanged
# because renaming would change the saved file name. `overide=True`
# presumably allows replacing an existing file — confirm.
IO.save_model(model, output_folder + "models/", "mode0", overide=True)