# IO

In [None]:
import PineBioML.IO as IO

In [None]:
# NOTE(review): the original cell carried the line
#   "example_PipeLine.py ./input/ example_data.xlsx 0 y"
# which looks like a leftover command-line example and does not match the
# CSV path read below -- confirm whether it is still relevant.

# Read the input table; index_col=None is forwarded to IO.read_file.
data = IO.read_file("./input/HAPTdata.csv", index_col=None)

# Name of the column holding the label to predict.
target_column = "target"

# The prediction target.
y = data[target_column]

# Every remaining column is used as a predictor of y.
x = data.drop(columns=target_column)

# Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows as a validation split. Rows are shuffled before
# splitting, and the fixed random_state keeps the split reproducible.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=142,
    shuffle=True,
)

# Data Overview

In [None]:
from PineBioML.report.utils import data_overview
# Run the overview on the training split only (features and labels); the
# validation split is not passed in.
# show_fig=True presumably asks data_overview to render its figures --
# confirm against the PineBioML documentation.
data_overview(x_train, y_train, show_fig = True)

# Modeling

In [None]:
from PineBioML.preprocessing import Normalizer
from PineBioML.preprocessing import impute
from PineBioML.selection.classification import essemble_selector
import PineBioML.model.supervised.Classification as tuner
from sklearn.pipeline import Pipeline

In [None]:
# Build each stage separately, then chain them. A scikit-learn Pipeline runs
# its steps in the listed order, feeding each step's output to the next.
scaling_step = Normalizer(method="StandardScaler")

# NOTE(review): threshold presumably is the tolerated missing-value ratio and
# strategy="constant" the fill rule -- confirm against PineBioML's impute docs.
imputation_step = impute.simple_imputer(threshold=0.6, strategy="constant")

selection_step = essemble_selector(z_importance_threshold=1)

# NOTE(review): target="qwk" presumably selects quadratic weighted kappa as
# the tuning objective -- confirm against the tuner documentation.
modeling_step = tuner.RandomForest_tuner(target="qwk")

# NOTE(review): "Standarization" is misspelled, but the step names are the
# keys of pipe.named_steps, so the spelling is kept as-is.
pipeline_steps = [
    ("Standarization", scaling_step),
    ("Missing_Value_Processing", imputation_step),
    ("Feature_Selection", selection_step),
    ("Modeling", modeling_step),
]

pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Fit every pipeline step on the training split only; the validation split
# (x_valid, y_valid) is not used here.
pipe.fit(x_train, y_train)

# Evaluation

In [None]:
from PineBioML.report.utils import classification_summary

# Predictions of the fitted pipeline on the held-out validation features,
# taken from predict_proba rather than predict.
valid_proba = pipe.predict_proba(x_valid)

# Compare the predictions with the validation labels; show_fig=False is passed.
classification_summary(y_valid, valid_proba, show_fig=False)

# Saving Model

In [None]:
# Save the fitted pipeline object, passing "./output/models/" as the
# destination and "MostInOne" as the model name.
# NOTE(review): the keyword is spelled `overide`; presumably this matches the
# parameter name in IO.save_model and permits replacing an existing saved
# model -- confirm against the PineBioML IO documentation.
IO.save_model(pipe, "./output/models/", "MostInOne", overide=True)

# Reload model

In [None]:
# Load a previously saved model from "./output/models/MostInOne".
# NOTE(review): presumably IO.load_model resolves any file extension that
# IO.save_model appended to this name -- confirm.
presaved_model = IO.load_model("./output/models/MostInOne")

In [None]:
# Evaluate the reloaded model on the same validation split. show_fig is not
# passed here, so classification_summary uses its own default for it.
classification_summary(y_valid, presaved_model.predict_proba(x_valid))