# IO

In [None]:
from PineBioML import IO

In [None]:
# Paths used by this notebook.
# input_folder is prepended to the data file names when the groups are read.
input_folder = "./input/"
# NOTE(review): output_folder is not referenced in the visible cells —
# presumably the root folder for exported results; confirm.
output_folder = "./output/"
# NOTE(review): the title ends with a trailing space and is not referenced in
# the visible cells — presumably a file-name prefix for exports; confirm.
export_title = "example_basic "

In [None]:
##### Read data method 2: read multiple files
# One file per group: the label y of a sample is determined by the position
# of its file in file_path_list.
x, y = IO.read_multiple_groups(
    file_path_list=[
        # 1st place in the list -> all samples from this file will have y = 0.
        input_folder + "example_group1.csv",
        # 2nd place in the list -> all samples from this file will have y = 1.
        input_folder + "example_group2.tsv",
    ],
    index_col=0,
)

In [None]:
# Preview the first 5 rows of x (assumes x is a pandas DataFrame — TODO confirm).
x.head(5)

In [None]:
# Show how many samples carry each label (assumes y is a pandas Series — TODO confirm).
y.value_counts()

# Preprocessing

In [None]:
from PineBioML.preprocessing import Normalizer

# Fit the normalizer on x and replace x with the transformed data.
# NOTE(review): method="StandardScaler" presumably selects z-score scaling —
# confirm against the PineBioML documentation.
# NOTE(review): the normalizer is fitted on all samples, before the train/test
# split further down, so the validation samples influence the scaling.
x = Normalizer(method="StandardScaler").fit_transform(x)

In [None]:
# Missing-value imputation with 0
from PineBioML.preprocessing import impute

# Features with more than 60% missing values are dropped by the imputer;
# whatever is still missing afterwards is filled with 0.
simple = impute.simple_imputer(
    threshold=0.6,
    strategy="constant",
)
x = simple.fit_transform(x)

# Data overview

In [None]:
import PineBioML.report.utils as report_utils

In [None]:
from pandas import Series
from sklearn.preprocessing import LabelEncoder

# Encode the labels in y as integers so that y can be used as a numeric
# target by the regression workflow; the index and name of y are kept.
y = Series(
    LabelEncoder().fit_transform(y),
    index=y.index,
    name=y.name,
)

In [None]:
# Draw the data overview for x against y, show it and save the figures.
# y is passed as a non-classification target (is_classification=False), so the
# legend is not discrete either.
report_utils.data_overview(
    x,
    y,
    is_classification=False,
    discrete_legend=False,
    prefix="Original Data (labeled y)",
    save_fig=True,
    # Build the image folder from output_folder (set in the Path cell) rather
    # than repeating the "./output/" literal, so changing the output location
    # in one place is enough.
    save_path=output_folder + "images/",
    show_fig=True,
)

# Feature selection

In [None]:
from PineBioML.selection.regression import essemble_selector

In [None]:
# Select the top-k important features using several methods: decision tree,
# random forest, lasso, multi lasso, svm, adaboost and xgboost.
# With k = -1 the selector returns all the feature scores.
# NOTE(review): the selector is fitted on all samples, before the train/test
# split further down, so the validation samples take part in the selection.
ensemble_selector = essemble_selector(k=5)
x = ensemble_selector.fit_transform(x, y)

In [None]:
# Ask the fitted selector what it considers important and display the result
# (presumably per-feature importance scores — confirm against PineBioML docs).
importance = ensemble_selector.what_matters()
importance

# Modeling

### Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for validation. The split is shuffled with a
# fixed seed and stratified on y.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=142,
    shuffle=True,
    stratify=y,
)

### Tuning a model

In [None]:
import PineBioML.model.supervised.Regression as tuner

# Fit the random forest tuner on the training split with "mse" as the target;
# all three seeds are fixed.
rf = tuner.RandomForest_tuner(
    target="mse",
    kernel_seed=11,
    valid_seed=2222,
    optuna_seed=333,
).fit(x_train, y_train)
# Plot the fitted tuner (presumably the tuning history — confirm).
rf.plot()

In [None]:
# Report regression results of the tuned model on the held-out validation split:
# true targets, predictions and the validation features, labelled "rf".
report_utils.regression_summary(y_valid, rf.predict(x_valid), x_valid, prefix="rf")