# IO

In [None]:
from PineBioML import IO

In [None]:
# Locations and naming shared by the rest of the notebook.
input_folder: str = "./input/"    # where the raw data files are read from
output_folder: str = "./output/"  # root folder for everything the notebook writes
# Title of this run. The trailing space is part of the value
# (presumably a separator when the title is prepended to file names -- confirm).
export_title: str = "example_basic "

In [None]:
# Read data, method 1: load a single file.
# `index_col` names the column that holds the sample index (sample names);
# pass index_col=None when the file has no such column.
data = IO.read_file(f"{input_folder}HAPTdata.csv", index_col=None)

# The "target" column is the label to predict ...
y = data["target"]

# ... and every other column is kept as a predictor.
x = data.drop(columns="target")

In [None]:
# Preview the first 5 rows of the feature table (assumes x is a pandas DataFrame).
x.head(5)

In [None]:
# Count how many samples carry each target value, to check class balance.
y.value_counts()

# Preprocessing

In [None]:
from PineBioML.preprocessing import Normalizer

# Rescale the features with PineBioML's Normalizer using the "StandardScaler" method.
# NOTE(review): the normalizer is fitted on the full dataset, before the
# train/validation split performed further down, so validation rows
# contribute to the fitted statistics. Fit on the training split only if
# an unbiased validation estimate is required.
normalizer = Normalizer(method="StandardScaler")
x = normalizer.fit_transform(x)

In [None]:
from PineBioML.preprocessing import impute

# The imputer drops every feature in which more than 60% of the values are unknown (threshold = 0.6).
# The remaining missing values are filled with the constant 0.
# NOTE(review): x was normalized in the previous step, so a fill value of 0 presumably
# corresponds to each feature's mean -- confirm against the Normalizer's handling of missing values.
simple = impute.simple_imputer(threshold = 0.6, strategy = "constant")
x = simple.fit_transform(x)

# Data overview

In [None]:
import PineBioML.report.utils as report_utils

In [None]:
# Produce the data overview of the preprocessed features x against the labels y,
# showing the figures inline and saving them as well.
# The figures are saved under `output_folder` (set in the path cell) rather than a
# hard-coded "./output/images/", so changing the output location moves them too.
report_utils.data_overview(
    x,
    y,
    is_classification=True,
    prefix="Original Data",
    save_fig=True,
    save_path=output_folder + "images/",
    show_fig=True,
)

# Feature selection

In [None]:
from PineBioML.selection.classification import essemble_selector

In [None]:
# Use several methods to select the top-k important features: decision tree, random forest, lasso, multi lasso, svm, adaboost and xgboost.
# (`essemble_selector` is the class name as spelled by the library import above.)
# NOTE(review): the selection is run on all of x and y, before the train/validation split
# further down, so the validation rows take part in choosing the features.
ensemble_selector = essemble_selector(k = 10)
important_feature, importance = ensemble_selector.Select(x, y) # if k = -1, it returns all the feature scores

In [None]:
# The features that were selected by the ML algorithms.
important_feature

In [None]:
# The feature importance given by the different methods.
# Please note that scores from different methods are not comparable with each other.
importance

In [None]:
# Apply the fitted selector to x (presumably keeps only the selected features -- confirm).
x = ensemble_selector.transform(x)

# Modeling

### Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for validation. The split is shuffled with a
# fixed seed for reproducibility and stratified on y so that both parts keep
# the class proportions.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=142,
    shuffle=True,
    stratify=y,
)

### Tuning a model

In [None]:
import PineBioML.model.supervised.Classification as tuner

In [None]:
# Tune an SVM classifier on the training split.
# target="qwk" selects the tuning objective (presumably quadratic weighted
# kappa -- confirm against the PineBioML documentation). The three seeds are
# fixed so that repeated runs use the same seed values.
svm = tuner.SVM_tuner(
    target="qwk",
    kernel_seed=11,
    valid_seed=222,
    optuna_seed=3333,
).fit(x_train, y_train)

In [None]:
# Plot the tuner's result (presumably the hyper-parameter search history -- confirm).
svm.plot()

In [None]:
# Show the tuner's reference information (presumably documentation/literature links -- confirm).
svm.reference()

In [None]:
# Evaluate the tuned SVM on the held-out validation split: predict class
# probabilities first, then summarize them against the true labels.
valid_proba = svm.predict_proba(x_valid)
report_utils.classification_summary(y_valid, valid_proba)