In [None]:
import pandas as pd
import numpy as np

# IO

In [None]:
from PineBioML.preprocessing import IO


In [None]:
# Path
# Folder names end with "/" because later cells build file paths by plain string concatenation.
input_folder = "./input/"
output_folder = "./output/"
# Prefix for every exported file name. Note the trailing space: it is copied verbatim
# into the exported names (e.g. export_title + "RandomForest").
export_title = "example_basic "

In [None]:
##### Read data method 1: read single file
# index_col=0 is forwarded to the reader (presumably the first column becomes the row index — confirm).
data = IO.read_file(input_folder + "example_data.xlsx", index_col=0)

# Separate the table into the feature columns (everything except "y") and the target column "y".
x = data.drop(columns="y")
y = data["y"]


# Preprocessing

In [None]:
### within sample normalization
# Disabled alternative: transpose, z-score, transpose back (normalizes each sample instead of each feature).
#x = x.T
#x = (x-x.mean())/x.std()
#x = x.T

### standardization
# Z-score every feature column: subtract the column mean, then divide by the column standard deviation.
x = x.sub(x.mean()).div(x.std())

In [None]:
# Impute 0
from PineBioML.preprocessing import impute

# The imputer is expected to drop features with too many missing values and to fill
# the remaining missing entries with a constant (0, as stated in the cell title).
# NOTE(review): this note used to say "more than 50%", but the code passes threshold = 0.6.
# Confirm the intended cutoff and the exact meaning of `threshold` in PineBioML.
simple = impute.simple_imputer(threshold = 0.6, strategy = "constant")
# x and y are both replaced by the values returned from fit_transform.
x, y = simple.fit_transform(x, y)

# Analysis utils

In [None]:
import PineBioML.report.utils as report_utils

In [None]:
# PineBioML data overview for the preprocessed features x and the target y.
report_utils.data_overview(x, y)

# Feature selection

In [None]:
from PineBioML.selection.ensemble import selector

In [None]:
# Use several methods to select features: decision tree, random forest, lasso, multi lasso, svm, adaboost, xgboost.
# Select returns two objects: the selected features and the importance scores.
# NOTE(review): the third argument (10) is presumably the number of features to keep — confirm in PineBioML.
ensemble_selector = selector()
important_feature, importance = ensemble_selector.Select(x, y, 10)

In [None]:
# The features that were selected by the ML algorithms.
important_feature

In [None]:
# The feature importance given by the different methods.
# Please note that scores from different methods are not comparable with each other.
importance

In [None]:
# Put each column of scores on a common scale (column-wise z-score), average the
# z-scores across columns for every row, and sort from highest to lowest.
merge_importance = (
    importance
    .sub(importance.mean())
    .div(importance.std())
    .mean(axis=1)
    .sort_values(ascending=False)
)
merge_importance

In [None]:
# Same overview, restricted to the 5 features ranked first in merge_importance.
report_utils.data_overview(x[merge_importance.index[:5]], y)

# Modeling

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for validation, keeping only the 5 top-ranked features.
# NOTE(review): standardization and feature selection in the earlier cells were computed
# on the full data set, i.e. before this split, so validation scores may be optimistic.
x_train, x_valid, y_train, y_valid = train_test_split(
    x[merge_importance.index[:5]],
    y,
    test_size=0.33,
    shuffle=True,
    random_state=142,
)

In [None]:
import PineBioML.model.supervised.Classification as tuner

### optuna

In [None]:
# Tune a random forest on the training split with "f1" as the tuning target.
# The object returned by tune() is the model that is fitted and evaluated in the following cells.
rf = tuner.RandomForest_tuner(x_train, y_train, target="f1").tune()

In [None]:
# Tune an SVC on the training split with "roc_auc" as the tuning target.
# NOTE(review): the random forest cell tunes for "f1" instead — confirm the different targets are intentional.
svm = tuner.SVC_tuner(x_train, y_train, target="roc_auc").tune()

In [None]:
# Fit both tuned models on the training split.
rf.fit(x_train, y_train)
svm.fit(x_train, y_train)

In [None]:
# Validation report for the random forest, scored with column 1 of predict_proba
# (presumably the positive-class probability — confirm the class order).
report_utils.classification_summary(
    y_valid,
    rf.predict_proba(x_valid)[:, 1],
)

In [None]:
# Validation report for the SVC, scored with column 1 of predict_proba
# (presumably the positive-class probability — confirm the class order).
report_utils.classification_summary(
    y_valid,
    svm.predict_proba(x_valid)[:, 1],
)

### Mljar

In [None]:
from supervised.automl import AutoML

# AutoML run in "Perform" mode; results_path points at output_folder + "MLjar".
mljar = AutoML(mode = "Perform", results_path = output_folder+"MLjar")
# Trained on the same training split as the tuned models above.
mljar.fit(x_train, y_train)

In [None]:
# Validation report for the AutoML model, scored with column 1 of predict_proba
# (presumably the positive-class probability — confirm the class order).
report_utils.classification_summary(y_valid, mljar.predict_proba(x_valid)[:, 1])

# Output

In [None]:
import os

# Save the results to output_folder.
# to_csv does not create missing directories, so make sure the folder exists first
# (otherwise this cell fails on a fresh checkout without an ./output/ directory).
os.makedirs(output_folder, exist_ok=True)
important_feature.to_csv(output_folder + export_title + " important feature.csv")
importance.to_csv(output_folder + export_title + " importance.csv")

In [None]:
import os

import joblib

# save model
# joblib.dump does not create missing directories, so create the "models/" subfolder
# first; without it the dump raises FileNotFoundError when the folder is absent.
os.makedirs(output_folder + "models/", exist_ok=True)
joblib.dump(rf, output_folder + "models/" + export_title + "RandomForest")
joblib.dump(svm, output_folder + "models/" + export_title + "SVM")

In [None]:
# load model
# Reload the SVM written by the save cell and check that it still scores the validation split.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
presaved_model = joblib.load(
    output_folder + "models/" + export_title + "SVM"
)
report_utils.classification_summary(
    y_valid,
    presaved_model.predict_proba(x_valid)[:, 1],
)