In [1]:
import pandas as pd
import numpy as np

In [2]:
from gprofiler import GProfiler
# Shared g:Profiler client used for ID conversion and enrichment in later cells.
# return_dataframe=True is requested because those cells index and sort the
# results with DataFrame operations.
gp = GProfiler(return_dataframe=True)

# IO

In [3]:
import PineBioML.IO as IO

In [4]:
# Paths and naming shared by the loading, plotting and export cells.
input_folder = "./input/"
output_folder = "./output/"
# Prefix for every exported file name.
# NOTE(review): the prefix ends with a space and the export cell appends
# suffixes that start with a space, so the saved file names contain two
# consecutive spaces - confirm this is intended.
export_title = "example_ccRCC_proteomics "

The data is from [LinkedOmicsKB](https://kb.linkedomics.org/).

In [5]:
##### Read data, method 2: load several group files in a single call
# One input file per group; the file names distinguish Normal from Tumor.
normal_file = input_folder + "CCRCC_proteomics_gene_abundance_log2_reference_intensity_normalized_Normal.txt"
tumor_file = input_folder + "CCRCC_proteomics_gene_abundance_log2_reference_intensity_normalized_Tumor.txt"

# transpose=True transposes each table before the groups are merged.
x, group_label = IO.read_multiple_groups([normal_file, tumor_file], transpose=True)

# The group labels are the target used by every later cell.
y = group_label

In [None]:
# Preview the first five rows of the feature table.
x.head(5)

In [None]:
# Preview the first five group labels.
y.head(5)

# Preprocessing

In [None]:
# Convert ENSEMBL IDs to gene symbols with g:Profiler.
# Strip the version suffix ("ENSG... .12" -> "ENSG...") before querying.
ensembl_ids = [col.split(".")[0] for col in x.columns]
x.columns = ensembl_ids

# Align the returned symbols to the columns by query ID ("incoming") rather
# than by row position: relying on position breaks as soon as the service
# returns more than one row for an ID or returns rows in a different order.
converted = gp.convert(query=ensembl_ids)
symbol_by_id = converted.drop_duplicates(subset="incoming").set_index("incoming")["name"]
gene_name = symbol_by_id.reindex(ensembl_ids)

# IDs without a gene symbol (reported as the string "None", or missing from
# the response) keep their ENSEMBL ID.
unmatched = (gene_name.isna() | gene_name.eq("None")).to_numpy()
x.columns = x.columns.where(unmatched, gene_name.to_numpy())

# The result
x.head(5)

In [9]:
### Revert to the original scale
# The input file names indicate log2-transformed abundances, so raising 2 to
# the power of x undoes the log.
# NOTE: x is overwritten in place, so running this cell twice exponentiates
# twice; re-run from the loading cell if it has to be repeated.
x = np.power(2, x)

In [None]:
# Fill missing values with 0
from PineBioML.preprocessing import impute

# Features with more than 50% missing values are dropped by the imputer;
# the remaining gaps are filled with the constant 0.
simple = impute.simple_imputer(
    strategy="constant",
    threshold=0.5,
)
x = simple.fit_transform(x, y)

# Result
x.head(5)

In [11]:
# Keep a copy for the volcano plot: x is normalized in the next cell, while
# the volcano selection runs on this imputed, pre-normalization copy.
x_copy = x.copy()

In [12]:
from PineBioML.preprocessing import Normalizer
from PineBioML.report.utils import data_overview

### Within-sample normalization
within_sample_normalizer = Normalizer(method="Normalizer")
x = within_sample_normalizer.fit_transform(x)

### Standardization (alternative, left disabled)
#x = (x-x.mean())/x.std()

# Overview report of the normalized data, labelled by tissue group.
data_overview(
    x,
    y,
    title="Example ccRCC Proteomics",
    label_name="ccRCC tissue",
)

# Feature selection

In [13]:
from PineBioML.selection.Volcano import Volcano_selection

In [None]:
# Select features with a volcano plot.
# k is the number of features to select; -1 means choose all.
vol = Volcano_selection(
    k=-1,
    strategy="p",
    log_domain=False,
    absolute=False,
)

# Run on the pre-normalization copy kept earlier for this purpose.
vol_importance = vol.Select(x_copy, y)

# Draw the volcano plot.
vol.plotting()

In [None]:
from PineBioML.selection.classification import essemble_selector

# Combine several selection methods: decision tree, random forest, lasso,
# multi lasso, svm, adaboost and xgboost.
selector = essemble_selector(
    k=100,
    z_importance_threshold=1,
    RF_trees=1024 * 8,
)

# Select() returns the selected features and their importance scores.
important_feature, importance = selector.Select(x, y)

In [None]:
# The features that were selected by the ML algorithms.
important_feature

In [None]:
# The feature importance given by the different methods.
# Please note that scores from different methods are not comparable.
importance

# Analysis utils

In [None]:
### Plot the important features from each selection method on the volcano plot.
# Build the image prefix from the configured output folder instead of a
# hard-coded "./output/" so that changing output_folder moves the figures too.
image_prefix = output_folder + "images/" + export_title

# Iterating over the importance table yields one column (method) at a time.
for method in importance:
    # Only pass the features this method actually scored.
    method_scores = importance[method].dropna()
    vol.plotting(external=True,
                 external_score=method_scores,
                 title=method + " in volcano",
                 show=False,
                 saving=True,
                 save_path=image_prefix)

In [None]:
# Gene enrichment with g:Profiler, run once per selection method.
report_columns = ["method", "source", "native", "parents", "p_value", "description", "intersection_size"]

per_method_tables = []
for method in important_feature.columns:
    # Query only the features this method selected.
    selected_genes = important_feature[method].dropna().to_list()
    gp_result = gp.profile(query=selected_genes).sort_values("p_value")
    # Tag each row with the method that produced the query.
    gp_result["method"] = method
    per_method_tables.append(gp_result[report_columns])

# Stack the per-method tables into one frame with a fresh index.
gp_enrichment_result = pd.concat(per_method_tables, axis=0).reset_index(drop=True)

# The result
gp_enrichment_result

In [None]:
# Show the 20 most significant results (smallest p-values) across all methods
gp_enrichment_result.sort_values("p_value").head(20)

# Modeling

In [21]:
from sklearn.model_selection import cross_val_score
from PineBioML.model.supervised.Classification import SVM_tuner

In [None]:
# Fit the SVM tuner on the features kept by the ensemble selector, then call
# its plot method.
svm = SVM_tuner().fit(selector.transform(x), y)
svm.plot()

# Output

In [23]:
# Save the results to output_folder
important_feature.to_csv(output_folder + export_title + " important feature.csv")
importance.to_csv(output_folder + export_title + " importance.csv")
gp_enrichment_result.to_csv(output_folder + export_title + " g_prophiler.csv")
vol_importance.to_csv(output_folder + export_title + " volcano_foldchange.csv")

# svm_performance is not assigned by any cell of this notebook, so an
# unconditional export raises NameError on a fresh kernel and aborts the cell.
# Export it only when an earlier cell has actually produced it.
if "svm_performance" in globals():
    svm_performance.to_csv(output_folder + export_title + " rbfSVM_performance.csv")
else:
    print("svm_performance is not defined; skipping the rbfSVM_performance.csv export.")