In [None]:
from dpks.quant_matrix import QuantMatrix
from dpks.classification import Classifier

In [None]:
# Build the QuantMatrix from the sepsis quantification table and its design
# matrix. Both paths are relative (presumably resolved against the kernel's
# working directory — confirm before running from another location).
qm = QuantMatrix(
    quantification_file="../tests/input_files/data_sepsis.tsv",
    design_matrix_file="../tests/input_files/design_sepsis.tsv")

In [None]:
# Mean-normalise the matrix, then quantify with the "top_n" method.
quantified_data = qm.normalize(method="mean").quantify(method="top_n")

In [None]:
# Candidate hyper-parameter values handed to classify() as `param_grid`.
params = dict(
    min_child_weight=[0.01, 0.1, 0.5, 1, 5, 10, 25],
    gamma=[0.1, 0.5, 1, 1.5, 2, 5, 10],
    subsample=[0.4, 0.6, 0.8, 1.0],
    colsample_bytree=[0.1, 0.2, 0.4, 0.6, 0.8, 1.0],
    max_depth=list(range(2, 11)),
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 1],
    reg_alpha=[1e-5, 1e-2, 0.1, 1, 10, 100],
    reg_lambda=[1e-5, 1e-2, 0.1, 1, 10, 100],
)

# XGBoost classification with RFE and a parameter search over `params`;
# random_state is fixed so the run is repeatable.
quantified_data.classify(
    classifier="xgboost",
    shap_algorithm="tree",
    run_rfe=True,
    rfe_min_features_to_select=5,
    rfe_step=10,
    run_param_search=True,
    param_grid=params,
    random_state=42,
    n_iter=1000,
)
quantified_data.clf.best_params

In [None]:
# Display the table returned by to_df() as the cell output.
quantified_data.to_df()

In [None]:
# Keep the table around for later cells and show the row(s) for protein P59665.
df = quantified_data.to_df()
df.loc[df["Protein"].eq("P59665")]

In [None]:
import matplotlib.pyplot as plt
# Scatter of the SHAP column against FeatureRank, drawn on the current axes.
axes = plt.gca()
axes.scatter(df["FeatureRank"], df["SHAP"])
axes.set_xlabel("FeatureRank")
axes.set_ylabel("SHAP")

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

def _generate_data_matrices(quantitative_data: QuantMatrix, scale: bool) -> tuple:
    """Build the feature matrix and encoded labels for model fitting.

    Args:
        quantitative_data: QuantMatrix whose ``quantitative_data`` attribute
            exposes ``X`` and a ``var`` table with a ``"group"`` column.
        scale: When true, the feature matrix is passed through
            ``StandardScaler().fit_transform``.

    Returns:
        ``(X, Y)`` where ``X`` is a transposed copy of the stored matrix with
        NaNs replaced by 0.0 (rows then line up with the ``var`` entries), and
        ``Y`` holds the ``"group"`` values encoded by ``LabelEncoder``.
    """
    annotated = quantitative_data.quantitative_data

    labels = LabelEncoder().fit_transform(annotated.var["group"].values)

    # Work on a copy so the stored matrix is never modified.
    features = np.nan_to_num(annotated.X.copy().transpose(), copy=True, nan=0.0)

    if not scale:
        return features, labels
    return StandardScaler().fit_transform(features), labels

# Build the model inputs; `scale` is forwarded to _generate_data_matrices,
# which applies StandardScaler when it is true.
scale = True
X, Y = _generate_data_matrices(quantified_data, scale)

In [None]:
# Wrap the estimator stored on quantified_data.clf in a new Classifier and
# cross-validate it (k_folds=5) on X / Y. The cell output is clf.scores —
# presumably populated by cross_validation(); confirm against dpks.
clf = Classifier(quantified_data.clf.classifier)
clf.cross_validation(X,Y,k_folds=5)
clf.scores

In [None]:
import shap

# Refit on the full matrices and store the per-feature importances next to
# the protein annotations (this overwrites any existing 'SHAP' column).
clf.fit(X, Y)
quantified_data.quantitative_data.obs['SHAP'] = clf.feature_importances_

# The bar plot below has no feature names, so the bars referred to here are
# "Feature <column position>". Read the matching values by position with
# .iloc: a plain integer key only works through pandas' deprecated positional
# fallback when the index is not integer-based.
shap_importances = quantified_data.quantitative_data.obs['SHAP']
for feature_position in (232, 148):
    print(
        f"This should equals the height of the Feature {feature_position}-bar:",
        shap_importances.iloc[feature_position],
    )

shap.summary_plot(clf.shap_values, X, max_display=5, plot_type='bar')

In [None]:
# SHAP summary of the 5 top features, labelled with the 'Protein' column
# instead of positional feature names.
# NOTE(review): relies on `shap` having been imported by an earlier cell.
shap.summary_plot(clf.shap_values, X, max_display=5, feature_names = quantified_data.quantitative_data.obs['Protein'])

In [None]:
import matplotlib.pyplot as plt

# These two values must mirror rfe_min_features_to_select / rfe_step that
# were passed to classify(); they are only used by the fallback below.
rfe_min_features = 5
rfe_step = 10

# NOTE(review): assumes `selector` is a scikit-learn RFECV-style object whose
# cv_results_ lists scores from the smallest feature subset upwards — confirm.
rfe_results = quantified_data.selector.cv_results_
mean_scores = rfe_results["mean_test_score"]
n_scores = len(mean_scores)

if "n_features" in rfe_results:
    # Newer scikit-learn reports the exact subset size behind every score.
    n_features_per_score = list(rfe_results["n_features"])
else:
    # Older releases: rebuild the sizes from the RFE settings. The first
    # increment above the minimum can be smaller than rfe_step, so positions
    # may be off by less than one step.
    n_features_per_score = [
        rfe_min_features + rfe_step * i for i in range(n_scores)
    ]

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Mean test accuracy")
# Plot against real feature counts. The previous version placed points at
# 5 + i and relabelled the ticks x10, which shifted every label by ~45.
plt.errorbar(
    n_features_per_score,
    mean_scores,
    yerr=rfe_results["std_test_score"],
)
# Same visible range as before (the old limit of 60 was in tick units of 10).
plt.xlim([0, 600])
plt.show()