In [22]:
from sklearn.feature_selection import SelectFromModel
import pandas as pd
import numpy as np
import joblib
import dtale

In [23]:
# Marker names: every column of each dataset except its wax column

In [24]:
# Dataset A: the first CSV column becomes the row index.
data_wax_A = pd.read_csv("../data/interim/wax_data_A.csv", index_col=0)
# Marker names are all column labels that remain once "wax" is removed.
markers_A = data_wax_A.drop(columns="wax").columns

# Dataset B: same layout, but the removed column is "wax_F2".
data_wax_B = pd.read_csv("../data/interim/wax_data_B.csv", index_col=0)
markers_B = data_wax_B.drop(columns="wax_F2").columns

In [25]:
# load pre-trained models

In [26]:
# Models stored for dataset A, loaded in the order: logistic regression,
# random forest, XGBoost.
# NOTE(review): joblib.load unpickles arbitrary objects, so these files must
# come from a trusted source.
lr, rf, xgb = map(
    joblib.load,
    (
        "../models/logistic_regression_wax_A",
        "../models/random_forest_wax_A",
        "../models/XGBoost_wax_A",
    ),
)

In [27]:
# Models stored for dataset B, loaded in the order: logistic regression,
# random forest, XGBoost.
# NOTE(review): joblib.load unpickles arbitrary objects, so these files must
# come from a trusted source.
lr2, rf2, xgb2 = map(
    joblib.load,
    (
        "../models/logistic_regression_wax_B",
        "../models/random_forest_wax_B",
        "../models/XGBoost_wax_B",
    ),
)

In [28]:
# Feature selection: wrap each pre-fitted model in SelectFromModel, using the mean importance as the threshold

In [29]:
# Dataset wax A: one selector per already-fitted model (prefit=True, so the
# selector is built around the loaded model as-is), thresholded at the mean
# importance.
lr_selector_A, rf_selector_A, xgbc_selector_A = (
    SelectFromModel(fitted_model, prefit=True, threshold="mean")
    for fitted_model in (lr, rf, xgb)
)

In [30]:
# Dataset wax B (the one carrying the "wax_F2" column): one selector per
# already-fitted model (prefit=True, so the selector is built around the
# loaded model as-is), thresholded at the mean importance.
lr_selector_B, rf_selector_B, xgbc_selector_B = (
    SelectFromModel(fitted_model, prefit=True, threshold="mean")
    for fitted_model in (lr2, rf2, xgb2)
)

In [31]:
def _selection_columns(markers, lr_model, rf_model, xgb_model,
                       lr_selector, rf_selector, xgb_selector):
    """Gather, per marker, each model's selection flag and importance value.

    Parameters
    ----------
    markers : sequence of marker names, one per model feature.
    lr_model, rf_model, xgb_model : fitted models; ``coef_`` is read from the
        logistic regression and ``feature_importances_`` from the other two.
    lr_selector, rf_selector, xgb_selector : SelectFromModel objects wrapping
        the corresponding model; ``get_support()`` supplies the boolean flag.

    Returns
    -------
    dict mapping column name -> per-marker values, ready for
    ``pd.DataFrame.from_dict``.
    """
    return {
        "Markers": markers,
        "Logistic regression": lr_selector.get_support(),
        # Absolute coefficients averaged over the rows of coef_.
        "Logistic regression coefficients": np.mean(np.abs(lr_model.coef_), axis=0),
        "Random forest": rf_selector.get_support(),
        "Random forest coefficients": rf_model.feature_importances_,
        "XGBoost": xgb_selector.get_support(),
        "XGBoost coefficients": xgb_model.feature_importances_,
    }


def _score_table(columns):
    """Turn the column dict into a marker-indexed frame with Points and Impact.

    ``Points`` is the number of models whose selector kept the marker and
    ``Impact`` is the plain sum of the three importance columns.
    NOTE(review): the importance values are summed as-is; confirm that the
    three models report them on comparable scales.
    """
    table = pd.DataFrame.from_dict(columns).set_index("Markers", drop=True)
    flag_columns = ["Logistic regression", "Random forest", "XGBoost"]
    weight_columns = [
        "Logistic regression coefficients",
        "Random forest coefficients",
        "XGBoost coefficients",
    ]
    table["Points"] = np.sum([table[name] for name in flag_columns], axis=0)
    table["Impact"] = np.sum([table[name] for name in weight_columns], axis=0)
    return table


# Dataset wax A
selected_features_dict_A = _selection_columns(
    markers_A, lr, rf, xgb, lr_selector_A, rf_selector_A, xgbc_selector_A
)
selected_features_A = _score_table(selected_features_dict_A)

# Dataset wax B
selected_features_dict_B = _selection_columns(
    markers_B, lr2, rf2, xgb2, lr_selector_B, rf_selector_B, xgbc_selector_B
)
selected_features_B = _score_table(selected_features_dict_B)

In [35]:
# Show the per-marker selection table built for dataset A.
selected_features_A

Unnamed: 0_level_0,Logistic regression,Logistic regression coefficients,Random forest,Random forest coefficients,XGBoost,XGBoost coefficients,Points,Impact
Markers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5487157_55:C>T,False,0.000000,False,0.000000,False,0.000000,0,0.000000
3743550_33:A>C,False,0.000000,False,0.000000,False,0.000000,0,0.000000
3890504_12:A>G,False,0.000000,False,0.000000,False,0.000000,0,0.000000
3749835_12:T>A,False,0.000000,False,0.000000,False,0.000000,0,0.000000
5803165_43:G>C,False,0.000000,False,0.000000,False,0.000000,0,0.000000
3580391_37:C>T,False,0.000000,False,0.000000,False,0.000000,0,0.000000
3596752_10:G>A,False,0.000000,False,0.000000,False,0.000000,0,0.000000
5497838_52:C>G,False,0.000000,False,0.000000,False,0.000000,0,0.000000
3365135_13:C>G,False,0.000000,False,0.000326,False,0.000000,0,0.000326
3358883_23:T>C,False,0.000000,False,0.000000,False,0.000000,0,0.000000


In [36]:
# Keep markers with at least 2 Points (i.e. kept by at least two of the three
# selectors), ordered by Points and then Impact, highest first.
selected_wax_A = (
    selected_features_A
    .loc[selected_features_A["Points"] >= 2]
    .sort_values(by=["Points", "Impact"], ascending=False)
)

selected_wax_B = (
    selected_features_B
    .loc[selected_features_B["Points"] >= 2]
    .sort_values(by=["Points", "Impact"], ascending=False)
)

In [37]:
# Inspect the selected markers of dataset A with dtale.
dtale.show(selected_wax_A)



In [38]:
# (rows, columns) of the selected-marker table for dataset A.
selected_wax_A.shape

(20, 8)

In [39]:
# Inspect the selected markers of dataset B with dtale.
dtale.show(selected_wax_B)



In [40]:
# (rows, columns) of the selected-marker table for dataset B.
selected_wax_B.shape

(35, 8)

In [41]:
# Dataset wax A: write the selected-marker table to the processed-data folder.
# NOTE(review): the ".xls" extension needs the xlwt writer, which pandas 2.0
# removed; on current pandas this call raises. Confirm the pinned pandas
# version or switch the target to ".xlsx".
selected_wax_A.to_excel("../data/processed/wax_results_A.xls")

# Dataset wax B: same export for the second dataset (same ".xls" caveat).
selected_wax_B.to_excel("../data/processed/wax_results_B.xls")

In [43]:
# Dataset wax A: restrict the original data to the selected markers plus the
# "wax" column, then export it.
# NOTE(review): writing ".xls" needs the xlwt writer, which pandas 2.0 removed;
# confirm the pinned pandas version.
data_wax_selected_A = data_wax_A[[*selected_wax_A.index, "wax"]]
data_wax_selected_A.to_excel("../data/processed/wax_selected_A.xls")

# Dataset wax B: same, keeping the "wax_F2" column.
data_wax_selected_B = data_wax_B[[*selected_wax_B.index, "wax_F2"]]
data_wax_selected_B.to_excel("../data/processed/wax_selected_B.xls")

In [44]:
# Merge the selected markers of datasets A and B into a single summary table

In [45]:
# Tag every selected marker with the dataset it came from, then stack the two
# tables row-wise. The column label is kept exactly as spelled because it ends
# up in the exported files.
selected_wax_A = selected_wax_A.assign(**{"Assigment to 2R": "A"})
selected_wax_B = selected_wax_B.assign(**{"Assigment to 2R": "B"})

summary = pd.concat([selected_wax_A, selected_wax_B], axis=0)

In [46]:
# Re-sort the combined table so markers from both datasets are interleaved by
# Points and then Impact, highest first.
summary = summary.sort_values(by=["Points", "Impact"], ascending=False)
# Inspect the combined table with dtale.
dtale.show(summary)



In [47]:
# Export the combined summary table.
# NOTE(review): the ".xls" extension needs the xlwt writer, which pandas 2.0
# removed; confirm the pinned pandas version or switch the target to ".xlsx".
summary.to_excel("../data/processed/summary.xls")