In [1]:
from sklearn.feature_selection import SelectFromModel
import pandas as pd
import numpy as np
import joblib

In [2]:
# Marker names (the feature columns of each dataset)

In [3]:
# Dataset A: the first CSV column becomes the row index; every column label
# other than "wax" is kept as a marker name.
data_wax_A = pd.read_csv("../data/interim/wax_data_A.csv", index_col=0)
markers_A = data_wax_A.drop(columns="wax").columns

# Dataset B: same procedure, but the excluded column is called "wax_F2".
data_wax_B = pd.read_csv("../data/interim/wax_data_B.csv", index_col=0)
markers_B = data_wax_B.drop(columns="wax_F2").columns

In [4]:
# load pre-trained models

In [5]:
# Models for dataset A, loaded in the order: logistic regression,
# random forest, XGBoost.
# NOTE(review): joblib.load unpickles its input, so only load model files
# that come from a trusted source.
lr, rf, xgb = (
    joblib.load(model_file)
    for model_file in (
        "../models/logistic_regression_wax_A",
        "../models/random_forest_wax_A",
        "../models/XGBoost_wax_A",
    )
)

In [6]:
# Models for dataset B, loaded in the order: logistic regression,
# random forest, XGBoost.
# NOTE(review): joblib.load unpickles its input, so only load model files
# that come from a trusted source.
lr2, rf2, xgb2 = (
    joblib.load(model_file)
    for model_file in (
        "../models/logistic_regression_wax_B",
        "../models/random_forest_wax_B",
        "../models/XGBoost_wax_B",
    )
)

In [7]:
# Feature selection

In [8]:
# Dataset A: wrap each already-loaded model in a selector. prefit=True means
# the model is used as-is (no refitting); threshold="mean" makes the cut-off
# the mean importance across features.
lr_selector_A, rf_selector_A, xgbc_selector_A = (
    SelectFromModel(loaded_model, prefit=True, threshold="mean")
    for loaded_model in (lr, rf, xgb)
)

In [9]:
# Dataset B: wrap each already-loaded model in a selector. prefit=True means
# the model is used as-is (no refitting); threshold="mean" makes the cut-off
# the mean importance across features.
lr_selector_B, rf_selector_B, xgbc_selector_B = (
    SelectFromModel(loaded_model, prefit=True, threshold="mean")
    for loaded_model in (lr2, rf2, xgb2)
)

In [10]:
def _selection_columns(markers, lr_model, rf_model, xgb_model, lr_sel, rf_sel, xgb_sel):
    """Gather, per marker, each model's selection flag and importance score.

    Parameters
    ----------
    markers : sequence of marker labels, one per feature.
    lr_model, rf_model, xgb_model : the logistic-regression, random-forest and
        XGBoost models; ``coef_`` is read from the first and
        ``feature_importances_`` from the other two.
    lr_sel, rf_sel, xgb_sel : the matching selectors; ``get_support()`` is
        called on each to obtain the boolean "selected" mask.

    Returns
    -------
    dict mapping column name to values, in the column order used by the
    exported result tables.
    """
    return {
        "Markers": markers,
        "Logistic regression": lr_sel.get_support(),
        # Absolute coefficients summed over axis 0 give one value per feature
        # (axis 0 is presumably the class axis of coef_ -- verify for the model used).
        "Logistic regression coefficients": np.sum(np.abs(lr_model.coef_), axis=0),
        "Random forest": rf_sel.get_support(),
        "Random forest coefficients": rf_model.feature_importances_,
        "XGBoost": xgb_sel.get_support(),
        "XGBoost coefficients": xgb_model.feature_importances_,
    }


def _score_markers(columns):
    """Turn a column dict from ``_selection_columns`` into a scored table.

    The result is indexed by "Markers" and gains two columns:
    "Points" (how many of the three selectors picked the marker) and
    "Impact" (sum of the three importance columns).
    """
    vote_columns = ("Logistic regression", "Random forest", "XGBoost")
    weight_columns = (
        "Logistic regression coefficients",
        "Random forest coefficients",
        "XGBoost coefficients",
    )

    table = pd.DataFrame.from_dict(columns).set_index("Markers", drop=True)
    # Summing the boolean masks counts the models that selected each marker.
    table["Points"] = np.sum([table[name] for name in vote_columns], axis=0)
    # NOTE(review): this adds coefficient magnitudes to tree importances as-is;
    # the two may be on different scales -- confirm this is the intended ranking key.
    table["Impact"] = np.sum([table[name] for name in weight_columns], axis=0)
    return table


# Dataset A
selected_features_dict_A = _selection_columns(
    markers_A, lr, rf, xgb, lr_selector_A, rf_selector_A, xgbc_selector_A
)
selected_features_A = _score_markers(selected_features_dict_A)

# Dataset B
selected_features_dict_B = _selection_columns(
    markers_B, lr2, rf2, xgb2, lr_selector_B, rf_selector_B, xgbc_selector_B
)
selected_features_B = _score_markers(selected_features_dict_B)


In [11]:
# Keep only markers with at least 2 "Points", then order them by "Points"
# and, within equal points, by "Impact" (both descending).
selected_wax_A = (
    selected_features_A
    .loc[selected_features_A["Points"] >= 2]
    .sort_values(by=["Points", "Impact"], ascending=False)
)

selected_wax_B = (
    selected_features_B
    .loc[selected_features_B["Points"] >= 2]
    .sort_values(by=["Points", "Impact"], ascending=False)
)

In [12]:
# Display the filtered, sorted selection table for dataset A.
selected_wax_A

Unnamed: 0_level_0,Logistic regression,Logistic regression coefficients,Random forest,Random forest coefficients,XGBoost,XGBoost coefficients,Points,Impact
Markers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3591025,True,0.667231,True,0.031913,True,0.787593,3,1.486737
3889647,True,0.291122,True,0.027437,True,0.067095,3,0.385655
3585843,True,0.20985,True,0.038007,True,0.017952,3,0.265809
3358122,True,0.12595,True,0.029265,True,0.021131,3,0.176346
3341848,True,0.075929,True,0.008343,True,0.002167,3,0.086439
3596333_37:G>A,True,0.189322,True,0.002748,False,0.0,2,0.19207
3362424_51:T>G,True,0.156371,True,0.003633,False,0.0,2,0.160004
3341813,True,0.110822,True,0.001793,False,0.0,2,0.112614
3590103,True,0.077688,True,0.032741,False,0.0,2,0.110429
3736285,True,0.079614,True,0.023673,False,0.0,2,0.103287


In [13]:
# (rows, columns) of the dataset A selection table; rows = retained markers.
selected_wax_A.shape

(17, 8)

In [14]:
# Display the filtered, sorted selection table for dataset B.
selected_wax_B

Unnamed: 0_level_0,Logistic regression,Logistic regression coefficients,Random forest,Random forest coefficients,XGBoost,XGBoost coefficients,Points,Impact
Markers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3593882,True,0.552643,True,0.022676,True,0.024216,3,0.599535
3578307_27:A>G,True,0.412798,True,0.011571,True,0.012885,3,0.437253
3908692_28:C>T,True,0.314596,True,0.016947,True,0.00845,3,0.339993
3362575_18:C>T,True,0.192316,True,0.012785,True,0.086207,3,0.291309
4485942_42:T>G,True,0.192047,True,0.001389,True,0.068968,3,0.262404
3597393_10:T>G,True,0.061615,True,0.004174,True,0.162081,3,0.22787
4092788_55:G>A,True,0.044455,True,0.003999,True,0.009425,3,0.057879
3750485,True,0.022135,True,0.008561,True,0.00247,3,0.033167
3896238,True,0.52146,True,0.008993,False,0.0,2,0.530453
3731702,True,0.352067,True,0.010661,False,0.0,2,0.362728


In [15]:
# (rows, columns) of the dataset B selection table; rows = retained markers.
selected_wax_B.shape

(35, 8)

In [16]:
# Dataset A: write the selection table (flags, importances, Points, Impact) to disk.
# NOTE(review): the ".xls" extension needs the legacy xlwt writer, which pandas
# removed in 2.0 -- confirm the pinned pandas version, or move to ".xlsx"
# together with whatever reads these files.
selected_wax_A.to_excel("../data/processed/wax_results_A.xls")

# Dataset B: same export for the second dataset.
selected_wax_B.to_excel("../data/processed/wax_results_B.xls")

In [17]:
# Dataset A: reduce the original data to the selected marker columns
# followed by the "wax" column, then export it.
# NOTE(review): writing ".xls" needs the legacy xlwt writer, which pandas
# removed in 2.0 -- confirm the pinned pandas version.
data_wax_selected_A = data_wax_A.loc[:, [*selected_wax_A.index.tolist(), "wax"]]
data_wax_selected_A.to_excel("../data/processed/wax_selected_A.xls")

# Dataset B: same reduction, keeping the "wax_F2" column as the last one.
data_wax_selected_B = data_wax_B.loc[:, [*selected_wax_B.index.tolist(), "wax_F2"]]
data_wax_selected_B.to_excel("../data/processed/wax_selected_B.xls")

In [18]:
# Merge the selections from both datasets

In [19]:
# Tag every selected marker with the dataset it came from. The column header
# spelling ("Assigment") is kept exactly because it ends up in the exported file.
selected_wax_A = selected_wax_A.assign(**{"Assigment to 2R": "A"})
selected_wax_B = selected_wax_B.assign(**{"Assigment to 2R": "B"})

# Stack the two tagged tables row-wise (A rows first, then B rows).
summary = pd.concat([selected_wax_A, selected_wax_B])

In [20]:
# Re-rank the combined table: most "Points" first, ties broken by larger "Impact".
summary = summary.sort_values(["Points", "Impact"], ascending=[False, False])
summary

Unnamed: 0_level_0,Logistic regression,Logistic regression coefficients,Random forest,Random forest coefficients,XGBoost,XGBoost coefficients,Points,Impact,Assigment to 2R
Markers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3591025,True,0.667231,True,0.031913,True,0.787593,3,1.486737,A
3593882,True,0.552643,True,0.022676,True,0.024216,3,0.599535,B
3578307_27:A>G,True,0.412798,True,0.011571,True,0.012885,3,0.437253,B
3889647,True,0.291122,True,0.027437,True,0.067095,3,0.385655,A
3908692_28:C>T,True,0.314596,True,0.016947,True,0.00845,3,0.339993,B
3362575_18:C>T,True,0.192316,True,0.012785,True,0.086207,3,0.291309,B
3585843,True,0.20985,True,0.038007,True,0.017952,3,0.265809,A
4485942_42:T>G,True,0.192047,True,0.001389,True,0.068968,3,0.262404,B
3597393_10:T>G,True,0.061615,True,0.004174,True,0.162081,3,0.22787,B
3358122,True,0.12595,True,0.029265,True,0.021131,3,0.176346,A


In [21]:
# Write the combined, ranked marker table to disk.
# NOTE(review): the ".xls" extension needs the legacy xlwt writer, which pandas
# removed in 2.0 -- confirm the pinned pandas version, or move to ".xlsx"
# together with whatever reads this file.
summary.to_excel("../data/processed/summary.xls")