In [1]:
import pandas as pd
import numpy as np
from data_prep import create_x_y_data

In [2]:
X_train, y_train, X_test, y_test, X, y = create_x_y_data()

In [20]:
columns = X_train.columns

### Lasso

In [92]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

log_model = LogisticRegression(penalty='l1', solver='saga', max_iter=10000)
log_model.fit(X_train, y_train)
y_pred = log_model.predict_proba(X_test)
log_model_ll = log_loss(y_test, y_pred)
log_model_ll

0.169102361565386

In [93]:
neg_corr = np.argsort(log_model.coef_)[0][:10]
poss_corr = np.argsort(log_model.coef_)[0][::-1][:10]

In [103]:
top_20_lasso = set.union(set(columns[neg_corr]), set(columns[poss_corr]))
top_20_lasso

{'ENSG00000005513.9',
 'ENSG00000078399.14',
 'ENSG00000079102.15',
 'ENSG00000101333.15',
 'ENSG00000120833.12',
 'ENSG00000156535.12',
 'ENSG00000164100.8',
 'ENSG00000164659.13',
 'ENSG00000165092.11',
 'ENSG00000167767.12',
 'ENSG00000171509.14',
 'ENSG00000188626.6',
 'ENSG00000232629.7',
 'ENSG00000248334.5',
 'ENSG00000254535.3',
 'ENSG00000259207.6',
 'ENSG00000260182.1',
 'ENSG00000267453.5',
 'ENSG00000271952.1',
 'ENSG00000279717.1'}

In [34]:
lasso_negative_features = dict(zip(columns[neg_corr], log_model.coef_[0][neg_corr]))

lasso_positive_features = dict(zip(columns[poss_corr], log_model.coef_[0][poss_corr]))

In [45]:
neg_corr_df = pd.DataFrame({'Genes': columns[neg_corr], 'Weights': log_model.coef_[0][neg_corr] })

pos_corr_df = pd.DataFrame({'Genes': columns[poss_corr], 'Weights': log_model.coef_[0][poss_corr] })

display(neg_corr_df.head())

display(pos_corr_df.head())

Unnamed: 0,Genes,Weights
0,ENSG00000271952.1,-0.207542
1,ENSG00000164659.13,-0.17693
2,ENSG00000078399.14,-0.163137
3,ENSG00000156535.12,-0.151611
4,ENSG00000188626.6,-0.123141


Unnamed: 0,Genes,Weights
0,ENSG00000260182.1,0.197338
1,ENSG00000167767.12,0.117894
2,ENSG00000164100.8,0.095247
3,ENSG00000171509.14,0.091827
4,ENSG00000267453.5,0.076074


In [26]:
from model_blender import important_gene_mask

In [33]:
print(len(important_gene_mask(columns, log_model.coef_[0])))

lasso_important = important_gene_mask(columns, log_model.coef_[0])

88


lasso found 88 genes important

two most important ones: ENSG00000271952.1 (negative), ENSG00000260182.1 (positive)

### Random Forest

In [79]:
from sklearn.ensemble import  RandomForestClassifier
from model_feature_importance import gene_weight_finder

In [84]:
rf_important, rf_top_20_names, rf_top_20_weights, rf_number_important, rf_ll = gene_weight_finder(rf, 
                                                                                X_train, X_test,
                                                                               y_train, y_test)
rf_feat_import_df = pd.DataFrame({'Genes': rf_top_20_names, 'Weights': rf_top_20_weights})

display(rf_feat_import_df.head())

print(f'log loss {rf_ll}')

print(f'number feats: {rf_number_important}')

Unnamed: 0,Genes,Weights
0,ENSG00000254369.5,0.02127
1,ENSG00000230453.8,0.017299
2,ENSG00000129194.6,0.016711
3,ENSG00000106006.6,0.013307
4,ENSG00000137449.14,0.013274


log loss 0.2970060960391264
number feats: 1649


### XGB

In [85]:
from xgboost import XGBClassifier
xgb = XGBClassifier(learning_rate = 0.01, max_depth = 3, n_estimators = 700, random_state=8, n_jobs=-1)

xgb_important, xgb_top_20_names, xgb_top_20_weights, xgb_number_important, xgb_ll = gene_weight_finder(xgb, 
                                                                                X_train, X_test,
                                                                               y_train, y_test)
xgb_feat_import_df = pd.DataFrame({'Genes': xgb_top_20_names, 'Weights': xgb_top_20_weights})

display(xgb_feat_import_df.head())

print(f'log loss {xgb_ll}')

print(f'number feats: {xgb_number_important}')

Unnamed: 0,Genes,Weights
0,ENSG00000254369.5,0.074813
1,ENSG00000230453.8,0.051746
2,ENSG00000172236.15,0.046758
3,ENSG00000206120.10,0.045511
4,ENSG00000092853.12,0.040524


log loss 0.18784231207841498
number feats: 241


### Intersection

In [87]:
len(set.intersection(set(xgb_important), set(rf_important), set(lasso_important)))

34

number of features that were given some weight in any of the three models

In [105]:
set.intersection(set(xgb_top_20_names), set(top_20_lasso) )

{'ENSG00000248334.5'}