In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
from sklearn.metrics import roc_curve, precision_recall_curve, auc, confusion_matrix
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
from statistics import mean, median
import shap
from sklearn.ensemble import RandomForestRegressor
import math
from scipy.stats import ranksums
from sklearn.model_selection import GridSearchCV

In [2]:
MR = 5

data_file = ("data/{}_mr_50_cond/simulated_noNoise.txt").format(MR)
grn_file = ("data/{}_mr_50_cond/bipartite_GRN.csv").format(MR)

data = pd.read_csv(data_file, sep="\t", header=0)
grn_df = pd.read_csv(grn_file, sep = ",", header = None, names=['TF_ID', 'G_ID'])
grn_df['class'] = 1

N_genes = 100  # total no. of genes
N_TFs = N_genes

In [24]:
n_estimators=[50, 200, 500, 1000, 2000]  # number of trees in the forest
criterion='squared_error'  # variance reduction equivalent
max_features = ['log2', 'sqrt', 100] # max no. of features to use in each tree
bootstrap = [True, False]
random_state = 42  # for reproducibility

In [25]:
param_grid = {
    'bootstrap': bootstrap,
    'max_features': max_features,
    'n_estimators': n_estimators
}

In [29]:
# Normalize Expression data to unit-variance
data_n = StandardScaler(with_mean=False).fit_transform(data.to_numpy())

# Initialize matrices
W = np.zeros(shape=(N_genes,N_TFs))
W_shap = np.zeros(shape=(N_genes,N_TFs))
Fscores = np.zeros(shape=(N_genes,))
best_params = shape=(N_genes,3

for j in np.arange(0,N_genes):
    # read TF and gene expression data X and Gj
    X, Gj= data_n[:,:N_TFs], data_n[:,N_genes+j]
    
    # fit an RF model to predict gene expression from TF
    M_rf = RandomForestRegressor(criterion=criterion, random_state=random_state)
    grid_search = GridSearchCV(estimator = M_rf, param_grid = param_grid, cv = 3)
    
    grid_search.fit(X,Gj)
                     
    best_grid = grid_search.best_estimator_
    
    # train score
    Fscores[j] = best_grid.score(X,Gj)

    # Get the weights for all edges connecting TFs to gene j
    W[j,:] = best_grid.feature_importances_

    # look at feature importance based on SHAP values
    explainer = shap.TreeExplainer(best_grid)
    shap_values = explainer(X)
    W_shap[j,:] = np.mean(np.abs(shap_values.values), axis=0)

TypeError: float() argument must be a string or a real number, not 'dict'

In [30]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 'log2', 'n_estimators': 50}

In [None]:
W_df = pd.DataFrame(np.abs(W))

plt.figure(figsize=(13, 3))
ax = sns.heatmap(W_df,
            cmap = "flare",
            linewidths=0.5)

grn_pred = pd.melt(W_df.reset_index(), id_vars = 'index', var_name='TF_ID', value_name='W_pred').rename(columns={'index': 'G_ID'})

grn_pred['G_ID'] = grn_pred['G_ID'].astype(np.int64) + 100
grn_pred['TF_ID'] = grn_pred['TF_ID'].astype(np.int64)

grn_eval = pd.merge(grn_pred,grn_df, on=['G_ID', 'TF_ID'], how='left')
grn_eval['class'] = grn_eval['class'].fillna(int(0))

grn_eval.to_csv("results/{}_mr_50_cond/grn_eval_rf_vr_{}.csv".format(MR, max_features))

precision, recall, thresholds_prc = precision_recall_curve(grn_eval['class'], grn_eval['W_pred'])
fpr, tpr, thresholds_roc = roc_curve(grn_eval['class'], grn_eval['W_pred'])
# Use AUC function to calculate the area under the curve of precision recall curve
print("auprc" , auc(recall, precision))
print("auroc" , auc(fpr,tpr))

roc_gene = [] 
for i in range(100):
    grn_eval_gene = grn_eval.iloc[i::N_TFs,:]
    roc_gene.append(metrics.roc_auc_score(grn_eval_gene['class'], grn_eval_gene['W_pred']))
    
print("mean auroc", mean(roc_gene))

pr_display = PrecisionRecallDisplay(precision=precision, recall=recall).plot()
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

print("ranksums", ranksums(grn_eval[grn_eval['class']==1]['W_pred'], grn_eval[grn_eval['class']==0]['W_pred'], alternative='greater'))

prc = pd.DataFrame({'precision': precision, 'recall': recall}, columns=['precision', 'recall'])
roc = pd.DataFrame({'fpr': fpr, 'tpr': tpr}, columns=['fpr', 'tpr'])
prc.to_csv("results/{}_mr_50_cond/grn_prc_rf_vr_{}.csv".format(MR, max_features))
roc.to_csv("results/{}_mr_50_cond/grn_roc_rf_vr_{}.csv".format(MR, max_features))

In [None]:
W_shap_df = pd.DataFrame(np.abs(W_shap))

plt.figure(figsize=(13, 3))
ax = sns.heatmap(W_shap_df,
            cmap = "flare",
            linewidths=0.5)

grn_pred_shap = pd.melt(W_shap_df.reset_index(), id_vars = 'index', var_name='TF_ID', value_name='W_pred').rename(columns={'index': 'G_ID'})

grn_pred_shap['G_ID'] = grn_pred_shap['G_ID'].astype(np.int64) + 100
grn_pred_shap['TF_ID'] = grn_pred_shap['TF_ID'].astype(np.int64)

grn_eval_shap = pd.merge(grn_pred_shap,grn_df, on=['G_ID', 'TF_ID'], how='left')
grn_eval_shap['class'] = grn_eval_shap['class'].fillna(int(0)) == 1.0

grn_eval_shap.to_csv("results/{}_mr_50_cond/grn_eval_rf_shap_{}.csv".format(MR, max_features))

precision, recall, thresholds_prc = precision_recall_curve(grn_eval_shap['class'], grn_eval_shap['W_pred'])
fpr, tpr, thresholds_roc = roc_curve(grn_eval_shap['class'], grn_eval_shap['W_pred'])
# Use AUC function to calculate the area under the curve of precision recall curve
print("auprc" , auc(recall, precision))
print("auroc" , auc(fpr,tpr))

roc_gene = [] 
for i in range(100):
    grn_eval_gene = grn_eval_shap.iloc[i::N_TFs,:]
    roc_gene.append(metrics.roc_auc_score(grn_eval_gene['class'], grn_eval_gene['W_pred']))
    
print("mean auroc", mean(roc_gene))

pr_display = PrecisionRecallDisplay(precision=precision, recall=recall).plot()
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

print("ranksums",ranksums(grn_eval_shap[grn_eval_shap['class']==1]['W_pred'], grn_eval_shap[grn_eval_shap['class']==0]['W_pred'], alternative='greater'))

prc = pd.DataFrame({'precision': precision, 'recall': recall}, columns=['precision', 'recall'])
roc = pd.DataFrame({'fpr': fpr, 'tpr': tpr}, columns=['fpr', 'tpr'])
prc.to_csv("results/{}_mr_50_cond/grn_prc_rf_shap_{}.csv".format(MR, max_features))
roc.to_csv("results/{}_mr_50_cond/grn_roc_rf_shap_{}.csv".format(MR, max_features))