In [46]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_curve, precision_recall_curve, auc, confusion_matrix
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
from sklearn.preprocessing import StandardScaler


In [40]:
# Data-set selector substituted into the directory name; valid values are
# 5, 40 or 100 (presumably the number of master regulators -- TODO confirm).
MR = '5'

# Problem size: the set of candidate regulators is as large as the gene set.
N_genes = 100
N_TFs = N_genes

# Input locations for the selected data set.
data_file = "data/{}_mr_50_cond/simulated_noNoise.txt".format(MR)
grn_file = "data/{}_mr_50_cond/bipartite_GRN.csv".format(MR)

# Expression table: tab-separated, first row holds the column names.
data = pd.read_csv(data_file, header=0, sep="\t")

# Ground-truth edge list: header-less CSV of (TF, gene) pairs. Every listed
# pair is a true edge, so it gets the constant label W_true = 1.
grn_df = pd.read_csv(
    grn_file,
    names=['TF_ID', 'G_ID'],
    header=None,
    sep=",",
).assign(W_true=1)

In [41]:
# Random-forest hyper-parameters (tree count and feature subsampling follow the GENIE paper).
random_state = 42              # fixed seed so that reruns give the same forests
criterion = 'squared_error'    # split quality measure; equivalent to variance reduction
n_estimators = 1000            # trees grown per forest
# Features tried at each split: floor(sqrt(N_genes - 1)).
# NOTE(review): the forests are fitted on N_TFs input columns -- confirm that
# N_genes - 1 (rather than N_TFs) is the intended feature count here.
max_features = int(np.sqrt(N_genes - 1))

In [42]:
# Scale every expression column to unit variance without centring it
# (with_mean=False keeps the column means where they are).
data_n = StandardScaler(with_mean=False).fit_transform(data.to_numpy())

# Fail early with a clear message instead of an IndexError (or a silently
# truncated TF slice) part-way through a long training run.
_n_cols_needed = max(N_TFs, 2 * N_genes)
if data_n.shape[1] < _n_cols_needed:
    raise ValueError(
        "expression matrix has {} columns, but at least {} are required "
        "(TF columns 0..{} and gene columns {}..{})".format(
            data_n.shape[1], _n_cols_needed, N_TFs - 1, N_genes, 2 * N_genes - 1
        )
    )

# W[j, i]    : importance of TF i in the forest that predicts gene j
# Fscores[j] : R^2 of that forest on its own training data (not a held-out score)
W = np.zeros(shape=(N_genes, N_TFs))
Fscores = np.zeros(shape=(N_genes,))

# The TF expression block is the design matrix for every target gene,
# so it is sliced once outside the loop.
X = data_n[:, :N_TFs]

# One forest per target gene: N_genes forests of n_estimators trees each,
# so this cell dominates the notebook's run time.
for j in range(N_genes):
    # Expression of target gene j.
    # NOTE(review): gene columns are taken to start at index N_genes, which
    # coincides with N_TFs here -- confirm the layout if the two ever differ.
    Gj = data_n[:, N_genes + j]

    # Fit an RF model that predicts the gene's expression from the TFs
    M_rf = RandomForestRegressor(
        criterion=criterion,
        n_estimators=n_estimators,
        max_features=max_features,
        random_state=random_state,
    ).fit(X, Gj)

    # Training-set score
    Fscores[j] = M_rf.score(X, Gj)

    # Weights of all edges connecting the TFs to gene j
    W[j, :] = M_rf.feature_importances_

In [43]:
# Edge-weight table (rows: target genes, columns: TFs): absolute weights
# stored in single precision.
W_abs = np.absolute(W)
W_df = pd.DataFrame(W_abs, dtype=np.float32)

In [48]:
# Reshape the (gene x TF) weight matrix into long format: one row per
# (gene, TF) pair carrying the predicted edge weight W_pred.
grn_pred = (
    W_df
    .reset_index()
    .melt(id_vars='index', var_name='TF_ID', value_name='W_pred')
    .rename(columns={'index': 'G_ID'})
)

# Convert row/column positions into the IDs used by the ground-truth file.
# Gene IDs are shifted by N_genes, the same offset used to index the gene
# columns of the expression matrix when the forests were fitted.
# NOTE(review): this equals N_TFs here -- confirm which one is meant if they ever differ.
grn_pred['G_ID'] = grn_pred['G_ID'].astype(np.int64) + N_genes
grn_pred['TF_ID'] = grn_pred['TF_ID'].astype(np.int64)

# Left-join the ground truth onto every candidate edge. Duplicate edges in the
# ground-truth list are collapsed first so that the join cannot multiply rows.
grn_true = grn_df.drop_duplicates(subset=['TF_ID', 'G_ID'])
grn_eval = pd.merge(grn_pred, grn_true, on=['G_ID', 'TF_ID'], how='left')
assert len(grn_eval) == W_df.size, "expected exactly one row per (gene, TF) pair"

# Pairs missing from the ground truth come back as NaN, i.e. "no edge";
# W_true becomes a boolean edge indicator.
grn_eval['W_true'] = grn_eval['W_true'].fillna(0).eq(1)


In [49]:
# Edge-level evaluation: every (gene, TF) pair is one binary sample whose label
# is the ground-truth edge indicator and whose score is the predicted weight.
y_true = grn_eval['W_true']
y_score = grn_eval['W_pred']

precision, recall, thresholds_prc = precision_recall_curve(y_true, y_score)
fpr, tpr, thresholds_roc = roc_curve(y_true, y_score)

# Areas under the precision-recall curve and the ROC curve over all pairs
print("AUPRC (all edges): {:.4f}".format(auc(recall, precision)))
print("AUROC (all edges): {:.4f}".format(auc(fpr, tpr)))

# Per-gene ROC AUC: rank the candidate TFs of each target gene separately.
# Grouping by G_ID does not depend on the row order left by melt/merge.
roc_gene = []
for _, grn_eval_gene in grn_eval.groupby('G_ID', sort=True):
    # roc_auc_score is undefined (raises ValueError) when a gene has only one
    # class among its candidate edges, so such genes are left out of the mean.
    if grn_eval_gene['W_true'].nunique() < 2:
        continue
    roc_gene.append(metrics.roc_auc_score(grn_eval_gene['W_true'], grn_eval_gene['W_pred']))

mean_roc_gene = np.mean(roc_gene) if roc_gene else float('nan')
print("Mean per-gene AUROC ({} genes scored): {:.4f}".format(len(roc_gene), mean_roc_gene))

In [None]:
# Draw the precision-recall curve from the arrays computed during evaluation.
pr_display = PrecisionRecallDisplay(
    recall=recall,
    precision=precision,
)
pr_display = pr_display.plot()

In [None]:
# Draw the ROC curve from the arrays computed during evaluation.
roc_display = RocCurveDisplay(
    tpr=tpr,
    fpr=fpr,
)
roc_display = roc_display.plot()

In [None]:
# Pick the ROC operating point with the largest gap between the true-positive
# and false-positive rates (Youden's J statistic) and read off its threshold.
youden_j = tpr - fpr
optimal_idx = youden_j.argmax()
optimal_threshold = thresholds_roc[optimal_idx]

In [None]:
# Show the selected decision threshold as the cell output.
optimal_threshold

In [None]:
# Binarise the predicted edge weights at the chosen threshold:
# 1 = edge predicted, 0 = no edge. A single vectorised comparison keeps the
# cell idempotent and yields a clean integer column instead of a
# NaN-initialised float column filled in by two masked writes.
grn_eval['pred'] = (grn_eval['W_pred'] >= optimal_threshold).astype(np.int64)


In [None]:
# Confusion matrix of ground-truth edges (rows) vs. thresholded predictions
# (columns). Both columns are cast to integer labels and the label order is
# fixed so that the result is always the 2x2 layout [[TN, FP], [FN, TP]].
confusion_matrix(
    grn_eval['W_true'].astype(int),
    grn_eval['pred'].astype(int),
    labels=[0, 1],
)