In [12]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

def load_data(expression_file):
    data = pd.read_csv(expression_file, sep="\t")
    tf_expression = data.iloc[:, :100].values  # Assuming TFs are the first 100 columns
    tg_expression = data.iloc[:, 100:].values  # Assuming TGs are the next 100 columns
    return tf_expression, tg_expression

def train_models(tf_expression, tg_expression):
    models = []
    for i in range(tg_expression.shape[1]):
        model = RandomForestRegressor(n_estimators=100)
        model.fit(tf_expression, tg_expression[:, i])
        models.append(model)
    return models

def infer_grn(models, threshold=0.01):
    grn_edges = []
    for i, model in enumerate(models):
        importance = model.feature_importances_
        print(importance)
        regulators = np.where(importance > threshold)[0]
        for reg in regulators:
            grn_edges.append((reg, i+100))  # TF ids are 0-99, TG ids are 100-199
    return grn_edges

def evaluate_grn(predicted_edges, ground_truth_file):
    ground_truth = pd.read_csv(ground_truth_file, header=None)
    ground_truth_set = set(tuple(x) for x in ground_truth.values)
    predicted_set = set(predicted_edges)
    
    # Calculate precision, recall, and AUROC or other appropriate metrics
    # This is a simplification; in practice, you might need a more sophisticated evaluation
    
    tp = len(predicted_set & ground_truth_set)
    fp = len(predicted_set - ground_truth_set)
    fn = len(ground_truth_set - predicted_set)
    
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    
    return precision, recall

# Load your data and adjust file path as necessary!
tf_expression, tg_expression = load_data(r'C:\Users\Neel Patel\Documents\Github Repositories\Machine-Learning-Biosciences-Final-Project\Project1\100_mr_50_cond\simulated_noNoise.txt')

# Train a model for each target gene
models = train_models(tf_expression, tg_expression)

# Infer the gene regulatory network
predicted_grn_edges = infer_grn(models)

# Evaluate the GRN
precision, recall = evaluate_grn(predicted_grn_edges, r'C:\Users\Neel Patel\Documents\Github Repositories\Machine-Learning-Biosciences-Final-Project\Project1\100_mr_50_cond\bipartite_GRN.csv')

print(f"Precision: {precision}, Recall: {recall}")


[0.00164527 0.00461832 0.00638097 0.00565427 0.00787769 0.00190517
 0.00310656 0.00538565 0.00236359 0.00482968 0.00726952 0.00512827
 0.00247066 0.00289155 0.00500929 0.00226573 0.00515091 0.00264121
 0.00860236 0.00607505 0.00323905 0.00180325 0.00334087 0.00545934
 0.0116773  0.01073826 0.00515912 0.00488914 0.00202392 0.00883262
 0.00305472 0.00528866 0.00966426 0.00855027 0.00882238 0.01130228
 0.0050351  0.00257666 0.00173651 0.01637498 0.00815138 0.00417709
 0.00547009 0.00569809 0.02634239 0.07865359 0.00225844 0.0035282
 0.00811586 0.00422138 0.00234898 0.00283415 0.01810859 0.00562321
 0.11146517 0.00761659 0.00776509 0.00507075 0.00626188 0.00539381
 0.0103386  0.00158211 0.01312152 0.00852895 0.004273   0.00462062
 0.00455409 0.0055442  0.00610183 0.01664001 0.00696074 0.00613088
 0.00493666 0.01008091 0.01727931 0.00991769 0.00357577 0.03665411
 0.00614216 0.00311334 0.06157334 0.0092405  0.00723295 0.01116156
 0.00813715 0.00563006 0.00316799 0.00345771 0.00683391 0.06425

[0.00300567 0.00793148 0.00566728 0.00600357 0.08221774 0.00639814
 0.00530955 0.09697896 0.00585432 0.00780883 0.0035557  0.00494765
 0.00250369 0.0034034  0.00326608 0.01023406 0.00781302 0.08050147
 0.00383906 0.00134494 0.00919985 0.00386271 0.00391434 0.00489488
 0.00408602 0.00451868 0.00567341 0.06035978 0.00390442 0.00770716
 0.00745357 0.00330212 0.00499614 0.00284589 0.00584468 0.01196334
 0.00132161 0.02113307 0.00953274 0.00286127 0.00171754 0.00822951
 0.0018947  0.0065582  0.00500811 0.00298042 0.0049389  0.0071528
 0.00795819 0.00594186 0.00317363 0.00380201 0.00512982 0.00331768
 0.00642202 0.00757336 0.00298001 0.00341073 0.00718774 0.00436806
 0.00808309 0.00206175 0.01931854 0.00671661 0.00606566 0.00417684
 0.00388767 0.00178809 0.0032114  0.00756821 0.00418965 0.00598806
 0.00156327 0.01121963 0.00268492 0.04070325 0.00577474 0.00732763
 0.00448679 0.03978842 0.00810225 0.00419231 0.00480417 0.00361545
 0.06327666 0.00370413 0.00449739 0.01880027 0.00180218 0.00426

[0.00298884 0.00209089 0.00609703 0.00418019 0.00916255 0.00143199
 0.00207128 0.00844786 0.00918211 0.0039467  0.00494861 0.0032438
 0.00431347 0.00594386 0.00348652 0.00468568 0.0087393  0.01178076
 0.01011637 0.00220842 0.00689359 0.02533412 0.00377832 0.03535085
 0.01228941 0.00590056 0.00489024 0.03374823 0.00202722 0.00207987
 0.00496331 0.00620612 0.00313284 0.00462307 0.00326703 0.00988017
 0.00088112 0.00214136 0.00480436 0.00355829 0.00513939 0.002345
 0.00350448 0.00221098 0.00863659 0.00098411 0.00963148 0.01663769
 0.01251557 0.00319997 0.00197326 0.37335212 0.00844809 0.00523665
 0.00248044 0.00782791 0.00345715 0.00655295 0.003804   0.00461829
 0.00353898 0.00372997 0.01023555 0.00317735 0.02547592 0.00852993
 0.00331443 0.00383731 0.00169107 0.01102357 0.00240948 0.00181053
 0.00792312 0.00610952 0.0053933  0.0046099  0.00494798 0.0038107
 0.00514846 0.00247734 0.00354663 0.00224174 0.00413937 0.00455345
 0.0083698  0.01031149 0.0171393  0.0013357  0.00543954 0.00310891

# Approach 1 - Correlation Pruning

In [11]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor

def correlation_pruning(preliminary_network, expression_data, threshold=0.4):
    pruned_network = []
    for tf, tg in preliminary_network:
        correlation, _ = pearsonr(expression_data[:, tf], expression_data[:, tg])  # tg is already referenced from indices 100 onward
        if abs(correlation) > threshold:
            pruned_network.append((tf, tg))
    return pruned_network

# Assume the rest of the functions like train_models and load_data are defined as above
expression_data = pd.read_csv(r'C:\Users\Neel Patel\Documents\Github Repositories\Machine-Learning-Biosciences-Final-Project\Project1\100_mr_50_cond\simulated_noNoise.txt', sep="\t", header=None).values

# Load your data
tf_expression, tg_expression = load_data(r'C:\Users\Neel Patel\Documents\Github Repositories\Machine-Learning-Biosciences-Final-Project\Project1\100_mr_50_cond\simulated_noNoise.txt')

# Train a model for each target gene and construct preliminary network
models = train_models(tf_expression, tg_expression)
preliminary_network = infer_grn(models)  # Modify this function to output (tf, tg) pairs

# Prune the network based on correlation
final_network = correlation_pruning(preliminary_network, expression_data)

# Evaluate the final GRN
precision, recall = evaluate_grn(final_network, r'C:\Users\Neel Patel\Documents\Github Repositories\Machine-Learning-Biosciences-Final-Project\Project1\100_mr_50_cond\bipartite_GRN.csv')

print(f"Precision: {precision}, Recall: {recall}")

#THIS DOES NOT WORK REALLY WELL!

Precision: 0.24606151537884471, Recall: 0.6735112936344969


# Approach 2 - Topological graphs and centrality


In [15]:
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.ensemble import RandomForestRegressor

def load_data(expression_file):
    data = pd.read_csv(expression_file, sep="\t", header=None)
    tf_expression = data.iloc[:, :100].values  # Assuming TFs are the first 100 columns
    tg_expression = data.iloc[:, 100:].values  # Assuming TGs are the next 100 columns
    return tf_expression, tg_expression

def train_models(tf_expression, tg_expression):
    models = []
    for i in range(tg_expression.shape[1]):
        model = RandomForestRegressor(n_estimators=100)
        model.fit(tf_expression, tg_expression[:, i])
        models.append(model)
    return models

def infer_grn(models):
    preliminary_network = []
    for tg, model in enumerate(models):
        importance = model.feature_importances_
        # Considering TFs with non-zero importance
        for tf, imp in enumerate(importance):
            if imp > 0:
                preliminary_network.append((tf, tg + 100))  # tg + 100 to adjust index for TGs
    return preliminary_network

def apply_network_topology_analysis(preliminary_network, models):
    G = nx.DiGraph()
    for tf, tg in preliminary_network:
        weight = models[tg - 100].feature_importances_[tf]  # Adjusting index for TGs
        G.add_edge(tf, tg, weight=weight)

    centrality = nx.betweenness_centrality(G, weight='weight')
    refined_network = [(tf, tg) for tf, tg in preliminary_network if centrality[tf] > np.median(list(centrality.values()))]

    return refined_network

def evaluate_grn(predicted_network, ground_truth_file):
    ground_truth = pd.read_csv(ground_truth_file, header=None)
    ground_truth_set = set(map(tuple, ground_truth.values))
    predicted_set = set(predicted_network)
    
    tp = len(predicted_set & ground_truth_set)
    fp = len(predicted_set - ground_truth_set)
    fn = len(ground_truth_set - predicted_set)
    
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    
    return precision, recall

# Load data
expression_file = r'C:\Users\Neel Patel\Documents\Github Repositories\Machine-Learning-Biosciences-Final-Project\Project1\100_mr_50_cond\simulated_noNoise.txt'
ground_truth_file = r'C:\Users\Neel Patel\Documents\Github Repositories\Machine-Learning-Biosciences-Final-Project\Project1\100_mr_50_cond\bipartite_GRN.csv'
tf_expression, tg_expression = load_data(expression_file)

# Train Random Forest models
models = train_models(tf_expression, tg_expression)

# Infer preliminary GRN
preliminary_network = infer_grn(models)

# Refine network using topology analysis
final_network = apply_network_topology_analysis(preliminary_network, models)

# Evaluate the GRN
precision, recall = evaluate_grn(final_network, ground_truth_file)
print(f"Precision: {precision}, Recall: {recall}")


Precision: 0, Recall: 0.0
