# Gene Regulatory Network Inference
This is the notebook to process data and define functions before making a python script

In [1]:
# Import basic packages
import pandas as pd
import numpy as np

In [26]:
# LCL expression data
lcl_exp_path = '../dataset/LCL_networks/expression'
lcl_geu = pd.read_csv(f'{lcl_exp_path}/Geuvadis.txt', sep = '\t').set_index('Gene')

In [3]:
# Make prepare dataset function
from sklearn import model_selection
def prep_dataset(target_gene, exp_df):
    '''
    Prepares training set and test set for target gene
    '''
    # Get y (target) and predictor matrix (X)
    y = exp_df.loc[target_gene, :]
    X = exp_df.drop(target_gene).transpose().values
    
    
    # Split 80:20 for test and train
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.2)
    
    return X_train, X_test, y_train, y_test

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

def grn_lasso(target_gene, exp_df):
    # Prep data
    X_train, X_test, y_train, y_test = prep_dataset(target_gene, exp_df)
    
    # Use Lasso regression with Cross validations
    lasso_reg = Lasso(alpha = 0.01)
    lasso_reg.fit(X_train, y_train)
    
    # Get scores (R^2)
    train_score = lasso_reg.score(X_train, y_train) # Note: R^2 not very good, maybe use other methods
    test_score = lasso_reg.score(X_test, y_test)
    
    # Get weights of lasso, non zero weights are regulators
    predictors = exp_df.drop(target_gene).index
    zero_coef_filter= lasso_reg.coef_ == 0
    non_zero_preds = predictors[zero_coef_filter]
    
    
    return non_zero_preds.values

In [6]:
# Gene Regulator dictionary
# Key: target gene
# Value: List of regulators

lcl_regulators = dict()

lcl_genes = lcl_geu.index.values
for gene in lcl_genes:
    lcl_regulators[gene] = grn_lasso(gene, lcl_geu)



In [42]:
# Read in an expression file, and predict GRN using a method
def predict_grn(exp_file, method, *args):
    '''
    Function that reads in expression level and predict its GRN using a specific method
    
    Args:
        - exp_file: Path to expression file
        - method: Method of inferring GRN (Example: grn_lasso)
        - *args: additional arguments for method
    
    Returns:
        - Set of predicted edges with format (regulator(space)target)
    '''

    exp_df = pd.read_csv(exp_file, sep = '\t').set_index('Gene')

    pred_edges = set()
    
    
    for target in exp_df.index.values:
        # Get regulators for every target gene
        pred_regs = method(target, exp_df, *args)
        
        # Add the predicted regulators -> target edge to the predicted edges
        for reg in pred_regs:
            pred_edges.add(f'{reg}->{target}')
    
    return pred_edges

lcl_pred_edges = predict_grn(f'{lcl_exp_path}/Geuvadis.txt', grn_lasso)



In [43]:
# Performance Metrics
def iou_score(gold_file, pred_grn_edges):
    '''
    Function to score predicted grn vs gold standard based on intersection over union.

    Score:
    For starters, we will use the simple score of intersection / union
    Intersection: Count of edges in both gold standard AND predicted grn
    Union: Count of gold standard edges + predicted grn edges - Intersection
    
    Intersection / Union is the score.
    
    Args:
        - gold_file: Path to gold standard file
        - pred_grn_edges: set of predicted grn edges
    '''
    
    # Read in gold standard file
    gold_df = pd.read_csv(gold_file, sep = '\t', header = None, names = ['Regulator', 'Target']) 
    
    # Set of gold standard edges
    gold_edge_set = set(gold_df.loc[:, 'Regulator'] + '->' + gold_df.loc[:, 'Target'])
    
    # Get Intersection and Union
    intersection = gold_edge_set.intersection(pred_grn_edges)
    union = gold_edge_set.union(pred_grn_edges)
        
    # Get score: Intersection / Union
    score = len(intersection) / len(union)
    
    return score
    
    
lcl_gold = '../dataset/LCL_networks/gold/Cusanovich_gold.txt'
iou_score(lcl_gold, lcl_pred_edges)

0.00038918995888678537