# Gene Regulatory Network Prediction of Yeast Network
The task:
Predict which transcription factors affect which genes (for now, this is non-TF).

In [1]:
# Import stuff
import pandas as pd
import numpy as np

In [28]:
# Expression matrices: tab-separated files, indexed by their gene-identifier column
ko_df = pd.read_table('../dataset/yeast_networks/expression/KO.txt').set_index('Gene')
nv_df = pd.read_table('../dataset/yeast_networks/expression/NatVar.txt').set_index('Gene')
stress_df = pd.read_table('../dataset/yeast_networks/expression/Stress.txt').set_index('Name')

# Transcription factor names: header-less files, loaded into a single column named 'TF'
ko_tf = pd.read_table('../dataset/yeast_networks/expression/KO_TF_names.txt', header=None)
ko_tf.columns = ['TF']

nv_tf = pd.read_table('../dataset/yeast_networks/expression/NatVar_TF_names.txt', header=None)
nv_tf.columns = ['TF']

stress_tf = pd.read_table('../dataset/yeast_networks/expression/Stress_TF_names.txt', header=None)
stress_tf.columns = ['TF']

# Note: each dataframe has a different gene list, so only the KO data is used as the test case for now.

In [108]:
# Prepare dataset: set which are the predictors (TF), and which are the targets
from sklearn import model_selection
def prep_dataset(target_gene, tf_list, exp_df, test_size=0.2, random_state=None):
    '''
    Prepares training set and test set for target gene

    Args:
        - target_gene: target gene for the iteration (y), a row label of exp_df
        - tf_list: transcription factors, which will be the predictors (X).
          Any iterable of row labels of exp_df (pandas Series, list, array, ...)
        - exp_df: expression dataframe (already in pandas df format), one row per gene
        - test_size: share of the samples held out for testing (default 0.2, i.e. an 80:20 split)
        - random_state: seed forwarded to train_test_split; pass an int to make the
          split reproducible (default None keeps the previous, unseeded behaviour)

    Returns:
        - Training and Testing set to be used in model predictions
          (X_train, X_test, y_train, y_test)
        - label for predictors (X_label), so we can subset this later
    '''
    # A gene must not be used to predict itself, so it is left out of its own predictors
    predictor_names = [tf for tf in tf_list if tf != target_gene]

    # Get y (target) and predictor matrix (X)
    y = exp_df.loc[target_gene, :].values
    X = exp_df.loc[predictor_names, :]

    X_label = X.index # Predictor labels for X
    # Rows of exp_df are genes; transpose so that every row of X is one sample
    X = X.values.transpose()

    # Split into train and test (80:20 unless test_size says otherwise)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test, X_label

In [124]:
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

def grn_lasso(target_gene, tf_list, exp_df, alphas=(0.01, 0.1, 1, 10, 100), cv=5):
    '''
    GRN inference method using lasso regression

    Args:
        - target_gene: target gene for the iteration (y)
        - tf_list: transcription factors, which will be the predictors (X)
        - exp_df: expression dataframe (already in pandas df format)
        - alphas: candidate regularisation strengths tried by LassoCV
        - cv: number of cross-validation folds used by LassoCV to pick alpha

    Returns:
        - Numpy array with the labels of the predictors whose lasso weight is non-zero
    '''
    # Prep data. Only the training part is used here; the held-out part is discarded.
    X_train, _, y_train, _, X_label = prep_dataset(target_gene, tf_list, exp_df)

    # Use Lasso regression, alpha is selected by cross-validation
    lasso_reg = LassoCV(alphas=list(alphas), cv=cv)
    lasso_reg.fit(X_train, y_train)

    # Get weights of lasso, non zero weights are regulators
    nonzero_filter = lasso_reg.coef_ != 0

    return X_label[nonzero_filter].values

def grn_regforest(target_gene, tf_list, exp_df, n_estimators=10, max_depth=8, min_samples_leaf=10):
    '''
    GRN inference method using regression forest. This method does not assume linearity of data.

    Args:
        - target_gene: target gene for the iteration (y)
        - tf_list: transcription factors, which will be the predictors (X)
        - exp_df: expression dataframe (already in pandas df format)
        - n_estimators: number of trees in the forest
        - max_depth: maximum depth of every tree
        - min_samples_leaf: minimum number of samples required in a leaf

    Returns:
        - Numpy array with the labels of the predictors whose feature importance is non-zero
    '''
    # Prep data. Only the training part is used here; the held-out part is discarded.
    X_train, _, y_train, _, X_label = prep_dataset(target_gene, tf_list, exp_df)

    # Use regression forest
    forest_reg = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        bootstrap=True,
        min_samples_leaf=min_samples_leaf,
        n_jobs=-1,
    )
    forest_reg.fit(X_train, y_train)

    # Get feature importance, predictors with non zero importance are regulators
    nonzero_filter = forest_reg.feature_importances_ != 0

    return X_label[nonzero_filter].values

In [117]:
# Read in an expression file, and predict GRN using a method
def predict_grn(exp_df, tf_list, method, *args):
    '''
    Infers a GRN by running an inference method once for every gene of an expression dataframe

    Args:
        - exp_df: expression dataframe (already in pandas df format), index is gene names, columns are sample / treatments
        - tf_list: transcription factors, which will be the predictors (X)
        - method: Method of inferring GRN (Example: grn_lasso), called as
          method(target, tf_list, exp_df, *args) and returning the regulators of target
        - *args: additional arguments for method

    Returns:
        - Set of predicted edges, each one a string with format regulator->target
    '''
    edges = set()

    # Every gene in the index takes its turn as the target
    for gene in exp_df.index.values:
        regulators = method(gene, tf_list, exp_df, *args)
        # One edge per predicted regulator of this gene
        edges.update(f'{regulator}->{gene}' for regulator in regulators)

    return edges

In [125]:
# Infer the KO network with both methods, using the KO transcription factors as predictors
ko_regforest_edges = predict_grn(ko_df, ko_tf['TF'], grn_regforest)
ko_lasso_edges = predict_grn(ko_df, ko_tf['TF'], grn_lasso)

Currently finished: 100.0 %


In [126]:
# Performance Metrics

def iou_score(gold_file, pred_grn_edges):
    '''
    Scores a predicted grn against a gold standard with intersection over union (IOU).

    Score:
    Intersection: edges found in both the gold standard AND the predicted grn
    Union: edges found in the gold standard OR the predicted grn
    The score is the number of intersection edges divided by the number of union edges.
    Both counts are printed as a side effect.

    Args:
        - gold_file: Path to gold standard file (tab separated, no header,
          regulator in the first column and target in the second one)
        - pred_grn_edges: set of predicted grn edges, each with format regulator->target

    Returns:
        - IOU score
    '''
    # Load the gold standard and turn every row into a 'regulator->target' edge
    gold_df = pd.read_csv(gold_file, sep='\t', header=None, names=['Regulator', 'Target'])
    gold_edges = set(gold_df['Regulator'] + '->' + gold_df['Target'])

    # Edges shared by both networks, and edges present in at least one of them
    shared_edges = gold_edges.intersection(pred_grn_edges)
    all_edges = gold_edges.union(pred_grn_edges)
    n_shared = len(shared_edges)
    n_all = len(all_edges)

    # Intersection / Union
    score = n_shared / n_all
    print(f'Union edges count: {n_all}')
    print(f'Intersection edges count: {n_shared}')

    return score

# Score both KO predictions against the KO gold standard.
# The lasso edges are stored in ko_lasso_edges (there is no variable called lasso_edges).
ko_lasso_score = iou_score('../dataset/yeast_networks/gold/MacIsaac2.KO.txt', ko_lasso_edges)
ko_regforest_score = iou_score('../dataset/yeast_networks/gold/MacIsaac2.KO.txt', ko_regforest_edges)

Union edges count: 452886
Intersection edges count: 609
Union edges count: 675914
Intersection edges count: 821


In [127]:
# Report the IOU score computed for each inference method on the KO dataset
print(f'KO Lasso score: {ko_lasso_score}') # Much better than last time in try 1
print(f'KO Regforest score: {ko_regforest_score}') 

KO Lasso score: 0.00134470926458314
KO Regforest score: 0.0012146515680989001
