# Gene Regulatory Network Prediction of Yeast Network
The task:
Predict which transcription factors affects genes (For now this is non-tf)

In [1]:
# Import stuff
import pandas as pd
import numpy as np
import grn_preds

In [2]:
# Get expression data
ko_df = pd.read_csv('../dataset/yeast_networks/expression/KO.txt', sep='\t').set_index('Gene')
nv_df = pd.read_csv('../dataset/yeast_networks/expression/NatVar.txt', sep='\t').set_index('Gene')
stress_df = pd.read_csv('../dataset/yeast_networks/expression/Stress.txt', sep='\t').set_index('Name')

# Get Transcription Factors
ko_tf = pd.read_csv('../dataset/yeast_networks/expression/KO_TF_names.txt', sep='\t', header=None)
ko_tf.columns = ['TF']

nv_tf = pd.read_csv('../dataset/yeast_networks/expression/NatVar_TF_names.txt', sep='\t', header=None)
nv_tf.columns = ['TF']

stress_tf = pd.read_csv('../dataset/yeast_networks/expression/Stress_TF_names.txt', sep='\t', header=None)
stress_tf.columns = ['TF']

# Note that there are different gene list for each df, so we will use ko as test for now.

In [3]:
# Generate gold dataset / gold truth vector
ko_gold = grn_preds.generate_gold_dataset(['../dataset/yeast_networks/gold/MacIsaac2.KO.txt', 
                                 '../dataset/yeast_networks/gold/YEASTRACT_Type2.KO.txt', 
                                 '../dataset/yeast_networks/gold/YEASTRACT_Count3.KO.txt'])

In [4]:
# Start Pipeline: Generate existing truth edges
ko_main_df = grn_preds.generate_possible_edges(ko_tf.loc[:, 'TF'], ko_df)
ko_main_df = grn_preds.populate_actual_column(ko_main_df, ko_gold)

In [10]:
# Use only 10 random target genes in the ko_df, then use all TF to predict
# Get 10 random target genes
sampled_target = list(ko_df.sample(30).index)

# Get ground truth for these target genes
sampled_gold_df = ko_gold[ko_gold.loc[:, 'Target'].isin(sampled_target)]
sampled_ko_main_df = ko_main_df[ko_main_df.loc[:, 'Target'].isin(sampled_target)]

# Populate actual columns
sampled_ko_main_df = grn_preds.populate_actual_column(sampled_ko_main_df, ko_gold)

# Predict
for target in sampled_target:
    print(target)
    lasso_edges, lasso_scores = grn_preds.grn_lasso(target, ko_tf.loc[:, 'TF'], ko_df, alphas = [0.001, 0.01, 0.1, 1, 10, 100], cv = 5)
    rf_edges, rf_scores = grn_preds.grn_regforest(target, ko_tf.loc[:, 'TF'], ko_df, n_estimators = 100, max_depth = 8, bootstrap = True, min_samples_leaf = 10, n_jobs = -1)
    svr_lin_edges, svr_lin_scores = grn_preds.grn_svr(target, ko_tf.loc[:, 'TF'], ko_df, kernel='linear')
    
    print(len(svr_lin_edges), len(svr_lin_scores[0]), svr_lin_scores)
    
#     svr_rbf_edges, svr_rbf_scores = grn_preds.grn_svr(target, ko_tf.loc[:, 'TF'], ko_df, kernel='rbf')
#     svr_poly_edges, svr_poly_scores = grn_preds.grn_svr(target, ko_tf.loc[:, 'TF'], ko_df, kernel='poly')
    
    
    sampled_ko_main_df.loc[lasso_edges, 'Lasso scores'] = abs(lasso_scores)
    sampled_ko_main_df.loc[rf_edges, 'Regforest scores'] = rf_scores
    sampled_ko_main_df.loc[svr_lin_edges, 'SVR Linear scores'] = abs(svr_lin_scores)
#     sampled_ko_main_df.loc[svr_rbf_edges, 'SVR RBF scores'] = abs(svr_rbf_scores)
#     sampled_ko_main_df.loc[svr_poly_edges, 'SVR Polynomial scores'] = abs(svr_poly_scores)

YLR435W




536 1 [[ 2.51512689e-02 -6.68011278e-02  2.27750396e-02 -6.57942463e-02
  -2.28454243e-02  1.85942824e-02  1.51836178e-03 -5.35802438e-02
   3.85968764e-02  5.12785125e-02  3.37142442e-02  1.73623956e-02
   4.66667502e-02  5.29972716e-04 -2.91525280e-02 -2.00973568e-02
  -2.95089657e-02 -3.37428537e-03  1.02105387e-02 -8.35292921e-02
  -1.33886288e-02 -1.07317543e-02  2.12252175e-02 -3.26371078e-02
   3.69213052e-02 -3.19747013e-02 -5.88516274e-03 -3.31670721e-02
  -3.18693461e-02  3.36239930e-02  5.43829112e-02 -2.82608482e-03
   5.99375232e-02  2.84503006e-02  1.72651869e-03  4.13089765e-02
   3.80490766e-02  5.98629559e-02  1.02972314e-01  1.93297066e-02
   2.26959344e-02  3.43531882e-03 -2.20977798e-02 -7.93194506e-03
  -5.07948282e-02  4.99458115e-03  2.62752448e-02 -2.44729247e-02
   7.20334787e-02  6.24496511e-03  2.82096181e-02 -7.61441781e-03
   3.57888308e-02 -1.93614571e-02  2.68180827e-02  2.69021864e-02
  -4.77681891e-02  1.51647453e-02 -1.75059401e-02  6.60025267e-02
  -5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


ValueError: Must have equal len keys and value when setting with an ndarray

In [None]:
# Generate auroc for lasso and regforest
grn_preds.generate_auroc(sampled_ko_main_df.loc[:, 'Actual'], sampled_ko_main_df.loc[:, 'Lasso scores'], 'AUC Curve for Lasso')
grn_preds.generate_auroc(sampled_ko_main_df.loc[:, 'Actual'], sampled_ko_main_df.loc[:, 'Regforest scores'], 'AUC Curve for Regression Forest (GENIE3)')
grn_preds.generate_auroc(sampled_ko_main_df.loc[:, 'Actual'], sampled_ko_main_df.loc[:, ''], 'AUC Curve for Linear SVR')