## Libraries & Modules

In [1]:
# Kedro IPython line magic. NOTE(review): the `catalog` object used by later cells
# is presumably injected into the namespace by this magic — confirm.
%reload_kedro

In [2]:
from loguru import logger
import string
from rich import print

# Data manipulation
import pandas as pd
import numpy as np
from modspy_data.helpers import KnowledgeGraphScores

# Distributed
import dask.dataframe as dd
from dask.distributed import Client, progress, performance_report
from dask_jobqueue import SLURMCluster
from dask import delayed
import dask

#### SLURM cluster setup

In [3]:
# Keyword arguments handed to SLURMCluster, gathered in one place so they are easy to tune.
worker_spec = dict(
    cores=1,
    processes=1,
    memory="8GB",
    account="def-mtarailo_cpu",
    walltime="00:10:00",
    log_directory='./logs',
)

cluster = SLURMCluster(**worker_spec)
# Distributed client bound to the cluster created above.
client = Client(cluster)
# Last expression: render the cluster widget.
cluster

Tab(children=(HTML(value='<div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-outpu…

In [4]:
# Ask the cluster for a single worker.
cluster.scale(1)

In [6]:
# Shell escape: list the queued/running SLURM jobs to check that the dask worker job started.
# NOTE(review): `sq` looks like a site-specific command of the cluster — confirm it exists where this runs.
!sq

            JOBID     USER              ACCOUNT           NAME  ST  TIME_LEFT NODES CPUS TRES_PER_N MIN_MEM NODELIST (REASON) 
         21653141    rahit     def-mtarailo_cpu    dask-worker   R       9:52     1    1        N/A      8G gra316 (None) 
         21648366    rahit     def-mtarailo_cpu    interactive   R    7:18:01     1    1        N/A     12G gra149 (None) 


## Loading STRINGDB data

In [7]:
# `ppi`       : STRING interaction table.
# `ppi_alias` : gene-protein map/alias table, needed because the interactions
#               are expressed with STRING-DB IDs.
ppi, ppi_alias = (
    catalog.load(entry)
    for entry in ('string_interactions', 'string_alias')
)

In [8]:
# Inspect the column names of the interaction table.
ppi.columns

Index(['protein1', 'protein2', 'neighborhood', 'neighborhood_transferred',
       'fusion', 'cooccurence', 'homology', 'coexpression',
       'coexpression_transferred', 'experiments', 'experiments_transferred',
       'database', 'database_transferred', 'textmining',
       'textmining_transferred', 'combined_score'],
      dtype='object')

## Loading Scored Dataset

Scores are collected by running the Kedro pipeline. The pipeline is run in the following way:

```bash
kedro run --runner="modspy_data.runner.SLURMRunner"
```

Only GO and PO scores are used for the analysis. zyg-1 does not have any DO scores, so DO is dropped.

In [9]:
# Each dataset is loaded as a (GO scores, phenotype scores) pair.
# jvl = catalog.load('jvl_scored')
jvl_go, jvl_po = catalog.load('jvl_scored@pandas'), catalog.load('jvl_scored_pheno')

olida_go, olida_po = catalog.load('olida_scored'), catalog.load('olida_scored_pheno')

# zyg-1 (MTG) additionally has a raw table and a table of true modifiers.
mtg_raw = catalog.load('zyg1_raw')
mtg_go, mtg_po = catalog.load('zyg1_scored'), catalog.load('zyg1_scored_pheno')
true_mods = catalog.load('zyg1_truemods')

# Take unique pairs

In [10]:
# One entry per dataset: (heading to print, (GO frame, phenotype frame), gene-pair columns).
dedup_plan = [
    ("JVL scores", (jvl_go, jvl_po), ['QueryGene', 'SuppressorGene']),
    ("\nOLIDA scores", (olida_go, olida_po), ['gene_a', 'gene_b']),
    ("\nMTG scores", (mtg_go, mtg_po), ['gene_symbol', 'target_gene_symbol']),
]

for heading, score_frames, pair_cols in dedup_plan:
    print(heading)
    # Shapes before de-duplication
    for frame in score_frames:
        print(frame.shape)
    # De-duplicate in place so the notebook-level names refer to the trimmed frames.
    for frame in score_frames:
        frame.drop_duplicates(subset=pair_cols, inplace=True)
    print('After removing dups')
    # Shapes after de-duplication
    for frame in score_frames:
        print(frame.shape)

Consolidate all scores from multiple dataframes into a single dataframe <b>for each dataset</b>.

In [11]:
# Left-join the phenotype scores onto the GO scores of each dataset,
# matching rows on that dataset's gene-pair columns.
_jvl = pd.merge(jvl_go, jvl_po, how='left', on=['QueryGene', 'SuppressorGene'])
_olida = pd.merge(olida_go, olida_po, how='left', on=['gene_a', 'gene_b'])
_mtg = pd.merge(mtg_go, mtg_po, how='left', on=['gene_symbol', 'target_gene_symbol'])

In [12]:
# Show the column names of each consolidated frame for comparison.
for consolidated in (_jvl, _olida, _mtg):
    print(consolidated.columns)

The column names differ between the datasets: the phenotype score columns are prefixed with `PO` in some and with `HP` in others.

In [13]:
# Harmonise the gene-pair column names: every frame ends up with
# 'target_gene_symbol' and 'modifier_gene_symbol'.
pair_column_maps = [
    (_jvl, {'QueryGene': 'target_gene_symbol', 'SuppressorGene': 'modifier_gene_symbol'}),
    (_olida, {'gene_b': 'target_gene_symbol', 'gene_a': 'modifier_gene_symbol'}),
    (_mtg, {'gene_symbol': 'modifier_gene_symbol'}),
]

for frame, column_map in pair_column_maps:
    frame.rename(columns=column_map, inplace=True)
    # A leading 'HP' or 'PO' is replaced by the common prefix 'pheno'.
    frame.columns = frame.columns.str.replace(r'^(HP|PO)', 'pheno', regex=True)

for frame in (_jvl, _olida, _mtg):
    print(frame.columns)

# Labeling Dataset

In [14]:
# Disabled step, kept for reference: labelling MTG pairs from the true-modifier table.
# true_mods.rename(columns={"strain": "Sample", "true_modifier": "modifier_gene_symbol"}, inplace=True)
# true_mods['is_modifier'] = 1
# _mtg = pd.merge(_mtg, true_mods, how='left', on=['modifier_gene_symbol']).fillna(0)
# # _mtg.rename(columns={"is_modifier_y": "is_modifier"}, inplace=True)

# Every JVL pair is labelled as a modifier.
_jvl['is_modifier'] = 1

# Tag each frame with the name of the dataset it came from.
for frame, source_tag in ((_mtg, 'MTG'), (_jvl, 'JVL'), (_olida, 'OLIDA')):
    frame['datasource'] = source_tag

In [15]:
# Inspect the MTG column names after renaming and labelling.
_mtg.columns

Index(['modifier_gene_symbol', 'target_gene_symbol',
       'go_n_common_ancestors_max', 'go_n_common_ancestors_avg',
       'go_n_common_ancestors_bma', 'go_n_union_ancestors_max',
       'go_n_union_ancestors_avg', 'go_n_union_ancestors_bma', 'go_batet_max',
       'go_batet_avg', 'go_batet_bma', 'go_batet_log_max', 'go_batet_log_avg',
       'go_batet_log_bma', 'go_resnik_max', 'go_resnik_avg', 'go_resnik_bma',
       'go_resnik_scaled_max', 'go_resnik_scaled_avg', 'go_resnik_scaled_bma',
       'go_lin_max', 'go_lin_avg', 'go_lin_bma', 'go_jiang_max',
       'go_jiang_avg', 'go_jiang_bma', 'go_jiang_seco_max',
       'go_jiang_seco_avg', 'go_jiang_seco_bma', 'is_modifier',
       'pheno_n_common_ancestors_max', 'pheno_n_common_ancestors_avg',
       'pheno_n_common_ancestors_bma', 'pheno_n_union_ancestors_max',
       'pheno_n_union_ancestors_avg', 'pheno_n_union_ancestors_bma',
       'pheno_batet_max', 'pheno_batet_avg', 'pheno_batet_bma',
       'pheno_batet_log_max', 'pheno_bat

In [16]:
# Adding interaction to MTG from previously computed/collected dataset.
#
# The cell is written so that it can be re-run: `_mtg` is rebound to the merged
# result, so on a second run the interaction columns would already be present and
# a plain merge would produce `_x`/`_y` suffixed duplicates. Stale interaction
# columns are therefore dropped before merging (a no-op on the first run).
int_cols = ['neighborhood', 'neighborhood_transferred', 'fusion', 'cooccurence', 'homology',
            'coexpression', 'coexpression_transferred', 'experiments', 'experiments_transferred',
            'database', 'database_transferred', 'textmining', 'textmining_transferred',
            'combined_score']
pair_keys = ['modifier_gene_symbol', 'target_gene_symbol']

# Keep one row per gene pair so the left join below cannot multiply rows of `_mtg`.
mtg_raw = mtg_raw.drop_duplicates(subset=['target_gene_symbol', 'gene_symbol'], keep='first')
print(_mtg.shape)

# Give the right-hand table the same key names as `_mtg`, so no helper column
# ('gene_symbol') has to be dropped after the merge.
mtg_interactions = (
    mtg_raw[['gene_symbol', 'target_gene_symbol'] + int_cols]
    .rename(columns={'gene_symbol': 'modifier_gene_symbol'})
)

_mtg = (
    _mtg
    .drop(columns=int_cols, errors='ignore')
    .merge(mtg_interactions, how='left', on=pair_keys)
)
# The row count should match the shape printed before the merge.
print(_mtg.shape)

## Add interaction score from STRING-DB

In [17]:
# Stack the JVL and OLIDA pairs; values missing after the concat are set to 0.
merged_df = pd.concat([_jvl, _olida], ignore_index=True).fillna(0)

# Normalise the textual columns: lowercase, then delete every ASCII punctuation character.
# The earlier regex '[<punctuation>]-' required a hyphen *after* a punctuation
# character, so symbols such as 'zyg-1' were left untouched. A translation
# table removes the characters directly and needs no regex escaping.
strip_punctuation = str.maketrans('', '', string.punctuation)

merged_df['modifier_gene_symbol_norm'] = (
    merged_df['modifier_gene_symbol'].str.lower().str.translate(strip_punctuation)
)
merged_df['target_gene_symbol_norm'] = (
    merged_df['target_gene_symbol'].str.lower().str.translate(strip_punctuation)
)
ppi_alias['alias_norm'] = ppi_alias['alias'].str.lower().str.translate(strip_punctuation)

# Removing duplicates
merged_df = merged_df.drop_duplicates(subset=['target_gene_symbol', 'modifier_gene_symbol'], keep='first')
# errors='ignore' keeps the cell re-runnable: after the first run `ppi_alias`
# no longer has a 'source' column.
ppi_alias = (
    ppi_alias
    .drop_duplicates(subset='alias_norm', keep='first')
    .drop(columns='source', errors='ignore')
)

In [18]:
# Wrap the pandas frame in a dask DataFrame (two partitions) for the merges that follow.
merged_ddf = dd.from_pandas(merged_df, npartitions=2)

In [19]:
# Options shared by both alias look-ups: keep every gene pair and match on the normalised alias.
alias_lookup = dict(how='left', right_on='alias_norm')

# Alias columns for the modifier gene (they carry the `_x` suffix once the second merge is done)
_merged_df = merged_ddf.merge(ppi_alias, left_on='modifier_gene_symbol_norm', **alias_lookup)
print(_merged_df.columns)
# Alias columns for the target gene (`_y` suffix)
_merged_df = _merged_df.merge(ppi_alias, left_on='target_gene_symbol_norm', **alias_lookup)
print(_merged_df.columns)

Before adding the PPI data (by merge), make sure the types are correct and the column names are consistent.

In [20]:
# Columns produced by the two alias merges that must be plain strings before the PPI join.
alias_text_cols = ['protein_x', 'protein_y', 'alias_x', 'alias_y', 'alias_norm_x', 'alias_norm_y']

# _merged_df = _merged_df.fillna(0)
# Missing values in these columns become empty strings.
_merged_df = _merged_df.fillna({text_col: '' for text_col in alias_text_cols})

# Explicitly cast the same columns to string type
for text_col in alias_text_cols:
    _merged_df[text_col] = _merged_df[text_col].astype(str)

# Adding StringDB interaction score datasource: left join on the protein pair
_merged_df = _merged_df.merge(
    ppi,
    how='left',
    left_on=['protein_x', 'protein_y'],
    right_on=['protein1', 'protein2'],
)

# Evaluate the dask graph
_merged_df_computed = _merged_df.compute()
print(_merged_df_computed.columns)
# null_counts = _merged_df.isnull().sum().compute()
# print(null_counts)

## Keep only the relevant columns

In [21]:
# Assemble the final dataset: identifiers + feature columns + label, for every data source.
#
# The stray `from re import T` that used to open this cell was removed: `T` was
# never used, and the name no longer exists in `re` on Python 3.13, where the
# import raises ImportError.

# Feature names are `<ontology>_<measure>_<aggregation>`, followed by the interaction scores.
score_prefixes = ('go', 'pheno')
similarity_measures = (
    'n_common_ancestors', 'n_union_ancestors', 'batet', 'batet_log',
    'resnik', 'resnik_scaled', 'lin', 'jiang', 'jiang_seco',
)
aggregations = ('max', 'avg', 'bma')
interaction_cols = [
    'neighborhood', 'neighborhood_transferred', 'fusion', 'cooccurence', 'homology',
    'coexpression', 'coexpression_transferred', 'experiments', 'experiments_transferred',
    'database', 'database_transferred', 'textmining', 'textmining_transferred',
    'combined_score',
]

feat_cols = [
    f'{prefix}_{measure}_{agg}'
    for prefix in score_prefixes
    for measure in similarity_measures
    for agg in aggregations
] + interaction_cols
# 2 ontologies x 9 measures x 3 aggregations + 14 interaction scores
assert len(feat_cols) == 68, "unexpected number of feature columns"

target_attr = 'is_modifier'
id_cols = ['datasource', 'modifier_gene_symbol', 'target_gene_symbol']
dataset_cols = id_cols + feat_cols + [target_attr]

print(_merged_df_computed[dataset_cols].shape)
print(_mtg.shape)

# Both inputs are restricted to the same columns so that stray columns in `_mtg`
# cannot leak into the saved dataset. `reindex` (rather than `[...]`) keeps the
# previous tolerance for columns missing from `_mtg`: they are created empty and
# then filled with 0 like any other missing value.
dataset_df = pd.concat(
    [_merged_df_computed[dataset_cols], _mtg.reindex(columns=dataset_cols)],
    ignore_index=True,
).fillna(0)
display(dataset_df.head())
print(dataset_df.shape)

Unnamed: 0,datasource,modifier_gene_symbol,target_gene_symbol,go_n_common_ancestors_max,go_n_common_ancestors_avg,go_n_common_ancestors_bma,go_n_union_ancestors_max,go_n_union_ancestors_avg,go_n_union_ancestors_bma,go_batet_max,...,coexpression,coexpression_transferred,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score,is_modifier
0,JVL,ETV1,ATR,59.0,2.960317,6.109155,101.0,28.846782,70.535211,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,JVL,DCLRE1C,BRCA1,22.0,4.16309,7.260417,75.0,29.659375,46.791667,1.0,...,80.0,0.0,292.0,0.0,540.0,0.0,321.0,83.0,779.0,1.0
2,JVL,SMARCAL1,BRCA1,45.0,5.598765,10.268041,87.0,31.152206,59.731959,1.0,...,49.0,62.0,0.0,0.0,0.0,0.0,506.0,204.0,602.0,1.0
3,JVL,TRIP13,BRCA1,43.0,4.51511,11.19802,128.0,39.033929,86.455446,1.0,...,243.0,110.0,0.0,0.0,0.0,0.0,285.0,95.0,505.0,1.0
4,JVL,MUS81,BRCA2,29.0,4.599469,9.086957,88.0,28.369439,55.318841,1.0,...,0.0,61.0,0.0,0.0,500.0,0.0,768.0,350.0,919.0,1.0


In [24]:
# Number of rows contributed by each data source.
print(dataset_df['datasource'].value_counts())

In [23]:
# Persist the combined dataset under the catalog entry 'modifiers'.
catalog.save('modifiers', dataset_df)