In [83]:
%reload_kedro

In [84]:
from loguru import logger

# Data manipulation
import pandas as pd
import numpy as np
from modspy_data.helpers import KnowledgeGraphScores

# Distributed
import dask.dataframe as dd
from dask.distributed import Client
from dask import delayed

In [85]:
# Loading Interaction
protein_interaction_df = catalog.load('string_interactions')
protein_alias_df = catalog.load('string_alias')


In [86]:
list(protein_interaction_df.columns)


[1m[[0m
    [32m'protein1'[0m,
    [32m'protein2'[0m,
    [32m'neighborhood'[0m,
    [32m'neighborhood_transferred'[0m,
    [32m'fusion'[0m,
    [32m'cooccurence'[0m,
    [32m'homology'[0m,
    [32m'coexpression'[0m,
    [32m'coexpression_transferred'[0m,
    [32m'experiments'[0m,
    [32m'experiments_transferred'[0m,
    [32m'database'[0m,
    [32m'database_transferred'[0m,
    [32m'textmining'[0m,
    [32m'textmining_transferred'[0m,
    [32m'combined_score'[0m
[1m][0m

In [87]:
jvl = catalog.load('jvl_scored')

In [88]:
jvl.columns


[1;35mIndex[0m[1m([0m[1m[[0m[32m'QueryGene'[0m, [32m'SuppressorGene'[0m, [32m'go_n_common_ancestors_max'[0m,
       [32m'go_n_common_ancestors_avg'[0m, [32m'go_n_common_ancestors_bma'[0m,
       [32m'go_n_union_ancestors_max'[0m, [32m'go_n_union_ancestors_avg'[0m,
       [32m'go_n_union_ancestors_bma'[0m, [32m'go_batet_max'[0m, [32m'go_batet_avg'[0m,
       [32m'go_batet_bma'[0m, [32m'go_batet_log_max'[0m, [32m'go_batet_log_avg'[0m,
       [32m'go_batet_log_bma'[0m, [32m'go_resnik_max'[0m, [32m'go_resnik_avg'[0m, [32m'go_resnik_bma'[0m,
       [32m'go_resnik_scaled_max'[0m, [32m'go_resnik_scaled_avg'[0m, [32m'go_resnik_scaled_bma'[0m,
       [32m'go_lin_max'[0m, [32m'go_lin_avg'[0m, [32m'go_lin_bma'[0m, [32m'go_jiang_max'[0m,
       [32m'go_jiang_avg'[0m, [32m'go_jiang_bma'[0m, [32m'go_jiang_seco_max'[0m,
       [32m'go_jiang_seco_avg'[0m, [32m'go_jiang_seco_bma'[0m[1m][0m,
      [33mdtype[0m=[32m'object'[0m[1m)[0

In [89]:
# Use all 8 cores
client = Client(processes = True)

In [57]:
protein_alias_df_delayed = delayed(protein_alias_df)

In [61]:
protein_alias_df_future = client.scatter(protein_alias_df_delayed, broadcast=True)

In [78]:

def get_string_protein_id(protein_alias_df, gene_name):
    """Get STRING DB protein id using their alias file

    Arguments:
        protein_alias_df {DataFrame} -- Loaded alias file dataframe
        gene_name {string} -- Gene name

    Returns:
        string -- Returns protein ID. NOTE: STRING DB tag single protein with a gene (as of Jan 2020). That assumption is taken into account here.
    """
    proteins = protein_alias_df[protein_alias_df["alias"].str.match(
        "^"+str(gene_name)+"$", case=False, na=False)]
    if len(proteins.index) == 0:
        logger.debug("NO proteins found for {}".format(gene_name))
        return None
    elif len(proteins.index) > 1:
        logger.debug("Multiple proteins found for {}".format(gene_name))
    return proteins.iloc[0]['protein']


# def get_interaction_score(row, protein_interaction_df, protein_alias_df, col_names=('target_gene_symbol', 'gene_symbol')):
#     target_gene = row[col_names[0]]
#     query_gene = row[col_names[1]]
#     target_protein = get_string_protein_id(protein_alias_df, target_gene)
#     query_protein = get_string_protein_id(protein_alias_df, query_gene)
#     proteins = [target_protein, query_protein]
#     logger.debug(f"{target_gene} - {query_gene}")
#     # empty_interaction = pd.DataFrame(proteins + np.zeros(len(list(protein_interaction_df.columns))-2), columns=protein_interaction_df.columns)
#     empty_interaction = pd.concat([row, pd.Series(proteins + np.zeros(len(list(
#         protein_interaction_df.columns))-2).tolist(), index=protein_interaction_df.columns)])
#     if target_protein and query_protein:
#         interactions = protein_interaction_df[(protein_interaction_df["protein1"].isin(
#             proteins)) & (protein_interaction_df["protein2"].isin(proteins))]
#         if interactions.empty:
#             return empty_interaction
#         interaction_with_top_score = interactions.loc[interactions['combined_score'].idxmax(
#         )].values.tolist()    # Using the interaction that has BEST combined score
#         return pd.concat([row, pd.Series(interaction_with_top_score, index=protein_interaction_df.columns)])
#     else:
#         return empty_interaction
    
def get_interaction_score(row, protein_interaction_df, protein_alias_df, col_names=('target_gene_symbol', 'modifier_gene_symbol')):
    target_gene = row[col_names[0]]
    query_gene = row[col_names[1]]
    target_protein = get_string_protein_id(protein_alias_df, target_gene)
    query_protein = get_string_protein_id(protein_alias_df, query_gene)
    proteins = [target_protein, query_protein]
    logger.debug(f"{target_gene} - {query_gene}")
    # empty_interaction = pd.DataFrame(proteins + np.zeros(len(list(protein_interaction_df.columns))-2), columns=protein_interaction_df.columns)
    empty_interaction = pd.Series(proteins + np.zeros(len(list(
        protein_interaction_df.columns))-2).tolist(), index=protein_interaction_df.columns)
    if target_protein and query_protein:
        interactions = protein_interaction_df[(protein_interaction_df["protein1"].isin(
            proteins)) & (protein_interaction_df["protein2"].isin(proteins))]
        if interactions.empty:
            return empty_interaction
        interaction_with_top_score = interactions.loc[interactions['combined_score'].idxmax(
        )].values.tolist()    # Using the interaction that has BEST combined score
        return pd.Series(interaction_with_top_score, index=protein_interaction_df.columns)
    else:
        return empty_interaction
    
def add_stringdb_partition(df_partition, col_names=('target_gene_symbol', 'modifier_gene_symbol')):
    protein_interaction_df = catalog.load('string_interactions')
    protein_alias_df = catalog.load('string_alias')
    pickle.dumps(protein_interaction_df)
    df_partition[protein_interaction_df.columns] = df_partition.apply(
        lambda row: get_interaction_score(row, protein_interaction_df, protein_alias_df, ('QueryGene', 'SuppressorGene')), 
        axis=1
    )
    return df_partition

# def add_protein_alias(row, protein_alias_df, col_names=('target_gene_symbol', 'gene_symbol')):
#     target_gene = row[col_names[0]]
#     query_gene = row[col_names[1]]
#     target_protein = get_string_protein_id(protein_alias_df, target_gene)
#     query_protein = get_string_protein_id(protein_alias_df, query_gene)
#     proteins = [target_protein, query_protein]
#     return pd.Series(proteins, index=['target_protein', 'query_protein'])

# def add_protein_alias_partition(df_partition, protein_alias_df):
#     df_partition[['target_protein', 'query_protein']] = df_partition.apply(
#         lambda row: add_protein_alias(row, protein_alias_df, ('QueryGene', 'SuppressorGene')), 
#         axis=1
#     )
#     return df_partition

In [64]:

def add_protein_interaction(ddf, protein_interaction_df, protein_alias_df, col_names=('target_gene_symbol', 'gene_symbol')):
    cols = list(protein_interaction_df.columns)
    cols_dtype = {
        'protein1': 'object',
        'protein2': 'object'
    }
    for c in cols:
        cols_dtype[c] = 'float32'
    jvl_int_ddf = ddf.apply(get_interaction_score, 
                          args=(protein_interaction_df, protein_alias_df, col_names),
                          axis=1, meta=cols_dtype)
    jvl_int_ddf.compute()
    return jvl_int

In [82]:
import pickle
protein_interaction_df = catalog.load('string_interactions')
protein_alias_df = catalog.load('string_alias')
pickle.dumps(protein_interaction_df)

In [79]:
jvl_ddf = dd.from_pandas(jvl, npartitions=3)

In [80]:
meta_data = {
        'protein1': 'object',
        'protein2': 'object'
}

jvl_p_ddf = jvl_ddf.map_partitions(add_stringdb_partition, meta=meta_data)
jvl_p_df = jvl_p_ddf.compute()

In [55]:
cols = list(protein_interaction_df.columns)
cols_dtype = {
    'protein1': 'object',
    'protein2': 'object'
}
for c in cols:
    cols_dtype[c] = 'float32'
jvl_df = jvl.apply(add_protein_alias, 
                      args=(protein_alias_df, ('QueryGene', 'SuppressorGene')),
                      axis=1)
# jvl_p_df = jvl_p_ddf.compute()

[32m2023-09-13 18:54:50.279[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_string_protein_id[0m:[36m17[0m - [34m[1mMultiple proteins found for APOE[0m
[32m2023-09-13 18:54:53.519[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_string_protein_id[0m:[36m17[0m - [34m[1mMultiple proteins found for CASP7[0m
[32m2023-09-13 18:54:56.849[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_string_protein_id[0m:[36m17[0m - [34m[1mMultiple proteins found for APOE[0m
[32m2023-09-13 18:55:00.639[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_string_protein_id[0m:[36m17[0m - [34m[1mMultiple proteins found for HBB[0m
[32m2023-09-13 18:55:03.762[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_string_protein_id[0m:[36m17[0m - [34m[1mMultiple proteins found for APOE[0m


In [53]:
cols = list(protein_interaction_df.columns)
cols_dtype = {
    'protein1': 'object',
    'protein2': 'object'
}
for c in cols:
    cols_dtype[c] = 'float32'
jvl_p_ddf = jvl_ddf.apply(add_protein_alias, 
                      args=(protein_alias_df_delayed, ('QueryGene', 'SuppressorGene')),
                      axis=1, meta={
                                    'protein1': 'object',
                                    'protein2': 'object'
                                })
jvl_p_df = jvl_p_ddf.compute()

In [48]:
jvl_int = add_protein_interaction(jvl_ddf, protein_interaction_df, protein_alias_df, col_names=('QueryGene', 'SuppressorGene'))

In [28]:
protein_alias_df[protein_alias_df["alias"].str.match(
        "^"+str('BBS4')+"$", case=False, na=False)]

Unnamed: 0,protein,alias,source
514943,9606.ENSP00000268057,BBS4,BioMart_HUGO
514944,9606.ENSP00000268057,BBS4,Ensembl_EntrezGene
514945,9606.ENSP00000268057,BBS4,Ensembl_HGNC
514946,9606.ENSP00000268057,BBS4,Ensembl_HGNC_symbol
514947,9606.ENSP00000268057,BBS4,Ensembl_UniProt
514948,9606.ENSP00000268057,BBS4,Ensembl_WikiGene
514949,9606.ENSP00000268057,BBS4,KEGG_NAME
514950,9606.ENSP00000268057,BBS4,UniProt_GN_Name
