In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
import src.utils as utils
import ipywidgets
from clustergrammer2 import net
from scipy.stats import zscore

>> clustergrammer2 backend version 0.17.0


In [2]:
import os
# Number of CPUs reported by the OS (os.cpu_count() may return None).
# NOTE(review): no cell visible in this notebook reads CPUS — confirm it is still needed.
CPUS = os.cpu_count()
# Fixed integer kept as a seed-like constant.
# NOTE(review): no visible cell consumes RANDOMSTATE — confirm it is still needed.
RANDOMSTATE = 29
# Folder (relative to the working directory) that holds the .tsv files listed and loaded below.
DATAFOLDER = 'proteomics'

In [3]:
def flip_dict(dict_to_flip):
    """
    Invert a key -> list-of-values mapping into a value -> key mapping.

    When the same value is listed under several keys, the key that is
    encountered first (in the iteration order of dict_to_flip) is kept.

    ----------
    Parameters:
    dict_to_flip: python dictionary object, keys as string or number, values as list

    Return: python dictionary object mapping each listed value to one original key

    Example: old_dict = {'i': ['a', 'b'],
                        'j': ['b', 'c']}
            new_dict = flip_dict(old_dict)
            new_dict
            {'a':'i', 'b':'i', 'c':'j'}
    """
    flipped = {}
    for key, values in dict_to_flip.items():
        for value in values:
            # setdefault leaves an existing entry untouched, so the first key wins.
            flipped.setdefault(value, key)
    return flipped

In [4]:
def return_keyword_item(data, kw, kw_col='Disease', item_col='Proteins'):
    """
    Find the values of kw_col that contain a keyword and collect the items
    listed in item_col on the matching rows.

    Matching is a case-sensitive substring test (`kw in value`). Cells of
    kw_col that are not strings (for example NaN from empty fields) never
    match instead of raising a TypeError.

    Parameters:
    ----------
    data: pandas dataframe (not modified)
    kw: string, keyword
    kw_col: column to look for keyword
    item_col: column to extract items

    Return:
    ----------
    kw_rel: list of the distinct kw_col values containing kw, in order of first appearance
    items_kw_rel: list of the distinct item_col values on the matching rows,
                  in order of first appearance
    """
    kw_values = data[kw_col]
    # A positional boolean mask avoids any index alignment, so frames with a
    # duplicated index (e.g. after DataFrame.explode) are handled correctly.
    is_match = kw_values.map(
        lambda value: isinstance(value, str) and kw in value
    ).to_numpy(dtype=bool)

    kw_rel = kw_values[is_match].unique().tolist()
    items_kw_rel = data.loc[is_match, item_col].unique().tolist()

    return kw_rel, items_kw_rel

### Explore CKG output datasets

#### Display files

In [5]:
# Use the fully qualified option key: on recent pandas releases the short
# pattern 'max_columns' matches more than one option and raises OptionError.
pd.set_option('display.max_columns', 9)

files = [file for file in os.listdir(DATAFOLDER) if '.tsv' in file]
# Pre-select the sixth file when it exists; otherwise fall back to the last
# file (or to no selection when the folder holds no .tsv file) instead of
# failing on an out-of-range index.
default_index = min(5, len(files) - 1) if files else None
w_data = widgets.Dropdown(options=files, index=default_index)

def show_data(file):
    """
    Load one .tsv file from DATAFOLDER into the module-level `data` frame,
    then display its first rows and its shape.

    Parameters:
    ----------
    file: string, file name relative to DATAFOLDER (the dropdown selection)

    Side effects: rebinds the global `data`; when the column selector
    `w_cols` already exists, its options are refreshed with the new columns.
    """
    global data # only here to show-case data for report
    filename = os.path.join(DATAFOLDER, file)
    try:
        data = pd.read_csv(filename, sep='\t', index_col='Sample ID')
    except ValueError:
        # The file has no 'Sample ID' column: load it with the default index.
        data = pd.read_csv(filename, sep='\t')
    try:
        w_cols.options = list(data.columns)
    except NameError:
        # w_cols is created in a later cell; nothing to refresh on the first run.
        pass
    display(data.head())
    print('Data shape:' + str(data.shape))
out = widgets.interactive_output(show_data, controls={'file':w_data})

widgets.VBox([w_data, out])

VBox(children=(Dropdown(index=5, options=('complex_associations.tsv', 'correlation_correlation.tsv', 'Data Mat…

#### Display file headers

In [6]:
w_cols = widgets.SelectMultiple(options=list(data.columns))

def show_selected_proteins(columns):
    """
    Display the first five rows and the summary statistics of the selected
    columns of the module-level `data` frame.

    Parameters:
    ----------
    columns: sequence of column names (the SelectMultiple value); when it is
             empty a hint is printed instead
    """
    if not columns:
        print('Select proteins')
        return
    # Use the argument handed over by interactive_output rather than reading
    # the widget again, and slice the frame only once.
    selected = data[list(columns)]
    display(selected[:5])
    print(selected.describe())

out_sel = widgets.interactive_output(show_selected_proteins, {'columns': w_cols})
widgets.VBox([w_cols, out_sel])

VBox(children=(SelectMultiple(options=('annotation', 'identifier', 'source', 'group'), value=()), Output()))

### Load output data

In [7]:
f0 = 'drug_associations.tsv'
f1 = 'complex_associations.tsv'
f2 = 'disease_associations.tsv'
f3 = 'pathway annotation.tsv'
f4 = 'pathway_enrichment_Pathways_regulation_enrichment.tsv'
f5 = 'regulation table.tsv'
f6 = 'processed.tsv'
f7 = 'original.tsv'
f8 = 'regulated.tsv'

# Use the fully qualified option key: the short pattern 'max_column' can match
# more than one option on recent pandas releases and then raises OptionError.
pd.set_option('display.max_columns', 12)
file_names = [f0, f1, f2, f3, f4, f5, f6, f7, f8]

# Every file is read as a tab-separated table from DATAFOLDER, in file_names order.
file_paths = [os.path.join(DATAFOLDER, file_name) for file_name in file_names]
files_data = [pd.read_csv(file_path, sep='\t') for file_path in file_paths]

# files_data[8] ('regulated.tsv') is loaded but not bound to a name of its own.
(data_drug, data_complex, data_disease, data_pathway,
 data_pathway_en, data_reg, data_proc, data_raw) = files_data[:8]
# Rows of the regulation table whose 'rejected' column is True.
data_regsig = data_reg[data_reg.rejected]

### Druggable proteins

In [8]:
# One row per protein: split the ';'-separated Proteins field, then explode the lists.
df = data_drug.assign(Proteins=data_drug.Proteins.str.split(';')).explode('Proteins')

# Distinct protein identifiers found in the drug association table.
druggable_proteins = df['Proteins'].unique().tolist()

### Disease associated proteins
- 'liver'
- 'alcohol'
- 'diabetes'
- 'kidney'
- 'cardiovascular'
- 'metabolic'

In [9]:
# One row per protein: split the ';'-separated Proteins field, then explode the lists.
df = data_disease.assign(Proteins=data_disease.Proteins.str.split(';')).explode('Proteins')

kws = ['liver', 'alcohol', 'diabetes', 'kidney', 'cardiovascular', 'metabolic']

# keyword -> matching disease names, and keyword -> proteins on the matching rows
kw_disease_dict = {}
disease_prot_dict = {}
for kw in kws:
    kw_disease_dict[kw], disease_prot_dict[kw] = return_keyword_item(data=df, kw=kw)

# protein -> '<keyword> disease'; a protein matched by several keywords keeps
# the keyword that comes first in kws (flip_dict keeps the first key seen).
pdi_dict = {
    prot: first_kw + ' disease'
    for prot, first_kw in flip_dict(disease_prot_dict).items()
}

In [10]:
w_data = widgets.Dropdown(options=kws)

def show_disease(kw):
    """Display the disease names recorded for keyword kw in kw_disease_dict."""
    display(kw_disease_dict[kw])

def show_protein(kw):
    """Print how many proteins are recorded for keyword kw and display the first 20."""
    proteins = disease_prot_dict[kw]
    summary = 'Number of proteins associated with {} disease: {}'.format(kw, len(proteins))
    # Blank line before and after the summary.
    print('', summary, '', sep='\n')
    display(proteins[:20])

# Both outputs are driven by the same keyword dropdown.
out1, out2 = (
    widgets.interactive_output(view, controls={'kw': w_data})
    for view in (show_disease, show_protein)
)

widgets.VBox([w_data, out1, out2])

VBox(children=(Dropdown(options=('liver', 'alcohol', 'diabetes', 'kidney', 'cardiovascular', 'metabolic'), val…

### Protein complexes

In [11]:
# One row per (complex, protein), ordered by the num_proteins column, largest first.
df = (
    data_complex
    .assign(Proteins=data_complex.Proteins.str.split(';'))
    .explode('Proteins')
    .sort_values(by='num_proteins', ascending=False)
)

In [12]:
comps = df.Complex.unique().tolist()

# complex -> distinct proteins, grouped on the exact complex name. A substring
# search (as done by return_keyword_item) would also pull in the proteins of
# every complex whose name merely contains this complex's name.
# sort=False keeps the complexes in their order of first appearance in df.
comp_prot_dict = {
    comp: members.unique().tolist()
    for comp, members in df.groupby('Complex', sort=False)['Proteins']
}

# protein -> complex; a protein found in several complexes keeps the first one.
pcom_dict = flip_dict(comp_prot_dict)

### Pathway enrichment

In [13]:
# One row per (pathway term, identifier), ordered by the foreground column, largest first.
df = (
    data_pathway_en
    .assign(identifiers=data_pathway_en.identifiers.str.split(','))
    .explode('identifiers')
    .sort_values(by='foreground', ascending=False)
)

In [14]:
pathways = df.terms.unique().tolist()

# pathway term -> distinct identifiers, grouped on the exact term. A substring
# search (as done by return_keyword_item) would also pull in the identifiers of
# every pathway whose name merely contains this term.
# sort=False keeps the terms in their order of first appearance in df.
pw_prot_dict = {
    pathway: members.unique().tolist()
    for pathway, members in df.groupby('terms', sort=False)['identifiers']
}

# identifier -> pathway; an identifier found in several pathways keeps the first one.
ppw_dict = flip_dict(pw_prot_dict)

# Pathway-centric prioritization

### Regulated pathway

In [15]:
# Number of distinct identifiers among the rejected rows / in the whole regulation table.
nr_of_sig_prot = len(data_regsig.identifier.unique())
nr_of_total_prot = len(data_reg.identifier.unique())

def func_fold(x):
    """Fold enrichment: foreground share of the significant set divided by the
    share of (foreground + background) in the full set."""
    return (x.foreground/nr_of_sig_prot)/((x.foreground + x.background)/nr_of_total_prot)

def func_perc(x):
    """Fraction of foreground among foreground + background."""
    return x.foreground/(x.foreground + x.background)

df = data_pathway_en.copy()
df = df.assign(fold_enrich = func_fold, percentage = func_perc)

df = df.assign(new_pvalue = lambda x: -np.log10(x.padj))

# Keep the rejected rows only and drop the helper columns. `columns=` is used
# because the positional axis argument of DataFrame.drop was removed in pandas 2.0.
df = (
    df[df.rejected]
    .drop(columns=['identifiers', 'background', 'pvalue', 'padj', 'rejected'])
    .set_index('terms')
)
# Column-wise z-score so the four measures share one scale.
df = df.apply(zscore)
df = df.rename(columns={'new_pvalue':'-Log10 padj', 'foreground':'Nr. of proteins', 'fold_enrich':'fold enrichment'})

df_cluster_pw = df.copy()
print(df_cluster_pw.shape)

(69, 4)


### Clustergrammer2 visualization

In [16]:
# Show the z-scored pathway table with the clustergrammer2 `net` object.
# NOTE: this rebinds the module-level `data` that the file browser cells
# (show_data / show_selected_proteins) also read and write.
data = df_cluster_pw
# The frame must be loaded before clustering, and the widget call stays last
# so that it is the cell's displayed output.
net.load_df(data)
net.cluster()
net.widget()

CGM2(network='{"row_nodes": [{"name": "Regulation of actin dynamics for phagocytic cup formation", "ini": 69, …

# Protein-centric prioritization

### Regulated proteins

In [17]:
# Distinct identifiers of the rows flagged as rejected in the regulation table.
protein_sig = data_reg.loc[data_reg.rejected, 'identifier'].unique().tolist()

# Long format: one row per (group, sample, subject, protein) with its intensity value.
data_proc_pv = data_proc.melt(
    id_vars=['group', 'sample', 'subject'],
    var_name='Protein ID',
    value_name='Log2 intensity',
)

# Median intensity per protein and group, one column per group, significant proteins only.
group_medians = (
    data_proc_pv
    .groupby(['Protein ID', 'group'])['Log2 intensity']
    .median()
    .unstack()
)
data_sig_median = group_medians.loc[protein_sig]
# Relabels the group columns positionally; this requires exactly five groups.
data_sig_median.columns = ['F0', 'F1', 'F2', 'F3', 'F4']

### Disease progression incremental matrix

In [18]:
df = data_sig_median.copy()
# Differences between neighbouring columns (F1-F0, F2-F1, F3-F2, F4-F3). The
# F0 column of the diff is all NaN and is dropped. `columns=` is used because
# the positional axis argument of DataFrame.drop was removed in pandas 2.0.
df_diff = df.diff(axis=1).drop(columns='F0')
df_diff.columns = ['i'+ str(i) for i in range(1,5)]
# Cumulative differences relative to F0.
df_diff['i5'] = df.F2 - df.F0
df_diff['i6'] = df.F3 - df.F0
df_diff['i7'] = df.F4 - df.F0
# The medians come from the 'Log2 intensity' values, so 2 ** diff - 1 turns each
# log2 difference into a relative change (ratio minus one).
df_diff = 2 ** df_diff - 1

### Further filtering

In [19]:
# No extra filtering is applied at the moment: df_fil is a plain copy of df_diff.
df = df_diff.copy()
# Optional filter kept for reference — uncomment to restrict the rows to the druggable proteins:
#df = df.reindex(druggable_proteins)
df_fil = df.copy()

### Normalization 

In [20]:
# Column-wise z-score: DataFrame.apply hands each column to scipy.stats.zscore.
# NOTE(review): df_fil is overwritten with its own transform, so the unscaled
# values are only available by re-running the filtering cell — confirm this is acceptable.
df_fil = df_fil.apply(zscore)

### Add categories

In [21]:
df = df_fil.copy()
rows = df.index.tolist()

# A set gives constant-time membership tests; checking every row against the
# druggable_proteins list would scan the whole list each time.
druggable_set = set(druggable_proteins)

# Each row label becomes (protein, 'Druggable: ..', 'Disease: ..', 'Complex: ..', 'Pathway: ..');
# proteins without an annotation get 'NA' for that category.
label1 = [[x, 'Druggable: +' if x in druggable_set else 'Druggable: NA'] for x in rows]
label2 = [['Disease: ' + pdi_dict.get(x, 'NA')] for x in rows]
label3 = [['Complex: ' + pcom_dict.get(x, 'NA')] for x in rows]
label4 = [['Pathway: ' + ppw_dict.get(x, 'NA')] for x in rows]
new_index = [
    tuple(name_drug + disease + complex_ + pathway)
    for name_drug, disease, complex_, pathway in zip(label1, label2, label3, label4)
]

df_cat = df.copy()
df_cat.index = new_index

### Clustergrammer2 visualization

In [22]:
# Rows ordered by the 'i2' column, largest first, then handed to clustergrammer2.
data = df_cat.sort_values(by='i2', ascending=False)
net.load_df(data)
cats = ['Druggable', 'Disease', 'Complex', 'Pathway']

# Category positions are passed 1-based, in the order the labels were appended
# to the row tuples; 'NA' entries are coloured white.
for cat_position, cat in enumerate(cats, start=1):
    net.set_cat_colors(cat_colors={'NA':'white'}, axis=0, cat_index=cat_position, cat_title=cat)

net.cluster()
net.widget()

CGM2(network='{"row_nodes": [{"name": "CSPG4~Q6UVK1", "ini": 1068, "clust": 885, "rank": 1067, "rankvar": 1067…