In [1]:

import pandas as pd
import numpy as np
import os, time 
from remics import cumulants, cures, cuna
import matplotlib.pyplot as plt
import seaborn as sns


### Read data
We use a sample dataset from the TCGA breast cancer study. From this data, we selected a subset of mRNAs, miRNAs, and proteins that are associated with breast cancer.

In [2]:
# Load the feature matrix. The 'ID' column becomes the row index, so only
# feature columns remain in the frame.
df = pd.read_csv('../data/tcga_brca_forcuna.csv').set_index('ID')

# Load the subtype label of each sample (column 'x') and encode it as an
# integer class.
# NOTE(review): y keeps the default positional index while df is indexed by
# 'ID'; presumably both files list samples in the same order -- confirm.
target = pd.read_csv('../data/tcga_subtypes_train.csv')
subtype_codes = {'Basal': 1, 'Her2': 2, 'LumA': 3}
y = target['x'].map(subtype_codes)

# Series.map turns every label that is missing from the dict into NaN.
# Fail loudly here instead of handing NaN class labels to the model.
unmapped_labels = target.loc[y.isna(), 'x'].unique()
assert len(unmapped_labels) == 0, f"Unmapped subtype labels: {list(unmapped_labels)}"

assert df.shape[0] == len(y), f"X has {df.shape[0]} rows but y has {len(y)} labels"
print('X and y dimensions match!\n')

print("Number of individuals: ", df.shape[0])
print("Number of features: ", df.shape[1])
print("\nNumber of each subtype: ", target.groupby('x').size())

X and y dimensions match!

Number of individuals:  180
Number of features:  43

Number of each subtype:  x
Basal    54
Her2     35
LumA     91
dtype: int64



### Computing Cumulants

Prerequisites if computing the cumulants with Julia (i.e. `julia = 1`):

* Install Julia to your environment:

    + `wget https://julialang-s3.julialang.org/bin/linux/x64/1.9/julia-1.9.2-linux-x86_64.tar.gz`

    + `tar zxvf julia-1.9.2-linux-x86_64.tar.gz`

* Export path to Julia:
    + `export PATH="$PATH:/data/shared/user/julia-1.9.2/bin"`
    
* Launch Julia and install packages for computing cumulants:

    ```
    using Pkg
    Pkg.add.(["Cumulants", "NPZ", "LinearAlgebra", "Random", "Statistics"])
    ```




In [3]:
# Compute the cumulants with order = 3 and report the wall-clock time.
# julia=1 runs the computation through the Julia script shipped with remics.
beg_time = time.time()
cumulants_df, vec_df = cumulants.get_cumulants(
    df,
    verbose=1,
    julia=1,
    order=3,
)
elapsed_mins = (time.time() - beg_time) / 60
print("Time spent computing cumulants (mins): ", elapsed_mins)

computing cumulants (Julia)...
running: julia /dccstor/boseukb/conda/sow30/lib/python3.11/site-packages/remics/cumulants.jl /dccstor/boseukb/conda/sow30/lib/python3.11/site-packages/remics/ 3
Command executed successfully.

Time spent computing cumulants (mins):  0.2696907838185628


### Filter significant cumulants

In [4]:
# Keep only the cumulants whose 'P' column (presumably the p-value) is below 0.05.
is_significant = cumulants_df['P'] < 0.05
filt_cumulants_df = cumulants_df.loc[is_significant]

# Keep the rows of vec_df whose 'k' label matches a retained cumulant's index.
significant_keys = list(filt_cumulants_df.index)
filt_vec_df = vec_df.loc[vec_df['k'].isin(significant_keys)]

print("Number of significant redescription groups: ", len(filt_vec_df))

Number of significant redescription groups:  4241


### Print cumulant statistics

In [5]:
# Print three randomly drawn rows of the filtered cumulant table as a quick
# sanity check (the draw is unseeded, so the rows differ between runs).
stats_preview = filt_cumulants_df.sample(n=3)
print("Cumulant statistics \n")
print(stats_preview)
print("\n ------------")


Cumulant statistics 

                                    k_res      Mean    StdDev         Z  \
index                                                                     
CCNA2&LRIG1&hsa-mir-197         -0.161069 -0.003175  0.078330 -2.015763   
KDM4B&hsa-mir-106b&hsa-mir-146a -0.171406 -0.001692  0.076792 -2.210054   
MED13L&NTN4&hsa-mir-106a         0.145065  0.006503  0.068045  2.036307   

                                        P  
index                                      
CCNA2&LRIG1&hsa-mir-197          0.043825  
KDM4B&hsa-mir-106b&hsa-mir-146a  0.027101  
MED13L&NTN4&hsa-mir-106a         0.041720  

 ------------


## CuRES

In [6]:
# Run CuReS on the significant cumulant vectors and report the wall-clock time.
beg_time = time.time()

# Index by 'k' and transpose so that each 'k' label becomes a column.
cures_features = filt_vec_df.set_index('k').T
cures_vec, res = cures.get_cures(cures_features, y, verbose=1, multi_class=True)

elapsed_mins = (time.time() - beg_time) / 60
print("Time spent computing cures (mins): ", elapsed_mins)

**************************************
CuReS prediction statistics
**************************************
-------------------------------------
Model fitting complete
-------------------------------------
F1 score of fitted model on test data:  0.9167844657242742
-------------------------------------
**************************************
Time spent computing cures (mins):  0.04987299839655558


## Computing CuNA (Cumulant-based network analysis)
CuNA returns the following:

1. A dataframe with edges in its rows and the connected vertices in its columns, along with the statistical significance (measured by p-value) from Fisher's exact test.
2. The **count**, or weight, of each edge.
3. A dataframe with rows of varying length (empty fields contain None) holding the community membership of every vertex.
4. A dataframe of node ranks: a score indicating the importance of each vertex across different centrality measures. A lower score means higher importance.

In [7]:
# Build the CuNA network and report the wall-clock time.
beg_time = time.time()

# List of p-value thresholds handed to get_network.
p = [1e-2, 1e-5, 1e-8]

# get_network receives a copy of the full (unfiltered) cumulant table with
# its index moved into a regular column.
# NOTE(review): the meaning of the positional 0 is not visible here -- confirm
# against the remics documentation.
network_input = cumulants_df.reset_index().copy()
interactions, nodes, communities, noderank = cuna.get_network(network_input, 0, p, verbose=0)

elapsed_mins = (time.time() - beg_time) / 60
print("Time spent computing CuNA network (mins): ", elapsed_mins)

{'NTN4': [1, 16, 8, 4, 1], 'LMO4': [2, 10, 9, 5, 5], 'hsa-mir-106a': [3, 6, 10, 6, 3], 'CSRP2': [4, 25, 3, 2, 4], 'C4orf34': [5, 27, 1, 1, 10], 'hsa-mir-186': [6, 3, 17, 14, 2], 'hsa-mir-146a': [7, 9, 14, 13, 6], 'SLC43A3': [8, 11, 11, 7, 16], 'hsa-mir-532': [9, 1, 21, 23, 7], 'SEMA3C': [10, 29, 2, 3, 17], 'hsa-mir-130b': [11, 15, 26, 17, 9], 'PREX1': [12, 22, 4, 8, 13], 'LRIG1': [13, 4, 25, 29, 15], 'hsa-mir-1301': [14, 2, 13, 15], 'TTC39A': [15, 17, 12, 12, 8], 'ZNF552': [16, 13, 16, 18, 19], 'hsa-mir-590': [17, 14, 23, 21, 12], 'hsa-mir-20a': [18, 26, 6, 9, 20], 'hsa-mir-93': [19, 21, 20, 16, 18], 'hsa-mir-197': [20, 5, 22, 26, 14], 'CCNA2': [21, 23, 24, 22, 21], 'hsa-let-7d': [22, 24, 29, 19, 11], 'DTWD2': [23, 19, 19, 24], 'hsa-mir-106b': [24, 18, 32, 25], 'E2F1': [25, 20, 30, 30], 'FUT8': [26, 28, 15, 20], 'hsa-mir-17': [27, 30, 7, 11], 'KDM4B': [28, 8, 31, 32], 'hsa-mir-505': [29, 31, 5, 10], 'ASPM': [30, 7, 28, 31], 'FMNL2': [31, 12, 18, 27], 'MEX3A': [32, 32, 27, 28]}
Time spe

  final_df = pd.concat(appended_df)


In [8]:
# Print four randomly drawn rows of the interaction (edge) table.
print(interactions.sample(n=4))

              v1            v2      pval  count                     edge_pair
90   hsa-mir-186         CCNA2  0.008202      4          (CCNA2, hsa-mir-186)
24   hsa-mir-505        SEMA3C  0.000416     18         (SEMA3C, hsa-mir-505)
1   hsa-mir-106a  hsa-mir-146a  0.000005      4  (hsa-mir-106a, hsa-mir-146a)
16  hsa-mir-146a  hsa-mir-130b  0.000199      3  (hsa-mir-130b, hsa-mir-146a)


In [9]:
# Display the community membership table returned by cuna.get_network.
communities

Unnamed: 0,0,1,2,3,4,5,6,7
Community0,PREX1,KDM4B,CSRP2,hsa-mir-1301,SEMA3C,LRIG1,hsa-mir-505,ZNF552
Community1,FMNL2,hsa-mir-590,FUT8,LMO4,CCNA2,NTN4,ASPM,
Community2,hsa-mir-130b,E2F1,hsa-mir-106b,hsa-mir-532,hsa-let-7d,DTWD2,MEX3A,
Community3,hsa-mir-20a,hsa-mir-146a,hsa-mir-17,hsa-mir-197,C4orf34,hsa-mir-186,,
Community4,SLC43A3,TTC39A,hsa-mir-93,hsa-mir-106a,,,,


In [10]:
# Show four randomly drawn rows of the node-rank table.
noderank.sample(n=4)

Unnamed: 0,Node,Score
8,hsa-mir-532,12.2
1,LMO4,6.2
30,FMNL2,22.0
18,hsa-mir-93,18.8
