# NMF lab

### Task (45 minutes) 

Data: MOFA's CLL dataset

1. Load and prepare the data, then perform NMF on the joint representation of the methylation and mRNA datasets.
2. Assume that there are two cancer subtypes and cluster them :)
3. Find the driving features and verify their functionality.

Optional:
- Observe convergence
- Use the other omics tables as well
- Compare with the results from the MOFA analysis
https://bioconductor.riken.jp/packages/3.9/bioc/vignettes/MOFA/inst/doc/MOFA_example_CLL.html

Questions:


Code:
- The lab example uses an NMF implementation in Python (scikit-learn):
    - https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html#sklearn.decomposition.NMF
- For R, feel free to use this package:
    - https://cran.r-project.org/web/packages/NMF/index.html

In [1]:
import pandas as pd

data_loc = "./data/"

# Each CSV stores one feature per row and one sample per column.
# For both tables: read, drop every sample column that contains a missing
# value, then transpose so rows are samples and columns are features.
df_meth = (
    pd.read_csv(data_loc + "CLL_data_Methylation.csv", index_col=0)
    .dropna(axis='columns')
    .T
)
df_mrna = (
    pd.read_csv(data_loc + "CLL_data_mRNA.csv", index_col=0)
    .dropna(axis='columns')
    .T
)

In [2]:
# Preview the first five samples of the signed methylation table (samples x CpG sites).
df_meth.head(n=5)

Unnamed: 0,cg10146935,cg26837773,cg17801765,cg13244315,cg06181703,cg19626656,cg15207968,cg12755103,cg23651812,cg14287724,...,cg07016730,cg25152348,cg08425796,cg05418105,cg22249529,cg07600533,cg08260245,cg19112186,cg10770023,cg00270625
H045,1.811086,-5.172572,5.411526,-0.118825,5.120384,0.145951,-3.436869,-3.844246,2.075422,3.501829,...,3.547843,0.060132,4.442026,2.861301,5.246799,3.901933,5.713831,5.70352,5.166255,4.911655
H109,-3.997508,1.59487,5.412693,1.043871,1.27948,-3.928433,2.989245,0.393004,4.800121,3.159201,...,0.887926,-0.214753,4.561187,3.919911,5.058302,2.634941,5.10746,1.326244,0.677912,5.281115
H024,-2.844313,0.16117,0.365706,-4.219236,0.7211,-3.418859,-3.250385,-2.691305,0.534854,-4.629484,...,-4.486709,0.121749,-2.841373,-3.607177,0.765651,1.516759,5.676245,5.488636,4.221828,5.379716
H056,-3.338656,-2.093433,0.373634,-1.592196,4.047059,0.226601,2.377386,-2.775075,0.419985,0.312388,...,-4.238214,0.137862,-3.964855,-2.27094,-2.631909,-3.884756,5.950338,5.354059,4.934536,5.366823
H079,-0.019362,3.74898,5.41201,1.416418,5.237422,0.324213,-0.647632,-3.098837,5.397188,3.41077,...,2.758021,0.021011,0.673296,3.45523,-3.140733,-4.238106,6.040756,5.584746,5.095111,5.33847


In [3]:
# NMF needs a non-negative input, but the methylation values are signed.
# Split every column `c` into two non-negative columns:
#   c + '_n' : the original value where it is >= 0, otherwise 0
#   c + '_p' : the magnitude (negated value) where the original is < 0, otherwise 0
# NOTE: the suffixes are easy to misread -- `_n` carries the NON-negative part
# and `_p` the negated negative part. They are kept as-is because the column
# labels are what later cells and outputs see.
#
# The split is done on the whole frame at once instead of inserting and
# dropping one column per iteration, which re-copies the frame every time
# and triggers pandas' fragmentation warnings on wide tables.
is_negative = df_meth < 0
nonneg_part = df_meth.mask(is_negative).add_suffix('_n')
neg_magnitude = (-df_meth.mask(~is_negative)).add_suffix('_p')

# Keep the column order of the original loop: c1_n, c1_p, c2_n, c2_p, ...
interleaved = [c + suffix for c in df_meth.columns for suffix in ('_n', '_p')]

# Masked-out entries are NaN at this point; they become the zeros of the split.
df_meth = pd.concat([nonneg_part, neg_magnitude], axis=1)[interleaved].fillna(0)


In [4]:
# Preview the first five samples after the split into `_n` / `_p` columns.
df_meth.head(n=5)

Unnamed: 0,cg10146935_n,cg10146935_p,cg26837773_n,cg26837773_p,cg17801765_n,cg17801765_p,cg13244315_n,cg13244315_p,cg06181703_n,cg06181703_p,...,cg07600533_n,cg07600533_p,cg08260245_n,cg08260245_p,cg19112186_n,cg19112186_p,cg10770023_n,cg10770023_p,cg00270625_n,cg00270625_p
H045,1.811086,0.0,0.0,5.172572,5.411526,0.0,0.0,0.118825,5.120384,0.0,...,3.901933,0.0,5.713831,0.0,5.70352,0.0,5.166255,0.0,4.911655,0.0
H109,0.0,3.997508,1.59487,0.0,5.412693,0.0,1.043871,0.0,1.27948,0.0,...,2.634941,0.0,5.10746,0.0,1.326244,0.0,0.677912,0.0,5.281115,0.0
H024,0.0,2.844313,0.16117,0.0,0.365706,0.0,0.0,4.219236,0.7211,0.0,...,1.516759,0.0,5.676245,0.0,5.488636,0.0,4.221828,0.0,5.379716,0.0
H056,0.0,3.338656,0.0,2.093433,0.373634,0.0,0.0,1.592196,4.047059,0.0,...,0.0,3.884756,5.950338,0.0,5.354059,0.0,4.934536,0.0,5.366823,0.0
H079,0.0,0.019362,3.74898,0.0,5.41201,0.0,1.416418,0.0,5.237422,0.0,...,0.0,4.238106,6.040756,0.0,5.584746,0.0,5.095111,0.0,5.33847,0.0


In [28]:
# Rescale the mRNA block so that every column ends up with unit Euclidean norm.
# NOTE: this cell replaces `df_mrna` with a transposed frame, so it is not
# idempotent -- run it exactly once after the loading cell.
df = df_mrna.T
# Divide each column by its own mean.
df = df.div(df.mean(), axis='columns')
# Per-column L2 norm (square root of the sum of squares).
fro = df.apply(lambda col: col.pow(2).sum() ** 0.5, axis=0)
df_mrna = df.div(fro, axis='columns')

In [29]:
# Rescale the methylation block so that every column ends up with unit Euclidean norm.
# NOTE: this cell replaces `df_meth` with a transposed frame, so it is not
# idempotent -- run it exactly once after the sign-splitting cell.
df = df_meth.T
# Divide each column by its own mean.
df = df.div(df.mean(), axis='columns')
# Per-column L2 norm (square root of the sum of squares).
fro = df.apply(lambda col: col.pow(2).sum() ** 0.5, axis=0)
df_meth = df.div(fro, axis='columns')

In [30]:
# Stack the two normalised blocks on top of each other (rows are features).
# Columns that are missing from one block come out of the concat as NaN, so
# dropping NaN columns keeps only the columns that are complete in both blocks.
stacked = [df_mrna, df_meth]
X = pd.concat(stacked).dropna(axis='columns')
print(X.shape)

(13496, 135)


In [33]:
# Preview the first five feature rows of the joint matrix.
X.head(n=5)

Unnamed: 0,H045,H109,H024,H056,H079,H164,H059,H167,H113,H049,...,H271,H006,H084,H260,H192,H070,H255,H135,H247,H066
ENSG00000244734,0.009022,0.005391,0.020382,0.026755,0.01214,0.005026,0.010195,0.003092,0.004567,0.00518,...,0.008746,0.017387,0.021283,0.024401,0.013766,0.006895,0.006624,0.00318,0.017701,0.007995
ENSG00000158528,0.023238,0.026321,0.004801,0.006514,0.023816,0.022488,0.01132,0.005297,0.005619,0.00518,...,0.007797,0.007975,0.005282,0.025212,0.006281,0.003239,0.025178,0.00318,0.00626,0.024524
ENSG00000198478,0.017657,0.005391,0.025392,0.016334,0.009752,0.02493,0.007419,0.008332,0.010429,0.005966,...,0.004636,0.010719,0.01857,0.020788,0.012456,0.003239,0.00589,0.00318,0.01622,0.023507
ENSG00000175445,0.025108,0.021643,0.003135,0.003081,0.026607,0.021274,0.023107,0.023463,0.004567,0.005966,...,0.004636,0.01825,0.018723,0.023259,0.003126,0.009398,0.023147,0.00318,0.011393,0.021184
ENSG00000174469,0.005235,0.025055,0.003135,0.027334,0.010923,0.021449,0.016399,0.005297,0.004567,0.026762,...,0.02858,0.020996,0.02409,0.018633,0.010119,0.026308,0.003097,0.026659,0.019995,0.021093


In [34]:
from sklearn.decomposition import NMF

# Rank-2 factorisation of the joint matrix, X ~ W @ H.
# The random initialisation is seeded so the result is reproducible.
model = NMF(
    n_components=2,
    init='random',
    random_state=0,
)
# W: one row per row of X; H: one column per column of X.
W, H = model.fit_transform(X), model.components_

In [35]:
# Sanity check of the factor shapes: H is (components, columns of X) and
# W is (rows of X, components).
print(H.shape, W.shape)

(2, 135) (13496, 2)


In [55]:
# TODO: scatter plot (presumably the two rows of H against each other, one
# point per sample -- confirm the intended axes).
# NOTE(review): this cell contains no executable code, so any output shown
# beneath it is stale and should be cleared.

array([2.13261265, 0.        ])

In [36]:
import numpy as np

# Hard cluster assignment: for every column of H, take the index of the
# component with the largest coefficient.
clusters = np.asarray(H).argmax(axis=0)
clusters

array([1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 1])