In [None]:
import h5py
from IPython.display import display, Markdown
import numpy as np
import pandas as pd
import xarray as xr

In [None]:
import plot

In [None]:
# Open the SFA results dataset; later cells read its 'factors' variable and
# its 'case' and 'factor' coordinates.
sfa = xr.open_dataset('../models/sfa.nc')
display(sfa)

In [None]:
# Open the gene-expression dataset and keep only the 'log2_cpm' variable.
# 'hgnc_symbol' is promoted to a coordinate first so that it stays attached to
# the selected variable and can be used for gene look-ups later on.
gexp_ds = xr.open_dataset('../data/processed/gene-expression.nc')
gexp = gexp_ds.set_coords('hgnc_symbol')['log2_cpm']

# Compare the raw label arrays. `gexp['case'] == sfa['case']` would first align
# both operands on their shared 'case' labels (inner join), so the result is
# all-True even when the labels differ or come in a different order.
assert np.array_equal(gexp['case'].values, sfa['case'].values), \
    "gene expression and SFA cases differ or are in a different order"
display(gexp)

## Overview ##

In [None]:
# Heatmap of the full factor matrix, requesting dendrograms for both rows and
# columns from the project's plotting helper.
plot.heatmap(
    sfa['factors'],
    row_dendrogram=True,
    col_dendrogram=True,
    xticklabels='index',
)

## Association with Clinical Data ##

In [None]:
# Read the clinical table, index it by patient under the name 'case' (the
# label used by the other datasets), convert it to xarray and select the rows
# for the SFA cases, in the SFA order. The selection raises if a case is
# missing from the clinical table.
clin = (
    pd.read_table('../data/raw/imagene_clinical.tsv')
    .set_index('margins_patient')
    .rename_axis('case')
    .to_xarray()
    .sel(case=sfa['case'])
)

# Compare the raw label arrays. An xarray `==` between the two 'case'
# coordinates aligns them on their common labels first and therefore can never
# report a mismatch.
assert np.array_equal(clin['case'].values, sfa['case'].values), \
    "clinical data and SFA cases differ or are in a different order"
display(clin)

In [None]:
# Take the first entry along the leading dimension of the factor matrix
# (presumably the first factor -- TODO confirm the dimension order) and flip
# its sign before plotting it against the IHC subtype.
negated_first = -sfa['factors'][0, :]
plot.boxplot(clin['ihc_subtype'], negated_first)

In [None]:
# Boolean mask over the gene-symbol coordinate; it is applied along the second
# dimension of gexp, and the first matching entry is kept.
is_esr1 = gexp['hgnc_symbol'] == 'ESR1'
esr1_expr = gexp[:, is_esr1][:, 0]
plot.boxplot(clin['ihc_subtype'], esr1_expr)

## Comparison to factors on TCGA ##

In [None]:
# Load the TCGA factor matrix from HDF5 into a labelled DataArray.
# The file is opened explicitly read-only: older h5py versions default to
# append mode, which would silently create an empty file if the path is wrong.
with h5py.File('../data/external/tcga-breast-gexp+rppa+cn-sfa-solution.h5', 'r') as f:
    # Read everything inside the `with` block so the arrays outlive the file.
    tcga_values = np.array(f['factors'])
    tcga_samples = np.array([s.decode() for s in f['sample names']], 'object')

# Factor labels are generated as 'Factor 1', 'Factor 2', ... so they can be
# looked up with the labels of the 'factor' coordinate in `sfa`.
tcga_factors = xr.DataArray(
    data=tcga_values,
    dims=['sample', 'factor'],
    coords={
        'sample': tcga_samples,
        'factor': np.array(['Factor {}'.format(i+1) for i in range(tcga_values.shape[1])]),
    },
)

In [None]:
# For every factor label in the SFA dataset, compare its values with those of
# the TCGA factor carrying the same label.
for factor in sfa['factor'].values.tolist():
    display(Markdown("### {} ###".format(factor)))
    imagene_f = sfa['factors'].sel(factor=factor)
    tcga_f = tcga_factors.sel(factor=factor)

    # Two stacked histograms on a shared x axis: Imagene on top, TCGA below.
    with plot.subplots(2, 1, sharex=True) as (fig, axs):
        plot.hist(imagene_f, ax=axs[0], title="Imagene")
        plot.hist(tcga_f, ax=axs[1], title="TCGA")

    # Q-Q plot of the two sets of values, with the diagonal drawn as reference.
    plot.qqplot(tcga_f, imagene_f, diagonal=True, xlabel="TCGA", ylabel="Imagene")