In [None]:
from IPython.display import display
import numpy as np
import pandas as pd
import scipy.stats
from statsmodels.sandbox.stats.multicomp import multipletests
import seaborn
import xarray as xr

In [None]:
import plot
import util

## Load Data ##

In [None]:
# Read the SFA model output and the MRI eigenbreast decompositions; `.load()`
# pulls the contents of each NetCDF file into memory.
sfa = xr.open_dataset("../models/sfa_tcga/sfa.nc").load()
eigenbreasts = xr.open_dataset("../data/processed/mri-eigenbreasts.nc").load()
# Compare the raw coordinate values. `==` between two xarray objects first
# aligns them on their shared labels (inner join by default), so a plain
# `all(a == b)` check can never fail, even for disjoint or reordered cases.
assert np.array_equal(eigenbreasts['case'].values, sfa['case'].values), \
    "eigenbreasts and sfa must contain the same cases in the same order"

In [None]:
# Rename the SFA 'factor' dimension to 'gexp_factor' and relabel its entries
# GF1, GF2, ... in their existing order.
sfa = sfa.rename({'factor': 'gexp_factor'})
n_gexp_factors = len(sfa['gexp_factor'])
sfa['gexp_factor'] = [
    "GF{}".format(factor_number)
    for factor_number in range(1, n_gexp_factors + 1)
]

## Correlation Factors—MRI features ##

Compute the correlation between all MRI features (except for patient number, Comment and Multifocal) and the SFA factors (`factor_feature_cor`). We also compute the nominal p-value (`factor_feature_cor_nom_p`) and the Bonferroni-adjusted p-value (`factor_feature_cor_p`), as well as the Benjamini–Hochberg false discovery rate (stored as `fdr`).

In [None]:
# For every eigenbreast set, correlate its first 50 principal components with
# the SFA factors across cases, then attach multiple-testing corrections of the
# nominal p-values: Bonferroni as 'p' and Benjamini-Hochberg as 'fdr'.
eb_f_cor = {}
for eb_set in eigenbreasts.data_vars:
    pcs = eigenbreasts[eb_set].transpose('case', 'PC')[:, 0:50]
    # Put the SFA cases in the same order as the eigenbreast cases.
    factors = sfa.reindex(case=pcs['case'])['factors']
    cor_ds = util.cor(pcs, factors, 'case')
    nominal = cor_ds['nominal_p']
    for target_var, method in (('p', 'bonferroni'), ('fdr', 'fdr_bh')):
        # multipletests works on a flat vector; its second return value holds
        # the corrected p-values, which are reshaped back to the original grid.
        corrected = multipletests(nominal.values.flat, method=method)[1]
        cor_ds[target_var] = (nominal.dims, corrected.reshape(nominal.shape))
    eb_f_cor[eb_set] = cor_ds

In [None]:
# One-row table: number of (PC, factor) pairs per eigenbreast set whose
# Benjamini-Hochberg FDR is below 0.25.
fdr_hit_counts = {
    eb_name: (cor_ds['fdr'] < 0.25).sum().item()
    for eb_name, cor_ds in eb_f_cor.items()
}
pd.DataFrame(fdr_hit_counts, index=[''])

In [None]:
# One histogram (with rug) per eigenbreast set showing, for each factor, the
# largest absolute correlation over the principal components.
n_eb_sets = len(eb_f_cor)
with plot.subplots(n_eb_sets, 1, figsize=(12, n_eb_sets * 2), sharex=True) as (fig, axs):
    for panel, (eb_set, cor_ds) in zip(axs, eb_f_cor.items()):
        max_abs_cor = np.abs(cor_ds['correlation']).max('PC')
        seaborn.distplot(
            max_abs_cor,
            hist=True,
            kde=False,
            bins='sturges',
            rug=True,
            ax=panel,
        )
        # Only the bottom panel carries an x-axis label (set after the loop).
        panel.set_xlabel('')
        panel.set_title(eb_set)
    axs[-1].set_xlabel('Maximum Pearson Correlation per factor')

In [None]:
# One histogram per eigenbreast set of all nominal p-values (every PC/factor
# pair pooled into a single flat vector).
n_eb_sets = len(eb_f_cor)
with plot.subplots(n_eb_sets, 1, figsize=(12, n_eb_sets * 2), sharex=True) as (fig, axs):
    for panel, (eb_set, cor_ds) in zip(axs, eb_f_cor.items()):
        nominal_p_flat = cor_ds['nominal_p'].values.flatten()
        seaborn.distplot(
            nominal_p_flat,
            hist=True,
            bins='sturges',
            kde=False,
            ax=panel,
        )
        # Only the bottom panel carries an x-axis label (set after the loop).
        panel.set_xlabel('')
        panel.set_title(eb_set)
    axs[-1].set_xlabel('Pearson Correlation nominal p-value')

In [None]:
# One histogram per eigenbreast set of the log10-transformed nominal p-values
# (every PC/factor pair pooled); counts are drawn on a log-scaled y-axis.
with plot.subplots(len(eb_f_cor), 1, figsize=(12, len(eb_f_cor)*2), sharex=True) as (fig, axs):
    for eb_set_i, eb_set in enumerate(eb_f_cor):
        eb_vals = np.log10(np.array(eb_f_cor[eb_set]['nominal_p'].values.flat))
        seaborn.distplot(
            eb_vals,
            hist=True, bins='sturges', kde=False,
            ax=axs[eb_set_i],
        )
        # Only the bottom panel carries an x-axis label (set after the loop).
        axs[eb_set_i].set_xlabel('')
        axs[eb_set_i].set_title(eb_set)
        axs[eb_set_i].set_yscale('log')
    # The x-axis shows log10(p), not p itself, so the label has to say so.
    axs[-1].set_xlabel('log10(Pearson Correlation nominal p-value)')

Heatmap of correlations. All of them are very low.

Heatmap of correlation with nominal p-values < 0.05. This is without multiple testing correction.

In [None]:
# Heatmap of the 'contra_ds8' correlations, masking every cell whose nominal
# (uncorrected) p-value exceeds 0.05.
with plot.subplots(1, 1) as (fig, ax):
    contra_ds8_cor = eb_f_cor['contra_ds8']
    not_nominally_significant = contra_ds8_cor['nominal_p'] > 0.05
    plot.heatmap(
        contra_ds8_cor['correlation'],
        mask=not_nominally_significant,
        cmap='coolwarm',
        row_dendrogram=False,
        xticklabels=sfa['factor_name'].values,
        ax=ax,
    )
    # Slant the factor names so that long labels do not overlap.
    factor_tick_labels = ax.get_xticklabels()
    ax.set_xticklabels(factor_tick_labels, rotation=45, ha='right')

None of the correlations are significant after multiple testing correction.

In [None]:
# Heatmap of the 'both_ds4' correlations, masking every cell whose
# Benjamini-Hochberg FDR exceeds 0.25.
with plot.subplots(1, 1) as (fig, ax):
    both_ds4_cor = eb_f_cor['both_ds4']
    above_fdr_cutoff = both_ds4_cor['fdr'] > 0.25
    plot.heatmap(
        both_ds4_cor['correlation'],
        mask=above_fdr_cutoff,
        cmap='coolwarm',
        row_dendrogram=False,
        xticklabels=sfa['factor_name'].values,
        ax=ax,
    )
    # Slant the factor names so that long labels do not overlap.
    factor_tick_labels = ax.get_xticklabels()
    ax.set_xticklabels(factor_tick_labels, rotation=45, ha='right')

In [None]:
# Heatmap of the 'ipsi_ds8' correlations, masking every cell whose
# Benjamini-Hochberg FDR exceeds 0.25.
with plot.subplots(1, 1) as (fig, ax):
    ipsi_ds8_cor = eb_f_cor['ipsi_ds8']
    above_fdr_cutoff = ipsi_ds8_cor['fdr'] > 0.25
    plot.heatmap(
        ipsi_ds8_cor['correlation'],
        mask=above_fdr_cutoff,
        cmap='coolwarm',
        row_dendrogram=False,
        xticklabels=sfa['factor_name'].values,
        ax=ax,
    )
    # Slant the factor names so that long labels do not overlap.
    factor_tick_labels = ax.get_xticklabels()
    ax.set_xticklabels(factor_tick_labels, rotation=45, ha='right')