In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import pyreporter as pr
import recon
# Recon wrapper object: cells below read `model.genes` (gene table) and pass `model.model` as `cobra_model`.
model = recon.recon.Recon()
# Utility object: used below for `map_gene` and `divide_all_columns_by_each_column`.
util = recon.utilities.Utilities()


In [2]:
# Supplementary spreadsheet; the column header is the 8th row of the sheet (header=7).
prot = pd.read_excel('../../data/UC/12014_2019_9224_MOESM2_ESM.xlsx', header=7)
# Keep only rows whose gene symbol is known to the model, and only the two columns used below.
in_model = prot['Gene names'].isin(model.genes.symbol)
prot = prot.loc[in_model, ['Gene names', 'Ratio UC/H']]

In [None]:
# `_prot` is only created in the next cell, so displaying it here raised NameError on a
# fresh kernel. Preview the frame that exists at this point instead.
prot.head()

In [None]:
# One row per gene symbol (max over duplicates), mapped onto the model's gene table,
# with a constant baseline column of 1.0; duplicate and incomplete rows are dropped.
_prot = (
    util.map_gene(df=prot.groupby('Gene names').max(), g_mapping=model.genes, mapping_column='symbol')
    .assign(base=1.0)
    .drop_duplicates()
    .dropna()
)

# Compare the measured UC/healthy ratio against the constant baseline.
res = pr.workflows.workflow_Fang2012(
    cobra_model=model.model,
    mapped_genes_baseline=_prot['base'],
    mapped_genes_comparison=_prot['Ratio UC/H'],
    gene_fill=1.0,
)

In [None]:
# Persist the workflow result from the previous cell as CSV (default comma separator).
pd.DataFrame(res).to_csv('../../data/UC/pr_centrality_Schniers2019.csv')

In [3]:
# Proteomics data: pre-mapped table, indexed by its `number` column.
proteomics = pd.read_csv('../../../reporter_metabolites/mapped_proteomics.csv', index_col='number')
# Index labels are converted to strings — presumably to match the model's gene ids (TODO confirm).
proteomics.index = proteomics.index.astype('str')
# Constant baseline of 1.0 per gene; passed as `mapped_genes_baseline` further down.
proteomics['base'] = 1.0

In [4]:
# Display the loaded table (rendered output follows).
proteomics

Unnamed: 0_level_0,Unnamed: 0,log2FoldChange,padj,base
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10351.1,ABCA8,-1.113058,6.906275e-04,1.0
5243.1,ABCB1,-0.874997,2.344156e-05,1.0
8647.1,ABCB11,-1.092346,3.285012e-04,1.0
10449.1,ACAA2,-0.829147,2.477650e-06,1.0
38.1,ACAT1,-0.837336,1.011360e-07,1.0
...,...,...,...,...
7360.2,UGP2,-0.759488,2.177387e-05,1.0
79799.1,UGT2A3,-1.531056,3.225847e-07,1.0
7381.1,UQCRB,-0.632647,4.106313e-05,1.0
8875.1,VNN2,0.779873,7.906539e-02,1.0


In [5]:
# Run the workflow with the constant 1.0 baseline; np.exp2 turns the stored log2 fold change
# back into a linear ratio for the comparison condition.
res = pr.workflows.workflow_Fang2012(cobra_model=model.model, mapped_genes_baseline=proteomics['base'],
                               mapped_genes_comparison=np.exp2(proteomics['log2FoldChange']), gene_fill=1.0)

In [6]:
# Persist the result of the previous cell as a tab-separated file.
pd.Series(res).to_csv('../../data/UC_analyses/pr_centrality_proteomics_158p.csv', sep='\t')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
def get_pca(df, n_components=2):
    """Fit a PCA on ``df`` and return ``df`` projected onto its components.

    Parameters
    ----------
    df : data accepted by ``sklearn.decomposition.PCA.fit``
        The same object is used for fitting and for the projection.
    n_components : int, default 2
        Forwarded to ``PCA(n_components=...)``.

    Returns
    -------
    Whatever ``PCA.transform(df)`` returns for the fitted estimator.
    """
    # Local import: scikit-learn is only needed when this helper is called.
    from sklearn.decomposition import PCA

    return PCA(n_components=n_components).fit(df).transform(df)

def plot_pca(pca, df, ax, column='Intrinsic subtype (PAM50)', figtitle=None,
             palette='Set2', fontsize=20, annotate=False, annotate_offset=0.3):
    """Scatter the first two principal components on ``ax``.

    Parameters
    ----------
    pca : 2-D array
        Projected coordinates; column 0 is drawn on x, column 1 on y.
    df : pandas.DataFrame
        Row-aligned with ``pca``. Supplies the hue column and, when annotating,
        the index labels.
    ax : matplotlib Axes
        Axes to draw on.
    column : str or None
        Column of ``df`` used as hue. ``None`` draws an ungrouped scatter.
    figtitle : str or None
        Title; skipped when falsy.
    palette : str
        Seaborn palette, applied only when a hue column is given.
    fontsize : int
        Title font size.
    annotate : bool
        When truthy, write each row's index label next to its point.
    annotate_offset : float, default 0.3
        x/y displacement of the label from its point.

    Returns
    -------
    The Axes returned by ``sns.scatterplot``.
    """
    scatter_kwargs = dict(x=pca[:, 0], y=pca[:, 1], ax=ax)
    if column is not None:
        # `palette` is only meaningful together with `hue`; seaborn warns and ignores
        # it otherwise, so it is passed only in this branch.
        scatter_kwargs.update(hue=df[column], palette=palette)
    sp = sns.scatterplot(**scatter_kwargs)

    if annotate:
        for row in range(pca.shape[0]):
            ax.text(x=pca[row, 0] + annotate_offset,
                    y=pca[row, 1] + annotate_offset,
                    s=df.index[row])
    if figtitle:
        sp.set_title(figtitle, fontsize=fontsize)
    sp.set_xlabel('PC1')
    sp.set_ylabel('PC2')
    return sp

In [None]:
# NOTE(review): `pdf` is first assigned in a later cell, so this cell fails on a fresh
# kernel unless that cell has been run first.
pca = get_pca(pdf)
fig, ax = plt.subplots(figsize=(10, 10))
# column=None: plain scatter without a hue grouping.
plot_pca(pca, pdf, ax, column=None)

In [None]:
# Tab-separated protein-group table, indexed by its 'Gene names' column.
pdata = pd.read_csv('../data/UC_analyses/full_data/180820 Arve MQ and uniprot updated OxM AcNterm proteinGroups.txt', 
                    sep='\t', index_col='Gene names')
# Lookup table; `get_pdata` below reads its 'Sample name' and 'MaxQuant file identifier' columns.
pnames = pd.read_csv('../data/UC_analyses/full_data/180120AS Arvesølvet sample names to MQ identifiers.csv', sep=',')

def get_pdata(control, pdata=pdata, pnames=pnames):
    """Return the gene-mapped reporter-intensity columns for one group of samples.

    Parameters
    ----------
    control : pandas.DataFrame
        Metadata rows of the group. Its 'ID from Proteomic run' column is read; the
        second space-separated token of each non-null entry is used as the sample id.
    pdata : pandas.DataFrame
        Protein table holding 'Reporter intensity corrected ...' columns.
    pnames : pandas.DataFrame
        Lookup table with 'Sample name' and 'MaxQuant file identifier' columns.
        It is no longer modified: the previous version wrote a 'pid' column into
        this shared default frame on every call.

    Returns
    -------
    The result of ``util.map_gene`` applied to the selected columns of ``pdata``.
    """
    # Sample id = second space-separated token of the sample name (missing stays missing).
    sample_ids = pnames['Sample name'].str.split(' ').str[1]
    wanted_ids = {run_id.split(' ')[1] for run_id in control['ID from Proteomic run'].dropna()}
    selected = pnames[sample_ids.isin(wanted_ids)]

    # Column name = fixed prefix + last two tokens of the MaxQuant identifier.
    cols = ['Reporter intensity corrected ' + ' '.join(ident.split(' ')[-2:])
            for ident in selected['MaxQuant file identifier']]
    return util.map_gene(df=pdata[cols], g_mapping=model.genes, mapping_column='symbol')

mdata = pd.read_csv('../data/UC_analyses/metabolomics/Metabolomics meta data.csv', sep=',')
mets = pd.read_csv('../data/UC_analyses/metabolites.csv', sep=',')

# Split the sample metadata by the code stored in its `Degree` column.
control, mild, moderate, severe, relapse = (
    mdata[mdata.Degree == degree] for degree in ('C', 'Mild', 'Moderate', 'Severe', 'R')
)

# Gene-mapped reporter intensities per group, evaluated in the same order as before.
p_control, p_mild, p_moderate, p_severe, p_relapse = (
    get_pdata(group, pdata) for group in (control, mild, moderate, severe, relapse)
)

In [None]:
# Side-by-side sample columns of all groups; rows that are zero in every sample are
# dropped, then rows sharing a 'Gene names' label are collapsed to their maximum.
pdf = (
    pd.concat([p_mild, p_moderate, p_severe, p_relapse, p_control], axis=1)
    .loc[lambda frame: ~(frame == 0.0).all(axis=1)]
    .reset_index()
    .groupby('Gene names')
    .max()
)
# pdf.to_csv('../data/UC_analyses/mapped_proteomics_full.csv', sep='\t')

In [None]:
# Pairwise column ratios. Presumably the result columns are named '<numerator>_<denominator>'
# (the next cell splits column names on '_') — TODO confirm against the utility.
_pdf = util.divide_all_columns_by_each_column(pdf)

In [None]:
def _pairwise_ratios(ratios, numerator_df, denominator_df, label):
    """Select the ratio columns pairing one group's samples with another's.

    A column of ``ratios`` is kept when the text before its first '_' is a column of
    ``numerator_df`` and the text after its last '_' is a column of ``denominator_df``.
    Infinite ratios are treated as missing, rows with any missing value are dropped,
    and a constant ``base`` column of 1.0 is appended. The resulting shape is printed
    under ``label``.
    """
    cols = [name for name in ratios.columns
            if name.split('_')[0] in numerator_df.columns
            and name.split('_')[-1] in denominator_df.columns]
    selected = ratios[cols].replace(np.inf, np.nan).dropna(axis=0)
    selected['base'] = 1.0
    print(f'Shape of {label}: {selected.shape}')
    return selected


# One frame per comparison; this replaces seven copy-pasted stanzas that differed
# only in the two groups involved.
_pdf_sc = _pairwise_ratios(_pdf, p_severe, p_control, '_pdf_sc')
_pdf_rc = _pairwise_ratios(_pdf, p_relapse, p_control, '_pdf_rc')
_pdf_smod = _pairwise_ratios(_pdf, p_severe, p_moderate, '_pdf_smod')
_pdf_smild = _pairwise_ratios(_pdf, p_severe, p_mild, '_pdf_smild')
_pdf_mc = _pairwise_ratios(_pdf, p_mild, p_control, '_pdf_mc')
_pdf_modc = _pairwise_ratios(_pdf, p_moderate, p_control, '_pdf_modc')
_pdf_rs = _pairwise_ratios(_pdf, p_relapse, p_severe, '_pdf_rs')

In [None]:
# Relapse-vs-control: run the workflow once per ratio column against the constant baseline.
res = pd.DataFrame()
# Every column except the constant 'base' column is a comparison condition.
for col in _pdf_rc.columns.difference(['base',]):
    _res = pr.workflows.workflow_Fang2012(cobra_model=model.model, mapped_genes_baseline=_pdf_rc['base'],
                                          mapped_genes_comparison=_pdf_rc[col], gene_fill=1.0)
    # Column assignment aligns each result on the index established by the first column.
    res[col] = pd.Series(_res)
res.to_csv('../data/UC_analyses/pr_centrality_proteomics_relapse_control.csv', sep='\t')

In [None]:
# Severe-vs-control. Previously this cell only wrote `res`, which at this point still held
# the relapse-vs-control result from the cell above, so the two output files were identical
# on Restart & Run All. Recompute from the severe/control ratios before saving.
res = pd.DataFrame()
for col in _pdf_sc.columns.difference(['base',]):
    _res = pr.workflows.workflow_Fang2012(cobra_model=model.model, mapped_genes_baseline=_pdf_sc['base'],
                                          mapped_genes_comparison=_pdf_sc[col], gene_fill=1.0)
    res[col] = pd.Series(_res)
res.to_csv('../data/UC_analyses/pr_centrality_proteomics_severe_control.csv', sep='\t')

In [None]:
# Load a previously saved tab-separated result table, first column as index.
res1 = pd.read_csv('../data/UC_analyses/pr_centrality_proteomics.csv', sep='\t', index_col=0)
# Replace it by its pairwise column ratios (same utility as used for `_pdf`).
res1 = util.divide_all_columns_by_each_column(res1)

In [None]:
# Save the columns of `res1` that share their names with the current `res` frame.
res1[res.columns].to_csv('../data/UC_analyses/pr_centrality_proteomics_severe_control_1.csv', sep='\t')

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
# Density of the log10-transformed control data; zero entries become -inf under log10.
sns.kdeplot(data=np.log10(p_control), ax=ax)

In [None]:
# Per-group mean across sample columns, one column per group.
# NOTE: this rebinds `pdf`, which earlier held the per-sample table.
pdf = pd.concat(
    [frame.mean(axis=1) for frame in (p_mild, p_moderate, p_severe, p_relapse, p_control)],
    axis=1,
)
pdf.columns = ['mild', 'moderate', 'severe', 'relapse', 'control']

# Express every group relative to control, collapse repeated index labels to their
# maximum, and drop rows with missing values.
pdf = (
    pdf.div(pdf['control'], axis=0)
    .pipe(lambda ratios: ratios.groupby(ratios.index).max())
    .dropna(axis=0)
)


In [None]:
# NOTE(review): `recon` is the package imported in the first cell, while earlier cells pass
# `model.model` as `cobra_model` — confirm `recon.model` is what is intended here.
# The comparison argument is the whole `p_control` table rather than a single column.
r1 = pr.workflows.workflow_Fang2012(cobra_model=recon.model, mapped_genes_baseline=pdf.control,
                                      mapped_genes_comparison=p_control, gene_fill=1.0)


In [None]:
# One workflow run per disease group against the control baseline.
# The frame is built once instead of growing it with `pd.concat` inside the loop, and each
# result is wrapped in `pd.Series` as in the per-sample cells above.
# NOTE(review): `recon.model` is kept as written; earlier cells pass `model.model` — confirm.
res = pd.DataFrame({
    col: pd.Series(
        pr.workflows.workflow_Fang2012(cobra_model=recon.model, mapped_genes_baseline=pdf.control,
                                       mapped_genes_comparison=pdf[col], gene_fill=1.0)
    )
    # Every group except the control reference, regardless of column position.
    for col in pdf.columns.drop('control')
})


In [None]:
# Density plot of the `res` columns.
sns.kdeplot(data=res)

In [None]:
# NOTE(review): absolute, user-specific Windows path — this cell only runs on the original
# machine. Move the file under the project data directory and use a relative path.
gdata = pd.read_csv(
    'C:/Users/ssh041/Downloads/transfer_250856_files_6667e22b/UC_analyses/geo_rna_seq/GSE109142/full_data_detailed.tsv', sep='\t')


In [None]:
# Columns from position 7 onwards are averaged per group; all groups are row-filters of
# the same metadata table, so they share the same column set.
value_cols = control.columns[7:]
by_degree = {
    'control': control,
    'mild': mild,
    'moderate': moderate,
    'severe': severe,
    'relapse': relapse,
}
mdf = pd.concat([group[value_cols].mean() for group in by_degree.values()], axis=1)
mdf.columns = list(by_degree)

# Ratio to control, repeated index labels collapsed to their maximum, incomplete rows dropped.
mdf = mdf.div(mdf['control'], axis=0)
mdf = mdf.groupby(mdf.index).max().dropna(axis=0)
mdf

In [None]:
# Overlap between model metabolite names and measured names under different casings.
model_names = list(recon.metabolites.fullName)
measured_names = list(mdf.index)

model_capitalized = {name.capitalize() for name in model_names}
model_lowered = {name.lower() for name in model_names}

a = list(model_capitalized & set(measured_names))                                 # capitalized model vs raw measured
b = list(model_capitalized & {name.capitalize() for name in measured_names})      # both capitalized
c = list(model_lowered & {name.lower() for name in measured_names})               # both lower-cased
d = list(model_lowered & set(measured_names))                                     # lower-cased model vs raw measured
a


In [None]:
# Map each measured metabolite name to a candidate model name.
model_names = set(recon.metabolites.fullName)

# Names that already match the model map to themselves.
i_capitalize = {name: name for name in set(mdf.index) & model_names}

# For the remaining hyphenated names, capitalize the segment after the first hyphen.
# All later segments are kept: the previous version re-joined only the first two
# segments, which truncated any name containing more than one hyphen.
req_change = list(set(mdf.index) - model_names)
for name in req_change:
    if '-' in name:
        segments = name.split('-')
        segments[1] = segments[1].capitalize()
        i_capitalize[name] = '-'.join(segments)
len(i_capitalize)


In [None]:
# How many of the candidate names actually occur among the model's metabolite names.
len(set(i_capitalize.values()) & set(recon.metabolites.fullName))

In [None]:
# Number of distinct measured metabolite names, for comparison with the count above.
len(set(mdf.index))# & set(i_capitalize.keys()))



In [None]:
# Curated metabolomics table; later cells read its `vmh`, `p` and `fc` columns.
metabolomics = pd.read_csv(
    '../../reporter_metabolites/mapped_metabolomics_curated.csv', sep=',')


In [None]:
# Display the loaded table.
metabolomics

In [None]:
# recon.reactions[recon.reactions.formula.str.contains('ser_L')]

# recon.reactions[recon.reactions.formula.str.contains('mthf')]
# recon.reactions[recon.reactions.formula.str.contains('amet')]['subsystem'].unique()
# recon.reactions[(recon.reactions.subsystem == 'Glycine, serine, alanine, and threonine metabolism') & (
#     recon.reactions.formula.str.contains('mthf'))]

# recon.reactions[recon.reactions.ecnumber=='2.1.1.13']

# Glycine, serine, alanine, and threonine


In [None]:
def update_index(metabolomics):
    """Rewrite the index labels of ``metabolomics`` in place and return the same frame.

    A label containing '_' becomes ``<first part lower-cased>_<second part unchanged>``;
    any parts after the second are dropped. A label without '_' is lower-cased.
    """
    def _relabel(label):
        parts = label.split('_')
        if len(parts) > 1:
            return '_'.join([parts[0].lower(), parts[1]])
        return label.lower()

    metabolomics.index = [_relabel(label) for label in metabolomics.index]
    return metabolomics


def update_index1(metabolomics):
    """Rewrite the index labels of ``metabolomics`` in place and return the same frame.

    A label containing '_' becomes ``<first part unchanged>_<second part upper-cased>``;
    any parts after the second are dropped. A label without '_' is lower-cased.
    """
    relabelled = []
    for label in metabolomics.index:
        head, *rest = label.split('_')
        if rest:
            relabelled.append(f'{head}_{rest[0].upper()}')
        else:
            relabelled.append(label.lower())

    metabolomics.index = relabelled
    return metabolomics


In [None]:
# df = pd.read_csv('../data/predictions_uc_patients.csv', sep='\t', index_col=0)
# NOTE(review): this rebinds `pr`, which the first cell imported as the `pyreporter` alias;
# cells calling `pr.workflows...` will not work after this one has run.
pr = pd.read_csv('../data/pr_uc_patients.csv', sep='\t', index_col=0)
pr = pr.assign(
    # Index label without its last three characters, lower-cased.
    vmh=[label[:-3].lower() for label in pr.index],
    # Unmodified index label.
    mets=list(pr.index),
)
metabolomics = pd.read_csv('../../reporter_metabolites/mapped_metabolomics_curated.csv', sep=',')


In [None]:
def plot_kde(pr, save=False, filename=None):
    """Draw a 10-bin histogram of ``log2(pr['0'])`` with a log-scaled count axis.

    Despite its name the function draws a histogram, not a KDE.

    Parameters
    ----------
    pr : mapping or DataFrame
        Must provide the values to plot under the key ``'0'``.
    save : bool
        When equal to ``True`` the figure is written to ``filename`` at 300 dpi;
        otherwise it is shown.
    filename : str or None
        Target path, used only when saving.

    Returns
    -------
    The return value of ``fig.savefig`` or of ``plt.show``.
    """
    fig, ax = plt.subplots(figsize=(8, 6))

    log2_values = np.log2(pr['0'])
    sns.histplot(data=log2_values, bins=10, ax=ax)

    ax.set_yscale('log')
    # Dashed reference line at "no change".
    ax.axvline(0.0, linewidth=2.0, color='k', ls='--')
    ax.set_xlabel('concentration change (log2)')
    plt.xlim((-1, 1.))
    plt.tight_layout()

    write_to_file = (save == True)
    if not write_to_file:
        return plt.show()
    return fig.savefig(filename, dpi=300)


# Write the histogram to disk instead of showing it (save=True).
plot_kde(pr=pr, save=True, filename='../images/UC/histplot_uc_patients.png')


In [None]:
def map_metabolomics(metabolomics, pr):
    """Join predictions with measured metabolomics on their ``vmh`` identifier.

    Parameters
    ----------
    metabolomics : pandas.DataFrame
        Needs a string column ``vmh`` and a column ``fc``. The frame is not modified:
        matching uses a lower-cased copy of ``vmh``. The previous version overwrote
        the caller's column, which also triggered SettingWithCopyWarning when a
        filtered slice was passed in.
    pr : pandas.DataFrame
        Needs columns ``vmh``, ``mets`` and ``'0'``. ``pr.vmh`` is lower-cased only
        for the printed overlap report; the merge uses it as given.

    Returns
    -------
    The frame returned by ``u.reshape`` for the merged table, with ``'0'`` renamed to
    ``prediction_proteomics`` and ``fc`` to ``metabolomics``, both cast to float.

    Prints the number of mapped and missing identifiers and the missing ones.
    """
    measured = metabolomics.assign(vmh=metabolomics.vmh.str.lower())

    # Build both identifier sets once for the report.
    measured_ids = set(measured.vmh)
    predicted_ids = {ident.lower() for ident in pr.vmh}
    missing = list(measured_ids - predicted_ids)
    print(f'mapped: {len(measured_ids & predicted_ids)}')
    print(f'missing: {len(missing)}')
    print(f'\t{missing}')

    m1 = pr.merge(measured, on='vmh').set_index('mets')
    # NOTE(review): `u` is not defined in the visible cells — presumably a project helper
    # module imported elsewhere; confirm.
    m2 = u.reshape(m1)
    m2 = m2.rename(columns={'0': 'prediction_proteomics', 'fc': 'metabolomics'})
    m2['metabolomics'] = m2['metabolomics'].astype('float')
    m2['prediction_proteomics'] = m2['prediction_proteomics'].astype('float')
    return m2


In [None]:
# Significant and sufficiently changed metabolites: p < 0.05 and fold change outside [0.8, 1.2].
p_crit = metabolomics.p < .05
fc_crit = (metabolomics.fc > 1.2) | (metabolomics.fc < .8)
# Explicit copy so the filtered frame is independent of `metabolomics`; without it, any
# column assignment on the slice raises pandas' SettingWithCopyWarning.
metabolomics2 = metabolomics[p_crit & fc_crit].copy()
m2 = map_metabolomics(metabolomics=metabolomics2, pr=pr)

# Same mapping for the unfiltered table.
m1 = map_metabolomics(metabolomics=metabolomics, pr=pr)


In [None]:
# len(recon.metabolites[recon.metabolites.abbreviation.isin(df.index)])
# NOTE(review): `df1` is reassigned a few cells below before being used — confirm this load is still needed.
df1 = pd.read_csv('../data/predictions_uc_patients.csv', sep='\t', index_col=0)


In [None]:
# Reload the mapped proteomics table (note: different relative path than the load near the top).
proteomics = pd.read_csv('../../reporter_metabolites/mapped_proteomics.csv', index_col='number')
# String index labels, as in the earlier load.
proteomics.index = proteomics.index.astype('str')
# gs = pd.read_csv('../data/recon/genes_sensitivity.tsv', sep='\t', index_col=0)


In [None]:
# NOTE(review): `cs` is not imported or defined in the visible cells — confirm where it comes from.
# Three variants: default grouping, and grouping restricted to 'cytoplasm' / 'mitochondrial'.
cc = cs.calculate_cc(proteomics)
cc1 = cs.calculate_cc(proteomics, grouping='cytoplasm')
cc2 = cs.calculate_cc(proteomics, grouping='mitochondrial')


In [None]:
# Per-metabolite means of the mapped data, computed once and shared by the three tables.
m2_by_metabolite = m2.groupby('metabolites').mean()


def _attach_control_coeff(cc_frame):
    """Place the rows of ``cc_frame`` whose index occurs in ``m2.metabolites`` next to the
    per-metabolite means, renaming the column labelled ``0`` to ``control_coeff``."""
    matching = cc_frame[cc_frame.index.isin(m2.metabolites)]
    combined = pd.concat((m2_by_metabolite, matching), axis=1)
    return combined.rename(columns={0: 'control_coeff'})


# Default, cytoplasm and mitochondrial groupings respectively.
df, df1, df2 = (_attach_control_coeff(frame) for frame in (cc, cc1, cc2))


In [None]:
# FIXME(review): this call was left unfinished — `res_df=` had no value, which made the
# cell a SyntaxError. Disabled until the intended `res_df` argument is known; `df` from
# the previous cell stays in effect for the plots below.
# df = cs.merged_data(cc_df=cc, m_df=m2, res_df=...)

In [None]:
def plot_parity(data, xcolumn, ycolumn, figtitle=None, save=False,
                filename=None, ub=0.5, cc=False):
    """Scatter ``log2(data[ycolumn])`` against ``log2(data[xcolumn])``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``xcolumn`` and ``ycolumn``; also ``control_coeff`` when ``cc``
        is truthy.
    xcolumn, ycolumn : str
        Columns drawn on the x and y axis.
    figtitle : str or None
        Axes title.
    save : bool
        When truthy, write the figure to ``filename`` at 300 dpi instead of showing it.
    filename : str or None
        Target path, used only when saving.
    ub : float
        Upper end of the colour-bar boundaries; used only when ``cc`` is truthy.
    cc : bool
        When truthy, colour the points by ``data.control_coeff`` and add a colour bar.

    Returns
    -------
    The return value of ``fig.savefig`` or of ``plt.show``.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    log_x = np.log2(data[xcolumn])
    log_y = np.log2(data[ycolumn])

    if cc:
        sc = ax.scatter(x=log_x, y=log_y, c=data.control_coeff, cmap='Reds')
        # Raw string: '\G' is not a valid escape sequence in a normal string literal.
        fig.colorbar(sc, ax=ax, label=r'confidence score ($\Gamma$)',
                     boundaries=np.linspace(0., ub, 5))
    else:
        ax.scatter(x=log_x, y=log_y)

    # Dashed axes through the origin, with fixed view limits.
    ax.axhline(0.0, ls="--", c="k")
    ax.axvline(0.0, ls="--", c="k")
    ax.set_xlim((-3.0, 3.0))
    ax.set_ylim((-0.3, 0.3))

    # NOTE(review): `pl` is not defined in the visible cells — presumably a project
    # plotting helper; confirm.
    data1 = pd.concat((data[ycolumn], data[xcolumn]), axis=1)
    pl.add_percentage(data1, colname=ycolumn, colname2=xcolumn, ax=ax, show_sc_percentages=False,
                      l=(0.05, 0.95), l2=(0.55, 0.95), l3=(0.05, 0.25), l4=(0.55, 0.25),
                      fsize=15)

    ax.set_xlabel('Measured concentration foldchange (log2)')
    ax.set_ylabel('Predicted concentration foldchange (log2)')
    ax.set_title(figtitle)
    fig.tight_layout()

    if save:
        return fig.savefig(filename, dpi=300)
    return plt.show()


In [None]:
# Parity plot for the default grouping, rows with missing values removed.
# NOTE(review): `ub` only affects the colour bar, which is drawn only with cc=True (not passed here).
plot_parity(data=df.dropna(), xcolumn='metabolomics', ycolumn='prediction_proteomics', ub=0.6,
            save=True, filename='../images/UC/parity_mean_UC_patients_pvalue_1.png')


In [None]:
# Parity plot for the cytoplasm grouping.
# NOTE(review): `ub` only affects the colour bar, which is drawn only with cc=True (not passed here).
plot_parity(data=df1.dropna(), xcolumn='metabolomics', ycolumn='prediction_proteomics', ub=0.4,
            save=True, filename='../images/UC/parity_cytoplasm_UC_patients_pvalue.png')


In [None]:
# Parity plot for the mitochondrial grouping.
# NOTE(review): `ub` only affects the colour bar, which is drawn only with cc=True (not passed here).
plot_parity(data=df2.dropna(), xcolumn='metabolomics', ycolumn='prediction_proteomics', ub=0.6,
            save=True, filename='../images/UC/parity_mitochondrial_UC_patients_pvalue.png')
