This file plots out the results from Snakefile Run 2
(samples 8; methods: linreg, boosting, elastic, forest;
features: single, forest, string, string_hi, all)

Note that boosting used a larger step size (0.1) and fewer n_estimators (200).
There are fewer alpha hyperparameter values to optimize over in ElasticNetCV.
The results are output as a dictionary (model, metrics).

In [4]:
# Directory that holds the pickled model results ('<tumor>_<method>_<feature>.p');
# the per-model '*_metrics.csv' summaries are also written here by a later cell.
data_dir = '/Volumes/Ginkgo_Data/Data/2021-11_CPTACTrainedModels/out3'
# Directory that receives the exported per-gene result CSV files.
out_dir = '01_ExportModelResults_Output/out3'
# NOTE: out2 was the previous result set; it had a duplicated tumor set 4 and
# did not ensure log read expression in all tables.

In [2]:
import pickle
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from predict_protein import download_cptac, select_features, train_model
from predict_protein import utils

# Module-level alias so the cells below can call get_dataframe(path) directly.
# It is called with the path of a '.p' result file and its return value is used
# as a pandas DataFrame (indexed by column name, .describe(), .to_csv()).
get_dataframe = utils.get_dataframe

def get_median_corr(path):
    """Return the median of the 'corr_test' column of the results stored at *path*.

    The results are loaded with ``get_dataframe(path)``.
    """
    test_corr = get_dataframe(path)['corr_test']
    return np.median(test_corr)

def plot_corr(path):
    """Plot the distribution of the ``corr_*`` columns of a result file.

    The results are loaded with ``get_dataframe(path)`` and reshaped to long
    format so that every ``corr_<set>`` column becomes one facet of a seaborn
    ``displot``. Each facet is annotated with a dashed vertical line and a text
    label at the median of its values. The figure is saved next to *path* with
    the extension replaced by ``.pdf``.

    Parameters
    ----------
    path
        Path of the result file to load.

    Returns
    -------
    The grid object returned by ``sns.displot``.
    """
    results = get_dataframe(path)

    # wide_to_long needs an explicit id column; reuse the index for it.
    results['id'] = results.index
    long_df = pd.wide_to_long(
        results,
        ['corr'],
        i='id',
        j='set',
        sep='_',
        suffix=r'\w+',
    )

    # Per-set summary statistics; only the median is drawn below.
    summary = long_df.groupby('set')['corr'].agg(['mean', 'median'])

    grid = sns.displot(
        data=long_df,
        x='corr',
        col='set',
        facet_kws=dict(sharey=False, sharex=False),
    )

    for panel in grid.axes.flatten():
        # Facet titles look like "set = <name>"; recover <name> to look up its stats.
        set_name = panel.get_title().split(' = ')[1]
        stats = summary.loc[set_name, :]
        median = stats['median']

        label = 'median: ' + str(np.round(median, 3))
        panel.text(x=median, y=2, s=label)
        # Dashed marker line at the median.
        panel.axvline(x=median, c='orange', ls='--', lw=2.5)

    pdf_path = os.path.splitext(path)[0] + '.pdf'
    grid.savefig(pdf_path)

    return grid


In [3]:
# NOTE: this cell is currently not run because the figures are plotted in R instead.

# for tumor in ['tumor2', 'tumor8']:
#     for method in ['forest', 'elastic', 'linreg']:
#         for feature in ['single', 'corum', 'string', 'stringhi', 'all']:
#             fname =  f'{tumor}_{method}_{feature}'
#             plot_corr(os.path.join(data_dir, f'{fname}.p'))

In [5]:
# Export the per-gene results of every available tumor8 model to a CSV file in
# out_dir. A combination is skipped when its pickle is missing or when its CSV
# already exists, so the cell can be re-run without redoing finished exports.
for tumor in ['tumor8',]:
    for method in ['linreg', 'elastic', 'forest']:
        for feature in ['single', 'corum', 'string', 'stringhi', 'all']:
            fname = f'{tumor}_{method}_{feature}'
            pickle_path = os.path.join(data_dir, f'{fname}.p')
            csv_path = f'{out_dir}/01_ExportModelResults_{tumor}_{method}_{feature}.csv'

            if not os.path.exists(pickle_path) or os.path.exists(csv_path):
                continue

            # DataFrame.to_csv does not create missing parent directories, so
            # make sure the output directory exists before the first write.
            os.makedirs(out_dir, exist_ok=True)

            genewise_result = get_dataframe(pickle_path)
            genewise_result.to_csv(csv_path)


In [7]:

# For every (tumor, method, feature) model whose pickle exists but whose
# '<name>_metrics.csv' does not, write the describe() summary of its results,
# tagged with the model identifiers, to data_dir.
for tumor in ['tumor2', 'tumor3', 'tumor4', 'tumor5', 'tumor6', 'tumor7', 'tumor8',]:
    for method in ['forest', 'elastic', 'linreg', 'boosting']:
        for feature in ['single', 'corum', 'string', 'stringhi', 'all']:
            fname = f'{tumor}_{method}_{feature}'
            metrics_path = os.path.join(data_dir, f'{fname}_metrics.csv')
            pickle_path = os.path.join(data_dir, f'{fname}.p')

            # Summary already written by an earlier run.
            if os.path.exists(metrics_path):
                continue
            # No results available for this combination.
            if not os.path.exists(pickle_path):
                continue

            df__ = get_dataframe(pickle_path)
            df2 = df__.describe()
            df2['tumor'] = tumor
            df2['method'] = method
            df2['feature'] = feature
            # Keep the statistic name (count, mean, 50%, ...) as a regular column.
            df2['measure'] = df2.index
            df2.to_csv(metrics_path)


In [8]:
# Gather every per-model metrics CSV that exists in data_dir, stack them into a
# single table and save it in the working directory.
all_metrics = []

for tumor in ['tumor2', 'tumor3', 'tumor4', 'tumor5', 'tumor6', 'tumor7', 'tumor8']:
    for method in ['forest', 'elastic', 'linreg', 'boosting']:
        for feature in ['single', 'corum', 'string', 'stringhi', 'all']:
            fname = f'{tumor}_{method}_{feature}'
            metrics_path = os.path.join(data_dir, f'{fname}_metrics.csv')
            if not os.path.exists(metrics_path):
                continue
            read_df = pd.read_csv(metrics_path)
            all_metrics.append(read_df)

metrics_df = pd.concat(all_metrics, ignore_index=True)
metrics_df.to_csv('01_ExportModelResults_metrics_out3.csv')

In [None]:
# Keep only the median ('50%') row of each model's summary and reshape the
# 'corr_<set>' columns to long format: one row per (tumor, method, feature, set).
is_median_row = metrics_df['measure'] == '50%'
medians = metrics_df[is_median_row].copy()
medians2 = pd.wide_to_long(
    medians,
    'corr',
    i=['tumor', 'method', 'feature'],
    j='set',
    sep='_',
    suffix='\\w+',
)

# One panel per method: median correlation across tumors, coloured by feature
# and with a different line style per set.
relplot_options = dict(
    x='tumor',
    y='corr',
    col='method',
    col_wrap=3,
    palette='viridis',
    hue='feature',
    style='set',
    kind='line',
    marker='o',
)
g = sns.relplot(data=medians2, **relplot_options)
g.set(ylim=(0, 1))
g.savefig('Metrics.png')