In [None]:
import pandas as pd
import numpy as np
import loader as load
import config
# aak_ge = load.loadGEWithClinical()

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# fig, ax = plt.subplots()

# print(data[0].cancer.unique())
# https://colorbrewer2.org/#type=qualitative&scheme=Dark2&n=3
# https://towardsdatascience.com/styling-pandas-plots-and-charts-9b4721e9e597
# Reference for drawing semi-transparent lines (alpha): https://stackoverflow.com/questions/4320021/matplotlib-transparent-line-plots
def plot_prediction_metrics_for_data_types(prediction_per_layer, layer_names, sampling, target, selection_type, root_folder="Visual/prediction/all_modalities", metrics=["precision", "recall", "f1-score"], ylimits={}):
    """Plot every metric against ``p`` and save one PNG per metric.

    Each figure holds one subplot per cancer (taken from the first layer's
    ``cancer`` column); inside a subplot every layer is drawn as one line with
    error bars.

    Parameters
    ----------
    prediction_per_layer : sequence of DataFrame
        One frame per layer. Each must provide the columns ``<metric>``,
        ``<metric>-std``, ``cancer`` and ``p`` for every requested metric.
    layer_names : sequence of str
        Legend label for the layer at the same position.
    sampling, target, selection_type : str
        Used in the subplot titles and in the output path.
    root_folder : str
        Root of the output path.
    metrics : sequence of str
        Metric columns to plot. The default list is only read, never mutated.
    ylimits : dict
        Optional ``metric -> (ymin, ymax)``; metrics without an entry use
        ``(0, 1)``. The default dict is only read, never mutated.
    """
    color_per_label = ["b", "g", "orange", "r"]
    cancers = prediction_per_layer[0].cancer.unique()
    # The layout used to be a fixed 1x4 grid, which raised for a fifth cancer.
    # Keep four columns as the minimum so existing figures look the same.
    n_columns = max(4, len(cancers))

    for metric in metrics:
        fig = plt.figure(figsize=(24, 6))
        ylim = ylimits.get(metric, (0, 1))

        for i, cancer in enumerate(cancers):
            ax = fig.add_subplot(1, n_columns, i + 1)
            ax.set_title(f"{target} pred {cancer} | {sampling} | {selection_type}", fontsize = 10)
            ax.set_ylabel(metric, fontsize = 15)

            for j, pred in enumerate(prediction_per_layer):
                d = pred[[metric, f"{metric}-std", "cancer", "p"]]
                d = d[d.cancer == cancer].sort_values(by="p")
                x_categorical_tick_sequence = range(d.p.nunique())

                # p == 0 stands for "all" features. Detect it numerically so
                # 0, 0.0 and "0" are all recognised, before p becomes a string.
                uses_all_features = (pd.to_numeric(d["p"], errors="coerce") == 0).to_numpy()
                # String labels give evenly spaced categorical x positions
                # instead of an axis scaled by the feature count.
                d = d.astype({"p": str})
                d.loc[uses_all_features, "p"] = "all"

                # Put "all" at the end, but only when it is actually present;
                # otherwise the smallest p would be rotated to the end.
                feature_order = [label for label in d.p.unique() if label != "all"]
                if uses_all_features.any():
                    feature_order.append("all")

                d.set_index("p").loc[feature_order].plot(
                    kind="line",
                    y=metric,
                    yerr=f"{metric}-std",
                    ax=ax,
                    xticks=x_categorical_tick_sequence,
                    label=layer_names[j],
                    ylim=ylim,
                    alpha=0.5,
                    # Cycle through the palette so more than four layers do not raise.
                    color=color_per_label[j % len(color_per_label)])

        filename = f"{root_folder}/{sampling}/{target}/{selection_type}/metrics/{metric}.png"
        # Project helper; presumably creates the missing parent directory.
        load.createDirectory(filename)
        fig.savefig(filename, transparent=False, facecolor="white")

def plot_frac_features_selected(prediction_layer, layer_name, sampling, target, selection_type, show_ge=True, absolute_amount=False):
    """Plot, per cancer, how many selected features belong to one modality.

    For every ``p`` (rows with ``p == 0`` are skipped) and every iteration the
    number of selected features that are part of the chosen modality is
    counted. The count is divided by ``p`` unless ``absolute_amount`` is set.
    Mean and standard deviation over the iterations are drawn as error bars,
    one line per cancer, and the figure is saved as a PNG.

    Parameters
    ----------
    prediction_layer : DataFrame
        Needs the columns ``cancer``, ``p``, ``iteration`` and ``features``.
    layer_name, sampling, target, selection_type : str
        Used in the title and/or the output path.
    show_ge : bool
        Chooses which of the two feature lists returned by
        ``load.getFeatures()`` is counted (second one when True).
    absolute_amount : bool
        Plot raw counts instead of counts divided by ``p``.
    """
    fig, ax = plt.subplots(figsize = (12, 6))
    feature_groups, _ = load.getFeatures()
    genus_feature_names, ge_feature_names = feature_groups
    modality_features = ge_feature_names if show_ge else genus_feature_names

    ax.set_title(f"{target} pred | {layer_name} | {sampling} | {selection_type} | out of {len(modality_features)} total features", fontsize = 10)

    # The axis label does not depend on the cancer, so build it once.
    quantity = "nr." if absolute_amount else "frac"
    modality = " GE" if show_ge else " GENUS"
    y_label = f"{quantity}{modality} selected"

    for cancer in prediction_layer.cancer.unique():
        ax.set_ylabel(y_label, fontsize = 15)
        ax.set_xlabel("p", fontsize = 15)

        rows = prediction_layer[prediction_layer.cancer == cancer].sort_values(by="p")
        # String labels keep the x axis categorical.
        rows = rows[rows.p != 0].astype({"p": str})
        ps = rows.p.unique()

        means = []
        stds = []
        for p in ps:
            rows_p = rows[rows.p == p]
            normalizer = 1 if absolute_amount else int(p)
            is_modality_feature = rows_p.features.isin(modality_features)
            per_iteration = np.array([
                len(rows_p[(rows_p.iteration == iteration) & is_modality_feature].index) / normalizer
                for iteration in rows_p.iteration.unique()
            ])
            means.append(np.mean(per_iteration))
            stds.append(np.std(per_iteration))

        ax.errorbar(ps, means, label=cancer, yerr=stds, alpha=0.5)
        ax.legend()

    quantity_file_name = "nr" if absolute_amount else "frac"
    modality_file_name = "_ge" if show_ge else "_genus"
    filename = f"Visual/prediction/{sampling}/{target}/{selection_type}/features/{quantity_file_name}_selected{modality_file_name}.png"
    load.createDirectory(filename)
    plt.savefig(filename, transparent=False, facecolor="white")


# File-name prefixes of the per-modality prediction tables.
files = ["aak_ge", "tcma_gen", "tcma_gen_aak_ge"]

# Experiment selection used by the manual examples below.
sampling = "random_sampling"
target = "tumor"
selection_type = "linreg"

# Manual usage examples (disabled):
# data = [pd.read_csv(fr"Data\Descriptor\Prediction_Tables\random_sampling\stage\{x}_chi2_pred.csv", index_col=None) for x in files]
# plot_prediction_metrics_for_data_types(data, files, "random_sampling", "stage", "chi2")
# features_data = pd.read_csv(fr"Data\Descriptor\Prediction_Tables\{sampling}\{target}\tcma_gen_aak_ge_{selection_type}_predfeature.csv", index_col=None)
# plot_frac_features_selected(features_data, "tcma_gen_aak_ge", sampling, target, selection_type, show_ge=False, absolute_amount=True)

In [None]:

import itertools
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error

def generate_confusion_matrix(output_per_layer, classes, layer_names, sampling, target, selection_type, normalize=False):
    """Draw a grid of confusion matrices per cancer and save one PNG per cancer.

    Each figure has one row per layer and one column per distinct ``p`` of that
    layer (sorted ascending).

    Parameters
    ----------
    output_per_layer : sequence of DataFrame
        One frame per layer with the columns ``cancer``, ``p``, ``predicted``
        and ``actual``. Cancers are taken from the first frame.
    classes : sequence of str
        Tick labels. The class ids passed to ``confusion_matrix`` are
        ``range(len(classes))``, so labels are expected to be 0..n-1.
    layer_names : sequence of str
        Title prefix for the layer at the same position.
    sampling, target, selection_type : str
        Used in the output path (``target`` also in the titles).
    normalize : bool
        When True, cells hold percentages of all samples (0-100) instead of
        counts.

    Returns
    -------
    numpy.ndarray or None
        The last matrix that was drawn, or None when nothing was plotted
        (previously this case raised UnboundLocalError).
    """
    class_ids = range(len(classes))
    tick_marks = np.arange(len(classes))
    cell_format = '.2f' if normalize else 'd'
    color_map = plt.get_cmap('Blues')
    matrix = None

    for cancer in output_per_layer[0].cancer.unique():
        fig = plt.figure(figsize = (30, 18))

        for k, layer_output in enumerate(output_per_layer):
            d = layer_output[layer_output.cancer == cancer].sort_values(by="p")
            ps = d.p.unique()

            for column, p in enumerate(ps):
                d_with_p_features = d[d.p == p]
                predicted = d_with_p_features["predicted"].values
                actual = d_with_p_features["actual"].values

                ax = fig.add_subplot(len(output_per_layer), len(ps), len(ps) * k + column + 1)

                if normalize:
                    matrix = confusion_matrix(actual, predicted, labels=class_ids, normalize="all") * 100
                    # Fixed colour range so all normalised panels are comparable.
                    image = ax.imshow(matrix, interpolation='nearest', cmap=color_map, vmin=0, vmax=100)
                else:
                    matrix = confusion_matrix(actual, predicted, labels=class_ids)
                    image = ax.imshow(matrix, interpolation='nearest', cmap=color_map)

                fig.colorbar(image, ax=ax)
                ax.set_title(f"{cancer} {layer_names[k]} {target} pred (p:{p})")

                ax.set_xticks(tick_marks)
                ax.set_xticklabels(classes, rotation=45)
                ax.set_yticks(tick_marks)
                ax.set_yticklabels(classes)

                # White text on dark cells, black text on light cells.
                thresh = matrix.max() / 2.
                for row, col in itertools.product(range(matrix.shape[0]), range(matrix.shape[1])):
                    ax.text(col, row, format(matrix[row, col], cell_format), horizontalalignment="center",
                            color="white" if matrix[row, col] > thresh else "black")

                ax.set_ylabel('True label')
                ax.set_xlabel('Predicted label')

        # One layout pass per figure, after every label exists.
        fig.tight_layout()
        filename = f"Visual/prediction/{sampling}/{target}/{selection_type}/cnf/{cancer}.png"
        load.createDirectory(filename)
        fig.savefig(filename, transparent=False, facecolor="white")

    return matrix

def computeMetric(metric, cancer, p, all_pred_outputs, std=True):
    """Aggregate a metric over the iterations of one (cancer, p) combination.

    The metric is computed separately for every distinct ``iteration`` among
    the rows matching ``cancer`` and ``p``; the per-iteration values are then
    reduced to their standard deviation (``std=True``) or mean (``std=False``).

    Parameters
    ----------
    metric : str
        ``"precision"``, ``"recall"`` or ``"f1-score"`` (macro average from
        ``classification_report`` on ``actual`` vs ``predicted``), or
        ``"rmse"`` (root mean squared error of ``actual`` vs
        ``predicted-raw``).
    cancer, p
        Values matched against the ``cancer`` and ``p`` columns.
    all_pred_outputs : DataFrame
        Needs the columns ``cancer``, ``p``, ``iteration``, ``actual`` and
        ``predicted`` or ``predicted-raw`` depending on the metric.
    std : bool
        Return the standard deviation instead of the mean.

    Returns
    -------
    float
        NaN when no row matches.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    classification_metrics = ("precision", "recall", "f1-score")
    if metric != "rmse" and metric not in classification_metrics:
        raise ValueError(f"Unsupported metric: {metric!r}")

    current_outputs = all_pred_outputs[(all_pred_outputs.cancer == cancer) & (all_pred_outputs.p == p)]

    per_iteration = []
    for iteration in current_outputs.iteration.unique():
        iteration_preds = current_outputs[current_outputs.iteration == iteration]

        if metric == "rmse":
            # The previous call used mean_squared_error(..., squared=True),
            # which returns the MSE. Compute the root with numpy so the result
            # does not depend on the `squared` keyword, which newer
            # scikit-learn releases deprecate and remove.
            errors = (iteration_preds["actual"].to_numpy(dtype=float)
                      - iteration_preds["predicted-raw"].to_numpy(dtype=float))
            cur_metric = np.sqrt(np.mean(np.square(errors)))
        else:
            report = classification_report(iteration_preds.actual, iteration_preds.predicted, output_dict=True, zero_division=0)
            cur_metric = report["macro avg"][metric]
        per_iteration.append(cur_metric)

    per_iteration = np.array(per_iteration)
    return np.std(per_iteration) if std else np.mean(per_iteration)

In [None]:
import os

def createVisualizations(include_parity=False, confusion=False, features=False):
    """Render the prediction plots for every configured experiment.

    Iterates over ``config.prediction_targets[1:]``, ``config.sampling`` and
    ``config.selection_types[:2]``. For each combination the ``*_pred.csv``
    and ``*_predoutput.csv`` tables of every modality are read, the metric
    columns are (re)computed from the raw outputs and the metric plot is
    written.

    Parameters
    ----------
    include_parity : bool
        Also include the ``tcma_gen_aak_ge(parity)`` tables and write to the
        ``base_modalities`` folder.
    confusion : bool
        Additionally write the confusion-matrix figures.
    features : bool
        Additionally write the selected-feature plots for the combined
        modality.

    Raises
    ------
    ValueError
        If ``confusion`` is set and the target has no class labels defined.
    """
    combined_file = 'tcma_gen_aak_ge'
    parity_file = 'tcma_gen_aak_ge(parity)'

    for target in config.prediction_targets[1:]:
        for sampling in config.sampling:
            for selection_type in config.selection_types[:2]:
                output_root_folder = "Visual/prediction/all_modalities"
                files = ['aak_ge', 'tcma_gen', combined_file]
                layer_names = ["GE", "GENUS", "GE ∩ GENUS"]

                if include_parity:
                    output_root_folder = "Visual/prediction/base_modalities"
                    # list.append works in place and returns None; assigning its
                    # result used to replace both lists with None.
                    files.append(parity_file)
                    layer_names.append("GE ∩ GENUS (parity)")

                prediction_tables_dir = f"Data/Descriptor/Prediction_Tables/{sampling}/{target}"
                data = []
                output_data = []
                for x in files:
                    data_selection_dir = f"{prediction_tables_dir}/{x}_{selection_type}"
                    if x == parity_file:
                        # The parity suffix goes after the selection type in the file name.
                        data_selection_dir = f"{prediction_tables_dir}/tcma_gen_aak_ge_{selection_type}(parity)"
                    data.append(pd.read_csv(data_selection_dir + "_pred.csv", index_col=None))
                    output_data.append(pd.read_csv(data_selection_dir + "_predoutput.csv", index_col=None))

                metrics = ["precision", "recall", "f1-score"]
                if target == "stage":
                    metrics.append("rmse")

                # NOTE(review): only the last metric is plotted; this looks like
                # a leftover restriction - confirm before removing.
                metrics = metrics[-1:]
                max_rmse = 0

                for i, layer in enumerate(data):
                    for metric in metrics:
                        if metric == "rmse":
                            layer[metric] = layer.apply(lambda row: computeMetric(metric, row['cancer'], row['p'], output_data[i], std=False), axis=1)
                            # Series.max skips NaN, unlike the builtin max.
                            max_rmse = max(max_rmse, layer[metric].max())
                        layer[f"{metric}-std"] = layer.apply(lambda row: computeMetric(metric, row['cancer'], row['p'], output_data[i], std=True), axis=1)

                ylimits = {"rmse": (0, max_rmse)}
                plot_prediction_metrics_for_data_types(data, layer_names, sampling, target, selection_type, output_root_folder, metrics=metrics, ylimits=ylimits)

                if confusion:
                    if target == "stage":
                        labels = ["Stage " + str(x) for x in range(5)]
                    elif target == "tumor":
                        labels = ["Normal", "Tumor"]
                    else:
                        # Previously fell through with undefined (or stale) labels.
                        raise ValueError(f"No class labels defined for target {target!r}")

                    generate_confusion_matrix(output_data, labels, layer_names, sampling, target, selection_type)

                if features:
                    # Forward slash, like every other path built in this function.
                    features_data = pd.read_csv(f"{prediction_tables_dir}/{combined_file}_{selection_type}_predfeature.csv", index_col=None)
                    # The feature table belongs to the combined modality; [-2]
                    # only pointed at it when the parity layer was appended.
                    combined_layer_name = layer_names[files.index(combined_file)]
                    for ge_modality, absolute_amount in [(True, False), (False, True)]:
                        plot_frac_features_selected(features_data, combined_layer_name, sampling, target, selection_type, show_ge=ge_modality, absolute_amount=absolute_amount)
# Metric plots only: no parity layer, no confusion matrices, no feature plots.
createVisualizations(include_parity=False, confusion=False, features=False)

In [None]:
files = ['aak_ge', 'tcma_gen', 'tcma_gen_aak_ge']
target = "stage"
sampling = "random_sampling"
selection_type = "linreg"

# Forward slashes work on every platform; the backslash form was Windows-only
# and inconsistent with the other paths in this notebook.
prediction_tables_dir = f"Data/Descriptor/Prediction_Tables/{sampling}/{target}"
# c_data_pred is not used below; kept so the name stays available in the session.
c_data_pred = [pd.read_csv(f"{prediction_tables_dir}/{x}_{selection_type}_pred.csv", index_col=None) for x in files]
c_data_features = [pd.read_csv(f"{prediction_tables_dir}/{x}_{selection_type}_predfeature.csv", index_col=None) for x in files]

# Bin edges for the share of iterations in which a feature was selected.
frequency_bin_edges = [0, 0.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]

for cancer in c_data_features[0].cancer.unique():
    fig = plt.figure(figsize = (48, 6))
    for k, layer_features in enumerate(c_data_features):
        ax = fig.add_subplot(1, len(c_data_features), k + 1)
        ax.set_title(f"{target} pred {cancer} | {files[k]} | {sampling} | {selection_type}", fontsize = 10)
        ax.set_ylabel("nr. features selected", fontsize = 15)
        ax.set_xlabel("frequency selected", fontsize = 15)

        d = layer_features[layer_features.cancer == cancer]
        ps = d.p.unique()
        # Only the third-from-last p (in order of appearance) is plotted.
        for p in ps[-3:-2]:
            # Filter into a new name so the per-cancer frame is not overwritten.
            d_p = d[d.p == p]
            total_iterations_count = d_p.iteration.nunique()

            # Rows per feature, i.e. how often each feature was selected
            # across the iterations, as a share of the iteration count.
            feature_occurrences_d = d_p.features.value_counts().rename("count").to_frame()
            feature_occurrences_d = feature_occurrences_d / total_iterations_count
            feature_occurrences_d = feature_occurrences_d.sort_values("count", ascending=False)

            bins = pd.cut(feature_occurrences_d["count"], bins=frequency_bin_edges, include_lowest=True)
            # Percentage of features per frequency bin.
            bins_norm = bins.value_counts(sort=False, normalize=True).mul(100)
            bins_norm.plot.bar(
                rot=0,
                color="b",
                ax=ax,
                ylim=(0, 100))

# print(feature_occurrences_d)

def plot_feature_occurrences_across_iterations(feature_occurrences):
    """Bar-plot how many entries of ``feature_occurrences["count"]`` fall in each bin.

    The bins have width 0.1 and cover [0, 1], with the lowest edge included.
    The binned series is also printed.
    """
    frequency_edges = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    binned = pd.cut(feature_occurrences["count"], bins=frequency_edges, include_lowest=True)
    per_bin_counts = binned.value_counts(sort=False)
    per_bin_counts.plot.bar(rot=0, color="b", figsize=(6, 4))
    print(binned)
    # out_norm = out.value_counts(sort=False, normalize=True).mul(100)
    # ax = out_norm.plot.bar(rot=0, color="b", figsize=(6,4))
    # ax.set_xticklabels([c[1:-1].replace(","," to") for c in out.cat.categories])
    # plt.ylabel("pct")
    # plt.show()
# plot_feature_occurrences_across_iterations(feature_occurrences_d)

In [None]:
# Relies on `target` as set by an earlier cell.
include_stage = (target == "stage")
data, files = load.loadAll(includeStage=include_stage, sameSamples=True)
files

In [None]:
tcma_gen = data[0]
aak_ge = data[1]
# Feature columns are all columns except the last two
# (presumably label/metadata columns - TODO confirm).
tcma_gen_features = tcma_gen.columns[:-2]
aak_ge_features = aak_ge.columns[:-2]

# Write one comma-separated feature list per modality, reusing the slices above.
for idx, feature_names in enumerate((tcma_gen_features, aak_ge_features)):
    with open(f"Data/Descriptor/Description/{files[idx]}_features.txt", "w") as out_file:
        out_file.write(",".join(feature_names))

# for i in range(2):
#     with open(f"Data/Descriptor/Description/{files[i]}_features.txt", "r") as out_file:
#         raw_features = out_file.read()
#         features = raw_features.split(",")
# features[0][0]

In [None]:
tcma_gen = data[0]
# Every column except the last two is averaged over the rows.
taxa = tcma_gen.iloc[:, :-2]
taxa_means = taxa.mean(axis=0)

# Remove zero values (missing means are dropped as well)
taxa_means_nonzero = taxa_means[taxa_means != 0].dropna()

n_dropped = len(taxa_means) - len(taxa_means_nonzero)
print(f"Dropped {n_dropped} zero rows")

ax = taxa_means_nonzero.plot.bar(figsize=(50, 10))

In [None]:
# Relies on `ax` from the bar plot created in the previous cell.
taxa_fig = ax.get_figure()
taxa_filename = "Visual/TCMA/tcma_genus_abundance.png"
# Same pattern as the other savefig calls in this notebook: the project helper
# is called on the target path first (presumably creating missing directories).
load.createDirectory(taxa_filename)
taxa_fig.savefig(taxa_filename, transparent=False, facecolor="white")