In [None]:
import pandas as pd
import loader as load
import config
# aak_ge = load.loadGEWithClinical()

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# fig, ax = plt.subplots()

# print(data[0].cancer.unique())
# https://towardsdatascience.com/styling-pandas-plots-and-charts-9b4721e9e597
# Drawing semi-transparent lines (alpha) in matplotlib: https://stackoverflow.com/questions/4320021/matplotlib-transparent-line-plots
def plot_prediction_metrics_for_data_types(prediction_per_layer, layer_names, sampling, target, selection_type):
    """Plot precision, recall and f1-score per cancer type and save one PNG per metric.

    For every metric a figure is created with one subplot per cancer (taken from
    the ``cancer`` column of the first layer). Inside each subplot one line with
    error bars is drawn per layer, with the ``p`` values as categorical x-ticks.

    Parameters
    ----------
    prediction_per_layer : sequence of pandas.DataFrame
        One frame per layer. Each frame must provide the columns ``cancer``,
        ``p``, and for every metric both ``<metric>`` and ``<metric>-std``.
    layer_names : sequence of str
        Legend label for each entry of ``prediction_per_layer`` (same order).
    sampling, target, selection_type : str
        Used in the subplot titles and in the output path
        ``Visual/prediction/<sampling>/<target>/<selection_type>/metrics/<metric>.png``.

    Raises
    ------
    IndexError
        If ``prediction_per_layer`` is empty.
    """
    # Fixed palette; indexed modulo its length so more than three layers still plot.
    color_per_label = ["b", "g", "orange"]
    cancers = prediction_per_layer[0].cancer.unique()
    # Keep the original four-column layout, but widen the grid when there are
    # more cancers than that (add_subplot rejects an index outside the grid).
    n_columns = max(4, len(cancers))

    for metric in ["precision", "recall", "f1-score"]:
        fig = plt.figure(figsize=(24, 6))
        for i, cancer in enumerate(cancers):
            ax = fig.add_subplot(1, n_columns, i + 1)
            ax.set_title(f"{target} pred {cancer} | {sampling} | {selection_type}", fontsize = 10)
            ax.set_ylabel(metric, fontsize = 15)
            for j, pred in enumerate(prediction_per_layer):
                d = pred[[metric, f"{metric}-std", "cancer", "p"]]
                d = d[d.cancer == cancer]
                if d.empty:
                    # This layer has no rows for the current cancer: nothing to draw.
                    continue
                x_categorical_tick_sequence = range(d.p.nunique())
                # Necessary for easy way to prevent ugly x-axis scaling based on feature value
                d = d.astype({"p": str})
                # p == 0 is relabelled "all" (all features).
                d["p"] = d["p"].replace(['0'], 'all')
                # Put "all" features at the end.
                # NOTE(review): this rotates whichever p value comes first, so it
                # assumes the "all" row is the first row of each layer - confirm.
                feature_order = list(d.p.unique())
                feature_order.append(feature_order.pop(0))
                d.set_index("p").loc[feature_order].plot(
                    kind="line",
                    y=metric,
                    yerr=f"{metric}-std",
                    ax=ax,
                    xticks=x_categorical_tick_sequence,
                    label=layer_names[j],
                    ylim=(0, 1),
                    # Can use normal parameters of plot library
                    alpha=0.5,
                    color=color_per_label[j % len(color_per_label)])

        filename = f"Visual/prediction/{sampling}/{target}/{selection_type}/metrics/{metric}.png"
        load.createDirectory(filename)
        fig.savefig(filename, transparent=False, facecolor="white")

# Layer names; the commented-out example below uses them as CSV file-name prefixes.
files = [
    "aak_ge",
    "tcma_gen",
    "tcma_gen_aak_ge",
]
# data = [pd.read_csv(fr"Data\Descriptor\Prediction_Tables\random_sampling\stage\{x}_chi2_pred.csv", index_col=None) for x in files]
# plot_prediction_metrics_for_data_types(data, files, "random_sampling", "stage", "chi2")

In [None]:

import itertools
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

def generate_confusion_matrix(output_per_layer, classes, layer_names, sampling, target, selection_type, normalize=False):
    """Draw a grid of confusion matrices per cancer and save one PNG per cancer.

    For every cancer (taken from the ``cancer`` column of the first layer) a
    figure is created with one row per layer and one column per distinct ``p``
    value of that layer. Each cell shows the confusion matrix of ``actual``
    versus ``predicted`` for that (layer, p) selection.

    Parameters
    ----------
    output_per_layer : sequence of pandas.DataFrame
        One frame per layer with the columns ``cancer``, ``p``, ``predicted``
        and ``actual``. The labels passed to ``confusion_matrix`` are
        ``0 .. len(classes) - 1``, so ``predicted``/``actual`` are expected to
        hold those integer codes.
    classes : sequence of str
        Tick labels for the matrix axes; its length fixes the matrix size.
    layer_names : sequence of str
        Name of each layer, used in the subplot titles.
    sampling, target, selection_type : str
        Used in the subplot titles and in the output path
        ``Visual/prediction/<sampling>/<target>/<selection_type>/cnf/<cancer>.png``.
    normalize : bool, default False
        If True, matrices are normalized over all entries and shown as
        percentages on a fixed 0-100 colour scale.

    Returns
    -------
    numpy.ndarray or None
        The last confusion matrix that was computed, or None when nothing was
        plotted (previously this case raised UnboundLocalError).
    """
    class_indices = range(len(classes))
    tick_marks = np.arange(len(classes))
    fmt = '.2f' if normalize else 'd'
    cmap = plt.get_cmap('Blues')
    matrix = None

    for cancer in output_per_layer[0].cancer.unique():
        fig = plt.figure(figsize = (30, 18))

        for k, d in enumerate(output_per_layer):
            d = d[d.cancer == cancer]
            ps = d.p.unique()

            for i, p in enumerate(ps):
                d_with_p_features = d[d.p == p]
                predicted = d_with_p_features["predicted"].values
                actual = d_with_p_features["actual"].values

                ax = fig.add_subplot(len(output_per_layer), len(ps), len(ps) * k + i + 1)

                if normalize:
                    matrix = confusion_matrix(actual, predicted, labels=class_indices, normalize="all")
                    matrix *= 100
                    image = ax.imshow(matrix, interpolation='nearest', cmap=cmap, vmin=0, vmax=100)
                else:
                    matrix = confusion_matrix(actual, predicted, labels=class_indices)
                    image = ax.imshow(matrix, interpolation='nearest', cmap=cmap)

                fig.colorbar(image, ax=ax)
                ax.set_title(f"{cancer} {layer_names[k]} {target} pred (p:{p}) | {sampling} | {selection_type}")

                ax.set_xticks(tick_marks)
                ax.set_xticklabels(classes, rotation=45)
                ax.set_yticks(tick_marks)
                ax.set_yticklabels(classes)
                ax.set_ylabel('True label')
                ax.set_xlabel('Predicted label')

                # Cells darker than half the maximum get white text for contrast.
                thresh = matrix.max() / 2.
                # Distinct index names: the original reused `i` here and shadowed
                # the subplot index of the enclosing loop.
                for row, col in itertools.product(range(matrix.shape[0]), range(matrix.shape[1])):
                    ax.text(col, row, format(matrix[row, col], fmt), horizontalalignment="center",
                            color="white" if matrix[row, col] > thresh else "black")

        # Lay out once per figure, after every subplot has its labels.
        fig.tight_layout()
        filename = f"Visual/prediction/{sampling}/{target}/{selection_type}/cnf/{cancer}.png"
        load.createDirectory(filename)
        fig.savefig(filename, transparent=False, facecolor="white")

    return matrix

def computePredictionDeviation(metric, cancer, p, all_pred_outputs):
    """Return the standard deviation, across iterations, of a macro-averaged metric.

    Rows of ``all_pred_outputs`` are restricted to the given ``cancer`` and
    ``p``. For each distinct value of the ``iteration`` column a classification
    report is built from ``actual`` versus ``predicted``, and the ``macro avg``
    entry for ``metric`` is collected. The result is ``np.std`` of those values.
    """
    selection_mask = (all_pred_outputs.cancer == cancer) & (all_pred_outputs.p == p)
    selected = all_pred_outputs[selection_mask]

    def macro_average(iteration):
        # Macro-averaged `metric` for the rows of a single iteration.
        rows = selected[selected.iteration == iteration]
        report = classification_report(rows.actual, rows.predicted, output_dict=True, zero_division=0)
        return report["macro avg"][metric]

    per_iteration = np.array([macro_average(iteration) for iteration in selected.iteration.unique()])
    return np.std(per_iteration)

In [None]:
# todo: ADJUST RANGE OF GROUP
# NOTE(review): presumably refers to the [:1] slices below, which limit the run
# to the first target and the first selection type - confirm.
for target in config.prediction_targets[:1]:
    # Resolve the class labels up front so an unsupported target fails with a
    # clear error instead of an unbound or stale `labels` after the plots ran.
    if target == "stage":
        labels = ["Stage " + str(x) for x in range(5)]
    elif target == "tumor":
        labels = ["Normal", "Tumor"]
    else:
        raise ValueError(f"No confusion-matrix labels defined for target {target!r}")

    for sampling in config.sampling:
        for selection_type in config.selection_types[:1]:
            files = ['aak_ge', 'tcma_gen', 'tcma_gen_aak_ge']
            # Forward slashes work on every platform, matching the "Visual/..." save paths.
            table_dir = f"Data/Descriptor/Prediction_Tables/{sampling}/{target}"
            data = [pd.read_csv(f"{table_dir}/{x}_{selection_type}_pred.csv", index_col=None) for x in files]
            output_data = [pd.read_csv(f"{table_dir}/{x}_{selection_type}_predoutput.csv", index_col=None) for x in files]

            # Add a "<metric>-std" column to each metrics table, computed from the
            # matching per-sample output table of the same layer.
            for layer, layer_outputs in zip(data, output_data):
                for metric in ["precision", "recall", "f1-score"]:
                    layer[f"{metric}-std"] = layer.apply(lambda row: computePredictionDeviation(metric, row['cancer'], row['p'], layer_outputs), axis=1)

            plot_prediction_metrics_for_data_types(data, files, sampling, target, selection_type)
            generate_confusion_matrix(output_data, labels, files, sampling, target, selection_type)

In [None]:
# Hand-written labels to try out confusion_matrix.
actual1 = [0, 2, 2, 1, 4]
pred1 = [2, 2, 2, 0, 2]

actual2 = [1, 2, 2, 1, 3]
pred2 = [1, 2, 3, 0, 2]

# Pass `labels` explicitly; otherwise classes missing from the inputs give a
# matrix with the wrong dimensions.
cnf_matrix = confusion_matrix(
    actual1,
    pred1,
    labels=range(5),
    normalize="all",
)
# Second example is left as raw counts (no normalization).
cnf_matrix2 = confusion_matrix(
    actual2,
    pred2,
    labels=range(5),
)
cnf_matrix

In [None]:
# Load the "<name>_chi2_predoutput.csv" table of every layer (cv sampling, tumor target).
# Forward slashes keep the path usable on non-Windows systems as well.
files = ['aak_ge', 'tcma_gen', 'tcma_gen_aak_ge']
c_data = [pd.read_csv(f"Data/Descriptor/Prediction_Tables/cv/tumor/{x}_chi2_predoutput.csv", index_col=None) for x in files]
           

In [None]:
from sklearn.metrics import accuracy_score, classification_report, average_precision_score, log_loss
# labels = ["Stage " + str(x) for x in range(5)]

# generate_confusion_matrix(c_data, labels, files, "random", "stage", "chi2")
# generate_confusion_matrix(c_data, ["normal", "tumor"], files, "cv", "tumor", "chi2")

# Load, per layer, the per-sample outputs ("..._predoutput.csv") and the metrics
# table ("..._pred.csv") for cv sampling / tumor target / chi2 selection.
# Forward slashes keep the paths usable on non-Windows systems as well.
files = ['aak_ge', 'tcma_gen', 'tcma_gen_aak_ge']
c_data_output = [pd.read_csv(f"Data/Descriptor/Prediction_Tables/cv/tumor/{x}_chi2_predoutput.csv", index_col=None) for x in files]
c_data_pred = [pd.read_csv(f"Data/Descriptor/Prediction_Tables/cv/tumor/{x}_chi2_pred.csv", index_col=None) for x in files]

def computePredictionDeviation(metric, cancer, p, all_pred_outputs):
    """Return the standard deviation, across iterations, of a macro-averaged metric.

    Rows of ``all_pred_outputs`` are restricted to the given ``cancer`` and
    ``p``. For each distinct ``iteration`` a classification report is built
    from ``actual`` versus ``predicted`` and its ``macro avg`` value for
    ``metric`` is collected; the standard deviation of those values is returned.

    NOTE(review): this redefines the function of the same name from an earlier
    cell and shadows it for every cell run afterwards. The per-call debug print
    was removed so both definitions behave the same; consider deleting this copy.
    """
    current_outputs = all_pred_outputs[(all_pred_outputs.cancer == cancer) & (all_pred_outputs.p == p)]

    metric_means = []
    for iteration in current_outputs.iteration.unique():
        current_iteration_preds = current_outputs[current_outputs.iteration == iteration]
        cur_report = classification_report(current_iteration_preds.actual, current_iteration_preds.predicted, output_dict=True, zero_division=0)
        metric_means.append(cur_report["macro avg"][metric])
    return np.std(np.array(metric_means))


# Add a "<metric>-std" column to every metrics table, computed row by row from
# the per-sample outputs of the same layer (matched by position in the lists).
for i, layer in enumerate(c_data_pred):
    for metric in ["precision", "recall", "f1-score"]:
        std_column = f"{metric}-std"
        layer[std_column] = layer.apply(
            lambda row: computePredictionDeviation(metric, row["cancer"], row["p"], c_data_output[i]),
            axis=1,
        )

c_data_pred

In [None]:
# NOTE(review): `target` is not set in this cell; it is whatever value an earlier
# cell's `for target in ...` loop left behind, so this fails on a fresh kernel
# unless that cell ran first. This also rebinds `data` and `files`.
data, files = load.loadAll(
    sameSamples=True,
    includeStage=(target == "stage"),
)
files

In [None]:
# NOTE(review): assumes the first dataset returned by load.loadAll is the TCMA
# genus table - confirm against `files`.
tcma_gen = data[0]
# Every column except the last two is treated as a taxon.
# NOTE(review): presumably the trailing two columns are non-taxa (label/clinical) columns - confirm.
taxa = tcma_gen.iloc[:, :-2]
taxa_means = taxa.mean(axis=0)

# Remove zero values: where() turns zero means into NaN and dropna() removes
# them (together with any mean that already was NaN).
taxa_means_nonzero = taxa_means.where(lambda x : x != 0).dropna()

print(f"Dropped {len(taxa_means) - len(taxa_means_nonzero)} zero rows")

ax = taxa_means_nonzero.plot(kind="bar", figsize=(50,10))
# Label the figure so the saved image is readable on its own; the trailing
# semicolon keeps the notebook from echoing the return value.
ax.set(title="TCMA genus abundance (column means, zero means dropped)", xlabel="Taxon", ylabel="Mean abundance");

In [None]:
taxa_fig = ax.get_figure()
taxa_filename = "Visual/TCMA/tcma_genus_abundance.png"
# Same call the other save sites in this notebook make before savefig, so the
# save does not fail when the output directory is missing.
load.createDirectory(taxa_filename)
taxa_fig.savefig(taxa_filename, transparent=False, facecolor="white")