In [None]:
import pandas as pd
import loader as load
import config
# aak_ge = load.loadGEWithClinical()

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# fig, ax = plt.subplots()

# print(data[0].cancer.unique())

def plot_prediction_metrics_for_data_types(prediction_per_layer, layer_names, sampling, target, selection_type):
    """Plot precision, recall and f1-score curves, one figure per metric.

    Each figure contains one subplot per cancer type found in the first table
    of ``prediction_per_layer`` and one line per data layer. Every figure is
    saved to ``Visual/prediction/{sampling}/{target}/{selection_type}/{metric}.png``.

    Parameters
    ----------
    prediction_per_layer : list of pandas.DataFrame
        One table per data layer. Each table must provide the columns
        ``cancer``, ``p``, ``precision``, ``recall`` and ``f1-score``.
        ``p`` is presumably the number of selected features -- TODO confirm.
    layer_names : list of str
        Legend label for each table, in the same order as the tables.
    sampling, target, selection_type : str
        Used in the subplot titles and in the output path.
    """
    # The set of cancers does not depend on the metric, so compute it once.
    cancers = prediction_per_layer[0].cancer.unique()
    # Keep the four-slot layout for up to four cancers, but grow the grid
    # instead of raising when a fifth cancer type shows up.
    n_columns = max(4, len(cancers))

    for metric in ["precision", "recall", "f1-score"]:
        fig = plt.figure(figsize=(24, 6))
        for i, cancer in enumerate(cancers):
            ax = fig.add_subplot(1, n_columns, i + 1)
            ax.set_title(f"{target} pred {cancer} | {sampling} | {selection_type}", fontsize=10)
            ax.set_ylabel(metric, fontsize=15)

            for j, layer_table in enumerate(prediction_per_layer):
                curve = layer_table[[metric, "cancer", "p"]]
                curve = curve[curve.cancer == cancer]
                x_categorical_tick_sequence = range(curve.p.nunique())
                # Plot p as strings so the x-axis is categorical (evenly
                # spaced) instead of being scaled by the numeric value.
                curve = curve.astype({"p": str})
                curve.plot(kind="line", x="p", y=metric, ax=ax, xticks=x_categorical_tick_sequence, label=layer_names[j], ylim=(0, 1))

        filename = f"Visual/prediction/{sampling}/{target}/{selection_type}/{metric}.png"
        load.createDirectory(filename)
        # Save through the figure handle rather than implicit pyplot state.
        fig.savefig(filename, transparent=False, facecolor="white")

In [None]:
# Draw and save the metric curves for every combination of prediction target,
# sampling strategy and feature-selection method listed in config.
# Forward slashes keep the path usable on both Windows and POSIX systems.
files = ['aak_ge', 'tcma_gen', 'tcma_gen_aak_ge']
for target in config.prediction_targets:
    for sampling in config.sampling:
        for selection_type in config.selection_types:
            data = [pd.read_csv(f"Data/Descriptor/Prediction_Tables/{sampling}/{target}/{x}_{selection_type}_pred.csv", index_col=None) for x in files]
            plot_prediction_metrics_for_data_types(data, files, sampling, target, selection_type)

In [None]:
from sklearn.metrics import confusion_matrix

# Two small hand-made prediction/ground-truth pairs for trying out confusion_matrix.
actual1, pred1 = [0, 2, 2, 1, 4], [2, 2, 2, 0, 2]
actual2, pred2 = [1, 2, 2, 1, 3], [1, 2, 3, 0, 2]

# Pass labels explicitly: otherwise classes absent from the data are left out
# and the matrix comes back with the wrong dimensions.
cnf_matrix, cnf_matrix2 = (
    confusion_matrix(y_true, y_pred, labels=range(5))
    for y_true, y_pred in ((actual1, pred1), (actual2, pred2))
)
cnf_matrix

In [None]:
# Prediction outputs (random sampling, stage target, chi2 selection), one table per layer.
# Forward slashes keep the path usable on both Windows and POSIX systems.
files = ['aak_ge', 'tcma_gen', 'tcma_gen_aak_ge']
c_data = [pd.read_csv(f"Data/Descriptor/Prediction_Tables/random_sampling/stage/{x}_chi2_predoutput.csv", index_col=None) for x in files]
           

In [None]:

import itertools
import numpy as np
def generate_confusion_matrix(output_per_layer, classes, layer_names, sampling, target, selection_type, normalize=False, cancer="COAD"):
    """Draw one confusion matrix per feature count ``p`` for a single cancer type.

    Rows of every matrix are the true classes and columns the predicted
    classes, matching the axis captions.

    Parameters
    ----------
    output_per_layer : list of pandas.DataFrame
        Prediction outputs per data layer with the columns ``cancer``, ``p``,
        ``predicted`` and ``actual``. Only the first table is drawn.
    classes : list of str
        Tick label per class. Class codes are assumed to be the integers
        ``0 .. len(classes) - 1`` -- TODO confirm against the prediction tables.
    layer_names : list of str
        Name of each layer, used in the subplot titles.
    sampling, target, selection_type : str
        Used in the subplot titles only.
    normalize : bool, default False
        If True, every row is divided by its total so cells show the fraction
        of each true class. Rows without samples stay zero.
    cancer : str, default "COAD"
        Cancer type whose rows are selected from each table.

    Returns
    -------
    numpy.ndarray or None
        The last matrix drawn, or None when nothing was drawn.
    """
    if normalize:
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    class_codes = range(len(classes))
    tick_marks = np.arange(len(classes))
    fmt = '.2f' if normalize else 'd'
    # Stays None when there is no layer or no row for the requested cancer.
    matrix = None

    for k, d in enumerate(output_per_layer):
        if k > 0:
            # Only the first layer is drawn; the remaining layers are skipped.
            continue
        fig = plt.figure(figsize=(24, 6))
        d = d[d.cancer == cancer]
        ps = d.p.unique()

        for i, p in enumerate(ps):
            d_with_p_features = d[d.p == p]
            predicted = d_with_p_features["predicted"].values
            actual = d_with_p_features["actual"].values
            # scikit-learn expects (y_true, y_pred): rows = true, columns = predicted.
            matrix = confusion_matrix(actual, predicted, labels=class_codes)
            if normalize:
                row_totals = matrix.sum(axis=1, keepdims=True)
                # Skip empty rows so they stay zero instead of becoming NaN.
                matrix = np.divide(
                    matrix.astype('float'),
                    row_totals,
                    out=np.zeros(matrix.shape, dtype=float),
                    where=row_totals != 0,
                )

            ax = fig.add_subplot(1, len(ps), i + 1)
            image = ax.imshow(matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
            fig.colorbar(image, ax=ax)
            ax.set_title(f"{layer_names[k]} {target} pred {cancer} | {sampling} | {selection_type}")

            ax.set_xticks(tick_marks)
            ax.set_xticklabels(classes, rotation=45)
            ax.set_yticks(tick_marks)
            ax.set_yticklabels(classes)

            # White text on dark cells, black text on light cells.
            thresh = matrix.max() / 2.
            for row, col in itertools.product(range(matrix.shape[0]), range(matrix.shape[1])):
                ax.text(col, row, format(matrix[row, col], fmt), horizontalalignment="center",
                        color="white" if matrix[row, col] > thresh else "black")

            ax.set_ylabel('True label')
            ax.set_xlabel('Predicted label')

        if len(ps) > 0:
            # Lay out once, after every subplot and its captions exist.
            fig.tight_layout()

    return matrix
# Tick labels for the five stage classes, then draw the matrices for the chi2 outputs.
labels = [f"Stage {stage}" for stage in range(5)]
generate_confusion_matrix(
    output_per_layer=c_data,
    classes=labels,
    layer_names=files,
    sampling="random",
    target="stage",
    selection_type="chi2",
)

In [None]:
# NOTE(review): `target` is not defined in this cell; it is whatever value an
# earlier cell left bound. Confirm which target is intended here.
data, files = load.loadAll(
    includeStage=(target == "stage"),
    sameSamples=True,
)
files

In [None]:
# Bar chart of the mean value of every taxon column in the first loaded table.
tcma_gen = data[0]
# assumes the last two columns are non-taxa metadata -- TODO confirm
taxa = tcma_gen.iloc[:, :-2]
taxa_means = taxa.mean(axis=0)

# Remove zero values (entries whose mean is NaN are dropped and counted as well)
taxa_means_nonzero = taxa_means.where(lambda x : x != 0).dropna()

print(f"Dropped {len(taxa_means) - len(taxa_means_nonzero)} zero rows")

ax = taxa_means_nonzero.plot(kind="bar", figsize=(50,10))
# Label the chart so the saved image is readable on its own.
# presumably genus-level abundance, going by the output file name -- TODO confirm
ax.set(xlabel="Genus", ylabel="Mean abundance", title="TCMA mean genus abundance (non-zero only)");

In [None]:
# Save the taxa bar chart drawn on `ax`.
taxa_fig = ax.get_figure()
taxa_filename = "Visual/TCMA/tcma_genus_abundance.png"
# Same createDirectory-then-savefig sequence the prediction plots use, so the
# save does not depend on the folder already existing.
load.createDirectory(taxa_filename)
taxa_fig.savefig(taxa_filename, transparent=False, facecolor="white")