# CNN Performance in Independent Set

This file is part of the Verifying explainability of a deep learning tissue classifier trained on RNA-seq data project.

Verifying explainability of a deep learning tissue classifier trained on RNA-seq data project is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.


Verifying explainability of a deep learning tissue classifier trained on RNA-seq data project is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with the Verifying explainability of a deep learning tissue classifier trained on RNA-seq data project.  If not, see <http://www.gnu.org/licenses/>.


### Objective:
> Load HPA dataset and compare to GTEx dataset using UMAP and CNN performance

### Input files:
1. *gtex_filtered_tmm_intersect_{data_type}.pkl*
2. *independent_log2_tmm_update.pkl*
3. *filtered_genes.pkl*

### Output files:
1. *all_genes_umap_hpa_{data_type}.svg*  
 
### Table of contents:
1. [Import Modules](#1.-Import-Modules)  
2. [Set static paths](#2.-Set-static-paths)  
3. [Load files](#3.-Load-files)  
    3.1 [Load GTEx RNAseq](#3.1-Load-GTEx-RNAseq)  
    3.2 [Load HPA RNAseq](#3.2-Load-HPA-RNAseq)  
    3.3 [Load gene list](#3.3-Load-gene-list)  
4. [Process data](#4.-Process-data)  
    4.1 [Reshape dataframe](#4.1-Reshape-dataframe)  
    4.2 [Filter genes](#4.2-Filter-genes)  
    4.3 [Add labels](#4.3-Add-labels)
5. [Apply UMAP](#5.-Apply-UMAP)  
6. [Test performance of 2DCNN on HPA](#6.-Test-performance-of-2DCNN-on-HPA)

## 1. Import Modules

In [None]:
import os
# Run the notebook from the project's source directory. Every relative path
# used later in this notebook (e.g. "../data/", "../models/") is resolved from
# this working directory, and the `modelling` package imported in the next cell
# is presumably located here -- verify against the repository layout.
util_path = '../src'
os.chdir(util_path)

In [None]:
import pickle
from modelling.cnn import log_transform
import umap
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure 
import numpy as np

from keras import backend as K
from keras.models import model_from_json
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import f1_score

from modelling.cnn import convert_2d, convert_onehot, keras_cnn, log_transform

In [None]:
# IPython autoreload, mode 2: re-import modules before each cell execution, so
# edits to local project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## 2. Set static paths

In [None]:
# Root data folder, relative to the working directory.
data_dir = "../data/"
# Dataset variant; interpolated into the GTEx, model and figure file names.
data_type = "imbalanced"

In [None]:
# Locations derived from the data root.
input_dir = f"{data_dir}processed/"
gene_dir = f"{data_dir}gene_lists/"
output_dir = f"{data_dir}proc/"
# Locations outside the data root.
model_dir = "../models/"
fig_dir = "../figures/"

## 3. Load files

#### 3.1 Load GTEx RNAseq

In [None]:
# GTEx expression table (the dataset the UMAP embedding is fitted on).
gtex_path = input_dir + f"gtex_filtered_tmm_intersect_{data_type}.pkl"
with open(gtex_path, "rb") as gtex_file:
    gtex = pickle.load(gtex_file)

#### 3.2 Load HPA RNAseq

In [None]:
# HPA expression table (the independent dataset).
hpa_path = input_dir + "independent_log2_tmm_update.pkl"
with open(hpa_path, "rb") as hpa_file:
    hpa = pickle.load(hpa_file)

In [None]:
# Apply the project's log transform to GTEx. `label=True` presumably tells the
# helper to leave the "type" label column untouched -- TODO confirm in
# modelling.cnn.
# NOTE(review): HPA is not transformed at this point, but it is passed through
# log_transform later, right before the CNN. Its file name suggests it is
# already log2-scaled -- confirm both datasets end up on the same scale for
# the UMAP comparison and for the CNN input.
gtex = log_transform(gtex, label=True)

#### 3.3 Load gene list

In [None]:
# Gene list used to select expression columns in both datasets.
gene_list_path = gene_dir + "filtered_genes.pkl"
with open(gene_list_path, "rb") as gene_file:
    filtered_genes = pickle.load(gene_file)

In [None]:
# Drop SLURP2 from the gene list (flagged by the authors as a duplicated gene).
# NOTE: assuming `filtered_genes` is a list, `remove` deletes only the first
# occurrence and raises ValueError when the name is absent, so re-running this
# cell without reloading the list will fail.
filtered_genes.remove("SLURP2")

## 4. Process data

In [None]:
# GTEx: keep only the filtered gene columns, then re-attach the "type" label.
# `.copy()` makes the subset an independent DataFrame, so adding a column to it
# cannot trigger pandas' SettingWithCopyWarning or write into a temporary.
gtex_filt = gtex.loc[:, filtered_genes].copy()
gtex_filt["type"] = gtex.loc[:, "type"]

In [None]:
# HPA: keep only the filtered gene columns, then re-attach the "type" label.
# `.copy()` makes the subset an independent DataFrame, so adding a column to it
# cannot trigger pandas' SettingWithCopyWarning or write into a temporary.
hpa_filt = hpa.loc[:, filtered_genes].copy()
hpa_filt["type"] = hpa.loc[:, "type"]

## 5. Apply UMAP

#### 5.1 Prepare GTEx data for UMAP 

In [None]:
# Expression values only: UMAP must not see the "type" label column.
unlabeled_df = gtex_filt.drop(columns=["type"])

#### 5.2 Fit UMAP to GTEx

In [None]:
# Two-component UMAP reducer with a fixed random seed.
umapfit = umap.UMAP(
    random_state=42,
    n_components=2,
)

In [None]:
%%time
# Learn the embedding from the GTEx expression values only; HPA samples are
# projected into this already-fitted space later with `transform`.
umapfit.fit(unlabeled_df)

#### 5.3 Transform GTEx using UMAP

In [None]:
# Project the GTEx samples into the fitted UMAP space. The result is indexed
# as a 2-column array (columns 0 and 1) when the plotting table is built.
v = umapfit.transform(unlabeled_df)

#### 5.4 Prepare GTEx UMAP transformed data for plotting

In [None]:
# Plotting table for GTEx: label, both UMAP coordinates and a dataset tag.
df_subset = gtex_filt[["type"]].assign(
    **{
        "umap-2d-one": v[:, 0],
        "umap-2d-two": v[:, 1],
        "dataset": "GTEx",
    }
)

#### 5.5 Prepare HPA data for UMAP 

In [None]:
# Expression values only: strip the "type" label before projecting HPA.
unlabeled_df_ind = hpa_filt.drop(columns=["type"])

#### 5.6 Transform HPA using UMAP

In [None]:
# Project the HPA samples into the UMAP space that was fitted on GTEx; the
# reducer is not refitted here, so both datasets share one coordinate system.
u = umapfit.transform(unlabeled_df_ind)

#### 5.7 Prepare HPA UMAP transformed data for plotting

In [None]:
# Plotting table for HPA: label, both UMAP coordinates and a dataset tag.
df_subset_ind = hpa_filt[["type"]].assign(
    **{
        "umap-2d-one": u[:, 0],
        "umap-2d-two": u[:, 1],
        "dataset": "HPA",
    }
)

In [None]:
# Stack the GTEx and HPA plotting tables (GTEx rows first, HPA rows after).
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` is the supported equivalent and keeps the original index labels.
df_subset_all = pd.concat([df_subset, df_subset_ind])

#### 5.8 Plot

In [None]:
# Point colour per dataset tag.
cdict = dict(HPA="red", GTEx="gray")

In [None]:
# Scatter plot of the joint UMAP embedding, coloured by dataset of origin.
figure(num=None, figsize=(14, 14), dpi=80, facecolor="w", edgecolor="k")
# NOTE: `plt.scatter` returns the plotted point collection, not a Figure; the
# name `fig` is kept only so any later cell that refers to it keeps working.
fig = plt.scatter(
    x=df_subset_all["umap-2d-one"],
    y=df_subset_all["umap-2d-two"],
    # Translate each dataset tag ("GTEx"/"HPA") into its colour.
    c=df_subset_all["dataset"].map(cdict),
    s=2,
)
plt.axis("off")
# `fig_dir` already ends with a separator; os.path.join avoids the doubled
# slash that concatenating it with "/all_genes..." used to produce.
file_path = os.path.join(fig_dir, f"all_genes_umap_hpa_{data_type}.svg")
plt.savefig(file_path)


In [None]:
# Labels and features of the HPA samples used to evaluate the CNN.
y_test = hpa_filt["type"]
# Features: drop the label, apply the project's log transform, then pass the
# result through the project's convert_2d helper (presumably reshaping each
# sample into the CNN's 2-D input layout -- TODO confirm in modelling.cnn).
X_test = convert_2d(log_transform(hpa_filt.drop(columns="type")))

# NOTE(review): the binarizer is fitted on the HPA labels, whereas the number
# of predicted classes is later taken from the GTEx labels. Decoding is only
# meaningful if both label sets (and their sorted order) match the encoding
# used when the model was trained -- verify.
lb = LabelBinarizer().fit(y_test.values)

## 6. Test performance of 2DCNN on HPA

#### 6.1 Load model

In [None]:
# Rebuild the network architecture from its JSON description.
model_json_path = model_dir + f"{data_type}_model_topology.json"
# Read the topology inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(model_json_path, "r") as json_file:
    trained_model = model_from_json(json_file.read())

# Load the trained weights into the rebuilt model.
model_weights_path = model_dir + f"{data_type}_model_weights.hdf5"
trained_model.load_weights(model_weights_path)

#### 6.2 Run predictions

In [None]:
# Predict one class index per HPA sample.
# `Sequential.predict_classes` was removed from Keras/TensorFlow 2.6; taking
# the argmax over the predicted class scores is the documented replacement for
# multi-class models (assumes a softmax-style output layer -- TODO confirm).
y_preds = np.argmax(trained_model.predict(X_test), axis=-1)
num_preds = len(y_preds)

# The number of output classes is taken from the distinct GTEx labels.
classes = gtex_filt["type"].unique()
num_classes = len(classes)

# One-hot encode the predicted indices so that the fitted LabelBinarizer can
# translate them back into label names.
y_preds_onehot = np.zeros([num_preds, num_classes])
y_preds_onehot[np.arange(num_preds), y_preds] = 1

y_preds_labels = lb.inverse_transform(y_preds_onehot)

# Macro-averaged F1 gives every class the same weight regardless of its size.
print(
    f"macro-average F1 : {f1_score(y_test, y_preds_labels, average='macro')}"
)