# Cluster analysis - 1

### Input files:
1. *shap_genes.pkl*
2. *edger_genes.pkl*
3. *filtered_genes.pkl*
4. *gtex_filtered_tmm_intersect_test.pkl*


### Output files:
1. *supp_fig7a.svg*
2. *supp_fig7b.svg*
3. *supp_fig7c.svg*
4. *supp_fig7d.svg*
5. *fig4d.svg*
6. *shap_kmeans.pkl*
 
### Table of contents:
1. [Import Modules](#1.-Import-Modules)  
2. [Set static paths](#2.-Set-static-paths)  
3. [Load files](#3.-Load-files)  
    3.1 [Load genes](#3.1-Load-genes)  
    3.2 [Load test data](#3.2-Load-test-data)  
4. [Process data](#4.-Process-data)  
    4.1 [Transform data](#4.1-Transform-data)  
    4.2 [Filter genes](#4.2-Filter-genes)  
5. [Plot clusters](#5.-Plot-clusters)  
6. [Measure clustering](#6.-Measure-clustering)  
    6.1 [Calculate kmeans](#6.1-Calculate-kmeans)  
    6.2 [Plot V-measure](#6.2-Plot-V-measure)  
7. [Save out results](#7.-Save-out-results)  

## 1. Import Modules

In [None]:
import os

In [None]:
# Switch the working directory to the project's source folder. The relative
# "../data/" and "../figures/" paths set up below are resolved from here, and
# the project-local imports in the next cell are presumably found this way.
# NOTE: the path is relative, so re-running this cell moves the working
# directory again, relative to wherever the previous run left it.
util_path = "../src"
os.chdir(util_path)

In [None]:
import pandas as pd
import pickle
from tqdm import tqdm
from cluster import get_random_gene_df, get_kmeans_dict, get_p_value
from vis import plot_umap
from modelling.cnn import log_transform
import statistics 
import math
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Set the default figure size (width, height) through seaborn's rc settings.
sns.set(rc={'figure.figsize':(11.7,8.27)})

%load_ext autoreload
%autoreload 2

## 2. Set static paths

In [None]:
# Dataset variant tag and the root folder of the project's data files
# (relative to the working directory).
data_type: str = "imbalanced"
data_dir: str = "../data/"

In [None]:
# Folders used by this notebook. Inputs and outputs share the same
# "processed/" folder under the data root.
input_dir = f"{data_dir}processed/"
gene_dir = f"{data_dir}gene_lists/"
fig_dir = "../figures/"
output_dir = f"{data_dir}processed/"

## 3. Load files

#### 3.1 Load genes

In [None]:
def _load_gene_list(file_name):
    """Unpickle and return the object stored in ``gene_dir + file_name``."""
    # NOTE: pickle executes arbitrary code on load; only open trusted files.
    with open(gene_dir + file_name, "rb") as handle:
        return pickle.load(handle)


## SHAP genes
shap_genes = _load_gene_list("shap_genes.pkl")

## EdgeR genes
edger_genes = _load_gene_list("edger_genes.pkl")

## All genes
filtered_genes = _load_gene_list("filtered_genes.pkl")

#### 3.2 Load test data

In [None]:
## Test data
# Unpickle the held-out test set from the processed-data folder.
test_path = input_dir + "gtex_filtered_tmm_intersect_test.pkl"
with open(test_path, "rb") as handle:
    test_data = pickle.load(handle)

## 4. Process data

#### 4.1 Transform data

In [None]:
# Pass the test set through the project's log_transform helper and rebind
# `test_data` to the result (label=True presumably tells the helper that the
# frame carries the "type" label column — confirm against modelling.cnn).
# NOTE: the input is overwritten, so re-running this cell feeds the already
# transformed data through log_transform a second time.
test_data = log_transform(test_data, label=True)

#### 4.2 Filter genes

In [None]:
# Build one frame per gene set: the selected gene columns from the test data
# plus the "type" label column, which the plotting and clustering steps use.
label_column = test_data.loc[:, "type"]

# SHAP-selected genes
shap_df = test_data.loc[:, shap_genes].assign(type=label_column)

# edgeR-selected genes
edger_df = test_data.loc[:, edger_genes].assign(type=label_column)

# Full gene set
full_df = test_data.loc[:, filtered_genes].assign(type=label_column)

In [None]:
# Baseline frame built by the project's get_random_gene_df helper.
# 2423 presumably matches the size of the SHAP gene list — confirm.
n_random_genes = 2423
random_df = get_random_gene_df(test_data, n_random_genes)

## 5. Plot clusters

In [None]:
# Embed every gene set with the project's plot_umap helper, using a fixed
# seed, and keep each return value for the clustering step below.
# The second argument names the figure (presumably written under fig_dir).

# SHAP UMAP
shap_umap_df = plot_umap(
    shap_df,
    "supp_fig7a",
    fig_dir,
    label_col="type",
    seed=42,
)

# EDGER UMAP
edger_umap_df = plot_umap(
    edger_df,
    "supp_fig7b",
    fig_dir,
    label_col="type",
    seed=42,
)

# FULL UMAP
full_umap_df = plot_umap(
    full_df,
    "supp_fig7c",
    fig_dir,
    label_col="type",
    seed=42,
)

# RANDOM UMAPs: ten replicates of the random-gene baseline.
# The first replicate reuses `random_df` and is the one plotted as supp_fig7d.
rand_shap_umap_df = plot_umap(
    random_df,
    "supp_fig7d",
    fig_dir,
    label_col="type",
    seed=42,
)
random_list = [rand_shap_umap_df]

# The remaining nine replicates each draw a NEW random gene set. Previously
# the same `random_df` was re-embedded with the same seed on every pass, so
# all ten "random" replicates were copies of a single embedding and the
# baseline showed no gene-sampling variability.
# NOTE(review): this assumes get_random_gene_df samples differently on each
# call (i.e. it does not fix its own seed) — confirm in cluster.py.
for i in range(9):
    resampled_df = get_random_gene_df(test_data, 2423)
    rand_shap_umap_df = plot_umap(
        resampled_df,
        "supp_fig7d",
        fig_dir,
        label_col="type",
        seed=42,
        save_plot=False,  # only the first replicate is written out as a figure
    )
    random_list.append(rand_shap_umap_df)

## 6. Measure clustering

#### 6.1 Calculate kmeans

In [None]:
# Accumulators for the repeated k-means evaluations: one list per gene set,
# plus one list for each of the ten random replicates (keys 0-9).
kmeans_dict = {}
shap, edger, fullset = [], [], []
random_shap_dict = {replicate: [] for replicate in range(10)}

In [None]:
# Run the k-means evaluation ten times on every embedding and collect each
# result. Call order per pass is unchanged: SHAP, EdgeR, full set, then the
# random replicates 0-9.
named_embeddings = (
    (shap, shap_umap_df),
    (edger, edger_umap_df),
    (fullset, full_umap_df),
)
for run in tqdm(range(10)):
    for scores, umap_df in named_embeddings:
        scores.append(get_kmeans_dict(umap_df, "type"))
    for replicate, replicate_scores in random_shap_dict.items():
        replicate_scores.append(get_kmeans_dict(random_list[replicate], "type"))

In [None]:
# Gather every list of k-means results under a descriptive key.
kmeans_dict.update({"SHAP": shap, "EdgeR": edger, "All": fullset})
kmeans_dict.update(
    {f"Random SHAP {i}": random_shap_dict[i] for i in range(10)}
)

In [None]:
# Turn each list of per-run results into a DataFrame.
shap_results, edger_results, fullset_results = (
    pd.DataFrame.from_dict(kmeans_dict[key]) for key in ("SHAP", "EdgeR", "All")
)
random_shap_results = [
    pd.DataFrame.from_dict(kmeans_dict[f"Random SHAP {i}"]) for i in range(10)
]

In [None]:
# Result frames in plotting order: SHAP, EdgeR, full gene set. The plotting
# cell pairs these positionally with its labels, so the order matters.
result_list = [
    shap_results,
    edger_results,
    fullset_results,
]

In [None]:
# Append the ten random-replicate frames after the three named gene sets.
result_list.extend(random_shap_results[i] for i in range(10))

In [None]:
# Line colour for each legend label in the V-measure density plot.
colour_map_rand = {
    "SHAP": "r",
    "Random (n=2423)": "lightgray",
    "All Genes": "b",
    "EdgeR": "g",
}

#### 6.2 Plot V-measure

In [None]:
# Density plot of the V-Measure across repeated k-means runs, one curve per
# result frame, saved as fig4d.svg.
#
# Labels must follow the order in which `result_list` was assembled:
# SHAP, EdgeR, all genes, then the random replicates. The previous label list
# had "All Genes" and "EdgeR" swapped, so those two curves were drawn with
# each other's label and colour.
df_labels = ["SHAP", "EdgeR", "All Genes"]
df_labels += ["Random (n=2423)"] * max(len(result_list) - len(df_labels), 0)
metric = "V-Measure"
for results, label in zip(result_list, df_labels):
    sns.kdeplot(
        results[metric],
        label=label,
        color=colour_map_rand[label],
    ).set_title(metric)

# Every random replicate shares one label; keep a single legend entry for it.
handles, labels = plt.gca().get_legend_handles_labels()
unique_entries = dict(zip(labels, handles))
plt.legend(unique_entries.values(), unique_entries.keys())
sns.despine()
file_path = fig_dir + "fig4d.svg"
plt.savefig(file_path)

In [None]:
# Mean V-Measure of each of the ten random replicates.
rand_mean = [random_shap_results[i]["V-Measure"].mean() for i in range(10)]

In [None]:
# Report the mean V-Measure per gene set (3 d.p.) and the range of the
# per-replicate means for the random baseline.
summary_rows = (
    ("SHAP V-Measure:", shap_results),
    ("EdgeR V-Measure:", edger_results),
    ("All genes V-Measure:", fullset_results),
)
for heading, frame in summary_rows:
    print(heading, round(frame["V-Measure"].mean(), 3))

lowest_mean = round(min(rand_mean), 3)
highest_mean = round(max(rand_mean), 3)
print("Random SHAP V-Measures:", lowest_mean, "-", highest_mean)

## 7. Save out results

In [None]:
# Persist the SHAP k-means results; the other gene sets are not saved here.
kmeans_out_path = output_dir + "shap_kmeans.pkl"
shap_results.to_pickle(kmeans_out_path)