# UVEAL MELANOMA TUMOR CELL ANALYSIS

###### Ashley Laughney ... 03-29-2018
###### Ethan Earlie ........... 07-20-2020

## INSTALL DEPENDENCIES

In [None]:
# Resolution (dots per inch) handed to plt.savefig when figures are written to disk.
fig_dpi = 300

In [None]:
%load_ext autoreload
%autoreload 2
run_in_background_wait_default=True

import warnings
warnings.filterwarnings("ignore")

import time
import decimal
import json
import os
from os import path
import random
import shlex
import shutil
import glob
from subprocess import call, check_output, Popen, PIPE
import gc
import numpy as np
from numpy.linalg import norm
import seaborn as sns
import pandas as pd
import scanpy as sc
import networkx as nx
import pickle
import joblib
import re
import csv
import h5py
from copy import deepcopy
import itertools
from itertools import combinations
from collections import defaultdict, OrderedDict
from imp import reload
from mpl_toolkits.mplot3d import Axes3D
from colors import rgb, hex
import scipy.cluster.hierarchy as hc
#import wikipedia
import pylab
from scipy.stats import wasserstein_distance

import matplotlib
print(matplotlib.get_backend())
#%matplotlib inline
#print(matplotlib.get_backend())

import matplotlib.pyplot as plt
print(matplotlib.get_backend())
import matplotlib.gridspec as gridspec
print(matplotlib.get_backend())
import matplotlib.colors as mcolors

from functools import partial
from statsmodels.sandbox.stats.multicomp import multipletests
from mpl_toolkits.axes_grid1 import make_axes_locatable

from sklearn import manifold
from sklearn.manifold.t_sne import _joint_probabilities, _joint_probabilities_nn
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import NearestNeighbors
from sklearn import cluster
from sklearn.svm import SVC, LinearSVC

from scipy import stats
from scipy.cluster.hierarchy import dendrogram
from scipy.sparse import coo_matrix, csr_matrix, find, csgraph
from scipy.sparse.linalg import eigs
from scipy.stats import t, variation
from scipy.stats.mstats import kruskalwallis, rankdata
from scipy.spatial import distance
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import dendrogram, fcluster, leaves_list, set_link_color_palette
from kneed import DataGenerator, KneeLocator

#import seqc.stats.analysis as tools

import fastcluster
import bhtsne
import phenograph
import palantir

#import magic
#import wishbone
import seqc
import seqc.plot
import seqc.stats
import seqc.stats.resampled_nonparametric as RNP

from matplotlib.colors import LinearSegmentedColormap
print(matplotlib.get_backend())

from pylab import *
from scipy.optimize import curve_fit
from geode import *
#import gseapy as gp
from seqc.stats import gsea as GSEA

from scipy.stats import zscore
import matplotlib.ticker as plticker
import cmocean
from sklearn.linear_model import LinearRegression


from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram
import statsmodels.formula.api as sms
from sklearn.cluster import SpectralClustering
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture

from IPython.display import HTML, display
from IPython.display import IFrame
#import requests, json
clustergrammer_url = 'http://amp.pharm.mssm.edu/clustergrammer/matrix_upload/'

from statistics import mode
from pprint import pprint

#from src.plotCnvArr import plotcnv
#from src.inferCNV import infer_cnv

def gauss(x,mu,sigma,A):
    """Evaluate an unnormalised Gaussian bump at ``x``.

    :param x: point(s) at which to evaluate
    :param mu: centre of the bump
    :param sigma: width (standard deviation) of the bump
    :param A: peak height, i.e. the value returned at ``x == mu``
    :return: ``A * exp(-(x - mu)**2 / (2 * sigma**2))``
    """
    exponent = -(x-mu)**2/2/sigma**2
    return A*exp(exponent)

def bimodal(x,mu1,sigma1,A1,mu2,sigma2,A2):
    """Sum of two Gaussian bumps; used as the model function for curve fitting.

    Parameters ``(mu1, sigma1, A1)`` describe the first bump and
    ``(mu2, sigma2, A2)`` the second, each with the meaning they have in
    :func:`gauss`.
    """
    first_mode = gauss(x,mu1,sigma1,A1)
    second_mode = gauss(x,mu2,sigma2,A2)
    return first_mode+second_mode

def rgb2hex(r,g,b):
    """Format three integer colour channels as a lowercase ``#rrggbb`` string."""
    return "#{:02x}{:02x}{:02x}".format(r,g,b)

def hex2rgb(hexcode):
    """Convert a ``#rrggbb`` colour string into an ``(r, g, b)`` tuple of ints.

    The first character (the ``#``) is skipped and the remaining hex digits are
    decoded byte by byte, so each channel is an integer in 0..255.

    The previous implementation relied on ``str.decode('hex')``, which only
    exists on Python 2; ``bytes.fromhex`` is the Python 3 equivalent.

    :param hexcode: colour string such as ``'#ff8000'``
    :return: tuple of channel values, e.g. ``(255, 128, 0)``
    :raises ValueError: if the text after the first character is not valid hex
    """
    return tuple(bytes.fromhex(hexcode[1:]))

# Custom analysis functions, embedded here
#import seqc.stats.analysis as tools

def display_link(url):
    """Show ``url`` in the notebook output as a clickable link opening in a new tab."""
    anchor = '<a href="%s" target="_blank">%s</a>' % (url, url)
    rendered = HTML(anchor)
    return display(rendered)

import matplotlib.patches as patches
print(matplotlib.get_backend())
import matplotlib.colors as colors
print(matplotlib.get_backend())
import math
import scipy.cluster.hierarchy as sch

import re
from kneed import DataGenerator, KneeLocator
from natsort import natsorted, ns

# Splits a string into alternating non-digit / digit runs (the digit runs are kept
# because the pattern is a capturing group).
_nsre = re.compile('([0-9]+)')
def natural_sort_key(s):
    """Return a sort key that orders embedded numbers numerically.

    Digit runs become ``int`` and every other piece is lower-cased, so that
    e.g. ``'item2'`` sorts before ``'item10'`` and case is ignored.

    :param s: string to build the key for
    :return: list mixing ``str`` and ``int`` pieces, in their original order
    """
    key = []
    for piece in _nsre.split(s):
        key.append(int(piece) if piece.isdigit() else piece.lower())
    return key

In [None]:
# INSTALL FORCE DIRECTED LAYOUT DEPENDENCIES
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix, find
from fa2 import ForceAtlas2

def determine_kernel(pca_projections, knn=30, n_jobs=-1):
    """Build a symmetric k-nearest-neighbour affinity matrix from PCA projections.

    Every row of ``pca_projections`` is linked to its ``knn`` nearest rows
    (Euclidean distance); an edge of distance ``d`` receives the weight
    ``exp(-d)`` and the matrix is symmetrised as ``W + W.T``.

    NOTE(review): an earlier revision also ran a second neighbour search to get
    a per-row "adaptive" bandwidth (distance to the ``floor(knn / 3)``-th
    neighbour) but never applied it to the weights. That dead computation has
    been removed; the returned kernel is unchanged. If adaptive scaling was
    intended, it has to be added explicitly.

    :param pca_projections: DataFrame of PCA projections (one row per
        observation; ``.values`` and ``.shape`` are used)
    :param knn: number of nearest neighbours per row
    :param n_jobs: number of parallel jobs forwarded to ``NearestNeighbors``
    :return: sparse ``N x N`` affinity matrix, ``N`` being the number of rows
    """

    # Determine the kNN distance graph
    print('Determing nearest neighbor graph...')
    nbrs = NearestNeighbors(n_neighbors=int(knn), metric='euclidean',
                            n_jobs=n_jobs).fit(pca_projections.values)
    kNN = nbrs.kneighbors_graph(pca_projections.values, mode='distance')

    # Turn the stored distances into exp(-distance) affinities
    N = pca_projections.shape[0]
    x, y, dists = find(kNN)
    W = csr_matrix((np.exp(-dists), (x, y)), shape=[N, N])

    # Symmetrise so that an edge found from either endpoint counts for both
    kernel = W + W.T

    return kernel


def compute_force_atlas(adjacency, index_cells, verbose=True, iterations=1500):
    """Compute a 2-D ForceAtlas2 layout of a graph, starting from random positions.

    :param adjacency: adjacency matrix of the graph (its first dimension gives
        the number of nodes); handed to ``ForceAtlas2.forceatlas2``
    :param index_cells: index to use for the returned DataFrame (one entry per node)
    :param verbose: forwarded to ForceAtlas2
    :param iterations: number of layout iterations
    :return: DataFrame indexed by ``index_cells`` with columns ``x`` and ``y``
    """
    n_nodes = adjacency.shape[0]
    start_xy = np.random.random((n_nodes, 2))

    layout_options = dict(
        # Behavior alternatives
        outboundAttractionDistribution=False,  # Dissuade hubs
        linLogMode=False,  # NOT IMPLEMENTED
        adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
        edgeWeightInfluence=1.0,
        # Performance
        jitterTolerance=1.0,  # Tolerance
        barnesHutOptimize=True,
        barnesHutTheta=1.2,
        multiThreaded=False,  # NOT IMPLEMENTED
        # Tuning
        scalingRatio=2.0,
        strongGravityMode=False,
        gravity=1.0,
        # Log
        verbose=verbose,
    )
    engine = ForceAtlas2(**layout_options)

    coords = np.array(
        engine.forceatlas2(adjacency, pos=start_xy, iterations=iterations))

    # Label the coordinates with the caller-supplied index
    return pd.DataFrame(coords, index=index_cells, columns=['x', 'y'])

In [None]:
# FUNCTIONS FOR PARSING GMT FILES
import sys, logging
def gsea_gmt_parser(gmt, min_size = 3, max_size = 1000, gene_list=None):
    """Load a gene-set collection and keep only the sets of acceptable size.

    When ``gmt`` ends with ``.gmt`` (case-insensitive) it is read as a local,
    tab-separated GMT file: column 1 is the set name, column 2 is skipped and
    the remaining columns are the genes. Otherwise ``gmt`` is treated as an
    Enrichr library name and downloaded from the Enrichr server (library
    names: http://amp.pharm.mssm.edu/Enrichr/#stats).

    :param gmt: path to a ``.gmt`` file, or an Enrichr library name.
    :param min_size: smallest number of genes a set may contain. Default: 3.
    :param max_size: largest number of genes a set may contain. Default: 1000.
    :param gene_list: optional list of genes. When given, a set is additionally
        dropped unless the number of ``gene_list`` entries found in it lies
        strictly between ``min_size`` and ``max_size``.
    :return: dict mapping gene-set name to its list of genes.
    :raises Exception: if ``gmt`` is neither a ``.gmt`` path nor a known Enrichr
        library, or if the download fails.

    The process exits with status 1 (``sys.exit``) when no gene set survives
    the filtering.
    """

    genesets_dict = {}
    if gmt.lower().endswith(".gmt"):
        with open(gmt) as handle:
            for line in handle:
                fields = line.strip("\n").split("\t")
                genesets_dict[fields[0]] = fields[2:]
    else:
        logging.info("Downloading and generating Enrichr library gene sets...")
        # NOTE(review): get_library_name is not defined in the visible part of
        # this file - presumably provided by gseapy; confirm it is in scope.
        if gmt not in get_library_name():
            raise Exception("gene_set files(.gmt) not found")
        import requests
        ENRICHR_URL = 'http://amp.pharm.mssm.edu/Enrichr/geneSetLibrary'
        query_string = '?mode=text&libraryName=%s'
        response = requests.get(ENRICHR_URL + query_string % gmt)
        if not response.ok:
            raise Exception('Error fetching enrichment results, check internet connection first.')

        for line in response.iter_lines(chunk_size=1024, decode_unicode='utf-8'):
            columns = line.split("\t")
            # the last column is skipped; each remaining entry is "gene,<extra>"
            genesets_dict[columns[0]] = [entry.strip("\n").split(",")[0]
                                         for entry in columns[2:-1]]

    # size filter on the raw gene sets
    py_major = sys.version_info[0]
    if py_major not in (2, 3):
        sys.stderr.write("System failure. Please Provide correct input files")
        sys.exit(1)
    pairs = genesets_dict.items() if py_major == 3 else genesets_dict.iteritems()
    genesets_filter = {name: genes for name, genes in pairs
                       if min_size <= len(genes) <= max_size}

    # optional second filter on the overlap with the supplied gene list
    if gene_list is not None:
        for name in sorted(genesets_filter.keys()):
            hits = sum(in1d(gene_list, genesets_filter.get(name), assume_unique=True))
            if not (hits > min_size and hits < max_size):
                del genesets_filter[name]

    filsets_num = len(genesets_dict) - len(genesets_filter)
    logging.info("{a} gene_sets have been filtered out when max_size={b} and min_size={c}".format(
        a=filsets_num, b=max_size, c=min_size))

    if filsets_num == len(genesets_dict):
        sys.stderr.write("No gene sets passed throught filtering condition!!!, try new paramters again!\n" +
                         "Note: Gene names for gseapy is case sensitive." )
        sys.exit(1)
    return genesets_filter

In [None]:
# FUNCTIONS FOR PARSING DENDROGRAMS
# Next we need to grab the labels of the leaves given the indexes.
# But before we do that, since it's difficult to keep track of what color e.g. '#c13d3f' is,
# we make an IPython-notebook-compatible HTML representation of the dictionary holding the information.
# Objects of this class behave just like dictionaries, except that they are displayed as an HTML table.
class Clusters(dict):
    """Dictionary keyed by colour whose notebook representation is an HTML table.

    Behaves exactly like ``dict``. The only addition is ``_repr_html_``, which
    IPython uses to display the object: one table row per key, with the key
    shown on a background of its own colour next to the ``repr`` of its value.
    """

    def _repr_html_(self):
        """Return the HTML table described in the class docstring.

        Keys must be colour specifications understood by matplotlib (for
        example ``'#c13d3f'`` or ``'g'``). They are converted with
        ``matplotlib.colors.to_hex``; the previous code called
        ``rgb2hex(colorConverter.to_rgb(c))``, which fails in this file because
        ``rgb2hex`` is redefined here to take three integer arguments.
        """
        rows = []
        for c in self:
            hx = mcolors.to_hex(c)
            rows.append(
                '<tr style="border: 0;">'
                '<td style="background-color: {0}; '
                'border: 0;">'
                '<code style="background-color: {0};">'.format(hx)
                + c + '</code></td>'
                + '<td style="border: 0"><code>'
                + repr(self[c]) + '</code>'
                + '</td></tr>'
            )

        return '<table style="border: 0;">' + ''.join(rows) + '</table>'
    
def get_cluster_classes(den, label='ivl'):
    """Group the leaves of a dendrogram by the colour of the link they hang from.

    :param den: dendrogram dictionary providing ``'color_list'``, ``'icoord'``
        and the entry named by ``label``
    :param label: key of ``den`` holding the per-leaf values to report
        (``'ivl'`` by default)
    :return: :class:`Clusters` mapping each link colour to the list of leaf
        values attached to links of that colour
    """
    leaf_slots = defaultdict(list)
    for colour, xs in zip(den['color_list'], den['icoord']):
        # only the two inner x-coordinates of each link are inspected
        for x in xs[1:3]:
            slot = (x - 5.0) / 10.0
            # a link end sits on a leaf when x = 10 * k + 5 for an integer k
            if abs(slot - int(slot)) < 1e-5:
                leaf_slots[colour].append(int(slot))

    grouped = Clusters()
    for colour, slots in leaf_slots.items():
        grouped[colour] = [den[label][k] for k in slots]

    return grouped

In [None]:
# FUNCTION FOR RUNNING R SCRIPTS
# run bash command in background if stdoutfile does not exist. If it does, output contents of stdoutfile.
# If force=true, overwrite and re-run command.
def run_in_background(command, stdoutfile, stderrfile="", force=False, wait=run_in_background_wait_default):
    """Run a bash command once and cache its output in files.

    If ``stdoutfile`` does not exist yet (or ``force`` is true) the command is
    executed through ``bash -c`` with stdout redirected to ``stdoutfile`` and
    stderr redirected to ``stderrfile`` (or merged into stdout when no
    ``stderrfile`` is given). Otherwise nothing is executed and the contents of
    the existing output file(s) are printed instead.

    The cached files are now read with plain Python rather than ``!cat``, so
    the function no longer depends on IPython shell escapes and copes with
    paths containing spaces.

    :param command: bash command line. It is wrapped in single quotes, so it
        must not itself contain unescaped single quotes.
    :param stdoutfile: file receiving (or already holding) the command's stdout
    :param stderrfile: optional file receiving the command's stderr
    :param force: re-run the command even if ``stdoutfile`` exists
    :param wait: when false, ``&`` is appended so the shell does not wait for
        the command to finish
    """
    if force or not path.exists(stdoutfile):
        command = 'bash -c \'' + command + '\' > ' + stdoutfile
        if stderrfile:
            command = command + ' 2>' + stderrfile
        else:
            command = command + ' 2>&1'
        if not wait:
            command = command + ' &'
        print('calling ' + command + '\n')
        os.system(command)
    else:
        print("Output from stdout file " + stdoutfile)
        # errors='replace' keeps the dump working even if the tool wrote bytes
        # that are not valid in the default text encoding
        with open(stdoutfile, errors='replace') as cached:
            print(cached.read(), end='')
        if stderrfile and path.exists(stderrfile):
            print("Output from stderr file " + stderrfile)
            with open(stderrfile, errors='replace') as cached:
                print(cached.read(), end='')

In [None]:
# MAKE CUSTOM COLOR MAP
def make_cmap(colors, position=None, bit=False):
    '''
    Build a matplotlib colormap that interpolates linearly between the given colors.

    colors   -- list of RGB tuples, ordered from the lowest value of the
                colorbar to the highest. Values are either arithmetic
                [0 to 1] (default) or 8-bit integers [0 to 255], in which
                case bit must be True. The list is never modified.
    position -- optional sequence (list or array) with one value per color,
                starting at 0 and ending at 1, giving the location of each
                color. Defaults to equally spaced colors.
    bit      -- set to True when colors holds 8-bit integer values.

    Returns a LinearSegmentedColormap named 'my_colormap' with 256 levels.
    Exits via sys.exit (SystemExit) when position has the wrong length or
    does not start with 0 and end with 1.
    '''
    import sys
    import numpy as np

    # "is None" rather than "== None": the latter compares element-wise when
    # position is a numpy array and then fails in the if-statement.
    if position is None:
        position = np.linspace(0,1,len(colors))
    else:
        if len(position) != len(colors):
            sys.exit("position length must be the same as colors")
        elif position[0] != 0 or position[-1] != 1:
            sys.exit("position must start with 0 and end with 1")
    if bit:
        # Lookup table mapping an 8-bit channel value v to v / 255. A new list
        # is built so the caller's list of colors is left untouched.
        bit_rgb = np.linspace(0,1,256)
        colors = [(bit_rgb[color[0]], bit_rgb[color[1]], bit_rgb[color[2]])
                  for color in colors]
    cdict = {'red':[], 'green':[], 'blue':[]}
    for pos, color in zip(position, colors):
        cdict['red'].append((pos, color[0], color[0]))
        cdict['green'].append((pos, color[1], color[1]))
        cdict['blue'].append((pos, color[2], color[2]))

    # Imported here so the explicit submodule import does not depend on
    # matplotlib.colors having been loaded elsewhere.
    from matplotlib.colors import LinearSegmentedColormap
    cmap = LinearSegmentedColormap('my_colormap',cdict,256)
    return cmap

In [None]:
# H5 Wrapper
class H5:

    def __init__(self, archive_name: str):
        """Wrapper for the pandas HDFStore class which ensures that all interactions with
        the archive result in a closed, flushed archive.

        In order to ensure data usability, all data must be submitted in DataFrame format.
        This decision was made to encourage users to pair metadata with sequencing data,
        and reduce the incidence of unexpected data permutation.

        :param archive_name: name of the h5 archive to open. If the archive does not exist
          it will be created with blosc compression at level 5

        :method ls: list contents of the archive
        :method save: save an object to the h5 archive
        :method load: load an object from the archive
        :method remove: remove a DataFrame from the archive
        :method is_open: returns True if the h5 archive is open, else False
        """
        # In both branches the store is opened in append mode and closed again
        # right away; every method below re-opens it for the duration of the call.
        if os.path.isfile(archive_name):
            self._archive = pd.HDFStore(archive_name, mode='a')
            self._archive.close()
        else:
            # New archive: compression settings are only passed on creation.
            self._archive = pd.HDFStore(
                archive_name, mode='a', complib='blosc', complevel=5)
            self._archive.close()

    def __repr__(self):
        """Return the repr of the underlying HDFStore (opened only for the call)."""
        self._archive.open()
        try:
            return repr(self._archive)
        finally:
            self._archive.close()

    def save(self, data: pd.DataFrame, location: str) -> None:
        """Save DataFrame data to the h5 archive in location.

        A numpy array is accepted too, but only after the user confirms at an
        interactive prompt that it may be wrapped in a DataFrame; if the user
        declines, nothing is written.

        :param data: DataFrame object to store
        :param location: filepath to save the object in the h5 hierarchy
        :raises TypeError: if data is neither a DataFrame nor a numpy array
        """
        if not isinstance(data, pd.DataFrame):
            if isinstance(data, np.ndarray):
                res = input('np.ndarray class detected. Save as pd.DataFrame with '
                            'ascending integer indices? [y/n] ')
                if res in ['y', 'yes', 'Y', 'YES', 'True', 'true', '1']:
                    data = pd.DataFrame(data)
                else:
                    print('User elected not to save DataFrame, archive is unmodified.')
                    return
            else:
                raise TypeError('only pd.DataFrame objects can be saved using this '
                                'class. To save np.ndarray objects please see the tables '
                                'package.')
        self._archive.open()
        try:
            self._archive[location] = data
        finally:
            # always close, even if the write fails
            self._archive.close()

    def load(self, location: str) -> pd.DataFrame:
        """Load and return the dataframe found at location in the archive.

        :param location: str, location of object to retrieve from h5
        :return: pd.DataFrame, object found at location
        """
        self._archive.open()
        try:
            return self._archive[location]
        finally:
            self._archive.close()

    def ls(self) -> None:
        """Print the keys stored in the archive, followed by the store itself."""
        try:
            self._archive.open()
            print(self._archive.keys())
            print(self._archive)
        finally:
            self._archive.close()

    def remove(self, location: str) -> None:
        """remove the DataFrame at location from the archive

        Note: removing a dataframe at a branch node will remove all leaves sharing this
        prefix. e.g. in an archive containing:

        /data
        /data/filtered
        /data/metadata
        /new_data/data

        removing /data would remove the first three DataFrame objects from the archive.

        When such leaves exist the user is asked for confirmation at an
        interactive prompt; declining leaves the archive untouched.

        :param location: location of DataFrame to remove
        :return: None
        :raises ValueError: if location is not a key of the archive
        """

        self._archive.open()
        try:
            if location not in self._archive.keys():
                raise ValueError(
                    '{} not contained in archive, nothing to remove.'.format(location))
            else:
                # keys nested below location would be deleted along with it
                removed = [k for k in self._archive.keys()
                           if k.startswith(location + '/')]
                if len(removed) != 0:
                    res = input(
                        'Removing branch node {}, which is a prefix for {!a} will remove '
                        'all listed DataFrames. Continue with removal? [y/n] '.format(
                            location, removed))
                    if res not in ['y', 'yes', 'Y', 'YES', 'True', 'true', '1']:
                        print('returned without deletion.')
                        return
                self._archive.remove(location)
        finally:
            self._archive.close()

    @property
    def is_open(self) -> bool:
        """True while the underlying HDFStore is open."""
        return self._archive.is_open

In [None]:
# Switch to the inline backend and draw a trivial scatter plot
# (presumably a smoke test that figures render in the notebook).
%matplotlib inline
x = np.arange(5)
plt.scatter(x,x)

In [None]:
# DISPLAY ALL CURRENT DEPENDENCIES
!pip freeze

In [None]:
#s=pd.HDFStore("/ifs/e63data/massaguelab/ashley/data/sc_RNAseq/h5/counts/Project_MB_07384/UVMEL_MERGED_noAdditionalLibSizeFilter.h5")
#s.keys()

In [None]:
#s.close()

In [None]:
#tmp = '/ifs/e63data/massaguelab/ashley/data/sc_RNAseq/h5/counts/Project_MB_07384/UVMEL_MERGED_noAdditionalLibSizeFilter.h5'
#!ls -l '/ifs/e63data/massaguelab/ashley/data/sc_RNAseq/h5/counts/Project_MB_07384/UVMEL_MERGED_noAdditionalLibSizeFilter.h5'

In [None]:
#h5_data = seqc.H5('/ifs/e63data/massaguelab/ashley/data/sc_RNAseq/h5/counts/Project_MB_07384/UVMEL_MERGED_noAdditionalLibSizeFilter.h5')
#h5_data.ls()

In [None]:
#help(seqc.H5)

In [None]:
#matplotlib.get_backend()

## LOAD SAMPLE DATA

### SAMPLE INFORMATION

In [None]:
# Folder holding the input data. The '/data/' segment of this path is swapped
# for '/GSEA/', '/figures/', '/CNV/' and '/CSV/' to derive the output folders.
DATA_PATH = os.path.expanduser('/workdir/uvmel_project/data/')
print(DATA_PATH)
# File name of the HDF5 archive inside DATA_PATH; its stem also prefixes output names.
FN  = 'UVMEL_MERGED.h5'#_noAdditionalLibSizeFilter
ORGANISM = 'human'

### SET PATH TO SAVE DIRECTORIES

In [None]:
# TAG ANALYSIS OUTPUT FOLDERS WITH CURRENT DATE
# ("%x" is the locale's date representation; any '/' in it is replaced so the
# date can be used inside a folder name)
tag = '_ARCHETYPES'
now = time.strftime("%x").replace('/', '_')

# SPECIFY OUTPUT STEMS FOR FIGURES/PATHWAY ANALYSIS
# Every stem is <project>/<KIND>/<archive name><tag>_<date>/
run_folder = FN.replace('.h5','{}_'.format(tag)) + now + '/'
GSEA_output_stem = DATA_PATH.replace('/data/','/GSEA/') + run_folder
print(GSEA_output_stem)
FIG_output_stem = DATA_PATH.replace('/data/','/figures/') + run_folder
print(FIG_output_stem)
CNV_output_stem = DATA_PATH.replace('/data/','/CNV/') + run_folder
print(CNV_output_stem)
CSV_output_stem = DATA_PATH.replace("/data/",'/CSV/') + run_folder
print(CSV_output_stem)

# CREATE THE GSEA, FIGURE, CNV AND CSV DIRECTORIES IF THEY DO NOT EXIST
# exist_ok=True replaces the separate "exists?" check, so a directory created
# in between (e.g. by a parallel run) is not an error.
for output_stem in (GSEA_output_stem, FIG_output_stem,
                    CNV_output_stem, CSV_output_stem):
    d = os.path.dirname(output_stem)
    os.makedirs(d, exist_ok=True)

### LOAD PROCESSED DATAFRAME

In [None]:
# Open the archive through the H5 wrapper and print what it contains.
h5_data = H5(DATA_PATH+FN)
h5_data.ls()

### SPECIFY COLORMAP

In [None]:
# --- DEFINE ALL COLORMAPS UPFRONT FOR UNIFORMITY ---
# Each FLATUI_* list holds six-digit hex colour codes without a leading '#'.
# The position of a colour in its list is what selects it later on (the lists
# are indexed by position and turned into discrete colormaps below); the
# trailing comment on each entry names the category it is meant for.

# CATEGORICAL: SAMPLE
FLATUI_SAMPLE = [ 
                 'FF0000', #'UM01A'
                 'CC0066', #'UM01B'
                 'CC00CC', #'UM02'
                 '0000CD', #'UM03'
                 'FF8000', #'UM04'
                 '660066', #'UM05'
                 '7F00FF', #'UM06'
]

# CATEGORICAL: PATIENT
FLATUI_PATIENT = [ 
                 'DC143C', #'UM01'
                 'C71585', #'UM02'
                 '0000CD', #'UM03'
                 'FF1493', #'UM04'
                 'FF4500', #'UM05'
                 '330066', #'UM06'
]

# CATEGORICAL: CASTLE (two levels)
FLATUI_CASTLE = [
                 '0000FF', # 1
                 'FF0000', # 2
]

# CATEGORICAL: STAGE (two levels)
FLATUI_STAGE = [
                 'C0C0C0', # 1
                 '000000', # 2
]

# CATEGORICAL: TYPE
FLATUI_TYPE = [
                 '000099', # Epithelioid
                 '990099', # Mixed
                 '990000', # Spindle
]

# CATEGORICAL: VASCULO
FLATUI_VASCULO = [
                 'DC143C', # Absent
                 'B0C4DE', # Present
]

# CATEGORICAL: CLASS (27 colours, numbered 0-26)
FLATUI_CLASS = [
                 '0000FF', # 0
                 'FF0000', # 1   
                 'FF8C00', # 2
                 'FFD700', # 3  
                 '808000', # 4
                 '9ACD32', # 5   
                 '006400', # 6
                 '20B2AA', # 7  
                 '2F4F4F', # 8
                 '00BFFF', # 9   
                 '1E90FF', # 10
                 '191970', # 11 
                 '4B0082', # 12
                 '8B008B', # 13  
                 'FF00FF', # 14
                 'FF1493', # 15 
                 '8B4513', # 16
                 'D2691E', # 17 
                 '708090', # 18
                 'B0C4DE', # 19  
                 '696969', # 20
                 'FFB6C1', # 21
                 'FAEBD7', # 22
                 'F5DEB3', # 23
                 'FFFACD', # 24
                 'D2B48C', # 25
                 'FFA07A', # 26                
]

# CATEGORICAL: META CELL TYPE
FLATUI_META_CELL_TYPE = [
                 'B0C4DE', # IMMUNE
                 'FFA07A', # PHOTORECEPTOR  
                 '000000', # TUMOR 
]

# CONVERT EACH HEX PALETTE TO A DISCRETE COLORMAP
def _flatui_to_rgb01(hex_codes):
    """Return an (n, 3) float array of RGB values in [0, 1] for the given hex codes."""
    rgb255 = np.zeros((len(hex_codes),3))
    for row, hexcolor in enumerate(hex_codes):
        rgb255[row,:] = tuple(hex(hexcolor).rgb)
    return np.divide(rgb255,255)


def _flatui_to_cmap(name, hex_codes):
    """Build a colormap called ``name`` with exactly one level per hex code."""
    rgb01 = _flatui_to_rgb01(hex_codes)
    return LinearSegmentedColormap.from_list(name, rgb01, N=len(rgb01))


CM_META_CELL_TYPE = _flatui_to_cmap('FLATUI_META_CELL_TYPE', FLATUI_META_CELL_TYPE)
CM_SAMPLES = _flatui_to_cmap('FLATUI_SAMPLE', FLATUI_SAMPLE)
CM_CASTLE = _flatui_to_cmap('FLATUI_CASTLE', FLATUI_CASTLE)
CM_PATIENT = _flatui_to_cmap('FLATUI_PATIENT', FLATUI_PATIENT)
CM_STAGE = _flatui_to_cmap('FLATUI_STAGE', FLATUI_STAGE)
CM_TYPE = _flatui_to_cmap('FLATUI_TYPE', FLATUI_TYPE)
CM_VASCULO = _flatui_to_cmap('FLATUI_VASCULO', FLATUI_VASCULO)
CM_CLASS = _flatui_to_cmap('FLATUI_CLASS', FLATUI_CLASS)

# NOTE(review): the copy-pasted version of this cell left the global name
# `colors` bound to the normalised FLATUI_CLASS array (shadowing the earlier
# `import matplotlib.colors as colors`). The binding is kept in case a later
# cell reads it; drop it once that has been checked.
colors = _flatui_to_rgb01(FLATUI_CLASS)

# CONTINUOUS: DIVERGING
CM_DIVERGING = plt.cm.RdBu_r

# CONTINUOUS: SEQUENTIAL
CM_SEQUENTIAL = cmocean.cm.thermal 

## CANCER CELLS ONLY

###### SPECIFY SUBSET TYPE

In [None]:
subset_type = 'TUMOR'

# Load every per-subset table from the archive into a global named
# <PREFIX>_<subset_type> (e.g. DF_TUMOR), read from the key '/<PREFIX>_<subset_type>'.
# Assigning through globals() creates the same names the former exec() calls
# did, without building and executing source code from strings.
for _table_prefix in ('DF', 'NDF', 'INDF', 'METADATA', 'DIMENSIONS',
                      'GENE_RANK_MAST', 'ARCHETYPE_DISTANCES'):
    _table_name = '{}_{}'.format(_table_prefix, subset_type)
    globals()[_table_name] = h5_data.load('/' + _table_name)

### REPORT LIBRARY SIZE DISTRIBUTION OF SUBSET

In [None]:
# PRINT LIBRARY SIZE AND CELL COUNT STATISTICS
# Look the subset's count table up by name instead of exec()-ing built source.
QUERY = globals()['DF_{}'.format(subset_type)]

# Library size of a cell = sum over all columns of its row.
libsize_per_cell = QUERY.sum(axis=1)
median_libsize = libsize_per_cell.median()
num_cells = len(libsize_per_cell)

print('All Samples, Count: {}, Median LibSize: {}'.format(num_cells, median_libsize))

# Same statistics for each sample label.
# NOTE(review): rows are matched on the second element of the index tuple,
# which is presumably the 'Legend' level used to list the labels - confirm.
for index in np.unique(QUERY.index.get_level_values('Legend')):
    tmp = QUERY.loc[QUERY.index.map(lambda x: x[1] == index)]
    libsize_per_cell = tmp.sum(axis=1)
    median_libsize = libsize_per_cell.median()
    num_cells = len(libsize_per_cell)
    print('Sample ID: {}, Count: {}, Median LibSize: {}'.format(index, num_cells, median_libsize))

In [None]:
# QC figure for the current subset: a 2 x 2 grid of per-sample histograms of
# (1) log10 library size, (2) log10 number of detected genes per cell,
# (3) log10 number of expressing cells per gene and (4) log10 mitochondrial fraction.
# QUERY is bound to DF_<subset_type> and META to METADATA_<subset_type>.
exec('QUERY = DF_{}'.format(subset_type))
exec('META = METADATA_{}'.format(subset_type))

plt.figure(figsize = (6,6))
gs1 = gridspec.GridSpec(2,2)
gs1.update(wspace=0.5, hspace=0.7) # set the spacing between axes. 

# (1) PLOT LOG LIBRARY SIZE
# Bins are shared by all samples; the upper edge is 95% of the largest log value.
ax = plt.subplot(gs1[0])
bins = np.linspace(np.log10(QUERY.values.sum(axis=1).min()), np.log10(QUERY.values.sum(axis=1).max())*0.95, 100)
for ind, label in enumerate(np.unique(QUERY.index.get_level_values('Legend'))):
    # rows are selected on the second element of the index tuple
    x = np.log10(QUERY.loc[QUERY.index.map(lambda x: x[1] == label)].values.sum(axis=1)) # log cell size
    # the i-th label (in sorted order) is drawn in the i-th FLATUI_SAMPLE colour
    plt.hist(x, bins, alpha=0.5, label=label,color = np.divide(tuple(hex(FLATUI_SAMPLE[ind]).rgb),255)) 
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('\nLog. Cell Size')
sns.despine()

# (2) PLOT LOG NUM UNIQUE GENES
# Number of columns with a count > 0, per cell.
ax = plt.subplot(gs1[1])
bins = np.linspace(np.log10((np.sum(QUERY > 0,axis=1)).min()), 
                   np.log10((np.sum(QUERY > 0,axis=1)).max())*0.95, 100)
for ind, label in enumerate(np.unique(QUERY.index.get_level_values('Legend'))):
    x = np.log10(np.sum(QUERY.loc[QUERY.index.map(lambda x: x[1] == label)] > 0,axis=1))
    plt.hist(x, bins, alpha=0.5, label=label,color = np.divide(tuple(hex(FLATUI_SAMPLE[ind]).rgb),255)) 
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('\nLog. Number unique genes')
sns.despine()

# (3) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
# Number of rows with a count > 0, per column; computed over all samples for
# the bin range and per sample for the histograms.
ax = plt.subplot(gs1[2])
data = np.log10(np.sum(QUERY.values > 0,axis=0))
bins = np.linspace(0, data.max()*0.95, 20)
for ind, label in enumerate(np.unique(QUERY.index.get_level_values('Legend'))):
    x = np.log10(np.sum(QUERY.loc[QUERY.index.map(lambda x: x[1] == label)] > 0,axis=0))
    # log10(0) gives -inf for genes absent from this sample; those are set to 0
    x[(np.isinf(x)) | (np.isnan(x))] = 0
    plt.hist(x, bins, alpha=0.5, label=label,color = np.divide(tuple(hex(FLATUI_SAMPLE[ind]).rgb),255)) 
    
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('\nLog. Cells contributing to each gene')
sns.despine()

# (4) PLOT MITOCHONDRIAL FRACTION PER CELL 
ax = plt.subplot(gs1[3]) 
# NOTE(review): the first assignment is immediately overwritten by the next line.
vals = META['mt_fraction']
vals = np.log10(META['mt_fraction'].astype(float))
# non-finite values (e.g. log10 of a zero fraction) are set to 0
vals[(np.isinf(vals)) | (np.isnan(vals))] = 0

bins = np.linspace(vals.min(),vals.max()*0.95, 20)
for ind, label in enumerate(np.unique(QUERY.index.get_level_values('Legend'))):
    x = vals.loc[vals.index.map(lambda x: x[1] == label)]
    x[(np.isinf(x)) | (np.isnan(x))] = 0
    plt.hist(x, bins, alpha=0.5, label=label,color = np.divide(tuple(hex(FLATUI_SAMPLE[ind]).rgb),255)) 
    
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('\nLog. Mitochondrial Fraction')
sns.despine()

# Add (abbreviated) legend; bbox_to_anchor places it to the right of the last panel
L = plt.legend(loc='upper right',prop={'size':6},bbox_to_anchor=(1.8, 0.95),fancybox=True) 

# SAVE FIGURE
figure_label = '_library_distribution_subset_{}_log10'.format(subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label + '.png'
plt.savefig(fn, dpi=fig_dpi)
print(fn)

### FILTER LOW ABUNDANCE GENES

In [None]:
# Look the subset's count table up by name instead of exec()-ing built source.
QUERY = globals()['DF_{}'.format(subset_type)]

# FIT A BIMODAL (TWO-GAUSSIAN) MODEL AND FILTER BASED ON MEAN/STD OF THE SECOND MODE
plt.figure(figsize = (10,3))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.7, hspace=0.7) # set the spacing between axes. 

# (4) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 0,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
# log(n) <= 0 means n <= 1 (log(0) = -inf was reset to 0 above), so this
# removes genes detected in at most one cell.
rmv_genes1 = np.where(num_cells_per_gene<=0)[0]

ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')

# `.size`, not `.any()`: these arrays hold column positions, and `.any()` is
# False when the only position to remove is 0.
if rmv_genes1.size:
    plt.hist(num_cells_per_gene[rmv_genes1], bins, alpha=1, label='remove')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)')
sns.despine()

# (5) PLOT LOG COUNTS PER GENE - REMOVE LOW ABUNDANCE GENES
log_counts_per_gene = np.log(np.log(np.sum(QUERY.values,axis=0)))
log_counts_per_gene[(np.isinf(log_counts_per_gene)) | (np.isnan(log_counts_per_gene))] = 0
data = log_counts_per_gene

ax = plt.subplot(gs1[1])
bins = np.linspace(data.min(), data.max()*0.95, 100)
y,x,_=hist(data,bins,alpha=.3,label='data')

# Fit the two-Gaussian model to the histogram, evaluated at the bin centres.
x=(x[1:]+x[:-1])/2 
expected=(0,.2,3500,1.5,.2,500)  # initial guess: (mu1, sigma1, A1, mu2, sigma2, A2)
params,cov=curve_fit(bimodal,x,y,expected)
sigma=sqrt(diag(cov))
plot(x,bimodal(x,*params),color='red',lw=3,label='model')

mu1 = params[0]
std1 = params[1]
mu2 = params[3]
std2 = params[4]
# Genes more than four standard deviations below the second mode are removed.
rmv_genes_neg = np.where(data<mu2-4*std2)[0]
rmv_genes2 = np.unique(rmv_genes_neg)

if rmv_genes2.size:
    plt.hist(data[rmv_genes2], bins, alpha=1, label='remove')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 2: \nLog-Log Counts/Gene')
sns.despine()

# SAVE FIGURE
figure_label = '_filter_genes'
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label + '.png'
plt.savefig(fn, dpi=fig_dpi)
print(fn)

# EVALUATE NUMBER OF GENES REMOVED BY EACH FILTER
print('Count Gene Filter 1: {}'.format(len(rmv_genes1)))
print('Count Gene Filter 2: {}'.format(len(rmv_genes2)))

# REMOVE THE GENES SELECTED BY EITHER FILTER (DRAWN AS 'remove' ABOVE)
CUT_DF = deepcopy(QUERY)
rmv_genes = np.union1d(rmv_genes1, rmv_genes2)  # sorted, de-duplicated positions
if rmv_genes.size:
    CUT_DF = CUT_DF.drop(CUT_DF.columns[rmv_genes],axis=1)
    print(CUT_DF.shape)
    
# Remove empty genes if they exist
drop_genes = np.where(CUT_DF.sum(axis=0)==0)[0]
CUT_DF = CUT_DF.drop(CUT_DF.columns[drop_genes],axis=1)
print(CUT_DF.shape)

# GENES EXCLUDING LOW ABUNDANCE GENES
goi = list(CUT_DF.columns)
print(len(goi))

###### LOAD KEY GENESETS

In [None]:
# READ THE GENESET TABLE (CSV) AND UPPER-CASE EVERY ENTRY
path_to_genesets = DATA_PATH + 'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets, header='infer')
# Each column is cast to str before upper-casing, so missing values end up as
# the literal string 'NAN'.
genesets = genesets.apply(lambda column: column.astype(str).str.upper())
# Number of columns in the table
print(genesets.shape[1])

### EVALUATE GENE EXPRESSION PER META CLASS

In [None]:
# Box/violin plot of one gene's summed expression, split by Castle assignment,
# followed by a Mann-Whitney U comparison of the two classes.
plot_type = 'box'
title = 'BAP1'
meta = 'Assignment'
datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))
exec('META = METADATA_{}'.format(subset_type))

fig = plt.figure(figsize = (1,5))
ax = plt.gca()

scale_type = 'count'
palette = dict(zip(['Castle 1','Castle 2'],['#0000FF','#FF0000']))


genes = [title]
genes = np.unique([x for x in genes if str(x) != 'NAN'])
# Keep only requested genes that are actually columns of QUERY.
detected_genes = list(set(genes).intersection(set(QUERY.columns)))
vals = QUERY[detected_genes]
# Row-wise sum over the detected genes (NaNs ignored); all zeros if none found.
SCORE = np.nansum(vals,axis=1)

# Long-format table for seaborn: one row per row of QUERY.
# NOTE(review): the column is called 'Z-normalized Expression' but holds the
# raw row sums of QUERY - confirm that QUERY is already normalized as intended.
violin_data = pd.DataFrame({'gene': title,
                            'Z-normalized Expression': np.asarray(SCORE),
                            meta: META[meta].values})

# BOXPLOT GENE EXPRESSION
if plot_type == 'box':
    g = sns.boxplot(x="gene", y="Z-normalized Expression", hue=meta,data=violin_data, palette=palette,notch = True,
                    hue_order = ['Castle 1','Castle 2'],
                    fliersize = 4, showmeans=False,linewidth = 1, ax = ax) #order = labels,
    g.set_ylabel("{}".format(datatype),fontsize=10)
    g.set_xlabel(" ",fontsize=10,rotation = 90)
    g.tick_params(labelsize=10)
    sns.despine()
    ax.set(ylim=(0, SCORE.max()*0.95))
    g.legend(loc='upper right',prop={'size':6},bbox_to_anchor=(2.0, 0.95),fancybox=True)


elif plot_type == 'violin':
    # VIOLIN GENE EXPRESSION
    # `notch`, `fliersize` and `showmeans` are boxplot options, so they are not
    # passed here; `ax` is passed explicitly like in the box branch.
    g = sns.violinplot(x="gene", y="Z-normalized Expression", hue=meta,data=violin_data, palette=palette,
                       hue_order = ['Castle 1','Castle 2'],linewidth = 1, ax = ax) #order = labels,

    g.set_ylabel("{}".format(datatype),fontsize=10)
    g.set_xlabel(" ",fontsize=10,rotation = 90)
    ax.set(ylim=(0, SCORE.max()*0.95))
    g.tick_params(labelsize=10)
    sns.despine()

# COMPARE DISTRIBUTIONS
CLASS1 = violin_data.loc[violin_data[meta].isin(['Castle 1'])]['Z-normalized Expression'].values
CLASS2 = violin_data.loc[violin_data[meta].isin(['Castle 2'])]['Z-normalized Expression'].values
print(title + ' CLASS1 vs. CLASS2')
# State the alternative explicitly: the default differs between SciPy releases,
# so the printed p-value would otherwise depend on the installed version.
print(stats.mannwhitneyu(CLASS1,CLASS2,alternative='two-sided'))

# SAVE FIGURE
figure_label = '_{}_{}PLOT_{}_{}_{}'.format(subset_type, plot_type,datatype,meta,title)
fn = FIG_output_stem + '/BOXPLOTS/' + FN.replace(".h5", "") + figure_label

# CREATE THE OUTPUT DIRECTORY IF IT DOES NOT EXIST
d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=fig_dpi)
print(fn)

## TUMOR CELL CLUSTERS AND INFERRED CASTLE CLASS

### VISUALIZE PHENOGRAPH CLUSTERS

In [None]:
# PLOT CATEGORICAL CLUSTER ASSIGNMENTS
# Two panels on the same 2-D embedding: Phenograph cluster (left) and Castle
# class assignment (right).
# Note: Any rows assigned -1 were identified as outliers and should not be considered as a member of any community.
plt.figure(figsize = (8,3))
nrow = 1
ncol = 2
dot_size = 1
gs1 = gridspec.GridSpec(nrow, ncol)
gs1.update(wspace=0.1, hspace=0.1) # set the spacing between axes.

dimtype = 'ForceDirected'
meta = 'Phenograph_Class'
exec('DIM = DIMENSIONS_{}'.format(subset_type))
exec('META = METADATA_{}'.format(subset_type))
# QUERY is not used in this cell; it is bound here and read by the next cell.
exec('QUERY = INDF_{}'.format(subset_type))
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]
# Symmetric square limits: largest |coordinate| rounded up to a multiple of 10.
axis_min = -ceil(ceil(max(np.abs(x).max(),np.abs(y).max()))/10)*10
axis_max = ceil(ceil(max(np.abs(x).max(),np.abs(y).max()))/10)*10

# Cluster Assignment
ax = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=META[meta].values,
                              cmap= CM_CLASS,
                              legend_kwargs={'ncol': 1}, s=dot_size, ax=ax,randomize=True);
plt.rc('xtick', labelsize=6)
plt.rc('ytick', labelsize=6)
sns.despine()
# Axis labels are hidden by plt.axis('off') below; only the title stays visible.
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.title('{}'.format(meta), fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
ax.legend_.remove()

# Castle Class
ax = plt.subplot(gs1[1])
seqc.plot.scatter.categorical(x, y, c=META['Assignment'], cmap=CM_CASTLE,legend_kwargs={'ncol': 1}, s=dot_size, ax=ax);
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
# This panel is coloured by META['Assignment'] (Castle class), not by sample.
plt.title('Castle Class', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
ax.legend_.remove()

# SAVE FIGURE
figure_label = '_CLUSTERS_INDF_{}_{}_{}'.format(subset_type,meta,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=fig_dpi)
print(fn)

### MEAN EXPRESSION OF KEY GENES PER CLUSTER

In [None]:
# Per-cluster mean of every column of QUERY, grouping rows by the
# 'Phenograph_Class' level of the row index.
meta = 'Phenograph_Class'
cluster_mean = QUERY.groupby(level = [meta],axis=0).mean()

In [None]:
# Display the column names of the geneset table.
genesets.columns

In [None]:
# Pool the genes of several geneset columns into one flat list.
signature_columns = ['CIN_signature_Bakhoum','CIN_responsive_NC_NFKB_UP_Bakhoum',
                     'CIN_responsive_NC_NFKB_DN_Bakhoum', 'NC_NFKB_REGULATORS_UP_Bakhoum',
                     'NC_NFKB_REGULATORS_DN_Bakhoum']
class_genes = genesets.loc[:,signature_columns]
# 'NAN' is the upper-cased string form of missing cells in the geneset table.
class_genes = [gene for gene in class_genes.values.flatten() if gene not in ['NAN']]
# Short label used in figure file names. It must be a string: previously a
# misspelled variable received this value and `master_title` stayed a list.
master_title = 'NC_NFKB'
print(len(class_genes))

In [None]:
# Hand-picked gene list; `master_title` labels the resulting figure files.
master_title = 'MANUAL'
# 'STING1' is the current symbol for 'TMEM173'; both are listed so that either
# annotation is matched. (The list previously read 'TING1', which is not a
# gene symbol and was silently dropped by the downstream membership filter.)
class_genes = ['JARID2','STING1','TMEM173','BAP1','LPCAT1','HLA-A','HLA-B','HLA-C','HLA-E',
              'PMEL','MLANA','TYR','TYRP1','DCT','RING1']
print(len(class_genes))

In [None]:
# Clustered heatmap of per-cluster MEDIAN expression (z-scored per gene) for
# the genes in `class_genes`, with a row-colour strip and a row dendrogram.
meta = 'Phenograph_Class'
FLATUI_PLOT = FLATUI_CLASS

datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))
# From here on QUERY holds one row per value of the `meta` index level.
QUERY = QUERY.groupby(level = [meta],axis=0).median()

# GENERATE LUT (label -> RGB colour) FOR THE ROW GROUPS
ind = QUERY.index
# CONVERT HEX TO RGB (FLATUI_CLASS); `hex` is the `colors` package helper
# imported at the top of the file, not the builtin.
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
# Palette for class metadata: pick evenly spaced palette entries, clamping the
# last position so it stays inside the palette.
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind).map(lut)

# Position of the `meta` level within the index names (not used further here).
subset_key = meta
ix = [ind for ind, name in enumerate(QUERY.index.names) if name==subset_key][0]

# Restrict to genes that survived the low-abundance filter (`goi`).
genes = class_genes
genes = [gene for gene in genes if gene in goi]
genes = np.unique(genes)

# CONSTRUCT HEATMAP DATA (z-score each gene across the groups)
heatmap_data = pd.DataFrame(data = zscore(QUERY[genes].values,axis=0),columns = genes)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE
method = 'average' # average, single centroid/euclidean
metric = 'euclidean' # cosine
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[r1,cl]

# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM
fig = plt.figure(figsize=(10,5))

# ADD ROW COLOR STRIP (one stacked rectangle per row, in dendrogram order)
ax1 = fig.add_axes([0.18,0.1,0.03,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors[r1]:
    pos = (x, y / len(r1))
    ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(r1), color=c))
    if y >= len(r1)-1:
        x += 1
        y = 0
    else:
        y += 1
plt.axis('off')

# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.2,0.1,0.7,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-3,vmax=3)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 12,fontname='Arial')
axmatrix.grid(False)

# ADD DENDROGRAM
# Use the `hc` alias under which this file imports scipy.cluster.hierarchy.
hc.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0.105,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = hc.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')

# SAVE FIGURE
figure_label = 'heatmaps/user_defined/{}_{}_HEATMAP_{}_{}_colored_{}'.format(datatype,master_title,method,metric,meta).replace('-','_').replace(' ','_')
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label

# Create the output directory if needed.
d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

### MEAN EXPRESSION OF KEY GENES PER CELL

In [None]:
# Preview the first rows of the current QUERY frame.
QUERY.head()

In [None]:
# SELECT THE TUMOR SUBSET: alias its INDF matrix and metadata for the cells below
subset_type = 'TUMOR'
datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))
exec('META = METADATA_{}'.format(subset_type))

# Display the distinct values of the 'Archetype' index level
np.unique(QUERY.index.get_level_values('Archetype'))

In [None]:
# CASTLE CLASS GENES: flatten both geneset columns, skipping 'NAN' placeholders
castle_columns = ['Castle 2', 'Castle 1']
class_genes = [symbol
               for symbol in genesets.loc[:, castle_columns].values.flatten()
               if symbol not in ['NAN']]
print(len(class_genes))

# Keep only the genes that are columns of QUERY (order of class_genes preserved)
genes = [symbol for symbol in class_genes if symbol in QUERY.columns]
# Z-score each gene over the rows of QUERY and keep QUERY's row index
zscored_values = zscore(QUERY[genes].values, axis=0)
heatmap_data = pd.DataFrame(data=zscored_values, columns=genes, index=QUERY.index)
print(heatmap_data.shape)

In [None]:
# ROW INDEX: colour rows by the 'Archetype' index level of META
meta = 'Archetype'

# Hex palette (no leading '#'); the trailing numbers are the author's labels.
FLATUI_PLOT = [
                 'E6E6FA', # not assigned
                 'FF66FF', # pink 0 
                 '6600CC', # purple 1
                 '9ACD32', # Green 2
                 '00FFFF', # sky blue 3
                 'FF8800', # orange 4
                 '0000FF', # dark blue 5
                 'FFD700', # yellow 6
                 '20B2AA', # teal 7 
                 '008000', # forest green 8
                 '8B008B', # 9
    ]

ind = META.index.get_level_values(meta)

# CONVERT HEX TO RGB (FLATUI_PLOT); `hex` is the `colors` package helper
# imported at the top of the file, not the builtin.
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
# Palette for class metadata: evenly spaced palette positions, one per unique
# label, with the last position clamped inside the palette.
# NOTE(review): when there are fewer unique labels than palette entries the
# positions skip entries, so label k does not necessarily get the colour
# annotated "k" above - confirm this is intended.
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind).map(lut)

# COLUMN INDEX: genes of class_genes that are columns of QUERY
genes = [gene for gene in class_genes if gene in QUERY.columns]#gene_array['Gene'].values
# CONSTRUCT HEATMAP DATA (z-score each gene over the rows of QUERY)
heatmap_data = pd.DataFrame(data = zscore(QUERY[genes].values,axis=0),columns = genes, index = QUERY.index)
yticks = heatmap_data.index
xticks = heatmap_data.columns

In [None]:
# Display the z-scored heatmap matrix.
heatmap_data

In [None]:
# LINKAGE: hierarchical clustering of the heatmap rows and of its columns
method = 'average' # average, single
metric = 'euclidean' # cosine
# Rows of heatmap_data are the observations for the row linkage ...
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
# ... and the transpose makes the columns the observations for the column linkage.
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

In [None]:
# Display the current QUERY frame.
QUERY

In [None]:
# APPLY THE DENDROGRAM LEAF ORDER TO ROWS AND COLUMNS (CAN BE SLOW)
r1 = hc.leaves_list(row_linkage)
c1 = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[r1, c1]

# Row colours in the reordered row order: map each row's `meta` label through
# the lookup table, keeping the labels as the Series index as well.
ordered_labels = mat.index.get_level_values(meta)
row_colors = pd.Series(ordered_labels, index=ordered_labels).map(lut)

In [None]:
# VIEW CLUSTERED LABELED HEATMAP
# Draws the reordered matrix `mat` with a row-colour strip and a colorbar.
fig = plt.figure(figsize=(4,10))
plt.rcParams["axes.grid"] = False

# ADD ROW COLOR STRIP (rows coloured by the `meta` index level via `lut`)
ax1 = fig.add_axes([0,0.1,0.05,0.6]) # [x0,y0,width,height]
x = 0
y = 0
# Stack one rectangle per row, bottom to top, in the order of `row_colors`.
for c in row_colors:
    pos = (x, y / len(row_colors))
    ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(row_colors), color=c))
    if y >= len(row_colors)-1:
        x += 1
        y = 0
    else:
        y += 1
plt.axis('off')

# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.05,0.1,0.9,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-1,vmax=1)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 14)

# ADD COLORBAR
axcolor = fig.add_axes([1.1,0.1,0.1,0.1])
cbar = plt.colorbar(im, cax=axcolor)
#cbar.ax.get_yaxis().set_ticks([])

# SAVE FIGURE
# The label now carries the four settings that were being passed to .format();
# the original template had no placeholders, so they were silently dropped and
# every configuration wrote to the same file.
figure_label = 'CASTLE_GENES_{}_{}_{}_{}'.format(meta,subset_type,method,metric)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label

# Create the output directory if needed.
d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

## ARCHETYPAL ANALYSIS

In [None]:
# Display which subset is currently active.
subset_type

### IDENTIFY OPTIMAL NUMBER OF ARCHETYPES

In [None]:
# Select the 'IPC'-prefixed columns of the DIMENSIONS frame
component_prefix = 'IPC'
exec('QUERY = DIMENSIONS_{}'.format(subset_type))
# Regex: column name starts with the prefix and contains no 'TSNE' after it
idx = QUERY.columns.to_series().str.contains('^{}((?!TSNE).)*$'.format(component_prefix)) # select from DIMS_NDF
exec('IPC = DIMENSIONS_{}.loc[:, idx.values]'.format(subset_type))
# Keep at most the first 22 matching columns
IPC = IPC.iloc[:,:22]
IPC.head()

In [None]:
!pip freeze

In [None]:
# IDENTIFY OPTIMAL NUMBER OF ARCHETYPES BASED ON KNEE POINT OF THE MODEL EXPLAINED VARIANCE
# Fit PCHA for 1..n archetypes and record SSE / explained variance of each fit.
from py_pcha.PCHA import PCHA
n = 20
SSE = [0] * n
varexpl =[0]*n
for noc in np.arange(n):
    # Slot `noc` of the result lists holds the fit with noc+1 archetypes;
    # report the number actually fitted rather than the 0-based slot.
    fitted_noc = noc + 1
    XC, S, C, SSE[noc], varexpl[noc] = PCHA(IPC.T.values, noc=fitted_noc, delta=0.05)
    print('noc: {}, SSE: {}, EXPVAR: {}'.format(fitted_noc, SSE[noc], varexpl[noc]))

In [None]:
# Display the explained variance recorded for each fit.
varexpl

In [None]:
# IDENTIFY POINT OF MAXIMUM CURVATURE IN CUMULATIVE EXPLAINED VARIANCE
y = varexpl
# x is 1-based so that it equals the number of archetypes of each fit
x = np.arange(len(y))+1
# KneeLocator is called with its default curve/direction settings
kneedle = KneeLocator(x, y)
kneedle.plot_knee_normalized()
knee = ncomponents = kneedle.knee
plt.title('knee: {}'.format(knee))

# SAVE FIGURE
figure_label = '_{}_KNEEPOINT_ARCHETYPE_EXPLAINED_VARIANCE'.format(subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)

### COMPUTE ARCHETYPES USING OPTIMAL NUMBER OF ARCHETYPES

In [None]:
# Compute archetypes on the selected IPC columns
# NOTE(review): the number of archetypes is hard-coded here rather than taken
# from the `knee` value computed earlier - confirm this is deliberate.
optimal_noc = 8
from py_pcha.PCHA import PCHA
# This rebinds SSE and varexpl (lists from the sweep) to this single fit's values.
XC, S, C, SSE, varexpl = PCHA(IPC.T.values, noc=optimal_noc, delta=0.05)
# One row per archetype, one column per IPC column
archetypes = pd.DataFrame(data = np.squeeze(np.asarray(XC)).T, columns = IPC.columns)

In [None]:
# Display the archetype coordinates.
archetypes

### COMPUTE PAIRWISE DISTANCE BETWEEN ARCHETYPES

In [None]:
# Pairwise Euclidean distance between archetypes (rows of `archetypes`).
# The square root is taken AFTER summing the squared differences; applying it
# element-wise before the sum would yield the L1 (Manhattan) distance instead.
archetype_coords = archetypes.values.astype(float)
# Broadcast to shape (n_archetypes, n_archetypes, n_components)
pairwise_diff = archetype_coords[:, np.newaxis, :] - archetype_coords[np.newaxis, :, :]
distance = np.sqrt((pairwise_diff ** 2).sum(axis=-1))

# Label rows and columns with the archetype position 0..n-1
distance = pd.DataFrame(data = distance, columns = np.arange(archetypes.shape[0]),
                        index = np.arange(archetypes.shape[0]))

In [None]:
# Clustered heatmap of the archetype-to-archetype `distance` matrix
yticks = distance.index
xticks = distance.columns

# LINKAGE: cluster the rows (and columns) of the distance matrix
# NOTE(review): this treats each row of the distance matrix as a feature vector
# and compares rows with the cosine metric - confirm this is intended.
method = 'average'
metric = 'cosine'
linkage = hc.linkage(distance, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(distance.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = distance.iloc[r1,cl]

# VIEW CLUSTERED LABELED HEATMAP
fig = plt.figure(figsize=(5,5))
plt.rcParams["axes.grid"] = False

# ADD MATRIX WITH ARCHETYPE LABELS ON BOTH AXES
axmatrix = fig.add_axes([0.1,0.1,0.6,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 14)

labels = list(mat.index)
axmatrix.yaxis.set_ticks_position('left')
ytick = plt.yticks(range(len(labels)), labels, rotation = 0, fontsize = 14)

#axmatrix.set_yticklabels([]);

# ADD COLORBAR
axcolor = fig.add_axes([0.75,0.1,0.01,0.1])
cbar = plt.colorbar(im, cax=axcolor)
#cbar.ax.get_yaxis().set_ticks([])

# SAVE FIGURE
figure_label = 'euclidean_distance_btw_archetypes'
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
    
# Create the output directory if it does not exist
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

###### FIND CELL NEAREST TO EACH ARCHETYPE BASED ON EUCLIDEAN DISTANCE

In [None]:
# Compute the Euclidean distance between each row of IPC and each archetype
archetype_distances = pd.DataFrame()
for current_archetype_idx in np.arange(archetypes.shape[0]):
    current_archetype = archetypes.loc[current_archetype_idx,:]
    # The archetype Series is aligned with IPC's columns; the subtraction is
    # applied to every row of IPC, then reduced across columns (axis=1).
    distances = np.sqrt(((current_archetype-IPC)**2).sum(axis = 1)).to_frame(name = 'Distance_Archetype_{}'.format(current_archetype_idx))
    # Append one 'Distance_Archetype_<i>' column per archetype
    archetype_distances = pd.concat([archetype_distances, distances],axis=1)

### VISUALIZE CELL NEAREST TO EACH ARCHETYPE

In [None]:
# Display the archetype positions 0..n-1 (one per distance column).
np.arange(len(archetype_distances.columns))

In [None]:
# Display the geneset column names as a plain list.
list(genesets.columns)

In [None]:
# Visualize two gene-signature scores on the 2-D embedding, one panel each.
plt.figure(figsize = (10,5))
nrow = 1
ncol = 2
dot_size = 1
gs1 = gridspec.GridSpec(nrow, ncol)
gs1.update(wspace=0.01, hspace=0.1) # set the spacing between axes.

cm = plt.cm.bwr
dimtype = 'ForceDirected'
meta = 'Phenograph_Class'
exec('DIM = DIMENSIONS_{}'.format(subset_type))
exec('QUERY= INDF_{}'.format(subset_type))
exec('META = METADATA_{}'.format(subset_type))
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]
# Symmetric square limits: largest |coordinate| rounded up to a multiple of 10.
axis_min = -ceil(ceil(max(np.abs(x).max(),np.abs(y).max()))/10)*10
axis_max = ceil(ceil(max(np.abs(x).max(),np.abs(y).max()))/10)*10

# (geneset column, panel title): Monosomy 3 signature, then the Castle 2 signature
signature_panels = [('Monosomy 3 Up', 'Monosomy 3'), ('Castle 2', 'GEP 2')]

for panel_idx, (gene_sig, panel_title) in enumerate(signature_panels):
    ax = plt.subplot(gs1[panel_idx])
    signature_genes = genesets[gene_sig].values
    # Missing geneset cells are the string 'NAN' once the table has been
    # upper-cased; compare case-insensitively so both spellings are removed.
    signature_genes = [gene for gene in signature_genes if str(gene).upper() != 'NAN']
    detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
    # Score per row: median over the detected signature genes, z-scored across rows
    phase = zscore(np.nanmedian(QUERY[detected_genes],axis=1))
    mask = np.isnan(phase)

    # Shuffle the plotting order so no group is systematically drawn on top
    color_values = np.array(phase)
    i = np.random.permutation(len(color_values))
    xx = np.array(x)[i]
    yy = np.array(y)[i]
    color_values = color_values[i]

    plt.scatter(xx, yy, c = color_values, cmap = CM_DIVERGING, s=3, alpha =0.4)
    plt.clim(-1.5,1.5)
    plt.xlim(axis_min,axis_max)
    plt.ylim(axis_min,axis_max)
    plt.axis('off')
    plt.title(panel_title, fontname='Helvetica', size=12, weight='normal')

# SAVE FIGURE
figure_label = '_{}_MONOSOMY3_CASTLE2'.format(subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# One hex colour (no leading '#') per archetype, indexed by archetype position.
FLATUI_ARCHETYPES = [
                 'FF66FF', # pink
                 '6600CC', # purple
                 '9ACD32', # Green
                 '00FFFF', # sky blue
                 'FF8800', # orange
                 '0000FF', # dark blue
                 'FFD700', # yellow
                 '20B2AA', # teal
                 #'008000', # forest green
                 #'8B008B'
    ]

# Monosomy 3 signature score on the embedding, with the cell nearest to each
# archetype drawn on top as a large coloured marker.
plt.figure(figsize = (20,10))
nrow = 1
ncol = 2
dot_size = 1
gs1 = gridspec.GridSpec(nrow, ncol)
gs1.update(wspace=0.01, hspace=0.1) # set the spacing between axes.

cm = plt.cm.bwr
dimtype = 'ForceDirected'
meta = 'Phenograph_Class'
exec('DIM = DIMENSIONS_{}'.format(subset_type))
exec('QUERY= INDF_{}'.format(subset_type))
exec('META = METADATA_{}'.format(subset_type))
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]
# Symmetric square limits: largest |coordinate| rounded up to a multiple of 10.
axis_min = -ceil(ceil(max(np.abs(x).max(),np.abs(y).max()))/10)*10
axis_max = ceil(ceil(max(np.abs(x).max(),np.abs(y).max()))/10)*10

# MONOSOMY 3 SIGNATURE
ax = plt.subplot(gs1[0])
gene_sig = 'Monosomy 3 Up'
signature_genes = genesets[gene_sig].values
# Missing geneset cells are the string 'NAN' once the table has been
# upper-cased; compare case-insensitively so both spellings are removed.
signature_genes = [gene for gene in signature_genes if str(gene).upper() != 'NAN']
detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
# Score per row: median over the detected signature genes, z-scored across rows
phase = zscore(np.nanmedian(QUERY[detected_genes],axis=1))
mask = np.isnan(phase)

# Shuffle the plotting order so no group is systematically drawn on top
color_values = np.array(phase)
i = np.random.permutation(len(color_values))
xx = np.array(x)[i]
yy = np.array(y)[i]
color_values = color_values[i]

plt.scatter(xx, yy, c = color_values, cmap = CM_DIVERGING, s=3, alpha =0.4)
plt.clim(-1.5,1.5)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
plt.title('Monosomy 3', fontname='Helvetica', size=12, weight='normal')

# Nearest cell to each archetype, drawn on the same axes as the signature.
#ax = plt.subplot(gs1[1])
#archetype_distances['Archetype_Cluster'] = [0]*archetype_distances.shape[0]
#plt.scatter(x,y,c = '#E6E6FA',s = 3)
offset = 1000
for ii in np.arange(len(archetype_distances.columns)):
    c = np.log10(archetype_distances['Distance_Archetype_{}'.format(ii)])
    #cc = c < c.mean()-2*c.std()
    # Boolean mask selecting the row(s) at minimum distance to archetype ii
    c = c==c.min()
    #numcells = cc.sum()
    #print(numcells)
    #text(x, y, s, fontsize=12)
    plt.scatter(x[c.values],y[c.values],s= 300, color = '#'+FLATUI_ARCHETYPES[ii])
    # NOTE(review): the label is placed `offset` data units away from the
    # marker; with the axis limits set below it may fall outside the visible
    # area (the file name says 'noText') - confirm this is intended.
    plt.text(x[c.values]+offset,y[c.values]+offset,s = '{}'.format(ii), fontsize = 20)#numcells/10)

# Axis styling does not depend on the archetype, so it is applied once.
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.title('Archetypes', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')

# SAVE FIGURE
figure_label = '_{}_MONOSOMY3_ARCHETYPES_noText_Top5'.format(subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# Palette with a leading "not assigned" colour, so label 0 maps to the first
# entry and archetype i (labelled i+1 below) maps to entry i+1.
FLATUI_ARCHETYPES = [
                 'E6E6FA', # not assigned
                 'FF66FF', # pink
                 '6600CC', # purple
                 '9ACD32', # Green
                 '00FFFF', # sky blue
                 'FF8800', # orange
                 '0000FF', # dark blue
                 'FFD700', # yellow
                 '20B2AA', # teal
                 #'008000', # forest green
                 #'8B008B'
    ]

# CONVERT HEX TO RGB (FLATUI_ARCHETYPES) and build a discrete colormap from it;
# `hex` is the `colors` package helper imported at the top of the file.
colors = np.zeros((len(FLATUI_ARCHETYPES),3))
for ind,hexcolor in enumerate(FLATUI_ARCHETYPES):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_ARCHETYPES = LinearSegmentedColormap.from_list('FLATUI_ARCHETYPES', colors, N=len(colors))

# Label per row: 0 = not nearest to any archetype, i+1 = nearest row to archetype i
num_archetypes = len(archetype_distances.columns)
ccc = [0]*archetype_distances.shape[0]
for ii in np.arange(num_archetypes):
    c = np.log10(archetype_distances['Distance_Archetype_{}'.format(ii)])
    # True only for the row(s) at minimum distance to archetype ii
    cc = c == c.min()
    cc = cc*(ii+1)
    # NOTE(review): labels are summed, so a row that is nearest to two
    # archetypes gets the SUM of their labels, which can equal another
    # archetype's label - confirm such ties cannot occur or handle them.
    ccc=ccc+cc
    
ax = plt.gca()
seqc.plot.scatter.categorical(x, y, c= ccc, randomize = True,edgecolors='none',
                              cmap=CM_ARCHETYPES,legend_kwargs={'ncol': 1}, s=1, ax=ax);
plt.title('Archetypes', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax.legend_.remove()
plt.axis('off')
#ccc[ccc>num_archetypes] = 0 # remove cells assigned to more than one archetype

In [None]:
# Display the distinct nearest-archetype labels that were assigned.
np.unique(ccc)

###### UPDATE INDEX

In [None]:
# Preview the tumor metadata before it is modified below.
METADATA_TUMOR.head()

In [None]:
# Store the nearest-archetype label (0 = none) as a metadata column
METADATA_TUMOR['Archetype_Nearest_Cell'] = ccc

# Build a new row MultiIndex: the seven existing levels listed below, in this
# order, followed by the new 'Archetype_Nearest_Cell' column as an eighth level.
new_index = pd.MultiIndex.from_tuples(list(zip(METADATA_TUMOR.index.get_level_values('Sample ID'), 
                                               METADATA_TUMOR.index.get_level_values('Legend'),
                                               METADATA_TUMOR.index.get_level_values('Patient'),
                                               METADATA_TUMOR.index.get_level_values('Cell ID'),
                                               METADATA_TUMOR.index.get_level_values('Phenograph_Class'),
                                               METADATA_TUMOR.index.get_level_values('Archetype'),
                                               METADATA_TUMOR.index.get_level_values('Nearest Archetype'),
                                               METADATA_TUMOR['Archetype_Nearest_Cell'])), 
                                  names=['Sample ID','Legend', 'Patient','Cell ID','Phenograph_Class','Archetype',
                                        'Nearest Archetype','Archetype_Nearest_Cell'])

In [None]:
# Re-index every per-subset frame with `new_index`.
subset_type = 'TUMOR'
# Copy each frame and swap its index. Rebuilding through `.values` would
# collapse a mixed-dtype frame into a single object-dtype array and lose the
# per-column dtypes.
for _frame_prefix in ('DF', 'NDF', 'INDF', 'METADATA', 'DIMENSIONS'):
    _frame_name = '{}_{}'.format(_frame_prefix, subset_type)
    _reindexed = globals()[_frame_name].copy()
    _reindexed.index = new_index  # raises ValueError on a row-count mismatch
    globals()[_frame_name] = _reindexed

### COMPUTE DIFFUSION DISTANCE BETWEEN EACH CELL AND EACH ARCHETYPE (NEAREST CELL)

In [None]:
# PLOT THE FIRST DIFFUSION COMPONENTS ON THE 2-D EMBEDDING, PLUS A SAMPLE PANEL
dimtype = 'ForceDirected' 

exec('x = DIMENSIONS_{}[\'{}0\']'.format(subset_type,dimtype))
exec('y = DIMENSIONS_{}[\'{}1\']'.format(subset_type,dimtype))
xmax = np.abs(x).max()
ymax = np.abs(y).max()
# Symmetric square limits: largest |coordinate| rounded up to a multiple of 10
axis_min = -ceil(ceil(max(xmax,ymax)/10)*10)
axis_max = ceil(ceil(max(xmax,ymax)/10)*10)

cm = plt.cm.bwr 
dot_size = 1

# Select the 'DC'-prefixed columns of the DIMENSIONS frame
component_prefix = 'DC'
exec('QUERY = DIMENSIONS_{}'.format(subset_type))
# Regex: column name starts with the prefix and contains no 'TSNE' after it
idx = QUERY.columns.to_series().str.contains('^{}((?!TSNE).)*$'.format(component_prefix)) # select from DIMS_NDF
exec('DIFFUSION_EIGS = DIMENSIONS_{}.loc[:, idx.values]'.format(subset_type))

nrow = 2
ncol = 3
plt.figure(figsize = (10,5))
gs1 = gridspec.GridSpec(nrow, ncol)
gs1.update(wspace=0.5, hspace=0.5)  

# One panel per component (first five columns), coloured by the component value
for ind,label in enumerate(DIFFUSION_EIGS.columns[:5]):
    c = DIFFUSION_EIGS[label].values
    # CURRENT AXIS
    ax = plt.subplot(gs1[ind])
    plt.scatter(x, y, c=c,cmap=CM_DIVERGING, s=dot_size, alpha =0.4)             # finite data
    plt.title(label, fontname='Helvetica', size=12, weight='normal')
    plt.axis('off')
    plt.clim(-0.04,0.04)
    plt.xlim(axis_min,axis_max)
    plt.ylim(axis_min,axis_max)
    
# PLOT SAMPLE_ID in the grid slot after the last component panel
# (relies on `ind` keeping its final value from the loop above)
ax = plt.subplot(gs1[ind+1])
exec('seqc.plot.scatter.categorical(x, y, c=NDF_{}.index.get_level_values(\'Legend\'), randomize = True, \
     cmap=CM_SAMPLES, s=dot_size, ax=ax)'.format(subset_type))
plt.title('Sample ID', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
ax.legend_.remove()

# SAVE FIGURE
figure_label = '_{}_DIFFUSIONEIGS_{}'.format(subset_type,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)

### SELECT MOST IMPORTANT DIFFUSION EIGS FOR MULTISCALE DISTANCE

In [None]:
# Preview the selected 'DC' columns.
DIFFUSION_EIGS.head()

In [None]:
# Build `eigs` and plot the absolute difference between consecutive entries
# NOTE(review): `eig[0]` takes the FIRST COLUMN's value of every ROW of
# DIFFUSION_EIGS, so `eigs` has one entry per row of that frame, not one per
# diffusion component. If DIFFUSION_EIGS holds per-cell component values (as
# its use for scatter colours suggests), these are not eigenvalues - confirm
# the intended source of the eigenvalues.
# NOTE(review): this also rebinds the name `eigs`, which the top of the file
# imports from scipy.sparse.linalg.
eigs = DIFFUSION_EIGS
eigs = [eig[0].astype(float) for eig in eigs.values]
eigs = np.array(eigs)
eigengap = np.abs(eigs[:-1]-eigs[1:])

# PLOT EIGENGAP
fig = plt.figure(figsize = (3,3))
ax = plt.gca()
ax.set_facecolor('white')
plt.plot(np.arange(len(eigengap))+1,eigengap, marker='.', linestyle='dashed',linewidth=1, markersize=10, color = 'k')
# Fixed view window: only the first 18 gaps and values up to 0.025 are shown
plt.xlim(1,18)
plt.ylim(0,0.025)

ax.set_ylabel("\n\n\n{}".format('Delat(Eigenvalue)'),fontsize=10)
ax.set_xlabel("Diffusion Component",fontsize=10)
ax.tick_params(labelsize=10)

plt.grid(False)
sns.despine()

# SAVE FIGURE
figure_label = '{}_DCeigengap'.format(subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
    
# Create the output directory if it does not exist
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)

### PAIRWISE DIFFUSION DISTANCE BETWEEN CELLS

In [None]:
# Multi-scale diffusion distance between all pairs of rows of DIFFUSION_EIGS.
L = 5 # number of eigen vectors to include in multi-scale distance

from scipy.spatial.distance import pdist, squareform

vec = DIFFUSION_EIGS.iloc[:,:L]
# Per-component weight eig/(1-eig), truncated to the first L entries of `eigs`.
scale_factor = (eigs/(1-eigs))#**2 # constant
scale_factor = scale_factor[:L]

# Scale each of the L columns by its weight.
distance_matrix = vec*scale_factor
# Built-in Euclidean metric: same formula as a Python lambda computing
# sqrt(sum((u-v)**2)), but evaluated in compiled code instead of once per pair
# in the interpreter.
# NOTE: this rebinds `pairwise_distances`, which the top of the file imports
# from sklearn.metrics.pairwise.
pairwise_distances = squareform(pdist(distance_matrix.values, metric='euclidean')) # MS, remove sqrt?

# Rows keep the full MultiIndex; columns are labelled by the 'Cell ID' level.
diffusion_pairwise_distances= pd.DataFrame(index = DIFFUSION_EIGS.index, data=pairwise_distances,
                                           columns = DIFFUSION_EIGS.index.get_level_values('Cell ID'))

# Quick look: negative log distance (small offset avoids log(0) on the diagonal).
fig = plt.figure(figsize = (3,3))
ax = plt.gca()
plt.imshow(-np.log(pairwise_distances+0.001))
plt.colorbar()

### ASSIGN SOFT CLUSTER TO ARCHETYPE BASED ON DIFFUSION DISTANCE

In [None]:
# Display the cell-by-cell diffusion distance table.
diffusion_pairwise_distances

In [None]:
# Collect, for every archetype label, the Cell ID of the first row carrying that label.
# The first (smallest) unique label is skipped -- presumably the "no archetype" label; confirm.
master_groupby = 'Archetype_Nearest_Cell'
ix = [pos for pos, level in enumerate(diffusion_pairwise_distances.index.names) if level == master_groupby][0]

arch_labels = diffusion_pairwise_distances.index.get_level_values(master_groupby)
arch_ids = np.unique(arch_labels)

# Slot k-1 holds the Cell ID for archetype label k.
cellid_archetype = np.zeros(len(arch_ids) - 1)
for arch_ind in arch_ids[1:]:
    vals = diffusion_pairwise_distances.loc[arch_labels == arch_ind]
    cellid_archetype[int(arch_ind) - 1] = vals.index.get_level_values('Cell ID')[0]
cellid_archetype

In [None]:
# Assign a soft cluster around each archetype based on multi-scale diffusion distance.
# The radius around an archetype is half the distance to its nearest other archetype;
# every cell closer than that radius receives the archetype's label, all others stay 0.
master_groupby = 'Archetype_Nearest_Cell'
ix = [pos for pos, level in enumerate(diffusion_pairwise_distances.index.names) if level == master_groupby][0]

arch_labels = diffusion_pairwise_distances.index.get_level_values(master_groupby)
archetype_cells = cellid_archetype.astype(int)

# Running label per cell, accumulated over archetypes.
soft_cluster = np.zeros(diffusion_pairwise_distances.shape[0]).astype(int)

for arch_ind in np.unique(arch_labels)[1:]:

    # Row(s) of the distance table belonging to this archetype's nearest cell
    vals = diffusion_pairwise_distances.loc[arch_labels == arch_ind]
    own_cell = vals.index.get_level_values('Cell ID')[0]

    # Distances from this archetype to the other archetypes' cells
    distance_to_other_archetypes = vals[archetype_cells].drop(own_cell, axis=1)

    arch_radius = 0.5*distance_to_other_archetypes.min().min()
    tmp = (vals < arch_radius)*arch_ind
    soft_cluster = soft_cluster + tmp.values[0]

assignments = pd.DataFrame(index = diffusion_pairwise_distances.index,
                           columns = ['Archetype_Soft_Cluster'],
                           data = soft_cluster)

METADATA_TUMOR['Archetype_Soft_Cluster'] = assignments['Archetype_Soft_Cluster']

In [None]:
# Hex colour codes, one per soft-cluster label; the first entry is used for unassigned cells.
FLATUI_ARCHETYPES = [
    'E6E6FA',  # not assigned
    'FF66FF',  # pink
    '6600CC',  # purple
    '9ACD32',  # green
    '00FFFF',  # sky blue
    'FF8800',  # orange
    '0000FF',  # dark blue
    'FFD700',  # yellow
    '20B2AA',  # teal
    # unused alternates: '008000' (forest green), '8B008B'
]

# CONVERT HEX TO RGB, SCALED TO [0, 1]
colors = np.array([tuple(hex(code).rgb) for code in FLATUI_ARCHETYPES], dtype=float)
colors = np.divide(colors, 255)

# Colormap with exactly one bin per listed colour
CM_ARCHETYPES = LinearSegmentedColormap.from_list('FLATUI_ARCHETYPES', colors, N=len(colors))

In [None]:
### PLOT CATEGORICAL CLUSTER ASSIGNMENTS
# Two panels on the same 2-D embedding: Phenograph class (left) and archetype soft cluster (right).
# Note: Any rows assigned -1 were identified as outliers and should not be considered as a member of any community.
plt.figure(figsize = (8,3))
nrow = 1
ncol = 2
dot_size = 1
gs1 = gridspec.GridSpec(nrow, ncol)
gs1.update(wspace=0.1, hspace=0.1) # set the spacing between axes.

dimtype = 'ForceDirected'
meta = 'Phenograph_Class'                   # left-panel grouping (also read by later cells)
archetype_meta = 'Archetype_Soft_Cluster'   # right-panel grouping

# Look the per-subset frames up by name instead of exec-ing assignment strings.
DIM = globals()['DIMENSIONS_{}'.format(subset_type)]
META = globals()['METADATA_{}'.format(subset_type)]
QUERY = globals()['INDF_{}'.format(subset_type)]
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]

# Symmetric square limits, rounded up to the next multiple of 10.
axis_max = ceil(ceil(max(np.abs(x).max(),np.abs(y).max()))/10)*10
axis_min = -axis_max

# Cluster Assignment
ax = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=META[meta].values,
                              cmap= CM_CLASS,
                              legend_kwargs={'ncol': 1}, s=dot_size, ax=ax,randomize=True);
plt.rc('xtick', labelsize=6)
plt.rc('ytick', labelsize=6)
sns.despine()
# Axis labels are hidden by plt.axis('off') below.
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.title('{}'.format(meta), fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
ax.legend_.remove()

# Archetype soft cluster
ax = plt.subplot(gs1[1])
seqc.plot.scatter.categorical(x, y, c=META[archetype_meta],
                              cmap=CM_ARCHETYPES,legend_kwargs={'ncol': 1}, s=dot_size, ax=ax);
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
# Title now names the plotted column (was the copy-pasted 'Sample ID').
plt.title('{}'.format(archetype_meta), fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
#ax.legend_.remove()

# SAVE FIGURE
# The right-panel column is part of the file name so that the nearest-archetype figure,
# which is saved under '_CLUSTERS_INDF_<subset>_<meta>_<dimtype>', no longer overwrites this one.
figure_label = '_CLUSTERS_INDF_{}_{}_{}_{}'.format(subset_type,meta,archetype_meta,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=fig_dpi)
print(fn)


###### ASSIGN ALL CELLS TO NEAREST ARCHETYPE

In [None]:
# Preview the first rows of the cell-by-cell diffusion distance table.
diffusion_pairwise_distances.head()

In [None]:
# For every 'Archetype_Nearest_Cell' label, store each cell's mean distance to the rows
# carrying that label; a cell's Nearest_Archetype is the label with the smallest mean.
# NOTE(review): unlike the other archetype loops, this one does not skip the first unique
# label, so that label can be returned as a "nearest archetype" -- confirm this is intended.
arch_labels = diffusion_pairwise_distances.index.get_level_values('Archetype_Nearest_Cell')

D = pd.DataFrame(index = diffusion_pairwise_distances.index)
for label in np.unique(arch_labels):
    rows_with_label = diffusion_pairwise_distances[arch_labels == label]
    D[label] = rows_with_label.mean(axis=0).values

METADATA_TUMOR['Nearest_Archetype'] = D.idxmin(axis=1)

In [None]:
# Palette without the leading "not assigned" colour.
cm_flatui_1 = FLATUI_ARCHETYPES[1:]

# CONVERT HEX TO RGB, SCALED TO [0, 1]
colors = np.zeros((len(cm_flatui_1),3))
for ind,hexcolor in enumerate(cm_flatui_1):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_ARCHETYPES = LinearSegmentedColormap.from_list('cm_flatui_1', colors, N=len(colors))

# PLOT CATEGORICAL CLUSTER ASSIGNMENTS
# Two panels on the same 2-D embedding: Phenograph class (left) and nearest archetype (right).
# Note: Any rows assigned -1 were identified as outliers and should not be considered as a member of any community.
plt.figure(figsize = (8,3))
nrow = 1
ncol = 2
dot_size = 1
gs1 = gridspec.GridSpec(nrow, ncol)
gs1.update(wspace=0.1, hspace=0.1) # set the spacing between axes.

dimtype = 'ForceDirected'
meta = 'Phenograph_Class'              # left-panel grouping (also read by later cells)
archetype_meta = 'Nearest_Archetype'   # right-panel grouping

# Look the per-subset frames up by name instead of exec-ing assignment strings.
DIM = globals()['DIMENSIONS_{}'.format(subset_type)]
META = globals()['METADATA_{}'.format(subset_type)]
QUERY = globals()['INDF_{}'.format(subset_type)]
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]

# Symmetric square limits, rounded up to the next multiple of 10.
axis_max = ceil(ceil(max(np.abs(x).max(),np.abs(y).max()))/10)*10
axis_min = -axis_max

# Cluster Assignment
ax = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=META[meta].values,
                              cmap= CM_CLASS,
                              legend_kwargs={'ncol': 1}, s=dot_size, ax=ax,randomize=True);
plt.rc('xtick', labelsize=6)
plt.rc('ytick', labelsize=6)
sns.despine()
# Axis labels are hidden by plt.axis('off') below.
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.title('{}'.format(meta), fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
ax.legend_.remove()

# Nearest archetype
ax = plt.subplot(gs1[1])
seqc.plot.scatter.categorical(x, y, c=META[archetype_meta],
                              cmap=CM_ARCHETYPES,legend_kwargs={'ncol': 1}, s=dot_size, ax=ax);
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
# Title now names the plotted column (was the copy-pasted 'Sample ID').
plt.title('{}'.format(archetype_meta), fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
#ax.legend_.remove()

# SAVE FIGURE
# NOTE(review): the file name is built only from subset_type, meta and dimtype, so any
# other figure saved with the same three values writes to the same path.
figure_label = '_CLUSTERS_INDF_{}_{}_{}'.format(subset_type,meta,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=fig_dpi)
print(fn)

###### UPDATE MULTI-INDEX

In [None]:
# Build a new MultiIndex: the existing index levels listed below, followed by three
# METADATA_TUMOR columns promoted to index levels.
index_levels = ['Sample ID', 'Legend', 'Patient', 'Cell ID', 'Phenograph_Class', 'Archetype',
                'Nearest Archetype']
column_levels = ['Archetype_Nearest_Cell', 'Archetype_Soft_Cluster', 'Nearest_Archetype']

level_values = [METADATA_TUMOR.index.get_level_values(level) for level in index_levels]
level_values += [METADATA_TUMOR[column] for column in column_levels]

new_index = pd.MultiIndex.from_tuples(list(zip(*level_values)),
                                      names=index_levels + column_levels)

In [None]:
subset_type = 'TUMOR'

# Rebuild every <PREFIX>_<subset_type> frame on new_index (same values, same columns).
# The frames are looked up and rebound by name through globals() rather than through
# five hand-written exec strings.
for prefix in ('DF', 'NDF', 'INDF', 'METADATA', 'DIMENSIONS'):
    frame_name = '{}_{}'.format(prefix, subset_type)
    frame = globals()[frame_name]
    globals()[frame_name] = pd.DataFrame(frame.values, columns = frame.columns, index = new_index)

### FRACTION OF EACH ARCHETYPE PER PATIENT

In [None]:
# Display the currently selected subset name.
subset_type

In [None]:
# Display the archetype hex-colour list.
FLATUI_ARCHETYPES

In [None]:
groupby_type = 'Nearest_Archetype' # specify grouping
cm_flatui = FLATUI_ARCHETYPES # specify colormap for grouping

# QUERY and META both refer to the METADATA_<subset_type> frame (looked up by name).
datatype = 'METADATA_{}'.format(subset_type)
QUERY = globals()[datatype]
META = globals()[datatype]

# REBUILD THE INDEX SO THAT IT CONTAINS THE GROUPING COLUMN AS A LEVEL
new_index = pd.MultiIndex.from_tuples(list(zip(META.index.get_level_values('Sample ID'),
                                               META.index.get_level_values('Patient'),
                                               META.index.get_level_values('Cell ID'),
                                               META[groupby_type])),
                                  names=['Sample ID','Patient', 'Cell ID',groupby_type])
META = pd.DataFrame(data =META.values, index = new_index, columns = META.columns)

# Cell counts per (Patient, group); missing combinations become 0.
tissue_cluster_sizes = META.groupby(level=['Patient', groupby_type], axis=0).size().unstack().fillna(0)

# PRINT NUMBER OF CELLS PER GROUP
print(tissue_cluster_sizes.sum(axis=0))
print('\n')
print(tissue_cluster_sizes.sum(axis=0).sum())
# PRINT NUMBER OF CELLS PER PATIENT
print(tissue_cluster_sizes.sum(axis=1))

# CONVERT HEX TO RGB, SCALED TO [0, 1]
colors = np.zeros((len(cm_flatui),3))
for ii,hexcolor in enumerate(cm_flatui):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)

# Back to 0-255 integers for rgb2hex. round() before int() because x/255*255 is not
# guaranteed to be exact in floating point, and a bare int() would then truncate a
# channel to one less than its original value.
metacell_colors = [rgb2hex(int(round(color[0]*255)), int(round(color[1]*255)), int(round(color[2]*255)))
                   for color in colors]

In [None]:
# Stacked bar chart of the counts in tissue_cluster_sizes (one bar per row), drawn without axes
D = tissue_cluster_sizes
ax = plt.figure(figsize = (3,1)).gca()
D.plot.bar(stacked=True, color=metacell_colors, ax = ax, width = 0.85)
ax.legend_.remove()
plt.axis('off')

# SAVE FIGURE (PNG and PDF)
# NOTE(review): the label is built from `meta`, not from the grouping used for D -- confirm.
figure_label = '_{}Distribution_PerCellType_{}_PathwayMerged_retainN_noaxis'.format(meta,subset_type)
figure_label = figure_label.replace('.','_')
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
for ext in ('.png', '.pdf'):
    plt.savefig(fn + ext, bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# Display the per-patient count table.
tissue_cluster_sizes

In [None]:
# Stacked bar chart of row-normalised counts (each row of D divided by its row sum)
D = tissue_cluster_sizes
ax = plt.figure(figsize = (3,1)).gca()
row_fractions = D.div(D.sum(axis=1),axis=0)
row_fractions.plot.bar(stacked=True, color=metacell_colors, ax = ax, width = 0.85)
ax.legend_.remove()
plt.axis('off')

# SAVE FIGURE (PNG and PDF)
figure_label = '_{}Distribution_PerCellType_{}_PathwayMerged_fraction'.format(meta,subset_type)
figure_label = figure_label.replace('.','_')
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
for ext in ('.png', '.pdf'):
    plt.savefig(fn + ext, bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# Display D with every row divided by its row sum.
D.div(D.sum(axis=1),axis=0)

### REPRESENTATION OF EACH ARCHETYPE WITHIN EACH PATIENT

In [None]:
# log10 of the row-normalised table; any zero entry in D becomes -inf.
row_totals = D.sum(axis=1)
patient_fraction = np.log10(D.div(row_totals, axis=0))

In [None]:
# Display the log10 fraction table.
patient_fraction

In [None]:
# Row counts of INDF_<subset_type> per (Patient, meta) index-level pair; missing pairs become 0.
# The frame is looked up by name instead of exec-ing a formatted assignment string.
tissue_cluster_sizes = (globals()['INDF_{}'.format(subset_type)]
                        .groupby(level=['Patient', meta], axis=0).size().unstack().fillna(0))

In [None]:
# Display the per-patient count table.
tissue_cluster_sizes

In [None]:
# Display the count table with every row divided by its row sum.
(tissue_cluster_sizes.div(tissue_cluster_sizes.sum(axis=1),axis =0))

In [None]:
# Display sample_size_array.
# NOTE(review): within this part of the notebook the name is only assigned in a later
# cell, so on a top-to-bottom run this may raise NameError -- confirm execution order.
sample_size_array

In [None]:
# Grouping column and palette for this figure.
# Alternative: meta = 'Archetype_Soft_Cluster' with FLATUI_PLOT = FLATUI_ARCHETYPES
meta = 'Nearest_Archetype'
FLATUI_PLOT = FLATUI_NEAREST_ARCHETYPES

# CONVERT HEX TO RGB, SCALED TO [0, 1]
ind = META[meta].values
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)

# Palette: pick evenly spaced colours, one per unique group value; the last index is
# clamped so that it stays inside the colour table.
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))

# Row counts per (Patient, meta) and the matching per-patient fractions.
tissue_cluster_sizes = (globals()['INDF_{}'.format(subset_type)]
                        .groupby(level=['Patient', meta], axis=0).size().unstack().fillna(0))
tissue_cluster_fraction = (tissue_cluster_sizes.div(tissue_cluster_sizes.sum(axis=1),axis =0))

# Total rows per patient (grouped by groupby_type); not used by the plot below.
sample_size_array = META.groupby(level=['Patient', groupby_type], axis=0).size().unstack().fillna(0).sum(axis=1).values

# Long-format table: one row per (group, patient). Built with a single concat instead of
# DataFrame.append in a loop (append copies the frame every time and was removed in pandas 2.0).
n_patients = tissue_cluster_fraction.shape[0]
violin_data = pd.concat([
    pd.DataFrame({meta: [class_name]*n_patients,
                  'Fraction': tissue_cluster_fraction[class_name].values,
                  'Sample Size': tissue_cluster_sizes[class_name].values,
                  'Patient': tissue_cluster_fraction[class_name].index})
    for class_name in tissue_cluster_fraction.columns])

# Groups ordered by median fraction, largest first. Only 'Fraction' is aggregated so the
# non-numeric 'Patient' column cannot break the median.
cluster_rank = violin_data.groupby(meta)['Fraction'].median().sort_values(ascending = False).index

fig = plt.figure(figsize = (3,5))
ax = plt.gca()

# Box plot per group
g = sns.boxplot(x=meta, y="Fraction" ,data=violin_data,
                linewidth = 1, ax = ax, order = cluster_rank, notch = False,fliersize =1,
                palette = lut);

# Add in points to show each observation
sns.swarmplot(x=meta, y="Fraction" ,data=violin_data,
              size=4, color="k", linewidth=0, order = cluster_rank, marker = 'o', ax = ax)

# Labels are set after both seaborn calls: seaborn's categorical plots appear to re-label
# the axes from the column names, which would replace a label set earlier -- confirm.
# fontsize is now an argument of set_ylabel (it used to be passed to str.format and ignored).
g.set_ylabel("{} Fraction per Patient".format(meta), fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
sns.despine()
plt.ylim((0,1))

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(True)
ax.spines['left'].set_visible(True)
ax.spines['bottom'].set_linewidth(0.75)
ax.spines['left'].set_linewidth(0.75)


# SAVE FIGURE
figure_label = 'fraction_{}_per_patient_ranked_size'.format(meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label

# Create the output directory on first use.
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# Display the per-patient fraction table.
tissue_cluster_fraction

In [None]:
# Write violin_data to CSV (hard-coded absolute path).
violin_data.to_csv('/workdir/uvmel_project/data/Archetype_per_Patient_Fractions_Ranked_Boxplot_Data.csv')

### MULTI-SCALE DIFFUSION DISTANCE BETWEEN ARCHETYPES

In [None]:
# Preview the first rows of the cell-by-cell diffusion distance table.
diffusion_pairwise_distances.head()

In [None]:
# Archetype x archetype distance table: for every archetype label (the first unique label
# is skipped) take the row(s) carrying that label, restricted to the columns of the
# archetype cells listed in cellid_archetype.
master_groupby = 'Archetype_Nearest_Cell'
arch_labels = diffusion_pairwise_distances.index.get_level_values(master_groupby)
archetype_columns = cellid_archetype.astype(int)

# One concat instead of DataFrame.append in a loop (append was removed in pandas 2.0).
distance_to_other_archetypes = pd.concat(
    [diffusion_pairwise_distances.loc[arch_labels == arch_ind, archetype_columns]
     for arch_ind in np.unique(arch_labels)[1:]])

# Label both axes with the archetype label.
# NOTE(review): reusing the row labels for the columns assumes exactly one row per
# archetype and that cellid_archetype is ordered by archetype label -- confirm.
archetype_order = distance_to_other_archetypes.index.get_level_values(master_groupby)
distance_to_other_archetypes = pd.DataFrame(index = archetype_order,
                                            data = distance_to_other_archetypes.values,
                                            columns = archetype_order)

In [None]:
# Clustered heatmap of the archetype-to-archetype distance table
yticks = distance_to_other_archetypes.index
xticks = distance_to_other_archetypes.columns

# HIERARCHICAL LINKAGE OF ROWS, THEN COLUMNS (average linkage, cosine metric)
method = 'average'
metric = 'cosine'
row_linkage = deepcopy(hc.linkage(distance_to_other_archetypes, method=method, metric = metric))
linkage = hc.linkage(distance_to_other_archetypes.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

# REORDER THE TABLE BY DENDROGRAM LEAF ORDER
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = distance_to_other_archetypes.iloc[r1,cl]

# DRAW THE REORDERED MATRIX
fig = plt.figure(figsize=(5,5))
plt.rcParams["axes.grid"] = False

axmatrix = fig.add_axes([0.1,0.1,0.6,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r)

# Column labels along the bottom
axmatrix.xaxis.set_ticks_position('bottom')
labels = list(mat.columns)
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 14)

# Row labels on the left
axmatrix.yaxis.set_ticks_position('left')
labels = list(mat.index)
ytick = plt.yticks(range(len(labels)), labels, rotation = 0, fontsize = 14)

# ADD COLORBAR
axcolor = fig.add_axes([0.75,0.1,0.01,0.1])
cbar = plt.colorbar(im, cax=axcolor)

# SAVE FIGURE (PNG and PDF), creating the output directory on first use
figure_label = 'diffusion_distance_ongraph_btw_archetypes'
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label

d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

for ext in ('.png', '.pdf'):
    fig.savefig(fn + ext, bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# Row labels sorted by increasing value in column 4.
distance_to_other_archetypes[4].sort_values().index

In [None]:
# Row labels sorted by increasing value in column 5.
distance_to_other_archetypes[5].sort_values().index

In [None]:
# Row labels sorted by increasing value in column 3.
distance_to_other_archetypes[3].sort_values().index

### RANKED BOXPLOTS

In [None]:
# RANK BY DIFFUSION DISTANCE
# Row labels sorted by increasing value in column 4, with label 0 appended at the end.
ranked_from_4 = distance_to_other_archetypes[4].sort_values().index
order = ranked_from_4.append(pd.Index([0]))

In [None]:
# Display the plotting order.
order

In [None]:
# Display the column labels of genesets.
genesets.columns

In [None]:
# Grouping column used by the plots that follow
meta = 'Archetype_Soft_Cluster'

# Hex colour codes for the archetype palette; the first entry is for unassigned cells.
FLATUI_ARCHETYPES = [
    'E6E6FA',  # not assigned
    'FF66FF',  # pink
    '6600CC',  # purple
    '9ACD32',  # green
    '00FFFF',  # sky blue
    'FF8800',  # orange
    '0000FF',  # dark blue
    'FFD700',  # yellow
    '20B2AA',  # teal
    # unused alternates: '008000' (forest green), '8B008B'
]

# Palette picked up by the plotting cells (same list object)
FLATUI_PLOT = FLATUI_ARCHETYPES

In [None]:
# Genes plotted one at a time by the ranked-boxplot cell
plot_genes = [
    'TMEM173', 'BAP1', 'JARID2', 'RING1', 'MYC', 'PRAME', 'JUN',
    'CDH1', 'VIM', 'TYR', 'TYRP1', 'MLANA', 'PMEL', 'DCT',
    'ASCL1', 'SOX10', 'CHD7', 'FOXD3', 'PAX3', 'NUMB', 'SNAI1',
    'SNAI2', 'NEUROG2', 'HLA-A', 'HLA-B', 'HLA-C', 'B2M', 'HLA-G',
    'ABCG1', 'PPARG', 'DDIT3', 'NUPR1', 'RAB3B', 'IGFBP4', 'LRRC8C',
    'TCP11L2', 'MAFK', 'NRG1', 'F2R', 'KRT19', 'CTGF', 'ZFC3H1',
]

In [None]:
# INDIVIDUAL GENE EXPRESSION
# For every entry of plot_genes: a boxplot of the per-cell score split by `meta` group
# (left), stacked histograms of the groups listed in `order` (top right) and an inverted
# histogram of group 0 (bottom right). One PNG is written per gene.
meta = 'Archetype_Soft_Cluster'
datatype = 'INDF_{}'.format(subset_type)
QUERY = globals()[datatype]
META = globals()['METADATA_{}'.format(subset_type)]
expr_col = 'Z-normalized Expression'


def _format_spines(axis, visible):
    """Show only the spines named in `visible`, drawn with linewidth 0.75."""
    for side in ('left', 'right', 'top', 'bottom'):
        axis.spines[side].set_visible(side in visible)
    for side in visible:
        axis.spines[side].set_linewidth(0.75)


# Meta-Grouping: RGB colours scaled to [0, 1]
ind = META[meta].values
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)

# Evenly spaced colour per unique group value; the last index is clamped into range.
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))

for title in plot_genes: # PLOT INDIVIDUAL GENES
#for title in genesets:  # PLOT MEDIAN EXPRESSION OF PATHWAY

    # PLOT INDIVIDUAL GENES
    genes = [title]

    # PLOT MEDIAN EXPRESSION OF SIGNATURE
    #genes = genesets[title].values

    genes = np.unique([x for x in genes if str(x) != 'NAN'])
    detected_genes = list(set(genes).intersection(set(QUERY.columns)))

    # A gene missing from QUERY has nothing to plot; skip it instead of letting the
    # all-NaN score abort the remaining genes.
    if not detected_genes:
        print('Skipping {}: not found in {}'.format(title, datatype))
        continue

    vals = QUERY[detected_genes]
    SCORE = np.nanmedian(vals,axis=1)

    # Long-format table for plotting, built in one step (one row per cell).
    violin_data = pd.DataFrame({'gene': title, expr_col: SCORE, meta: META[meta].values})

    # Scores of the cells whose group is listed in `order`; used for axis limits and bins.
    in_order = violin_data[meta].isin(order)
    vs = violin_data.loc[in_order, expr_col]

    # DENSITY PLOT
    fig = plt.figure(figsize = (10,3))
    gs1 = gridspec.GridSpec(2, 2)
    gs1.update(wspace=0.2, hspace=0) # set the spacing between axes.

    # RANKED BOXPLOTS (left column, both rows)
    ax_barplot = plt.subplot(gs1[:, 0])
    g = sns.boxplot(x="gene", y=expr_col, hue=meta,data=violin_data, palette=palette,
                    notch = True, fliersize = 2, showmeans=False,linewidth = 1, hue_order = order,
                    showfliers=True, whis = 0.95)
    g.set_ylabel("{} {}".format(genes[0],datatype),fontsize=10)
    g.set_xlabel(" ",fontsize=10,rotation = 90)
    g.tick_params(labelsize=10)
    ax_barplot.legend_.remove()
    ax_barplot.set(ylim=(vs.min()*0.9, vs.max()*0.95))
    _format_spines(ax_barplot, ('bottom', 'left'))

    # HISTOGRAMS OF THE GROUPS IN `order` (top right)
    ax1 = plt.subplot(gs1[0, 1])
    _format_spines(ax1, ('bottom', 'left'))
    bins = np.linspace(vs.min(), vs.max()*0.90, 100)
    for archetype_ind in order:
        vals = violin_data[violin_data[meta]==archetype_ind][expr_col]
        plt.hist(vals,bins,density = False, alpha=0.75,color = palette[archetype_ind])
    plt.tick_params(labelsize=10)
    plt.xticks([])

    # HISTOGRAM OF GROUP 0, drawn upside down (bottom right)
    archetype_ind = 0
    vals = violin_data[violin_data[meta]==archetype_ind][expr_col]
    ax2 = plt.subplot(gs1[1, 1])
    plt.hist(vals,bins,density = False, alpha=1,color = palette[archetype_ind])
    ax2.set_ylim(ax2.get_ylim()[::-1])
    _format_spines(ax2, ('top', 'left'))

    plt.xticks(rotation=70)
    plt.tick_params(labelsize=10)

    # SAVE FIGURE
    # NOTE(review): plot_type is not assigned in this cell; it must already exist.
    figure_label = '_{}_{}PLOT_{}_{}_{}'.format(subset_type, plot_type,datatype,meta,title)
    fn = FIG_output_stem + 'BOXPLOTS_RANKED_MEDIAN_GEP2_NODENSITY/{}'.format(meta) + FN.replace(".h5", "") + \
    figure_label
    fn = fn.replace(' ','_')

    # CREATE OUTPUT DIRECTORY IF IT DOES NOT EXIST
    d = os.path.dirname(fn)
    if not os.path.exists(d):
        os.makedirs(d)

    plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
    #plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
    print(fn)

    # Close Figure
    plt.close(fig)

### NK CELL HETEROGENEITY

In [None]:
# Phenograph Cluster 19 = NK/NKT
# Load the all-cell table, count rows per (Patient, Phenograph class) and keep the
# per-patient fraction that falls in class 19.
INDF_ALL = h5_data.load('/INDF_ALL')
groupby_type = 'Phenograph_Class'
tissue_cluster_sizes = INDF_ALL.groupby(level=['Patient', groupby_type], axis=0).size().unstack().fillna(0)
patient_totals = tissue_cluster_sizes.sum(axis=1)
fraction_cell_detected = tissue_cluster_sizes.div(patient_totals, axis=0)
NK_CELL_FRACTION = fraction_cell_detected[19]

In [None]:
# Display the per-patient class-19 fraction.
NK_CELL_FRACTION

In [None]:
# Single-column table of the NK fraction, used as the heatmap matrix.
mat2 = NK_CELL_FRACTION.to_frame(name='NK FRACTION')

In [None]:
# ONE-COLUMN HEATMAP OF mat2, COLOUR SCALE FIXED TO [0, 0.2]
fig = plt.figure(figsize=(1,5))
plt.rcParams["axes.grid"] = False

# MATRIX
axmatrix = fig.add_axes([0.1,0.1,0.6,0.6])
im = axmatrix.matshow(mat2, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r, vmin = 0, vmax = 0.2)

# Row labels on the left
axmatrix.yaxis.set_ticks_position('left')
labels = list(mat2.index)
ytick = plt.yticks(range(len(labels)), labels, rotation = 0, fontsize = 14)

# COLORBAR
axcolor = fig.add_axes([0.75,0.1,0.1,0.1])
cbar = plt.colorbar(im, cax=axcolor)

# SAVE FIGURE (PNG and PDF), creating the output directory on first use
figure_label = 'NK_NKT_FRACTION'
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label

d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

for ext in ('.png', '.pdf'):
    fig.savefig(fn + ext, bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# Fraction of each tumor cell archetype (soft cluster) per patient
groupby_type = 'Archetype_Soft_Cluster'
tissue_cluster_sizes = QUERY.groupby(level=['Patient', groupby_type], axis=0).size().unstack().fillna(0)
patient_totals = tissue_cluster_sizes.sum(axis=1)
fraction_cell_detected = tissue_cluster_sizes.div(patient_totals, axis=0)
fraction_cell_detected

In [None]:
# Regress log10 archetype fraction on log10 NK fraction (first five entries only) for
# every group in `order`; only fits with p < 0.05 are drawn.
plt.figure(figsize = (5,5))
ax = plt.gca()
for side, shown in (('right', False), ('top', False), ('bottom', True), ('left', True)):
    ax.spines[side].set_visible(shown)

# 0.001 is added before the log so that zero fractions stay finite
x = np.log10(NK_CELL_FRACTION+0.001)[:5]

for arch_ind in order:
    y = np.log10(fraction_cell_detected[arch_ind]+0.001)[:5]

    slope, intercept, r, p, std_err = stats.linregress(x,y)
    print('Archetype{} vs NK/NKT Abundance, R: {}, p: {}'.format(arch_ind,r,p))

    # Fitted line sampled at 20 points across the x range
    xi = np.linspace(x.min(),x.max(),20)
    yi = slope*xi+intercept

    if not p < 0.05:
        continue
    plt.scatter(x,y,c = palette[arch_ind])
    plt.plot(xi,yi,c = palette[arch_ind])

###### SAVE TO DATAFRAME

In [None]:
subset_type = 'TUMOR'

# WRITE EVERY <PREFIX>_<subset_type> FRAME TO H5 UNDER ITS OWN NAME
# The frames are looked up by name through globals() rather than through five
# hand-written exec strings.
for prefix in ('DF', 'NDF', 'INDF', 'METADATA', 'DIMENSIONS'):
    frame_name = '{}_{}'.format(prefix, subset_type)
    h5_data.save(globals()[frame_name], frame_name)
h5_data.ls() # list contents of directory

In [None]:
# Preview the first rows of METADATA_TUMOR.
METADATA_TUMOR.head()

## INTRA-PATIENT HETEROGENEITY

### ARCHETYPE DISTRIBUTION PER PATIENT

In [None]:
# Display the currently selected subset name.
subset_type

In [None]:
groupby_type = 'Nearest_Archetype' # specify grouping
cm_flatui = FLATUI_ARCHETYPES[1:] # specify colormap for grouping (first colour dropped)

# QUERY and META both refer to the METADATA_<subset_type> frame (looked up by name).
datatype = 'METADATA_{}'.format(subset_type)
QUERY = globals()[datatype]
META = globals()[datatype]

# Row counts per (Patient, group) index-level pair; missing combinations become 0.
tissue_cluster_sizes = META.groupby(level=['Patient', groupby_type], axis=0).size().unstack().fillna(0)

# PRINT NUMBER OF CELLS PER GROUP
print(tissue_cluster_sizes.sum(axis=0))
print('\n')
print(tissue_cluster_sizes.sum(axis=0).sum())
# PRINT NUMBER OF CELLS PER PATIENT
print(tissue_cluster_sizes.sum(axis=1))

# CONVERT HEX TO RGB, SCALED TO [0, 1]
colors = np.zeros((len(cm_flatui),3))
for ii,hexcolor in enumerate(cm_flatui):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)

# Back to 0-255 integers for rgb2hex. round() before int() because x/255*255 is not
# guaranteed to be exact in floating point, and a bare int() would then truncate a
# channel to one less than its original value.
metacell_colors = [rgb2hex(int(round(color[0]*255)), int(round(color[1]*255)), int(round(color[2]*255)))
                   for color in colors]

In [None]:
# Display the per-patient count table.
tissue_cluster_sizes

In [None]:
# Display the count table with every row divided by its row sum.
tissue_cluster_sizes.div(tissue_cluster_sizes.sum(axis=1),axis=0)

In [None]:
# Stacked bar chart of row-normalised counts (each row of D divided by its row sum)
D = tissue_cluster_sizes
ax = plt.figure(figsize = (3,1)).gca()
row_fractions = D.div(D.sum(axis=1),axis=0)
row_fractions.plot.bar(stacked=True, color=metacell_colors, ax = ax, width = 0.85)
ax.legend_.remove()
plt.axis('off')

# SAVE FIGURE (PNG and PDF)
# NOTE(review): the label is built from `meta`, not from groupby_type -- confirm.
figure_label = '_{}Distribution_PerCellType_{}_PathwayMerged_fraction'.format(meta,subset_type)
figure_label = figure_label.replace('.','_')
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
for ext in ('.png', '.pdf'):
    plt.savefig(fn + ext, bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# Display the list of colours produced by rgb2hex.
metacell_colors

### RANKED BOXPLOT PER PATIENT

In [None]:
# SIGNATURE SCORE PER META GROUP FOR ONE PATIENT
# For every column of `genesets`, compute the per-cell median expression of
# the detected signature genes, then save a figure with a grouped boxplot
# (left) and per-group histograms (right).
meta = 'Archetype_Soft_Cluster'#'Nearest_Archetype'
FLATUI_PLOT = FLATUI_ARCHETYPES#FLATUI_ARCHETYPES[1:]

#meta = 'Nearest_Archetype'
#FLATUI_PLOT = FLATUI_ARCHETYPES[1:]

patient_ind = 'UM02'
subset_key = 'Patient'
datatype = 'INDF_{}'.format(subset_type)
# Look the frames up by name instead of exec()-ing assignment strings.
QUERY = globals()[datatype]
META = globals()['METADATA_{}'.format(subset_type)]

# Meta-Grouping: RGB colours (scaled to 0-1), one row per FLATUI_PLOT entry
ind = META[meta].values
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)

# Spread the groups evenly over the colour table. The last linspace point is
# one past the end of the table, so it is pulled back onto the final colour.
meta_groups = np.unique(ind)
n_colors = colors.shape[0]
cix = np.linspace(0, n_colors, len(meta_groups)).astype(int)
if cix[-1] == n_colors:
    cix[-1] = n_colors - 1
palette = dict(zip(meta_groups, colors[cix,:]))

# SUBSET PATIENT (`ix` is the position of the patient level in the row index)
ix = [level for level, name in enumerate(QUERY.index.names) if name==subset_key][0]
QUERY = QUERY.loc[QUERY.index.map(lambda x: x[ix] in [patient_ind])]
META = META.loc[META.index.map(lambda x: x[ix] in [patient_ind])]

ind = META[meta].values

# Keep the global plotting order, restricted to groups present in this patient
new_order = [o for o in order if o in np.unique(META[meta])]

score_col = 'Z-normalized Expression'


def _style_spines(axis, visible, hidden, linewidth=0.75):
    """Show the `visible` spines at `linewidth` and hide the `hidden` ones."""
    for side in hidden:
        axis.spines[side].set_visible(False)
    for side in visible:
        axis.spines[side].set_visible(True)
        axis.spines[side].set_linewidth(linewidth)


#for title in plot_genes: # PLOT INDIVIDUAL GENES
for title in genesets: # PLOT MEDIAN EXPRESSION OF PATHWAY

    # PLOT INDIVIDUAL GENES
    #genes = [title]

    # PLOT MEDIAN EXPRESSION OF SIGNATURE
    genes = genesets[title].values

    # Missing entries are spelled 'NAN' here (upper-case)
    genes = np.unique([x for x in genes if str(x) != 'NAN'])
    detected_genes = list(set(genes).intersection(set(QUERY.columns)))
    if not detected_genes:
        # Nothing to score: skip instead of failing on an all-NaN plot and
        # aborting the remaining signatures.
        print('Skipping {}: no signature genes found in {}'.format(title, datatype))
        continue
    vals = QUERY[detected_genes]
    SCORE = np.nanmedian(vals,axis=1)

    # Format data structure for the plots: one row per cell
    violin_data = pd.DataFrame({'gene': title,
                                score_col: SCORE,
                                meta: META[meta].values})

    # Cells whose group is part of the global plotting order
    bool_array = [val in order for val in violin_data[meta]]

    fig = plt.figure(figsize = (10,3))
    gs1 = gridspec.GridSpec(2, 2)
    gs1.update(wspace=0.2, hspace=0) # set the spacing between axes.

    # RANKED BOXPLOT (left column, both rows)
    ax_barplot = plt.subplot(gs1[:, 0])
    g = sns.boxplot(x="gene", y=score_col, hue=meta, data=violin_data, palette=palette,
                    notch = False, fliersize = 2, showmeans=False,linewidth = 1,
                    hue_order = new_order,
                    showfliers=True, whis = 0.95)
    # Label with the signature name: `genes[0]` is only the alphabetically
    # first gene of the signature, which mislabels a pathway median.
    g.set_ylabel("{} {}".format(title,datatype),fontsize=10)
    g.set_xlabel(" ",fontsize=10,rotation = 90)
    g.tick_params(labelsize=10)
    ax_barplot.legend_.remove()
    vs = violin_data.loc[bool_array, score_col]
    ax_barplot.set(ylim=(vs.min()*0.9, vs.max()*0.95))
    _style_spines(ax_barplot, visible=('bottom', 'left'), hidden=('right', 'top'))

    # HISTOGRAMS (top right): one overlaid histogram per group in `new_order`
    ax1 = plt.subplot(gs1[0, 1])
    _style_spines(ax1, visible=('bottom', 'left'), hidden=('right', 'top'))
    bins = np.linspace(vs.min(), vs.max()*0.90, 100)
    for archetype_ind in new_order:
        vals = violin_data[violin_data[meta]==archetype_ind][score_col]
        plt.hist(vals,bins,density = False, alpha=0.75,color = palette[archetype_ind])
        plt.tick_params(labelsize=10)
        plt.xticks([])

    # MIRRORED HISTOGRAM (bottom right) for group 0, drawn with a reversed
    # y-axis so it hangs below the top panel
    if (violin_data[meta]==0).sum()>0:
        archetype_ind = 0
        vals = violin_data[violin_data[meta]==archetype_ind][score_col]
        ax2 = plt.subplot(gs1[1, 1])
        plt.hist(vals,bins,density =  False, alpha=1,color = palette[archetype_ind])
        ax2.set_ylim(ax1.get_ylim()[::-1])
        _style_spines(ax2, visible=('top', 'left'), hidden=('right', 'bottom'))

    plt.xticks(rotation=70)
    plt.tick_params(labelsize=10)

    # SAVE FIGURE
    figure_label = '_{}_{}PLOT_{}_{}_{}_Patient{}'.format(subset_type, plot_type,datatype,meta,title,patient_ind)
    fn = FIG_output_stem + 'BOXPLOTS_RANKED_MEDIAN_GEP2_NODENSITY_PATIENT{}/{}'.format(patient_ind,meta) + \
         FN.replace(".h5", "") + figure_label
    fn = fn.replace(' ','_')

    # CREATE OUTPUT DIRECTORY IF IT DOES NOT EXIST
    d = os.path.dirname(fn)
    if not os.path.exists(d):
        os.makedirs(d)

    plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
    #plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
    print(fn)

    # Close Figure
    plt.close(fig)

In [None]:
# Number of rows of the last `violin_data` frame whose `meta` value equals 0
# (the condition that decides whether the mirrored histogram is drawn).
(violin_data[meta]==0).sum()

### RECREATING MIXTURE MODEL

#### MIXTURE MODEL FOR TCGA CLASSIFICATION

In [None]:
# Labels of the four TCGA classes. Later cells use this list to select the
# per-class score columns and to fix the class order.
TCGA_CLASS_LIST = ['TCGA1', 'TCGA2', 'TCGA3', 'TCGA4']

In [None]:
# Binary marker table for the TCGA subtypes: rows are signature genes, columns
# are subtypes, and a 1 marks membership of a gene in a subtype signature.
# The gene list is laid out in four consecutive runs of 15 genes, one run per
# subtype, in column order.

_tcga_signature_size = 15
_tcga_subtypes = ['TCGA1', 'TCGA2', 'TCGA3', 'TCGA4']
_tcga_signature_genes = [
    'PCDH20', 'CNTN3', 'CCDC68', 'ZNF204P', 'STXBP5L', 'PDE3A', 'ZNF883', 'CHL1', 'SLC25A27',
    'ZNF702P', 'ZNF667', 'ZNF391', 'KYNU', 'MPPED2', 'PCDHA13', 'PYY', 'CRYBB2', 'EYA2',
    'ERVFRDE1', 'MYO7B', 'LOC643719', 'C20orf26', 'DLL3', 'RHCG', 'COX6A2', 'SOSTDC1',
    'TMEM151A', 'FXYD1', 'LOC441869', 'LOC100188947', 'SSX5', 'C1orf116', 'GSG1L', 'VTN',
    'COL9A3', 'AMN', 'DYSF', 'NECAB2', 'CDH4', 'ADAM11', 'CAMK1G', 'NXPH4', 'NPAS1', 'SUSD2',
    'BAI1', 'CXCL13', 'RIMS2', 'KCNV2', 'GAD1', 'IDO1', 'GBP1', 'UBD', 'JAKMIP1',
    'SH2D1A', 'CXCL9', 'COL22A1', 'GBP5', 'TM7SF4', 'LOC96610', 'TIGIT',
]

# Gene at position p belongs to subtype number p // 15.
TCGA_markerfile_df = pd.DataFrame(
    {
        subtype: [1 if position // _tcga_signature_size == subtype_no else 0
                  for position in range(len(_tcga_signature_genes))]
        for subtype_no, subtype in enumerate(_tcga_subtypes)
    },
    columns=_tcga_subtypes,
)
TCGA_markerfile_df.index = _tcga_signature_genes

In [None]:
# WORKING WITH TUMOR SUBSET:
subset_type = 'TUMOR'

# WORKING WITH IMPUTED NORMALIZED DATA:
datatype = 'INDF_{}'.format(subset_type)
# Look the frames up by name instead of exec()-ing assignment strings.
QUERY = globals()[datatype]
META = globals()['METADATA_{}'.format(subset_type)]

# Per-cell mean expression of the detected signature genes of each TCGA subtype.
# One loop replaces four copy-pasted stanzas; the names gene1..gene4 and
# vals1..vals4 are still defined afterwards for any cell that reads them.
tcga_subtypes = ['TCGA1', 'TCGA2', 'TCGA3', 'TCGA4']
query_columns = set(QUERY.columns)
tcga_signature_means = {}
for tcga_subtype in tcga_subtypes:
    signature_genes = TCGA_markerfile_df.loc[TCGA_markerfile_df[tcga_subtype] == 1].index.values
    signature_genes = [x for x in signature_genes if str(x) != 'nan']
    # Sorted so the column order (and the float summation order) is the same
    # on every run; a bare set has no stable order.
    detected_genes = sorted(set(signature_genes).intersection(query_columns))
    tcga_signature_means[tcga_subtype] = np.nanmean(QUERY[detected_genes], axis=1)

gene1, gene2, gene3, gene4 = tcga_subtypes
vals1, vals2, vals3, vals4 = [tcga_signature_means[s] for s in tcga_subtypes]

# One row per cell (same index as QUERY), one column per subtype
X = pd.DataFrame(data = tcga_signature_means, index = QUERY.index, columns = tcga_subtypes)

In [None]:
# Preview the first rows of the per-cell TCGA signature score table.
X.head()

###### RUN BGMM MODEL

In [None]:
# The four signature-score columns are the only model inputs; take them once,
# before the embedding and assignment columns are added to X.
tcga_features = X[['TCGA1','TCGA2','TCGA3','TCGA4']]

# Fit a 4-component Bayesian Gaussian mixture and label every cell with its
# most probable component.
bgmm = BayesianGaussianMixture(n_components=4, covariance_type='spherical', random_state=1)
bgmm.fit(tcga_features)
clusters = bgmm.predict(tcga_features)

# 2-D t-SNE embedding of the same features, used only for visualisation.
X_embedded = manifold.TSNE(n_components=2, random_state=34).fit_transform(tcga_features)
X['tsne1'] = X_embedded[:, 0]
X['tsne2'] = X_embedded[:, 1]

X['Assignment'] = clusters

In [None]:
# Make Class Assignments
# Greedy one-to-one matching of mixture components to TCGA classes: repeatedly
# take the largest remaining (component, class) mean score, record the pair,
# then retire that component and that class.
cluster_labels = np.unique(X['Assignment'])
mean_array = np.array([[X[X['Assignment']==ii][x].mean() for x in TCGA_CLASS_LIST]
                       for ii in cluster_labels], dtype=float)

lut = {}
for _ in range(min(mean_array.shape)):
    max_ind = np.unravel_index(np.argmax(mean_array, axis=None), mean_array.shape)
    # Key by the component label, not by its row position: the two differ
    # whenever a component received no cells and is absent from cluster_labels.
    lut[cluster_labels[max_ind[0]]] = TCGA_CLASS_LIST[max_ind[1]]
    # Retire with -inf rather than 0, so negative mean scores cannot cause an
    # already-used row or column to be selected again.
    mean_array[:,max_ind[1]] = -np.inf
    mean_array[max_ind[0],:] = -np.inf

# MAP ASSIGNMENTS
X['Assignment'] = X['Assignment'].map(lut)

sns.set(font_scale=2.5)
sns.set_style("white")
# NOTE(review): `size` is the older seaborn name for the facet height;
# confirm against the seaborn version pinned for this notebook.
g = sns.pairplot(X[['tsne1','tsne2','Assignment']],
                 hue = 'Assignment',
                 palette={"TCGA1": "#0000FF",
                          "TCGA2": "#A4D3FC",
                          "TCGA3": "#FF7DC2",
                          "TCGA4": "#FF0000",},
            diag_kind = 'kde', aspect = 1, size = 10);


# SAVE FIGURE
# The label has no placeholders, so the former `.format(meta, subset_type,
# method, metric)` call only added a NameError risk and was removed.
figure_label = 'MSK_BayesianGMM_TCGA'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

#g.savefig(fn + '.png', dpi=400, transparent=True)
#g.savefig(fn + '.pdf', dpi=400)
print(fn)

#### INTRATUMOR TCGA HETEROGENEITY

In [None]:
# Rescale each cell's four TCGA signature scores by their row sum, so every
# row of X_norm adds up to 1.
tcga_scores = X[TCGA_CLASS_LIST]
X_norm = tcga_scores.div(tcga_scores.sum(axis=1), axis=0)

###### TCGA DISTRIBUTION PER PATIENT

In [None]:
# Display the tumor-cell metadata table.
METADATA_TUMOR

In [None]:
# Display the tumor-cell metadata table (same output as the previous cell).
METADATA_TUMOR

In [None]:
# TCGA CLASS FRACTIONS PER PATIENT FROM REPEATED EQUAL-SIZE RESAMPLING
# Each repetition draws `n` cells with replacement from every patient and
# records the fraction of each TCGA class among the drawn cells.
n = 500
reps = np.arange(20)

boxplot_patient_list = []
boxplot_fract_list = []
boxplot_class_list = []
master_groupby = 'Patient'

# Select cells by level name; the previous version depended on an `ix`
# variable left behind by whichever earlier cell ran last.
patient_of_cell = METADATA_TUMOR.index.get_level_values(master_groupby)
assignment_by_patient = {patient: METADATA_TUMOR.loc[patient_of_cell == patient, 'TCGA_Assignment']
                         for patient in patient_of_cell.unique()}

for rep in reps:
    for patient in patient_of_cell.unique():
        # Only this patient's draw is used, so only this patient is resampled
        # (previously every patient was resampled here and then discarded).
        draw = assignment_by_patient[patient].sample(n=n, replace=True)

        # Reindex to the full class list so a class missing from the draw
        # counts as 0 instead of raising KeyError.
        counts = draw.value_counts().reindex(TCGA_CLASS_LIST, fill_value=0)
        tmp = (counts / counts.sum()).tolist()

        boxplot_patient_list.extend([patient] * len(TCGA_CLASS_LIST))
        boxplot_fract_list.extend(tmp)
        boxplot_class_list.extend(TCGA_CLASS_LIST)

boxplot_data = pd.DataFrame()
boxplot_data['Patient'] = ['MSK_' + x for x in boxplot_patient_list]
boxplot_data['Fraction'] = boxplot_fract_list
boxplot_data['Class'] = boxplot_class_list

# Floor at 1e-4 so zero fractions can be drawn on the log axis
boxplot_data.Fraction = [x if x > 0.0001 else 1e-4 for x in boxplot_data.Fraction]

# Set plot
fig = plt.figure(figsize = (7,4))
ax = plt.gca()

tcga_palette = {"TCGA1": "#0000FF",
                "TCGA2": "#A4D3FC",
                "TCGA3": "#FF7DC2",
                "TCGA4": "#FF0000",}

# Bars show the mean over repetitions with a 95% interval
sns.barplot(data=boxplot_data,x = 'Patient',
                y = 'Fraction', hue = 'Class', ci=95, capsize=0.15,
                palette=tcga_palette,
                linewidth=1, ax = ax, alpha=1,
                errcolor='black',
                errwidth=1)

# Individual repetitions overlaid as points
g = sns.stripplot(data=boxplot_data,x = 'Patient',
                y = 'Fraction', hue = 'Class',
                palette=tcga_palette,
                linewidth = 0.5, edgecolor='white',
                ax = ax, alpha=0.7,
                dodge=True)

fig.get_axes()[0].set_yscale('log')

plt.ylim((1e-4,1.15))
ax.set_ylabel("Cell Type Fraction",fontsize=10)
ax.set_xlabel(" ",fontsize=10,rotation = 90)
ax.tick_params(labelsize=10)
ax.set_xticklabels(boxplot_data['Patient'].unique(),rotation=90)
sns.despine()

ax.legend_.remove()
plt.tight_layout()

# SAVE FIGURE
figure_label = 'TCGA_Distribution_by_Patient_LOG'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/' + figure_label
plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# DOWNSAMPLED TCGA CLASSIFICATION
# Each repetition draws `n` cells with replacement from every patient, refits
# the mixture model on the pooled draw, maps components to TCGA classes and
# records the class fractions per patient.
n = 500
reps = np.arange(20)

master_groupby = 'Patient'
ix = [ind for ind,name in enumerate(X.index.names) if name==master_groupby][0]
patient_of_cell = X.index.get_level_values(master_groupby)
patients = np.unique(patient_of_cell)
fraction_TCGA1 = np.zeros((len(reps),len(patients)))
fraction_TCGA2 = np.zeros((len(reps),len(patients)))
fraction_TCGA3 = np.zeros((len(reps),len(patients)))
fraction_TCGA4 = np.zeros((len(reps),len(patients)))
fraction_by_class = [('TCGA1', fraction_TCGA1), ('TCGA2', fraction_TCGA2),
                     ('TCGA3', fraction_TCGA3), ('TCGA4', fraction_TCGA4)]

for rep in reps:
    # RANDOMLY SAMPLE SAME NUMBER OF CELLS PER PATIENT
    # (pd.concat instead of the removed DataFrame.append)
    SUBSET = pd.concat([X.loc[patient_of_cell == class_selection].sample(n=n,replace=True)
                        for class_selection in patients])

    bgmm = BayesianGaussianMixture(covariance_type='spherical', n_components=4, random_state=1)
    bgmm.fit(SUBSET[TCGA_CLASS_LIST])
    clusters = bgmm.predict(SUBSET[TCGA_CLASS_LIST])
    SUBSET['Assignment'] = clusters

    # Make Class Assignments: greedy one-to-one matching of components to classes
    cluster_labels = np.unique(SUBSET['Assignment'])
    mean_array = np.array([[SUBSET[SUBSET['Assignment']==ii][x].mean() for x in TCGA_CLASS_LIST]
                           for ii in cluster_labels], dtype=float)
    lut = {}
    for _ in range(min(mean_array.shape)):
        max_ind = np.unravel_index(np.argmax(mean_array, axis=None), mean_array.shape)
        # Key by component label (row position differs when a component is empty)
        lut[cluster_labels[max_ind[0]]] = TCGA_CLASS_LIST[max_ind[1]]
        # -inf, not 0, so negative means cannot re-select a used row/column
        mean_array[:,max_ind[1]] = -np.inf
        mean_array[max_ind[0],:] = -np.inf

    # MAP ASSIGNMENTS
    SUBSET['Assignment'] = SUBSET['Assignment'].map(lut)

    # Update multi-index: add the class assignment as an extra index level
    new_index = pd.MultiIndex.from_tuples(list(zip(SUBSET.index.get_level_values('Sample ID'),
                                                   SUBSET.index.get_level_values('Legend'),
                                                   SUBSET.index.get_level_values('Patient'),
                                                   SUBSET.index.get_level_values('Cell ID'),
                                                   SUBSET.index.get_level_values('Phenograph_Class'),
                                                   SUBSET['Assignment'])),
                                          names=['Sample ID','Legend', 'Patient','Cell ID',
                                                 'Phenograph_Class','Assignment'])
    SUBSET = pd.DataFrame(data = SUBSET.values, columns = SUBSET.columns, index = new_index)

    # TCGA ASSIGNMENT PER PATIENT (cell counts, patients x classes)
    meta = 'Assignment'
    tmp = SUBSET.groupby(level=['Patient', meta]).size().unstack().fillna(0)
    # A class that no cell was assigned to gets an explicit 0 column
    # instead of raising KeyError below.
    tmp = tmp.reindex(columns=TCGA_CLASS_LIST, fill_value=0)

    # SAVE PER DOWNSAMPLE
    cell_type_fraction = tmp.div(tmp.sum(axis=1),axis=0)
    for tcga_class, fraction in fraction_by_class:
        fraction[rep,:] = cell_type_fraction[tcga_class].values


# CONVERT TO A LONG PANDAS DATAFRAME: one row per (Rep, Patient, Class)
patient_columns = ['{}_{}'.format(master_groupby,ind) for ind in patients]
per_class_frames = []
for tcga_class, fraction in fraction_by_class:
    frame = pd.DataFrame(data = fraction, columns = patient_columns)
    frame = frame.stack().rename_axis(('Rep', 'Patient')).reset_index(name='Fraction')
    frame['Class'] = tcga_class
    per_class_frames.append(frame)

boxplot_data = pd.concat(per_class_frames)
boxplot_data['Patient'] = [x.replace('Patient','MSK') for x in boxplot_data.Patient]

# Floor at 1e-4 so zero fractions can be drawn on the log axis. The threshold
# now equals the floor; it used to be 0.001, which pushed small non-zero
# fractions down to 1e-4.
boxplot_data.Fraction = [x if x > 0.0001 else 1e-4 for x in boxplot_data.Fraction]


# BAR PLOT
fig = plt.figure(figsize = (10,4))
ax = plt.gca()

tcga_palette = {"TCGA1": "#0000FF",
                "TCGA2": "#A4D3FC",
                "TCGA3": "#FF7DC2",
                "TCGA4": "#FF0000",}

# Bars show the mean over repetitions with a 95% interval
sns.barplot(data=boxplot_data,x = 'Patient',
                y = 'Fraction', hue = 'Class', ci=95, capsize=.1,
                palette=tcga_palette,
                linewidth=1, ax = ax,
                alpha=1)

# Individual repetitions overlaid as points
g = sns.stripplot(data=boxplot_data,x = 'Patient',
                y = 'Fraction', hue = 'Class',
                palette=tcga_palette,
                linewidth = 0.5, edgecolor='white',
                ax = ax, alpha=0.7,
                dodge=True)

fig.get_axes()[0].set_yscale('log')

plt.ylim((1e-4,1.15))
g.set_ylabel("Cell Type Fraction",fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
g.set_xticklabels(ax.get_xticklabels(),rotation=90)
sns.despine()

ax.legend_.remove()
#plt.tight_layout()

# SAVE FIGURE
# NOTE(review): this is the same file name as the TCGA per-patient figure
# written by another cell, so the later run overwrites the earlier one.
figure_label = 'TCGA_Distribution_by_Patient_LOG'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/' + figure_label
plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

#### MIXTURE MODEL FOR GEP CLASSIFICATION

In [None]:
# Binary marker table for the two GEP classes (rows = signature genes,
# 1 = gene belongs to that class), written to the cellAssign marker file.

_gep_signatures = [
    ('GEP1', ['ID2', 'FXR1', 'EIF1B', 'LTA4H', 'MTUS1', 'SATB1', 'LMCD1', 'ROBO1']),
    ('GEP2', ['CDH1', 'ECM1', 'HTR2B', 'RAB31']),
]
_gep_classes = [gep_class for gep_class, _ in _gep_signatures]
_gep_genes = [gene for _, members in _gep_signatures for gene in members]

GEP_markerfile_df = pd.DataFrame(
    {gep_class: [1 if gene in members else 0 for gene in _gep_genes]
     for gep_class, members in _gep_signatures},
    columns=_gep_classes,
)
GEP_markerfile_df.index = _gep_genes

# Two-step write: first a header line holding only the class names, then the
# gene rows (index + values) appended without a header.
_gep_marker_path = DATA_PATH + 'cellAssign/GEP_markerfile.csv'
pd.DataFrame(columns=['GEP1','GEP2']).to_csv(_gep_marker_path, index=False)
GEP_markerfile_df.to_csv(_gep_marker_path, header=None, mode='a')
GEP_markerfile_df

In [None]:
subset_type = 'TUMOR'
# LOAD GENE LIST FROM CSV (one signature per column)
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets,header='infer')
# Upper-case every entry; missing cells become the string 'NAN'
genesets = genesets.apply(lambda x: x.astype(str).str.upper())
print(genesets.shape[1])  # number of signatures

# Using imputed, normalized data:
datatype = 'INDF_{}'.format(subset_type)
# Look the frames up by name instead of exec()-ing assignment strings.
QUERY = globals()[datatype]
META = globals()['METADATA_{}'.format(subset_type)]

In [None]:
# Get mean gene expression for signature genes for each GEP class.
# Missing entries are compared case-insensitively: when the gene sets have
# been upper-cased they read 'NAN', which the old `!= 'nan'` test let through.
gene1 = 'Castle 1'
signature_genes = genesets[gene1].values
signature_genes = [x for x in signature_genes if str(x).upper() != 'NAN']
detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
vals1 = (np.nanmean(QUERY[detected_genes],axis=1))

gene2 = 'Castle 2'
signature_genes = genesets[gene2].values
signature_genes = [x for x in signature_genes if str(x).upper() != 'NAN']
detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
vals2 = (np.nanmean(QUERY[detected_genes],axis=1))

# One row per cell (same index as QUERY), one score column per GEP class
X = pd.DataFrame(data = {gene1: vals1, gene2: vals2}, index = QUERY.index)

# Train BGMM model:
bgmm = BayesianGaussianMixture(covariance_type='diag', n_components=2, random_state=1)
bgmm.fit(X[['Castle 1','Castle 2']])

# Use trained BGMM to make predictions:
clusters = bgmm.predict(X[['Castle 1','Castle 2']])
X['Assignment'] = clusters

# Make Class Assignments: a component is named after whichever signature has
# the higher mean score among its cells (both components may get the same name).
lut ={}
for ii in np.unique(X['Assignment']):
    members = X[X['Assignment']==ii]
    if members['Castle 1'].mean()>members['Castle 2'].mean():
        lut[ii] = 'Castle 1'
    else:
        lut[ii] = 'Castle 2'

X['Assignment'] = X['Assignment'].map(lut)


# Plot results:
sns.set(font_scale=2.5)
sns.set_style("white")
# NOTE(review): `size` is the older seaborn name for the facet height;
# confirm against the seaborn version pinned for this notebook.
g = sns.pairplot(X[['Castle 1','Castle 2','Assignment']],
                 hue = 'Assignment',
                 palette={"Castle 1": "#0000FF", "Castle 2": "#FF0000", 'Mixed': '#000000'},
                 diag_kind = 'kde', aspect = 1, size = 10,
                 diag_kws={"shade": False})

g.set(xlabel='', ylabel='')

# SAVE FIGURE
# Set the label here: with this assignment commented out, `fn` was built from
# whatever `figure_label` an earlier cell had left behind.
figure_label = 'MSK_BayesianGMM'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

#g.savefig(fn + '.png', dpi=400, transparent=True)
#g.savefig(fn + '.pdf', dpi=400)
print(fn)

###### COLOR GEP PLOT BY TCGA CLASS

In [None]:
# Attach each cell's TCGA class (assigned by row index) and redraw the GEP
# score pairplot coloured by that class.
X['TCGA_Class'] = METADATA_TUMOR.TCGA_Assignment

sns.set(font_scale=2.5)
sns.set_style("white")
# NOTE(review): `size` is the older seaborn name for the facet height;
# confirm against the seaborn version pinned for this notebook.
g = sns.pairplot(X[['Castle 1','Castle 2','Assignment', 'TCGA_Class']],
                 hue = 'TCGA_Class',
                 palette={"TCGA1": "#0000FF",
                          "TCGA2": "#A4D3FC",
                          "TCGA3": "#FF7DC2",
                          "TCGA4": "#FF0000",},
            diag_kind = 'kde', aspect = 1, size = 10);


# SAVE FIGURE
# The label has no placeholders, so the former `.format(meta, subset_type,
# method, metric)` call only added a NameError risk and was removed.
figure_label = 'MSK_Mixture_Model_FULL_TCGA_colored'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

g.savefig(fn + '.png', dpi=400, transparent=True)
g.savefig(fn + '.pdf', dpi=400)
print(fn)

###### MOVE ON WITH INTRATUMOR HETEROGENEITY FOR GEP

In [None]:
# Second GEP assignment scatter plot:
# 'Castle 2' score on x, 'Castle 1' score on y; each class is drawn as a point
# cloud with a filled density estimate on top (blue = Castle 1, red = Castle 2).

plt.figure(figsize = (5,5))
# NOTE(review): despine() is called before anything has been drawn on the
# figure, so it presumably has no visible effect here - confirm whether it was
# meant to run after plotting.
sns.despine()
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
ax = plt.gca()

for gep_class, point_color, density_cmap in (('Castle 1', 'blue', 'Blues'),
                                              ('Castle 2', 'red', 'Reds')):
    # Select the class once instead of repeating the boolean mask four times
    members = X[X['Assignment']==gep_class]
    ax = plt.scatter(members['Castle 2'], members['Castle 1'],
                     s = 1, c = point_color, alpha = 0.5)
    ax = sns.kdeplot(members['Castle 2'], members['Castle 1'],
                     cmap=density_cmap, shade=True, shade_lowest=False, n_levels = 20, alpha = 0.5)


# SAVE FIGURE
# The label has no placeholders, so the former `.format(meta, subset_type,
# method, metric)` call only added a NameError risk and was removed.
figure_label = 'MSK_Mixture_Model'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# Update multi-index: add the GEP assignment as an extra index level
new_index = pd.MultiIndex.from_tuples(list(zip(X.index.get_level_values('Sample ID'),
                                               X.index.get_level_values('Legend'),
                                               X.index.get_level_values('Patient'),
                                               X.index.get_level_values('Cell ID'),
                                               X.index.get_level_values('Phenograph_Class'),
                                               X['Assignment'])),
                                  names=['Sample ID','Legend', 'Patient','Cell ID','Phenograph_Class','Assignment'])
# Swap the index on a copy. Rebuilding the frame from `X.values` pushed every
# column through one mixed-type array, turning the numeric score columns into
# object columns.
X = X.copy()
X.index = new_index

# CLASS ASSIGNMENT BY PATIENT (cell counts, patients x classes)
meta = 'Assignment'
tmp = X.groupby(level=['Patient', meta]).size().unstack().fillna(0)
# Colour by class name, so a class that is absent from `tmp` cannot shift the
# remaining class onto the wrong colour.
gep_colors = {'Castle 1': "#0000FF", 'Castle 2': "#FF0000"}
colors = [gep_colors[gep_class] for gep_class in tmp.columns]

plt.figure(figsize = (5,3))
ax = plt.gca()
tmp.div(tmp.sum(axis=1),axis=0).plot.barh(stacked=True, color=colors, ax = ax, width = 0.95)
ax.legend_.remove()
ax.set_frame_on(False)
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.xlabel('Cell Fraction', fontsize=12)
plt.ylabel('Patient', fontsize=12)

# SAVE FIGURE
#figure_label = 'GEP_Distribution_by_Patient'
#fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label
#plt.savefig(fn + '.png', dpi=400, transparent=True)
#plt.savefig(fn + '.pdf', dpi=400)
#print(fn)

In [None]:
# Down sampled assignment distribution (not used in paper anymore)

# DOWNSAMPLED GEP CLASSIFICATION
# Each repetition draws `n` cells with replacement from every patient, refits
# the 2-component mixture model on the pooled draw, names the components and
# records the class fractions per patient.
n = 500
reps = np.arange(20)

master_groupby = 'Patient'
ix = [ind for ind,name in enumerate(X.index.names) if name==master_groupby][0]
patient_of_cell = X.index.get_level_values(master_groupby)
patients = np.unique(patient_of_cell)
gep_classes = ['Castle 1', 'Castle 2']
fraction_castle1 = np.zeros((len(reps),len(patients)))
fraction_castle2 = np.zeros((len(reps),len(patients)))

for rep in reps:
    # RANDOMLY SAMPLE SAME NUMBER OF CELLS PER PATIENT
    # (pd.concat instead of the removed DataFrame.append)
    SUBSET = pd.concat([X.loc[patient_of_cell == class_selection].sample(n=n,replace=True)
                        for class_selection in patients])

    bgmm = BayesianGaussianMixture(covariance_type='diag', n_components=2, random_state=1)
    bgmm.fit(SUBSET[gep_classes])
    clusters = bgmm.predict(SUBSET[gep_classes])
    SUBSET['Assignment'] = clusters

    # Make Class Assignments: a component is named after whichever signature
    # has the higher mean score among its cells.
    lut ={}
    for ii in np.unique(SUBSET['Assignment']):
        members = SUBSET[SUBSET['Assignment']==ii]
        if members['Castle 1'].mean()>members['Castle 2'].mean():
            lut[ii] = 'Castle 1'
        else:
            lut[ii] = 'Castle 2'

    SUBSET['Assignment'] = SUBSET['Assignment'].map(lut)

    # Update multi-index: add the class assignment as an extra index level
    new_index = pd.MultiIndex.from_tuples(list(zip(SUBSET.index.get_level_values('Sample ID'),
                                                   SUBSET.index.get_level_values('Legend'),
                                                   SUBSET.index.get_level_values('Patient'),
                                                   SUBSET.index.get_level_values('Cell ID'),
                                                   SUBSET.index.get_level_values('Phenograph_Class'),
                                                   SUBSET['Assignment'])),
                                          names=['Sample ID','Legend', 'Patient','Cell ID',
                                                 'Phenograph_Class','Assignment'])
    SUBSET = pd.DataFrame(data = SUBSET.values, columns = SUBSET.columns, index = new_index)

    # CASTLE ASSIGNMENT PER PATIENT (cell counts, patients x classes)
    meta = 'Assignment'
    tmp = SUBSET.groupby(level=['Patient', meta]).size().unstack().fillna(0)
    # Both components can receive the same name; the missing class then gets
    # an explicit 0 column instead of raising KeyError below.
    tmp = tmp.reindex(columns=gep_classes, fill_value=0)

    # SAVE PER DOWNSAMPLE
    cell_type_fraction = tmp.div(tmp.sum(axis=1),axis=0)
    fraction_castle1[rep,:] = cell_type_fraction['Castle 1'].values
    fraction_castle2[rep,:] = cell_type_fraction['Castle 2'].values

# CONVERT TO A LONG PANDAS DATAFRAME: one row per (Rep, Patient, Class)
patient_columns = ['{}_{}'.format(master_groupby,ind) for ind in patients]
per_class_frames = []
for gep_class, fraction in zip(gep_classes, (fraction_castle1, fraction_castle2)):
    frame = pd.DataFrame(data = fraction, columns = patient_columns)
    frame = frame.stack().rename_axis(('Rep', 'Patient')).reset_index(name='Fraction')
    frame['Class'] = gep_class
    per_class_frames.append(frame)

boxplot_data = pd.concat(per_class_frames)
boxplot_data['Patient'] = [x.replace('Patient','MSK') for x in boxplot_data.Patient]

# Floor at 1e-4 so zero fractions can be drawn on the log axis. The threshold
# now equals the floor; it used to be 0.001, which pushed small non-zero
# fractions down to 1e-4.
boxplot_data.Fraction = [x if x > 0.0001 else 1e-4 for x in boxplot_data.Fraction]

# BAR PLOT
fig = plt.figure(figsize = (10,4))
ax = plt.gca()

gep_palette = {"Castle 1": "#0000FF", "Castle 2": "#FF0000", 'Mixed': '#000000'}

# Bars show the mean over repetitions with a 95% interval
sns.barplot(data=boxplot_data,x = 'Patient',
                y = 'Fraction', hue = 'Class', ci=95, capsize=.2,
                palette=gep_palette,
                linewidth=1, ax = ax, alpha=1)

# Individual repetitions overlaid as points
g = sns.stripplot(data=boxplot_data,x = 'Patient',
                y = 'Fraction', hue = 'Class',
                palette=gep_palette,
                linewidth = 0.5, edgecolor='white',
                ax = ax, alpha=0.7,
                dodge=True)

fig.get_axes()[0].set_yscale('log')

plt.ylim((1e-4,1.15))
g.set_ylabel("Cell Type Fraction",fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
g.set_xticklabels(ax.get_xticklabels(),rotation=90)
sns.despine()

ax.legend_.remove()
#plt.tight_layout()

# SAVE FIGURE
figure_label = 'GEP_Distribution_by_Patient_LOG'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/' + figure_label
plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# Inspection cell: display the current plotting table
boxplot_data

In [None]:
# Inspection cell: display the current SUBSET frame
SUBSET

In [None]:
# GEP CLASS FRACTIONS PER PATIENT, ESTIMATED ON EQUAL-DEPTH RESAMPLES
# For each replicate every patient is resampled (with replacement) to n cells;
# the Castle 1 / Castle 2 fractions of those cells are then recorded per patient.
n = 500
reps = np.arange(20)

boxplot_patient_list = []
boxplot_fract_list = []
boxplot_class_list = []
GEP_CLASS_LIST = ['Castle 1', 'Castle 2']
master_groupby = 'Patient'

# Group label of every cell, looked up by level name so that the cell does not
# depend on a positional index variable defined elsewhere in the notebook.
group_of_cell = METADATA_TUMOR.index.get_level_values(master_groupby)

for rep in reps:
    # One resample per replicate: each group is drawn independently, so it does
    # not need to be redrawn for every patient.
    resampled = []
    for class_selection in np.unique(group_of_cell):
        CSUBSET = METADATA_TUMOR.loc[group_of_cell == class_selection].sample(n=n,replace=True)
        resampled.append(CSUBSET)
    SUBSET = pd.concat(resampled)
    subset_patient = SUBSET.index.get_level_values('Patient')

    for patient in METADATA_TUMOR.index.get_level_values('Patient').unique():
        # Reindex on the classes we report, so a class that is absent from the
        # resample counts as 0 instead of raising a KeyError.
        counts = SUBSET.loc[subset_patient == patient].Assignment.value_counts().\
                 reindex(GEP_CLASS_LIST, fill_value=0)
        # Fractions are relative to the Castle 1 + Castle 2 cells only
        tmp = (counts / counts.sum()).tolist()

        boxplot_patient_list.extend([patient] * len(GEP_CLASS_LIST))
        boxplot_fract_list.extend(tmp)
        boxplot_class_list.extend(GEP_CLASS_LIST)

boxplot_data = pd.DataFrame()
boxplot_data['Patient'] = ['MSK_' + x for x in boxplot_patient_list]
boxplot_data['Fraction'] = boxplot_fract_list
boxplot_data['Class'] = boxplot_class_list

# Floor fractions at 1e-4 (the lower y-limit) so zero values remain drawable on the log axis
boxplot_data.Fraction = [x if x > 0.0001 else 1e-4 for x in boxplot_data.Fraction]

# Set plot
fig = plt.figure(figsize = (7,4))
ax = plt.gca()

# Bars: per-patient, per-class Fraction with 95% confidence-interval error bars
sns.barplot(data=boxplot_data,x = 'Patient', 
                y = 'Fraction', hue = 'Class', ci=95, capsize=0.15,
                palette={"Castle 1": "#0000FF", "Castle 2": "#FF0000", 'Mixed': '#000000'},
                linewidth=1, ax = ax, alpha=1,
                errcolor='black',
                errwidth=1)

# Points: the individual replicate values, dodged by class to sit over their bar
g = sns.stripplot(data=boxplot_data,x = 'Patient', 
                y = 'Fraction', hue = 'Class',
                palette={"Castle 1": "#0000FF", "Castle 2": "#FF0000", 'Mixed': '#000000'},
                linewidth = 0.5, edgecolor='white',
                ax = ax, alpha=0.7,
                dodge=True)

ax.set_yscale('log')

plt.ylim((1e-4,1.15))
ax.set_ylabel("Cell Type Fraction",fontsize=10)
ax.set_xlabel(" ",fontsize=10,rotation = 90)
ax.tick_params(labelsize=10)
# unique() keeps order of first appearance, which is also the order seaborn plots the categories in
ax.set_xticklabels(boxplot_data['Patient'].unique(),rotation=90)
sns.despine()

ax.legend_.remove()
plt.tight_layout()

# SAVE FIGURE
figure_label = 'GEP_Distribution_by_Patient_LOG'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/' + figure_label 
plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# GEP CLASS FRACTIONS PER PATIENT (ALL CELLS, NO DOWNSAMPLING), STACKED BARS
boxplot_patient_list = []
boxplot_fract_list = []
boxplot_class_list = []
GEP_CLASS_LIST = ['Castle 1', 'Castle 2']

patient_of_cell = METADATA_TUMOR.index.get_level_values('Patient')

for patient in patient_of_cell.unique():
    # Reindex on the classes we report, so a class that is absent for a patient
    # counts as 0 instead of raising a KeyError.
    counts = METADATA_TUMOR.loc[patient_of_cell == patient].Assignment.value_counts().\
             reindex(GEP_CLASS_LIST, fill_value=0)
    # Fractions are relative to the Castle 1 + Castle 2 cells only
    tmp = (counts / counts.sum()).tolist()

    boxplot_patient_list.extend([patient] * len(GEP_CLASS_LIST))
    boxplot_fract_list.extend(tmp)
    boxplot_class_list.extend(GEP_CLASS_LIST)

boxplot_data = pd.DataFrame()
boxplot_data['Patient'] = ['MSK_' + x for x in boxplot_patient_list]
boxplot_data['Fraction'] = boxplot_fract_list
boxplot_data['Class'] = boxplot_class_list

# Set plot
fig = plt.figure(figsize = (7,4))
ax = plt.gca()

# Plot bar plot: Castle 1 at the bottom, Castle 2 stacked on top of it.
# Both class tables list the patients in the same order (one row per patient).
castle1 = boxplot_data.loc[boxplot_data.Class == 'Castle 1']
castle2 = boxplot_data.loc[boxplot_data.Class == 'Castle 2']

# Castle 1:
ax.bar(x = castle1['Patient'], 
        height = castle1['Fraction'], capsize=.2,
        color="#0505C5",
        linewidth = 1,
        width=0.5)

# Castle 2:
ax.bar(x = castle2['Patient'], 
        height = castle2['Fraction'], capsize=.2,
        color="#D60808",
        linewidth = 1,
        bottom=castle1['Fraction'].values,
        width=0.5)

ax.set_yscale('log')

# NOTE(review): a lower limit of 0 cannot be shown on a log axis -- confirm
# whether an explicit positive lower limit (e.g. 1e-4 as in the downsampled
# plot) is wanted here.
plt.ylim((0,1))
ax.set_ylabel("Cell Type Fraction",fontsize=10)
ax.set_xlabel(" ",fontsize=10,rotation = 90)
ax.tick_params(labelsize=10)
# One label per bar position: boxplot_data holds every patient once per class,
# so the raw column would supply each name twice and mislabel the ticks.
ax.set_xticklabels(boxplot_data['Patient'].unique(),rotation=90)
sns.despine()

#ax.legend_.remove()

# SAVE FIGURE
figure_label = 'GEP_Distribution_by_Patient_LOG_NO_ds'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/' + figure_label 
plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# Get GEP distributions per patient:
# For every patient, compute the fraction of its cells assigned to each class in
# GEP_CLASS_LIST (normalised over those classes only), using all cells.

boxplot_patient_list = []
boxplot_fract_list = []
boxplot_class_list = []
GEP_CLASS_LIST = ['Castle 1', 'Castle 2']

for patient in METADATA_TUMOR.index.get_level_values('Patient').unique():
    # Per-class cell counts for this patient; classes missing for the patient count as 0
    tmp = [METADATA_TUMOR.loc[METADATA_TUMOR.index.get_level_values('Patient') == patient].\
           Assignment.value_counts().reindex(METADATA_TUMOR.Assignment.unique(), fill_value=0)[x] 
           for x in GEP_CLASS_LIST]
    # Convert counts to fractions
    tmp = list(tmp/sum(tmp))
    
    boxplot_patient_list = boxplot_patient_list + [patient for x in range(len(GEP_CLASS_LIST))]
    boxplot_fract_list = boxplot_fract_list + tmp
    boxplot_class_list = boxplot_class_list + GEP_CLASS_LIST

# Long-format table: one row per (patient, class)
boxplot_data = pd.DataFrame()
boxplot_data['Patient'] = ['MSK_' + x for x in boxplot_patient_list]
boxplot_data['Fraction'] = boxplot_fract_list
boxplot_data['Class'] = boxplot_class_list

# BAR PLOT
fig = plt.figure(figsize = (8,6))
ax = plt.gca()

# Grouped bars: one bar per class for each patient (a single value per bar,
# so the ci/capsize settings have no replicate spread to summarise)
g = sns.barplot(data=boxplot_data,x = 'Patient', 
                y = 'Fraction', hue = 'Class', ci=95, capsize=.2,
                palette={"Castle 1": "#0000FF", "Castle 2": "#FF0000", 'Mixed': '#000000'},
                linewidth = 1, ax = ax);
fig.get_axes()[0].set_yscale('log')

# NOTE(review): a lower limit of 0 cannot be shown on a log axis -- confirm the intended lower bound
plt.ylim((0,1))
g.set_ylabel("scRNA-seq \n Tumor Fraction", fontsize=18)
g.set_xlabel(" ", fontsize=10, rotation = 90)
g.tick_params(labelsize=18)
g.set_xticklabels(ax.get_xticklabels(),rotation=90)
#sns.set_style("noticks")
sns.despine()

ax.legend_.remove()
plt.tight_layout()
g.axis(True)

# SAVE FIGURE
figure_label = 'GEP_Distribution_by_Patient_LOG_NO_ds'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 
plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# Inspection cell: first rows of the per-patient fraction table
boxplot_data.head()

###### TCGA DIAMOND PLOTS

In [None]:
# CONVERT HEX TO RGB (FLATUI_CASTLE)
# Builds lut_CASTLE: GEP class label -> RGB row scaled to the 0-1 range.
# `hex` here is the helper imported from the `colors` package, not the builtin.
ind_GEP = ['Castle 1', 'Castle 2']
colors = np.zeros((len(FLATUI_CASTLE),3))
for ii,hexcolor in enumerate(FLATUI_CASTLE):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
# Choose one evenly spaced palette row per class; linspace ends at len(colors),
# so the last index is clamped back to the final valid row.
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind_GEP)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut_CASTLE = dict(zip(np.unique(ind_GEP), colors[cix,:]))

In [None]:
# One Series of lut_CASTLE colours per patient (in order of first appearance),
# one entry per cell of that patient. Cells are selected by index position 2,
# which is compared against the 'Patient' level values.
GEP_color_set = [pd.Series(METADATA_TUMOR.loc[METADATA_TUMOR.index.map(lambda x: x[2] == y)].\
                           Assignment.values.tolist()).map(lut_CASTLE)
                 for y in METADATA_TUMOR.index.get_level_values('Patient').unique()]

In [None]:
# TCGA PROBABILITY DIAMOND PLOTS, ONE PANEL PER PATIENT, COLOURED BY GEP CLASS
# Each cell is placed at (TCGA2 - TCGA4, TCGA1 - TCGA3).

# Outline for every panel: a sqrt(2)-sided square anchored at (0,-1) and rotated
# by 45 degrees, i.e. a diamond with corners at (0,-1), (1,0), (0,1), (-1,0).
# `angle` is passed by keyword because newer matplotlib rejects it positionally.
diamond_list = [plt.Rectangle((0,-1), np.sqrt(2), np.sqrt(2), angle=45, color='black', fill=False, linewidth=0.5) 
                for x in range(6)]

fig1, axes = plt.subplots(2,3)

# Slice X_norm once per patient instead of recomputing the same mask four times
patient_of_cell = X_norm.index.get_level_values('Patient')
per_patient_norm = [X_norm.loc[patient_of_cell == y] for y in X_norm.index.unique(level='Patient')]

TCGA_probs_x_MSK = [df.TCGA2.values - df.TCGA4.values for df in per_patient_norm]

TCGA_probs_y_MSK = [df.TCGA1.values - df.TCGA3.values for df in per_patient_norm]

# List (per patient) of (x, y) tuples (per cell); reused by later plotting cells
TCGA_probs_per_patient_MSK = [list(zip(xs, ys)) for xs, ys in zip(TCGA_probs_x_MSK, TCGA_probs_y_MSK)]

for i, ax in enumerate(axes.flatten()):
    ax.set_frame_on(False)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    if i >= len(TCGA_probs_per_patient_MSK):
        # Fewer patients than panels: leave the spare panel empty
        continue
    ax.scatter(TCGA_probs_x_MSK[i], TCGA_probs_y_MSK[i], s=1, color=GEP_color_set[i],
               alpha=0.25)
    ax.add_patch(diamond_list[i])

# SAVE FIGURE
figure_label = 'TCGA_PROBS_DiamondPlot'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 
    
d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)
    
fig1.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig1.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# TCGA PROBABILITY DIAMOND PLOTS, ONE PANEL PER PATIENT, MONOCHROME
# Each cell is placed at (TCGA2 - TCGA4, TCGA1 - TCGA3).

# Outline for every panel: a sqrt(2)-sided square anchored at (0,-1) and rotated
# by 45 degrees, i.e. a diamond with corners at (0,-1), (1,0), (0,1), (-1,0).
# `angle` is passed by keyword because newer matplotlib rejects it positionally.
diamond_list = [plt.Rectangle((0,-1), np.sqrt(2), np.sqrt(2), angle=45, color='black', fill=False, linewidth=0.5)
                for x in range(6)]

fig1, axes = plt.subplots(2,3)

# Slice X_norm once per patient instead of recomputing the same mask four times
patient_of_cell = X_norm.index.get_level_values('Patient')
per_patient_norm = [X_norm.loc[patient_of_cell == y] for y in X_norm.index.unique(level='Patient')]

TCGA_probs_x_MSK = [df.TCGA2.values - df.TCGA4.values for df in per_patient_norm]

TCGA_probs_y_MSK = [df.TCGA1.values - df.TCGA3.values for df in per_patient_norm]

# List (per patient) of (x, y) tuples (per cell); reused by later plotting cells
TCGA_probs_per_patient_MSK = [list(zip(xs, ys)) for xs, ys in zip(TCGA_probs_x_MSK, TCGA_probs_y_MSK)]

for i, ax in enumerate(axes.flatten()):
    ax.set_frame_on(False)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    if i >= len(TCGA_probs_per_patient_MSK):
        # Fewer patients than panels: leave the spare panel empty
        continue
    ax.scatter(TCGA_probs_x_MSK[i], TCGA_probs_y_MSK[i], s=0.2, color='black',
               alpha=0.3)
    ax.add_patch(diamond_list[i])

# SAVE FIGURE
figure_label = 'TCGA_PROBS_DiamondPlot_monochrome'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 
    
d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)
    
fig1.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig1.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# CONVERT HEX TO RGB (FLATUI_TCGA)
# Builds lut_TCGA (TCGA class label -> RGB row scaled to 0-1) and the matching
# discrete colormap CM_TCGA. `hex` is the helper imported from the `colors`
# package, not the builtin.
FLATUI_TCGA = ['0000FF', 'A4D3FC', 'FF7DC2', 'FF0000']

ind_TCGA = TCGA_CLASS_LIST
colors = np.zeros((len(FLATUI_TCGA),3))
for ii,hexcolor in enumerate(FLATUI_TCGA):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
# Choose one evenly spaced palette row per class; linspace ends at len(colors),
# so the last index is clamped back to the final valid row.
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind_TCGA)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut_TCGA = dict(zip(np.unique(ind_TCGA), colors[cix,:]))
CM_TCGA = LinearSegmentedColormap.from_list('FLATUI_TCGA', colors, N=len(colors))

In [None]:
# One Series of lut_TCGA colours per patient (in order of first appearance),
# one entry per cell of that patient. Cells are selected by index position 2,
# which is compared against the 'Patient' level values.
TCGA_color_set = [pd.Series(METADATA_TUMOR.loc[METADATA_TUMOR.index.map(lambda x: x[2] == y)].\
                           TCGA_Assignment.values.tolist()).map(lut_TCGA)
                 for y in METADATA_TUMOR.index.get_level_values('Patient').unique()]

In [None]:
# TCGA PROBABILITY DIAMOND PLOTS, ONE PANEL PER PATIENT, COLOURED BY TCGA CLASS
# Reuses TCGA_probs_per_patient_MSK (per-patient lists of (x, y) tuples).

# Outline for every panel: a sqrt(2)-sided square anchored at (0,-1) and rotated
# by 45 degrees, i.e. a diamond with corners at (0,-1), (1,0), (0,1), (-1,0).
# `angle` is passed by keyword because newer matplotlib rejects it positionally.
diamond_list = [plt.Rectangle((0,-1), np.sqrt(2), np.sqrt(2), angle=45, color='black', fill=False, linewidth=0.5)
                for x in range(6)]

fig1, axes = plt.subplots(2,3)

for i, ax in enumerate(axes.flatten()):
    # zip(*points) splits the (x, y) tuples into the x and y sequences scatter expects
    ax.scatter(*zip(*TCGA_probs_per_patient_MSK[i]), s=1, color=TCGA_color_set[i],
               alpha=0.25)
    ax.set_frame_on(False)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    ax.add_patch(diamond_list[i])

# SAVE FIGURE
figure_label = 'TCGA_PROBS_DiamondPlot_tcga_colored'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 
    
d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)
    
fig1.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig1.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

## GEP CLINICAL PROGNOSTIFICATION OF INDIVIDUAL TUMOR CELLS

### GEP SIGNATURE CLUSTERED HEATMAP

In [None]:
# Genes of the GEP signature, taken from the index of the marker table
class_genes = GEP_markerfile_df.index.values.tolist()

# Working with imputed normalized tumor cell data:
subset_type = 'TUMOR'
datatype = 'INDF_{}'.format(subset_type)
# Bind QUERY to the notebook variable named by `datatype` (here INDF_TUMOR)
exec('QUERY = {}'.format(datatype))

# COLUMN INDEX AND COLORS
# Keep only signature genes that are present as columns of QUERY
genes = [gene for gene in class_genes if gene in QUERY.columns]#gene_array['Gene'].values
# CONSTRUCT HEATMAP DATA
# zscore is applied along axis 0, i.e. each gene column is standardised across cells
heatmap_data = pd.DataFrame(data = zscore(QUERY[genes].values,axis=0),columns = genes, index = QUERY.index)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE 
# Hierarchical clustering of rows (cells) and then of columns (genes);
# `linkage` ends up holding the column linkage.
method = 'average' # average, single
metric = 'euclidean' # cosine
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

In [None]:
# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
# Leaf order of the row and column dendrograms, used to permute the heatmap
r1 = hc.leaves_list(row_linkage)
c1 = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[r1,c1]

In [None]:
# GEP assignment of every heatmap row, in the (clustered) row order of `mat`.
# A single Cell ID -> Assignment lookup table replaces one full scan of
# METADATA_TUMOR per row.
_assignment_by_cell = pd.Series(METADATA_TUMOR.Assignment.values,
                                index=METADATA_TUMOR.index.get_level_values('Cell ID'))
# If a Cell ID occurs more than once, keep its first assignment
_assignment_by_cell = _assignment_by_cell[~_assignment_by_cell.index.duplicated(keep='first')]

GEP_assignments_sorted = _assignment_by_cell.loc[mat.index.get_level_values('Cell ID').values].tolist()

# Colour of each row, looked up from its assignment label
row_colors = pd.Series(GEP_assignments_sorted, index=GEP_assignments_sorted).map(lut_CASTLE)

In [None]:
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(4,10))
# Note: this changes the global matplotlib setting for subsequent figures too
plt.rcParams["axes.grid"] = False

# ADD ROW COLOR INDEX 1 (row_colors: one colour per heatmap row)
ax1 = fig.add_axes([0,0.1,0.05,0.6]) # [x0,y0,width,height]
x = 0
y = 0
# Stack one thin rectangle per row from bottom to top; x only advances once the
# last row has been drawn, so every patch lands in the same column.
for c in row_colors:
    pos = (x, y / len(row_colors))
    ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(row_colors), color=c))
    if y >= len(row_colors)-1:
        x += 1
        y = 0
    else:
        y += 1
plt.axis('off')

# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.05,0.1,0.9,0.6])
# origin='lower' puts the first row at the bottom, matching the colour strip;
# the colour scale is clipped to [-1, 1]
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-1,vmax=1)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_frame_on(False)
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 14)
plt.tick_params(size=0)

# ADD COLORBAR
axcolor = fig.add_axes([1.1,0.1,0.1,0.1])
cbar = plt.colorbar(im, cax=axcolor)
#cbar.ax.get_yaxis().set_ticks([])

# SAVE FIGURE
# The label has no placeholders, so the .format() arguments are not used in the file name
figure_label = 'GEP_Heatmap_BayesianGMM'.format(meta,subset_type,method,metric)
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

### PAIRWISE CORRELATION BETWEEN MONOSOMY 3 SIGNATURE AND GEP2

In [None]:
# Using imputed normalized data for tumor cells:
subset_type = 'TUMOR'
datatype = 'INDF_{}'.format(subset_type)
# Bind QUERY / META to the notebook variables INDF_TUMOR / METADATA_TUMOR
exec('QUERY = {}'.format(datatype))
exec('META = METADATA_{}'.format(subset_type))

meta = 'Assignment'

# Signature 1: per-cell sum of expression over the detected 'Castle 2' genes
gene1 = 'Castle 2'
signature_genes = genesets[gene1].values
# Drop the NaN padding of the geneset column
signature_genes = [x for x in signature_genes if str(x) != 'nan']
detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
G1 = (np.nansum(QUERY[detected_genes],axis=1))#*complexity

# Signature 2: per-cell sum of expression over the detected 'Monosomy 3 Up' genes
gene2 = 'Monosomy 3 Up'
signature_genes = genesets[gene2].values
signature_genes = [x for x in signature_genes if str(x) != 'nan']
detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
G2 = (np.nansum(QUERY[detected_genes],axis=1))#*complexity

# ROW INDEX
meta = 'Assignment'
groupby_type = meta # 'Class', 'Meta-Source', 'Legend'
FLATUI = ['0000FF','FF0000']#FLATUI_CLASS, 'FLATUI_SOURCE, FLATUI_SAMPLES

# One row per cell: its assignment label and the two signature scores
violin_data = pd.DataFrame({
                            meta:list(META[meta]),
                            gene1: G1,
                            gene2: G2,
                           })

ind = violin_data[groupby_type]

# CONVERT HEX TO RGB (FLATUI)
colors = np.zeros((len(FLATUI),3))
for ii,hexcolor in enumerate(FLATUI):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
# Palette for the assignment labels: one evenly spaced palette row per label,
# with the last index clamped back to the final valid row
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
# Per-cell colour, as RGB rows (dot_colors) and as hex strings (metacell_colors)
dot_colors = pd.Series(ind).map(lut)
metacell_colors = [rgb2hex(int(color[0]*255), int(color[1]*255), int(color[2]*255)) for color in dot_colors]

In [None]:
# JOINT DISTRIBUTION OF THE TWO SIGNATURE SCORES (gene1 vs gene2)
x = gene1
y = gene2

plt.figure(figsize = (20,20))
sns.despine()
sns.set(style='ticks')

# JOINTPLOT
# Drawn only for its marginal distributions: the joint axes are cleared and redrawn below
g = sns.jointplot(x, y, data=violin_data.fillna(0), kind="reg", stat_func=None, ratio=2, color='k', size=10)
g.ax_joint.cla() # or g.ax_joint.collections[0].set_visible(False), as per mwaskom's comment

# REPLOT SCATTER WITH LINEAGE COLORED
plt.sca(g.ax_joint)
plt.scatter(violin_data[x], violin_data[y], c=dot_colors)

# ADD REGRESSION
# Straight-line fit: f evaluates the polynomial with coefficients p, and the
# two-element initial guess [1, 1] makes it a degree-1 polynomial
xx = violin_data[x]
yy = violin_data[y]
f = lambda x, *p: polyval(p, x)
p, cov = curve_fit(f, xx, yy, [1, 1])

# simulated draws from the probability density function of the regression
xi = linspace(np.min(xx), np.max(xx), 100)
ps = np.random.multivariate_normal(p, cov, 10000)
ysample = np.asarray([f(xi, *pi) for pi in ps])
# 10th-90th percentile band of the simulated regression lines
lower = percentile(ysample, 10, axis=0)
upper = percentile(ysample, 90, axis=0)

# regression estimate line
y_fit = poly1d(p)(xi)

# PLOT REGRESSION LINE
plt.fill_between(xi, lower, upper, facecolor='k', alpha=0.5)
plt.plot(xi, y_fit, 'k-')
plt.tick_params(size=0)
plt.yticks(ticks=[0, 5, 10, 15, 20, 25, 30, 35], 
           labels=['0', '5', '10', '15', '20', '25', '30', '35'], size=20)
plt.xticks(ticks=[0, 5, 10, 15, 20], labels=['0', '5', '10', '15', '20'], size=20)
#plt.xlim((0,5))
#plt.ylim((0,5))

# ADD AXIS LABELS
g.set_axis_labels('GEP2 Signature', 'Monosomy 3 Signature', fontsize=28, fontname = 'Arial')

# COMPUTE STATISTICS
# Note: this rebinds p from the fit coefficients to the Pearson p-value
(r, p) = stats.pearsonr(violin_data[x].fillna(0), violin_data[y].fillna(0))
print('{} vs {}, R: {}, p: {}'.format(x,y,r,p))

# SAVE FIGURE
# Saving is disabled, so the path is not printed either: `fn` would otherwise be
# a stale value left over from another cell (or undefined).
#figure_label = 'M3_GEP2_Correlation_BGMM_UPDATED'.format(meta,subset_type,method,metric)
#fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 
#    
#d = os.path.dirname(fn)
#if not os.path.exists(d):
#    os.makedirs(d)
#    
#plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
#plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
#print(fn)

In [None]:
# Pearson correlation (r, p-value) between the two signature scores over all rows
stats.pearsonr(violin_data[x].fillna(0), violin_data[y].fillna(0))

In [None]:
# Inspection cell: first rows of the signature score table
violin_data.head()

In [None]:
# Same correlation restricted to the first 1088 rows of violin_data.
# NOTE(review): 1088 is a hard-coded row count -- confirm which subset it is meant to select.
stats.pearsonr(violin_data[:1088][x].fillna(0), violin_data[:1088][y].fillna(0))

In [None]:
# Next representable float after 0 in the direction of 1 (smallest positive float)
np.nextafter(0, 1)

In [None]:
# Floating-point limits of this interpreter.
# NOTE(review): relies on `sys` being imported elsewhere in the notebook -- confirm.
sys.float_info

### EVALUATE GENE EXPRESSION PER GEP CLASS

In [None]:
# Compare expression of one gene (`title`) between the two GEP classes.
# subset_type is not set here; the value left by an earlier cell is used.
plot_type = 'box'
title = 'BAP1'
meta = 'Assignment'
datatype = 'INDF_{}'.format(subset_type)
# Bind QUERY / META to the notebook variables INDF_<subset_type> / METADATA_<subset_type>
exec('QUERY = {}'.format(datatype))
exec('META = METADATA_{}'.format(subset_type))

fig = plt.figure(figsize = (1,3))
ax = plt.gca()

scale_type = 'count'
palette = dict(zip(['Castle 1','Castle 2'],['#0000FF','#FF0000'])) 


genes = [title]
# NOTE(review): str(float('nan')) is 'nan', so this 'NAN' comparison would not
# drop NaN entries -- confirm the intended filter.
genes = np.unique([x for x in genes if str(x) != 'NAN'])
detected_genes = list(set(genes).intersection(set(QUERY.columns)))
vals = QUERY[detected_genes]
# Per-cell score: sum over the detected genes (a single gene here)
SCORE = np.nansum(vals,axis=1)

# Format data structure for violin plot
# One row per cell: gene name, score, and the cell's assignment label.
# The score column is named 'Z-normalized Expression' although the values are
# the plain sums computed above.
violin_data = []
for ind,v in enumerate(SCORE):
    violin_data.append({'gene': title, 'Z-normalized Expression': v,
                        meta:META[meta].values[ind]}) 
violin_data = pd.DataFrame(violin_data)  

# BOXPLOT GENE EXPRESSION
if plot_type == 'box':
    g = sns.boxplot(x="gene", y="Z-normalized Expression", 
                    hue=meta,data=violin_data, palette=palette,notch = True, 
                    hue_order = ['Castle 1','Castle 2'], 
                    fliersize = 4, showmeans=False,linewidth = 1, ax = ax) #order = labels, 
    # The label has no placeholder, so the .format() argument is unused
    g.set_ylabel('BAP1 Imputed Expression'.format(datatype),fontsize=14)
    g.set_xlabel(" ",fontsize=10,rotation = 90)
    g.tick_params(labelsize=10)
    sns.despine()
    # Upper limit is 95% of the maximum score, so the largest values are cut off
    ax.set(ylim=(0, SCORE.max()*0.95))
    g.legend(loc='upper right',prop={'size':6},bbox_to_anchor=(2.0, 0.95),fancybox=True) 


elif plot_type == 'violin':
    # VIOLIN GENE EXPRESSION
    # NOTE(review): notch / fliersize / showmeans are box-plot options and `ax`
    # is not passed here -- confirm this branch still works as intended.
    g = sns.violinplot(x="gene", y="Z-normalized Expression", 
                       hue=meta,data=violin_data, palette=palette,notch = True, 
                    hue_order = ['Castle 1','Castle 2'],fliersize = 4, showmeans=True,linewidth = 1) #order = labels, 

    g.set_ylabel('BAP1 Imputed Expression'.format(datatype),fontsize=14)
    g.set_xlabel(" ",fontsize=10,rotation = 90)
    ax.set(ylim=(0, SCORE.max()*0.95))
    g.tick_params(labelsize=10)
    sns.despine()
    
plt.tick_params(size=0)
plt.yticks(ticks=[0, 0.5, 1, 1.5, 2.0], 
           labels=['0.0', '0.5', '1.0', '1.5', '2.0'], size=11)

# COMPARE DISTRIBUTIONS
# Mann-Whitney U test between the scores of the two classes
CLASS1 = violin_data.loc[violin_data[meta].isin(['Castle 1'])]['Z-normalized Expression'].values
CLASS2 = violin_data.loc[violin_data[meta].isin(['Castle 2'])]['Z-normalized Expression'].values
print(title + ' CLASS1 vs. CLASS2')
print(stats.mannwhitneyu(CLASS1,CLASS2))

# SAVE FIGURE
#figure_label = 'BAP1_Expression_by_GEP_BGMM_UPDATED'.format(meta,subset_type,method,metric)
#fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 
#    
#d = os.path.dirname(fn)
#if not os.path.exists(d):
#    os.makedirs(d)
#    
#plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
#plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
#print(fn)

In [None]:
# Number of tumor cells per assignment label in the metadata
METADATA_TUMOR.Assignment.value_counts()

In [None]:
# Number of rows per assignment label in the plotting table (compare with the metadata counts)
violin_data.Assignment.value_counts()

### PHENOTYPIC VOLUME BY PATIENT

###### The commented out code produces the phenotypic volume results, which were then pickled using joblib and imported to save time

In [None]:
# Load the pickled phenotypic volume results, one object per MSK patient:

PHEN_VOL_UM01 = joblib.load('/workdir/uvmel_project/data/MSK_Phenotypic_Volume_UM01')
PHEN_VOL_UM02 = joblib.load('/workdir/uvmel_project/data/MSK_Phenotypic_Volume_UM02')
PHEN_VOL_UM03 = joblib.load('/workdir/uvmel_project/data/MSK_Phenotypic_Volume_UM03')
PHEN_VOL_UM04 = joblib.load('/workdir/uvmel_project/data/MSK_Phenotypic_Volume_UM04')
PHEN_VOL_UM05 = joblib.load('/workdir/uvmel_project/data/MSK_Phenotypic_Volume_UM05')
PHEN_VOL_UM06 = joblib.load('/workdir/uvmel_project/data/MSK_Phenotypic_Volume_UM06')

PHEN_VOL_LIST = [PHEN_VOL_UM01, PHEN_VOL_UM02, PHEN_VOL_UM03, PHEN_VOL_UM04, PHEN_VOL_UM05, PHEN_VOL_UM06]

# Display label of each entry of PHEN_VOL_LIST, and the patients recorded as alive
patient_labels = ['MSK-UM01', 'MSK-UM02', 'MSK-UM03', 'MSK-UM04', 'MSK-UM05', 'MSK-UM06']
alive_patients = ['MSK-UM01', 'MSK-UM05', 'MSK-UM03', 'MSK-UM04']

# Flatten to long format: one (patient label, value) pair per stored value
patient_column = []
volume_column = []
for label, volumes in zip(patient_labels, PHEN_VOL_LIST):
    patient_column.extend([label] * len(volumes))
    volume_column.extend(volumes)

status_column = []
for label in patient_column:
    if label in alive_patients:
        status_column.append('Alive')
    else:
        status_column.append('Deceased')

phen_vol_data = pd.DataFrame()
phen_vol_data['Patient'] = patient_column
phen_vol_data['Log Phenotypic Volume'] = volume_column
phen_vol_data['Status'] = status_column

In [None]:
# Inspection cell: first rows of the phenotypic volume table
phen_vol_data.head()

In [None]:
# Visualize using violin plot:
# One violin per patient, ordered by increasing mean log phenotypic volume and
# filled by survival status.

fig, ax = plt.subplots(figsize=(7,7))

# Average only the numeric column: taking the mean of the whole frame would also
# try to average the string column 'Status', which newer pandas rejects.
patient_order = phen_vol_data.groupby('Patient')['Log Phenotypic Volume'].mean().\
                sort_values().index.values

ax = sns.violinplot(x='Patient', y='Log Phenotypic Volume', data=phen_vol_data,
                    order=patient_order,
                    hue='Status',
                    palette={'Alive': 'white', 'Deceased': 'gray'},
                    dodge=False,
                    scale='count',
                    ax=ax)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.xticks(rotation=90)

# SAVE FIGURE
figure_label = 'Phenotypic_Volume_Violin_MSK_count_scale'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label
    
d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

### PHENOTYPIC VOLUME INTRAPATIENT

###### The commented out code produces the phenotypic volume results, which were then pickled using joblib and imported to save time

In [None]:
# Import phenotypic volume results for GEP1 vs. GEP2 for UM03:

C1_UM03 = joblib.load('/workdir/uvmel_project/data/MSK_Phenotypic_Volume_UM03_GEP1')
C2_UM03 = joblib.load('/workdir/uvmel_project/data/MSK_Phenotypic_Volume_UM03_GEP2')

In [None]:
# Plot results:
# Overlaid kernel density estimates of the two UM03 result sets
# (GEP1 in blue, GEP2 in red).

sns.set_style("white")
fig = plt.figure(figsize = (5,5))
ax = plt.gca()

sns.kdeplot(C1_UM03, shade=True, color="#0000FF")
sns.kdeplot(C2_UM03, shade=True, color="#FF0000")

ax.set_ylabel("Frequency",fontsize=10)
ax.set_xlabel("Log Phenotypic Volume",fontsize=10)
ax.tick_params(labelsize=10)
#g.set_xticklabels(ax.get_xticklabels(),rotation=90)
sns.despine()
plt.ylim((0,4))

# SAVE FIGURE
figure_label = 'Log_Phenotypic_Volume_UM03'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)


## PRC1 TARGET GENES HEATMAPS

### GEP2 RANKED HEATMAP FOR PRC1 TARGET GENES

In [None]:
# RANK BY GEP 2
# Per-cell score: median expression over the rank_by signature genes that are
# detected (present and not all-zero) in the tumor expression matrix.
meta = 'Nearest_Archetype'#'Archetype_Soft_Cluster'
subset_type = 'TUMOR'
# Bind QUERY / META to the notebook variables INDF_TUMOR / METADATA_TUMOR
exec('QUERY = INDF_{}'.format(subset_type))
exec('META = METADATA_{}'.format(subset_type))
rank_by = 'Castle 2'#'Class 2 Onken v2'

# Drop genes that are zero in every cell BEFORE matching the signature against
# the columns; matching first could select a gene that is then removed, and
# QUERY[detected_genes] would raise a KeyError.
QUERY = QUERY.loc[:, (QUERY != 0).any(axis=0)]

genes = np.unique(genesets[rank_by].values.ravel().tolist())
query_columns = set(QUERY.columns)
detected_genes = [gene for gene in genes if gene in query_columns]

vals = QUERY[detected_genes]
SCORE = np.nanmedian(vals,axis=1)

In [None]:
# PRC1 TARGET GENES, CELLS RANKED BY GEP2 SIGNATURE
# Gene list to plot, restricted to the columns of the QUERY left by the
# preceding cell (this runs before QUERY is rebound below).
PRC1_target_genes = pd.read_csv(DATA_PATH+'PRC1_targets_Genesets_and_Chipseq.csv')
plot_genes = PRC1_target_genes['PRC1_targets_Genesets_and_Chipseq'].dropna().values.tolist()
query_columns = set(QUERY.columns)
plot_genes = [x for x in plot_genes if x in query_columns]

# LOAD GENESETS
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets,header='infer')

subset_type = 'TUMOR'
datatype = 'INDF_{}'.format(subset_type)
# Bind QUERY to the notebook variable named by `datatype` (here INDF_TUMOR)
exec('QUERY = {}'.format(datatype))

# HEATMAP OF INDIVIDUAL CELLS RANKED BY MEAN EXPRESSION OF THE rank_by GENESET
rank_by = 'Castle 2'#
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
query_columns = set(QUERY.columns)
genes = [gene for gene in genes if gene in query_columns]
vals = QUERY[genes]
# QUERY is the same object as the frame named by `datatype`, so this adds a
# GEP2_RANK column to that frame as well.
QUERY['GEP2_RANK'] =np.nanmean(vals,axis=1)

# CONSTRUCT HEATMAP DATA
# Sort the cells once by rank; each gene column is z-scored across cells
ranked = QUERY.sort_values(by=['GEP2_RANK'])
heatmap_data = pd.DataFrame( data = zscore(ranked[plot_genes],axis=0), 
                   columns = plot_genes, index = ranked.index)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE 
# Only the columns (genes) are clustered; the row order stays the GEP2 ranking
method = 'average' # average, single centroid/euclidean
metric = 'cosine' # cosine
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[:,cl]

# Smooth along the ranked cells with a centred triangular rolling sum, then
# pad the edge rows (where the window is incomplete) with the nearest full row.
window = 20
mat = mat.rolling(window, win_type='triang',center = True).sum()
half_window = int(window/2)
for ind in np.arange(half_window):
    mat.iloc[ind] = mat.iloc[half_window]    
# Start at 1: -0 is 0, so ind == 0 would overwrite the FIRST row with a row
# taken from the end of the ranking.
for ind in np.arange(1, half_window):
    mat.iloc[-ind] = mat.iloc[-half_window]

# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(4,10))

# ADD MATRIX (tick labels are hidden)
axmatrix = fig.add_axes([0.12,0.1,0.4,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-10,vmax=10)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
axmatrix.set_xticklabels([]);
axmatrix.grid(False)
plt.tick_params(axis='both', size=0)

# SAVE FIGURE
figure_label = 'PRC1_Targets_Heatmap_UPDATED_average_cosine'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/GEP2_Ranked_Heatmaps/' + figure_label 
    
d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# Load the H2AK119Ub gene list (headerless file, so the single column is named
# 'Genes') and keep the genes that are columns of the current QUERY
H2A_target_genes = pd.read_csv(DATA_PATH+'H2AK119Ub_in_921_3_23.txt', header=None)
H2A_target_genes.columns = ['Genes']
plot_genes = H2A_target_genes['Genes'].dropna().values.tolist()
plot_genes = [x for x in plot_genes if x in QUERY.columns.values.tolist()]

In [None]:
# Fraction of the listed genes that were found among the QUERY columns
len(plot_genes) / len(H2A_target_genes)

In [None]:
# H2AK119Ub TARGET GENES, CELLS RANKED BY GEP2 SIGNATURE
# Gene list to plot (headerless file, single column named 'Genes'), restricted
# to the columns of the QUERY left by an earlier cell (this runs before QUERY
# is rebound below).
H2A_target_genes = pd.read_csv(DATA_PATH+'H2AK119Ub_in_921_3_23.txt', header=None)
H2A_target_genes.columns = ['Genes']
plot_genes = H2A_target_genes['Genes'].dropna().values.tolist()
query_columns = set(QUERY.columns)
plot_genes = [x for x in plot_genes if x in query_columns]

# LOAD GENESETS
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets,header='infer')

subset_type = 'TUMOR'
datatype = 'INDF_{}'.format(subset_type)
# Bind QUERY to the notebook variable named by `datatype` (here INDF_TUMOR)
exec('QUERY = {}'.format(datatype))

# HEATMAP OF INDIVIDUAL CELLS RANKED BY MEAN EXPRESSION OF THE rank_by GENESET
rank_by = 'Castle 2'#
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
query_columns = set(QUERY.columns)
genes = [gene for gene in genes if gene in query_columns]
vals = QUERY[genes]
# QUERY is the same object as the frame named by `datatype`, so this adds a
# GEP2_RANK column to that frame as well.
QUERY['GEP2_RANK'] =np.nanmean(vals,axis=1)

# CONSTRUCT HEATMAP DATA
# Sort the cells once by rank; each gene column is z-scored across cells
ranked = QUERY.sort_values(by=['GEP2_RANK'])
heatmap_data = pd.DataFrame(data = zscore(ranked[plot_genes],axis=0), 
                   columns = plot_genes, index = ranked.index)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE 
# Only the columns (genes) are clustered; the row order stays the GEP2 ranking
method = 'average' # average, single centroid/euclidean
metric = 'cosine' # cosine
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[:,cl]

# Smooth along the ranked cells with a centred triangular rolling sum, then
# pad the edge rows (where the window is incomplete) with the nearest full row.
window = 20
mat = mat.rolling(window, win_type='triang',center = True).sum()
half_window = int(window/2)
for ind in np.arange(half_window):
    mat.iloc[ind] = mat.iloc[half_window]    
# Start at 1: -0 is 0, so ind == 0 would overwrite the FIRST row with a row
# taken from the end of the ranking.
for ind in np.arange(1, half_window):
    mat.iloc[-ind] = mat.iloc[-half_window]

# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(4,10))

# ADD MATRIX (tick labels are hidden)
axmatrix = fig.add_axes([0.12,0.1,0.4,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-10,vmax=10)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
axmatrix.set_xticklabels([]);
axmatrix.grid(False)
plt.tick_params(axis='both', size=0)

# SAVE FIGURE
figure_label = 'H2AK119Ub_Heatmap_UPDATED_average_cosine'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/GEP2_Ranked_Heatmaps/' + figure_label 
    
d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# LOAD THE H2AK119Ub (BROAD) TARGET LIST: headerless file, first column named 'Genes'
H2A_target_genes = pd.read_csv(DATA_PATH+'H2AK119Ub_in_921_broad_3_23.txt', header=None)
H2A_target_genes.columns = ['Genes']

# LOAD GENESETS
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets,header='infer')

# SELECT THE EXPRESSION MATRIX
# QUERY is bound *before* plot_genes is filtered, so the filter never runs against
# a QUERY left behind by a previously executed cell.
subset_type = 'TUMOR'
datatype = 'INDF_{}'.format(subset_type)
QUERY = globals()[datatype]

# KEEP ONLY TARGET GENES THAT ARE COLUMNS OF QUERY
query_columns = set(QUERY.columns)
plot_genes = H2A_target_genes['Genes'].dropna().values.tolist()
plot_genes = [x for x in plot_genes if x in query_columns]

# SCORE EACH CELL BY ITS MEAN OVER THE 'Castle 2' GENESET (STORED AS GEP2_RANK)
# NOTE: this adds a column to the frame QUERY refers to.
rank_by = 'Castle 2'
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in query_columns]
vals = QUERY[genes]
QUERY['GEP2_RANK'] = np.nanmean(vals,axis=1)

# CONSTRUCT HEATMAP DATA: ROWS ORDERED BY GEP2_RANK, zscore APPLIED ALONG AXIS 0
ranked = QUERY.sort_values(by=['GEP2_RANK'])
heatmap_data = pd.DataFrame(data = zscore(ranked[plot_genes],axis=0),
                   columns = plot_genes, index = ranked.index)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE (columns only; row order is the GEP2_RANK order)
method = 'average' # average, single centroid/euclidean
metric = 'cosine' # cosine
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[:,cl]

# SMOOTH ALONG THE RANKED ROWS WITH A CENTERED TRIANGULAR WINDOW
window = 20
mat = mat.rolling(window, win_type='triang',center = True).sum()
half_window = int(window/2)
# Pad the leading edge rows with the row at position half_window.
for ind in np.arange(half_window):
    mat.iloc[ind] = mat.iloc[half_window]
# Pad the trailing edge rows. Start at 1: iloc[-0] is iloc[0], so starting at 0
# would overwrite the first row with a value taken from the tail.
for ind in np.arange(1, half_window):
    mat.iloc[-ind] = mat.iloc[-half_window]

# VIEW CLUSTERED HEATMAP
fig = plt.figure(figsize=(4,10))

# ADD MATRIX (tick labels suppressed on both axes)
axmatrix = fig.add_axes([0.12,0.1,0.4,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-10,vmax=10)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([])
axmatrix.set_xticklabels([])
axmatrix.grid(False)
plt.tick_params(axis='both', size=0)

# SAVE FIGURE
# The label is a plain literal (no placeholders), so it is not passed through str.format.
figure_label = 'H2AK119Ub_Broad_Heatmap_UPDATED_average_cosine'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/GEP2_Ranked_Heatmaps/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# Ratio of len(plot_genes) to the number of rows in H2A_target_genes.
# NOTE(review): presumably the fraction of target genes found in QUERY -- this is only
# meaningful if plot_genes was last set from H2A_target_genes; confirm cell order.
len(plot_genes) / len(H2A_target_genes)

###### GEP2 RANKED HEATMAP FOR NEW GENESET

In [None]:
# COMPUTE A PER-CELL GEP2 SCORE (row-wise nanmedian over the 'Castle 2' genes found in QUERY)
meta = 'Nearest_Archetype' # alternative: 'Archetype_Soft_Cluster'
subset_type = 'TUMOR'

# Look the per-subset frames up by name instead of exec()-ing an assignment string.
QUERY = globals()['INDF_{}'.format(subset_type)]
META = globals()['METADATA_{}'.format(subset_type)]

rank_by = 'Castle 2' # alternative: 'Class 2 Onken v2'
genes = np.unique(genesets[rank_by].values.ravel().tolist())
# Build the column set once rather than a fresh list per membership test.
query_columns = set(QUERY.columns)
detected_genes = [gene for gene in genes if gene in query_columns]

vals = QUERY[detected_genes]
SCORE = np.nanmedian(vals,axis=1)

In [None]:
# LOAD THE TARGET GENE LIST: tab-separated, headerless, first column named 'genes'
target_genes = pd.read_csv(DATA_PATH+'829_PRE_92_1_geneset.txt',
                                sep='\t', header=None, names=['genes'])

# LOAD GENESETS
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets,header='infer')

# SELECT THE EXPRESSION MATRIX
# QUERY is bound *before* plot_genes is filtered, so the filter never runs against
# a QUERY left behind by a previously executed cell.
subset_type = 'TUMOR'
datatype = 'INDF_{}'.format(subset_type)
QUERY = globals()[datatype]

# KEEP ONLY TARGET GENES THAT ARE COLUMNS OF QUERY
query_columns = set(QUERY.columns)
plot_genes = target_genes['genes'].dropna().values.tolist()
plot_genes = [x for x in plot_genes if x in query_columns]

# SCORE EACH CELL BY ITS MEAN OVER THE 'Castle 2' GENESET (STORED AS GEP2_RANK)
# NOTE: this adds a column to the frame QUERY refers to.
rank_by = 'Castle 2'
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in query_columns]
vals = QUERY[genes]
QUERY['GEP2_RANK'] = np.nanmean(vals,axis=1)

# CONSTRUCT HEATMAP DATA: ROWS ORDERED BY GEP2_RANK, zscore APPLIED ALONG AXIS 0
ranked = QUERY.sort_values(by=['GEP2_RANK'])
heatmap_data = pd.DataFrame(data = zscore(ranked[plot_genes],axis=0),
                   columns = plot_genes, index = ranked.index)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE (columns only; row order is the GEP2_RANK order)
method = 'average' # average, single centroid/euclidean
metric = 'euclidean' # cosine
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[:,cl]

# SMOOTH ALONG THE RANKED ROWS WITH A CENTERED TRIANGULAR WINDOW
window = 20
mat = mat.rolling(window, win_type='triang',center = True).sum()
half_window = int(window/2)
# Pad the leading edge rows with the row at position half_window.
for ind in np.arange(half_window):
    mat.iloc[ind] = mat.iloc[half_window]
# Pad the trailing edge rows. Start at 1: iloc[-0] is iloc[0], so starting at 0
# would overwrite the first row with a value taken from the tail.
for ind in np.arange(1, half_window):
    mat.iloc[-ind] = mat.iloc[-half_window]

# VIEW CLUSTERED HEATMAP
fig = plt.figure(figsize=(4,10))

# ADD MATRIX (tick labels suppressed on both axes)
axmatrix = fig.add_axes([0.12,0.1,0.4,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-10,vmax=10)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([])
axmatrix.set_xticklabels([])
axmatrix.grid(False)
plt.tick_params(axis='both', size=0)

# SAVE FIGURE
# The label is a plain literal (no placeholders), so it is not passed through str.format.
figure_label = 'average_euclidean_heatmap_NEW_GENESET'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/GEP2_Ranked_Heatmaps/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

###### HEAT MAP FOR NC NFkB GENESETS

In [None]:
# BIND QUERY / META TO THE TUMOR SUBSET (no ranking happens in this cell)
# The frames are looked up by name instead of exec()-ing an assignment string.
subset_type = 'TUMOR'
QUERY = globals()['INDF_{}'.format(subset_type)]
META = globals()['METADATA_{}'.format(subset_type)]

In [None]:
# LOAD THE NC NFkB GENESET TABLE (tab-separated, first row is the header)
target_genes = pd.read_csv(DATA_PATH+'Genesets_NC_NFkB.txt',
                                sep='\t', header=0)

# LOAD GENESETS
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets,header='infer')

# SELECT THE EXPRESSION MATRIX
# QUERY is bound *before* plot_genes is filtered, so the filter never runs against
# a QUERY left behind by a previously executed cell.
subset_type = 'TUMOR'
datatype = 'INDF_{}'.format(subset_type)
QUERY = globals()[datatype]

# KEEP ONLY GENES OF THE 'CIN-responsive-NC-NFkB' COLUMN THAT ARE COLUMNS OF QUERY
query_columns = set(QUERY.columns)
plot_genes = target_genes['CIN-responsive-NC-NFkB'].dropna().values.tolist()
plot_genes = [x for x in plot_genes if x in query_columns]

# SCORE EACH CELL BY ITS MEAN OVER THE 'Castle 2' GENESET (STORED AS GEP2_RANK)
# NOTE: this adds a column to the frame QUERY refers to.
rank_by = 'Castle 2'
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in query_columns]
vals = QUERY[genes]
QUERY['GEP2_RANK'] = np.nanmean(vals,axis=1)

# CONSTRUCT HEATMAP DATA: ROWS ORDERED BY GEP2_RANK, zscore APPLIED ALONG AXIS 0
ranked = QUERY.sort_values(by=['GEP2_RANK'])
heatmap_data = pd.DataFrame(data = zscore(ranked[plot_genes],axis=0),
                   columns = plot_genes, index = ranked.index)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE (columns only; row order is the GEP2_RANK order)
method = 'average' # average, single centroid/euclidean
metric = 'euclidean' # cosine
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[:,cl]

# SMOOTH ALONG THE RANKED ROWS WITH A CENTERED TRIANGULAR WINDOW
window = 20
mat = mat.rolling(window, win_type='triang',center = True).sum()
half_window = int(window/2)
# Pad the leading edge rows with the row at position half_window.
for ind in np.arange(half_window):
    mat.iloc[ind] = mat.iloc[half_window]
# Pad the trailing edge rows. Start at 1: iloc[-0] is iloc[0], so starting at 0
# would overwrite the first row with a value taken from the tail.
for ind in np.arange(1, half_window):
    mat.iloc[-ind] = mat.iloc[-half_window]

# VIEW CLUSTERED, COLUMN-LABELED HEATMAP
fig = plt.figure(figsize=(4,10))

# ADD MATRIX (one x tick label per column, no y tick labels)
axmatrix = fig.add_axes([0.12,0.1,0.4,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-10,vmax=10)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([])
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 6,fontname='Arial')
axmatrix.grid(False)
plt.tick_params(axis='y', size=0)

# SAVE FIGURE
# The label is a plain literal (no placeholders), so it is not passed through str.format.
figure_label = 'average_euclidean_heatmap_NC_NFkB'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/GEP2_Ranked_Heatmaps/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

###### HEATMAP FOR NEW GENESETS 4/15 and 5/5

In [None]:
# BIND QUERY / META TO THE TUMOR SUBSET (no ranking happens in this cell)
# The frames are looked up by name instead of exec()-ing an assignment string.
subset_type = 'TUMOR'
QUERY = globals()['INDF_{}'.format(subset_type)]
META = globals()['METADATA_{}'.format(subset_type)]

In [None]:
# Display the current plot_genes list (whatever the most recently executed cell left in it).
plot_genes

In [None]:
# LOAD GENESETS (comma-separated, first row is the header; each column is iterated below)
target_genes = pd.read_csv(DATA_PATH+'Genesets_4_15.csv',
                                sep=',', header=0)

# Rank QUERY by GEP2: row-wise mean over the 'Castle 2' genes, then sort rows by it.
# Relies on `genesets` and `QUERY` from previously executed cells.
rank_by = 'Castle 2'
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
query_columns = set(QUERY.columns)
genes = [gene for gene in genes if gene in query_columns]
vals = QUERY[genes]
QUERY['GEP2_RANK'] = np.nanmean(vals,axis=1)
QUERY = QUERY.sort_values(by=['GEP2_RANK'])

# Column set of the ranked frame, built once instead of once per candidate gene.
ranked_columns = set(QUERY.columns)

# Iterate through each geneset:
for col in target_genes.columns:
    print(col)
    plot_genes = target_genes[col].dropna().values.tolist()
    plot_genes = [x for x in plot_genes if x in ranked_columns]

    # Nothing to plot when none of the genes is a column of QUERY.
    if not plot_genes:
        continue

    # CONSTRUCT HEATMAP DATA (zscore along axis 0; columns containing any NaN are dropped)
    heatmap_data = pd.DataFrame(data = zscore(QUERY[plot_genes],axis=0),
                       columns = plot_genes, index = QUERY.index)
    heatmap_data = heatmap_data.dropna(axis=1, how='any')
    yticks = heatmap_data.index
    xticks = heatmap_data.columns

    # ONE FIGURE PER (linkage method, distance metric) PAIR
    for method, metric in [('average', 'euclidean'), ('average', 'cosine'), ('centroid', 'euclidean')]:
        # LINKAGE (columns only; row order is the GEP2_RANK order)
        linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
        col_linkage = deepcopy(linkage)
        cl = hc.leaves_list(col_linkage)
        mat = heatmap_data.iloc[:,cl]

        # SMOOTH ALONG THE RANKED ROWS WITH A CENTERED TRIANGULAR WINDOW
        window = 20
        mat = mat.rolling(window, win_type='triang',center = True).sum()
        half_window = int(window/2)
        # Pad the leading edge rows with the row at position half_window.
        for ind in np.arange(half_window):
            mat.iloc[ind] = mat.iloc[half_window]
        # Pad the trailing edge rows. Start at 1: iloc[-0] is iloc[0], so starting
        # at 0 would overwrite the first row with a value taken from the tail.
        for ind in np.arange(1, half_window):
            mat.iloc[-ind] = mat.iloc[-half_window]

        # VIEW CLUSTERED HEATMAP
        fig = plt.figure(figsize=(4,10))

        # ADD MATRIX (tick labels suppressed on both axes)
        axmatrix = fig.add_axes([0.12,0.1,0.4,0.6])
        im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-10,vmax=10)
        labels = list(mat.columns)
        axmatrix.xaxis.set_ticks_position('bottom')
        axmatrix.set_yticklabels([])
        axmatrix.set_xticklabels([])
        #xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 6,fontname='Arial')
        axmatrix.grid(False)
        plt.tick_params(axis='both', size=0)

        # SAVE FIGURE (PNG only), named <geneset column>_<method>_<metric>
        figure_label = col+'_'+method+'_'+metric
        fn = '/workdir/uvmel_project/figures/Revision_2_Figures/Genesets_4_15_Heatmaps/' + figure_label

        d = os.path.dirname(fn)
        os.makedirs(d, exist_ok=True)

        fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
        #fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
        print(fn)

In [None]:
# BIND QUERY / META TO THE TUMOR SUBSET (no ranking happens in this cell)
# The frames are looked up by name instead of exec()-ing an assignment string.
subset_type = 'TUMOR'
QUERY = globals()['INDF_{}'.format(subset_type)]
META = globals()['METADATA_{}'.format(subset_type)]

In [None]:
# READ THE 5/5 GENESET TABLE (comma-separated, first row is the header)
target_genes = pd.read_csv(DATA_PATH + 'Genesets_5_5.csv', sep=',', header=0)

# SCORE ROWS OF QUERY BY THE MEAN OVER THE 'Castle 2' GENES, THEN SORT BY THAT SCORE
rank_by = 'Castle 2'
genesets_include = [rank_by]
candidate_genes = np.unique(genesets[genesets_include].values.ravel().tolist())
known_columns = set(QUERY.columns)
genes = [gene for gene in candidate_genes if gene in known_columns]
vals = QUERY[genes]
QUERY['GEP2_RANK'] = np.nanmean(vals, axis=1)
QUERY = QUERY.sort_values(by=['GEP2_RANK'])

In [None]:
# For every geneset column, print how many genes it lists and how many are columns of QUERY.
print('MSK Data')
available_genes = QUERY.columns.values.tolist()
for col in target_genes.columns:
    print(col,':')
    listed_genes = target_genes[col].dropna().values.tolist()
    print('Number of genes in set:',len(listed_genes))
    plot_genes = [x for x in listed_genes if x in available_genes]
    print('Number of genes in set in data:',len(plot_genes))
    print('')

In [None]:
# LOAD GENESETS (comma-separated, first row is the header; each column is iterated below)
target_genes = pd.read_csv(DATA_PATH+'Genesets_5_5.csv',
                                sep=',', header=0)

# Rank QUERY by GEP2: row-wise mean over the 'Castle 2' genes, then sort rows by it.
# Relies on `genesets` and `QUERY` from previously executed cells.
rank_by = 'Castle 2'
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
query_columns = set(QUERY.columns)
genes = [gene for gene in genes if gene in query_columns]
vals = QUERY[genes]
QUERY['GEP2_RANK'] = np.nanmean(vals,axis=1)
QUERY = QUERY.sort_values(by=['GEP2_RANK'])

# Column set of the ranked frame, built once instead of once per candidate gene.
ranked_columns = set(QUERY.columns)

# Iterate through each geneset:
for col in target_genes.columns:
    print(col)
    plot_genes = target_genes[col].dropna().values.tolist()
    plot_genes = [x for x in plot_genes if x in ranked_columns]

    # Nothing to plot when none of the genes is a column of QUERY.
    if not plot_genes:
        continue

    # CONSTRUCT HEATMAP DATA (zscore along axis 0; columns containing any NaN are dropped)
    heatmap_data = pd.DataFrame(data = zscore(QUERY[plot_genes],axis=0),
                       columns = plot_genes, index = QUERY.index)
    heatmap_data = heatmap_data.dropna(axis=1, how='any')
    yticks = heatmap_data.index
    xticks = heatmap_data.columns

    # ONE FIGURE PER (linkage method, distance metric) PAIR
    for method, metric in [('average', 'euclidean'), ('average', 'cosine'), ('centroid', 'euclidean')]:
        # LINKAGE (columns only; row order is the GEP2_RANK order)
        linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
        col_linkage = deepcopy(linkage)
        cl = hc.leaves_list(col_linkage)
        mat = heatmap_data.iloc[:,cl]

        # SMOOTH ALONG THE RANKED ROWS WITH A CENTERED TRIANGULAR WINDOW
        window = 20
        mat = mat.rolling(window, win_type='triang',center = True).sum()
        half_window = int(window/2)
        # Pad the leading edge rows with the row at position half_window.
        for ind in np.arange(half_window):
            mat.iloc[ind] = mat.iloc[half_window]
        # Pad the trailing edge rows. Start at 1: iloc[-0] is iloc[0], so starting
        # at 0 would overwrite the first row with a value taken from the tail.
        for ind in np.arange(1, half_window):
            mat.iloc[-ind] = mat.iloc[-half_window]

        # VIEW CLUSTERED HEATMAP
        fig = plt.figure(figsize=(4,10))

        # ADD MATRIX (tick labels suppressed on both axes)
        axmatrix = fig.add_axes([0.12,0.1,0.4,0.6])
        im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-10,vmax=10)
        labels = list(mat.columns)
        axmatrix.xaxis.set_ticks_position('bottom')
        axmatrix.set_yticklabels([])
        axmatrix.set_xticklabels([])
        #xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 6,fontname='Arial')
        axmatrix.grid(False)
        plt.tick_params(axis='both', size=0)

        # SAVE FIGURE (PNG only), named <geneset column>_<method>_<metric>
        figure_label = col+'_'+method+'_'+metric
        fn = '/workdir/uvmel_project/figures/Revision_2_Figures/Genesets_5_5_Heatmaps/' + figure_label

        d = os.path.dirname(fn)
        os.makedirs(d, exist_ok=True)

        fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
        #fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
        print(fn)

###### GEP2 RANK CURVE

In [None]:
# GENES CARRIED ALONGSIDE THE SCORE (hard-coded; every one must be a column of QUERY,
# otherwise the column selection below raises KeyError)
plot_genes = ['IRF3', 'RELA', 'RELB', 'MB21D1', 'IKBKB', 'NFKB1', 'TMEM173', 'STAT2', 'CXCL10', 'NFKB2',
              'CCL5', 'TBK1', 'JAK1', 'JAK2', 'IRF9', 'TYK2', 'IRF7', 'IRF1', 'STAT1'
             ]

# LOAD GENESETS
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets,header='infer')

# SELECT THE EXPRESSION MATRIX (looked up by name instead of exec()-ing an assignment string)
subset_type = 'TUMOR'
datatype = 'INDF_{}'.format(subset_type)
QUERY = globals()[datatype]

# SCORE EACH CELL BY ITS MEAN OVER THE 'Castle 2' GENESET (STORED AS GEP2_RANK)
# NOTE: this adds a column to the frame QUERY refers to.
rank_by = 'Castle 2'
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
query_columns = set(QUERY.columns)
genes = [gene for gene in genes if gene in query_columns]
vals = QUERY[genes]
QUERY['GEP2_RANK'] = np.nanmean(vals,axis=1)

# ROWS ORDERED BY GEP2_RANK (sorted once and reused for both data and index)
ranked = QUERY.sort_values(by=['GEP2_RANK'])
heatmap_data = pd.DataFrame(data = ranked[plot_genes +['GEP2_RANK']],
                   columns = plot_genes +['GEP2_RANK'], index = ranked.index)

# SORTED SCORE AGAINST RANK POSITION
y = heatmap_data['GEP2_RANK'].values
x = np.arange(len(y))

fig = plt.figure(figsize=(10,4))
ax = plt.gca()

# Filled area between 0 and the score curve.
ax.fill_between(x, 0, y,facecolor='#E0E0E0', interpolate=True)
plt.xlim((0,x.max()))
plt.ylim((0,y.max()))
sns.despine()


# SAVE FIGURE
# The label is a plain literal (no placeholders), so it is not passed through str.format;
# this also removes the dependency on `meta`, `method` and `metric` from other cells.
figure_label = 'PRC1_Targets_UPDATED_GEP2_RANK_curve'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/GEP2_Ranked_Heatmaps/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

### PRC1 TARGET GENE DEREPRESSION IN HIGH RISK UM

#### ORIGINAL PRC1 TARGET GENES

In [None]:
# READ 'mRNA_BAP1_DELETED_INTACT.csv' INTO znormalized_data
# NOTE(review): the variable name suggests z-normalized expression values rather than
# genesets, although the path variable is still called path_to_genesets -- confirm.
path_to_genesets = DATA_PATH + 'mRNA_BAP1_DELETED_INTACT.csv'
znormalized_data = pd.read_csv(
    path_to_genesets,
    header='infer',
)

In [None]:
# READ 'mRNA_cell_lines.csv', INDEX IT BY 'Gene Symbol', TRANSPOSE, AND REPLACE NaN WITH 0
path_to_genesets = DATA_PATH + 'mRNA_cell_lines.csv'
znormalized_data_cells = (
    pd.read_csv(path_to_genesets, header='infer')
    .set_index('Gene Symbol')
    .T
    .fillna(0)
)

# KEEP ROWS / COLUMNS WHOSE SUM IS NON-ZERO
# NOTE(review): a non-zero-sum test also removes rows/columns whose non-zero values
# happen to cancel exactly; confirm that only all-zero ones are meant to be dropped.
rows_nonzero_sum = znormalized_data_cells.sum(axis=1) != 0
cols_nonzero_sum = znormalized_data_cells.sum(axis=0) != 0
znormalized_data_cells = znormalized_data_cells.loc[rows_nonzero_sum, cols_nonzero_sum]
znormalized_data_cells.head()

In [None]:
# Use the BAP1 GISTIC copy-number column as the row index. The frame is modified in
# place, so re-running this cell without reloading fails: the column is no longer present.
znormalized_data.set_index('BAP1: Putative copy-number alterations from GISTIC', inplace=True)
znormalized_data.head()

In [None]:
# REPLACE NaN WITH 0, THEN KEEP ROWS / COLUMNS WHOSE SUM IS NON-ZERO
# NOTE(review): a non-zero-sum test also removes rows/columns whose non-zero values
# happen to cancel exactly; confirm that only all-zero ones are meant to be dropped.
znormalized_data.fillna(0, inplace=True)
rows_nonzero_sum = znormalized_data.sum(axis=1) != 0
cols_nonzero_sum = znormalized_data.sum(axis=0) != 0
znormalized_data = znormalized_data.loc[rows_nonzero_sum, cols_nonzero_sum]

In [None]:
# Copy of znormalized_data with rows ordered by index label; preview the first rows.
znormalized_data_sorted = znormalized_data.sort_index(
    axis=0,
)
znormalized_data_sorted.head()

In [None]:
# Display the (rows, columns) shape of znormalized_data.
znormalized_data.shape

In [None]:
# ROW- AND COLUMN-CLUSTERED HEATMAP OF znormalized_data WITH A ROW COLOR STRIP
heatmap_data = znormalized_data

# CONSTRUCT HEATMAP DATA
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE (columns, then rows; both leaf orders are applied to the matrix)
method = 'average' # average, single centroid/euclidean
metric = 'correlation' # cosine
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
cl = hc.leaves_list(col_linkage)
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
rl = hc.leaves_list(row_linkage)
mat = heatmap_data.iloc[rl,cl]

# VIEW CLUSTERED HEATMAP
fig = plt.figure(figsize=(20,3))

# ADD MATRIX (no tick labels, no spines, no tick marks)
axmatrix = fig.add_axes([0.12,0.1,0.4,0.8])
colors = [(1,1,1), np.divide(tuple(hex('FF0000').rgb),255)] # not used further in this cell
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-2,vmax=2)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([])
#xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 6,fontname='Arial')
xtick = plt.xticks([])
axmatrix.grid(False)
for side in ('top', 'right', 'bottom', 'left'):
    axmatrix.spines[side].set_visible(False)
axmatrix.tick_params(axis=u'both', which=u'both',length=0)

# ROW COLORS: MAP EACH ROW INDEX LABEL OF THE CLUSTERED MATRIX TO A COLOR
lut = {
                 'Shallow Deletion':'red',
                 'Diploid': 'blue'
    }

ind1 = mat.index
row_colors1 = pd.Series(ind1).map(lut)

# ADD ROW COLOR STRIP: ONE RECTANGLE PER ROW, STACKED BOTTOM-UP IN A SINGLE COLUMN
ax1 = fig.add_axes([0.08,0.1,0.02,0.8]) # [x0,y0,width,height]
n_rows = len(row_colors1)
for row_pos, row_color in enumerate(row_colors1):
    ax1.add_patch(patches.Rectangle((0, row_pos / n_rows), 1, 1 / n_rows, color=row_color))
plt.axis('off')


# ADD COLORBAR
axcolor = fig.add_axes([0.525,0.1,0.007,0.6])
cbar = plt.colorbar(im, cax=axcolor)

# SAVE FIGURE
# The label is a plain literal (no placeholders), so it is not passed through str.format;
# this also removes the dependency on `meta` and `subset_type` from other cells.
figure_label = '/PRC1_Targets_Derepressed_highriskUM_ORIGINAL'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/PRC1_Target_Genes/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# ROW- AND COLUMN-CLUSTERED HEATMAP OF THE CELL-LINE TABLE WITH ROW COLOR STRIP AND DENDROGRAM
# NOTE: this rebinds the notebook-wide QUERY to the cell-line table.
QUERY = znormalized_data_cells
heatmap_data = pd.DataFrame( data = zscore(QUERY,axis=0), columns = QUERY.columns, index = QUERY.index)

# CONSTRUCT HEATMAP DATA
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE (columns, then rows; both leaf orders are applied to the matrix)
method = 'average' # average, single centroid/euclidean
metric = 'correlation' # cosine
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
cl = hc.leaves_list(col_linkage)
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
rl = hc.leaves_list(row_linkage)
mat = heatmap_data.iloc[rl,cl]

# VIEW CLUSTERED HEATMAP AND DENDROGRAM
fig = plt.figure(figsize=(20,2))

# ADD MATRIX (no tick labels, no spines, no tick marks)
axmatrix = fig.add_axes([0.12,0.1,0.4,0.6])
colors = [(1,1,1), np.divide(tuple(hex('FF0000').rgb),255)] # not used further in this cell
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-2,vmax=2)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([])
#xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 6,fontname='Arial')
xtick = plt.xticks([])
axmatrix.grid(False)
for side in ('top', 'right', 'bottom', 'left'):
    axmatrix.spines[side].set_visible(False)
axmatrix.tick_params(axis=u'both', which=u'both',length=0)

# ROW COLORS: MAP EACH ROW INDEX LABEL OF THE CLUSTERED MATRIX TO A COLOR
lut = {
                 'DM1_2592':'blue',
                 'DM2_2592': 'blue',
                 'S_2538_22':'red',
                 'S_2538_23': 'red'
    }

ind1 = mat.index
row_colors1 = pd.Series(ind1).map(lut)

# ADD ROW COLOR STRIP: ONE RECTANGLE PER ROW, STACKED BOTTOM-UP IN A SINGLE COLUMN
ax1 = fig.add_axes([0.08,0.1,0.035,0.6]) # [x0,y0,width,height]
n_rows = len(row_colors1)
for row_pos, row_color in enumerate(row_colors1):
    ax1.add_patch(patches.Rectangle((0, row_pos / n_rows), 1, 1 / n_rows, color=row_color))
plt.axis('off')

# ADD DENDROGRAM OF THE ROW LINKAGE
# Uses `hc`, the scipy.cluster.hierarchy alias this cell already relies on for linkage,
# instead of a second alias (`sch`) for the same module.
ax2 = fig.add_axes([0.00,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = hc.dendrogram(row_linkage, orientation='left',color_threshold = 0, above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')

# ADD COLORBAR
axcolor = fig.add_axes([0.525,0.1,0.007,0.6])
cbar = plt.colorbar(im, cax=axcolor)

# SAVE FIGURE
# The label is a plain literal (no placeholders), so it is not passed through str.format;
# this also removes the dependency on `meta` and `subset_type` from other cells.
figure_label = '/PRC1_Targets_unsupervised_clustering_ORIGINAL'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/PRC1_Target_Genes/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

#### UPDATED PRC1 TARGET GENES

In [None]:
# READ 'TCGA_new PRC1 targets.csv' INTO znormalized_data (replaces any earlier table of that name)
path_to_genesets = DATA_PATH + 'TCGA_new PRC1 targets.csv'
znormalized_data = pd.read_csv(
    path_to_genesets,
    header='infer',
)

In [None]:
# READ '921vsMP38.csv', INDEX IT BY 'gene_name', TRANSPOSE, AND REPLACE NaN WITH 0
path_to_genesets = DATA_PATH + '921vsMP38.csv'
znormalized_data_cells = (
    pd.read_csv(path_to_genesets, header='infer')
    .set_index('gene_name')
    .T
    .fillna(0)
)

# KEEP ROWS / COLUMNS WHOSE SUM IS NON-ZERO
# NOTE(review): a non-zero-sum test also removes rows/columns whose non-zero values
# happen to cancel exactly; confirm that only all-zero ones are meant to be dropped.
rows_nonzero_sum = znormalized_data_cells.sum(axis=1) != 0
cols_nonzero_sum = znormalized_data_cells.sum(axis=0) != 0
znormalized_data_cells = znormalized_data_cells.loc[rows_nonzero_sum, cols_nonzero_sum]
znormalized_data_cells.head()

In [None]:
# Use the 'Chromosome 3' column as the row index and drop the 'Sample name' column.
# Both operations modify the frame in place, so re-running this cell without reloading
# fails: the columns are no longer present.
znormalized_data.set_index('Chromosome 3', inplace=True)
znormalized_data.drop(columns=['Sample name'], inplace=True)
znormalized_data.head()

In [None]:
# REPLACE NaN WITH 0, THEN KEEP ROWS / COLUMNS WHOSE SUM IS NON-ZERO
# NOTE(review): a non-zero-sum test also removes rows/columns whose non-zero values
# happen to cancel exactly; confirm that only all-zero ones are meant to be dropped.
znormalized_data.fillna(0, inplace=True)
rows_nonzero_sum = znormalized_data.sum(axis=1) != 0
cols_nonzero_sum = znormalized_data.sum(axis=0) != 0
znormalized_data = znormalized_data.loc[rows_nonzero_sum, cols_nonzero_sum]

In [None]:
# Copy of znormalized_data with rows ordered by index label; preview the first rows.
znormalized_data_sorted = znormalized_data.sort_index(
    axis=0,
)
znormalized_data_sorted.head()

In [None]:
# Display the (rows, columns) shape of znormalized_data.
znormalized_data.shape

In [None]:
# ROW- AND COLUMN-CLUSTERED HEATMAP OF znormalized_data WITH A ROW COLOR STRIP
# AND x TICK LABELS FOR A SELECTED SET OF GENES
heatmap_data = znormalized_data#_sorted

# CONSTRUCT HEATMAP DATA
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE (columns, then rows; both leaf orders are applied to the matrix)
method = 'average' # average, single centroid/euclidean
metric = 'correlation' # cosine
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
cl = hc.leaves_list(col_linkage)
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
rl = hc.leaves_list(row_linkage)
mat = heatmap_data.iloc[rl,cl]

# TICK POSITIONS / LABELS FOR THE GENES TO ANNOTATE (positions are column positions in mat)
genes_to_show = ['ZEB2', 'SLC1A4', 'CSPG4', 'FADS3', 'VLDLR', 'ASPH', 'LHFPL2', 'TRIO', 'CHST3', 'C2CD2', 'NT5E']
shown_columns = [(pos, gene) for pos, gene in enumerate(mat.columns.values) if gene in genes_to_show]
xtick_list = [pos for pos, _ in shown_columns]
xlabel_list = [gene for _, gene in shown_columns]

# VIEW CLUSTERED HEATMAP
fig = plt.figure(figsize=(20,3))

# ADD MATRIX (selected gene labels on x, no y labels, no spines)
axmatrix = fig.add_axes([0.12,0.1,0.4,0.7])
colors = [(1,1,1), np.divide(tuple(hex('FF0000').rgb),255)] # not used further in this cell
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-2,vmax=2)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([])
xtick = plt.xticks(xtick_list, xlabel_list, rotation = 90, fontsize = 6,fontname='Arial')
#xtick = plt.xticks([])
axmatrix.grid(False)
for side in ('top', 'right', 'bottom', 'left'):
    axmatrix.spines[side].set_visible(False)
axmatrix.tick_params(axis=u'both', which=u'both',length=3)

# ROW COLORS: MAP EACH ROW INDEX LABEL OF THE CLUSTERED MATRIX TO A COLOR
lut = {
                 'Monosomy 3':'red',
                 'Diploid': 'blue'
    }

ind1 = mat.index
row_colors1 = pd.Series(ind1).map(lut)

# ADD ROW COLOR STRIP: ONE RECTANGLE PER ROW, STACKED BOTTOM-UP IN A SINGLE COLUMN
ax1 = fig.add_axes([0.1,0.1,0.02,0.7]) # [x0,y0,width,height]
n_rows = len(row_colors1)
for row_pos, row_color in enumerate(row_colors1):
    ax1.add_patch(patches.Rectangle((0, row_pos / n_rows), 1, 1 / n_rows, color=row_color))
plt.axis('off')

# ADD COLORBAR
axcolor = fig.add_axes([0.525,0.1,0.0095,0.25])
cbar = plt.colorbar(im, cax=axcolor)

# SAVE FIGURE
# The label is a plain literal (no placeholders), so it is not passed through str.format;
# this also removes the dependency on `meta` and `subset_type` from other cells.
figure_label = 'PRC1_Targets_Derepressed_highriskUM_UPDATED'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/PRC1_Target_Genes/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# COLUMN-CLUSTERED HEATMAP OF THE CELL-LINE TABLE WITH A ROW COLOR STRIP
# AND x TICK LABELS FOR A SELECTED SET OF GENES (rows keep their original order)
# NOTE: this rebinds the notebook-wide QUERY to the cell-line table.
QUERY = znormalized_data_cells
heatmap_data = pd.DataFrame( data = zscore(QUERY,axis=0), columns = QUERY.columns, index = QUERY.index)

# CONSTRUCT HEATMAP DATA
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE (columns only; row clustering is disabled)
method = 'average' # average, single centroid/euclidean
metric = 'correlation' # cosine
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
cl = hc.leaves_list(col_linkage)
#linkage = hc.linkage(heatmap_data, method=method, metric = metric)
#row_linkage = deepcopy(linkage)
#rl = hc.leaves_list(row_linkage)
mat = heatmap_data.iloc[:,cl]

# TICK POSITIONS / LABELS FOR THE GENES TO ANNOTATE (positions are column positions in mat)
genes_to_show = ['ZEB2', 'SLC1A4', 'CSPG4', 'FADS3', 'VLDLR', 'ASPH', 'LHFPL2', 'TRIO', 'CHST3', 'C2CD2', 'NT5E']
shown_columns = [(pos, gene) for pos, gene in enumerate(mat.columns.values) if gene in genes_to_show]
xtick_list = [pos for pos, _ in shown_columns]
xlabel_list = [gene for _, gene in shown_columns]

# VIEW CLUSTERED HEATMAP
fig = plt.figure(figsize=(20,2))

# ADD MATRIX (selected gene labels on x, no y labels, no spines)
axmatrix = fig.add_axes([0.12,0.1,0.4,0.6])
colors = [(1,1,1), np.divide(tuple(hex('FF0000').rgb),255)] # not used further in this cell
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-2,vmax=2)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([])
xtick = plt.xticks(xtick_list, xlabel_list, rotation = 90, fontsize = 6,fontname='Arial')
#xtick = plt.xticks([])
axmatrix.grid(False)
for side in ('top', 'right', 'bottom', 'left'):
    axmatrix.spines[side].set_visible(False)
axmatrix.tick_params(axis=u'x', which=u'both',length=3)
axmatrix.tick_params(axis=u'y', which=u'both',length=0)

# ROW COLORS: MAP EACH ROW INDEX LABEL OF THE MATRIX TO A COLOR
lut = {
                 '92.1':'blue',
                 '92.1.1': 'blue',
                 'MP38':'red',
                 'MP38.1': 'red'
    }

ind1 = mat.index
row_colors1 = pd.Series(ind1).map(lut)

# ADD ROW COLOR STRIP: ONE RECTANGLE PER ROW, STACKED BOTTOM-UP IN A SINGLE COLUMN
ax1 = fig.add_axes([0.09,0.1,0.03,0.6]) # [x0,y0,width,height]
n_rows = len(row_colors1)
for row_pos, row_color in enumerate(row_colors1):
    ax1.add_patch(patches.Rectangle((0, row_pos / n_rows), 1, 1 / n_rows, color=row_color))
plt.axis('off')

# ADD COLORBAR
axcolor = fig.add_axes([0.525,0.1,0.007,0.6])
cbar = plt.colorbar(im, cax=axcolor)

# SAVE FIGURE
# The label is a plain literal (no placeholders), so it is not passed through str.format;
# this also removes the dependency on `meta` and `subset_type` from other cells.
figure_label = 'PRC1_Targets_unsupervised_clustering_UPDATED'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/PRC1_Target_Genes/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# Display the current heatmap_data frame (whatever the most recently executed cell left in it).
heatmap_data

## STING PATHWAY GENES HEATMAP

In [None]:
# GENES TO PLOT (hard-coded; every one must be a column of QUERY, otherwise the
# column selection below raises KeyError)
plot_genes = ['MB21D1','TMEM173','IRF3','RELA','RELB','IKBKB','NFKB1',
              'STAT2','CXCL10','NFKB2','CCL5',
              'TBK1','JAK1','JAK2','IRF9',
              'TYK2','IRF7','IRF1','STAT1',
              'PPARG','DDIT3','NUPR1','RAB3B','IGFBP4','LRRC8C',
              'TCP11L2','MAFK','NRG1','F2R','KRT19','CTGF','ZFC3H1']

print(len(plot_genes))

# LOAD GENESETS
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets,header='infer')

# SELECT THE EXPRESSION MATRIX (looked up by name instead of exec()-ing an assignment string)
subset_type = 'TUMOR'
datatype = 'INDF_{}'.format(subset_type)
QUERY = globals()[datatype]

# SCORE EACH CELL BY ITS MEAN OVER THE 'Castle 2' GENESET (STORED AS GEP2_RANK)
# NOTE: this adds a column to the frame QUERY refers to.
rank_by = 'Castle 2'
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
query_columns = set(QUERY.columns)
genes = [gene for gene in genes if gene in query_columns]
vals = QUERY[genes]
QUERY['GEP2_RANK'] = np.nanmean(vals,axis=1)

# CONSTRUCT HEATMAP DATA: ROWS ORDERED BY GEP2_RANK, zscore APPLIED ALONG AXIS 0
ranked = QUERY.sort_values(by=['GEP2_RANK'])
heatmap_data = pd.DataFrame(data = zscore(ranked[plot_genes],axis=0),
                   columns = plot_genes, index = ranked.index)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE (columns only; row order is the GEP2_RANK order)
method = 'average' # average, single centroid/euclidean
metric = 'cosine' # cosine
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[:,cl]
#mat = heatmap_data.iloc[:,:]

# SMOOTH ALONG THE RANKED ROWS WITH A CENTERED TRIANGULAR WINDOW
window = 20
mat = mat.rolling(window, win_type='triang',center = True).sum()
half_window = int(window/2)
# Pad the leading edge rows with the row at position half_window.
for ind in np.arange(half_window):
    mat.iloc[ind] = mat.iloc[half_window]
# Pad the trailing edge rows. Start at 1: iloc[-0] is iloc[0], so starting at 0
# would overwrite the first row with a value taken from the tail.
for ind in np.arange(1, half_window):
    mat.iloc[-ind] = mat.iloc[-half_window]

# VIEW CLUSTERED, COLUMN-LABELED HEATMAP
fig = plt.figure(figsize=(5,10))

# ADD MATRIX (one x tick label per column, no y tick labels)
axmatrix = fig.add_axes([0.12,0.1,0.5,0.7])
colors = [(1,1,1), np.divide(tuple(hex('FF0000').rgb),255)] # not used further in this cell
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-10,vmax=10)
labels = list(mat.columns)
#labels[3] = 'CGAS'
#labels[6] = 'STING'
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([])
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 5,fontname='Arial')
axmatrix.grid(False)
plt.tick_params(size=0)

# SAVE FIGURE
# The label is a plain literal (no placeholders), so it is not passed through str.format;
# this also removes the dependency on `meta` from other cells.
figure_label = 'STING_PATHWAY_Heatmap_EXTENDED_CLUSTERED_average_cosine'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/GEP2_Ranked_Heatmaps/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

## HARBOUR DATASET

In [None]:
# Folder holding the Harbour 2020 uveal melanoma files, below DATA_PATH.
DATA_PATH_HARBOUR = DATA_PATH + '/uveal_melanoma_harbour2020/'

# Load the AnnData object from the 'filter2.h5ad' file of the merged pipeline output.
ADATA_HARBOUR = sc.read(DATA_PATH_HARBOUR+'merged_pipeline_out/filter2.h5ad')

In [None]:
# Show the expression matrix held by the AnnData object as this cell's output.
ADATA_HARBOUR.X

In [None]:
# Number of cells per sample type (primary / metastatic) before tumor-cell filtering:
ADATA_HARBOUR.obs.sampleType.value_counts()

In [None]:
# Number of cells per patient sample (obs column 'sample') before tumor-cell filtering:
ADATA_HARBOUR.obs['sample'].value_counts()

###### FILTER TO JUST TUMOR CELLS

In [None]:
# Per-cell annotations for the Harbour dataset. header=0 discards the file's own
# header row, which is replaced by the three column names given here.
DF_HARBOUR_CELLTYPES = pd.read_csv(DATA_PATH_HARBOUR+'scUM_NComms_Fig1B_metadata_MD.csv', 
                                   names=['Barcode', 'Patient', 'cellType'], header=0)
DF_HARBOUR_CELLTYPES.set_index('Barcode', inplace=True)

# Barcodes whose annotated cell type contains the substring 'Tumor'.
tumor_barcodes = DF_HARBOUR_CELLTYPES.loc[DF_HARBOUR_CELLTYPES.cellType.map(lambda x: 'Tumor' in x)].index.values

# Rebuild each AnnData barcode as '<text before first "-">-<second "_" field of sample>'
# so that it can be compared with the annotation index. Iterating the index and the
# column together avoids integer lookups on a label-indexed Series.
adata_harbour_barcode_list = [obs_name.split('-')[0] + '-' + sample.split('_')[1]
                              for obs_name, sample in zip(ADATA_HARBOUR.obs.index,
                                                          ADATA_HARBOUR.obs['sample'])]

# Set membership is O(1) per cell; testing against the numpy array scanned it every time.
tumor_barcode_set = set(tumor_barcodes)
tumor_cell_indices = [obs_name
                      for obs_name, barcode in zip(ADATA_HARBOUR.obs.index.values,
                                                   adata_harbour_barcode_list)
                      if barcode in tumor_barcode_set]

In [None]:
# Number of annotated cells per cell type in the annotation table (all types, not only tumor).
DF_HARBOUR_CELLTYPES.cellType.value_counts()

In [None]:
# Keep only the observations selected as tumor cells (all genes retained).
# NOTE: this rebinds ADATA_HARBOUR, so the unfiltered object is no longer reachable by
# this name; reload it before re-running the filtering cells.
ADATA_HARBOUR = ADATA_HARBOUR[tumor_cell_indices,:]

In [None]:
# Get cellTypes list for tumor cells:
# Rebuild the annotation-style barcode for every remaining cell (same rule as in the
# tumor filtering step), iterating index and column together instead of Series[int].
adata_barcodes_cleaned = [obs_name.split('-')[0] + '-' + sample.split('_')[1]
                          for obs_name, sample in zip(ADATA_HARBOUR.obs.index,
                                                      ADATA_HARBOUR.obs['sample'])]

# Build the barcode -> cellType lookup in one pass. setdefault keeps the first entry
# per barcode, matching the previous `[0]` selection, and replaces one full-table
# boolean mask per cell.
celltype_by_barcode = {}
for barcode, cell_type in zip(DF_HARBOUR_CELLTYPES.index, DF_HARBOUR_CELLTYPES.cellType):
    celltype_by_barcode.setdefault(barcode, cell_type)

# A barcode missing from the annotations raises KeyError (previously IndexError).
cellTypes_list = [celltype_by_barcode[barcode] for barcode in adata_barcodes_cleaned]

In [None]:
# Number of tumor cells per sample type (primary / metastatic) after filtering:
ADATA_HARBOUR.obs.sampleType.value_counts()

In [None]:
# Show metastasis sample counts: cells per sample, restricted to rows whose
# sampleType equals 'metastatic'.
ADATA_HARBOUR.obs.loc[ADATA_HARBOUR.obs.sampleType.map(lambda x: x == 'metastatic')]['sample'].value_counts()

In [None]:
# Number of tumor cells per patient sample (obs column 'sample') after filtering:
ADATA_HARBOUR.obs['sample'].value_counts()

###### CREATE PANDAS DF TO WORK WITH

In [None]:
# Dense cells x genes table built from the AnnData matrix, with gene names as columns.
harbour_obs = ADATA_HARBOUR.obs
harbour_gene_names = ADATA_HARBOUR.var.index.values.tolist()
harbour_matrix = ADATA_HARBOUR.X.toarray()
DF_HARBOUR = pd.DataFrame(harbour_matrix, columns=harbour_gene_names)

# Attach the per-cell metadata as columns, then move all of it into a MultiIndex.
DF_HARBOUR['Sample ID'] = harbour_obs['sample_num'].values.tolist()
DF_HARBOUR['Legend'] = harbour_obs['sampleType'].values.tolist()
DF_HARBOUR['Patient'] = [sample.split('_')[1] for sample in harbour_obs['sample'].values.tolist()]
# 'Cell ID' is the positional row number of the new frame, not the original barcode.
DF_HARBOUR['Cell ID'] = DF_HARBOUR.index.values.tolist()
DF_HARBOUR['cellType'] = cellTypes_list

harbour_index_levels = ['Sample ID', 'Legend', 'Patient', 'Cell ID', 'cellType']
DF_HARBOUR.set_index(harbour_index_levels, inplace=True)

In [None]:
# Number of cells per annotated cell type in the tumor-only dataframe.
DF_HARBOUR.index.get_level_values('cellType').value_counts()

In [None]:
# Preview the first rows to check the MultiIndex and gene columns.
DF_HARBOUR.head()

###### FILTER OUT GENES THAT AREN'T EXPRESSED IN ANY CELLS

In [None]:
# Keep only gene columns with at least one non-zero entry across the cells.
DF_HARBOUR = DF_HARBOUR.loc[:, (DF_HARBOUR != 0).any(axis=0)]

###### GET DFS OF PRIMARY AND OF METASTATIC TUMOR CELLS:

In [None]:
# Split the cells by sample type, stored in the 'Legend' level (position 1) of the MultiIndex.
harbour_sample_type = DF_HARBOUR.index.get_level_values('Legend')
DF_HARBOUR_PRIMARY = DF_HARBOUR.loc[harbour_sample_type == 'primary']
DF_HARBOUR_METASTASIS = DF_HARBOUR.loc[harbour_sample_type == 'metastatic']

###### WRITE DATAFRAMES TO h5 FOR CELLASSIGN

In [None]:
# Handle on the Harbour HDF5 store; later cells call its .save() and .ls() methods.
# NOTE(review): whether seqc.H5 creates the file when it is missing is not visible here; confirm.
h5_data_harbour = seqc.H5(DATA_PATH_HARBOUR+'UVMEL_HARBOUR.h5')

#### NORMALIZE HARBOUR DATA

In [None]:
# Library-size normalisation: divide every cell (row) by its total count, then
# rescale all cells to the median total.
molecule_sums = DF_HARBOUR.sum(axis=1)
median_molecule_sum = np.median(molecule_sums)

NDF_HARBOUR = DF_HARBOUR.div(molecule_sums, axis=0) * median_molecule_sum

In [None]:
# Write the normalised table of all tumor cells to the Harbour HDF5 file under key 'NDF_TUMOR'.
NDF_HARBOUR.to_hdf(DATA_PATH_HARBOUR+'UVMEL_HARBOUR.h5', key='NDF_TUMOR')

In [None]:
# Same library-size normalisation, restricted to primary tumor cells: the median
# is taken over primary cells only.
molecule_sums_PRIMARY = DF_HARBOUR_PRIMARY.sum(axis=1)
median_molecule_sum_PRIMARY = np.median(molecule_sums_PRIMARY)

NDF_HARBOUR_PRIMARY = DF_HARBOUR_PRIMARY.div(molecule_sums_PRIMARY, axis=0) * median_molecule_sum_PRIMARY

In [None]:
# Write the normalised primary-cell table to the same HDF5 file under key 'NDF_TUMOR_PRIMARY'.
NDF_HARBOUR_PRIMARY.to_hdf(DATA_PATH_HARBOUR+'UVMEL_HARBOUR.h5', key='NDF_TUMOR_PRIMARY')

#### PERFORM IMPUTATION ON NDF_HARBOUR

###### FILTER LOW ABUNDANCE GENES

In [None]:
# FIT BIMODAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF THE SECOND MODE
plt.figure(figsize = (10,3))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.7, hspace=0.7) # set the spacing between axes. 

# (4) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(DF_HARBOUR.values > 0,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
# log(n) <= 0 means n <= 1: flag genes detected in at most one cell
rmv_genes1 = np.where(num_cells_per_gene<=0)[0]

ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')

# rmv_genes* hold column positions, so test for emptiness with .size:
# .any() is False when the only flagged position is 0.
if rmv_genes1.size:
    plt.hist(num_cells_per_gene[rmv_genes1], bins, alpha=1, label='remove')

#ax.set_axis_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)')
plt.grid(True)
sns.despine()

# (5) PLOT LOG-LOG COUNTS PER GENE - REMOVE LOW ABUNDANCE GENES
log_counts_per_gene = np.log(np.log(np.sum(DF_HARBOUR.values,axis=0)))
log_counts_per_gene[(np.isinf(log_counts_per_gene)) | (np.isnan(log_counts_per_gene))] = 0
data = log_counts_per_gene

ax = plt.subplot(gs1[1])
bins = np.linspace(data.min(), data.max()*0.95, 100)
y,x,_=hist(data,bins,alpha=.3,label='data')

# Fit the two-mode model to the histogram, evaluated at the bin centres.
x=(x[1:]+x[:-1])/2 
expected=(0,.2,3500,1.5,.2,500)  # initial guess for the six model parameters
params,cov=curve_fit(bimodal,x,y,expected)
sigma=sqrt(diag(cov))
plot(x,bimodal(x,*params),color='red',lw=3,label='model')

mu1 = params[0]
std1 = params[1]
mu2 = params[3]
std2 = params[4]
# Flag genes more than 4 std below the mean of the second mode.
rmv_genes_neg = np.where(data<mu2-4*std2)[0]
rmv_genes2 = np.sort(list(set(list(rmv_genes_neg))))

if rmv_genes2.size:
    plt.hist(data[rmv_genes2], bins, alpha=1, label='remove')

#ax.set_axis_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 2: \nLog-Log Counts/Gene')
plt.grid(True)
sns.despine()

# Add legend outside the upper-right corner of the second panel
L = plt.legend(loc='upper right',prop={'size':12},bbox_to_anchor=(1.6, 1.05),fancybox=True) 

# SAVE FIGURE
# NOTE(review): the filename depends only on FIG_output_stem and FN, so the
# primary-only gene-filter cell further down writes to the same path; confirm
# the overwrite is intended.
figure_label = '_filter_genes'
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label + '.png'
plt.savefig(fn, dpi=fig_dpi)
print(fn)

# EVALUATE NUMBER OF GENES REMOVED BY EACH FILTER
print('Count Gene Filter 1: {}'.format(len(rmv_genes1)))
print('Count Gene Filter 2: {}'.format(len(rmv_genes2)))

# REMOVE THE GENES FLAGGED BY EITHER FILTER
CUT_DF = deepcopy(DF_HARBOUR)
rmv_genes = np.sort(list(set(list(rmv_genes1) + list(rmv_genes2))))
if rmv_genes.size:
    CUT_DF = CUT_DF.drop(CUT_DF.columns[rmv_genes],axis=1)
    print(CUT_DF.shape)

# Remove empty genes if they exist
drop_genes = np.where(CUT_DF.sum(axis=0)==0)[0]
CUT_DF = CUT_DF.drop(CUT_DF.columns[drop_genes],axis=1)
print(CUT_DF.shape)

# GENES EXCLUDING LOW ABUNDANCE GENES
goi = list(CUT_DF.columns)
print(len(goi))

###### RANDOMIZED PCA

In [None]:
def _finish_pca_panel(ylabel, xlabel):
    """Apply the shared tick rotation, axis labels, grid and despine to the current axes."""
    plt.xticks(rotation=70)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.grid(True)
    sns.despine()


# Randomized PCA on the normalised counts of the retained genes (cells x genes).
data = NDF_HARBOUR[goi]
pca = PCA(n_components=100, whiten = False, svd_solver='randomized')
PCA_FIT = pca.fit(data)

explained_variance = PCA_FIT.explained_variance_ratio_
cumsum_explained_variance = np.cumsum(explained_variance)
PCA_DATA = PCA_FIT.transform(data)

# Store output of pca in dictionary for WISHBONE
component_ids = np.arange(pca.n_components)
eigenvalues = pd.Series(pca.explained_variance_,index = component_ids)
loadings = pd.DataFrame(PCA_FIT.components_.T, index = data.columns, columns = np.arange(pca.n_components))
pca_dict = {'eigenvalues':eigenvalues,'loadings':loadings}

# Correlation of each PC with library size (per-cell total counts)
x = molecule_sums.values.tolist()

R = np.zeros(PCA_DATA.shape[1])
for ind, y in enumerate(PCA_DATA.T):
    R[ind] = np.corrcoef(x,y)[1,0]

# Variance of each gene, ranked from largest to smallest
gene_variance = np.var(data, axis=0)
variance_sort_idx =gene_variance.argsort()[::-1]
ranked_gene_variance = gene_variance[variance_sort_idx]

# 2 x 2 grid of diagnostic panels
nrow = 2
ncol = 2
plt.figure(figsize=(7,7))
gs1 = gridspec.GridSpec(nrow,ncol)
gs1.update(wspace = 0.5, hspace = 0.5)

# Explained variance ratio per component
ax = plt.subplot(gs1[0])
plt.plot(explained_variance)
_finish_pca_panel('Ratio Explained Variance', 'Principle Component')

# Cumulative sum of explained variance
ax = plt.subplot(gs1[1])
plt.plot(cumsum_explained_variance)
_finish_pca_panel('Cumsum Explained Variance', 'Principle Component')

# Absolute correlation of library size with each PC
ax = plt.subplot(gs1[2])
plt.plot(np.abs(R))
_finish_pca_panel('Correlation with Library Size (abs)', 'Principle Component')

# Log of the ranked per-gene variance
ax = plt.subplot(gs1[3])
plt.plot(np.log(ranked_gene_variance.values))
_finish_pca_panel('Log. Variance', 'Ranked Genes')

In [None]:
# IDENTIFY POINT OF MAXIMUM CURVATURE IN CUMULATIVE EXPLAINED VARIANCE
y = cumsum_explained_variance
x = np.arange(len(y))  # 0-based component index
kneedle = KneeLocator(x, y)  # KneeLocator called with its default settings
kneedle.plot_knee_normalized()
knee = kneedle.knee  # x value at the detected knee; the component-selection cell uses knee*2
plt.title('knee: {}'.format(knee))

In [None]:
# Cumulative explained-variance ratio at component index 31.
# NOTE(review): 31 is hard-coded; presumably the knee found above. Confirm.
cumsum_explained_variance[31]

In [None]:
# Shape check of the transformed data (rows = cells, columns = principal components).
PCA_DATA.shape

In [None]:
# SELECT DATA ASSOCIATED WITH RELEVANT PCS (REMOVE PC0 if highly correlated with cell size)
# KneeLocator reports None when it finds no knee; fail with a clear message
# instead of the TypeError that None*2 would raise.
if knee is None:
    raise ValueError('No knee was detected in the cumulative explained variance; '
                     'set the number of components manually.')
ncomponents = knee*2  # keep twice as many components as the knee position
COMPONENTS = np.arange(ncomponents)
print(COMPONENTS)

# LOADING PER COMPONENT (genes x selected components)
loadings = PCA_FIT.components_.T[:,COMPONENTS]

labels = ['PC{}'.format(i) for i in COMPONENTS]

# Work on an explicit copy so the in-place scaling cannot touch NDF_HARBOUR.
data = NDF_HARBOUR[goi].copy()
# Rescale with the global minimum and maximum over all cells and genes, then
# project onto the selected loadings.
data -= np.min(np.ravel(data))
data /= np.max(np.ravel(data))
data = pd.DataFrame(np.dot(data, loadings).astype(np.float64),index=data.index, columns = labels)


# WRITE TO H5
# Save the variable that was just assigned directly; the former exec-formatted
# name only resolved to it while subset_type happened to be 'TUMOR'.
DIMENSIONS_HARBOUR_TUMOR = data.copy()
h5_data_harbour.save(DIMENSIONS_HARBOUR_TUMOR, 'DIMENSIONS_HARBOUR_TUMOR') # update directory
h5_data_harbour.ls() # list contents of directory

###### REGRESS LIBRARY SIZE OUT OF PCS

In [None]:
# Resolve the dimensions table by name without exec(); QUERY is the same object,
# so the columns added to it below land in DIMENSIONS_HARBOUR_<subset_type>.
dims_name = 'DIMENSIONS_HARBOUR_{}'.format(subset_type)
QUERY = globals()[dims_name]
idx = QUERY.columns.to_series().str.contains('^PC((?!TSNE).)*$') # columns starting with 'PC' and not containing 'TSNE'
PCA_DATA = QUERY.loc[:, idx.values]
counts = DF_HARBOUR.copy()

num_comps = PCA_DATA.shape[1]

# Library size (total counts per cell) as a single-feature design matrix. It is
# the same for every component, so build it once outside the loop.
X = counts.sum(axis=1).values.reshape(counts.shape[0], 1)

# Replace each PC by the residual of its linear fit on library size.
pca_regressed_out_lib_size = deepcopy(PCA_DATA)
for c in range(num_comps):
    # `normalize` is left out: False was its default and newer scikit-learn
    # releases no longer accept the argument.
    lm = LinearRegression()
    Y = PCA_DATA.iloc[:, c]
    lm.fit(X, Y)
    pca_regressed_out_lib_size.iloc[:,c] = Y-lm.predict(X)

# ADD PREFIX FOR REGRESSED LIB SIZE (RLS)
pca_regressed_out_lib_size = pca_regressed_out_lib_size.add_prefix('RLS_')

for label in pca_regressed_out_lib_size.columns:
    QUERY[label] = pca_regressed_out_lib_size[label]

# WRITE TO H5
h5_data_harbour.save(QUERY, dims_name) # update directory
h5_data_harbour.ls() # list contents of directory

###### CALCULATE DIFFUSION COEFFICIENTS

In [None]:
# Number of nearest neighbours passed to the diffusion-map construction.
knn = 27

# Select the library-size-regressed PCs: columns starting with the prefix and
# not containing 'TSNE'.
component_prefix = 'RLS_PC'
QUERY = globals()['DIMENSIONS_HARBOUR_{}'.format(subset_type)]
component_pattern = '^{}((?!TSNE).)*$'.format(component_prefix)
idx = QUERY.columns.to_series().str.contains(component_pattern)
PCA_DATA = QUERY.loc[:, idx.values]

DIFFUSION_EIGS = palantir.utils.run_diffusion_maps(PCA_DATA.astype(np.float64), knn=knn)

In [None]:
# Number of primary tumor cells per patient in the Harbour dataset.
DF_HARBOUR_PRIMARY.index.get_level_values('Patient').value_counts()

In [None]:
# Number of cells per patient in DF_TUMOR (defined earlier in the notebook), shown for comparison.
DF_TUMOR.index.get_level_values('Patient').value_counts()

###### IMPUTE MISSING VALUES

In [None]:
# Impute the normalised counts of all tumor cells with palantir's MAGIC routine,
# using the diffusion result computed above and 3 steps.
imputed_data_matrix = palantir.utils.run_magic_imputation(NDF_HARBOUR, DIFFUSION_EIGS, n_steps=3)

In [None]:
# Keep an independent copy of the imputed matrix and store it in the H5 file under 'INDF_HARBOUR'.
INDF_HARBOUR = imputed_data_matrix.copy()
h5_data_harbour.save(INDF_HARBOUR, 'INDF_HARBOUR')

#### PERFORM IMPUTATION ON NDF_HARBOUR_PRIMARY

###### FILTER LOW ABUNDANCE GENES

In [None]:
# FIT BIMODAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF THE SECOND MODE
plt.figure(figsize = (10,3))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.7, hspace=0.7) # set the spacing between axes. 

# (4) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(DF_HARBOUR_PRIMARY.values > 0,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
# log(n) <= 0 means n <= 1: flag genes detected in at most one primary cell
rmv_genes1 = np.where(num_cells_per_gene<=0)[0]

ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')

# rmv_genes* hold column positions, so test for emptiness with .size:
# .any() is False when the only flagged position is 0.
if rmv_genes1.size:
    plt.hist(num_cells_per_gene[rmv_genes1], bins, alpha=1, label='remove')

#ax.set_axis_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)')
plt.grid(True)
sns.despine()

# (5) PLOT LOG-LOG COUNTS PER GENE - REMOVE LOW ABUNDANCE GENES
log_counts_per_gene = np.log(np.log(np.sum(DF_HARBOUR_PRIMARY.values,axis=0)))
log_counts_per_gene[(np.isinf(log_counts_per_gene)) | (np.isnan(log_counts_per_gene))] = 0
data = log_counts_per_gene

ax = plt.subplot(gs1[1])
bins = np.linspace(data.min(), data.max()*0.95, 100)
y,x,_=hist(data,bins,alpha=.3,label='data')

# Fit the two-mode model to the histogram, evaluated at the bin centres.
x=(x[1:]+x[:-1])/2 
expected=(0,.2,3500,1.5,.2,500)  # initial guess for the six model parameters
params,cov=curve_fit(bimodal,x,y,expected)
sigma=sqrt(diag(cov))
plot(x,bimodal(x,*params),color='red',lw=3,label='model')

mu1 = params[0]
std1 = params[1]
mu2 = params[3]
std2 = params[4]
# Flag genes more than 4 std below the mean of the second mode.
rmv_genes_neg = np.where(data<mu2-4*std2)[0]
rmv_genes2 = np.sort(list(set(list(rmv_genes_neg))))

if rmv_genes2.size:
    plt.hist(data[rmv_genes2], bins, alpha=1, label='remove')

#ax.set_axis_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 2: \nLog-Log Counts/Gene')
plt.grid(True)
sns.despine()

# Add legend outside the upper-right corner of the second panel
L = plt.legend(loc='upper right',prop={'size':12},bbox_to_anchor=(1.6, 1.05),fancybox=True) 

# SAVE FIGURE
# NOTE(review): the filename depends only on FIG_output_stem and FN, so this
# overwrites the figure written by the all-tumor gene-filter cell; confirm
# the overwrite is intended.
figure_label = '_filter_genes'
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label + '.png'
plt.savefig(fn, dpi=fig_dpi)
print(fn)

# EVALUATE NUMBER OF GENES REMOVED BY EACH FILTER
print('Count Gene Filter 1: {}'.format(len(rmv_genes1)))
print('Count Gene Filter 2: {}'.format(len(rmv_genes2)))

# REMOVE THE GENES FLAGGED BY EITHER FILTER
CUT_DF = deepcopy(DF_HARBOUR_PRIMARY)
rmv_genes = np.sort(list(set(list(rmv_genes1) + list(rmv_genes2))))
if rmv_genes.size:
    CUT_DF = CUT_DF.drop(CUT_DF.columns[rmv_genes],axis=1)
    print(CUT_DF.shape)

# Remove empty genes if they exist
drop_genes = np.where(CUT_DF.sum(axis=0)==0)[0]
CUT_DF = CUT_DF.drop(CUT_DF.columns[drop_genes],axis=1)
print(CUT_DF.shape)

# GENES EXCLUDING LOW ABUNDANCE GENES
goi = list(CUT_DF.columns)
print(len(goi))

###### RANDOMIZED PCA

In [None]:
def _finish_pca_panel(ylabel, xlabel):
    """Apply the shared tick rotation, axis labels, grid and despine to the current axes."""
    plt.xticks(rotation=70)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.grid(True)
    sns.despine()


# Randomized PCA on the normalised primary-cell counts of the retained genes (cells x genes).
data = NDF_HARBOUR_PRIMARY[goi]
pca = PCA(n_components=100, whiten = False, svd_solver='randomized')
PCA_FIT = pca.fit(data)

explained_variance = PCA_FIT.explained_variance_ratio_
cumsum_explained_variance = np.cumsum(explained_variance)
PCA_DATA = PCA_FIT.transform(data)

# Store output of pca in dictionary for WISHBONE
component_ids = np.arange(pca.n_components)
eigenvalues = pd.Series(pca.explained_variance_,index = component_ids)
loadings = pd.DataFrame(PCA_FIT.components_.T, index = data.columns, columns = np.arange(pca.n_components))
pca_dict = {'eigenvalues':eigenvalues,'loadings':loadings}

# Correlation of each PC with library size (per-cell total counts, primary cells)
x = molecule_sums_PRIMARY.values.tolist()

R = np.zeros(PCA_DATA.shape[1])
for ind, y in enumerate(PCA_DATA.T):
    R[ind] = np.corrcoef(x,y)[1,0]

# Variance of each gene, ranked from largest to smallest
gene_variance = np.var(data, axis=0)
variance_sort_idx =gene_variance.argsort()[::-1]
ranked_gene_variance = gene_variance[variance_sort_idx]

# 2 x 2 grid of diagnostic panels
nrow = 2
ncol = 2
plt.figure(figsize=(7,7))
gs1 = gridspec.GridSpec(nrow,ncol)
gs1.update(wspace = 0.5, hspace = 0.5)

# Explained variance ratio per component
ax = plt.subplot(gs1[0])
plt.plot(explained_variance)
_finish_pca_panel('Ratio Explained Variance', 'Principle Component')

# Cumulative sum of explained variance
ax = plt.subplot(gs1[1])
plt.plot(cumsum_explained_variance)
_finish_pca_panel('Cumsum Explained Variance', 'Principle Component')

# Absolute correlation of library size with each PC
ax = plt.subplot(gs1[2])
plt.plot(np.abs(R))
_finish_pca_panel('Correlation with Library Size (abs)', 'Principle Component')

# Log of the ranked per-gene variance
ax = plt.subplot(gs1[3])
plt.plot(np.log(ranked_gene_variance.values))
_finish_pca_panel('Log. Variance', 'Ranked Genes')

In [None]:
# IDENTIFY POINT OF MAXIMUM CURVATURE IN CUMULATIVE EXPLAINED VARIANCE
y = cumsum_explained_variance
x = np.arange(len(y))  # 0-based component index
kneedle = KneeLocator(x, y)  # KneeLocator called with its default settings
kneedle.plot_knee_normalized()
knee = kneedle.knee  # x value at the detected knee; the component-selection cell uses knee*2
plt.title('knee: {}'.format(knee))

In [None]:
# SELECT DATA ASSOCIATED WITH RELEVANT PCS (REMOVE PC0 if highly correlated with cell size)
# KneeLocator reports None when it finds no knee; fail with a clear message
# instead of the TypeError that None*2 would raise.
if knee is None:
    raise ValueError('No knee was detected in the cumulative explained variance; '
                     'set the number of components manually.')
ncomponents = knee*2  # keep twice as many components as the knee position
COMPONENTS = np.arange(ncomponents)
print(COMPONENTS)

# LOADING PER COMPONENT (genes x selected components)
loadings = PCA_FIT.components_.T[:,COMPONENTS]

labels = ['PC{}'.format(i) for i in COMPONENTS]

# Work on an explicit copy so the in-place scaling cannot touch NDF_HARBOUR_PRIMARY.
data = NDF_HARBOUR_PRIMARY[goi].copy()
# Rescale with the global minimum and maximum over all cells and genes, then
# project onto the selected loadings.
data -= np.min(np.ravel(data))
data /= np.max(np.ravel(data))
data = pd.DataFrame(np.dot(data, loadings).astype(np.float64),index=data.index, columns = labels)


# WRITE TO H5
DIMENSIONS_HARBOUR_TUMOR_PRIMARY = data.copy()
h5_data_harbour.save(DIMENSIONS_HARBOUR_TUMOR_PRIMARY, 'DIMENSIONS_HARBOUR_TUMOR_PRIMARY') # update directory
h5_data_harbour.ls() # list contents of directory

###### REGRESS LIBRARY SIZE OUT OF PCS

In [None]:
# QUERY is the same object as DIMENSIONS_HARBOUR_TUMOR_PRIMARY, so the columns
# added to it below land in that table.
QUERY = DIMENSIONS_HARBOUR_TUMOR_PRIMARY
idx = QUERY.columns.to_series().str.contains('^PC((?!TSNE).)*$') # columns starting with 'PC' and not containing 'TSNE'
PCA_DATA = QUERY.loc[:, idx.values]
counts = DF_HARBOUR_PRIMARY.copy()

num_comps = PCA_DATA.shape[1]

# Library size (total counts per cell) as a single-feature design matrix. It is
# the same for every component, so build it once outside the loop.
X = counts.sum(axis=1).values.reshape(counts.shape[0], 1)

# Replace each PC by the residual of its linear fit on library size.
pca_regressed_out_lib_size = deepcopy(PCA_DATA)
for c in range(num_comps):
    # `normalize` is left out: False was its default and newer scikit-learn
    # releases no longer accept the argument.
    lm = LinearRegression()
    Y = PCA_DATA.iloc[:, c]
    lm.fit(X, Y)
    pca_regressed_out_lib_size.iloc[:,c] = Y-lm.predict(X)

# ADD PREFIX FOR REGRESSED LIB SIZE (RLS)
pca_regressed_out_lib_size = pca_regressed_out_lib_size.add_prefix('RLS_')

for label in pca_regressed_out_lib_size.columns:
    DIMENSIONS_HARBOUR_TUMOR_PRIMARY[label] = pca_regressed_out_lib_size[label]

# WRITE TO H5
h5_data_harbour.save(DIMENSIONS_HARBOUR_TUMOR_PRIMARY, 'DIMENSIONS_HARBOUR_TUMOR_PRIMARY') # update directory
h5_data_harbour.ls() # list contents of directory

###### CALCULATE DIFFUSION COEFFICIENTS

In [None]:
# Number of nearest neighbours passed to the diffusion-map construction.
knn = 27

# Select the library-size-regressed PCs of the primary cells: columns starting
# with the prefix and not containing 'TSNE'.
component_prefix = 'RLS_PC'
QUERY = DIMENSIONS_HARBOUR_TUMOR_PRIMARY
component_pattern = '^{}((?!TSNE).)*$'.format(component_prefix)
idx = QUERY.columns.to_series().str.contains(component_pattern)
PCA_DATA = QUERY.loc[:, idx.values]

DIFFUSION_EIGS = palantir.utils.run_diffusion_maps(PCA_DATA.astype(np.float64), knn=knn)

###### IMPUTE MISSING VALUES 

In [None]:
# Impute the normalised primary-cell counts (3 steps), keep an independent copy
# and store it in the H5 file under 'INDF_HARBOUR_PRIMARY'.
imputed_data_matrix = palantir.utils.run_magic_imputation(NDF_HARBOUR_PRIMARY, DIFFUSION_EIGS, n_steps=3)
INDF_HARBOUR_PRIMARY = imputed_data_matrix.copy()
h5_data_harbour.save(INDF_HARBOUR_PRIMARY, 'INDF_HARBOUR_PRIMARY')

### MIXTURE MODEL - HARBOUR DATASET

#### MIXTURE MODEL FOR TCGA CLASSIFICATION

In [None]:
# Using imputed normalized Harbour data:
datatype = 'INDF_HARBOUR'
QUERY = globals()[datatype]

# Using tumor cell subset:
subset_type = 'TUMOR'

gene1, gene2, gene3, gene4 = 'TCGA1', 'TCGA2', 'TCGA3', 'TCGA4'

# For each TCGA class: take the marker genes flagged with 1 in that column of the
# marker table, keep those present in QUERY, and score every cell by their mean.
tcga_scores = {}
for tcga_class in (gene1, gene2, gene3, gene4):
    signature_genes = TCGA_markerfile_df.loc[TCGA_markerfile_df[tcga_class] == 1].index.values
    signature_genes = [x for x in signature_genes if str(x) != 'nan']
    detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
    tcga_scores[tcga_class] = (np.nanmean(QUERY[detected_genes],axis=1))

vals1 = tcga_scores[gene1]
vals2 = tcga_scores[gene2]
vals3 = tcga_scores[gene3]
vals4 = tcga_scores[gene4]

# One row per cell, one score column per TCGA class.
X = pd.DataFrame(data = tcga_scores, index = QUERY.index)

In [None]:
# Find a random seed for the BGMM initialisation: the first seed in 1..49 for
# which the most frequent assigned class is TCGA3 or TCGA4.
# The previous `while rs in range(1,50) and rs_opt == 0` never set or advanced
# `rs`, so it either raised NameError or retried a single seed forever.
tcga_columns = ['TCGA1','TCGA2','TCGA3','TCGA4']
rs_opt = 0
for rs in range(1, 50):
    X_test = X.copy()
    bgmm = BayesianGaussianMixture(covariance_type='tied', n_components=4, random_state=rs)
    bgmm.fit(X_test[tcga_columns])
    clusters = bgmm.predict(X_test[tcga_columns])

    X_test['Assignment'] = clusters

    # Make Class Assignments
    # Rows of mean_array follow cluster_ids, columns follow TCGA_CLASS_LIST.
    cluster_ids = np.unique(X_test['Assignment'])
    mean_array = np.array([[X_test[X_test['Assignment']==ii][x].mean() for x in TCGA_CLASS_LIST]
                           for ii in cluster_ids], dtype=float)

    # Greedy one-to-one matching: repeatedly take the largest remaining mean and
    # retire its cluster (row) and class (column). Keys are the real cluster
    # labels, which differ from row positions when predict() leaves a component unused.
    lut = {}
    for i in range(min(mean_array.shape)):
        max_ind = np.unravel_index(np.argmax(mean_array, axis=None), mean_array.shape)
        lut[cluster_ids[max_ind[0]]] = TCGA_CLASS_LIST[max_ind[1]]
        mean_array[:,max_ind[1]] = -np.inf
        mean_array[max_ind[0],:] = -np.inf

    # MAP ASSIGNMENTS
    X_test['Assignment'] = X_test['Assignment'].map(lut)

    if (X_test.Assignment.value_counts().keys()[0] in ['TCGA3', 'TCGA4']):
        rs_opt = rs
        break

X_test = []  # drop the reference to the last working copy

if rs_opt == 0:
    # print rather than warnings.warn: this notebook silences all warnings.
    print('No seed in range(1, 50) met the criterion; rs_opt is left at 0.')

In [None]:
# Train BGMM:
# NOTE: Occasionally the bgmm results can vary here, changing the random_state value will help (recommend using: 33)
tcga_features = X[['TCGA1','TCGA2','TCGA3','TCGA4']]

bgmm = BayesianGaussianMixture(covariance_type='tied', n_components=4, random_state=rs_opt)
bgmm.fit(tcga_features)
clusters = bgmm.predict(tcga_features)

# Create T-SNE: 2-D embedding of the four signature scores, stored as two columns of X
X_embedded = manifold.TSNE(n_components=2, random_state=2).fit_transform(tcga_features)
X['tsne1'], X['tsne2'] = X_embedded[:,0], X_embedded[:,1]

# Raw cluster labels; a later cell maps them to TCGA class names.
X['Assignment'] = clusters

In [None]:
# Make Class Assignments
# Rows of mean_array follow cluster_ids, columns follow TCGA_CLASS_LIST.
cluster_ids = np.unique(X['Assignment'])
mean_array = np.array([[X[X['Assignment']==ii][x].mean() for x in TCGA_CLASS_LIST]
                       for ii in cluster_ids], dtype=float)

# Greedy one-to-one matching: repeatedly take the largest remaining mean and
# retire its cluster (row) and class (column). Keys are the real cluster labels,
# which differ from row positions when predict() leaves a component unused.
lut = {}
for i in range(min(mean_array.shape)):
    max_ind = np.unravel_index(np.argmax(mean_array, axis=None), mean_array.shape)
    lut[cluster_ids[max_ind[0]]] = TCGA_CLASS_LIST[max_ind[1]]
    mean_array[:,max_ind[1]] = -np.inf
    mean_array[max_ind[0],:] = -np.inf

# MAP ASSIGNMENTS
X['Assignment'] = X['Assignment'].map(lut)

# t-SNE scatter coloured by assigned TCGA class.
# NOTE(review): `size` was renamed `height` in newer seaborn releases; left
# unchanged to match the installed version. Confirm before upgrading.
sns.set(font_scale=2.5)
sns.set_style("white")
g = sns.pairplot(X[['tsne1','tsne2','Assignment']], 
                 hue = 'Assignment',
                 palette={"TCGA1": "#0000FF",
                          "TCGA2": "#A4D3FC",
                          "TCGA3": "#FF7DC2", 
                          "TCGA4": "#FF0000",},
            diag_kind = 'kde', aspect = 1, size = 10);


# SAVE FIGURE
# The label has no placeholders, so the former .format(meta, ...) call only added
# a NameError risk when `meta`, `method` or `metric` were not defined.
figure_label = 'HARBOUR_BayesianGMM_TCGA'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

g.savefig(fn + '.png', dpi=400, transparent=True)
g.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# UPDATE DIRECTORY
# Store the per-cell TCGA class as its own single-column metadata table and save it.
X['TCGA_Assignment'] = X.Assignment.values
METADATA_HARBOUR = X[['TCGA_Assignment']].copy()
h5_data_harbour.save(METADATA_HARBOUR, 'METADATA_HARBOUR')

#### INTRATUMOR TCGA HETEROGENEITY

###### TCGA DISTRIBUTION PER PATIENT

In [None]:
# Long-format table: one row per (patient, TCGA class) with that class's share
# of the patient's assigned cells.
boxplot_patient_list = []
boxplot_fract_list = []
boxplot_class_list = []

patient_level = METADATA_HARBOUR.index.get_level_values('Patient')
for patient in patient_level.unique():
    # Reindex by the full class list so a class that was never assigned counts
    # as 0 instead of raising KeyError.
    class_counts = (METADATA_HARBOUR.loc[patient_level == patient]
                    .TCGA_Assignment.value_counts()
                    .reindex(TCGA_CLASS_LIST, fill_value=0))
    tmp = list(class_counts.values / class_counts.values.sum())

    boxplot_patient_list.extend(patient for x in range(len(TCGA_CLASS_LIST)))
    boxplot_fract_list.extend(tmp)
    boxplot_class_list.extend(TCGA_CLASS_LIST)

boxplot_data = pd.DataFrame()
boxplot_data['Patient'] = boxplot_patient_list
boxplot_data['Fraction'] = boxplot_fract_list
boxplot_data['Class'] = boxplot_class_list

# GROUPED BAR PLOT
fig = plt.figure(figsize = (10,4))
ax = plt.gca()

# One bar per TCGA class within each patient
g = sns.barplot(data=boxplot_data,x = 'Patient', 
                y = 'Fraction', hue = 'Class', ci=95, capsize=.2,
                palette={"TCGA1": "#0000FF",
                         "TCGA2": "#A4D3FC",
                         "TCGA3": "#FF7DC2", 
                         "TCGA4": "#FF0000",},
                linewidth = 1, ax = ax);
fig.get_axes()[0].set_yscale('log')

# Cap the axis at 1. A lower limit of 0 is invalid on a log axis, so only the
# upper limit is set.
ax.set_ylim(top=1)
g.set_ylabel("Cell Type Fraction",fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
g.set_xticklabels(ax.get_xticklabels(),rotation=90)
sns.despine()

ax.legend_.remove()

# SAVE FIGURE
figure_label = 'TCGA_Distribution_by_Patient_LOG_HARBOUR'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 
# Create the output folder if needed, as the other figure cells do.
os.makedirs(os.path.dirname(fn), exist_ok=True)
plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

#### MIXTURE MODEL FOR GEP CLASSIFICATION

In [None]:
subset_type = 'TUMOR'

# LOAD GENE LIST FROM CSV and upper-case every entry (values are cast to str first,
# so missing entries become the string 'NAN')
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets,header='infer')
genesets = genesets.apply(lambda column: column.astype(str).str.upper())

# Number of geneset columns
print(genesets.shape[1])

In [None]:
# Score every cell on the two Castle genesets using the imputed Harbour data.
datatype = 'INDF_HARBOUR'
QUERY = globals()[datatype]

gene1, gene2 = 'Castle 1', 'Castle 2'

# Per geneset: drop the 'NAN' placeholders, keep genes present in QUERY, and
# average their expression per cell.
castle_scores = {}
for geneset_name in (gene1, gene2):
    signature_genes = genesets[geneset_name].values
    signature_genes = [x for x in signature_genes if str(x).upper() != 'NAN']
    detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
    castle_scores[geneset_name] = (np.nanmean(QUERY[detected_genes],axis=1))

vals1 = castle_scores[gene1]
vals2 = castle_scores[gene2]

X = pd.DataFrame(data = castle_scores, index = QUERY.index)

# Two-component Bayesian GMM on the two scores, with a fixed seed.
castle_features = X[['Castle 1','Castle 2']]
bgmm = BayesianGaussianMixture(covariance_type='diag', n_components=2, random_state=1)
bgmm.fit(castle_features)
clusters = bgmm.predict(castle_features)

In [None]:
# SCORE EVERY CELL ON THE TWO CASTLE GENESETS (imputed Harbour data)
datatype = 'INDF_HARBOUR'
QUERY = globals()[datatype]

# Per geneset: drop the 'NAN' placeholders, keep genes present in QUERY, and
# average their expression per cell.
gene1 = 'Castle 1'
signature_genes = genesets[gene1].values
signature_genes = [x for x in signature_genes if str(x).upper() != 'NAN']
detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
vals1 = (np.nanmean(QUERY[detected_genes],axis=1))

gene2 = 'Castle 2'
signature_genes = genesets[gene2].values
signature_genes = [x for x in signature_genes if str(x).upper() != 'NAN']
detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
vals2 = (np.nanmean(QUERY[detected_genes],axis=1))

X = pd.DataFrame(data = {gene1: vals1, gene2: vals2}, index = QUERY.index)

# Fixed seed (the same value the preceding cell uses) so the saved assignments
# and figures are reproducible; the unseeded fit could change between runs.
bgmm = BayesianGaussianMixture(covariance_type='diag', n_components=2, random_state=1)
bgmm.fit(X[['Castle 1','Castle 2']])
clusters = bgmm.predict(X[['Castle 1','Castle 2']])
X['Assignment'] = clusters

# Make Class Assignments
# Each cluster is named after the geneset with the higher mean score in it.
# NOTE(review): both clusters can receive the same name under this rule.
lut ={}
for ii in np.unique(X['Assignment']):
    if X[X['Assignment']==ii]['Castle 1'].mean()>X[X['Assignment']==ii]['Castle 2'].mean():
        lut[ii] = 'Castle 1'
    else:
        lut[ii] = 'Castle 2'

X['Assignment'] = X['Assignment'].map(lut)

sns.set(font_scale=2.5)
sns.set_style("white")
g = sns.pairplot(X[['Castle 1','Castle 2','Assignment']], 
                 hue = 'Assignment',
                 palette={"Castle 1": "#0000FF", "Castle 2": "#FF0000", 'Mixed': '#000000'},
                 diag_kind = 'kde', aspect = 1, size = 10);


# SAVE FIGURE
figure_label = 'HARBOUR_BayesianGMM'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

g.savefig(fn + '.png', dpi=400, transparent=True)
g.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# UPDATE DIRECTORY
# Add the Castle 1 / Castle 2 assignment next to the TCGA assignment (aligned on
# the shared index) and write the table to the Harbour HDF5 file.
METADATA_HARBOUR['Assignment'] = X.Assignment
METADATA_HARBOUR.to_hdf(DATA_PATH_HARBOUR+'UVMEL_HARBOUR.h5', key='METADATA_HARBOUR')

###### COLOR GEP PLOT BY TCGA CLASS

In [None]:
# Colour the Castle 1 / Castle 2 score plot by each cell's TCGA class.
X['TCGA_Class'] = METADATA_HARBOUR.TCGA_Assignment

sns.set(font_scale=2.5)
sns.set_style("white")
g = sns.pairplot(X[['Castle 1', 'Castle 2', 'Assignment', 'TCGA_Class']], 
                 hue = 'TCGA_Class',
                 palette={"TCGA1": "#0000FF",
                          "TCGA2": "#A4D3FC",
                          "TCGA3": "#FF7DC2", 
                          "TCGA4": "#FF0000",},
                 diag_kind = 'kde', aspect = 1, size = 10);


# SAVE FIGURE
# The label has no placeholders, so the former .format(meta, ...) call only added
# a NameError risk when `meta`, `method` or `metric` were not defined.
figure_label = 'HARBOUR_FULL_TCGA_colored'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

g.savefig(fn + '.png', dpi=400, transparent=True)
g.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# Colour the Castle 1 / Castle 2 score plot by the cell types stored in the
# 'cellType' level of the index.
X['cellType'] = X.index.get_level_values('cellType').values

celltype_palette = {"Class 1A Primary Tumor Cells": "#2F75DF",
                    "Class 1B PRAME+ Met Tumor Cells": "#21C108",
                    "Class 2 PRAME- Primary Tumor Cells": "#B696B9", 
                    "Class 2 PRAME+ Primary Tumor Cells": "#D17070",}
celltype_plot_columns = ['Castle 1', 'Castle 2', 'Assignment', 'cellType']

sns.set(font_scale=2.5)
sns.set_style("white")
g = sns.pairplot(X[celltype_plot_columns], 
                 hue = 'cellType',
                 palette=celltype_palette,
                 diag_kind = 'kde', aspect = 1, size = 10);


# SAVE FIGURE (PNG with transparent background, plus PDF)
figure_label = 'HABROUR_FULL_given_annotations'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 

d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

for extension, save_options in (('.png', {'transparent': True}), ('.pdf', {})):
    g.savefig(fn + extension, dpi=400, **save_options)
print(fn)

###### CONTINUE WITH INTRATUMOR HETEROGENEITY FOR GEP

In [None]:
plt.figure(figsize = (5,5))
# NOTE(review): despine() is called before any axes are created on this new
# figure, so it probably has no visible effect here - confirm intent.
sns.despine()
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
ax = plt.gca()

# Split the cells once per mixture-model class instead of re-filtering X for
# every plotting call.
castle1_cells = X[X['Assignment']=='Castle 1']
castle2_cells = X[X['Assignment']=='Castle 2']

# Castle 1 cells: scatter plus filled 2-D KDE (x = 'Castle 2', y = 'Castle 1').
ax = plt.scatter(castle1_cells['Castle 2'], castle1_cells['Castle 1'],
                 s = 1, c ='blue',alpha = 0.5)
ax = sns.kdeplot(castle1_cells['Castle 2'], castle1_cells['Castle 1'],
                 cmap="Blues", shade=True, shade_lowest=False,n_levels = 20, alpha = 0.5)

# Castle 2 cells: same layers in red.
ax = plt.scatter(castle2_cells['Castle 2'], castle2_cells['Castle 1'],
                 s = 1, c ='red',alpha = 0.5)
ax = sns.kdeplot(castle2_cells['Castle 2'], castle2_cells['Castle 1'],
                 cmap="Reds", shade=True, shade_lowest=False, n_levels = 20, alpha = 0.5)


# SAVE FIGURE
# The label has no placeholders, so the former `.format(meta, subset_type,
# method, metric)` call was a no-op that could only fail with a NameError.
figure_label = 'Mixture_Model_HARBOUR'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# Update multi-index
# Rebuild X's index from four of its existing levels plus the per-cell
# 'Assignment' label, so that the label can be used as a groupby level.
new_index = pd.MultiIndex.from_tuples(list(zip(X.index.get_level_values('Sample ID'),
                                               X.index.get_level_values('Legend'),
                                               X.index.get_level_values('Patient'),
                                               X.index.get_level_values('Cell ID'),
                                               X['Assignment'])),
                                  names=['Sample ID','Legend', 'Patient','Cell ID','Assignment'])
X = pd.DataFrame(data = X.values, columns = X.columns, index = new_index)

# CLASS ASSIGNMENT BY PATIENT
# Cell counts per (patient, class); classes missing for a patient become 0.
# This is a direct call: the previous exec() of a formatted string did
# exactly this, only less readably.
meta = 'Assignment'
tmp = X.groupby(level=['Patient', meta]).size().unstack().fillna(0)
colors = ["#0000FF","#FF0000"]

plt.figure(figsize = (5,3))
ax = plt.gca()
# Normalise each patient's row to fractions and draw stacked horizontal bars.
tmp.div(tmp.sum(axis=1),axis=0).plot.barh(stacked=True, color=colors, ax = ax, width = 0.95)
ax.legend_.remove()
ax.set_frame_on(False)
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.xlabel('Cell Fraction', fontsize=12)
plt.ylabel('Patient', fontsize=12)

# SAVE FIGURE
figure_label = 'GEP_Distribution_by_Patient_HARBOUR'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label
# Make sure the output directory exists before saving.
os.makedirs(os.path.dirname(fn), exist_ok=True)
plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# DOWNSAMPLED GEP (CASTLE) CLASS FRACTIONS PER PATIENT
# For every repetition: draw n cells per patient (with replacement), refit a
# 2-component Bayesian GMM on the Castle scores, name each component after its
# dominant score, and record the per-patient fraction of each class.
# NOTE: no random_state is passed to .sample(), so the draws depend on the
# state of NumPy's global RNG.
n = 150
reps = np.arange(100)

master_groupby = 'Patient'
# Position of the grouping level in X's index (kept as a notebook global).
ix = [ind for ind,name in enumerate(X.index.names) if name==master_groupby][0]
patient_values = X.index.get_level_values(master_groupby)
patients = np.unique(patient_values)
castle_classes = ['Castle 1', 'Castle 2']
fraction_castle1 = np.zeros((len(reps),len(patients)))
fraction_castle2 = np.zeros((len(reps),len(patients)))

for rep in reps:
    # RANDOMLY SAMPLE SAME NUMBER OF CELLS PER PATIENT
    # One concat replaces repeated DataFrame.append calls, which copied the
    # growing frame on every iteration.
    SUBSET = pd.concat([X.loc[patient_values == class_selection].sample(n=n, replace=True)
                        for class_selection in patients])

    bgmm = BayesianGaussianMixture(covariance_type='diag', n_components=2, random_state=1)
    bgmm.fit(SUBSET[['Castle 1','Castle 2']])
    clusters = bgmm.predict(SUBSET[['Castle 1','Castle 2']])
    SUBSET['Assignment'] = clusters

    # Make Class Assignments
    # A component is 'Castle 1' when its mean Castle 1 score exceeds its mean
    # Castle 2 score, otherwise 'Castle 2' (both components may get the same name).
    lut = {}
    for ii in np.unique(SUBSET['Assignment']):
        members = SUBSET[SUBSET['Assignment']==ii]
        if members['Castle 1'].mean() > members['Castle 2'].mean():
            lut[ii] = 'Castle 1'
        else:
            lut[ii] = 'Castle 2'

    SUBSET['Assignment'] = SUBSET['Assignment'].map(lut)

    # Update multi-index
    new_index = pd.MultiIndex.from_tuples(list(zip(SUBSET.index.get_level_values('Sample ID'),
                                                   SUBSET.index.get_level_values('Legend'),
                                                   SUBSET.index.get_level_values('Patient'),
                                                   SUBSET.index.get_level_values('Cell ID'),
                                                   SUBSET['Assignment'])),
                                          names=['Sample ID','Legend', 'Patient','Cell ID',
                                                 'Assignment'])
    SUBSET = pd.DataFrame(data = SUBSET.values, columns = SUBSET.columns, index = new_index)

    # CASTLE ASSIGNMENT PER PATIENT
    meta = 'Assignment'
    tmp = SUBSET.groupby(level=['Patient', meta]).size().unstack().fillna(0)
    # Guarantee both class columns exist: when every cell of a repetition
    # lands in one class, the other column would be missing and the lookups
    # below would raise KeyError.
    tmp = tmp.reindex(columns=castle_classes, fill_value=0)

    # SAVE PER DOWNSAMPLE
    cell_type_fraction = tmp.div(tmp.sum(axis=1),axis=0)
    fraction_castle1[rep,:] = cell_type_fraction['Castle 1'].values
    fraction_castle2[rep,:] = cell_type_fraction['Castle 2'].values

# CONVERT TO PANDAS DATAFRAME (long format: one row per repetition x patient)
patient_columns = ['{}_{}'.format(master_groupby,ind) for ind in patients]

boxplot_data_fraction_castle1 = pd.DataFrame(data = fraction_castle1, columns = patient_columns)
boxplot_data_fraction_castle1 = boxplot_data_fraction_castle1.stack().rename_axis(('Rep', 'Patient')).\
                                                                      reset_index(name='Fraction')

boxplot_data_fraction_castle2 = pd.DataFrame(data = fraction_castle2, columns = patient_columns)
boxplot_data_fraction_castle2 = boxplot_data_fraction_castle2.stack().rename_axis(('Rep', 'Patient')).\
                                                                      reset_index(name='Fraction')

boxplot_data_fraction_castle1['Class'] = ['Castle 1']*len(boxplot_data_fraction_castle1)
boxplot_data_fraction_castle2['Class'] = ['Castle 2']*len(boxplot_data_fraction_castle2)
boxplot_data = pd.concat([boxplot_data_fraction_castle1, boxplot_data_fraction_castle2])
boxplot_data['Patient'] = [x.replace('Patient_','') for x in boxplot_data.Patient]

# BAR PLOT (mean over repetitions with 95% CI error bars)
fig = plt.figure(figsize = (10,4))
ax = plt.gca()

g = sns.barplot(data=boxplot_data,x = 'Patient',
                y = 'Fraction', hue = 'Class', ci=95, capsize=.2,
                palette={"Castle 1": "#0000FF", "Castle 2": "#FF0000", 'Mixed': '#000000'},
                linewidth = 1, ax = ax)
fig.get_axes()[0].set_yscale('log')

# NOTE(review): a lower limit of 0 is not representable on a log axis;
# later cells in this notebook use 1e-4 instead - confirm the intended limit.
plt.ylim((0,1))
g.set_ylabel("Cell Type Fraction",fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
g.set_xticklabels(ax.get_xticklabels(),rotation=90)
sns.despine()

ax.legend_.remove()
plt.tight_layout()

# SAVE FIGURE
figure_label = 'GEP_Distribution_by_Patient_LOG_HARBOUR'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label
os.makedirs(os.path.dirname(fn), exist_ok=True)
plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# GEP CLASS FRACTIONS PER PATIENT (ALL CELLS, NO DOWNSAMPLING)
# NOTE(review): GEP_CLASS_LIST is not defined in this cell; it must already
# exist in the notebook session when this runs.
boxplot_patient_list = []
boxplot_fract_list = []
boxplot_class_list = []

patient_level = METADATA_HARBOUR.index.get_level_values('Patient')

# np.sort replaces the bare `sort`, which only exists when the notebook was
# started with a pylab star-import.
for patient in np.sort(patient_level.unique().tolist()):
    # Cell counts per class for this patient, in GEP_CLASS_LIST order. Reindexing
    # by the class list itself keeps a class that is absent from the data at 0
    # instead of raising KeyError.
    class_counts = METADATA_HARBOUR.loc[patient_level == patient].\
                   Assignment.value_counts().reindex(GEP_CLASS_LIST, fill_value=0)
    tmp = list(class_counts.values / class_counts.values.sum())

    boxplot_patient_list.extend([patient] * len(GEP_CLASS_LIST))
    boxplot_fract_list.extend(tmp)
    boxplot_class_list.extend(GEP_CLASS_LIST)

boxplot_data = pd.DataFrame()
boxplot_data['Patient'] = boxplot_patient_list
boxplot_data['Fraction'] = boxplot_fract_list
boxplot_data['Class'] = boxplot_class_list

# BAR PLOT
fig = plt.figure(figsize = (10,4))
ax = plt.gca()

g = sns.barplot(data=boxplot_data,x = 'Patient',
                y = 'Fraction', hue = 'Class', ci=95, capsize=.2,
                palette={"Castle 1": "#0000FF", "Castle 2": "#FF0000", 'Mixed': '#000000'},
                linewidth = 1, ax = ax)
fig.get_axes()[0].set_yscale('log')

# NOTE(review): a lower limit of 0 is not representable on a log axis;
# later cells in this notebook use 1e-4 instead - confirm the intended limit.
plt.ylim((0,1))
g.set_ylabel("Cell Type Fraction",fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
g.set_xticklabels(ax.get_xticklabels(),rotation=90)
sns.despine()

ax.legend_.remove()
plt.tight_layout()

# SAVE FIGURE
figure_label = 'GEP_Distribution_by_Patient_LOG_HARBOUR_NO_ds'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label
os.makedirs(os.path.dirname(fn), exist_ok=True)
plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

### MIXTURE MODEL - HARBOUR DATA PRIMARY ONLY

In [None]:
# Display, for each 'Patient' group, the normalised frequency of every
# 'Assignment' value. The 'Patient' column is dropped before grouping by
# 'Patient' - presumably so the key resolves to an index level of the same
# name rather than being ambiguous; TODO confirm.
METADATA_TUMOR.drop(columns='Patient').groupby('Patient').Assignment.value_counts(normalize=True)

#### MIXTURE MODEL FOR TCGA CLASSIFICATION

In [None]:
# Load the primary-only HARBOUR frame from the project's HDF5 wrapper.
# This is a plain call: the exec() around a constant string added nothing.
INDF_HARBOUR_PRIMARY = h5_data_harbour.load('/INDF_HARBOUR_PRIMARY')

In [None]:
# TCGA SIGNATURE SCORES PER CELL (HARBOUR, PRIMARY ONLY)
# The frame to score is bound directly instead of through exec() on a
# formatted string.
datatype = 'INDF_HARBOUR_PRIMARY'
QUERY = INDF_HARBOUR_PRIMARY

subset_type = 'TUMOR'


def _tcga_signature_scores(query, marker_table, class_name):
    """Score every row of *query* for one TCGA class signature.

    Signature genes are the index entries of *marker_table* whose
    *class_name* column equals 1 (entries that print as 'nan' are skipped).
    Detected genes are those also present as columns of *query*, taken in
    *query*'s column order so the result does not depend on set iteration
    order. Returns ``(signature_genes, detected_genes, scores)`` where
    *scores* is the NaN-aware mean over the detected gene columns.
    """
    marker_index = marker_table.loc[marker_table[class_name] == 1].index.values
    signature = [x for x in marker_index if str(x) != 'nan']
    wanted = set(signature)
    detected = [gene for gene in query.columns.unique() if gene in wanted]
    return signature, detected, np.nanmean(query[detected], axis=1)


gene1 = 'TCGA1'
signature_genes, detected_genes, vals1 = _tcga_signature_scores(QUERY, TCGA_markerfile_df, gene1)

gene2 = 'TCGA2'
signature_genes, detected_genes, vals2 = _tcga_signature_scores(QUERY, TCGA_markerfile_df, gene2)

gene3 = 'TCGA3'
signature_genes, detected_genes, vals3 = _tcga_signature_scores(QUERY, TCGA_markerfile_df, gene3)

gene4 = 'TCGA4'
signature_genes, detected_genes, vals4 = _tcga_signature_scores(QUERY, TCGA_markerfile_df, gene4)

# One score column per TCGA class, indexed like the queried cells.
X = pd.DataFrame(data = {gene1: vals1, gene2: vals2, gene3: vals3, gene4: vals4}, index = QUERY.index)

In [None]:
# Train BGMM Model:
# Fit a 4-component Bayesian Gaussian mixture on the four TCGA score columns
# and predict a component label for every row.
tcga_score_frame = X[['TCGA1','TCGA2','TCGA3','TCGA4']]
bgmm = BayesianGaussianMixture(covariance_type='spherical', n_components=4, random_state=37)
bgmm.fit(tcga_score_frame)
clusters = bgmm.predict(tcga_score_frame)

# Create T-SNE:
# 2-D embedding of the same four score columns, stored next to the scores.
X_embedded = manifold.TSNE(n_components=2, random_state=2).fit_transform(tcga_score_frame)
X['tsne1'], X['tsne2'] = X_embedded[:,0], X_embedded[:,1]

# Raw component labels; these are mapped to TCGA class names in a later cell.
X['Assignment'] = clusters

In [None]:
# Make Class Assignments
# Greedy one-to-one matching of mixture components to TCGA classes: repeatedly
# take the largest remaining (component, class) mean score, record the match,
# and retire that component and that class.
lut = {}
cluster_labels = np.unique(X['Assignment'])
mean_array = np.array([[X[X['Assignment']==ii][x].mean() for x in TCGA_CLASS_LIST]
                       for ii in cluster_labels], dtype=float)

for i in range(min(mean_array.shape)):
    max_ind = np.unravel_index(np.argmax(mean_array, axis=None), mean_array.shape)
    # Key the lookup by the actual component label, not by the row position:
    # the two differ whenever a component received no cells and is therefore
    # missing from cluster_labels.
    lut[cluster_labels[max_ind[0]]] = TCGA_CLASS_LIST[max_ind[1]]
    # Retire with -inf rather than 0 so a used row/column can never win again,
    # even when the remaining means are zero or negative.
    mean_array[:,max_ind[1]] = -np.inf
    mean_array[max_ind[0],:] = -np.inf

# MAP ASSIGNMENTS
X['Assignment'] = X['Assignment'].map(lut)

sns.set(font_scale=2.5)
sns.set_style("white")
g = sns.pairplot(X[['tsne1','tsne2','Assignment']],
                 hue = 'Assignment',
                 palette={"TCGA1": "#0000FF",
                          "TCGA2": "#A4D3FC",
                          "TCGA3": "#FF7DC2",
                          "TCGA4": "#FF0000",},
                 diag_kind = 'kde', aspect = 1, size = 10)

# SAVE FIGURE
# The label has no placeholders, so the former `.format(meta, subset_type,
# method, metric)` call was a no-op that could only fail with a NameError.
figure_label = 'HARBOUR_PRIMARY_BayesianGMM_TCGA'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

g.savefig(fn + '.png', dpi=400, transparent=True)
g.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# Bind a second name to the same DataFrame object (no copy is made), so any
# later in-place change to X is also visible through X_test.
X_test = X

In [None]:
# UPDATE DIRECTORY
# Keep the labels currently in X.Assignment under a second column name,
# 'TCGA_Assignment', on X itself.
X['TCGA_Assignment'] = X.Assignment
# NOTE(review): the two lines below are disabled, so this cell neither creates
# METADATA_HARBOUR_PRIMARY nor saves it to the HDF5 store.
#METADATA_HARBOUR_PRIMARY = X[['TCGA_Assignment']].copy()
#exec('h5_data_harbour.save(METADATA_HARBOUR_PRIMARY, \'METADATA_HARBOUR_PRIMARY\')')

In [None]:
# Divide every row of the TCGA_CLASS_LIST columns by that row's sum.
# DataFrame.div returns a new frame, so X itself is left untouched.
X_norm = X[TCGA_CLASS_LIST].div(X[TCGA_CLASS_LIST].sum(axis=1), axis=0)

#### MIXTURE MODEL FOR GEP CLASSIFICATION

In [None]:
subset_type = 'TUMOR'
# LOAD GENE LIST FROM CSV
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets,header='infer')
# Upper-case every entry as a string; missing values therefore become 'NAN'.
genesets = genesets.apply(lambda x: x.astype(str).str.upper())
# Number of gene-set columns. DataFrame.shape replaces the bare `shape(...)`,
# which only exists when the notebook was started with a pylab star-import.
print(genesets.shape[1])

In [None]:
# GEP (CASTLE) SIGNATURE SCORES PER CELL (HARBOUR, PRIMARY ONLY)
# The frame to score is bound directly instead of through exec() on a
# formatted string.
datatype = 'INDF_HARBOUR_PRIMARY'
QUERY = INDF_HARBOUR_PRIMARY


def _castle_signature_scores(query, geneset_table, signature_name):
    """Score every row of *query* for one gene-set column of *geneset_table*.

    Entries equal to 'NAN' (case-insensitive) are skipped. Detected genes are
    those also present as columns of *query*, taken in *query*'s column order
    so the result does not depend on set iteration order. Returns
    ``(signature_genes, detected_genes, scores)`` where *scores* is the
    NaN-aware mean over the detected gene columns.
    """
    signature = [x for x in geneset_table[signature_name].values if str(x).upper() != 'NAN']
    wanted = set(signature)
    detected = [gene for gene in query.columns.unique() if gene in wanted]
    return signature, detected, np.nanmean(query[detected], axis=1)


gene1 = 'Castle 1'
signature_genes, detected_genes, vals1 = _castle_signature_scores(QUERY, genesets, gene1)

gene2 = 'Castle 2'
signature_genes, detected_genes, vals2 = _castle_signature_scores(QUERY, genesets, gene2)

# One score column per Castle class, indexed like the queried cells.
X = pd.DataFrame(data = {gene1: vals1, gene2: vals2}, index = QUERY.index)

In [None]:
# GEP (CASTLE) SCORES + 2-COMPONENT BAYESIAN GMM (HARBOUR, PRIMARY ONLY)
# The frame to score is bound directly instead of through exec() on a
# formatted string.
datatype = 'INDF_HARBOUR_PRIMARY'
QUERY = INDF_HARBOUR_PRIMARY


def _castle_signature_scores(query, geneset_table, signature_name):
    """Score every row of *query* for one gene-set column of *geneset_table*.

    Entries equal to 'NAN' (case-insensitive) are skipped. Detected genes are
    those also present as columns of *query*, taken in *query*'s column order
    so the result does not depend on set iteration order. Returns
    ``(signature_genes, detected_genes, scores)`` where *scores* is the
    NaN-aware mean over the detected gene columns.
    """
    signature = [x for x in geneset_table[signature_name].values if str(x).upper() != 'NAN']
    wanted = set(signature)
    detected = [gene for gene in query.columns.unique() if gene in wanted]
    return signature, detected, np.nanmean(query[detected], axis=1)


gene1 = 'Castle 1'
signature_genes, detected_genes, vals1 = _castle_signature_scores(QUERY, genesets, gene1)

gene2 = 'Castle 2'
signature_genes, detected_genes, vals2 = _castle_signature_scores(QUERY, genesets, gene2)

X = pd.DataFrame(data = {gene1: vals1, gene2: vals2}, index = QUERY.index)

bgmm = BayesianGaussianMixture(covariance_type='diag', n_components=2, random_state=1)
bgmm.fit(X[['Castle 1','Castle 2']])
clusters = bgmm.predict(X[['Castle 1','Castle 2']])
X['Assignment'] = clusters

# Make Class Assignments
# A component is 'Castle 1' when its mean Castle 1 score exceeds its mean
# Castle 2 score, otherwise 'Castle 2' (both components may get the same name).
lut = {}
for ii in np.unique(X['Assignment']):
    members = X[X['Assignment']==ii]
    if members['Castle 1'].mean() > members['Castle 2'].mean():
        lut[ii] = 'Castle 1'
    else:
        lut[ii] = 'Castle 2'

X['Assignment'] = X['Assignment'].map(lut)

sns.set(font_scale=2.5)
sns.set_style("white")
g = sns.pairplot(X[['Castle 1','Castle 2','Assignment']],
                 hue = 'Assignment',
                 palette={"Castle 1": "#0000FF", "Castle 2": "#FF0000", 'Mixed': '#000000'},
                 diag_kind = 'kde', aspect = 1, size = 10)

# SAVE FIGURE
figure_label = 'HARBOUR_PRIMARY_BayesianGMM'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

# exist_ok avoids the check-then-create race of `if not exists: makedirs`.
d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

# Saving is currently disabled for this figure; only the target path is printed.
#g.savefig(fn + '.png', dpi=400, transparent=True)
#g.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# UPDATE DIRECTORY
# Copy the labels currently held in X.Assignment onto the primary-only
# metadata table (pandas aligns the assigned Series on the index).
METADATA_HARBOUR_PRIMARY['Assignment'] = X.Assignment
# The HDF5 save is disabled, so the update above lives only in memory.
#exec('h5_data_harbour.save(METADATA_HARBOUR_PRIMARY, \'METADATA_HARBOUR_PRIMARY\')')

###### COLOR GEP PLOT BY TCGA CLASS

In [None]:
# Attach the TCGA class stored in the primary-only metadata to X (aligned on
# the index) so the Castle score space can be coloured by TCGA class.
X['TCGA_Class'] = METADATA_HARBOUR_PRIMARY.TCGA_Assignment

sns.set(font_scale=2.5)
sns.set_style("white")
g = sns.pairplot(X[['Castle 1','Castle 2','Assignment', 'TCGA_Class']],
                 hue = 'TCGA_Class',
                 palette={"TCGA1": "#0000FF",
                          "TCGA2": "#A4D3FC",
                          "TCGA3": "#FF7DC2",
                          "TCGA4": "#FF0000",},
                 diag_kind = 'kde', aspect = 1, size = 10)

# SAVE FIGURE
# The label has no placeholders, so the former `.format(meta, subset_type,
# method, metric)` call was a no-op that could only fail with a NameError.
figure_label = 'HARBOUR_PRIMARY_FULL_TCGA_colored'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

g.savefig(fn + '.png', dpi=400, transparent=True)
g.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# Same TCGA-coloured pair plot as the figure saved under this label elsewhere
# in the notebook, but without the 'Assignment' column in the plotted frame.
X['TCGA_Class'] = METADATA_HARBOUR_PRIMARY.TCGA_Assignment

sns.set(font_scale=2.5)
sns.set_style("white")
g = sns.pairplot(X[['Castle 1','Castle 2','TCGA_Class']],
                 hue = 'TCGA_Class',
                 palette={"TCGA1": "#0000FF",
                          "TCGA2": "#A4D3FC",
                          "TCGA3": "#FF7DC2",
                          "TCGA4": "#FF0000",},
                 diag_kind = 'kde', aspect = 1, size = 10)

# SAVE FIGURE
# The label has no placeholders, so the former `.format(meta, subset_type,
# method, metric)` call was a no-op that could only fail with a NameError.
# NOTE(review): re-enabling the saves below would write to the path
# 'HARBOUR_PRIMARY_FULL_TCGA_colored', replacing any figure already there.
figure_label = 'HARBOUR_PRIMARY_FULL_TCGA_colored'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

#g.savefig(fn + '.png', dpi=400, transparent=True)
#g.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# DOWNSAMPLED TCGA CLASS FRACTIONS PER PATIENT
# For every repetition: draw n cells per patient (with replacement), refit a
# 4-component Bayesian GMM on the TCGA scores, match components to TCGA
# classes, and record the per-patient fraction of each class.
# NOTE(review): this cell needs X to contain the TCGA_CLASS_LIST score columns;
# the cells directly above rebuild X from Castle scores only - confirm the
# intended execution order.
# NOTE: no random_state is passed to .sample(), so the draws depend on the
# state of NumPy's global RNG.
n = 2000
reps = np.arange(10)

master_groupby = 'Patient'
# Position of the grouping level in X's index (kept as a notebook global).
ix = [ind for ind,name in enumerate(X.index.names) if name==master_groupby][0]
patient_values = X.index.get_level_values(master_groupby)
patients = np.unique(patient_values)
tcga_classes = ['TCGA1', 'TCGA2', 'TCGA3', 'TCGA4']
fractions_by_class = {cls: np.zeros((len(reps),len(patients))) for cls in tcga_classes}

for rep in reps:
    # RANDOMLY SAMPLE SAME NUMBER OF CELLS PER PATIENT
    # One concat replaces repeated DataFrame.append calls, which copied the
    # growing frame on every iteration.
    SUBSET = pd.concat([X.loc[patient_values == class_selection].sample(n=n, replace=True)
                        for class_selection in patients])

    bgmm = BayesianGaussianMixture(covariance_type='spherical', n_components=4, random_state=37)
    bgmm.fit(SUBSET[TCGA_CLASS_LIST])
    clusters = bgmm.predict(SUBSET[TCGA_CLASS_LIST])
    SUBSET['Assignment'] = clusters

    # Make Class Assignments
    # Greedy one-to-one matching: repeatedly take the largest remaining
    # (component, class) mean score, then retire that component and class.
    lut = {}
    cluster_labels = np.unique(SUBSET['Assignment'])
    mean_array = np.array([[SUBSET[SUBSET['Assignment']==ii][x].mean() for x in TCGA_CLASS_LIST]
                           for ii in cluster_labels], dtype=float)

    for i in range(min(mean_array.shape)):
        max_ind = np.unravel_index(np.argmax(mean_array, axis=None), mean_array.shape)
        # Key by the actual component label, not the row position: they differ
        # whenever a component received no cells in this repetition.
        lut[cluster_labels[max_ind[0]]] = TCGA_CLASS_LIST[max_ind[1]]
        # -inf (not 0) so a retired row/column can never be selected again,
        # even when the remaining means are zero or negative.
        mean_array[:,max_ind[1]] = -np.inf
        mean_array[max_ind[0],:] = -np.inf

    # MAP ASSIGNMENTS
    SUBSET['Assignment'] = SUBSET['Assignment'].map(lut)

    # Update multi-index
    new_index = pd.MultiIndex.from_tuples(list(zip(SUBSET.index.get_level_values('Sample ID'),
                                                   SUBSET.index.get_level_values('Legend'),
                                                   SUBSET.index.get_level_values('Patient'),
                                                   SUBSET.index.get_level_values('Cell ID'),
                                                   SUBSET['Assignment'])),
                                          names=['Sample ID','Legend', 'Patient','Cell ID',
                                                 'Assignment'])
    SUBSET = pd.DataFrame(data = SUBSET.values, columns = SUBSET.columns, index = new_index)

    # TCGA ASSIGNMENT PER PATIENT
    meta = 'Assignment'
    tmp = SUBSET.groupby(level=['Patient', meta]).size().unstack().fillna(0)
    # Guarantee all four class columns exist: a class that received no cells
    # in this repetition would otherwise raise KeyError below.
    tmp = tmp.reindex(columns=tcga_classes, fill_value=0)

    # SAVE PER DOWNSAMPLE
    cell_type_fraction = tmp.div(tmp.sum(axis=1),axis=0)
    for cls in tcga_classes:
        fractions_by_class[cls][rep,:] = cell_type_fraction[cls].values

# Keep the per-class arrays available under their notebook-level names.
fraction_TCGA1, fraction_TCGA2, fraction_TCGA3, fraction_TCGA4 = (
    fractions_by_class[cls] for cls in tcga_classes)

# CONVERT TO PANDAS DATAFRAME (long format: one row per repetition x patient)
patient_columns = ['{}_{}'.format(master_groupby,ind) for ind in patients]
boxplot_frames = {}
for cls in tcga_classes:
    class_frame = pd.DataFrame(data = fractions_by_class[cls], columns = patient_columns)
    class_frame = class_frame.stack().rename_axis(('Rep', 'Patient')).reset_index(name='Fraction')
    class_frame['Class'] = [cls]*len(class_frame)
    boxplot_frames[cls] = class_frame

(boxplot_data_fraction_TCGA1, boxplot_data_fraction_TCGA2,
 boxplot_data_fraction_TCGA3, boxplot_data_fraction_TCGA4) = (
    boxplot_frames[cls] for cls in tcga_classes)

boxplot_data = pd.concat([boxplot_frames[cls] for cls in tcga_classes])
boxplot_data['Patient'] = [x.replace('Patient','MSK') for x in boxplot_data.Patient]

# Floor small fractions so they can be drawn on the log axis.
# NOTE(review): the threshold is 0.001 but the floor is 1e-4, so non-zero
# fractions up to 1e-3 are also pushed down to 1e-4; the comparable cell that
# resamples METADATA_HARBOUR_PRIMARY uses 0.0001 - confirm which is intended.
#boxplot_data[boxplot_data.Fraction < 0.0001] = 0
boxplot_data.Fraction = [x if x > 0.001 else 1e-4 for x in boxplot_data.Fraction]


# BAR PLOT (mean with 95% CI) OVERLAID WITH ONE POINT PER REPETITION
fig = plt.figure(figsize = (10,4))
ax = plt.gca()

sns.barplot(data=boxplot_data,x = 'Patient',
                y = 'Fraction', hue = 'Class', ci=95, capsize=.1,
                palette={"TCGA1": "#0000FF",
                         "TCGA2": "#A4D3FC",
                         "TCGA3": "#FF7DC2",
                         "TCGA4": "#FF0000",},
                linewidth=1, ax = ax,
                alpha=1)

g = sns.stripplot(data=boxplot_data,x = 'Patient',
                y = 'Fraction', hue = 'Class',
                palette={"TCGA1": "#0000FF",
                         "TCGA2": "#A4D3FC",
                         "TCGA3": "#FF7DC2",
                         "TCGA4": "#FF0000",},
                linewidth = 0.5, edgecolor='white',
                ax = ax, alpha=0.7,
                dodge=True)

fig.get_axes()[0].set_yscale('log')

plt.ylim((1e-4,1.15))
g.set_ylabel("Cell Type Fraction",fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
g.set_xticklabels(ax.get_xticklabels(),rotation=90)
sns.despine()

ax.legend_.remove()
#plt.tight_layout()

# SAVE FIGURE
# NOTE(review): the cell that resamples METADATA_HARBOUR_PRIMARY saves under
# the same label and will overwrite this figure.
figure_label = 'TCGA_Distribution_by_Patient_LOG_HARBOUR_PRIMARY'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/' + figure_label
os.makedirs(os.path.dirname(fn), exist_ok=True)
plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# RESAMPLED TCGA CLASS FRACTIONS PER PATIENT (STORED ASSIGNMENTS, NO REFIT)
# For every repetition, draw n cells per patient (with replacement) from the
# primary-only metadata and record the fraction of each TCGA class.
# NOTE: no random_state is passed to .sample(), so the draws depend on the
# state of NumPy's global RNG.
n = 1000
reps = np.arange(10)

boxplot_patient_list = []
boxplot_fract_list = []
boxplot_class_list = []
master_groupby = 'Patient'

# Select patients by level name. The previous lambda indexed the index tuples
# with `ix`, a position computed in another cell from a different frame.
patient_values = METADATA_HARBOUR_PRIMARY.index.get_level_values(master_groupby)
patients = np.unique(patient_values)

for rep in reps:
    # Resample every patient once per repetition; the previous version rebuilt
    # the full resample inside the per-patient loop and used one patient of it.
    SUBSET = pd.concat([METADATA_HARBOUR_PRIMARY.loc[patient_values == class_selection]
                        .sample(n=n, replace=True) for class_selection in patients])
    subset_patients = SUBSET.index.get_level_values('Patient')

    for patient in patients:
        # Count the TCGA labels. The previous code counted the 'Assignment'
        # column and then reindexed by TCGA labels, so the two label sets
        # did not correspond.
        class_counts = SUBSET.loc[subset_patients == patient].\
                       TCGA_Assignment.value_counts().reindex(TCGA_CLASS_LIST, fill_value=0)
        tmp = list(class_counts.values / class_counts.values.sum())

        boxplot_patient_list.extend([patient] * len(TCGA_CLASS_LIST))
        boxplot_fract_list.extend(tmp)
        boxplot_class_list.extend(TCGA_CLASS_LIST)

boxplot_data = pd.DataFrame()
boxplot_data['Patient'] = boxplot_patient_list
boxplot_data['Fraction'] = boxplot_fract_list
boxplot_data['Class'] = boxplot_class_list

# Floor fractions at 1e-4 so zero counts can be drawn on the log axis.
boxplot_data.Fraction = [x if x > 0.0001 else 1e-4 for x in boxplot_data.Fraction]

# Set plot
fig = plt.figure(figsize = (7,4))
ax = plt.gca()

# Plot bar plot (mean with 95% CI) overlaid with one point per repetition
sns.barplot(data=boxplot_data,x = 'Patient',
                y = 'Fraction', hue = 'Class', ci=95, capsize=.15,
                palette={"TCGA1": "#0000FF",
                         "TCGA2": "#A4D3FC",
                         "TCGA3": "#FF7DC2",
                         "TCGA4": "#FF0000",},
                linewidth=1, ax = ax,
                alpha=1,
                errcolor='black',
                errwidth=1)

g = sns.stripplot(data=boxplot_data,x = 'Patient',
                y = 'Fraction', hue = 'Class',
                palette={"TCGA1": "#0000FF",
                         "TCGA2": "#A4D3FC",
                         "TCGA3": "#FF7DC2",
                         "TCGA4": "#FF0000",},
                linewidth = 0.5, edgecolor='white',
                ax = ax, alpha=0.7,
                dodge=True)

fig.get_axes()[0].set_yscale('log')

plt.ylim((1e-4,1.15))
ax.set_ylabel("Cell Type Fraction",fontsize=10)
ax.set_xlabel(" ",fontsize=10,rotation = 90)
ax.tick_params(labelsize=10)
ax.set_xticklabels(boxplot_data['Patient'].unique(),rotation=90)
sns.despine()

ax.legend_.remove()
plt.tight_layout()

# SAVE FIGURE
# NOTE(review): the downsampled-GMM cell saves under this same label, so
# whichever cell runs last overwrites the other's figure.
figure_label = 'TCGA_Distribution_by_Patient_LOG_HARBOUR_PRIMARY'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/' + figure_label
os.makedirs(os.path.dirname(fn), exist_ok=True)
plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# Display how many rows of X carry each value of the 'Patient' index level.
X.index.get_level_values('Patient').value_counts()

###### COLOR BY CELLTYPE

In [None]:
# Make Class Assignments
# Expose the 'cellType' index level as an ordinary column so it can be used
# as the hue variable of the pair plot.
X['cellType'] = X.index.get_level_values('cellType').values

celltype_palette = {
    "Class 1A Primary Tumor Cells": "#2F75DF",
    "Class 1B PRAME+ Met Tumor Cells": "#21C108",
    "Class 2 PRAME- Primary Tumor Cells": "#B696B9",
    "Class 2 PRAME+ Primary Tumor Cells": "#D17070",
}
celltype_pairplot_columns = ['Castle 1', 'Castle 2', 'Assignment', 'cellType']

sns.set(font_scale=2.5)
sns.set_style("white")
g = sns.pairplot(
    X[celltype_pairplot_columns],
    hue='cellType',
    palette=celltype_palette,
    diag_kind='kde',
    aspect=1,
    size=10,
)

# SAVE FIGURE
# NOTE(review): 'HABROUR' looks like a typo of 'HARBOUR'; it is left unchanged
# here because it determines the output file name.
figure_label = 'HABROUR_PRIMARY_FULL_given_annotations'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

# Create the output directory on first use.
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

# PNG is written with a transparent background, PDF with the default one.
for extension, savefig_options in (('.png', {'transparent': True}), ('.pdf', {})):
    g.savefig(fn + extension, dpi=400, **savefig_options)
print(fn)

###### CONTINUE WITH INTRATUMOR HETEROGENEITY FOR GEP

In [None]:
plt.figure(figsize = (5,5))
# NOTE(review): despine() is called before any axes are created on this new
# figure, so it probably has no visible effect here - confirm intent.
sns.despine()
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
ax = plt.gca()

# Split the cells once per mixture-model class instead of re-filtering X for
# every plotting call.
castle1_cells = X[X['Assignment']=='Castle 1']
castle2_cells = X[X['Assignment']=='Castle 2']

# Castle 1 cells: scatter plus filled 2-D KDE (x = 'Castle 2', y = 'Castle 1').
ax = plt.scatter(castle1_cells['Castle 2'], castle1_cells['Castle 1'],
                 s = 1, c ='blue',alpha = 0.5)
ax = sns.kdeplot(castle1_cells['Castle 2'], castle1_cells['Castle 1'],
                 cmap="Blues", shade=True, shade_lowest=False,n_levels = 20, alpha = 0.5)

# Castle 2 cells: same layers in red.
ax = plt.scatter(castle2_cells['Castle 2'], castle2_cells['Castle 1'],
                 s = 1, c ='red',alpha = 0.5)
ax = sns.kdeplot(castle2_cells['Castle 2'], castle2_cells['Castle 1'],
                 cmap="Reds", shade=True, shade_lowest=False, n_levels = 20, alpha = 0.5)


# SAVE FIGURE
# The label has no placeholders, so the former `.format(meta, subset_type,
# method, metric)` call was a no-op that could only fail with a NameError.
figure_label = 'HARBOUR_PRIMARY_Bayesian_GMM'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# Update multi-index
# Rebuild X's index from four of its existing levels plus the per-cell
# 'Assignment' label, so that the label can be used as a groupby level.
new_index = pd.MultiIndex.from_tuples(list(zip(X.index.get_level_values('Sample ID'),
                                               X.index.get_level_values('Legend'),
                                               X.index.get_level_values('Patient'),
                                               X.index.get_level_values('Cell ID'),
                                               X['Assignment'])),
                                  names=['Sample ID','Legend', 'Patient','Cell ID','Assignment'])
X = pd.DataFrame(data = X.values, columns = X.columns, index = new_index)

# CLASS ASSIGNMENT BY PATIENT
# Cell counts per (patient, class); classes missing for a patient become 0.
# This is a direct call: the previous exec() of a formatted string did
# exactly this, only less readably.
meta = 'Assignment'
tmp = X.groupby(level=['Patient', meta]).size().unstack().fillna(0)
colors = ["#0000FF","#FF0000"]

plt.figure(figsize = (5,3))
ax = plt.gca()
# Normalise each patient's row to fractions and draw stacked horizontal bars.
tmp.div(tmp.sum(axis=1),axis=0).plot.barh(stacked=True, color=colors, ax = ax, width = 0.95)
ax.legend_.remove()
ax.set_frame_on(False)
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.xlabel('Cell Fraction', fontsize=12)
plt.ylabel('Patient', fontsize=12)

# SAVE FIGURE
figure_label = 'GEP_Distribution_by_Patient_HARBOUR_PRIMARY'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label
# Make sure the output directory exists before saving.
os.makedirs(os.path.dirname(fn), exist_ok=True)
plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# Load the '/METADATA_HARBOUR' table through the h5_data_harbour loader
# (direct assignment; exec() of a constant string added nothing).
METADATA_HARBOUR = h5_data_harbour.load('/METADATA_HARBOUR')

In [None]:
# Load the '/METADATA_HARBOUR_PRIMARY' table through the h5_data_harbour loader
# (direct assignment; exec() of a constant string added nothing).
METADATA_HARBOUR_PRIMARY = h5_data_harbour.load('/METADATA_HARBOUR_PRIMARY')

In [None]:
# Copy the GEP ('Assignment') and TCGA ('TCGA_Assignment') labels from the
# combined-model table METADATA_HARBOUR onto METADATA_HARBOUR_PRIMARY.
# Rows are matched on the 'Cell ID' index level, using only METADATA_HARBOUR
# rows whose 'Legend' level equals 'primary'.
#
# The original re-filtered METADATA_HARBOUR and scanned all Cell IDs once per
# target row; here the filtered table and a Cell ID -> label lookup are built
# once and reused for both columns.
_primary_meta = METADATA_HARBOUR.loc[
    METADATA_HARBOUR.index.get_level_values('Legend') == 'primary']
_primary_cell_ids = _primary_meta.index.get_level_values('Cell ID')
# Keep only the first row per Cell ID, matching the original `.values[0]`.
_first_hit = ~_primary_cell_ids.duplicated()
_target_cell_ids = METADATA_HARBOUR_PRIMARY.index.get_level_values('Cell ID')

# Fail loudly (as the original lookup did) when a cell has no match, instead
# of silently writing NaN labels.
_missing_cell_ids = _target_cell_ids[~_target_cell_ids.isin(_primary_cell_ids)]
if len(_missing_cell_ids) > 0:
    raise KeyError('Cell IDs missing from primary rows of METADATA_HARBOUR: {}'
                   .format(list(_missing_cell_ids[:5])))

for _column in ('Assignment', 'TCGA_Assignment'):
    _lookup = pd.Series(_primary_meta[_column].values[_first_hit],
                        index=_primary_cell_ids[_first_hit])
    # Positional assignment: one label per row of METADATA_HARBOUR_PRIMARY.
    METADATA_HARBOUR_PRIMARY[_column] = _lookup.reindex(_target_cell_ids).values

In [None]:
# BOOTSTRAPPED GEP CLASS FRACTIONS PER PATIENT (HARBOUR PRIMARY)
# For every replicate and patient, each patient group is resampled to `n`
# cells with replacement, and the share of the patient's resampled cells in
# each GEP class is recorded.
n = 1000
reps = np.arange(10)

boxplot_patient_list = []
boxplot_fract_list = []
boxplot_class_list = []
GEP_CLASS_LIST = ['Castle 1', 'Castle 2']
master_groupby = 'Patient'

# Select rows through the named index level rather than a positional `x[ix]`
# lookup, so the cell no longer needs an `ix` defined in some other cell.
group_labels = METADATA_HARBOUR_PRIMARY.index.get_level_values(master_groupby)
group_values = np.unique(group_labels)
patients = np.unique(METADATA_HARBOUR_PRIMARY.index.get_level_values('Patient'))

for rep in reps:
    for patient in patients:
        # A fresh resample is drawn for every (replicate, patient) pair.
        SUBSET = pd.concat([
            METADATA_HARBOUR_PRIMARY.loc[group_labels == class_selection]
            .sample(n=n, replace=True)
            for class_selection in group_values])

        # Reindex on the class list itself so a class absent from the
        # resample counts as 0 instead of raising KeyError.
        counts = SUBSET.loc[SUBSET.index.get_level_values('Patient') == patient].\
            Assignment.value_counts().reindex(GEP_CLASS_LIST, fill_value=0)
        tmp = (counts / counts.sum()).tolist()

        boxplot_patient_list.extend([patient] * len(GEP_CLASS_LIST))
        boxplot_fract_list.extend(tmp)
        boxplot_class_list.extend(GEP_CLASS_LIST)

boxplot_data = pd.DataFrame()
boxplot_data['Patient'] = list(boxplot_patient_list)
boxplot_data['Fraction'] = boxplot_fract_list
boxplot_data['Class'] = boxplot_class_list

# Floor fractions at 1e-4 so that zeros remain drawable on the log axis.
boxplot_data['Fraction'] = [x if x > 0.0001 else 1e-4 for x in boxplot_data['Fraction']]

# Set plot
fig = plt.figure(figsize=(7, 4))
ax = plt.gca()
gep_palette = {"Castle 1": "#0000FF", "Castle 2": "#FF0000", 'Mixed': '#000000'}

# Bars show the mean over replicates with a 95% CI ...
sns.barplot(data=boxplot_data, x='Patient',
            y='Fraction', hue='Class', ci=95, capsize=0.15,
            palette=gep_palette,
            linewidth=1, ax=ax, alpha=1,
            errcolor='black',
            errwidth=1)

# ... and the individual replicates are overlaid as points.
g = sns.stripplot(data=boxplot_data, x='Patient',
                  y='Fraction', hue='Class',
                  palette=gep_palette,
                  linewidth=0.5, edgecolor='white',
                  ax=ax, alpha=0.7,
                  dodge=True)

ax.set_yscale('log')

plt.ylim((1e-4, 1.15))
ax.set_ylabel("Cell Type Fraction", fontsize=10)
ax.set_xlabel(" ", fontsize=10, rotation=90)
ax.tick_params(labelsize=10)
ax.set_xticklabels(boxplot_data['Patient'].unique(), rotation=90)
sns.despine()

ax.legend_.remove()
plt.tight_layout()

# SAVE FIGURE
figure_label = 'GEP_Distribution_by_Patient_LOG_HARBOUR_PRIMARY'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/' + figure_label
os.makedirs(os.path.dirname(fn), exist_ok=True)
plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# BOOTSTRAPPED GEP CLASS FRACTIONS PER PATIENT (MSK + HARBOUR PRIMARY)
# For every replicate and patient, each patient group of the cohort is
# resampled to `n` cells with replacement, and the share of the patient's
# resampled cells in each GEP class is recorded.
n = 500
reps = np.arange(20)

boxplot_patient_list = []
boxplot_fract_list = []
boxplot_class_list = []
GEP_CLASS_LIST = ['Castle 1', 'Castle 2']
master_groupby = 'Patient'

# MSK data first, then Harbour data (same order as the two former loops).
for COHORT in (METADATA_TUMOR, METADATA_HARBOUR_PRIMARY):
    # Select rows through the named index level rather than a positional
    # `x[ix]` lookup, so no `ix` from another cell is required.
    group_labels = COHORT.index.get_level_values(master_groupby)
    group_values = np.unique(group_labels)
    # Builtin sorted() replaces a bare `sort` that no visible import binds.
    patients = sorted(COHORT.index.get_level_values('Patient').unique().tolist())

    for rep in reps:
        for patient in patients:
            # A fresh resample is drawn for every (replicate, patient) pair.
            SUBSET = pd.concat([
                COHORT.loc[group_labels == class_selection].sample(n=n, replace=True)
                for class_selection in group_values])

            # Reindex on the class list itself so a class absent from the
            # resample counts as 0 instead of raising KeyError.
            counts = SUBSET.loc[SUBSET.index.get_level_values('Patient') == patient].\
                Assignment.value_counts().reindex(GEP_CLASS_LIST, fill_value=0)
            tmp = (counts / counts.sum()).tolist()

            boxplot_patient_list.extend([patient] * len(GEP_CLASS_LIST))
            boxplot_fract_list.extend(tmp)
            boxplot_class_list.extend(GEP_CLASS_LIST)


boxplot_data = pd.DataFrame()
# Patient IDs that do not contain 'MM' receive an 'MSK_' prefix.
boxplot_data['Patient'] = ['MSK_'+x if 'MM' not in x else x for x in boxplot_patient_list]
boxplot_data['Fraction'] = boxplot_fract_list
boxplot_data['Class'] = boxplot_class_list

# Floor fractions at 1e-4 so that zeros remain drawable on the log axis.
boxplot_data['Fraction'] = [x if x > 0.0001 else 1e-4 for x in boxplot_data['Fraction']]


# PLOT
fig = plt.figure(figsize=(16, 6))
ax = plt.gca()
gep_palette = {"Castle 1": "#0000FF", "Castle 2": "#FF0000", 'Mixed': '#000000'}

# Bars show the mean over replicates with a 95% CI ...
sns.barplot(data=boxplot_data, x='Patient',
            y='Fraction', hue='Class', ci=95, capsize=0.15,
            palette=gep_palette,
            linewidth=1, ax=ax, alpha=1,
            errcolor='black',
            errwidth=1)

# ... and the individual replicates are overlaid as points.
g = sns.stripplot(data=boxplot_data, x='Patient',
                  y='Fraction', hue='Class',
                  palette=gep_palette,
                  linewidth=1, edgecolor='white',
                  s=10,
                  ax=ax, alpha=0.7,
                  dodge=True)

ax.set_yscale('log')

plt.ylim((1e-4, 1.15))
ax.set_ylabel("Cell Type Fraction", fontsize=10)
ax.set_xlabel(" ", fontsize=10, rotation=90)
ax.tick_params(labelsize=10)
ax.set_xticklabels(boxplot_data['Patient'].unique(), rotation=90)
sns.despine()

ax.legend_.remove()
plt.tight_layout()
plt.minorticks_off()

# SAVE FIGURE
figure_label = 'GEP_Distribution_by_Patient_LOG_COMBINED'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/' + figure_label
os.makedirs(os.path.dirname(fn), exist_ok=True)
plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# Display METADATA_TUMOR as the cell output (inspection only, no assignment).
METADATA_TUMOR

In [None]:
# BOOTSTRAPPED TCGA CLASS FRACTIONS PER PATIENT (MSK + HARBOUR PRIMARY)
# For every replicate and patient, each patient group of the cohort is
# resampled to `n` cells with replacement, and the share of the patient's
# resampled cells in each class of TCGA_CLASS_LIST is recorded.
n = 500
reps = np.arange(20)
# Positional index level previously used for row selection; no longer read in
# this cell but kept because other cells reference the global `ix`.
ix = 2

boxplot_patient_list = []
boxplot_fract_list = []
boxplot_class_list = []
master_groupby = 'Patient'

# MSK data first, then Harbour data (same order as the two former loops).
for COHORT in (METADATA_TUMOR, METADATA_HARBOUR_PRIMARY):
    # Select rows through the named index level instead of position `ix`.
    group_labels = COHORT.index.get_level_values(master_groupby)
    group_values = np.unique(group_labels)
    # Builtin sorted() replaces a bare `sort` that no visible import binds.
    patients = sorted(COHORT.index.get_level_values('Patient').unique().tolist())

    for rep in reps:
        for patient in patients:
            # A fresh resample is drawn for every (replicate, patient) pair.
            SUBSET = pd.concat([
                COHORT.loc[group_labels == class_selection].sample(n=n, replace=True)
                for class_selection in group_values])

            # Reindex on the class list itself so a class absent from the
            # resample counts as 0 instead of raising KeyError.
            counts = SUBSET.loc[SUBSET.index.get_level_values('Patient') == patient].\
                TCGA_Assignment.value_counts().reindex(TCGA_CLASS_LIST, fill_value=0)
            tmp = (counts / counts.sum()).tolist()

            boxplot_patient_list.extend([patient] * len(TCGA_CLASS_LIST))
            boxplot_fract_list.extend(tmp)
            boxplot_class_list.extend(TCGA_CLASS_LIST)


boxplot_data = pd.DataFrame()
# Patient IDs that do not contain 'MM' receive an 'MSK_' prefix.
boxplot_data['Patient'] = ['MSK_'+x if 'MM' not in x else x for x in boxplot_patient_list]
boxplot_data['Fraction'] = boxplot_fract_list
boxplot_data['Class'] = boxplot_class_list

# Floor fractions at 1e-4 so that zeros remain drawable on the log axis.
boxplot_data['Fraction'] = [x if x > 0.0001 else 1e-4 for x in boxplot_data['Fraction']]


# PLOT
fig = plt.figure(figsize=(16, 6))
ax = plt.gca()
tcga_palette = {"TCGA1": "#0000FF",
                "TCGA2": "#A4D3FC",
                "TCGA3": "#FF7DC2",
                "TCGA4": "#FF0000"}

# Bars show the mean over replicates with a 95% CI ...
sns.barplot(data=boxplot_data, x='Patient',
            y='Fraction', hue='Class', ci=95, capsize=0.15,
            palette=tcga_palette,
            linewidth=1, ax=ax, alpha=1,
            errcolor='black',
            errwidth=1)

# ... and the individual replicates are overlaid as points.
g = sns.stripplot(data=boxplot_data, x='Patient',
                  y='Fraction', hue='Class',
                  palette=tcga_palette,
                  linewidth=1, edgecolor='white',
                  s=10,
                  ax=ax, alpha=0.7,
                  dodge=True)

ax.set_yscale('log')

plt.ylim((1e-4, 1.15))
ax.set_ylabel("Cell Type Fraction", fontsize=10)
ax.set_xlabel(" ", fontsize=10, rotation=90)
ax.tick_params(labelsize=10)
ax.set_xticklabels(boxplot_data['Patient'].unique(), rotation=90)
sns.despine()

ax.legend_.remove()
plt.tight_layout()
plt.minorticks_off()

# SAVE FIGURE (saving is currently switched off; only the target path is printed)
figure_label = 'TCGA_Distribution_by_Patient_LOG_COMBINED'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/' + figure_label
#plt.savefig(fn + '.png', dpi=400, transparent=True)
#plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# Display how many rows of boxplot_data belong to each patient label.
boxplot_data.Patient.value_counts()

In [None]:
# GEP CLASS FRACTIONS PER PATIENT (HARBOUR PRIMARY, NO RESAMPLING)
boxplot_patient_list = []
boxplot_fract_list = []
boxplot_class_list = []

harbour_patient_labels = METADATA_HARBOUR_PRIMARY.index.get_level_values('Patient')
# Builtin sorted() replaces a bare `sort` that no visible import binds.
for patient in sorted(harbour_patient_labels.unique().tolist()):
    # Reindex on the class list itself so a class absent from the table
    # counts as 0 instead of raising KeyError.
    counts = METADATA_HARBOUR_PRIMARY.loc[harbour_patient_labels == patient].\
        Assignment.value_counts().reindex(GEP_CLASS_LIST, fill_value=0)
    tmp = (counts / counts.sum()).tolist()

    boxplot_patient_list.extend([patient] * len(GEP_CLASS_LIST))
    boxplot_fract_list.extend(tmp)
    boxplot_class_list.extend(GEP_CLASS_LIST)

boxplot_data = pd.DataFrame()
boxplot_data['Patient'] = boxplot_patient_list
boxplot_data['Fraction'] = boxplot_fract_list
boxplot_data['Class'] = boxplot_class_list

# BAR PLOT (the earlier "violin plot" comments were stale)
fig = plt.figure(figsize=(12, 5))
ax = plt.gca()

# One value per patient/class, so each bar is that single fraction.
g = sns.barplot(data=boxplot_data, x='Patient',
                y='Fraction', hue='Class', ci=95, capsize=.2,
                palette={"Castle 1": "#0000FF", "Castle 2": "#FF0000", 'Mixed': '#000000'},
                linewidth=1, ax=ax)
ax.set_yscale('log')

# Only the upper limit is set: a lower limit of 0 is invalid on a log axis and
# matplotlib ignored it, so the effective limits are unchanged.
plt.ylim(top=1)
g.set_ylabel("", fontsize=10)
g.set_xlabel(" ", fontsize=10, rotation=90)
g.tick_params(labelsize=18)
g.set_xticklabels(ax.get_xticklabels(), rotation=90)
sns.despine()
# NOTE(review): set_style is called after drawing, so it presumably only
# affects figures created later - confirm this is intended.
sns.set_style('white')
plt.tick_params(size=0)

ax.legend_.remove()
plt.tight_layout()
g.axis(True)

# SAVE FIGURE (saving is currently switched off; only the target path is printed)
figure_label = 'GEP_Distribution_COMBINED'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label
#plt.savefig(fn + '.png', dpi=400, transparent=True)
#plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# TCGA CLASS FRACTIONS PER PATIENT (MSK + HARBOUR PRIMARY, NO RESAMPLING)
boxplot_patient_list = []
boxplot_fract_list = []
boxplot_class_list = []

# MSK patients: kept in order of first appearance, labels prefixed with 'MSK-'.
msk_patient_labels = METADATA_TUMOR.index.get_level_values('Patient')
for patient in msk_patient_labels.unique():
    # Reindex on the class list itself so a class absent from the table
    # counts as 0 instead of raising KeyError.
    counts = METADATA_TUMOR.loc[msk_patient_labels == patient].\
        TCGA_Assignment.value_counts().reindex(TCGA_CLASS_LIST, fill_value=0)
    tmp = (counts / counts.sum()).tolist()

    boxplot_patient_list.extend(['MSK-' + patient] * len(TCGA_CLASS_LIST))
    boxplot_fract_list.extend(tmp)
    boxplot_class_list.extend(TCGA_CLASS_LIST)

# Harbour patients: sorted, labels unchanged.
harbour_patient_labels = METADATA_HARBOUR_PRIMARY.index.get_level_values('Patient')
# Builtin sorted() replaces a bare `sort` that no visible import binds.
for patient in sorted(harbour_patient_labels.unique().tolist()):
    counts = METADATA_HARBOUR_PRIMARY.loc[harbour_patient_labels == patient].\
        TCGA_Assignment.value_counts().reindex(TCGA_CLASS_LIST, fill_value=0)
    tmp = (counts / counts.sum()).tolist()

    boxplot_patient_list.extend([patient] * len(TCGA_CLASS_LIST))
    boxplot_fract_list.extend(tmp)
    boxplot_class_list.extend(TCGA_CLASS_LIST)

boxplot_data = pd.DataFrame()
boxplot_data['Patient'] = boxplot_patient_list
boxplot_data['Fraction'] = boxplot_fract_list
boxplot_data['Class'] = boxplot_class_list

# BAR PLOT (the earlier "violin plot" comments were stale)
fig = plt.figure(figsize=(12, 5))
ax = plt.gca()

# One value per patient/class, so each bar is that single fraction.
g = sns.barplot(data=boxplot_data, x='Patient',
                y='Fraction', hue='Class', ci=95, capsize=.2,
                palette={"TCGA1": "#0000FF",
                         "TCGA2": "#A4D3FC",
                         "TCGA3": "#FF7DC2",
                         "TCGA4": "#FF0000"},
                linewidth=1, ax=ax)
ax.set_yscale('log')

# Only the upper limit is set: a lower limit of 0 is invalid on a log axis and
# matplotlib ignored it, so the effective limits are unchanged.
plt.ylim(top=1)
g.set_ylabel("", fontsize=10)
g.set_xlabel(" ", fontsize=10, rotation=90)
g.tick_params(labelsize=18)
g.set_xticklabels(ax.get_xticklabels(), rotation=90)
sns.despine()
# NOTE(review): set_style is called after drawing, so it presumably only
# affects figures created later - confirm this is intended.
sns.set_style('ticks')
plt.tick_params(size=0)

ax.legend_.remove()
plt.tight_layout()
g.axis(True)

# SAVE FIGURE
figure_label = 'TCGA_Distribution_COMBINED_BGMM_full_trained'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label
os.makedirs(os.path.dirname(fn), exist_ok=True)
plt.savefig(fn + '.png', dpi=400, transparent=True)
plt.savefig(fn + '.pdf', dpi=400)
print(fn)

In [None]:
# Per-patient 2-D coordinates derived from the TCGA columns of X_norm:
#   x = TCGA2 - TCGA4,  y = TCGA1 - TCGA3   (one point per row / cell).
# Patients follow the order of X_norm.groupby(level='Patient'), which the
# diamond-plot cells also use for their panel titles.
#
# The row mask is computed once per patient (previously four times) and rows
# are selected via the named 'Patient' level instead of index position 2.
_patient_ids = X_norm.groupby(level='Patient').count().index
_patient_labels = X_norm.index.get_level_values('Patient')

TCGA_probs_x_HARBOUR = []
TCGA_probs_y_HARBOUR = []
for _patient_id in _patient_ids:
    _patient_rows = X_norm.loc[_patient_labels == _patient_id]
    TCGA_probs_x_HARBOUR.append(_patient_rows.TCGA2.values - _patient_rows.TCGA4.values)
    TCGA_probs_y_HARBOUR.append(_patient_rows.TCGA1.values - _patient_rows.TCGA3.values)

# One list of (x, y) tuples per patient.
TCGA_probs_per_patient_HARBOUR = [list(zip(_xs, _ys))
                                  for _xs, _ys in zip(TCGA_probs_x_HARBOUR,
                                                      TCGA_probs_y_HARBOUR)]

In [None]:
# DIAMOND PLOTS OF PER-CELL TCGA COORDINATES, PATIENTS 1-6 (2 x 3 GRID)
# One outline patch per patient; a matplotlib patch can be attached to only
# one Axes. A sqrt(2)-sided square anchored at (0, -1) and rotated 45 degrees
# has its corners at (0, -1), (1, 0), (0, 1) and (-1, 0).
diamond_list = [plt.Rectangle((0, -1), np.sqrt(2), np.sqrt(2), angle=45,
                              color='black', fill=False, linewidth=0.5)
                for _ in TCGA_probs_per_patient_HARBOUR]
# Panel titles; same patient order as TCGA_probs_per_patient_HARBOUR.
patient_titles = X_norm.groupby(level='Patient').count().index

fig1, axes = plt.subplots(2, 3)

for i, ax in enumerate(axes.flatten()):
    # Every panel is drawn without frame or axis decorations.
    ax.set_frame_on(False)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    if i >= len(TCGA_probs_per_patient_HARBOUR):
        continue  # unused grid slot stays blank
    ax.scatter(*zip(*TCGA_probs_per_patient_HARBOUR[i]), s=0.2, color='black',
               alpha=0.3)
    ax.add_patch(diamond_list[i])
    ax.set_title(patient_titles[i], fontdict={'fontsize': 6})

# SAVE FIGURE
# (The former no-op .format(meta, subset_type, method, metric) was dropped:
# the label has no placeholders and the call could raise NameError.)
figure_label = 'TCGA_PROBS_DiamondPlot_HARBOUR_PRIMARY_1'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig1.savefig(fn + '.png', bbox_inches='tight', dpi=400)
fig1.savefig(fn + '.pdf', bbox_inches='tight', dpi=400)
print(fn)

In [None]:
# DIAMOND PLOTS OF PER-CELL TCGA COORDINATES, PATIENTS 7-12 (2 x 3 GRID)
# One outline patch per patient; a matplotlib patch can be attached to only
# one Axes. Sizing the list by patient count (instead of a fixed 11) avoids an
# IndexError when a 12th patient is present.
diamond_list = [plt.Rectangle((0, -1), np.sqrt(2), np.sqrt(2), angle=45,
                              color='black', fill=False, linewidth=0.5)
                for _ in TCGA_probs_per_patient_HARBOUR]
# Panel titles; same patient order as TCGA_probs_per_patient_HARBOUR.
patient_titles = X_norm.groupby(level='Patient').count().index
# The first six patients are drawn by the previous figure.
panel_offset = 6

fig1, axes = plt.subplots(2, 3)

for slot, ax in enumerate(axes.flatten()):
    i = slot + panel_offset
    # Every panel is drawn without frame or axis decorations.
    ax.set_frame_on(False)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    if i >= len(TCGA_probs_per_patient_HARBOUR):
        continue  # unused grid slot stays blank
    ax.scatter(*zip(*TCGA_probs_per_patient_HARBOUR[i]), s=0.2, color='black',
               alpha=0.3)
    ax.add_patch(diamond_list[i])
    ax.set_title(patient_titles[i], fontdict={'fontsize': 6})

# SAVE FIGURE
# (The former no-op .format(meta, subset_type, method, metric) was dropped:
# the label has no placeholders and the call could raise NameError.)
figure_label = 'TCGA_PROBS_DiamondPlot_HARBOUR_PRIMARY_2'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig1.savefig(fn + '.png', bbox_inches='tight', dpi=400)
fig1.savefig(fn + '.pdf', bbox_inches='tight', dpi=400)
print(fn)

### PRC1 TARGET GENES HEATMAPS - HARBOUR DATASET

#### GEP2 RANKED HEATMAP FOR PRC1 TARGET GENES

In [None]:
# RANK BY GEP 2
# Per-cell score: median expression over the 'Castle 2' gene-set members that
# are present as columns of QUERY (NaNs ignored).
subset_type = 'TUMOR'
QUERY = INDF_HARBOUR

rank_by = 'Castle 2'#'Class 2 Onken v2'
genes = np.unique(genesets[rank_by].values.ravel().tolist())
# Build the column lookup once instead of re-creating a list for every gene.
query_columns = set(QUERY.columns)
detected_genes = [gene for gene in genes if gene in query_columns]

vals = QUERY[detected_genes]
SCORE = np.nanmedian(vals, axis=1)

In [None]:
# PRC1 TARGET GENE HEATMAP, CELLS ORDERED BY GEP2 SCORE (HARBOUR)
# Genes to plot: PRC1 target list restricted to columns present in QUERY.
PRC1_target_genes = pd.read_csv(DATA_PATH+'PRC1_targets_Genesets_and_Chipseq.csv')
plot_genes = PRC1_target_genes['PRC1_targets_Genesets_and_Chipseq'].dropna().values.tolist()
query_columns = set(QUERY.columns)
plot_genes = [x for x in plot_genes if x in query_columns]

# LOAD GENESETS
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets, header='infer')

# Per-cell ranking score: mean expression of the detected 'Castle 2' genes.
rank_by = 'Castle 2'#
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in query_columns]
vals = QUERY[genes]
# NOTE: QUERY is not copied in this cell, so the score column is added in
# place to whichever frame QUERY refers to.
QUERY['GEP2_RANK'] = np.nanmean(vals, axis=1)

# CONSTRUCT HEATMAP DATA: sort cells once by score, z-score every gene column.
ranked = QUERY.sort_values(by=['GEP2_RANK'])
heatmap_data = pd.DataFrame(data=zscore(ranked[plot_genes], axis=0),
                            columns=plot_genes, index=ranked.index)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE: order gene columns by hierarchical clustering.
method = 'centroid' # average, single centroid/euclidean
metric = 'euclidean' # cosine
linkage = hc.linkage(heatmap_data.T, method=method, metric=metric)
col_linkage = deepcopy(linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[:, cl]

# Smooth along the cell axis with a centred triangular window (windowed sum).
window = 20
mat = mat.rolling(window, win_type='triang', center=True).sum()
half_window = int(window/2)
# The centred window leaves rows without a full window at both ends; pad them
# with the nearest complete row.
for ind in np.arange(half_window):
    mat.iloc[ind] = mat.iloc[half_window]
# Start at 1: iloc[-0] is row 0, which the original loop overwrote with a row
# taken from the tail of the matrix.
for ind in np.arange(1, half_window):
    mat.iloc[-ind] = mat.iloc[-half_window]

# DRAW HEATMAP WITH SAMPLE-TYPE SIDE BAR
fig = plt.figure(figsize=(4, 10))

# Heatmap panel (rows = ranked cells, columns = clustered genes), no labels.
axmatrix = fig.add_axes([0.12, 0.1, 0.4, 0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING, vmin=-10, vmax=10)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([])
axmatrix.set_xticklabels([])
axmatrix.grid(False)
for side in ('top', 'right', 'bottom', 'left'):
    axmatrix.spines[side].set_visible(False)
axmatrix.tick_params(axis='both', size=0)


# ADD ROW 1 COLOR BY SAMPLE TYPE
# mat keeps QUERY's row index, so each row's 'Legend' is read directly from
# its own index entry (replaces a full scan of QUERY per cell; the results are
# identical when Cell IDs are unique).
sample_status = mat.index.get_level_values('Legend').tolist()

lut_SAMPLE = dict({'primary': np.array([1, 1, 1]), 'metastatic': np.array([0, 0, 0])})

row_colors1 = pd.Series(sample_status, index=sample_status).map(lut_SAMPLE)
ax1 = fig.add_axes([0.07, 0.1, 0.045, 0.6]) # [x0,y0,width,height]
# One coloured strip per cell, stacked bottom-to-top in axes coordinates.
n_rows = len(row_colors1)
for row, c in enumerate(row_colors1):
    ax1.add_patch(patches.Rectangle((0, row / n_rows), 1, 1 / n_rows, color=c))
ax1.axis('off')


# SAVE FIGURE
# (The former no-op .format(meta, subset_type, method, metric) was dropped:
# the label has no placeholders.)
figure_label = 'PRC1_Targets_Heatmap_UPDATED_HARBOUR'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/GEP2_Ranked_Heatmaps/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig.savefig(fn + '.png', bbox_inches='tight', dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight', dpi=400)
print(fn)

In [None]:
# GEP2 SCORE CURVE (HARBOUR): filled curve of the sorted per-cell GEP2 score.
# Gene columns carried along with the score column.
plot_genes = ['IRF3', 'RELA', 'RELB', 'MB21D1', 'IKBKB', 'NFKB1', 'TMEM173', 'STAT2', 'CXCL10', 'NFKB2',
              'CCL5', 'TBK1', 'JAK1', 'JAK2', 'IRF9', 'TYK2', 'IRF7', 'IRF1', 'STAT1'
             ]

# LOAD GENESETS
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets, header='infer')

# Resolve the expression table by its global name (replaces exec()).
datatype = 'INDF_HARBOUR'
QUERY = globals()[datatype]

# Per-cell ranking score: mean expression of the detected 'Castle 2' genes.
rank_by = 'Castle 2'#
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
query_columns = set(QUERY.columns)
genes = [gene for gene in genes if gene in query_columns]
vals = QUERY[genes]
# NOTE: QUERY refers to the global frame itself, so the score column is added
# in place.
QUERY['GEP2_RANK'] = np.nanmean(vals, axis=1)
# Cells sorted by ascending score, restricted to plot genes plus the score.
heatmap_data = QUERY.sort_values(by=['GEP2_RANK'])[plot_genes + ['GEP2_RANK']]

y = heatmap_data['GEP2_RANK'].values
x = np.arange(len(y))

fig = plt.figure(figsize=(10, 4))
ax = plt.gca()

ax.fill_between(x, 0, y, facecolor='#E0E0E0', interpolate=True)
plt.xlim((0, x.max()))
plt.ylim((0, y.max()))
sns.despine()


# SAVE FIGURE
# (The former no-op .format(meta, subset_type, method, metric) was dropped:
# the label has no placeholders and those names are not set in this cell.)
figure_label = 'PRC1_Targets_UPDATED_GEP2_RANK__HARBOUR_curve'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/GEP2_Ranked_Heatmaps/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig.savefig(fn + '.png', bbox_inches='tight', dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight', dpi=400)
print(fn)

In [None]:
# Display the number of rows per 'cellType' index value in DF_HARBOUR.
DF_HARBOUR.index.get_level_values('cellType').value_counts()

In [None]:
# Display the number of rows per 'cellType' index value in DF_HARBOUR_PRIMARY.
DF_HARBOUR_PRIMARY.index.get_level_values('cellType').value_counts()

In [None]:
# Display the number of rows per 'cellType' index value in DF_HARBOUR_METASTASIS.
DF_HARBOUR_METASTASIS.index.get_level_values('cellType').value_counts()

#### STING PATHWAY GENES HEATMAPS - HARBOUR DATASET

In [None]:
# STING PATHWAY GENE HEATMAP, CELLS ORDERED BY GEP2 SCORE (HARBOUR)
plot_genes = ['IRF3', 'RELA', 'RELB', 'MB21D1', 'IKBKB', 'NFKB1', 'TMEM173', 'STAT2', 'CXCL10', 'NFKB2',
              'CCL5', 'TBK1', 'JAK1', 'JAK2', 'IRF9', 'TYK2', 'IRF7', 'IRF1', 'STAT1'
             ]

print(len(plot_genes))

# LOAD GENESETS
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets, header='infer')

subset_type = 'TUMOR'
QUERY = INDF_HARBOUR

# Per-cell ranking score: mean expression of the detected 'Castle 2' genes.
rank_by = 'Castle 2'#
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
query_columns = set(QUERY.columns)
genes = [gene for gene in genes if gene in query_columns]
vals = QUERY[genes]
# NOTE: QUERY aliases INDF_HARBOUR, so the score column is added in place.
QUERY['GEP2_RANK'] = np.nanmean(vals, axis=1)

# CONSTRUCT HEATMAP DATA: sort cells once by score, z-score every gene column.
ranked = QUERY.sort_values(by=['GEP2_RANK'])
heatmap_data = pd.DataFrame(data=zscore(ranked[plot_genes], axis=0),
                            columns=plot_genes, index=ranked.index)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# Gene columns stay in the hand-written plot_genes order (no clustering).
mat = heatmap_data.copy()

# Smooth along the cell axis with a centred triangular window (windowed sum).
window = 20
mat = mat.rolling(window, win_type='triang', center=True).sum()
half_window = int(window/2)
# The centred window leaves rows without a full window at both ends; pad them
# with the nearest complete row.
for ind in np.arange(half_window):
    mat.iloc[ind] = mat.iloc[half_window]
# Start at 1: iloc[-0] is row 0, which the original loop overwrote with a row
# taken from the tail of the matrix.
for ind in np.arange(1, half_window):
    mat.iloc[-ind] = mat.iloc[-half_window]

# DRAW HEATMAP WITH SAMPLE-TYPE SIDE BAR
fig = plt.figure(figsize=(5, 10))

# Heatmap panel (rows = ranked cells, columns = genes labelled on the x axis).
axmatrix = fig.add_axes([0.12, 0.1, 0.4, 0.6])
# Not read in this cell; kept because `colors` is a notebook-level name.
colors = [(1, 1, 1), np.divide(tuple(hex('FF0000').rgb), 255)]
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING, vmin=-10, vmax=10)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([])
# plt.xticks / plt.yticks act on axmatrix, the current axes at this point.
xtick = plt.xticks(range(len(labels)), labels, rotation=90, fontsize=6, fontname='Arial')
ytick = plt.yticks([])
axmatrix.grid(False)
for side in ('top', 'right', 'bottom', 'left'):
    axmatrix.spines[side].set_visible(False)


# ADD ROW 1 COLOR BY SAMPLE TYPE
# mat keeps QUERY's row index, so each row's 'Legend' is read directly from
# its own index entry (replaces a full scan of QUERY per cell; the results are
# identical when Cell IDs are unique).
sample_status = mat.index.get_level_values('Legend').tolist()

lut_SAMPLE = dict({'primary': np.array([1, 1, 1]), 'metastatic': np.array([0, 0, 0])})

row_colors1 = pd.Series(sample_status, index=sample_status).map(lut_SAMPLE)
ax1 = fig.add_axes([0.08, 0.1, 0.035, 0.6]) # [x0,y0,width,height]
# One coloured strip per cell, stacked bottom-to-top in axes coordinates.
n_rows = len(row_colors1)
for row, c in enumerate(row_colors1):
    ax1.add_patch(patches.Rectangle((0, row / n_rows), 1, 1 / n_rows, color=c))
ax1.axis('off')

# SAVE FIGURE
# (The former no-op .format(meta, subset_type, method, metric) was dropped:
# the label has no placeholders, and `method` / `metric` are not set in this
# cell, so the call could raise NameError.)
figure_label = 'STING_PATHWAY_Heatmap_HARBOUR'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/GEP2_Ranked_Heatmaps/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig.savefig(fn + '.png', bbox_inches='tight', dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight', dpi=400)
print(fn)

### PRC1 TARGET GENES HEATMAPS - HARBOUR PRIMARY SAMPLES

#### GEP2 RANKED HEATMAP FOR PRC1 TARGET GENES

In [None]:
# RANK BY GEP 2
# Per-cell score: median expression over the 'Castle 2' gene-set members that
# are present as columns of QUERY (NaNs ignored).
subset_type = 'TUMOR'
QUERY = INDF_HARBOUR_PRIMARY

rank_by = 'Castle 2'#'Class 2 Onken v2'
genes = np.unique(genesets[rank_by].values.ravel().tolist())
# Build the column lookup once instead of re-creating a list for every gene.
query_columns = set(QUERY.columns)
detected_genes = [gene for gene in genes if gene in query_columns]

vals = QUERY[detected_genes]
SCORE = np.nanmedian(vals, axis=1)

In [None]:
# PRC1 TARGET GENE HEATMAP, CELLS ORDERED BY GEP2 SCORE (HARBOUR PRIMARY)
# Genes to plot: PRC1 target list restricted to columns present in QUERY.
PRC1_target_genes = pd.read_csv(DATA_PATH+'PRC1_targets_Genesets_and_Chipseq.csv')
plot_genes = PRC1_target_genes['PRC1_targets_Genesets_and_Chipseq'].dropna().values.tolist()
query_columns = set(QUERY.columns)
plot_genes = [x for x in plot_genes if x in query_columns]

# LOAD GENESETS
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets, header='infer')

# Per-cell ranking score: mean expression of the detected 'Castle 2' genes.
rank_by = 'Castle 2'#
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in query_columns]
vals = QUERY[genes]
# NOTE: QUERY is not copied in this cell, so the score column is added in
# place to whichever frame QUERY refers to.
QUERY['GEP2_RANK'] = np.nanmean(vals, axis=1)

# CONSTRUCT HEATMAP DATA: sort cells once by score, z-score every gene column.
ranked = QUERY.sort_values(by=['GEP2_RANK'])
heatmap_data = pd.DataFrame(data=zscore(ranked[plot_genes], axis=0),
                            columns=plot_genes, index=ranked.index)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE: order gene columns by hierarchical clustering.
method = 'centroid' # average, single centroid/euclidean
metric = 'euclidean' # cosine
linkage = hc.linkage(heatmap_data.T, method=method, metric=metric)
col_linkage = deepcopy(linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[:, cl]

# Smooth along the cell axis with a centred triangular window (windowed sum).
window = 20
mat = mat.rolling(window, win_type='triang', center=True).sum()
half_window = int(window/2)
# The centred window leaves rows without a full window at both ends; pad them
# with the nearest complete row.
for ind in np.arange(half_window):
    mat.iloc[ind] = mat.iloc[half_window]
# Start at 1: iloc[-0] is row 0, which the original loop overwrote with a row
# taken from the tail of the matrix.
for ind in np.arange(1, half_window):
    mat.iloc[-ind] = mat.iloc[-half_window]

# DRAW HEATMAP
fig = plt.figure(figsize=(4, 10))

# Heatmap panel (rows = ranked cells, columns = clustered genes), no labels.
axmatrix = fig.add_axes([0.12, 0.1, 0.4, 0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING, vmin=-10, vmax=10)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([])
axmatrix.set_xticklabels([])
axmatrix.grid(False)
axmatrix.tick_params(axis='both', size=0)

# SAVE FIGURE
# (The former no-op .format(meta, subset_type, method, metric) was dropped:
# the label has no placeholders.)
figure_label = 'PRC1_Targets_Heatmap_UPDATED_HARBOUR_PRIMARY'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/GEP2_Ranked_Heatmaps/' + figure_label

d = os.path.dirname(fn)
os.makedirs(d, exist_ok=True)

fig.savefig(fn + '.png', bbox_inches='tight', dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight', dpi=400)
print(fn)

###### GEP2 RANKED HEATMAP NEW GENESET

In [None]:
# RANK BY GEP 2
# NOTE: `genesets` is read a few lines below before this cell reloads it, so a
# copy must already be in memory from an earlier cell.
subset_type = 'HARBOUR_PRIMARY'
QUERY = globals()['INDF_{}'.format(subset_type)]  # name lookup instead of exec() on a built string
rank_by = 'Castle 2'#'Class 2 Onken v2'
genes = np.unique(genesets[rank_by].values.ravel().tolist())
detected_genes = [gene for gene in genes if gene in list(QUERY.columns)]

vals = QUERY[detected_genes]
SCORE = np.nanmedian(vals,axis=1)  # per-cell median of the detected genes; not used again in this cell

# GENES TO DISPLAY: one gene per line in the text file, kept only if present in QUERY
target_genes = pd.read_csv(DATA_PATH+'829_PRE_92_1_geneset.txt', 
                                sep='\t', header=None, names=['genes'])
plot_genes = target_genes['genes'].dropna().values.tolist()
plot_genes = [x for x in plot_genes if x in QUERY.columns.values.tolist()]

# LOAD GENESETS
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets,header='infer')

# SCORE EACH CELL BY ITS MEAN EXPRESSION OF THE RANKING GENESET
rank_by = 'Castle 2'#
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in list(set(QUERY.columns))]
vals = QUERY[genes]
# NOTE: QUERY is the INDF_* frame itself (not a copy), so this adds a column to it.
QUERY['GEP2_RANK'] =np.nanmean(vals,axis=1)

# CONSTRUCT HEATMAP DATA
# Sort once and reuse the result for both the values and the row index.
ranked_query = QUERY.sort_values(by=['GEP2_RANK'])
heatmap_data = pd.DataFrame( data = zscore(ranked_query[plot_genes],axis=0), 
                   columns = plot_genes, index = ranked_query.index)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE 
# Cluster the genes (columns) only; rows keep their GEP2_RANK order.
method = 'average' # average, single centroid/euclidean
metric = 'euclidean' # cosine
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[:,cl]

# SMOOTH ALONG THE RANKED CELL AXIS
window = 20
mat = mat.rolling(window, win_type='triang',center = True).sum()
half_window = int(window/2)
# The centered window leaves NaN rows at both ends; pad them with the nearest
# fully covered row.
for ind in np.arange(half_window):
    mat.iloc[ind] = mat.iloc[half_window]
# Start at 1: -0 == 0, so ind = 0 would overwrite the FIRST row with a row
# taken from the tail of the ranking.
for ind in np.arange(1, half_window):
    mat.iloc[-ind] = mat.iloc[-half_window]

# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(4,10))

# ADD MATRIX WITH LINEAGE NAMES
axmatrix = fig.add_axes([0.12,0.1,0.4,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-10,vmax=10)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
axmatrix.set_xticklabels([]);
axmatrix.grid(False)
plt.tick_params(axis='both', size=0)

# SAVE FIGURE
figure_label = 'average_euclidean_heatmap_NEW_GENESET_HARBOUR_PRIMARY'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/GEP2_Ranked_Heatmaps/' + figure_label 
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

###### GEP2 RANKED HEATMAP 5/5 GENESETS

In [None]:
# RANK BY GEP 2
# Prepares QUERY, sorted by mean 'Castle 2' expression, for the cells that follow.
# NOTE(review): subset_type says 'TUMOR' but the data used is INDF_HARBOUR_PRIMARY - confirm which label is intended.
subset_type = 'TUMOR'
QUERY = INDF_HARBOUR_PRIMARY

rank_by = 'Castle 2'#'Class 2 Onken v2'
# `genesets` is not loaded in this cell; it must already exist from an earlier cell.
genes = np.unique(genesets[rank_by].values.ravel().tolist())
detected_genes = [gene for gene in genes if gene in list(QUERY.columns)]

vals = QUERY[detected_genes]
# QUERY still refers to INDF_HARBOUR_PRIMARY here, so the new column is added to that frame.
QUERY['GEP2_RANK'] =np.nanmean(vals,axis=1)
# sort_values returns a new frame; from here on QUERY is the sorted result.
QUERY = QUERY.sort_values(by=['GEP2_RANK'])

In [None]:
# For every geneset column, report the percentage of its entries that are
# present as columns of QUERY (relative to the non-missing entries).
for col in target_genes:
    n_detected = sum(1 for x in target_genes[col] if x in QUERY.columns)
    n_listed = len(target_genes[col].dropna())
    pct_detected = round(n_detected/n_listed*100,3)
    print(col,':')
    print(pct_detected, '%')
    print('')

In [None]:
# Print, per geneset column, how many genes it lists and how many of those
# are present as columns of QUERY.
print('Harbour Primary Data')
for col in target_genes.columns:
    listed_genes = target_genes[col].dropna().values.tolist()
    measured_genes = QUERY.columns.values.tolist()
    plot_genes = list(filter(lambda x: x in measured_genes, listed_genes))
    print(col,':')
    print('Number of genes in set:',len(listed_genes))
    print('Number of genes in set in data:',len(plot_genes))
    print('')

In [None]:
# LOAD GENESETS
target_genes = pd.read_csv(DATA_PATH+'Genesets_5_5.csv', 
                                sep=',', header=0)

# QUERY's columns are the same for every geneset, so build the lookup once.
measured_genes = set(QUERY.columns.values.tolist())

# Iterate through each geneset:
# one heatmap is written per (geneset, linkage method/metric) combination.
for col in target_genes.columns:
    print(col)
    plot_genes = target_genes[col].dropna().values.tolist()
    plot_genes = [x for x in plot_genes if x in measured_genes]
    
    if len(plot_genes) > 0:
    
        # CONSTRUCT HEATMAP DATA
        # Rows keep QUERY's current order; no sorting happens in this cell.
        heatmap_data = pd.DataFrame( data = zscore(QUERY[plot_genes],axis=0), 
                           columns = plot_genes, index = QUERY.index)
        heatmap_data = heatmap_data.dropna(axis=1, how='any') 
        yticks = heatmap_data.index
        xticks = heatmap_data.columns
        
        # hc.linkage needs at least two gene columns; skip the geneset instead of
        # aborting the whole loop with a ValueError.
        if heatmap_data.shape[1] < 2:
            print('Skipping {}: fewer than 2 usable genes after dropping NaN columns'.format(col))
            continue
        
        # LINKAGE 
        for m in [('average', 'euclidean'), ('average', 'cosine'), ('centroid', 'euclidean')]:
            method = m[0] # average, single centroid/euclidean
            metric = m[1] # cosine
            linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
            col_linkage = deepcopy(linkage)
            cl = hc.leaves_list(col_linkage)
            mat = heatmap_data.iloc[:,cl]
            
            # SMOOTH ALONG THE CELL AXIS
            window = 20
            mat = mat.rolling(window, win_type='triang',center = True).sum()
            half_window = int(window/2)
            # The centered window leaves NaN rows at both ends; pad them with the
            # nearest fully covered row.
            for ind in np.arange(half_window):
                mat.iloc[ind] = mat.iloc[half_window]
            # Start at 1: -0 == 0, so ind = 0 would overwrite the FIRST row with a
            # row taken from the tail.
            for ind in np.arange(1, half_window):
                mat.iloc[-ind] = mat.iloc[-half_window]
            
            # VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
            fig = plt.figure(figsize=(4,10))
            
            # ADD MATRIX WITH LINEAGE NAMES
            axmatrix = fig.add_axes([0.12,0.1,0.4,0.6])
            im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-10,vmax=10)
            labels = list(mat.columns)
            axmatrix.xaxis.set_ticks_position('bottom')
            axmatrix.set_yticklabels([]);
            axmatrix.set_xticklabels([]);
            #xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 6,fontname='Arial')
            axmatrix.grid(False)
            plt.tick_params(axis='both', size=0)
            
            # SAVE FIGURE
            figure_label = col+'_'+method+'_'+metric+'_HARBOUR_PRIMARY'
            fn = '/workdir/uvmel_project/figures/Revision_2_Figures/Genesets_5_5_Heatmaps/' + figure_label 
                
            d = os.path.dirname(fn)
            if not os.path.exists(d):
                os.makedirs(d)
                
            fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
            #fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
            print(fn)

###### GEP2 RANKED CURVE

In [None]:
# Count how many rows of INDF_HARBOUR_PRIMARY carry each value of the 'cellType' index level.
INDF_HARBOUR_PRIMARY.index.get_level_values('cellType').value_counts()

In [None]:
# GENES CARRIED ALONGSIDE THE RANK (only the GEP2_RANK column is plotted below)
plot_genes = ['IRF3', 'RELA', 'RELB', 'MB21D1', 'IKBKB', 'NFKB1', 'TMEM173', 'STAT2', 'CXCL10', 'NFKB2',
              'CCL5', 'TBK1', 'JAK1', 'JAK2', 'IRF9', 'TYK2', 'IRF7', 'IRF1', 'STAT1'
             ]

# LOAD GENESETS
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets,header='infer')

datatype = 'INDF_HARBOUR_PRIMARY'
QUERY = globals()[datatype]  # name lookup instead of exec() on a built string

# SCORE EACH CELL BY ITS MEAN EXPRESSION OF THE RANKING GENESET
rank_by = 'Castle 2'#
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in list(set(QUERY.columns))]
vals = QUERY[genes]
# NOTE: QUERY is the INDF_* frame itself (not a copy), so this adds a column to it.
QUERY['GEP2_RANK'] =np.nanmean(vals,axis=1)
# Sort once and reuse the result for both the values and the row index.
ranked_query = QUERY.sort_values(by=['GEP2_RANK'])
heatmap_data = pd.DataFrame( data = ranked_query[plot_genes +['GEP2_RANK']], 
                   columns = plot_genes +['GEP2_RANK'], index = ranked_query.index)

# RANK CURVE: score (y) against position in the ascending ranking (x)
y = heatmap_data['GEP2_RANK'].values
x = np.arange(len(y))

fig = plt.figure(figsize=(10,4))
ax = plt.gca()

ax.fill_between(x, 0, y,facecolor='#E0E0E0', interpolate=True)
plt.xlim((0,x.max()))
plt.ylim((0,y.max()))
sns.despine()


# SAVE FIGURE
# The label has no replacement fields, so it is used as-is; formatting it with
# meta/subset_type/method/metric only made the cell fail when those were unset.
figure_label = 'PRC1_Targets_UPDATED_GEP2_RANK__HARBOUR_PRIMARY_curve'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/GEP2_Ranked_Heatmaps/' + figure_label 
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

#### STING PATHWAY GENES HEATMAPS

In [None]:
# GENES TO DISPLAY
# NOTE(review): unlike the geneset heatmaps, this list is not filtered against
# QUERY.columns, so a gene missing from the data raises a KeyError below.
plot_genes = ['MB21D1','TMEM173','IRF3','RELA','RELB','IKBKB','NFKB1',
              'STAT2','CXCL10','NFKB2','CCL5',
              'TBK1','JAK1','JAK2','IRF9',
              'TYK2','IRF7','IRF1','STAT1',
              'PPARG','DDIT3','NUPR1','RAB3B','IGFBP4','LRRC8C',
              'TCP11L2','MAFK','NRG1','F2R','KRT19','CTGF','ZFC3H1']

print(len(plot_genes))

# LOAD GENESETS
path_to_genesets = DATA_PATH+'uveal_melanoma_MB_v3.csv'
genesets = pd.read_csv(path_to_genesets,header='infer')

subset_type = 'TUMOR'
QUERY = INDF_HARBOUR_PRIMARY

# SCORE EACH CELL BY ITS MEAN EXPRESSION OF THE RANKING GENESET
rank_by = 'Castle 2'#
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in list(set(QUERY.columns))]
vals = QUERY[genes]
# NOTE: QUERY is INDF_HARBOUR_PRIMARY itself (not a copy), so this adds a column to it.
QUERY['GEP2_RANK'] =np.nanmean(vals,axis=1)

# CONSTRUCT HEATMAP DATA
# Sort once and reuse the result for both the values and the row index.
ranked_query = QUERY.sort_values(by=['GEP2_RANK'])
heatmap_data = pd.DataFrame( data = zscore(ranked_query[plot_genes],axis=0), 
                   columns = plot_genes, index = ranked_query.index)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE 
# Cluster the genes (columns) only; rows keep their GEP2_RANK order.
method = 'centroid' # average, single centroid/euclidean
metric = 'euclidean' # cosine
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[:,cl]
#mat = heatmap_data.copy()

# SMOOTH ALONG THE RANKED CELL AXIS
window = 20
mat = mat.rolling(window, win_type='triang',center = True).sum()
half_window = int(window/2)
# The centered window leaves NaN rows at both ends; pad them with the nearest
# fully covered row.
for ind in np.arange(half_window):
    mat.iloc[ind] = mat.iloc[half_window]
# Start at 1: -0 == 0, so ind = 0 would overwrite the FIRST row with a row
# taken from the tail of the ranking.
for ind in np.arange(1, half_window):
    mat.iloc[-ind] = mat.iloc[-half_window]

# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(5,10))

# ADD MATRIX WITH LINEAGE NAMES
axmatrix = fig.add_axes([0.12,0.1,0.5,0.7])
colors = [(1,1,1), np.divide(tuple(hex('FF0000').rgb),255)]  # not used by the matshow call below
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-10,vmax=10)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
#axmatrix.set_xticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 5,fontname='Arial')
ytick = plt.yticks([])
axmatrix.grid(False)
plt.tick_params(size=0)

# SAVE FIGURE
figure_label = '/STING_PATHWAY_Heatmap_HARBOUR_PRIMARY_EXTENDED_CLUSTERED_centroid_euclidean'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/GEP2_Ranked_Heatmaps/' + figure_label 
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

### GEP PROGNOSTICATION OF INDIVIDUAL TUMOR CELLS - HARBOUR

#### GEP SIGNATURE CLUSTERED HEATMAP

In [None]:
# Build a z-scored cells x genes matrix of the GEP marker genes for INDF_HARBOUR
# and compute hierarchical-clustering linkages for its rows and columns.
class_genes = GEP_markerfile_df.index.values.tolist()

QUERY = INDF_HARBOUR

# COLUMN INDEX AND COLORS
# Keep only the marker genes that exist as columns of QUERY.
genes = [gene for gene in class_genes if gene in QUERY.columns]#gene_array['Gene'].values
# CONSTRUCT HEATMAP DATA
# zscore(..., axis=0) standardizes each gene column across the rows of QUERY.
heatmap_data = pd.DataFrame(data = zscore(QUERY[genes].values,axis=0),columns = genes, index = QUERY.index)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE 
method = 'average' # average, single
metric = 'euclidean' # cosine
# Row linkage clusters the rows of heatmap_data; the transpose gives the column (gene) linkage.
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

In [None]:
# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
mat = heatmap_data.iloc[r1,c1]

In [None]:
# GEP ASSIGNMENT OF EVERY HEATMAP ROW, IN HEATMAP ROW ORDER
# The 'Cell ID' index level is the same for every lookup, so extract it once
# instead of once per heatmap row.
harbour_cell_ids = METADATA_HARBOUR.index.get_level_values('Cell ID')
# values[0] takes the first metadata row matching each Cell ID.
GEP_assignments_sorted = [METADATA_HARBOUR.loc[harbour_cell_ids == x].Assignment.values[0]
                          for x in mat.index.get_level_values('Cell ID').values]

# Map each assignment through lut_CASTLE to get one colour per heatmap row.
row_colors = pd.Series(GEP_assignments_sorted, index=GEP_assignments_sorted).map(lut_CASTLE)

In [None]:
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(4,10))
plt.rcParams["axes.grid"] = False

# ADD ROW COLOR INDEX 1 (GEP CLASS)
# One full-width rectangle per heatmap row, stacked upward in equal slices of
# the 0-1 range.
ax1 = fig.add_axes([0,0.1,0.05,0.6]) # [x0,y0,width,height]
n_rows = len(row_colors)
for row, c in enumerate(row_colors):
    ax1.add_patch(patches.Rectangle((0, row / n_rows), 1, 1 / n_rows, color=c))
plt.axis('off')

# ADD ROW 2 COLOR BY SAMPLE TYPE
# Look up the 'Legend' index level of each heatmap row's cell (first match per Cell ID).
query_cell_ids = QUERY.index.get_level_values('Cell ID')
sample_status = [QUERY.loc[query_cell_ids == x].index.get_level_values('Legend').\
                          values[0] for x in mat.index.get_level_values('Cell ID').values]

lut_SAMPLE = dict({'primary': np.array([1, 1, 1]), 'metastatic': np.array([0, 0, 0])})

row_colors2 = pd.Series(sample_status, index=sample_status).map(lut_SAMPLE)
ax2 = fig.add_axes([0.95,0.1,0.05,0.6]) # [x0,y0,width,height]
n_rows2 = len(row_colors2)
for row, c in enumerate(row_colors2):
    ax2.add_patch(patches.Rectangle((0, row / n_rows2), 1, 1 / n_rows2, color=c))
plt.axis('off')

# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.05,0.1,0.9,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-1,vmax=1)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 14)

# ADD COLORBAR
axcolor = fig.add_axes([1.1,0.1,0.1,0.1])
cbar = plt.colorbar(im, cax=axcolor)
#cbar.ax.get_yaxis().set_ticks([])

# SAVE FIGURE
# The label has no replacement fields, so it is used as-is; formatting it with
# meta/subset_type/method/metric only made the cell fail when those were unset.
figure_label = 'GEP_HARBOUR_BayesianGMM'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

#### PAIRWISE CORRELATION BETWEEN MONOSOMY 3 SIGNATURE AND GEP 2

In [None]:
# PER-CELL SIGNATURE SUMS AND POINT COLOURS FOR THE SCATTER IN THE NEXT CELL
# Produces violin_data (one row per cell: assignment, 'Castle 2' sum,
# 'Monosomy 3 Up' sum) and dot_colors (one colour per cell by assignment).
subset_type = 'HARBOUR'
datatype = 'INDF_{}'.format(subset_type)
# Bind QUERY / META to the INDF_<subset_type> / METADATA_<subset_type> globals by name.
exec('QUERY = {}'.format(datatype))
exec('META = METADATA_{}'.format(subset_type))

meta = 'Assignment'

# Signature 1: sum, per cell, of the geneset's genes that are present in QUERY.
gene1 = 'Castle 2'
signature_genes = genesets[gene1].values
signature_genes = [x for x in signature_genes if str(x) != 'nan']
detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
G1 = (np.nansum(QUERY[detected_genes],axis=1))#*complexity

# Signature 2: same computation for the second geneset.
gene2 = 'Monosomy 3 Up'
signature_genes = genesets[gene2].values
signature_genes = [x for x in signature_genes if str(x) != 'nan']
detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
G2 = (np.nansum(QUERY[detected_genes],axis=1))#*complexity

# ROW INDEX
meta = 'Assignment'
groupby_type = meta # 'Class', 'Meta-Source', 'Legend'
FLATUI = ['0000FF','FF0000']#FLATUI_CLASS, 'FLATUI_SOURCE, FLATUI_SAMPLES

# NOTE(review): assumes META rows are in the same order as QUERY rows - confirm.
violin_data = pd.DataFrame({
                            meta:list(META[meta]),
                            gene1: G1,
                            gene2: G2,
                           })

ind = violin_data[groupby_type]

# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI),3))
for ii,hexcolor in enumerate(FLATUI):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
# Palette for Class METADATA
# Pick one palette row per unique group; the last index is clamped into range.
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
dot_colors = pd.Series(ind).map(lut)
metacell_colors = [rgb2hex(int(color[0]*255), int(color[1]*255), int(color[2]*255)) for color in dot_colors]

In [None]:
# JOINT DISTRIBUTION OF THE TWO SIGNATURE SCORES (gene1 vs gene2)
x = gene1
y = gene2

plt.figure(figsize = (20,20))
sns.despine()
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})

# JOINTPLOT
g = sns.jointplot(x,y, data=violin_data.fillna(0), kind="reg", stat_func=None,ratio=2, color='k', size=10)
g.ax_joint.cla() # or g.ax_joint.collections[0].set_visible(False), as per mwaskom's comment

# REPLOT SCATTER WITH LINEAGE COLORED
plt.sca(g.ax_joint)
plt.scatter(violin_data[x], violin_data[y], c=dot_colors)

# ADD REGRESSION
# Fit a first-degree polynomial (two coefficients, initial guess [1, 1]).
xx = violin_data[x]
yy = violin_data[y]
def f(x, *p):
    return polyval(p, x)
p, cov = curve_fit(f, xx, yy, [1, 1])

# simulated draws from the probability density function of the regression
xi = linspace(np.min(xx), np.max(xx), 100)
ps = np.random.multivariate_normal(p, cov, 10000)
ysample = np.asarray([f(xi, *pi) for pi in ps])
# 10th-90th percentile band of the simulated fits at each xi
lower = percentile(ysample, 10, axis=0)
upper = percentile(ysample, 90, axis=0)

# regression estimate line
y_fit = poly1d(p)(xi)

# PLOT REGRESSION LINE
plt.fill_between(xi, lower, upper, facecolor='k', alpha=0.5)
plt.plot(xi, y_fit, 'k-')
#plt.xlim((0,5))
#plt.ylim((0,5))

# ADD AXIS LABELS
g.set_axis_labels(x, y, fontsize=10,fontname = 'Arial')

# COMPUTE STATISTICS
# NOTE: this rebinds `p` from the fitted coefficients to the Pearson p-value.
(r, p) = stats.pearsonr(violin_data[x].fillna(0), violin_data[y].fillna(0))
print('{} vs {}, R: {}, p: {}'.format(x,y,r,p))

# SAVE FIGURE
# The label has no replacement fields, so it is used as-is; formatting it with
# meta/subset_type/method/metric only made the cell fail when those were unset.
figure_label = 'M3_GEP2_Correlation_BGMM_HARBOUR'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

#### EVALUATE GENE EXPRESSION PER GEP CLASS

In [None]:
# BAP1 EXPRESSION PER GEP ASSIGNMENT (box or violin) WITH A MANN-WHITNEY U TEST
plot_type = 'box'
title = 'BAP1'
meta = 'Assignment'
# NOTE: subset_type is not set in this cell; it carries over from whichever
# cell assigned it last.
datatype = 'INDF_{}'.format(subset_type)
QUERY = globals()[datatype]  # name lookup instead of exec() on a built string
META = globals()['METADATA_{}'.format(subset_type)]

fig = plt.figure(figsize = (1,5))
ax = plt.gca()

scale_type = 'count'
palette = dict(zip(['Castle 1','Castle 2'],['#0000FF','#FF0000'])) 


genes = [title]
# str(nan) is 'nan' (lower case); comparing against 'NAN' never filtered anything.
genes = np.unique([x for x in genes if str(x) != 'nan'])
detected_genes = list(set(genes).intersection(set(QUERY.columns)))
vals = QUERY[detected_genes]
SCORE = np.nansum(vals,axis=1)

# Format data structure for violin plot
# One record per cell; the row position pairs SCORE with META (assumes both
# share the same row order - TODO confirm).
assignments = META[meta].values
violin_data = []
for ind,v in enumerate(SCORE):
    violin_data.append({'gene': title, 'Z-normalized Expression': v,
                        meta:assignments[ind]}) 
violin_data = pd.DataFrame(violin_data)  

# BOXPLOT GENE EXPRESSION
if plot_type == 'box':
    g = sns.boxplot(x="gene", y="Z-normalized Expression", 
                    hue=meta,data=violin_data, palette=palette,notch = True, 
                    hue_order = ['Castle 1','Castle 2'], 
                    fliersize = 4, showmeans=False,linewidth = 1, ax = ax) #order = labels, 
    g.set_ylabel("{}".format(datatype),fontsize=10)
    g.set_xlabel(" ",fontsize=10,rotation = 90)
    g.tick_params(labelsize=10)
    sns.despine()
    # The upper y-limit is 95% of the largest score, so the top of the range is cropped.
    ax.set(ylim=(0, SCORE.max()*0.95))
    g.legend(loc='upper right',prop={'size':6},bbox_to_anchor=(2.0, 0.95),fancybox=True) 


elif plot_type == 'violin':
    # VIOLIN GENE EXPRESSION
    g = sns.violinplot(x="gene", y="Z-normalized Expression", 
                       hue=meta,data=violin_data, palette=palette,notch = True, 
                    hue_order = ['Castle 1','Castle 2'],fliersize = 4, showmeans=True,linewidth = 1) #order = labels, 

    g.set_ylabel("{}".format(datatype),fontsize=10)
    g.set_xlabel(" ",fontsize=10,rotation = 90)
    ax.set(ylim=(0, SCORE.max()*0.95))
    g.tick_params(labelsize=10)
    sns.despine()

# COMPARE DISTRIBUTIONS
CLASS1 = violin_data.loc[violin_data[meta].isin(['Castle 1'])]['Z-normalized Expression'].values
CLASS2 = violin_data.loc[violin_data[meta].isin(['Castle 2'])]['Z-normalized Expression'].values
print(title + ' CLASS1 vs. CLASS2')
print(stats.mannwhitneyu(CLASS1,CLASS2))

# SAVE FIGURE
# The label has no replacement fields, so it is used as-is; formatting it with
# meta/subset_type/method/metric only made the cell fail when those were unset.
figure_label = 'BAP1_Expression_by_GEP_BGMM_HARBOUR'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

### GEP PROGNOSTICATION OF INDIVIDUAL TUMOR CELLS - HARBOUR PRIMARY

#### GEP SIGNATURE CLUSTERED HEATMAP

In [None]:
# Build a z-scored cells x genes matrix of the GEP marker genes for
# INDF_HARBOUR_PRIMARY and compute row and column clustering linkages.
class_genes = GEP_markerfile_df.index.values.tolist()

QUERY = INDF_HARBOUR_PRIMARY

# COLUMN INDEX AND COLORS
# Keep only the marker genes that exist as columns of QUERY.
genes = [gene for gene in class_genes if gene in QUERY.columns]#gene_array['Gene'].values
# CONSTRUCT HEATMAP DATA
# zscore(..., axis=0) standardizes each gene column across the rows of QUERY.
heatmap_data = pd.DataFrame(data = zscore(QUERY[genes].values,axis=0),columns = genes, index = QUERY.index)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE 
method = 'centroid' # average, single
metric = 'euclidean' # cosine
# Row linkage clusters the rows of heatmap_data; the transpose gives the column (gene) linkage.
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

In [None]:
# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
mat = heatmap_data.iloc[r1,c1]

In [None]:
# GEP ASSIGNMENT OF EVERY HEATMAP ROW, USING ONLY THE 'primary' METADATA ROWS
# The 'primary' subset and its 'Cell ID' level are the same for every lookup,
# so build them once instead of re-filtering METADATA_HARBOUR twice per cell.
primary_meta = METADATA_HARBOUR.loc[METADATA_HARBOUR.index.get_level_values('Legend') == 'primary']
primary_cell_ids = primary_meta.index.get_level_values('Cell ID')
# values[0] takes the first primary metadata row matching each Cell ID.
GEP_assignments_sorted = [primary_meta.loc[primary_cell_ids == x].Assignment.values[0]
                          for x in mat.index.get_level_values('Cell ID').values]

# Map each assignment through lut_CASTLE to get one colour per heatmap row.
row_colors = pd.Series(GEP_assignments_sorted, index=GEP_assignments_sorted).map(lut_CASTLE)

In [None]:
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(4,10))
plt.rcParams["axes.grid"] = False

# ADD ROW COLOR INDEX 1 (PHENOGRAPH CLASS)
# One full-width rectangle per heatmap row, stacked upward in equal slices of
# the 0-1 range.
ax1 = fig.add_axes([0,0.1,0.05,0.6]) # [x0,y0,width,height]
n_rows = len(row_colors)
for row, c in enumerate(row_colors):
    ax1.add_patch(patches.Rectangle((0, row / n_rows), 1, 1 / n_rows, color=c))
plt.axis('off')

# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.05,0.1,0.9,0.6], frameon=False)
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-1,vmax=1)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 14)
plt.tick_params(size=0)

# ADD COLORBAR
axcolor = fig.add_axes([1.1,0.1,0.1,0.1])
cbar = plt.colorbar(im, cax=axcolor)
#cbar.ax.get_yaxis().set_ticks([])

# SAVE FIGURE
# The label has no replacement fields, so it is used as-is; formatting it with
# meta/subset_type/method/metric only made the cell fail when those were unset.
figure_label = 'GEP_HARBOUR_PRIMARY_BayesianGMM'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

#### PAIRWISE CORRELATION BETWEEN MONOSOMY 3 AND GEP 2

In [None]:
# COPY THE GEP ASSIGNMENT FROM METADATA_HARBOUR ONTO METADATA_HARBOUR_PRIMARY
# Rows are matched by 'Cell ID' among the 'primary' rows of METADATA_HARBOUR.
# That subset and its 'Cell ID' level are the same for every lookup, so build
# them once instead of re-filtering twice per cell.
primary_meta = METADATA_HARBOUR.loc[METADATA_HARBOUR.index.get_level_values('Legend') == 'primary']
primary_cell_ids = primary_meta.index.get_level_values('Cell ID')
# values[0] takes the first primary metadata row matching each Cell ID.
METADATA_HARBOUR_PRIMARY['Assignment'] = [primary_meta.loc[primary_cell_ids == x].Assignment.values[0]
                                          for x in METADATA_HARBOUR_PRIMARY.index.get_level_values('Cell ID').values]

In [None]:
# PER-CELL SIGNATURE SUMS AND POINT COLOURS FOR THE SCATTER IN THE NEXT CELL
# Produces violin_data (one row per cell: assignment, 'Castle 2' sum,
# 'Monosomy 3 Up' sum) and dot_colors (one colour per cell by assignment).
subset_type = 'HARBOUR_PRIMARY'
datatype = 'INDF_{}'.format(subset_type)
# Bind QUERY / META to the INDF_<subset_type> / METADATA_<subset_type> globals by name.
exec('QUERY = {}'.format(datatype))
exec('META = METADATA_{}'.format(subset_type))

meta = 'Assignment'

# Signature 1: sum, per cell, of the geneset's genes that are present in QUERY.
gene1 = 'Castle 2'
signature_genes = genesets[gene1].values
signature_genes = [x for x in signature_genes if str(x) != 'nan']
detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
G1 = (np.nansum(QUERY[detected_genes],axis=1))#*complexity

# Signature 2: same computation for the second geneset.
gene2 = 'Monosomy 3 Up'
signature_genes = genesets[gene2].values
signature_genes = [x for x in signature_genes if str(x) != 'nan']
detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
G2 = (np.nansum(QUERY[detected_genes],axis=1))#*complexity

# ROW INDEX
meta = 'Assignment'
groupby_type = meta # 'Class', 'Meta-Source', 'Legend'
FLATUI = ['0000FF','FF0000']#FLATUI_CLASS, 'FLATUI_SOURCE, FLATUI_SAMPLES

# NOTE(review): assumes META rows are in the same order as QUERY rows - confirm.
violin_data = pd.DataFrame({
                            meta:list(META[meta]),
                            gene1: G1,
                            gene2: G2,
                           })

ind = violin_data[groupby_type]

# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI),3))
for ii,hexcolor in enumerate(FLATUI):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
# Palette for Class METADATA
# Pick one palette row per unique group; the last index is clamped into range.
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
dot_colors = pd.Series(ind).map(lut)
metacell_colors = [rgb2hex(int(color[0]*255), int(color[1]*255), int(color[2]*255)) for color in dot_colors]

In [None]:
# JOINT DISTRIBUTION OF THE TWO SIGNATURE SCORES (gene1 vs gene2)
x = gene1
y = gene2

plt.figure(figsize = (20,20))
#sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
sns.set_style('ticks')

# JOINTPLOT
g = sns.jointplot(x,y, data=violin_data.fillna(0), kind="reg", stat_func=None,ratio=2, color='k', size=10)
g.ax_joint.cla() # or g.ax_joint.collections[0].set_visible(False), as per mwaskom's comment

# REPLOT SCATTER WITH LINEAGE COLORED
plt.sca(g.ax_joint)
plt.scatter(violin_data[x], violin_data[y], c=dot_colors)

# ADD REGRESSION
# Fit a first-degree polynomial (two coefficients, initial guess [1, 1]).
xx = violin_data[x]
yy = violin_data[y]
def f(x, *p):
    return polyval(p, x)
p, cov = curve_fit(f, xx, yy, [1, 1])

# simulated draws from the probability density function of the regression
xi = linspace(np.min(xx), np.max(xx), 100)
ps = np.random.multivariate_normal(p, cov, 10000)
ysample = np.asarray([f(xi, *pi) for pi in ps])
# 10th-90th percentile band of the simulated fits at each xi
lower = percentile(ysample, 10, axis=0)
upper = percentile(ysample, 90, axis=0)

# regression estimate line
y_fit = poly1d(p)(xi)

# PLOT REGRESSION LINE
plt.fill_between(xi, lower, upper, facecolor='k', alpha=0.5)
plt.plot(xi, y_fit, 'k-')
#plt.xlim((0,5))
#plt.ylim((0,5))
plt.tick_params(size=0)
# Tick positions are hard-coded for this dataset's score ranges.
plt.yticks(ticks=[0, 10, 20, 30, 40, 50, 60], 
           labels=['0', '10', '20', '30', '40', '50', '60'], size=20)
plt.xticks(ticks=[0, 2, 4, 6, 8], labels=['0', '2', '4', '6', '8'], size=20)

# ADD AXIS LABELS
g.set_axis_labels(x, y, fontsize=10,fontname = 'Arial')

# COMPUTE STATISTICS
# NOTE: this rebinds `p` from the fitted coefficients to the Pearson p-value.
(r, p) = stats.pearsonr(violin_data[x].fillna(0), violin_data[y].fillna(0))
print('{} vs {}, R: {}, p: {}'.format(x,y,r,p))

# SAVE FIGURE
# The label has no replacement fields, so it is used as-is; formatting it with
# meta/subset_type/method/metric only made the cell fail when those were unset.
figure_label = 'M3_GEP2_Correlation_BGMM_HARBOUR_PRIMARY'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

#### EVALUATE GENE EXPRESSION PER GEP CLASS

In [None]:
# BAP1 EXPRESSION PER GEP ASSIGNMENT (box or violin) WITH A MANN-WHITNEY U TEST
plot_type = 'box'
title = 'BAP1'
meta = 'Assignment'
# NOTE: subset_type is not set in this cell; it carries over from whichever
# cell assigned it last.
datatype = 'INDF_{}'.format(subset_type)
QUERY = globals()[datatype]  # name lookup instead of exec() on a built string
META = globals()['METADATA_{}'.format(subset_type)]

fig = plt.figure(figsize = (1,5))
ax = plt.gca()

scale_type = 'count'
palette = dict(zip(['Castle 1','Castle 2'],['#0000FF','#FF0000'])) 


genes = [title]
# str(nan) is 'nan' (lower case); comparing against 'NAN' never filtered anything.
genes = np.unique([x for x in genes if str(x) != 'nan'])
detected_genes = list(set(genes).intersection(set(QUERY.columns)))
vals = QUERY[detected_genes]
SCORE = np.nansum(vals,axis=1)

# Format data structure for violin plot
# One record per cell; the row position pairs SCORE with META (assumes both
# share the same row order - TODO confirm).
assignments = META[meta].values
violin_data = []
for ind,v in enumerate(SCORE):
    violin_data.append({'gene': title, 'Z-normalized Expression': v,
                        meta:assignments[ind]}) 
violin_data = pd.DataFrame(violin_data)  

# BOXPLOT GENE EXPRESSION
if plot_type == 'box':
    g = sns.boxplot(x="gene", y="Z-normalized Expression", 
                    hue=meta,data=violin_data, palette=palette,notch = True, 
                    hue_order = ['Castle 1','Castle 2'], 
                    fliersize = 4, showmeans=False,linewidth = 1, ax = ax) #order = labels, 
    g.set_ylabel("BAP1 Imputed Expression", fontsize=14)
    g.set_xlabel(" ",fontsize=10,rotation = 90)
    g.tick_params(labelsize=11)
    sns.despine()
    # The upper y-limit is 95% of the largest score, so the top of the range is cropped.
    ax.set(ylim=(0, SCORE.max()*0.95))
    g.legend(loc='upper right',prop={'size':6},bbox_to_anchor=(2.0, 0.95),fancybox=True) 


elif plot_type == 'violin':
    # VIOLIN GENE EXPRESSION
    g = sns.violinplot(x="gene", y="Z-normalized Expression", 
                       hue=meta,data=violin_data, palette=palette,notch = True, 
                    hue_order = ['Castle 1','Castle 2'],fliersize = 4, showmeans=True,linewidth = 1) #order = labels, 

    g.set_ylabel("BAP1 Imputed Expression", fontsize=14)
    g.set_xlabel(" ",fontsize=10,rotation = 90)
    ax.set(ylim=(0, SCORE.max()*0.95))
    g.tick_params(labelsize=11)
    sns.despine()
    
plt.tick_params(size=0)

# COMPARE DISTRIBUTIONS
CLASS1 = violin_data.loc[violin_data[meta].isin(['Castle 1'])]['Z-normalized Expression'].values
CLASS2 = violin_data.loc[violin_data[meta].isin(['Castle 2'])]['Z-normalized Expression'].values
print(title + ' CLASS1 vs. CLASS2')
print(stats.mannwhitneyu(CLASS1,CLASS2))

# SAVE FIGURE
# The label has no replacement fields, so it is used as-is; formatting it with
# meta/subset_type/method/metric only made the cell fail when those were unset.
figure_label = 'BAP1_Expression_by_GEP_BGMM_HARBOUR_PRIMARY'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label 
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

#### PHENOTYPIC VOLUME BY PATIENT

In [None]:
# LOAD THE PRIMARY-TUMOR MATRIX AND RE-INDEX IT WITH THE GEP ASSIGNMENT ATTACHED
subset_type = 'HARBOUR_PRIMARY'
meta ='Assignment'

# The store keys are fixed strings, so load directly instead of going through exec().
QUERY = h5_data_harbour.load('/NDF_TUMOR_PRIMARY')
METADATA_HARBOUR = h5_data_harbour.load('/METADATA_HARBOUR')
# Keep metadata rows whose second index level equals 'primary'
# (presumably the 'Legend' level - verify the level order of METADATA_HARBOUR).
META = METADATA_HARBOUR.loc[METADATA_HARBOUR.index.map(lambda x: x[1] == 'primary')]
# New 5-level index: the four metadata index levels plus the `meta` column as a fifth level.
new_index = pd.MultiIndex.from_tuples(list(zip(META.index.get_level_values('Sample ID'), 
                                               META.index.get_level_values('Legend'),
                                               META.index.get_level_values('Patient'),
                                               META.index.get_level_values('Cell ID'),
                                               META[meta])), 
                                  names=['Sample ID','Legend', 'Patient','Cell ID',meta])
# NOTE(review): the index is attached by position, so this assumes QUERY and
# META have the same number of rows in the same order - confirm.
QUERY = pd.DataFrame(data = QUERY.values, index = new_index, columns = QUERY.columns)

In [None]:
# GENE FILTERING
# Flags genes for removal using three per-gene statistics computed on QUERY
# (number of expressing cells, log-log total counts, log10 standard
# deviation), plots each distribution, and builds CUT_DF / variable_goi from
# the genes that survive all three filters.
fig_dpi = 400

plt.figure(figsize = (10,3))
gs1 = gridspec.GridSpec(1, 3)
gs1.update(wspace=0.7, hspace=0.7) # set the spacing between axes. 

# (1) LOG NUMBER OF CELLS EXPRESSING EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 0,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
# log(count) <= 0 after the inf/nan reset means the gene is detected in at
# most one cell (the "singleton" filter named on the x-axis label).
rmv_genes1 = np.where(num_cells_per_gene<=0)[0]

ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')

# rmv_genes* hold positional indices, so test .size rather than .any():
# .any() is False when the only flagged gene sits at position 0.
if rmv_genes1.size:
    plt.hist(num_cells_per_gene[rmv_genes1], bins, alpha=1, label='remove')
    
#ax.set_axis_bgcolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)')
plt.grid(True)
sns.despine()

# (2) LOG-LOG TOTAL COUNTS PER GENE - REMOVE LOW ABUNDANCE GENES
log_counts_per_gene = np.log(np.log(np.sum(QUERY.values,axis=0)))
log_counts_per_gene[(np.isinf(log_counts_per_gene)) | (np.isnan(log_counts_per_gene))] = 0
data = log_counts_per_gene

ax = plt.subplot(gs1[1])
bins = np.linspace(data.min(), data.max()*0.95, 100)
y,x,_=hist(data,bins,alpha=.3,label='data')

# Fit the two-component model to the histogram (bin centres vs. counts).
x=(x[1:]+x[:-1])/2 
expected=(0,.1,3500,1.75,.2,500)
params,cov=curve_fit(bimodal,x,y,expected)
sigma=sqrt(diag(cov))
plot(x,bimodal(x,*params),color='red',lw=3,label='model')

mu1 = params[0]
std1 = params[1]
mu2 = params[3]
std2 = params[4]
# abs(): the fitted width may come back negative; filter 3 below already
# guards against that, so apply the same guard here.
rmv_genes_neg = np.where(data<mu2-3*abs(std2))[0]
rmv_genes2 = rmv_genes_neg  # np.where output is already sorted and unique

if rmv_genes2.size:
    plt.hist(data[rmv_genes2], bins, alpha=1, label='remove')
    
#ax.set_axis_bgcolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 2: \nLog-Log Counts/Gene')
plt.grid(True)
sns.despine()

# (3) LOG10 STANDARD DEVIATION PER GENE
pseudocount = 0.001
std_per_gene = np.log10(QUERY.std(axis=0, skipna=False)+pseudocount)
data = std_per_gene

ax = plt.subplot(gs1[2])
bins = np.linspace(data.min(), data.max()*0.95, 100)
y,x,_=hist(data,bins,alpha=.3,label='data')

x=(x[1:]+x[:-1])/2 
expected=(0,-3,3000,-1,.2,500)
params,cov=curve_fit(bimodal,x,y,expected,maxfev=3000)
sigma=sqrt(diag(cov))
plot(x,bimodal(x,*params),color='red',lw=3,label='model')

mu1 = params[0]
std1 = params[1]
mu2 = params[3]
std2 = params[4]
rmv_genes_neg = np.where(data<mu2-3*abs(std2))[0]
rmv_genes3 = rmv_genes_neg

if rmv_genes3.size:
    # `data` is a gene-labelled Series here, so select by position explicitly.
    plt.hist(data.iloc[rmv_genes3], bins, alpha=1, label='remove')
    
#ax.set_axis_bgcolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 3: \nSDEV/Gene')
plt.grid(True)
sns.despine()

# Legend anchored just outside the upper-right corner of the last panel
L = plt.legend(loc='upper right',prop={'size':12},bbox_to_anchor=(1.6, 1.05),fancybox=True) 

# SAVE FIGURE (saving is currently disabled; only the target path is printed)
figure_label = '_filter_genes'
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label + '.png'
#plt.savefig(fn, dpi=fig_dpi)
print(fn)

# EVALUATE NUMBER OF GENES FLAGGED BY EACH FILTER
print('Count Gene Filter 1: {}'.format(len(rmv_genes1)))
print('Count Gene Filter 2: {}'.format(len(rmv_genes2)))
print('Count Gene Filter 3: {}'.format(len(rmv_genes3)))

# REMOVE THE UNION OF FLAGGED GENES (the 'remove' histograms above)
CUT_DF = deepcopy(QUERY)
rmv_genes = np.unique(np.concatenate([rmv_genes1, rmv_genes2, rmv_genes3]))
if rmv_genes.size:
    CUT_DF = CUT_DF.drop(CUT_DF.columns[rmv_genes],axis=1)
    print(CUT_DF.shape)
    
# Remove empty genes if they exist
drop_genes = np.where(CUT_DF.sum(axis=0)==0)[0]
CUT_DF = CUT_DF.drop(CUT_DF.columns[drop_genes],axis=1)
print(CUT_DF.shape)

# GENES EXCLUDING LOW ABUNDANCE GENES
variable_goi = list(CUT_DF.columns)
print(len(variable_goi))

In [None]:
# Load one joblib file per Harbour patient into C1_harbour..C8_harbour
# (presumably the precomputed phenotypic-volume samples, per the file names).
(C1_harbour, C2_harbour, C3_harbour, C4_harbour,
 C5_harbour, C6_harbour, C7_harbour, C8_harbour) = [
    joblib.load('/workdir/uvmel_project/data/MSK_Phenotypic_Volume_' + patient_id)
    for patient_id in ('UMM059', 'UMM061', 'UMM062', 'UMM063',
                       'UMM064', 'UMM065', 'UMM066', 'UMM069')
]

In [None]:
# Overlay one shaded KDE per Harbour patient and save the figure as PNG + PDF.
sns.set_style("white")
fig = plt.figure(figsize=(5, 5))
ax = plt.gca()

harbour_volumes = [C1_harbour, C2_harbour, C3_harbour, C4_harbour,
                   C5_harbour, C6_harbour, C7_harbour, C8_harbour]
# NOTE(review): the last three patients all use FLATUI_PATIENT[5], so their
# curves are indistinguishable by colour -- confirm whether that is intended.
palette_slots = [0, 1, 2, 3, 4, 5, 5, 5]
for volumes, slot in zip(harbour_volumes, palette_slots):
    sns.kdeplot(volumes, shade=True, color="#" + FLATUI_PATIENT[slot])

ax.set_ylabel("Frequency", fontsize=10)
ax.set_xlabel("Log Phenotypic Volume", fontsize=10)
ax.tick_params(labelsize=10)
sns.despine()
plt.ylim((0, 30))

# SAVE FIGURE (create the output directory on first use)
figure_label = 'Log_Phenotypic_Volume_BY_PATIENT_HARBOUR_PRIMARY'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

for ext in ('.png', '.pdf'):
    fig.savefig(fn + ext, bbox_inches='tight', dpi=400)
print(fn)

### PHENOTYPIC VOLUME INTRAPATIENT

In [None]:
# Patient UMM062: stash the full matrix in QUERY_FULL, then narrow QUERY to
# this patient's rows (index position 2 is the 'Patient' level).
QUERY_FULL = QUERY.copy()
is_umm062 = QUERY_FULL.index.get_level_values('Patient') == 'UMM062'
QUERY = QUERY_FULL.loc[is_umm062]

In [None]:
# Display the number of cells per 'Assignment' label in the current QUERY subset.
QUERY.index.get_level_values('Assignment').value_counts()

In [None]:
# Load the two per-group joblib files for patient UMM062 (GEP1 and GEP2);
# presumably precomputed phenotypic-volume samples, per the file names.
C1_UMM062 = joblib.load('/workdir/uvmel_project/data/MSK_Phenotypic_Volume_UMM062_GEP1')
C2_UMM062 = joblib.load('/workdir/uvmel_project/data/MSK_Phenotypic_Volume_UMM062_GEP2')

In [None]:
# Overlay the shaded KDEs of the two UMM062 groups (GEP1 blue, GEP2 red) and
# save the figure as PNG + PDF.
sns.set_style("white")
fig = plt.figure(figsize=(5, 5))
ax = plt.gca()

for volumes, hex_color in ((C1_UMM062, "#0000FF"), (C2_UMM062, "#FF0000")):
    sns.kdeplot(volumes, shade=True, color=hex_color)

ax.set_ylabel("Frequency", fontsize=10)
ax.set_xlabel("Log Phenotypic Volume", fontsize=10)
ax.tick_params(labelsize=10)
sns.despine()
plt.ylim((0, 10))

# SAVE FIGURE (create the output directory on first use)
figure_label = 'Log_Phenotypic_Volume_UMM062'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

for ext in ('.png', '.pdf'):
    fig.savefig(fn + ext, bbox_inches='tight', dpi=400)
print(fn)


In [None]:
# Patient UMM065: narrow QUERY to this patient's rows of the full matrix
# (index position 2 is the 'Patient' level).
is_umm065 = QUERY_FULL.index.get_level_values('Patient') == 'UMM065'
QUERY = QUERY_FULL.loc[is_umm065]

In [None]:
# Display the number of cells per 'Assignment' label in the current QUERY subset.
QUERY.index.get_level_values('Assignment').value_counts()

In [None]:
# Load the two per-group joblib files for patient UMM065 (GEP1 and GEP2);
# presumably precomputed phenotypic-volume samples, per the file names.
C1_UMM065 = joblib.load('/workdir/uvmel_project/data/MSK_Phenotypic_Volume_UMM065_GEP1')
C2_UMM065 = joblib.load('/workdir/uvmel_project/data/MSK_Phenotypic_Volume_UMM065_GEP2')

In [None]:
# Overlay the shaded KDEs of the two UMM065 groups (GEP1 blue, GEP2 red) and
# save the figure as PNG + PDF.
sns.set_style("white")
fig = plt.figure(figsize=(5, 5))
ax = plt.gca()

for volumes, hex_color in ((C1_UMM065, "#0000FF"), (C2_UMM065, "#FF0000")):
    sns.kdeplot(volumes, shade=True, color=hex_color)

ax.set_ylabel("Frequency", fontsize=10)
ax.set_xlabel("Log Phenotypic Volume", fontsize=10)
ax.tick_params(labelsize=10)
sns.despine()
plt.ylim((0, 10))

# SAVE FIGURE (create the output directory on first use)
figure_label = 'Log_Phenotypic_Volume_UMM065'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

for ext in ('.png', '.pdf'):
    fig.savefig(fn + ext, bbox_inches='tight', dpi=400)
print(fn)


## TESTING STING AND IFN IN CLASS 1 SAMPLES

In [None]:
# Keep the INDF_HARBOUR rows whose third index entry is one of three sample IDs.
# NOTE(review): presumably index position 2 is the 'Patient' level and these
# three are the Class 1 tumors named in the section heading -- confirm.
INDF_HARBOUR_CLASS1 = INDF_HARBOUR.loc[INDF_HARBOUR.index.map(lambda x: x[2] in ['BSSR0022', 'UMM062', 'UMM065'])]

In [None]:
# Sanity check: print the subset's dimensions and display its first rows.
print(INDF_HARBOUR_CLASS1.shape)
INDF_HARBOUR_CLASS1.head()

In [None]:
# STING / IFN / HLA heatmap for the Class 1 subset: z-score the selected
# genes, smooth along the cell axis with a triangular rolling window, draw the
# matrix, add a per-cell patient colour strip, and save as PNG + PDF.
plot_genes = ['IRF3', 'RELA', 'RELB', 'MB21D1', 'IKBKB', 'NFKB1', 'TMEM173', 'STAT2', 'CXCL10', 'NFKB2',
              'CCL5', 'TBK1', 'JAK1', 'JAK2', 'IRF9', 'TYK2', 'IRF7', 'IRF1', 'STAT1',
              'HLA-A', 'HLA-B', 'HLA-C', 'HLA-E', 'HLA-F', 'HLA-G'
             ]

print(len(plot_genes))

subset_type = 'TUMOR'
QUERY = INDF_HARBOUR_CLASS1

# CONSTRUCT HEATMAP DATA (per-gene z-score across cells)
heatmap_data = pd.DataFrame(data = zscore(QUERY[plot_genes],axis=0), 
                            columns = plot_genes, index = QUERY.index)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE SETTINGS (clustering itself is disabled; rows keep QUERY order)
method = 'centroid' # average, single centroid/euclidean
metric = 'euclidean' # cosine
mat = heatmap_data.copy()

# Smooth along the cell axis; the centred window leaves NaN rows at both
# edges, which are filled with the nearest fully-windowed row.
window = 20
mat = mat.rolling(window, win_type='triang',center = True).sum()
half_window = int(window/2)
for ind in np.arange(half_window):
    mat.iloc[ind] = mat.iloc[half_window]    
# Start at 1: iloc[-0] is iloc[0], so starting at 0 overwrote the FIRST row
# with a value taken from the tail of the matrix.
for ind in np.arange(1, half_window):
    mat.iloc[-ind] = mat.iloc[-half_window]

# VIEW HEATMAP
fig = plt.figure(figsize=(5,10))

# ADD MATRIX WITH GENE NAMES ON THE X AXIS
axmatrix = fig.add_axes([0.12,0.1,0.4,0.6])
colors = [(1,1,1), np.divide(tuple(hex('FF0000').rgb),255)]
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-10,vmax=10)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 6,fontname='Arial')
ytick = plt.yticks([])
axmatrix.grid(False)
for side in ('top', 'right', 'bottom', 'left'):
    axmatrix.spines[side].set_visible(False)

# ADD ROW COLOR STRIP BY PATIENT
# mat shares QUERY's index, so the patient of every row can be read straight
# from the 'Patient' level instead of re-filtering QUERY once per cell.
patient_status = mat.index.get_level_values('Patient').values

lut_PATIENT_STATUS_TEMP = dict({'BSSR0022': np.array([51, 0, 255])/255, 
                                'UMM062': np.array([147, 124, 242])/255,
                                'UMM065': np.array([214, 207, 243])/255})

row_colors1 = pd.Series(patient_status, index=patient_status).map(lut_PATIENT_STATUS_TEMP)
ax1 = fig.add_axes([0.08,0.1,0.035,0.6]) # [x0,y0,width,height]
# One unit-width rectangle per cell, stacked bottom-to-top in row order.
n_rows = len(row_colors1)
for y, c in enumerate(row_colors1):
    ax1.add_patch(patches.Rectangle((0, y / n_rows), 1, 1 / n_rows, color=c))
plt.axis('off')

# SAVE FIGURE
figure_label = '/STING_IFN_Heatmap_HARBOUR_Class1'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/TEST_Heatmaps/' + figure_label 
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

## KARYOTYPIC ANALYSIS PER ARCHETYPE

### TEST FOR CANONICAL VARIANTS

In [None]:
# Read the per-patient CNV call tables (uvmel1..uvmel6) and stack them into
# one frame indexed by (Patient, Cell).
cnv_frames = []

for x in range(1,7):
    tmp = pd.read_csv('/workdir/uvmel_project/data/CNV_data/uvmel'+ str(x) +'_cnv_data_075.csv', index_col=0)
    tmp.index.name = 'Cell'
    tmp['Patient'] = 'UM0'+str(x)  # scalar broadcast; no per-row list needed
    tmp.set_index(['Patient'], append=True, inplace=True)
    tmp = tmp.reorder_levels(['Patient','Cell']).sort_index()
    cnv_frames.append(tmp)

# Concatenate once: avoids re-copying the growing frame on every iteration
# and avoids seeding the result with an empty, index-less DataFrame.
CNV_DF = pd.concat(cnv_frames)

In [None]:
# Per-patient frequency of loss (-1) and gain (+1) calls for every CNV column,
# plus two merged whole-chromosome events:
#   '3-' : fraction of cells with a loss on 3p OR 3q
#   '8+' : fraction of cells with a gain on 8p OR 8q
CNV_FREQ_DF = pd.DataFrame()
patient_levels = CNV_DF.index.get_level_values('Patient')
cnv_columns = CNV_DF.columns.tolist()

# Two output columns per CNV column, built once (not once per patient).
variant_list = []
for y in cnv_columns:
    variant_list = variant_list + [y+'-', y+'+']
freq_list = [[] for _ in variant_list]

patient_list = []
merged_3_loss = []
merged_8_gain = []

for x in patient_levels.unique():
    patient_list.append(x)
    patient_cnv = CNV_DF[patient_levels == x]

    for pos, y in enumerate(cnv_columns):
        # Normalised counts of the three states, in the fixed order -1, 0, 1.
        freqs = patient_cnv[y].value_counts(normalize=True).reindex([-1,0,1],
                                                           fill_value=0).sort_index()
        print(freqs)
        vals_tmp = freqs.values.tolist()

        freq_list[2*pos].append(vals_tmp[0])      # y+'-'
        freq_list[2*pos + 1].append(vals_tmp[2])  # y+'+'

    # Vectorised per-cell OR; the mean of the boolean mask is the fraction.
    merged_3_loss.append(((patient_cnv['3p'] == -1) | (patient_cnv['3q'] == -1)).mean())
    # Both arms are tested for a GAIN (+1); the 8q arm was previously
    # compared against -1, a copy-paste from the loss case above.
    merged_8_gain.append(((patient_cnv['8p'] == 1) | (patient_cnv['8q'] == 1)).mean())

CNV_FREQ_DF['Patient'] = patient_list

for pos, x in enumerate(variant_list):
    CNV_FREQ_DF[x] = freq_list[pos]

CNV_FREQ_DF.set_index(['Patient'], inplace=True)

CNV_FREQ_DF['3-'] = merged_3_loss
CNV_FREQ_DF['8+'] = merged_8_gain

In [None]:
# Display the per-patient '6q-' frequency column.
CNV_FREQ_DF[['6q-']]

In [None]:
# Heatmap of canonical CNV frequencies (rows) for five patients (columns),
# drawn with a 100-step 'Reds' colormap saturating at 0.5, saved as PNG + PDF.

# CONSTRUCT HEATMAP DATA
canonical_list = ['1p-', '3-', '6p+', '6q-', '8p-', '8+',
                  '11q-', '16q-', '17p-']

mat = CNV_FREQ_DF[canonical_list].T.iloc[::-1]                     # Canonical Only
mat = mat[['UM01', 'UM02', 'UM04', 'UM05', 'UM06']]

# Drop variants that are absent (frequency 0) in every selected patient.
mat = mat.loc[~(mat==0).all(axis=1)]

# VIEW LABELED HEATMAP
fig = plt.figure(figsize=(5,4))
# The figure background was meant to be white; the former
# `fig.axes.facecolor:'white'` line was an annotation statement that did nothing.
fig.patch.set_facecolor('white')

# ADD MATRIX WITH VARIANT NAMES
axmatrix = fig.add_axes([0.08,0.1,0.7,0.6])

color_cutoff = 50
# plt.get_cmap only needs the pyplot import this file already has.
reds = plt.get_cmap('Reds')
cnv_colormap = reds(np.linspace(0, 1, 100))
cnv_colormap = mcolors.ListedColormap(cnv_colormap)

im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=cnv_colormap, vmax=0.5)
labels = list(mat.columns)
labels_y = list(mat.index)
axmatrix.xaxis.set_ticks_position('top')
# plt.xticks / plt.yticks set tick positions and labels together, so no
# separate set_yticklabels call is needed.
xtick = plt.xticks(range(len(labels)), labels, rotation = 0, fontsize = 8, fontname='Arial')
ytick = plt.yticks(range(len(labels_y)), labels_y, fontsize=6, fontname='Arial')

# Add colorbar
axcolor = fig.add_axes([0.8,0.1,0.05,0.3])
cbar = plt.colorbar(im, cax=axcolor, ticks=[0,0.5,1])

# SAVE FIGURE
figure_label = 'CNV_Frequencies_Canonical_All_Others'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/CNV_FIGURES/' + figure_label 
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# Display the matrix that was just plotted.
mat

In [None]:
# Rebuild the canonical-variant frequency matrix for ALL patients
# (variants as rows, in reversed list order), dropping all-zero rows.
canonical_list = ['1p-', '3-', '6p+', '6q-', '8p-', '8+',
                  '11q-', '16q-', '17p-']

canonical_freqs = CNV_FREQ_DF[canonical_list]
mat = canonical_freqs.T.iloc[::-1]                     # Canonical Only

absent_everywhere = (mat == 0).all(axis=1)
mat = mat.loc[~absent_everywhere]

In [None]:
# Display the rebuilt matrix.
mat

In [None]:
# Overwrite the row labels of `mat` with a hard-coded list of eight names.
# NOTE(review): `mat` was built with rows in REVERSED canonical order, and this
# list is in forward order and contains '6p-' (not in canonical_list) -- verify
# the labels still match the rows they are attached to, and that exactly eight
# rows survived the all-zero filter.
mat.index = ['1p-', '3-', '6p-', '6p+', '8+', '11q-', '16q-', '17p-']

In [None]:
# Display the relabelled matrix.
mat

In [None]:
# Preview the current cnv_colormap: 100 points on the diagonal, coloured by
# their own value in [0, 1).
plt.scatter(np.arange(0,1,1.0/100), np.arange(0,1,1.0/100),
            cmap=cnv_colormap, c=np.arange(0,1,1.0/100))

### LOAD CNV RESULTS FROM inferCNV

In [None]:
# Read the tab-separated inferCNV expression output into a DataFrame.
inferCNV_df = pd.read_csv('/workdir/uvmel_project/inferCNV/results_no_cluster_by_grouping_0.1/'+
                          'expr.infercnv.dat',
                           sep='\t')

In [None]:
# Display the loaded inferCNV table.
inferCNV_df

In [None]:
# Display the number of DF_TUMOR rows per 'Patient' index value.
DF_TUMOR.index.get_level_values('Patient').value_counts()

### LOAD CNV RESULTS FROM CaSpER

In [None]:
# Read the uvmel2 CNV call table, using the first CSV column as the index.
mat_input = pd.read_csv('/workdir/uvmel_project/data/CNV_data/uvmel2_cnv_data_075.csv', index_col=0)

In [None]:
# Display the first rows of the loaded CNV table.
mat_input.head()

### VISUALIZE RESULTS

In [None]:
# Hierarchically cluster the cells of `mat_input` on their CNV calls, draw the
# reordered call matrix with a three-colour map next to a truncated row
# dendrogram, and save as PNG + PDF.

# CONSTRUCT HEATMAP DATA
heatmap_data = mat_input

# LINKAGE 
method = 'average' # average, single centroid/euclidean
metric = 'euclidean' # cosine
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)

# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
mat = heatmap_data.iloc[r1,:]                                      # Clustered

# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(14,6))
# The figure background was meant to be white; the former
# `fig.axes.facecolor:'white'` line was an annotation statement that did nothing.
fig.patch.set_facecolor('white')

# ADD MATRIX WITH CNV COLUMN NAMES
axmatrix = fig.add_axes([0.08,0.1,0.7,0.6])
cnv_colormap = mcolors.ListedColormap([plt.cm.RdBu_r(0),
                                       'white',
                                       plt.cm.RdBu(0)])

# Pin -1 / 0 / +1 to the three colours. The norm used to be built but never
# passed to matshow, so the colours followed the data's own min/max and
# shifted whenever a state was missing. It is also named cnv_norm so it no
# longer rebinds the global `norm` imported from numpy.linalg.
cnv_norm = mcolors.TwoSlopeNorm(vmin=-1, vmax = 1, vcenter=0)
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=cnv_colormap, norm=cnv_norm)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 0, fontsize = 8, fontname='Arial')
axmatrix.grid(False)
axmatrix.tick_params(axis=u'y', which=u'both',length=0)

# ADD DENDROGRAM (via `hc`, the same scipy.cluster.hierarchy alias used for
# the linkage above)
ax1 = fig.add_axes([0.00,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = hc.dendrogram(row_linkage, orientation='left', color_threshold = 0, above_threshold_color='#808080',
                   p=6, truncate_mode='level')
ax1.set_xticks([])
ax1.set_yticks([])
plt.axis('off')

# SAVE FIGURE
figure_label = 'UM02_CLUSTERED_075'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/CNV_FIGURES/' + figure_label 
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

### VISUALIZE ADJACENCY GRAPH

In [None]:
# Load the adjacency matrix and the variant-frequency table for one patient.
patientNum = 2
cnv_file_stem = '/workdir/uvmel_project/data/CNV_data/uvmel' + str(patientNum)
CNVadjMatx = pd.read_csv(cnv_file_stem + '_adjMatx_data_075.csv', index_col=0)
CNVvarFreq = pd.read_csv(cnv_file_stem + '_varFreq_data_075.csv', index_col=0)

In [None]:
# Filter the adjacency matrix: zero entries whose magnitude exceeds max_p,
# zero all edges of variants rarer than min_freq, then drop columns and rows
# that are left entirely zero.
# NOTE(review): entries are presumably signed p-values -- confirm.
max_p = 0.01
min_freq = 0.10

CNVadjMatxFiltered = CNVadjMatx.copy()
CNVadjMatxFiltered[CNVadjMatxFiltered.abs() > max_p] = 0

rare_variants = CNVvarFreq[CNVvarFreq.varFreq < min_freq].V1.values
filterVars = [name for name in rare_variants if name in CNVadjMatxFiltered.columns]
CNVadjMatxFiltered.loc[:, filterVars] = 0
CNVadjMatxFiltered.loc[filterVars, :] = 0

nonzero_cols = (CNVadjMatxFiltered != 0).any(axis=0)
CNVadjMatxFiltered = CNVadjMatxFiltered.loc[:, nonzero_cols]
nonzero_rows = (CNVadjMatxFiltered != 0).any(axis=1)
CNVadjMatxFiltered = CNVadjMatxFiltered.loc[nonzero_rows]

In [None]:
# Draw the filtered adjacency matrix as a graph: nodes are variants (sized by
# frequency, coloured by whether the name contains 'amp'), edges are styled
# by the sign and magnitude of their weight. Saved as PNG + PDF.
G = nx.from_pandas_adjacency(CNVadjMatxFiltered)

weights = [G[u][v]['weight'] for u,v in G.edges()]
# Short node labels: first 2 characters of 5-character names, else first 3.
label_dict = {node: (node[:2] if len(node) == 5 else node[:3]) for node in G.nodes}

fig = plt.figure(figsize=(8, 6))

# Drawing options shared by the planar attempt and the shell fallback.
draw_kwargs = dict(
    with_labels=True,
    labels=label_dict,
    font_color='white',
    font_size=17,
    font_weight='bold',
    node_color=[plt.cm.RdBu(0) if 'amp' in node else plt.cm.RdBu_r(0) for node in G],
    node_size=[8000.0*(CNVvarFreq.loc[CNVvarFreq.V1 == label].varFreq.values[0])
               for label in G],
    width=[-0.1*np.log2(abs(x)) for x in weights],
    style=['solid' if x < 0 else 'dashed' for x in weights],
)

try:
    nx.draw_planar(G, **draw_kwargs)
except nx.NetworkXException:
    # The planar layout raises when G is not planar; fall back to a shell
    # layout. Any other error (bad data, missing variant) now surfaces
    # instead of being silently retried.
    nx.draw_shell(G, **draw_kwargs)

plt.axis('off')
axis = plt.gca()
# Pad the limits by 20% so large nodes are not clipped at the border.
axis.set_xlim([1.2*x for x in axis.get_xlim()])
axis.set_ylim([1.2*y for y in axis.get_ylim()])
plt.tight_layout()

# SAVE FIGURE
figure_label = 'UM0'+str(patientNum)+'_Network_10_075'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/CNV_FIGURES/' + figure_label 
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
### 

### KARYOTYPIC VOLUME

In [None]:
# Load the per-patient chromosome-smoothed expression tables, stack them,
# re-index them with the matching DF_TUMOR rows, and append each cell's
# 'Assignment' label as an extra index level.
patientNum = 6
#current_list = [patientNum]
current_list = [1,2,3,4,5,6]
# Built once, not once per row inside the lambdas below.
patient_ids = ['UM0'+str(i) for i in current_list]

# Collect, then concatenate once (DataFrame.append is removed in pandas 2.x).
chr_frames = []
for i in current_list:
    chr_frames.append(pd.read_csv('/workdir/uvmel_project/data/CNV_data/uvmel'+
                                  str(i)+'_chrSmoothedExp_data.csv', 
                                  index_col=0))
chr_smoothed_map = pd.concat(chr_frames) if chr_frames else pd.DataFrame()

# The index is replaced positionally, so this assumes the stacked CSV rows are
# in the same order as the selected DF_TUMOR rows -- TODO confirm.
# NOTE(review): presumably index position 2 is the 'Patient' level -- confirm.
chr_smoothed_map.index = DF_TUMOR.loc[DF_TUMOR.index.map(lambda x: x[2] in patient_ids)].index
chr_smoothed_map['Assignment'] = METADATA_TUMOR.loc[METADATA_TUMOR.index.map(lambda x: x[2] in 
                                                                             patient_ids)].Assignment
chr_smoothed_map.set_index('Assignment', append=True, inplace=True)

In [None]:
# Display the number of cells per 'Assignment' label.
chr_smoothed_map.index.get_level_values('Assignment').value_counts()

In [None]:
# Display the first rows of the stacked table.
chr_smoothed_map.head()

In [None]:
# Histogram of column '3' for the rows whose third index entry is 'UM05',
# with a red vertical marker at -0.25.
# NOTE(review): axvline's ymin/ymax are axes fractions in [0, 1]; ymax=100 is
# clipped to the axes, so the line simply spans the full height.
plt.hist(chr_smoothed_map[chr_smoothed_map.index.map(lambda x: x[2] == 'UM05')]['3'].values, bins=100)
plt.axvline(-0.25, ymin=0, ymax=100, color='red')

In [None]:
# Discretise the smoothed expression into loss (-1) / neutral (0) / gain (+1)
# calls at +/- thr, then tabulate per-patient loss and gain frequencies for
# every column.
thr = 0.25

# NOTE(review): this is an alias, not a copy, so chr_smoothed_map itself is
# discretised in place and later cells see the -1/0/1 values. Kept as-is
# because downstream cells may rely on it -- confirm the intent.
CNV_EXP_DF = chr_smoothed_map

# Compute all three masks from the untouched values first, so the result does
# not depend on assignment order. The neutral band is closed (<=): values
# exactly at +/- thr used to match none of the three tests and stayed
# undiscretised.
loss_mask = CNV_EXP_DF < -1*thr
gain_mask = CNV_EXP_DF > thr
neutral_mask = abs(CNV_EXP_DF) <= thr
CNV_EXP_DF[loss_mask] = -1
CNV_EXP_DF[gain_mask] = 1
CNV_EXP_DF[neutral_mask] = 0

CNV_EXP_FREQ_DF = pd.DataFrame()
patient_levels = CNV_EXP_DF.index.get_level_values('Patient')
exp_columns = CNV_EXP_DF.columns.tolist()

# Two output columns per input column, built once (not once per patient).
variant_list = []
for y in exp_columns:
    variant_list = variant_list + [y+'-', y+'+']
freq_list = [[] for _ in variant_list]

patient_list = []
for x in patient_levels.unique():
    patient_list.append(x)
    patient_calls = CNV_EXP_DF[patient_levels == x]

    for pos, y in enumerate(exp_columns):
        # Normalised counts of the three states, in the fixed order -1, 0, 1.
        freqs = patient_calls[y].value_counts(normalize=True).reindex([-1,0,1],
                                                           fill_value=0).sort_index()
        print(freqs)
        vals_tmp = freqs.values.tolist()

        freq_list[2*pos].append(vals_tmp[0])      # y+'-'
        freq_list[2*pos + 1].append(vals_tmp[2])  # y+'+'

CNV_EXP_FREQ_DF['Patient'] = patient_list

for pos, x in enumerate(variant_list):
    CNV_EXP_FREQ_DF[x] = freq_list[pos]

CNV_EXP_FREQ_DF.set_index(['Patient'], inplace=True)

In [None]:
# Display the per-patient loss/gain frequency table.
CNV_EXP_FREQ_DF

In [None]:
# Heatmap of the expression-derived canonical CNV frequencies for UM03 only,
# drawn with a 100-step 'Reds' colormap (vmax = 1), saved as PNG + PDF.

# CONSTRUCT HEATMAP DATA
canonical_list = ['1-', '3-', '6+', '6-', '8-', '8+',
                  '11-', '16-', '17-']

mat = CNV_EXP_FREQ_DF

mat = mat[canonical_list].T.iloc[::-1]
mat = mat[['UM03']]

# VIEW LABELED HEATMAP
fig = plt.figure(figsize=(1,4))
# The figure background was meant to be white; the former
# `fig.axes.facecolor:'white'` line was an annotation statement that did nothing.
fig.patch.set_facecolor('white')

# ADD MATRIX WITH VARIANT NAMES
axmatrix = fig.add_axes([0.08,0.1,0.7,0.6])

color_cutoff = 50
# plt.get_cmap only needs the pyplot import this file already has.
reds = plt.get_cmap('Reds')
cnv_colormap = reds(np.linspace(0, 1, 100))
cnv_colormap = mcolors.ListedColormap(cnv_colormap)

im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=cnv_colormap, vmax=1)
labels = list(mat.columns)
labels_y = list(mat.index)
axmatrix.xaxis.set_ticks_position('top')
# plt.xticks / plt.yticks set tick positions and labels together, so no
# separate set_yticklabels call is needed.
xtick = plt.xticks(range(len(labels)), labels, rotation = 0, fontsize = 10, fontname='Arial')
ytick = plt.yticks(range(len(labels_y)), labels_y, fontsize=10, fontname='Arial')

# Add colorbar
axcolor = fig.add_axes([0.8,0.1,0.05,0.3])
cbar = plt.colorbar(im, cax=axcolor, ticks=[0,0.5,1])

# SAVE FIGURE
figure_label = 'CNV_Heatmap_from_Expression_UM03_unsaturated'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/CNV_FIGURES/' + figure_label 
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# Display the matrix that was just plotted.
mat

In [None]:
# Per-cell heatmap of chr_smoothed_map (rows = cells, columns = its columns)
# on a diverging RdBu_r scale pinned to [-1, 1], saved as PNG + PDF.

# CONSTRUCT HEATMAP DATA
mat = chr_smoothed_map

# VIEW HEATMAP
fig = plt.figure(figsize=(12,8))
# The figure background was meant to be white; the former
# `fig.axes.facecolor:'white'` line was an annotation statement that did nothing.
fig.patch.set_facecolor('white')

# ADD MATRIX WITH COLUMN NAMES
axmatrix = fig.add_axes([0.08,0.1,0.7,0.6])

# NOTE(review): cnv_colormap is rebuilt here but not used by this plot, which
# uses 'RdBu_r'; kept because other cells read the global.
color_cutoff = 50
reds = plt.get_cmap('Reds')
cnv_colormap = reds(np.linspace(0, 1, 100))
cnv_colormap = mcolors.ListedColormap(cnv_colormap)

im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap='RdBu_r', vmin=-1, vmax=1)
labels = list(mat.columns)
labels_y = list(mat.index)
axmatrix.xaxis.set_ticks_position('bottom')
xtick = plt.xticks(range(len(labels)), labels, rotation = 0, fontsize = 8, fontname='Arial')
# One row per cell: hide the y ticks altogether. Passing N tick locations
# with an empty label list is rejected by current matplotlib.
ytick = plt.yticks([])

# Add colorbar; ticks span the whole [-1, 1] colour range
axcolor = fig.add_axes([0.8,0.1,0.05,0.3])
cbar = plt.colorbar(im, cax=axcolor, ticks=[-1,0,1])

# SAVE FIGURE
figure_label = 'UM0'+str(patientNum)+'_Heatmap'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/CNV_FIGURES/' + figure_label 
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# Bootstrap settings used by the karyotypic-volume cell below:
# n rows are drawn per replicate, and reps enumerates the 100 replicates.
n = 50
reps = np.arange(100)

In [None]:
# Bootstrap a "log karyotypic volume" for two cell groups: for each replicate,
# resample n cells with replacement, take the covariance across columns, and
# average log10 of its positive eigenvalues over the total eigenvalue count.
mat = chr_smoothed_map.copy()


def _bootstrap_log_volume(subset, n_cells, replicates):
    """Return one log-volume estimate per replicate for the rows of *subset*."""
    volumes = np.zeros(len(replicates))
    # enumerate: the write position no longer depends on the replicate VALUES
    # being 0..len-1.
    for slot, _ in enumerate(replicates):
        resampled = subset.sample(n=n_cells, replace=True)
        gene_cov = resampled.cov(min_periods=None)
        eigenvalues = np.linalg.eigvals(gene_cov)
        # Builtin float(): the np.float alias was removed in NumPy 1.24.
        volumes[slot] = float(
            np.sum(0.5*np.log10(eigenvalues[eigenvalues>0]**2))/len(eigenvalues))
    return volumes


# CLASS I TUMORS
# NOTE(review): index position 10 is presumably the level holding the
# 'Castle 1' / 'Castle 2' labels -- confirm against the index layout.
class_selection = 'Castle 1'
SUBSET = mat.loc[mat.index.map(lambda x: x[10] == class_selection)]
logvolume_tumor_class1 = _bootstrap_log_volume(SUBSET, n, reps)

# CLASS II TUMORS
class_selection = 'Castle 2'
SUBSET = mat.loc[mat.index.map(lambda x: x[10] == class_selection)]
logvolume_tumor_class2 = _bootstrap_log_volume(SUBSET, n, reps)


# WRITE RESULTS TO DATAFRAME
boxplot_data_goi = pd.DataFrame()
boxplot_data_goi['Meta-Source'] = ['Group 1']*len(logvolume_tumor_class1)+['Group 2']*len(logvolume_tumor_class2)
boxplot_data_goi['Log Karyotypic Volume'] = np.concatenate([logvolume_tumor_class1,logvolume_tumor_class2],axis = 0)

# STATISTIC INPUTS (one array of bootstrap values per group)
T1 = boxplot_data_goi.loc[boxplot_data_goi['Meta-Source'].isin(['Group 1'])]['Log Karyotypic Volume'].values
T2 = boxplot_data_goi.loc[boxplot_data_goi['Meta-Source'].isin(['Group 2'])]['Log Karyotypic Volume'].values

In [None]:
# Display the number of bootstrap values in group 1.
len(T1)

In [None]:
# CLASS 1 vs. CLASS 2: Mann-Whitney U test between the two groups' bootstrap
# log-volume values (T1 vs. T2).
print('Castle 1 vs. Castle 2')
print(stats.mannwhitneyu(T1, T2))

In [None]:
# Overlay the shaded KDEs of the two groups' bootstrap log karyotypic volumes
# (T1 blue, T2 red) and save the figure as PNG + PDF.
sns.set_style("white")
fig = plt.figure(figsize=(5, 5))
ax = plt.gca()

for volumes, hex_color in ((T1, "#0000FF"), (T2, "#FF0000")):
    sns.kdeplot(volumes, shade=True, color=hex_color)

ax.set_ylabel("Frequency", fontsize=10)
ax.set_xlabel("Log Karyotypic Volume", fontsize=10)
ax.tick_params(labelsize=10)
sns.despine()
plt.ylim((0, 10))

# SAVE FIGURE (create the output directory on first use)
figure_label = 'Log_Karyotypic_Volume_KDE_PATIENT_' + str(patientNum)
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

for ext in ('.png', '.pdf'):
    fig.savefig(fn + ext, bbox_inches='tight', dpi=400)
print(fn)


In [None]:
# Point plot (mean +/- SD) of the bootstrap log karyotypic volume per group,
# saved as PNG + PDF.
sns.set_style("white")
fig = plt.figure(figsize = (5,5))
ax = plt.gca()

# x / y passed by keyword: newer seaborn releases reject them positionally,
# and the violin plot in this notebook already uses keywords.
sns.pointplot(x='Meta-Source', y='Log Karyotypic Volume', data=boxplot_data_goi, color='black', 
              dodge=True, join=False,
              scale=0.8, errwidth=1, capsize=0.08, ci='sd')

# SAVE FIGURE
figure_label = 'Karyotypic_Volume_Scatter_UM03'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label
    
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# Violin plot of the bootstrap log karyotypic volume per group (group 1
# white, group 2 gray), without legend or top/right spines; saved as PNG + PDF.
fig, ax = plt.subplots(figsize=(7, 7))

group_palette = {'Group 1': 'white', 'Group 2': 'gray'}
ax = sns.violinplot(
    x='Meta-Source',
    y='Log Karyotypic Volume',
    data=boxplot_data_goi,
    hue='Meta-Source',
    palette=group_palette,
    dodge=False,
    scale='area',
)

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
ax.get_legend().remove()

plt.xticks(rotation=90)

# SAVE FIGURE (create the output directory on first use)
figure_label = 'Karyotypic_Volume_Violin_UM03'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

for ext in ('.png', '.pdf'):
    fig.savefig(fn + ext, bbox_inches='tight', dpi=400)
print(fn)

### SHANNON DIVERSITY INDEX

In [None]:
# Stack the per-patient CNV call tables and index every row by its patient.
current_list = [1,2,3,4,5,6]
# Built once, not once per row inside the lambda below.
patient_ids = ['UM0'+str(i) for i in current_list]

# Collect, then concatenate once (DataFrame.append is removed in pandas 2.x).
cnv_frames = []
for i in current_list:
    cnv_frames.append(pd.read_csv('/workdir/uvmel_project/data/CNV_data/uvmel'+
                                  str(i)+'_cnv_data_075.csv', 
                                  index_col=0))
cnv_mat = pd.concat(cnv_frames) if cnv_frames else pd.DataFrame()

# The index is replaced positionally, so this assumes the stacked CSV rows are
# in the same order as the selected DF_TUMOR rows -- TODO confirm.
# NOTE(review): presumably index position 2 is the 'Patient' level -- confirm.
cnv_mat.index = DF_TUMOR.loc[DF_TUMOR.index.map(lambda x: x[2] in patient_ids
                                                )].index.get_level_values('Patient')

In [None]:
# Collapse each row of cnv_mat into one comma-separated karyotype 'State'
# string, keeping only the 'Patient' label and that string.
kar_df = cnv_mat.reset_index()

# NOTE(review): columns[1:-1] skips the first column ('Patient' after
# reset_index) but also the LAST column, so the final cnv_mat column does not
# contribute to 'State' - confirm this is intended.
state_columns = kar_df.columns[1:-1]


def _join_states(row):
    # Join the non-missing values of one row as strings.
    return ','.join(row.dropna().astype(str))


kar_df['State'] = kar_df[state_columns].apply(_join_states, axis=1)

kar_df = kar_df[['Patient', 'State']].set_index('Patient')

In [None]:
# Display the 'State' entry of kar_df selected by key 0 (quick look at one joined string).
kar_df.State[0]

In [None]:
# Bootstrap the Shannon diversity index (SDI) of karyotype states per patient:
# in each round, draw `sample_size` rows with replacement from every patient
# and compute -sum(p * log(p)) over the observed state frequencies.
sample_size = 700
sample_rounds= 20

patients = kar_df.index.unique()
sdi_df = pd.DataFrame()

for i in range(sample_rounds):
    sdi_list = []
    for patient in patients:
        is_patient = kar_df.index.get_level_values('Patient') == patient
        tmp_df = kar_df.loc[is_patient].sample(sample_size, replace=True)
        p_vect = tmp_df['State'].value_counts(normalize=True)
        sdi_list.append(-1 * sum(np.log(p) * p for p in p_vect))

    # One column per sampling round, one row per patient.
    sdi_df[str(i)] = sdi_list

sdi_df.index = patients

In [None]:
# Reshape sdi_df (rows = patients, columns = sampling rounds) into a long
# table with one (Patient, SDI) row per patient and round.
sdi_boxplot_data = pd.DataFrame()
sdi_boxplot_data['Patient'] = list(
    itertools.chain.from_iterable([patient] * sample_rounds for patient in sdi_df.index)
)
sdi_boxplot_data['SDI'] = [value for row in sdi_df.values for value in row]

In [None]:
# Point plot of the bootstrapped SDI per patient; ci='sd' draws the standard
# deviation as the error bar.
sns.set_style("white")
fig = plt.figure(figsize = (5,5))
ax = plt.gca()

# x/y are passed by keyword: seaborn 0.11 deprecated positional data
# variables and seaborn >= 0.12 only accepts `data` positionally.
sns.pointplot(x='Patient', y='SDI', data=sdi_boxplot_data, color='black',
              dodge=True, join=False,
              scale=0.7, errwidth=1, capsize=0.1, ci='sd')

# SAVE FIGURE
figure_label = 'Karyotypic_SDI_Scatter'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

# Create the output directory if it does not exist yet.
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# Attach the 'Cell ID' level of DF_TUMOR as a second index level of kar_df.
# NOTE(review): the assignment is positional, so it assumes kar_df has the
# same number of rows, in the same order, as DF_TUMOR - confirm.
cell_ids = DF_TUMOR.index.get_level_values('Cell ID')
kar_df['Cell ID'] = cell_ids
kar_df = kar_df.set_index('Cell ID', append=True)

In [None]:
# Number of cells per (Patient, State) combination, in a one-column frame named 'Count'.
kar_unique_df = kar_df.groupby(['Patient','State']).size().to_frame('Count')

In [None]:
# Keep only rows whose (Patient, State) pair occurs more than once,
# i.e. drop karyotype states seen in a single cell of a patient.
is_repeated = kar_df.reset_index().duplicated(['Patient','State'], keep=False).values
kar_ds_df = kar_df[is_repeated]

In [None]:
# Preview the first rows of kar_ds_df.
kar_ds_df.head()

In [None]:
# Bootstrap the Shannon diversity index per patient on kar_ds_df (singleton
# states removed): each round draws `sample_size` rows with replacement per
# patient and computes -sum(p * log(p)) over the state frequencies.
sample_size = 300
sample_rounds= 50

ds_patients = kar_ds_df.index.get_level_values('Patient').unique()
sdi_ds_df = pd.DataFrame()

for i in range(sample_rounds):
    sdi_list = []
    for patient in ds_patients:
        is_patient = kar_ds_df.index.get_level_values('Patient') == patient
        tmp_df = kar_ds_df.loc[is_patient].sample(sample_size, replace=True)
        p_vect = tmp_df['State'].value_counts(normalize=True)
        sdi_list.append(-1 * sum(np.log(p) * p for p in p_vect))

    # One column per sampling round, one row per patient.
    sdi_ds_df[str(i)] = sdi_list

sdi_ds_df.index = ds_patients

In [None]:
# Long-format table of the singleton-free SDI bootstrap: one (Patient, SDI)
# row per patient and sampling round.
sdi_boxplot_ds_data = pd.DataFrame()
sdi_boxplot_ds_data['Patient'] = [item for sublist in [[x]*sample_rounds
                                                       for x in sdi_ds_df.index] for item in sublist]
sdi_boxplot_ds_data['SDI'] = [item for sublist in [sdi_ds_df.iloc[x].values
                                                   for x in range(len(sdi_ds_df.index))]
                              for item in sublist]

sns.set_style("white")
fig = plt.figure(figsize = (5,5))
ax = plt.gca()

# x/y are passed by keyword: seaborn 0.11 deprecated positional data
# variables and seaborn >= 0.12 only accepts `data` positionally.
sns.pointplot(x='Patient', y='SDI', data=sdi_boxplot_ds_data, color='black',
              dodge=True, join=False,
              scale=0.7, errwidth=1, capsize=0.1, ci='sd')

# SAVE FIGURE
figure_label = 'Karyotypic_SDI_Scatter_DROP_SINGLETONS'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

# Create the output directory if it does not exist yet.
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# SHANNON DIVERSITY WITHIN UM03 FOR GEP ASSIGNMENTS
# Bootstraps the karyotypic SDI separately for each GEP 'Assignment' class of
# patient UM03 and plots the two SDI distributions as KDE curves.

# .copy() makes kar_df_UM03 an independent frame, so the column assignment
# below does not act on a slice of kar_df (SettingWithCopy).
kar_df_UM03 = kar_df.loc[kar_df.index.get_level_values('Patient') == 'UM03'].copy()

# NOTE(review): positional assignment - assumes the UM03 rows of
# METADATA_TUMOR are in the same order as the UM03 rows of kar_df; confirm.
kar_df_UM03['Assignment'] = METADATA_TUMOR.loc[METADATA_TUMOR.index.get_level_values('Patient') ==
                                               'UM03'].Assignment.values

# Replace the existing index with the GEP assignment.
kar_df_UM03.set_index('Assignment', append=False, inplace=True)

sample_size = 50
sample_rounds= 100

sdi_df = pd.DataFrame()

for i in range(sample_rounds):
    sdi_list = []
    for gep_class in kar_df_UM03.index.unique():
        tmp_df = kar_df_UM03.loc[kar_df_UM03.index.get_level_values('Assignment') == gep_class]
        # Resample with replacement, then SDI = -sum(p * log(p)) over state frequencies.
        tmp_df = tmp_df.sample(sample_size, replace=True)
        p_vect = tmp_df['State'].value_counts(normalize=True)
        sdi_list.append(-1*sum(list(map(lambda x: np.log(x)*x, p_vect))))

    # One column per sampling round, one row per GEP class.
    sdi_df[str(i)] = sdi_list

sdi_df.index = kar_df_UM03.index.unique()

# Long format: one (GEP, SDI) row per class and sampling round.
sdi_boxplot_data = pd.DataFrame()
sdi_boxplot_data['GEP'] = [item for sublist in [[x]*sample_rounds for x in sdi_df.index] for item in sublist]
sdi_boxplot_data['SDI'] = [item for sublist in [sdi_df.iloc[x].values for x in range(len(sdi_df.index))]
                           for item in sublist]

sns.set_style("white")
fig = plt.figure(figsize = (5,5))
ax = plt.gca()

# Blue = 'Castle 1', red = 'Castle 2'.
sns.kdeplot(sdi_boxplot_data.loc[sdi_boxplot_data.GEP == 'Castle 1']['SDI'].values, shade=True, color="#0000FF")
sns.kdeplot(sdi_boxplot_data.loc[sdi_boxplot_data.GEP == 'Castle 2']['SDI'].values, shade=True, color="#FF0000")

ax.set_ylabel("Frequency",fontsize=10)
ax.set_xlabel("SDI",fontsize=10)
ax.tick_params(labelsize=10)
sns.despine()
plt.ylim((0,10))

# SAVE FIGURE
figure_label = 'Karyotypic_SDI_KDE_UM03'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

# Create the output directory if it does not exist yet.
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# CLASS 1 vs. CLASS 2: Mann-Whitney U test on the bootstrapped SDI values.
# NOTE(review): `alternative` is left at the SciPy default, which is not the
# same in every SciPy release - confirm the intended sidedness.
print('Castle 1 vs. Castle 2')
is_castle_1 = sdi_boxplot_data.GEP == 'Castle 1'
is_castle_2 = sdi_boxplot_data.GEP == 'Castle 2'
castle_1_sdi = sdi_boxplot_data.loc[is_castle_1, 'SDI'].values
castle_2_sdi = sdi_boxplot_data.loc[is_castle_2, 'SDI'].values
print(stats.mannwhitneyu(castle_1_sdi, castle_2_sdi))

In [None]:
# Bar plot of the median bootstrapped SDI per GEP class with 95% confidence
# intervals, on a log-scaled y axis.
fig, ax = plt.subplots(figsize=(2,7))

# estimator is spelled np.median: a bare `median` is only available if some
# other cell happened to import it into the notebook namespace.
ax = sns.barplot(x='GEP', y='SDI', data=sdi_boxplot_data,
                 hue='GEP',
                 edgecolor='black',
                 palette={'Castle 1': 'white', 'Castle 2': 'gray'},
                 dodge=False,
                 ci=95,
                 errwidth=1,
                 capsize=0.1,
                 estimator=np.median)

ax.set_yscale('log')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.get_legend().remove()

plt.xticks(rotation=90)

# SAVE FIGURE
figure_label = 'Karyotypic_SDI_Barplot_UM03_median'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

# Create the output directory if it does not exist yet.
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

In [None]:
# Load the stored phenotypic-volume object of each patient (UM01..UM06) with
# joblib, keeping both the ordered list and the individual per-patient names.
PHEN_VOL_LIST = [joblib.load('/workdir/uvmel_project/data/MSK_Phenotypic_Volume_' + patient_id)
                 for patient_id in ('UM01', 'UM02', 'UM03', 'UM04', 'UM05', 'UM06')]

(PHEN_VOL_UM01, PHEN_VOL_UM02, PHEN_VOL_UM03,
 PHEN_VOL_UM04, PHEN_VOL_UM05, PHEN_VOL_UM06) = PHEN_VOL_LIST

In [None]:
# Display DF_TUMOR for inspection.
DF_TUMOR

In [None]:
# Display METADATA_TUMOR for inspection.
METADATA_TUMOR

In [None]:
# Long-format table of phenotypic volumes: the patient label is repeated once
# per value stored for that patient in PHEN_VOL_LIST.
patient_labels = ['UM01', 'UM02', 'UM03', 'UM04', 'UM05', 'UM06']

phen_vol_data = pd.DataFrame()

phen_vol_data['Patient'] = list(itertools.chain.from_iterable(
    [label] * len(volumes) for label, volumes in zip(patient_labels, PHEN_VOL_LIST)))

phen_vol_data['Log Phenotypic Volume'] = list(itertools.chain.from_iterable(PHEN_VOL_LIST))

In [None]:
# Point plot of 'Log Phenotypic Volume' per patient; ci='sd' draws the
# standard deviation as the error bar.
sns.set_style("white")
fig = plt.figure(figsize = (5,5))
ax = plt.gca()

# x/y are passed by keyword: seaborn 0.11 deprecated positional data
# variables and seaborn >= 0.12 only accepts `data` positionally.
sns.pointplot(x='Patient', y='Log Phenotypic Volume', data=phen_vol_data, color='black',
              dodge=True, join=False,
              scale=0.7, errwidth=1, capsize=0.1, ci='sd')

# SAVE FIGURE
figure_label = 'Phenotypic_Volume_Scatter'
fn = '/workdir/uvmel_project/figures/final_figures_10_27_20/' + figure_label

# Create the output directory if it does not exist yet.
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
print(fn)

### PHENOTYPIC VOLUME HARBOUR

In [None]:
# Reload METADATA_TUMOR from the '/METADATA_TUMOR' entry of h5_data.
# The statement is constant, so it is written as a plain assignment rather
# than being routed through exec().
METADATA_TUMOR = h5_data.load('/METADATA_TUMOR')

In [None]:
# Per-patient karyotypic Shannon diversity index (SDI) from the inferCNV
# HMMi3 predictions, compared between GEP classes 'Castle 1' and 'Castle 2'.
# For every patient the loop: reads the predicted CNV regions and the cell
# groupings, builds one karyotype 'State' string per cell, bootstraps the SDI
# per GEP class, saves the long-format table, and draws a KDE plot and a
# median bar plot. All outputs carry the patient ID in their file name.
state_map_HMMi3 = {1: 'loss',
                   2: 'neutral',
                   3: 'gain'}

# The cytoband table does not depend on the patient, so it is read once.
cytoband = pd.read_csv('/workdir/uvmel_project/casper/data/cytoband.txt', sep='\t', header=None)
cytoband.columns = ['chr', 'start', 'end', 'arm', 'col5']

for patient in ['UM01', 'UM02', 'UM03', 'UM04', 'UM05', 'UM06']:

    cnv_mat = pd.DataFrame()

    # Predicted CNV regions of this patient.
    QUERY = pd.read_csv('/workdir/uvmel_project/single_sample_results/results_HMMi3_merged_ref/'+patient+'/'+
                                'HMM_CNV_predictions.HMMi3.rand_trees.hmm_mode-subclusters.'+
                                'Pnorm_0.5.pred_cnv_regions.dat',
                                 sep='\t')

    # Mapping of each cell to its inferCNV cell group.
    QUERY_cell_groupings = pd.read_csv('/workdir/uvmel_project/single_sample_results/results_HMMi3_merged_ref/'+
                                             patient+'/'+
                                            '17_HMM_predHMMi3.rand_trees.hmm_mode-subclusters.cell_groupings',
                                             sep='\t')

    # Fraction of cells in each cell group.
    prop_df = pd.DataFrame(QUERY_cell_groupings.cell_group_name.value_counts(normalize=True)).reset_index()
    prop_df.columns = ['cell_group_name', 'proportion']

    # Chromosome arm of each region: 'p' if the cytoband containing the region
    # start has a 'p' in its arm label, otherwise 'q'. The comprehension must
    # run over this patient's QUERY rows (not over another frame's length).
    QUERY['arm'] = ['p' if 'p' in cytoband[(cytoband.chr == QUERY.chr[x]) &
                               (cytoband.start <= QUERY.start[x]) &
                               (cytoband.end >= QUERY.start[x])].arm.values[0]
                            else
                            'q'
                            for x in range(len(QUERY))]

    # Region label such as '3p loss': chromosome (without 'chr') + arm + state name.
    QUERY['CNV'] = [QUERY.chr[x].replace('chr', '')+QUERY.arm[x]+ ' ' +
                              state_map_HMMi3[QUERY.state[x]]
                              for x in range(len(QUERY))]

    # Karyotype string per cell group: its region labels joined by commas.
    cell_state_map = dict(zip(QUERY.cell_group_name.unique(),
                              [','.join(QUERY[QUERY.cell_group_name==x].CNV.values)
                               for x in QUERY.cell_group_name.unique()]))

    # Drop the first five characters of each cell name and parse the rest as an integer.
    cell_groupings_cell_names = [int(x[5:]) for x in QUERY_cell_groupings.cell.values]

    cnv_mat['Patient'] = [patient for x in range(len(QUERY_cell_groupings))]
    cnv_mat['State'] = [cell_state_map[x] for x in QUERY_cell_groupings.cell_group_name.values]

    # Metadata rows whose third index element is this patient.
    QUERY_META = METADATA_TUMOR[METADATA_TUMOR.index.map(lambda x: x[2] == patient)]

    # Index each cell by the 'Patient' level of the metadata row whose fourth
    # index element equals the parsed cell number.
    cnv_mat.index = [QUERY_META[QUERY_META.index.map(lambda x: x[3] == y)].index.\
                     get_level_values('Patient').values[0]
                     for y in cell_groupings_cell_names]

    kar_df = cnv_mat.copy()

    kar_df.reset_index(inplace=True)

    kar_df = kar_df[['Patient', 'State']]
    kar_df.set_index('Patient', inplace=True)

    # GEP assignment of each cell, looked up the same way as above.
    kar_df['Assignment'] = [QUERY_META[QUERY_META.index.map(lambda x: x[3] == y)].Assignment.values[0]
                            for y in cell_groupings_cell_names]

    kar_df.set_index('Assignment', append=False, inplace=True)


    # Compute karyotypic SDI:

    sample_size = 50
    sample_rounds= 100

    sdi_df = pd.DataFrame()

    for i in range(sample_rounds):
        sdi_list = []
        for gep_class in kar_df.index.unique():
            tmp_df = kar_df.loc[kar_df.index.get_level_values('Assignment') == gep_class]
            # Resample with replacement, then SDI = -sum(p * log(p)).
            tmp_df = tmp_df.sample(sample_size, replace=True)
            p_vect = tmp_df['State'].value_counts(normalize=True)
            sdi_list.append(-1*sum(list(map(lambda x: np.log(x)*x, p_vect))))

        sdi_df[str(i)] = sdi_list

    sdi_df.index = kar_df.index.unique()

    # Long format: one (GEP, SDI) row per class and sampling round.
    sdi_boxplot_data = pd.DataFrame()
    sdi_boxplot_data['GEP'] = [item for sublist in [[x]*sample_rounds for x in sdi_df.index] for item in sublist]
    sdi_boxplot_data['SDI'] = [item for sublist in [sdi_df.iloc[x].values for x in range(len(sdi_df.index))]
                               for item in sublist]

    # Save data (one file per patient, so iterations do not overwrite each other):
    sdi_boxplot_data.to_csv('/workdir/uvmel_project/data/' + patient + '_Karyotypic_SDI_inferCNV.csv')


    # Plot KDE (blue = 'Castle 1', red = 'Castle 2'):
    sns.set_style("white")
    fig = plt.figure(figsize = (5,5))
    ax = plt.gca()

    sns.kdeplot(sdi_boxplot_data.loc[sdi_boxplot_data.GEP == 'Castle 1']['SDI'].values, shade=True, color="#0000FF")
    sns.kdeplot(sdi_boxplot_data.loc[sdi_boxplot_data.GEP == 'Castle 2']['SDI'].values, shade=True, color="#FF0000")

    ax.set_ylabel("Frequency",fontsize=10)
    ax.set_xlabel("SDI",fontsize=10)
    ax.tick_params(labelsize=10)
    sns.despine()
    plt.ylim((0,10))

    # SAVE FIGURE
    figure_label = 'Karyotypic_SDI_KDE_' + patient
    fn = '/workdir/uvmel_project/figures/Revision_2_Figures/' + figure_label

    d = os.path.dirname(fn)
    if not os.path.exists(d):
        os.makedirs(d)

    fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
    fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
    print(fn)


    # CLASS 1 vs. CLASS 2
    print('Castle 1 vs. Castle 2')

    print(sdi_boxplot_data.loc[sdi_boxplot_data.GEP == 'Castle 1']['SDI'].values)
    print(sdi_boxplot_data.loc[sdi_boxplot_data.GEP == 'Castle 2']['SDI'].values)
    print(stats.mannwhitneyu(sdi_boxplot_data.loc[sdi_boxplot_data.GEP == 'Castle 1']['SDI'].values,
                             sdi_boxplot_data.loc[sdi_boxplot_data.GEP == 'Castle 2']['SDI'].values))


    # Plot barplot (median SDI per GEP class, 95% CI, log y axis):
    fig, ax = plt.subplots(figsize=(2,7))

    ax = sns.barplot(x='GEP', y='SDI', data=sdi_boxplot_data,
                     hue='GEP',
                     edgecolor='black',
                     palette={'Castle 1': 'white', 'Castle 2': 'gray'},
                     dodge=False,
                     ci=95,
                     errwidth=1,
                     capsize=0.1,
                     estimator=np.median)

    ax.set_yscale('log')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_legend().remove()

    plt.xticks(rotation=90)

    # SAVE FIGURE
    figure_label = 'Karyotypic_SDI_Barplot_' + patient + '_median'
    fn = '/workdir/uvmel_project/figures/Revision_2_Figures/' + figure_label

    d = os.path.dirname(fn)
    if not os.path.exists(d):
        os.makedirs(d)

    fig.savefig(fn + '.png', bbox_inches='tight',dpi=400)
    fig.savefig(fn + '.pdf', bbox_inches='tight',dpi=400)
    print(fn)

# Printed once, after every patient has been processed.
print("Script Finished")

In [None]:
# Build the per-cell karyotype table for patient UM03 from the inferCNV HMMi3
# predictions, indexed by GEP 'Assignment'.
# Side effect: METADATA_TUMOR is rebound to its UM03 rows only, and that
# rebinding persists for the cells that follow.

# Integer HMM state -> label used in the karyotype strings.
state_map_HMMi3 = {1: 'loss',
                   2: 'neutral',
                   3: 'gain'}

cnv_mat = pd.DataFrame()

# Predicted CNV regions for UM03.
UM03_HMMi3_df = pd.read_csv('/workdir/uvmel_project/single_sample_results/results_HMMi3_merged_ref/UM03/'+
                            'HMM_CNV_predictions.HMMi3.rand_trees.hmm_mode-subclusters.'+
                            'Pnorm_0.5.pred_cnv_regions.dat',
                             sep='\t')

# Cytoband table (no header row in the file; column names assigned here).
cytoband = pd.read_csv('/workdir/uvmel_project/casper/data/cytoband.txt', sep='\t', header=None)
cytoband.columns = ['chr', 'start', 'end', 'arm', 'col5']

# Mapping of each UM03 cell to its inferCNV cell group.
UM03_HMMi3_cell_groupings = pd.read_csv('/workdir/uvmel_project/single_sample_results/results_HMMi3_merged_ref/'+
                                        'UM03/'+
                                        '17_HMM_predHMMi3.rand_trees.hmm_mode-subclusters.cell_groupings',
                                         sep='\t')

# Fraction of cells in each cell group.
prop_df_3 = pd.DataFrame(UM03_HMMi3_cell_groupings.cell_group_name.value_counts(normalize=True)).reset_index()
prop_df_3.columns = ['cell_group_name', 'proportion']

# Chromosome arm of each region: 'p' if the cytoband containing the region
# start has a 'p' in its arm label, otherwise 'q'.
UM03_HMMi3_df['arm'] = ['p' if 'p' in cytoband[(cytoband.chr == UM03_HMMi3_df.chr[x]) & 
                           (cytoband.start <= UM03_HMMi3_df.start[x]) &
                           (cytoband.end >= UM03_HMMi3_df.start[x])].arm.values[0]
                        else
                        'q'
                        for x in range(len(UM03_HMMi3_df))]

# Region label: chromosome (without 'chr') + arm + ' ' + state name.
UM03_HMMi3_df['CNV'] = [UM03_HMMi3_df.chr[x].replace('chr', '')+UM03_HMMi3_df.arm[x]+ ' ' + 
                          state_map_HMMi3[UM03_HMMi3_df.state[x]] 
                          for x in range(len(UM03_HMMi3_df))]

# Karyotype string per cell group: its region labels joined by commas.
cell_state_map = dict(zip(UM03_HMMi3_df.cell_group_name.unique(),
                          [','.join(UM03_HMMi3_df[UM03_HMMi3_df.cell_group_name==x].CNV.values)
                           for x in UM03_HMMi3_df.cell_group_name.unique()]))

# Drop the first five characters of each cell name and parse the rest as an integer.
cell_groupings_cell_names = [int(x[5:]) for x in UM03_HMMi3_cell_groupings.cell.values]

cnv_mat['Patient'] = ['UM03' for x in range(len(UM03_HMMi3_cell_groupings))]
cnv_mat['State'] = [cell_state_map[x] for x in UM03_HMMi3_cell_groupings.cell_group_name.values]

# Keep only metadata rows whose third index element is 'UM03' (rebinds the global).
METADATA_TUMOR = METADATA_TUMOR[METADATA_TUMOR.index.map(lambda x: x[2] == 'UM03')]

# Index each cell by the 'Patient' level of the metadata row whose fourth
# index element equals the parsed cell number.
cnv_mat.index = [METADATA_TUMOR[METADATA_TUMOR.index.map(lambda x: x[3] == y)].index.get_level_values('Patient').values[0]
                 for y in cell_groupings_cell_names]

#unique_kar_df = pd.DataFrame(cnv_mat.groupby(['Patient']+list(cnv_mat.columns)).size(), columns=['Counts'])
kar_df = cnv_mat.copy()

kar_df.reset_index(inplace=True)
#kar_df['State'] = kar_df[kar_df.columns[1:-1]].apply(
#                                       lambda x: ','.join(x.dropna().astype(str)), axis=1)

#unique_kar_df = unique_kar_df[['Patient', 'State', 'Counts']]

kar_df = kar_df[['Patient', 'State']]
kar_df.set_index('Patient', inplace=True)

# GEP assignment of each cell, looked up the same way as the index above.
kar_df['Assignment'] = [METADATA_TUMOR[METADATA_TUMOR.index.map(lambda x: x[3] == y)].Assignment.values[0]
                        for y in cell_groupings_cell_names]

# Replace the 'Patient' index with the GEP assignment.
kar_df.set_index('Assignment', append=False, inplace=True)

In [None]:
# Count of each karyotype State among rows indexed 'Castle 2'.
kar_df[kar_df.index == 'Castle 2'].State.value_counts()

In [None]:
# Count of each karyotype State among rows indexed 'Castle 1'.
kar_df[kar_df.index == 'Castle 1'].State.value_counts()

In [None]:
# Display cnv_mat for inspection.
cnv_mat

In [None]:
# Number of rows per Assignment value in METADATA_TUMOR.
METADATA_TUMOR.Assignment.value_counts()

In [None]:
# Display the 'Cell ID' level of the METADATA_TUMOR index.
METADATA_TUMOR.index.get_level_values('Cell ID')

In [None]:
# Load the UM03 inferCNV expression matrix and transpose it so that rows are
# the entries of the file's header line.
UM03_expr = pd.read_csv('/workdir/uvmel_project/single_sample_results/results_HMMi6_merged_ref/UM03/'+
                                'expr.infercnv.dat',
                        sep='\t').T

# Drop the first five characters of each row label and parse the rest as an integer.
UM03_expr.index = [int(x[5:]) for x in UM03_expr.index]
UM03_expr = UM03_expr.sort_index()

# Keep rows whose number is a 'Cell ID' in METADATA_TUMOR. isin() does the
# membership test in one pass instead of rebuilding the ID list for every row.
UM03_expr = UM03_expr[UM03_expr.index.isin(METADATA_TUMOR.index.get_level_values('Cell ID'))]

# NOTE(review): positional re-labelling - assumes METADATA_TUMOR rows are in
# ascending 'Cell ID' order and match the retained rows one-to-one; confirm.
UM03_expr.index = METADATA_TUMOR.index

In [None]:
# Load the preliminary UM03 inferCNV expression matrix and transpose it so
# that rows are the entries of the file's header line.
UM03_expr_prelim = pd.read_csv('/workdir/uvmel_project/single_sample_results/results_HMMi6_merged_ref/UM03/'+
                                'expr.infercnv.preliminary.dat',
                        sep='\t').T

# Drop the first five characters of each row label and parse the rest as an integer.
UM03_expr_prelim.index = [int(x[5:]) for x in UM03_expr_prelim.index]
UM03_expr_prelim = UM03_expr_prelim.sort_index()

# Keep rows whose number is a 'Cell ID' in METADATA_TUMOR. isin() does the
# membership test in one pass instead of rebuilding the ID list for every row.
UM03_expr_prelim = UM03_expr_prelim[UM03_expr_prelim.index.isin(
                                    METADATA_TUMOR.index.get_level_values('Cell ID'))]

# NOTE(review): positional re-labelling - assumes METADATA_TUMOR rows are in
# ascending 'Cell ID' order and match the retained rows one-to-one; confirm.
UM03_expr_prelim.index = METADATA_TUMOR.index

In [None]:
# Row-clustered heatmap of UM03_expr with a colour strip showing the GEP
# assignment of every row; saved as PNG and PDF.
QUERY = UM03_expr

# COLUMN INDEX AND HEATMAP DATA
genes = QUERY.columns.tolist()
heatmap_data = QUERY
yticks, xticks = heatmap_data.index, heatmap_data.columns

# LINKAGE (rows only)
method = 'centroid' # average, single
metric = 'euclidean' # cosine
linkage = hc.linkage(heatmap_data, method=method, metric=metric)
row_linkage = deepcopy(linkage)

# REORDER ROWS ACCORDING TO THE DENDROGRAM LEAF ORDER
r1 = hc.leaves_list(row_linkage)
mat = heatmap_data.iloc[r1, :]

# GEP assignment of each reordered row, looked up by 'Cell ID'.
meta_cell_ids = METADATA_TUMOR.index.get_level_values('Cell ID')
GEP_assignments_sorted = [METADATA_TUMOR[meta_cell_ids == cell_id].Assignment.values[0]
                          for cell_id in mat.index.get_level_values('Cell ID').values]

row_colors = pd.Series(GEP_assignments_sorted, index=GEP_assignments_sorted).map(lut_CASTLE)

# FIGURE
fig = plt.figure(figsize=(4,10))
plt.rcParams["axes.grid"] = False

# ROW COLOR STRIP: one rectangle per row, stacked bottom to top in a single column.
ax1 = fig.add_axes([0,0.1,0.05,0.6]) # [x0,y0,width,height]
n_rows = len(row_colors)
for row, c in enumerate(row_colors):
    ax1.add_patch(patches.Rectangle((0, row / n_rows), 1, 1 / n_rows, color=c))
plt.axis('off')

# HEATMAP MATRIX (tick labels suppressed)
axmatrix = fig.add_axes([0.05,0.1,0.9,0.6], frameon=False)
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,
                      vmin=0.890275611964209,
                      vmax=1.11449552904841)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([])
axmatrix.set_xticklabels([])
plt.tick_params(size=0)

# COLORBAR
axcolor = fig.add_axes([1.1,0.1,0.1,0.1])
cbar = plt.colorbar(im, cax=axcolor)

# SAVE FIGURE
figure_label = 'UM03_inferCNV_expr_centroid_euclidean'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/' + figure_label

d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

for ext in ('.png', '.pdf'):
    fig.savefig(fn + ext, bbox_inches='tight', dpi=400)
print(fn)

In [None]:
# Row-clustered heatmap of UM03_expr_prelim with a colour strip showing the
# GEP assignment of every row; saved as PNG and PDF.
QUERY = UM03_expr_prelim

# COLUMN INDEX AND HEATMAP DATA
genes = QUERY.columns.tolist()
heatmap_data = QUERY
yticks, xticks = heatmap_data.index, heatmap_data.columns

# LINKAGE (rows only)
method = 'centroid' # average, single
metric = 'euclidean' # cosine
linkage = hc.linkage(heatmap_data, method=method, metric=metric)
row_linkage = deepcopy(linkage)

# REORDER ROWS ACCORDING TO THE DENDROGRAM LEAF ORDER
r1 = hc.leaves_list(row_linkage)
mat = heatmap_data.iloc[r1, :]

# GEP assignment of each reordered row, looked up by 'Cell ID'.
meta_cell_ids = METADATA_TUMOR.index.get_level_values('Cell ID')
GEP_assignments_sorted = [METADATA_TUMOR[meta_cell_ids == cell_id].Assignment.values[0]
                          for cell_id in mat.index.get_level_values('Cell ID').values]

row_colors = pd.Series(GEP_assignments_sorted, index=GEP_assignments_sorted).map(lut_CASTLE)

# FIGURE
fig = plt.figure(figsize=(4,10))
plt.rcParams["axes.grid"] = False

# ROW COLOR STRIP: one rectangle per row, stacked bottom to top in a single column.
ax1 = fig.add_axes([0,0.1,0.05,0.6]) # [x0,y0,width,height]
n_rows = len(row_colors)
for row, c in enumerate(row_colors):
    ax1.add_patch(patches.Rectangle((0, row / n_rows), 1, 1 / n_rows, color=c))
plt.axis('off')

# HEATMAP MATRIX (tick labels suppressed)
axmatrix = fig.add_axes([0.05,0.1,0.9,0.6], frameon=False)
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,
                      vmin=0.885504470951586,
                      vmax=1.11449552904841)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([])
axmatrix.set_xticklabels([])
plt.tick_params(size=0)

# COLORBAR
axcolor = fig.add_axes([1.1,0.1,0.1,0.1])
cbar = plt.colorbar(im, cax=axcolor)

# SAVE FIGURE
figure_label = 'UM03_inferCNV_expr_PRELIM_centroid_euclidean'
fn = '/workdir/uvmel_project/figures/Revision_2_Figures/' + figure_label

d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)

for ext in ('.png', '.pdf'):
    fig.savefig(fn + ext, bbox_inches='tight', dpi=400)
print(fn)

## RANK DEG PER ARCHETYPE

### VISUALIZE ARCHETYPES

In [None]:
# Hex palette for the archetype soft clusters; entry 0 is the colour for
# cells that are not assigned.
FLATUI_ARCHETYPES = [
    'E6E6FA',  # not assigned
    'FF66FF',  # pink
    'D1C756',  # khaki/olive - NOTE(review): was labelled purple; confirm intended colour
    '9ACD32',  # green
    '00FFFF',  # sky blue
    'FF8800',  # orange
    '0000FF',  # dark blue
    'FFD700',  # yellow
    '20B2AA',  # teal
]

# CONVERT HEX TO RGB IN [0, 1] AND BUILD A COLORMAP WITH ONE LEVEL PER ENTRY
colors = np.array([tuple(hex(hexcolor).rgb) for hexcolor in FLATUI_ARCHETYPES], dtype=float)
colors = np.divide(colors, 255)
CM_ARCHETYPES = LinearSegmentedColormap.from_list('FLATUI_ARCHETYPES', colors, N=len(colors))

In [None]:
# Hex palette for the nearest-archetype labels.
FLATUI_NEAREST_ARCHETYPES = [
    'FF66FF',  # pink
    '6600CC',  # purple
    '9ACD32',  # green
    '00FFFF',  # sky blue
    'FF8800',  # orange
    '0000FF',  # dark blue
    'FFD700',  # yellow
    '20B2AA',  # teal
]

# CONVERT HEX TO RGB IN [0, 1] AND BUILD A COLORMAP WITH ONE LEVEL PER ENTRY
colors = np.array([tuple(hex(hexcolor).rgb) for hexcolor in FLATUI_NEAREST_ARCHETYPES], dtype=float)
colors = np.divide(colors, 255)
CM_NEAREST_ARCHETYPES = LinearSegmentedColormap.from_list('FLATUI_NEAREST_ARCHETYPES', colors, N=len(colors))

In [None]:
# Visualize the archetypes on the 2-D layout (two panels):
#   left  - all cells in one light colour, overlaid with the cell nearest to
#           each archetype (marker size scales with its soft-cluster size)
#   right - cells coloured by 'Archetype_Soft_Cluster'
# Note: Any rows assigned -1 were identified as outliers and should not be considered as a member of any community.
plt.figure(figsize = (20,10))
nrow = 1
ncol = 2
dot_size = 2
gs1 = gridspec.GridSpec(nrow, ncol)
gs1.update(wspace=0.05, hspace=0.1) # set the spacing between axes.

cm = plt.cm.bwr
dimtype = 'ForceDirected'
meta = 'Phenograph_Class'

# Pick the notebook-level frames for the current subset_type by name
# (direct namespace lookup instead of exec on a formatted string).
DIM = globals()['DIMENSIONS_{}'.format(subset_type)]
QUERY = globals()['INDF_{}'.format(subset_type)]
META = globals()['METADATA_{}'.format(subset_type)]
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]

# Symmetric axis limits, rounded up to the next multiple of 10.
axis_min = -ceil(ceil(max(np.abs(x).max(),np.abs(y).max()))/10)*10
axis_max = ceil(ceil(max(np.abs(x).max(),np.abs(y).max()))/10)*10

# MONOSOMY 3 SIGNATURE
# phase/mask/color_values are computed here but the scatter below uses a
# single constant colour, so they do not affect this figure.
ax = plt.subplot(gs1[0])
gene_sig = 'Monosomy 3 Up'
signature_genes = genesets[gene_sig].values
signature_genes = [x for x in signature_genes if str(x) != 'nan']
detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
phase = zscore(np.nanmedian(QUERY[detected_genes],axis=1))
mask = np.isnan(phase)

# Randomize the plotting order of the cells
color_values = np.array(phase)
i = np.random.permutation(len(color_values))
xx = np.array(x)[i]
yy = np.array(y)[i]
color_values = color_values[i]

plt.scatter(xx, yy, c = '#E6E6FA', cmap = CM_DIVERGING, s=dot_size, alpha =0.4)
plt.clim(-1.5,1.5)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
plt.title('Monosomy 3', fontname='Helvetica', size=12, weight='normal')

# Cell Nearest to Each Archetype
# [1:] skips the smallest label - presumably the "not assigned" value; confirm.
for ii in np.unique(META['Archetype_Nearest_Cell'])[1:]:
    c=META['Archetype_Nearest_Cell']==ii
    cc=META['Archetype_Soft_Cluster']==ii
    # Marker area is half the number of cells in soft cluster ii.
    plt.scatter(x[c.values],y[c.values],s= cc.sum()*0.5, color = '#'+FLATUI_ARCHETYPES[ii])
    sns.despine()
    plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
    plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
    plt.title('Archetypes', fontname='Helvetica', size=12, weight='normal')
    plt.xlim(axis_min,axis_max)
    plt.ylim(axis_min,axis_max)
    plt.axis('off')

# Soft Cluster Around Each Archetype Defined by Multi-Scale Diffusion Distance
ax = plt.subplot(gs1[1])
seqc.plot.scatter.categorical(x, y, c=META['Archetype_Soft_Cluster'],
                              cmap=CM_ARCHETYPES,legend_kwargs={'ncol': 1}, s=dot_size, ax=ax);
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
# NOTE(review): this panel shows archetype soft clusters, yet is titled 'Sample ID' - confirm.
plt.title('Sample ID', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')

# SAVE FIGURE
# Distinct label for the soft-cluster figure: the '_NEARESTArchetypes_...'
# label is also used by the nearest-archetype figure, which would overwrite
# this one.
figure_label = '_SOFTCLUSTERArchetypes_Grey_INDF_{}_{}_{}'.format(subset_type,meta,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
plt.savefig(fn + '.pdf', bbox_inches='tight',dpi=fig_dpi)
print(fn)

In [None]:
# Visuaulize distance to each archetype
# PLOT CATEGORICAL CLUSTER ASSIGNMENTS
# Note: Any rows assigned -1 were identified as outliers and should not be considered as a member of any community.
plt.figure(figsize = (20,10))
nrow = 1
ncol = 2
dot_size = 2
gs1 = gridspec.GridSpec(nrow, ncol)
gs1.update(wspace=0.05, hspace=0.1) # set the spacing between axes. 

cm = plt.cm.bwr 
dimtype = 'ForceDirected' 
meta = 'Phenograph_Class'
exec('DIM = DIMENSIONS_{}'.format(subset_type))
exec('QUERY= INDF_{}'.format(subset_type))
exec('META = METADATA_{}'.format(subset_type))
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]
axis_min = -ceil(ceil(max(np.abs(x).max(),np.abs(y).max()))/10)*10
axis_max = ceil(ceil(max(np.abs(x).max(),np.abs(y).max()))/10)*10

# MONOSOMY 3 SIGNATURE
ax = plt.subplot(gs1[0])
gene_sig = 'Monosomy 3 Up'
signature_genes = genesets[gene_sig].values
signature_genes = [x for x in signature_genes if str(x) != 'nan']
detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
phase = zscore(np.nanmedian(QUERY[detected_genes],axis=1))
# Plot scatter map with phase score
mask = np.isnan(phase)
    
# Randomize/Sort Color Values before Plotting
color_values = np.array(phase)
i = np.random.permutation(len(color_values))
xx = np.array(x)[i]
yy = np.array(y)[i]
color_values = color_values[i]

plt.scatter(xx, yy, c = '#E6E6FA', cmap = CM_DIVERGING, s=dot_size, alpha =0.4)
plt.clim(-1.5,1.5)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
plt.title('Monosomy 3', fontname='Helvetica', size=12, weight='normal')

# Cell Nearest to Each Archetype
for ii in np.unique(META['Archetype_Nearest_Cell'])[1:]:
    c=META['Archetype_Nearest_Cell']==ii
    cc=META['Archetype_Soft_Cluster']==ii
    plt.scatter(x[c.values],y[c.values],s= cc.sum()*0.5, color = '#'+FLATUI_ARCHETYPES[ii])
    #offset = 1000
    #plt.text(x[c]+offset,y[c]+offset,s = '{}'.format(ii), fontsize = 20)
    sns.despine()
    plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
    plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
    plt.title('Archetypes', fontname='Helvetica', size=12, weight='normal')
    plt.xlim(axis_min,axis_max)
    plt.ylim(axis_min,axis_max)
    plt.axis('off')

# Nearest Archetype Defined by Multi-Scale Diffusion Distance
# Right-hand panel: same x/y coordinates, coloured by the categorical
# META['Nearest_Archetype'] column.
ax = plt.subplot(gs1[1])
seqc.plot.scatter.categorical(
    x, y,
    c=META['Nearest_Archetype'],
    cmap=CM_NEAREST_ARCHETYPES,
    legend_kwargs={'ncol': 1},
    s=dot_size,
    ax=ax,
)
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
# Title used to read 'Sample ID', which does not describe the plotted column
plt.title('Nearest Archetype', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min, axis_max)
plt.ylim(axis_min, axis_max)
plt.axis('off')
#ax.legend_.remove()

# SAVE FIGURE
# Output stem = figure directory/prefix + data file name without ".h5" + label;
# the same stem is written once as PNG and once as PDF.
figure_label = '_NEARESTArchetypes_Grey_INDF_{}_{}_{}'.format(subset_type, meta, dimtype)
fn = ''.join([FIG_output_stem, FN.replace(".h5", ""), figure_label])
for extension in ('.png', '.pdf'):
    plt.savefig(fn + extension, bbox_inches='tight', dpi=fig_dpi)
print(fn)

In [None]:
# Visualize distance to each archetype colored by GEP1 Probability
# PLOT CATEGORICAL CLUSTER ASSIGNMENTS
# Note: Any rows assigned -1 were identified as outliers and should not be considered as a member of any community.
# NOTE(review): nothing in this cell reads a GEP1 probability; the header above
# looks carried over from another cell -- confirm and reword.
plt.figure(figsize=(20, 10))
nrow = 1
ncol = 2
dot_size = 2
gs1 = gridspec.GridSpec(nrow, ncol)
gs1.update(wspace=0.05, hspace=0.1)  # set the spacing between axes.

cm = plt.cm.bwr
dimtype = 'ForceDirected'
meta = 'Phenograph_Class'

# Fetch the per-subset tables by name from the notebook namespace. This is a
# plain lookup, replacing exec() of generated assignment statements.
DIM = globals()['DIMENSIONS_{}'.format(subset_type)]
QUERY = globals()['INDF_{}'.format(subset_type)]
META = globals()['METADATA_{}'.format(subset_type)]
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]

# Symmetric, square axis limits: the largest absolute coordinate rounded up to
# the next multiple of 10 (computed once and mirrored).
axis_max = ceil(ceil(max(np.abs(x).max(), np.abs(y).max())) / 10) * 10
axis_min = -axis_max

# MONOSOMY 3 SIGNATURE
ax = plt.subplot(gs1[0])

# Signature score: z-score of the per-row nanmedian taken over the signature
# genes that are present in QUERY.columns.
# NOTE(review): `phase`, `mask` and `color_values` are not consumed by the
# scatter below (it draws every point in one fixed colour). They are kept
# because they are notebook-level names other cells may read -- confirm
# before removing.
gene_sig = 'Monosomy 3 Up'
signature_genes = [gene for gene in genesets[gene_sig].values if str(gene) != 'nan']
detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
phase = zscore(np.nanmedian(QUERY[detected_genes], axis=1))
mask = np.isnan(phase)

# Shuffle the order in which the points are drawn
color_values = np.array(phase)
i = np.random.permutation(len(color_values))
xx = np.array(x)[i]
yy = np.array(y)[i]
color_values = color_values[i]

# Background layer: all points in a single light colour. The former
# `cmap=CM_DIVERGING` / `plt.clim(-1.5, 1.5)` were dropped because matplotlib
# only applies a colormap when `c` holds numeric data, so they had no effect.
plt.scatter(xx, yy, c='#E6E6FA', s=dot_size, alpha=0.4)

# Cell Nearest to Each Archetype
# `[1:]` skips the smallest unique label -- presumably the placeholder for
# rows that are not an archetype's nearest cell; TODO confirm.
for ii in np.unique(META['Archetype_Nearest_Cell'])[1:]:
    c = META['Archetype_Nearest_Cell'] == ii
    cc = META['Archetype_Soft_Cluster'] == ii
    # Marker area is half the number of rows soft-assigned to this archetype
    plt.scatter(x[c.values], y[c.values], s=cc.sum() * 0.5, color='#' + FLATUI_ARCHETYPES[ii])
    #offset = 1000
    #plt.text(x[c]+offset,y[c]+offset,s = '{}'.format(ii), fontsize = 20)

# Axis decoration is loop-invariant, so it is applied once after all layers
# are drawn. The earlier 'Monosomy 3' title was always replaced by
# 'Archetypes' inside the loop and has been removed.
# The axis labels are hidden by plt.axis('off').
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.title('Archetypes', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min, axis_max)
plt.ylim(axis_min, axis_max)
plt.axis('off')

# Right-hand panel: same x/y coordinates, coloured by the categorical
# METADATA_TUMOR.Assignment labels using the CM_CASTLE colour map.
# NOTE(review): the colours are read from METADATA_TUMOR directly rather than
# from META, so they presumably only line up with x/y when subset_type is
# 'TUMOR' -- confirm.
ax = plt.subplot(gs1[1])
seqc.plot.scatter.categorical(
    x, y,
    c=METADATA_TUMOR.Assignment,
    cmap=CM_CASTLE,
    legend=False,
    s=dot_size,
    ax=ax,
)
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
# Title used to read 'Sample ID', which does not describe the plotted column
plt.title('Assignment', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min, axis_max)
plt.ylim(axis_min, axis_max)
plt.axis('off')
#ax.legend_.remove()

# SAVE FIGURE
# The label names the right-hand panel (Assignment) so this figure is not
# written to the same path as the other archetype figures that use the plain
# '_NEARESTArchetypes_Grey_INDF_...' label with the same subset/meta/dimtype.
figure_label = '_NEARESTArchetypes_Assignment_Grey_INDF_{}_{}_{}'.format(subset_type, meta, dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
# Same stem written once as PNG and once as PDF
for extension in ('.png', '.pdf'):
    plt.savefig(fn + extension, bbox_inches='tight', dpi=fig_dpi)
print(fn)

In [None]:
# Visualize distance to each archetype colored by GEP1 Probability
# PLOT CATEGORICAL CLUSTER ASSIGNMENTS
# Note: Any rows assigned -1 were identified as outliers and should not be considered as a member of any community.
# NOTE(review): nothing in this cell reads a GEP1 probability; the header above
# looks carried over from another cell -- confirm and reword.
plt.figure(figsize=(20, 10))
nrow = 1
ncol = 2
dot_size = 2
gs1 = gridspec.GridSpec(nrow, ncol)
gs1.update(wspace=0.05, hspace=0.1)  # set the spacing between axes.

cm = plt.cm.bwr
dimtype = 'ForceDirected'
meta = 'Phenograph_Class'

# Fetch the per-subset tables by name from the notebook namespace. This is a
# plain lookup, replacing exec() of generated assignment statements.
DIM = globals()['DIMENSIONS_{}'.format(subset_type)]
QUERY = globals()['INDF_{}'.format(subset_type)]
META = globals()['METADATA_{}'.format(subset_type)]
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]

# Symmetric, square axis limits: the largest absolute coordinate rounded up to
# the next multiple of 10 (computed once and mirrored).
axis_max = ceil(ceil(max(np.abs(x).max(), np.abs(y).max())) / 10) * 10
axis_min = -axis_max

# MONOSOMY 3 SIGNATURE
ax = plt.subplot(gs1[0])

# Signature score: z-score of the per-row nanmedian taken over the signature
# genes that are present in QUERY.columns.
# NOTE(review): `phase`, `mask` and `color_values` are not consumed by the
# scatter below (it draws every point in one fixed colour). They are kept
# because they are notebook-level names other cells may read -- confirm
# before removing.
gene_sig = 'Monosomy 3 Up'
signature_genes = [gene for gene in genesets[gene_sig].values if str(gene) != 'nan']
detected_genes = list(set(signature_genes).intersection(set(QUERY.columns)))
phase = zscore(np.nanmedian(QUERY[detected_genes], axis=1))
mask = np.isnan(phase)

# Shuffle the order in which the points are drawn
color_values = np.array(phase)
i = np.random.permutation(len(color_values))
xx = np.array(x)[i]
yy = np.array(y)[i]
color_values = color_values[i]

# Background layer: all points in a single light colour. The former
# `cmap=CM_DIVERGING` / `plt.clim(-1.5, 1.5)` were dropped because matplotlib
# only applies a colormap when `c` holds numeric data, so they had no effect.
plt.scatter(xx, yy, c='#E6E6FA', s=dot_size, alpha=0.4)

# Cell Nearest to Each Archetype
# `[1:]` skips the smallest unique label -- presumably the placeholder for
# rows that are not an archetype's nearest cell; TODO confirm.
for ii in np.unique(META['Archetype_Nearest_Cell'])[1:]:
    c = META['Archetype_Nearest_Cell'] == ii
    cc = META['Archetype_Soft_Cluster'] == ii
    # Marker area is half the number of rows soft-assigned to this archetype
    plt.scatter(x[c.values], y[c.values], s=cc.sum() * 0.5, color='#' + FLATUI_ARCHETYPES[ii])
    #offset = 1000
    #plt.text(x[c]+offset,y[c]+offset,s = '{}'.format(ii), fontsize = 20)

# Axis decoration is loop-invariant, so it is applied once after all layers
# are drawn. The earlier 'Monosomy 3' title was always replaced by
# 'Archetypes' inside the loop and has been removed.
# The axis labels are hidden by plt.axis('off').
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.title('Archetypes', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min, axis_max)
plt.ylim(axis_min, axis_max)
plt.axis('off')

# Right-hand panel: same x/y coordinates, coloured by the categorical
# METADATA_TUMOR.TCGA_Assignment labels using the CM_TCGA colour map.
# NOTE(review): the colours are read from METADATA_TUMOR directly rather than
# from META, so they presumably only line up with x/y when subset_type is
# 'TUMOR' -- confirm.
ax = plt.subplot(gs1[1])
seqc.plot.scatter.categorical(
    x, y,
    c=METADATA_TUMOR.TCGA_Assignment,
    cmap=CM_TCGA,
    legend=False,
    s=dot_size,
    ax=ax,
)
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
# Title used to read 'Sample ID', which does not describe the plotted column
plt.title('TCGA Assignment', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min, axis_max)
plt.ylim(axis_min, axis_max)
plt.axis('off')
#ax.legend_.remove()

# SAVE FIGURE
# The label names the right-hand panel (TCGA assignment) so this figure is not
# written to the same path as the other archetype figures that use the plain
# '_NEARESTArchetypes_Grey_INDF_...' label with the same subset/meta/dimtype.
figure_label = '_NEARESTArchetypes_TCGA_Grey_INDF_{}_{}_{}'.format(subset_type, meta, dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
# Same stem written once as PNG and once as PDF
for extension in ('.png', '.pdf'):
    plt.savefig(fn + extension, bbox_inches='tight', dpi=fig_dpi)
print(fn)