In [14]:
import numpy as np
import pandas as pd
import json
from jordan_plus_genes.model.Sample_MIL import InstanceModels, RaggedModels
from jordan_plus_genes.model.KerasLayers import Losses, Metrics
import tensorflow as tf
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from jordan.model import DatasetsUtils
import pickle
import logomaker
import matplotlib.pyplot as plt
from scipy.stats import fisher_exact

#physical_devices = tf.config.experimental.list_physical_devices('GPU')
#tf.config.experimental.set_memory_growth(physical_devices[-4], True)
#tf.config.experimental.set_visible_devices(physical_devices[-4], 'GPU')

# Load the raw data files
# NOTE(review): pickle.load can execute arbitrary code embedded in the file; only load trusted pickles.
# The context manager closes the handle that the bare `pickle.load(open(...))` left open.
with open('/home/janaya2/Desktop/ATGC_paper/figures/tumor_classification/data/data.pkl', 'rb') as data_file:
    D, tcga_maf, samples = pickle.load(data_file)

cancerhotspots_df = pd.read_csv("/home/sahn33/Documents/cancerhotspots.v2.maf",sep="\t", low_memory=True) #usecols=["Chromosome","Start_Position", "End_Position", "Reference_Allele","Tumor_Seq_Allele2"],

# Parse the VCF by hand: everything before the "#CHROM" line is skipped, the "#CHROM" line
# supplies the column names, and every following line is one tab-separated record.
# All values are kept as strings (no dtype conversion is done here).
with open("publication_hotspots.vcf", "r") as f:
    lines = f.readlines()
chrom_index = [i for i, line in enumerate(lines) if line.strip().startswith("#CHROM")]
if not chrom_index:
    # Fail with a readable message instead of an IndexError on chrom_index[0].
    raise ValueError("publication_hotspots.vcf has no '#CHROM' header line")
data = lines[chrom_index[0]:]
header = data[0].strip().split("\t")
informations = [d.strip().split("\t") for d in data[1:]]
publication_hotspots_df = pd.DataFrame(informations, columns=header)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [15]:
# Create a dataframe that identifies the instances that are hotspots.
# The result, hotspot_df, has one row per tcga_maf row (same order); its "my_index" column is
# non-null exactly where the variant matched a record of publication_hotspots_df.
variant_key_cols = ["Chromosome","Start_Position", "End_Position", "Reference_Allele","Tumor_Seq_Allele2"]

# keep=False removes *every* row whose key occurs more than once, so the key is unique afterwards
# and the right-merges below cannot multiply tcga_maf rows.
# .copy() gives an independent frame so the column assignment does not hit a slice
# (avoids SettingWithCopyWarning) and the cell no longer mutates its input in place.
cancerhotspots_df = cancerhotspots_df.drop_duplicates(subset=variant_key_cols, keep=False)[variant_key_cols].copy()
cancerhotspots_df["id"] = cancerhotspots_df.index

# how='right' keeps every tcga_maf row, in tcga_maf order; "id" is NaN where there is no match.
hotspot_df_ = pd.merge(cancerhotspots_df, tcga_maf, how='right', on=variant_key_cols, suffixes=('_duplicate',''))

publication_hotspots_df = publication_hotspots_df.drop_duplicates(subset=["#CHROM","POS","REF","ALT"], keep=False).copy()
publication_hotspots_df["my_index"] = publication_hotspots_df.index

# Work on a real copy of the columns needed for the second merge.
hotspot_df_copy = hotspot_df_[["Chromosome","Start_Position","Ref","Alt","id"]].copy()
# Only the first character of Ref/Alt is compared against the VCF REF/ALT columns.
hotspot_df_copy['Alt'] = hotspot_df_copy['Alt'].str[:1]
hotspot_df_copy['Ref'] = hotspot_df_copy['Ref'].str[:1]
# The hand-parsed VCF holds POS as text, so the position must be text on this side too.
hotspot_df_copy['Start_Position'] = hotspot_df_copy['Start_Position'].astype(str)
hotspot_df_merge_cols = ["Chromosome","Start_Position","Ref","Alt"]
publication_hotspots_df_merge_cols = ["#CHROM","POS","REF","ALT"]

hotspot_df = pd.merge(publication_hotspots_df, hotspot_df_copy, how='right', right_on = hotspot_df_merge_cols, left_on = publication_hotspots_df_merge_cols, suffixes=('_duplicate',''))

# Later cells index hotspot_df["my_index"] with row positions taken from D['sample_idx'],
# which only works if hotspot_df stays row-aligned with tcga_maf.
assert len(hotspot_df) == len(tcga_maf), "hotspot_df must have exactly one row per tcga_maf row"


In [17]:
# Flag InDel instances: True where Variant_Classification is one of the four
# frame-shift / in-frame insertion or deletion classes, False everywhere else.

tcga_maf['indel'] = tcga_maf['Variant_Classification'].isin(
    ['Frame_Shift_Del', 'Frame_Shift_Ins', 'In_Frame_Del', 'In_Frame_Ins']
).to_numpy()

In [18]:
# Re-assign labels from TCGA to custom NCIT labels and encode the data

tcga_maf['Hugo_Symbol'] = tcga_maf['Hugo_Symbol'].astype('category')
# READ samples are folded into COAD.
samples['type'] = samples['type'].apply(lambda x: 'COAD' if x == 'READ' else x)

###
# filtering the NCI-T labels (https://livejohnshopkins-my.sharepoint.com/:x:/r/personal/abaras1_jh_edu/_layouts/15/doc2.aspx?sourcedoc=%7B5f92f0fc-ec6c-40d5-ab17-0d3345f9f2c2%7D&action=edit&activeCell=%27Sheet1%27!B21&wdinitialsession=e072a38f-57c8-4c1f-885b-efaefcc81d35&wdrldsc=2&wdrldc=1&wdrldr=AccessTokenExpiredWarning%2CRefreshingExpiredAccessT)
# NOTE: 'Colorectal Adenocarcinoma' appears twice in this list; harmless for isin().
ncit_labels_kept = ['Muscle-Invasive Bladder Carcinoma','Infiltrating Ductal Breast Carcinoma',
                    'Invasive Lobular Breast Carcinoma','Cervical Squamous Cell Carcinoma',
                    'Colorectal Adenocarcinoma','Glioblastoma','Head and Neck Squamous Cell Carcinoma',
                    'Clear Cell Renal Cell Carcinoma','Papillary Renal Cell Carcinoma','Astrocytoma',
                    'Oligoastrocytoma','Oligodendroglioma','Hepatocellular Carcinoma','Lung Adenocarcinoma',
                    'Lung Squamous Cell Carcinoma','Ovarian Serous Adenocarcinoma','Adenocarcinoma, Pancreas',
                    'Paraganglioma','Pheochromocytoma','Prostate Acinar Adenocarcinoma','Colorectal Adenocarcinoma',
                    'Desmoid-Type Fibromatosis','Leiomyosarcoma','Liposarcoma','Malignant Peripheral Nerve Sheath Tumor',
                    'Myxofibrosarcoma','Synovial Sarcoma','Undifferentiated Pleomorphic Sarcoma',
                    'Cutaneous Melanoma','Gastric Adenocarcinoma','Testicular Non-Seminomatous Germ Cell Tumor',
                    'Testicular Seminoma','Thyroid Gland Follicular Carcinoma','Thyroid Gland Papillary Carcinoma',
                    'Endometrial Endometrioid Adenocarcinoma','Endometrial Serous Adenocarcinoma']
idx_filter = samples['NCI-T Label'].isin(ncit_labels_kept)
# .copy() makes ncit_samples an independent frame; without it the .loc assignments below
# write into a slice of `samples` and pandas emits SettingWithCopyWarning.
ncit_samples = samples.loc[idx_filter].copy()
PCPG_ncit = ['Paraganglioma','Pheochromocytoma']
SARC_ncit = ['Desmoid-Type Fibromatosis','Leiomyosarcoma','Liposarcoma','Malignant Peripheral Nerve Sheath Tumor',
             'Myxofibrosarcoma','Synovial Sarcoma','Undifferentiated Pleomorphic Sarcoma']
TGCT_ncit = ['Testicular Non-Seminomatous Germ Cell Tumor','Testicular Seminoma']
# Collapse the fine-grained NCI-T labels of these three groups into one label per group.
ncit_samples.loc[ncit_samples['NCI-T Label'].isin(PCPG_ncit), 'NCI-T Label'] = 'PCPG'
ncit_samples.loc[ncit_samples['NCI-T Label'].isin(SARC_ncit), 'NCI-T Label'] = 'SARC'
ncit_samples.loc[ncit_samples['NCI-T Label'].isin(TGCT_ncit), 'NCI-T Label'] = 'TGCT'

#samples = ncit_samples
A = ncit_samples['NCI-T Label'].astype('category')
###

# Gene codes are shifted by +1, so every real gene has a code >= 1
# (get_top_gene_labels undoes this shift when mapping codes back to symbols).
D['genes'] = np.concatenate(tcga_maf[['Hugo_Symbol']].apply(lambda x: x.cat.codes).values + 1)
input_dim = max(D['genes'])
dropout = .5

# One np.where(...) tuple per kept sample, holding the positions of that sample's instances in D.
indexes = [np.where(D['sample_idx'] == idx) for idx in np.arange(samples.shape[0])[idx_filter]]
#indexes = [np.where(D['sample_idx'] == idx) for idx in np.arange(samples.shape[0])]  #<-- uncomment for non NCIT labels
# Ragged per-sample arrays (dtype=object because samples have different instance counts).
genes = np.array([D['genes'][i] for i in indexes], dtype='object')
# An instance is a hotspot when its row matched a publication hotspot, i.e. "my_index" is not NaN.
hotspots = np.array([~pd.isna(hotspot_df["my_index"]).values[i] for i in indexes], dtype='object')
indels = np.array([tcga_maf['indel'].values[i] for i in indexes], dtype='object')

index_loader = DatasetsUtils.Map.FromNumpytoIndices([j for i in indexes for j in i], dropout=dropout)
genes_loader = DatasetsUtils.Map.FromNumpyandIndices(genes, tf.int16)
genes_loader_eval = DatasetsUtils.Map.FromNumpy(genes, tf.int16, dropout=0)


#A = samples['type'].astype('category') #<-- uncomment for non NCIT labels
classes = A.cat.categories.values
##integer values for random forest
classes_onehot = np.eye(len(classes))[A.cat.codes]
y_label = classes_onehot

# Per-sample weights proportional to 1 / (size of the sample's class), normalised to sum to 1.
y_strat = np.argmax(y_label, axis=-1)
class_counts = dict(zip(*np.unique(y_strat, return_counts=True)))
y_weights = np.array([1 / class_counts[_] for _ in y_strat])
y_weights /= np.sum(y_weights)

y_label_loader = DatasetsUtils.Map.FromNumpy(y_label, tf.float32)
y_weights_loader = DatasetsUtils.Map.FromNumpy(y_weights, tf.float32)

predictions = []
test_idx = []
weights = []

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [19]:
# Load the models' results.

# NOTE(review): pickle.load can execute arbitrary code embedded in the file; only load trusted pickles.
# The context manager closes the handle that the bare `pickle.load(open(...))` left open.
with open('/home/mlee276/Desktop/TCGA-ML-main/results/mil_gene_weights.pkl', 'rb') as weights_file:
    test_idx, weights = pickle.load(weights_file)

gene_encoder = InstanceModels.GeneEmbed(shape=(), input_dim=input_dim, dim=256)
mil = RaggedModels.MIL(instance_encoders=[gene_encoder.model], sample_encoders=[], output_dims=[y_label.shape[-1]], output_types=['other'], mil_hidden=[256], attention_layers=[64, 16], dropout=.5, instance_dropout=.5, regularization=0, input_dropout=dropout)

attention_weights = []
embedding_weights = []
idx_test_all = []
# For every fold: restore that fold's weights, run the attention model on the fold's test samples
# as one batch, and collect the per-instance attention weights in fold order.
for idx_test, fold_weights in zip(test_idx, weights):
    mil.model.set_weights(fold_weights)
    ds_test = tf.data.Dataset.from_tensor_slices(((
                                                       genes_loader_eval(idx_test),
                                                   ),
    ))
    ds_test = ds_test.batch(len(idx_test), drop_remainder=False)
    attention_weights.extend(mil.attention_model.predict(ds_test).to_list())
    idx_test_all.extend(idx_test)
    embedding_weights.append(mil.model.get_weights())

### can only call this once or else the order gets mixed.
# attention_weights is ordered by fold (idx_test_all), not by original sample order, so the
# per-sample arrays are re-indexed into that same order under new *_reordered names.
hotspots_reordered = hotspots[idx_test_all]
indels_reordered = indels[idx_test_all]
genes_reordered = genes[idx_test_all]

In [20]:
# Get MIL predictions
# NOTE(review): pickle.load can execute arbitrary code embedded in the file; only load trusted pickles.
# The context manager closes the handle that the bare `pickle.load(open(...))` left open.
with open('/home/mlee276/Desktop/TCGA-ML-main/results/mil_gene_predictions.pkl', 'rb') as predictions_file:
    test_idx, mil_predictions = pickle.load(predictions_file)
# Predicted class = index of the largest score on the last axis.
mil_predictions_labels = np.argmax(mil_predictions, axis=-1)
# Get true labels
y_strat = np.argmax(y_label, axis=-1)
# True labels re-ordered fold by fold.
# NOTE(review): presumably mil_predictions is stored in this same concatenated-fold order — confirm.
correct = (y_strat[np.concatenate(test_idx)])

### can only call this once or else the order gets mixed. 
#hotspots_reordered = hotspots[idx_test_all] 
#indels_reordered = indels[idx_test_all]
#genes_reordered = genes[idx_test_all]

# Returns the attention weights, hotspot attention weights, and non-hotspot attention weights associated with a specific cancer type.
def get_position_weight_hotspots(class_num, sequence=None):
    """Split attention weights of correctly predicted samples of one class by hotspot status.

    Reads the notebook globals `correct`, `mil_predictions_labels`, `attention_weights`
    and `hotspots_reordered`, which must all be in the same (fold) order.

    Args:
        class_num: Integer class code; only samples whose true and predicted label both
            equal this code are used.
        sequence: Unused. Kept (now optional) so existing two-argument calls still work.

    Returns:
        Tuple of three lists with one 1-D array per selected sample:
        (all attention weights, weights of hotspot instances, weights of non-hotspot instances).
    """
    # Samples of this class that the model classified correctly.
    correct_predictions_idx = [
        i for i in range(len(correct))
        if correct[i] == mil_predictions_labels[i] and correct[i] == class_num
    ]

    all_sample_attention_weights = []
    hotspot_attention_weights = []
    non_hotspot_attention_weights = []
    for idx in correct_predictions_idx:
        # attention_weights[idx] is a list of one-element lists; flatten it to 1-D.
        sample_attention_weights = np.concatenate(attention_weights[idx])
        # Force a real boolean mask: `~` on an object-dtype array of Python bools would
        # produce integers (-1 / -2) instead of the inverted mask.
        is_hotspot = np.asarray(hotspots_reordered[idx], dtype=bool)

        hotspot_attention_weights.append(sample_attention_weights[is_hotspot])
        non_hotspot_attention_weights.append(sample_attention_weights[~is_hotspot])
        all_sample_attention_weights.append(sample_attention_weights)

    return all_sample_attention_weights, hotspot_attention_weights, non_hotspot_attention_weights

# Returns the attention weights, indel attention weights, and non-indel attention weights associated with a specific cancer type.
def get_position_weight_indels(class_num, sequence=None):
    """Split attention weights of correctly predicted samples of one class by InDel status.

    Reads the notebook globals `correct`, `mil_predictions_labels`, `attention_weights`
    and `indels_reordered`, which must all be in the same (fold) order.

    Args:
        class_num: Integer class code; only samples whose true and predicted label both
            equal this code are used.
        sequence: Unused. Kept (now optional) so existing two-argument calls still work.

    Returns:
        Tuple of three lists with one 1-D array per selected sample:
        (all attention weights, weights of InDel instances, weights of non-InDel instances).
    """
    # Samples of this class that the model classified correctly.
    correct_predictions_idx = [
        i for i in range(len(correct))
        if correct[i] == mil_predictions_labels[i] and correct[i] == class_num
    ]

    all_sample_attention_weights = []
    indel_attention_weights = []
    non_indel_attention_weights = []
    for idx in correct_predictions_idx:
        # attention_weights[idx] is a list of one-element lists; flatten it to 1-D.
        sample_attention_weights = np.concatenate(attention_weights[idx])
        # Force a real boolean mask: `~` on an object-dtype array of Python bools would
        # produce integers (-1 / -2) instead of the inverted mask.
        is_indel = np.asarray(indels_reordered[idx], dtype=bool)

        indel_attention_weights.append(sample_attention_weights[is_indel])
        non_indel_attention_weights.append(sample_attention_weights[~is_indel])
        all_sample_attention_weights.append(sample_attention_weights)

    return all_sample_attention_weights, indel_attention_weights, non_indel_attention_weights

'''
attention_weights: [8920 samples, ragged length genes, 1 float value attention weight]
'''
# Returns the attention weight and associated instance gene of a specific cancer class where the attention weight is between the percent_lower and percent_upper bounds.
# This function can run on sample or instance level data, indicated by the use_sample_level boolean parameter.
def _percentile_bounds(sorted_weights, percent_lower, percent_upper):
    """Return (lower, upper) threshold values taken from an ascending, non-empty weight array."""
    n = len(sorted_weights)
    last = n - 1
    # round(n * percent_lower) can equal n (e.g. n=4, percent_lower=0.9) and
    # round(n * percent_upper) - 1 can be -1; clamp both into [0, n-1] so small inputs
    # neither raise IndexError nor silently wrap around to the other end of the array.
    lower_pos = min(max(round(n * percent_lower), 0), last)
    upper_pos = min(max(round(n * percent_upper) - 1, 0), last)
    return sorted_weights[lower_pos], sorted_weights[upper_pos]


def get_weights_and_genes(class_num, percent_lower, percent_upper, use_sample_level):
    """Select the genes whose attention weight lies inside a percentile band.

    Only samples of class `class_num` that were predicted correctly are used. Reads the
    notebook globals `correct`, `mil_predictions_labels`, `attention_weights`
    (per sample: a list of one-element [weight] lists) and `genes_reordered`.

    Args:
        class_num: Integer class code.
        percent_lower: Lower edge of the band as a fraction (0.9 means the 90% position).
        percent_upper: Upper edge of the band as a fraction.
        use_sample_level: If True the two thresholds are computed per sample from that
            sample's own weights; if False they are computed once from the pooled weights
            of all selected samples.

    Returns:
        (all_weights, filtered_sample_attention_weight_genes): the pooled 1-D weight array
        of the selected samples, and a list with one array per selected sample holding the
        gene codes whose weight is within [lower threshold, upper threshold].
        A class without correctly predicted samples yields (empty array, []).
    """
    # Get indexes of correct predictions of the specific cancer type
    correct_predictions_idx = [
        i for i in range(len(correct))
        if correct[i] == mil_predictions_labels[i] and correct[i] == class_num
    ]
    if not correct_predictions_idx:
        # np.concatenate([]) would raise; an empty result lets callers keep looping over classes.
        return np.array([]), []

    class_attention_weights = [attention_weights[idx] for idx in correct_predictions_idx]
    # Stacks to (total_instances, 1); .T[0] flattens to (total_instances,).
    all_weights = np.concatenate(class_attention_weights, axis=0).T[0]

    upper_thresh = None
    lower_thresh = None
    # Calculate threshold for attention weights (with respect to the cancer type level)
    if not use_sample_level:
        lower_thresh, upper_thresh = _percentile_bounds(np.sort(np.array(all_weights)), percent_lower, percent_upper)

    # For all the correctly predicted samples of this cancer, get the genes inside the band.
    filtered_sample_attention_weight_genes = []
    for idx in correct_predictions_idx:
        sample_attention_weights = np.concatenate(attention_weights[idx])
        # Calculate threshold for attention weights (with respect to the sample level)
        if use_sample_level:
            lower_thresh, upper_thresh = _percentile_bounds(np.sort(sample_attention_weights), percent_lower, percent_upper)
        in_band = (sample_attention_weights >= lower_thresh) & (sample_attention_weights <= upper_thresh)
        filtered_sample_attention_weight_genes.append(genes_reordered[idx][in_band])

    # filtered_sample_attention_weight_genes: [correctly predicted samples, ragged genes inside the band]
    return all_weights, filtered_sample_attention_weight_genes

# Returns the gene symbols of a specific cancer class that fall inside an attention-weight band,
# together with how often each gene occurs there. Genes are ordered by ascending frequency.
# The genes are pre-selected from the correctly predicted samples of the class by get_weights_and_genes.
# use_sample_level is forwarded to get_weights_and_genes (per-sample vs pooled thresholds).
# With frequency_filter=True only the 10 most frequent genes are returned.
def get_top_gene_labels(class_num, percent_lower, percent_upper, frequency_filter, use_sample_level):
    """Count the genes inside an attention band for one class and map them to symbols.

    Returns:
        (top_gene_labels, cancer_label, top_genes_counts): gene symbols looked up in the
        categories of tcga_maf['Hugo_Symbol'], the class name from `classes`, and the
        matching occurrence counts, all ordered by ascending count.
    """
    _, genes_per_sample = get_weights_and_genes(class_num, percent_lower, percent_upper, use_sample_level)

    # Pool the per-sample gene codes into one flat array and count each distinct code.
    pooled_codes = np.array([code for sample_codes in genes_per_sample for code in sample_codes])
    distinct_codes, code_counts = np.unique(pooled_codes, return_counts = True)

    ascending = np.argsort(code_counts)
    top_genes_counted = distinct_codes[ascending]
    top_genes_counts = code_counts[ascending]
    if frequency_filter:
        # The array is ascending, so the last ten entries are the most frequent ones.
        top_genes_counted = top_genes_counted[-10:]
        top_genes_counts = top_genes_counts[-10:]

    # Stored gene codes are category codes + 1; shift back to index the categories.
    category_positions = [code - 1 for code in top_genes_counted]
    top_gene_labels = tcga_maf['Hugo_Symbol'].cat.categories[category_positions]

    cancer_label = classes[class_num]

    return top_gene_labels, cancer_label, top_genes_counts


In [21]:
'''
we want to compare the count of genes in the top 10% of attention weights and the lower 
90% of attention weights using a Fisher Exact test to see if there is a relationship.
Note: no top 10 frequency of genes filter.
'''
# Returns the Fisher exact test p-value of each gene reported by get_top_gene_labels.
# The two categories compared in the test are the two attention-weight bands
# [min_perc_1, max_perc_1] and [min_perc_2, max_perc_2]; each band yields its own set of gene counts.
def fisher(cancer_type, min_perc_1, max_perc_1, min_perc_2, max_perc_2):
    """Run a per-gene Fisher exact test between two attention-weight bands of one class.

    For every gene seen in either band a 2x2 table is built:
        [[count of gene in band 1, all other gene counts in band 1],
         [count of gene in band 2, all other gene counts in band 2]]

    Returns:
        (gene_fisher, gene_10_fisher, gene_m):
        gene_fisher    - gene -> p-value rounded to 5 decimals, for genes of both bands;
        gene_10_fisher - the same p-values restricted to band-1 genes, sorted ascending;
        gene_m         - gene -> the 2x2 table that was tested.
    """
    band1_labels, _, band1_counts = get_top_gene_labels(cancer_type, min_perc_1, max_perc_1, frequency_filter=False, use_sample_level=False)
    band2_labels, _, band2_counts = get_top_gene_labels(cancer_type, min_perc_2, max_perc_2, frequency_filter=False, use_sample_level=False)

    band1_count_of = dict(zip(band1_labels, band1_counts))
    band2_count_of = dict(zip(band2_labels, band2_counts))
    band1_total = np.sum(np.array(band1_counts))
    band2_total = np.sum(np.array(band2_counts))

    gene_fisher = {}
    gene_m = {}
    for gene in np.union1d(np.array(band1_labels), np.array(band2_labels)):
        # A gene missing from a band contributes a count of zero there.
        in_band1 = band1_count_of.get(gene, 0)
        in_band2 = band2_count_of.get(gene, 0)
        table = [[in_band1, band1_total - in_band1],
                 [in_band2, band2_total - in_band2]]
        _, p_value = fisher_exact(table)
        gene_fisher[gene] = round(p_value, 5)
        gene_m[gene] = table

    # Band-1 genes only, ordered by ascending p-value (sorted() is stable, so ties keep band-1 order).
    band1_ranked = sorted(
        ((gene, gene_fisher[gene]) for gene in band1_labels if gene in gene_fisher),
        key=lambda pair: pair[1],
    )
    gene_10_fisher = dict(band1_ranked)

    return gene_fisher, gene_10_fisher, gene_m
    
'''gene_fisher, gene_10_fisher, gene_m = fisher(17, 0.9, 1.0, 0.0, 0.9)
plt.hist(gene_10_fisher.values(),bins=20)
plt.xlabel("fisher score")
plt.ylabel("freq")

print(gene_10_fisher)'''


{'PIK3R1': 0.0, 'ARID1A': 0.0, 'IDH2': 0.0, 'PIK3CA': 0.0, 'NOTCH1': 0.0, 'FUBP1': 0.0, 'IDH1': 0.0, 'CIC': 0.0, 'ATRX': 0.0001, 'NRAS': 0.00099, 'TP53': 0.00125, 'PBRM1': 0.00997, 'CTNNB1': 0.00997, 'ZBTB20': 0.08924, 'ACTC1': 0.1, 'KY': 0.1, 'VN1R4': 0.1, 'LCP1': 0.1, 'MUC4': 0.1, 'OR10S1': 0.1, 'KEL': 0.1, 'OTOL1': 0.1, 'PTEN': 0.1, 'RASGRP3': 0.1, 'RNF43': 0.1, 'SIK2': 0.1, 'SLC6A20': 0.1, 'TCF7L2': 0.1, 'UBR5': 0.1, 'PCDH11Y': 0.1, 'ACTRT1': 0.1, 'GHR': 0.1, 'FOXA1': 0.1, 'FBXO42': 0.1, 'EP300': 0.1, 'EFEMP1': 0.1, 'DNAH6': 0.1, 'ATP1B1': 0.1, 'CAST': 0.1, 'CNOT4': 0.1, 'CDK8': 0.1, 'MTOR': 0.19003, 'MYO5A': 0.19003, 'RAD21': 0.19003, 'IKZF3': 0.19003, 'LAMP2': 0.19003, 'ARID1B': 0.34403, 'BCOR': 0.34403, 'NIPBL': 1.0}


In [12]:
# Produces a plot of the genes of a cancer type that are in the top x percent of attention weights and have a gene count greater than y.
print("The top 10 of the top 10% genes based on attention weight for each cancer type:")
table_data = []
for i in range(len(classes)):
    top_gene_labels, cancer_label, gene_counts = get_top_gene_labels(i, 0.9, 1.0, True, False)
    gene_fisher, gene_10_fisher, gene_m = fisher(i, 0.9, 1.0, 0.0, 0.9)
    
    top_gene_label =  [str(list(top_gene_labels)[j]) + 
                        " (" +  str(list(gene_counts)[j]) + ")\n" +
                        str('\n'.join(pd.DataFrame(np.array(gene_m[top_gene_labels[j]])).to_string(index = False).split('\n')[1:])) +
                       "\np:" + str(gene_fisher[top_gene_labels[j]])
                        for j in range(len(list(top_gene_labels)))]
    row = [cancer_label]
    row.extend(top_gene_label)
    while len(row) < 11:
        row.append("")
    table_data.append(np.array(row))
    
%matplotlib
plt.figure(figsize=(16,8))
table = plt.table(cellText=table_data,loc="center") #, colWidths=widths)
table.auto_set_font_size(False)
table.set_fontsize(5)
table.scale(1,2)
plt.axis("off")
plt.show()


The top 10 of the top 10% genes based on attention weight for each cancer type:
Using matplotlib backend: Qt5Agg


In [13]:
# Produces a Matrix of histogram of the Fisher score distribution of the top 10% attention weighted genes for each cancer type
%matplotlib
fig,axs = plt.subplots(5,6)
for i in range(len(classes)):
    gene_fisher, gene_10_fisher, gene_m = fisher(i, 0.9, 1.0, 0.0, 0.9)
    plt.hist(gene_10_fisher.values(),bins=20)

    bin_nums = 20
    #plotting
    axs[i//6,i%6].hist(gene_10_fisher.values(), bins=bin_nums)
    axs[i//6,i%6].set_title(classes[i],fontsize=8)
    fig.suptitle("Fisher score distribution of the top 10% attention weighted genes for each cancer type")

Using matplotlib backend: Qt5Agg


In [None]:
# Produces a plot of the genes of a cancer type that are in the LOWER x percent of attention weights and have a gene count greater than y.
print("The top 10 of the botttom 90% genes based on attention weight for each cancer type:")
table_data = []
for i in range(len(classes)):
    top_gene_labels, cancer_label, gene_counts = get_top_gene_labels(i, 0.0, 0.9, True, False)
    top_gene_labels =  [str(list(top_gene_labels)[i]) + " \n(" +  str(list(gene_counts)[i]) + ")" for i in range(len(list(top_gene_labels)))]
    row = [cancer_label]
    row.extend(top_gene_labels)
    while len(row) < 11:
        row.append("")
    table_data.append(np.array(row))
    #print(row)
    
%matplotlib
plt.figure(figsize=(16,8))
widths = [1/(len(table_data[0])+5) for i in range(len(table_data[0]))]
widths[0] = 5/(len(table_data[0])+5)
table = plt.table(cellText=table_data,loc="center", colWidths=widths)
table.auto_set_font_size(False)
table.set_fontsize(8)
table.scale(1,2)

plt.axis("off")
plt.show()

In [None]:
# Produces a Matrix of histogram of the frequency of genes in the top 10% attention weighted genes for each cancer type
%matplotlib
fig,axs = plt.subplots(5,6)
for i in range(27):
    w, f = get_weights_and_genes(i, 0.9, 1.0, False)
    
    #for plotting:
    filtered_instance_attention_genes = np.concatenate(np.expand_dims(f,axis=0)[0])

    bin_nums = 100
    #plotting
    axs[i//6,i%6].hist(filtered_instance_attention_genes, bins=bin_nums)
    axs[i//6,i%6].set_title(classes[i],fontsize=8)
    fig.suptitle("Number of genes with attention weights in top 10%")


In [None]:
# Produces a matrix of histograms representing the frequency of attention weights for hotspot instances and non-hotspot instances for all cancer types.
%matplotlib
fig,ax = plt.subplots(5,6)
for i in range(27):
    t_, h_, nh_ = get_position_weight_hotspots(i, five_p) 
    c1 = 0;c2 = 0;c3 = 0
    for n in t_:
        c1 += len(n)
    for n in h_:
        c2 += len(n)
    for n in nh_:
        c3 += len(n)

    perc_hotspot = str(np.round_(100*c2/c1, 3))

    bin_nums = 200
    hotspot_att_weights = []
    for weights in h_:
        hotspot_att_weights.extend(weights)
    non_hotspot_att_weights = []
    for weights in nh_:
        non_hotspot_att_weights.extend(weights)

    #fig, ax = plt.subplots(figsize=(12, 6))
    ax[i//6,i%6].hist(non_hotspot_att_weights, bins=bin_nums, label="non hotspots attention weights")
    ax[i//6,i%6].hist(hotspot_att_weights, bins=bin_nums, label="hotspots attention weights")
    ax[i//6,i%6].set_xlabel('attention weights')
    ax[i//6,i%6].set_ylabel('Count')
    ax[i//6,i%6].set_title(classes[i] + "; " + perc_hotspot + "%", fontsize=8)
#fig.legend()
fig.suptitle("Gene HotSpots Attention Weight Distribution (orange is hotspots)")
    #plt.show()

In [None]:
# Produces a matrix of histograms representing the frequency of attention weights for indel instances and non-indel instances for all cancer types.
%matplotlib
fig,ax = plt.subplots(5,6)
for i in range(27):
    t_, h_, nh_ = get_position_weight_hotspots(i,genes)
    c1 = 0;c2 = 0;c3 = 0
    for n in t_:
        c1 += len(n)
    for n in h_:
        c2 += len(n)
    for n in nh_:
        c3 += len(n)

    perc_hotspot = str(np.round_(100*c2/c1, 3))

    bin_nums = 200
    hotspot_att_weights = []
    for weights in h_:
        hotspot_att_weights.extend(weights)
    non_hotspot_att_weights = []
    for weights in nh_:
        non_hotspot_att_weights.extend(weights)

    #fig, ax = plt.subplots(figsize=(12, 6))
    ax[i//6,i%6].hist(non_hotspot_att_weights, bins=bin_nums, label="non indels attention weights")
    ax[i//6,i%6].hist(hotspot_att_weights, bins=bin_nums, label="indels attention weights")
    ax[i//6,i%6].set_xlabel('attention weights')
    ax[i//6,i%6].set_ylabel('Count')
    ax[i//6,i%6].set_title(classes[i] + "; " + perc_hotspot + "%", fontsize=8)
#fig.legend()
fig.suptitle("Gene InDel Attention Weight Distribution (orange is hotspots)")
    #plt.show()