# Extract Gene Sets
This section contains functionality to extract gene sets from symbolically linked disease sets. Once the gene sets are extracted, Monte Carlo simulations can be performed against a locally run DGIdb to generate histograms of returned drugs. This will (hopefully) identify gene sets that are statistically significant and relevant to diseases.

In [114]:
import pandas as pd
import re
import requests
import json
import random
import plotly.express as px

In [117]:
# Define disease sets
# Each name is used below to build '../symlinks/panels/<name>.yaml' and
# '../../diseases/<name>/' paths, so the spelling must match the files on disk.
disease_sets = [
    'brainCancer',
    'cardiacDisorders',
    'cnsCancer',
    'hereditaryCancer',
    'metabolicDiseases',
    'renalDiseases',
    'skinDiseases',
    't-All',
    'wilmsTumor',
    'MDS',
    'epilepsy',
    'pediatricCancer',
    'pediatricDisease',
]


In [118]:
# Open yaml files to get excel names  (missing epilepsy & mds & pediatricCancer & pediatricDisease)
# Each '<disease>.yaml' is read as plain text, one entry per line; the result
# is one list of entries per disease, in the same order as disease_sets.

excel_file_list = []
for disease_name in disease_sets:
    panel_index_path = '../symlinks/panels/' + disease_name + '.yaml'
    with open(panel_index_path) as panel_index:
        excel_file_list.append(panel_index.read().splitlines())



In [119]:
# Functions to get genes
def get_genes(df):
    """Return the gene symbols held by *df* as a plain list.

    Reads the ``geneSymbol`` attribute of *df* and falls back to ``Gene``
    when ``geneSymbol`` does not exist.

    Args:
        df: object exposing a ``geneSymbol`` or ``Gene`` iterable (the
            notebook passes DataFrames returned by ``pd.read_excel``).

    Returns:
        list: the values of the first attribute found, in order.

    Raises:
        AttributeError: if *df* has neither ``geneSymbol`` nor ``Gene``.
    """
    try:
        symbols = df.geneSymbol
    except AttributeError:
        # Only a missing attribute triggers the fallback; a bare ``except``
        # here would also have hidden KeyboardInterrupt and real bugs.
        symbols = df.Gene

    return list(symbols)

In [120]:
# Open files and retrieve metrics
# For every panel file of every disease, try the 'genes' sheet first and the
# 'data' sheet second. The four lists below stay index-aligned: one entry per
# panel file, whether or not it could be read.

genes_df_list = list()
disease_df_list = list()
source_file_list = list()
genes_length_list = list()

for disease_name, panel_files in zip(disease_sets, excel_file_list):
    for panel_file in panel_files:
        panel_path = '../../diseases/' + disease_name + '/' + str(panel_file)

        genes = None
        for sheet in ('genes', 'data'):
            try:
                genes = get_genes(pd.read_excel(panel_path, sheet_name=sheet, index_col=0))
            except Exception:
                # Best effort: a missing file, a missing sheet or a missing gene
                # column all mean "try the next sheet". ``except Exception``
                # (rather than a bare ``except``) keeps Ctrl-C working.
                continue
            break

        if genes is None:
            # Neither sheet was usable: print the path and record placeholders
            print(panel_path)
            genes_df_list.append('None')
            genes_length_list.append('Nan')
        else:
            genes_df_list.append(genes)
            genes_length_list.append(str(len(genes)))
            print('Opened file: ' + panel_file + ' for disease ' + disease_name)

        disease_df_list.append(disease_name)
        source_file_list.append(panel_file)

Opened file: Ambry Genetics BrainTumorNext ®.xlsx for disease brainCancer
Opened file: Baylor Miraca Genetics Laboratories Hereditary Brain CNS PNS Cancer Panel.xlsx for disease brainCancer
../../diseases/brainCancer/Children's Hospital of Philadelphia - Division of Genomic Diagnostics Hereditary Brain Tumor Panel.xlsx
Opened file: EGL Genetics Brain, CNS, and PNS Cancer Panel: Sequencing and CNV Analysis.xlsx for disease brainCancer
Opened file: EGL Genetics Brain, CNS, and PNS Cancer: Deletion Duplication Panel.xlsx for disease brainCancer
Opened file: Fulgent Genetics Nervous System   Brain Cancer Comprehensive Panel  Sequencing Only).xlsx for disease brainCancer
Opened file: BioReference Laboratories OnkoSight Glioma Panel.xlsx for disease brainCancer
Opened file: GeneDx Brain Tumor Panel.xlsx for disease brainCancer
Opened file: Integrated Genetics VistaSeq Brain   CNS   PNS Cancer Panel.xlsx for disease brainCancer
Opened file: Invitae Invitae Nervous System Brain Cancer Panel - 

In [121]:
# One row per panel file: disease name, source file, gene list (or 'None'),
# and the gene count stored as a string ('Nan' when the file was unreadable).
genes_df = pd.DataFrame().assign(
    disease=disease_df_list,
    source=source_file_list,
    genes=genes_df_list,
    number=genes_length_list,
)
genes_df

Unnamed: 0,disease,source,genes,number
0,brainCancer,Ambry Genetics BrainTumorNext ®.xlsx,"[AIP, ALK, APC, CDKN1B, CDKN2A, DICER1, EPCAM,...",29
1,brainCancer,Baylor Miraca Genetics Laboratories Hereditary...,"[ALK, APC, ATM, MEN1, MLH1, MRE11, MSH2, MSH6,...",17
2,brainCancer,Children's Hospital of Philadelphia - Division...,,Nan
3,brainCancer,"EGL Genetics Brain, CNS, and PNS Cancer Panel:...","[ALK, APC, ATM, MEN1, MLH1, MSH2, MSH6, NBN, N...",15
4,brainCancer,"EGL Genetics Brain, CNS, and PNS Cancer: Delet...","[ALK, APC, ATM, MEN1, MLH1, MSH2, MSH6, NBN, N...",15
...,...,...,...,...
159,pediatricDisease,Pharmacogenomics_Paediatric.xlsx,"[CACNA1S, CYP2C19, CYP2C9, CYP2D6, CYP3A5, G6P...",17
160,pediatricDisease,PreventionGenetics Pediatric Cancer Panel.xlsx,"[ALK, ANKRD26, APC, ATM, AXIN2, BAP1, BLM, BMP...",64
161,pediatricDisease,Severe Paediatric Disorders.xlsx,"[AAAS, AARS, AARS2, AASS, ABAT, ABCA1, ABCA12,...",2691
162,pediatricDisease,Unexplained paediatric onset end-stage renal d...,"[ACE, ACTG2, ACTN4, AGT, AGTR1, AGXT, AHI1, AL...",231


# Define Monte Carlo functionality
Gene sets have been extracted, with the appropriate metadata attached, into one big dataframe. Build functionality to perform Monte Carlo simulations against a local DGIdb instance, using the length of the gene list as the sample size. Enable functionality for both source level and disease level? I think disease level is the focus.

In [123]:
# Get gene set as defined by diseases
def get_gene_set(disease,df):
    """Return the de-duplicated genes of every row of *df* whose 'disease' equals *disease*.

    Rows whose ``genes`` value is the string 'None' are skipped. The result
    is a list built from a set, so its order is not defined.
    """
    matching_rows = df[df['disease']==disease]

    pooled_genes = [
        gene
        for panel_genes in matching_rows.genes
        if panel_genes != 'None'
        for gene in panel_genes
    ]

    return list(set(pooled_genes))

# Get drug JSON from locally run DGIdb
def get_json(input_genes):
    """Query the DGIdb instance at localhost:3000 for interactions of *input_genes*.

    Args:
        input_genes: iterable of gene-name strings; they are joined with
            commas into the ``genes`` query parameter.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    # Passing the query through ``params`` lets requests URL-encode the values,
    # so gene names containing characters such as '&', '#' or spaces cannot
    # corrupt the query string.
    # NOTE(review): no timeout is set, so a stalled server blocks this call
    # indefinitely - consider adding one that suits the largest gene sets.
    response = requests.get(
        'http://localhost:3000/api/v2/interactions.json',
        params={'genes': ','.join(input_genes), 'fda_approved_drug': 'true'},
    )
    # Fail here with a clear HTTP error rather than later on a malformed body
    response.raise_for_status()

    return response.json()

# Get number of drugs from JSON (more data points can be added here)
def get_monte_carlo_data_points(data):
    """Return the summed length of 'interactions' over every entry of data['matchedTerms']."""
    return sum(len(term['interactions']) for term in data['matchedTerms'])


In [73]:
# Smoke test: pool the brainCancer panels into one de-duplicated gene list
# and display how many genes it contains.
lol = get_gene_set('brainCancer',genes_df)
len(lol)

76

In [65]:
# Fetch the interaction JSON for the brainCancer gene list built above
data = get_json(lol)

In [88]:
# Total number of interactions across all matched terms in `data`
result = get_monte_carlo_data_points(data)
result

2316

6

# Random sampling functionality
Random sampling functionality for simulations. Use previously made random gene sets.

In [124]:
def get_random_sample(length):
    """Return *length* entries drawn without replacement from the lines of 'allGenes.yaml'.

    The file is re-read on every call and treated as plain text, one entry
    per line.
    """
    with open('allGenes.yaml') as gene_file:
        gene_pool = gene_file.read().splitlines()

    return random.sample(gene_pool, length)

# For random sampling purposes, save json for later retrieval
def save_json(data,file_name):
    """Write *data* as JSON to 'json/<file_name>.json', overwriting any existing file.

    The 'json/' directory is not created here. Returns None.
    """
    target_path = 'json/' + file_name + '.json'
    with open(target_path, 'w') as sink:
        json.dump(data, sink)

In [92]:
# Draw a random gene sample with the same size as the brainCancer gene list
random_genes = get_random_sample(len(lol))
random_genes

['KRTAP10-9',
 'RCL1',
 'PHAX',
 'LINC01395',
 'HDHD2',
 'OTOA',
 'SNAPIN',
 'KIAA0101',
 'CRK',
 'ITGA4',
 'CXCL5',
 'CHRFAM7A',
 'ANAPC7',
 'ZNF266',
 'MYO9A',
 'UBE2R2',
 'DCLK2',
 'ROBO4',
 'GSC',
 'TSSK4',
 'ZBTB10',
 'IRAK3',
 'GCOM1',
 'LINC01138',
 'SNORD116-2',
 'PPP2R5A',
 'SDIM1',
 'EP400NL',
 'SNORD115-43',
 'JAG2',
 'NALCN-AS1',
 'GPATCH11',
 'ARFIP2',
 'OSBPL8',
 'LINC01288',
 'KIAA1715',
 'CDKN3',
 'PIRC84',
 'AKR7A2',
 'ARHGAP5',
 'RAPGEF4-AS1',
 'SCN2A',
 'IRX5',
 'NGFR',
 'TRIM65',
 'LACTB2',
 'AGK',
 'IDDM4',
 'FEV',
 'C7orf72',
 'PROS1',
 'ZMYND8',
 'OMD',
 'ATCAY',
 'C3orf56',
 'WNT7B',
 'ECEL1',
 'VASP',
 'PLCH2',
 'TLCD1',
 'NUTM2A',
 'U2AF1L4',
 'AGAP4',
 'LINC01488',
 'OR52W1',
 'TMEM201',
 'TMED8',
 'PAUPAR',
 'IFNA14',
 'DONSON',
 'FBXO31',
 'ATR',
 'SQSTM1',
 'PAX6',
 'TRS-CGA3-1',
 'RPS5']

In [96]:
# Fetch the interaction JSON for the random gene sample
random_data = get_json(random_genes)


In [95]:
# Total number of interactions for the random sample, for comparison with `result`
random_result = get_monte_carlo_data_points(random_data)
random_result

93

# Putting it all together
Run an actual Monte Carlo simulation with 500 samples (1 disease gene set + 499 random samples).

In [131]:

def run_monte_carlo_simulation(genes_df,disease,n_samples=500):
    """Run one Monte Carlo simulation for *disease* and save the result to Excel.

    The first sample is the disease's own gene set (``get_gene_set``). The
    remaining ``n_samples - 1`` samples are random gene sets of the same size
    (``get_random_sample``). Every sample is sent through ``get_json`` and
    scored with ``get_monte_carlo_data_points``.

    Args:
        genes_df: DataFrame handed to ``get_gene_set``.
        disease: disease name; labels the primary row and names the output file.
        n_samples: total number of samples, primary sample included
            (default 500, i.e. 1 primary + 499 random).

    Returns:
        DataFrame with columns ``sample_source``, ``number_of_drugs`` and
        ``genes_input``. The same frame is written to
        'simulation_results/<disease>.xlsx', sheet 'results'.
    """
    # Sample counts at which a progress line is printed
    progress_checkpoints = (50, 100, 200, 300, 400, 500)

    # Primary sample
    primary_sample = get_gene_set(disease,genes_df)
    primary_result = get_monte_carlo_data_points(get_json(primary_sample))

    sample_source_list = [disease]
    # Stored as int, like the random rows, so the column has a single type
    number_of_drugs_list = [int(primary_result)]
    genes_input_list = [primary_sample]

    # samples_done counts the primary sample too, hence it runs 2..n_samples
    for samples_done in range(2, n_samples + 1):
        random_sample = get_random_sample(len(primary_sample))
        random_result = get_monte_carlo_data_points(get_json(random_sample))

        sample_source_list.append('random_sample')
        number_of_drugs_list.append(int(random_result))
        genes_input_list.append(random_sample)

        # Membership test; the previous `== 1 | 50 | ...` was a bitwise OR
        # (value 511) and therefore never matched.
        if samples_done in progress_checkpoints:
            print('On iteration ' + str(samples_done) + ' for disease state: ' + disease)

    df = pd.DataFrame().assign(
        sample_source=sample_source_list,
        number_of_drugs=number_of_drugs_list,
        genes_input=genes_input_list,
    )

    # Write DataFrame to an excel file named after the disease
    print('Saving simulation result for ' + disease)
    # The context manager saves and closes the workbook on exit
    with pd.ExcelWriter('simulation_results/' + disease + '.xlsx') as writer:
        df.to_excel(writer,sheet_name='results')

    return df

In [128]:
# Print every disease set name, one per line
for i in disease_sets:
    print(i)

brainCancer
cardiacDisorders
cnsCancer
hereditaryCancer
metabolicDiseases
renalDiseases
skinDiseases
t-All
wilmsTumor
MDS
epilepsy
pediatricCancer
pediatricDisease


In [132]:
# Run the simulation for every disease set; each call writes its own Excel
# file, and the returned DataFrame is discarded here.
for i in disease_sets:
    run_monte_carlo_simulation(genes_df,i)

Saving simulation result for brainCancer


# Plot resulting histogram
The simulation has been run and the data saved. Plot the results!

In [106]:
# quick transform to int on accident
temp = monte_carlo_df
index_counter = 0
for i in monte_carlo_df['number_of_drugs']:
    monte_carlo_df['number_of_drugs'][index_counter] = int(i)
    

In [107]:
# Histogram of number_of_drugs, coloured by sample_source, with a marginal box plot
fig = px.histogram(temp, x='number_of_drugs', color='sample_source', marginal='box')
fig.show()