# Explore some GENIE data

## Data Explanation

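* MAF file format (GDC): https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/
* cBioPortal mutation data format: https://docs.cbioportal.org/file-formats/#mutation-data
* cBioPortal discrete copy-number data format: https://docs.cbioportal.org/file-formats/#discrete-copy-number-data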

## GENIE Datasets

* GENIE 13.3 consortium https://www.synapse.org/#!Synapse:syn36709873
  * data_clinical_patient https://www.synapse.org/#!Synapse:syn36710136
  * data_clinical_sample https://www.synapse.org/#!Synapse:syn36710137
  * data_mutations_extended https://www.synapse.org/#!Synapse:syn36710142
  * data_CNA https://www.synapse.org/#!Synapse:syn36710134
* GENIE 12.0 public https://www.synapse.org/#!Synapse:syn32309524
  * data_clinical_patient https://www.synapse.org/#!Synapse:syn32689054
  * data_clinical_sample https://www.synapse.org/#!Synapse:syn32689057
  * data_mutations_extended https://www.synapse.org/#!Synapse:syn32689317
  * data_CNA https://www.synapse.org/#!Synapse:syn32689019

In [None]:
# Synapse entity IDs for each GENIE release, keyed first by release label and
# then by data-file name.
SYNIDS = {
    # GENIE 13.3 (consortium release)
    "genie_13.3": {
        "data_clinical_patient": "syn36710136",
        "data_clinical_sample": "syn36710137",
        "data_mutations_extended": "syn36710142",
        "data_CNA": "syn36710134",
    },
    # GENIE 12.0 (public release)
    "genie_12.0": {
        "data_clinical_patient": "syn32689054",
        "data_clinical_sample": "syn32689057",
        "data_mutations_extended": "syn32689317",
        "data_CNA": "syn32689019",
    },
}

In [None]:
import json
from pathlib import Path
import sys
from timeit import default_timer

import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import plotly.express as px

In [None]:
from hack4nf.synapse import get_dataset
from hack4nf.genie_reader import (
    read_clinical_patient, 
    read_clinical_sample, 
    read_mutations_extended,
    read_cna,
)

In [None]:
from IPython.display import HTML, display

# Inject a CSS rule that forces elements with class `container` to 100% width
# (presumably the notebook page container, so wide tables are not clipped).
_full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(_full_width_css))

In [None]:
# Show up to 50 columns when a DataFrame is rendered.
pd.options.display.max_columns = 50

In [None]:
# Key into SYNIDS choosing which GENIE release the loading cells below read.
genie_dataset_version = "genie_12.0"

# Data Clinical Patient

In [None]:
# Fetch the clinical-patient file for the selected release, parse it into a
# DataFrame, and render a minimal profiling report inline.
syn_file = get_dataset(
    SYNIDS[genie_dataset_version]["data_clinical_patient"]
)
df_dcp = read_clinical_patient(syn_file.path)
profile_dcp = ProfileReport(
    df_dcp,
    title="Data Clinical Patient",
    minimal=True,
)
profile_dcp.to_notebook_iframe()

# Data Clinical Sample

In [None]:
# Fetch the clinical-sample file for the selected release, parse it into a
# DataFrame, and render a minimal profiling report inline.
syn_file = get_dataset(
    SYNIDS[genie_dataset_version]["data_clinical_sample"]
)
df_dcs = read_clinical_sample(syn_file.path)
profile_dcs = ProfileReport(
    df_dcs,
    title="Data Clinical Sample",
    minimal=True,
)
profile_dcs.to_notebook_iframe()

# Data Mutations Extended

In [None]:
# Fetch the extended-mutations file for the selected release, parse it into a
# DataFrame, and render a minimal profiling report inline.
syn_file = get_dataset(
    SYNIDS[genie_dataset_version]["data_mutations_extended"]
)
df_dme = read_mutations_extended(syn_file.path)
profile_dme = ProfileReport(
    df_dme,
    title="Data Mutations Extended",
    minimal=True,
)
profile_dme.to_notebook_iframe()

In [None]:
# Display the clinical-patient DataFrame (last expression of the cell).
df_dcp

In [None]:
# Display the clinical-sample DataFrame (last expression of the cell).
df_dcs

In [None]:
# Oncotree codes of interest; reused by later cells to filter merged data.
oncotree_codes = [
    "NST",
    "GNBL",
    "GN",
    "NBL",
    "SCHW",
    "NFIB",
    "MPNST",
]
# Samples whose ONCOTREE_CODE is one of the codes above.
df_dcs.loc[df_dcs["ONCOTREE_CODE"].isin(oncotree_codes)]

In [None]:
# Display the extended-mutations DataFrame (last expression of the cell).
df_dme

In [None]:
# Subset of mutation-table columns used throughout the rest of the notebook.
dme_cols = [
    # sample / gene / location identifiers
    "Tumor_Sample_Barcode",
    "Hugo_Symbol",
    "Chromosome",
    # variant category
    "Variant_Classification",
    "Variant_Type",
    # read counts and depths
    "t_ref_count",
    "t_alt_count",
    "n_depth",
    "t_depth",
    # variant annotation
    "dbSNP_RS",
    "HGVSp_Short",
    "Protein_position",
    "Codons",
    "Exon_Number",
    # PolyPhen / SIFT predictions
    "Polyphen_Prediction",
    "Polyphen_Score",
    "SIFT_Prediction",
    "SIFT_Score",
]
# View all mutation rows restricted to the curated column subset.
df_dme.loc[:, dme_cols]

In [None]:
# Keep only mutation rows whose gene symbol is one of the four genes of
# interest, then show the curated columns for those rows.
is_nf_gene = df_dme["Hugo_Symbol"].isin(["NF1", "NF2", "SMARCB1", "LZTR1"])
df_dme_nf = df_dme[is_nf_gene]
df_dme_nf[dme_cols]

In [None]:
# Sanity check: every mutation row's Tumor_Sample_Barcode must appear as a
# SAMPLE_ID in the clinical-sample table.
barcode_has_sample = df_dme["Tumor_Sample_Barcode"].isin(df_dcs["SAMPLE_ID"])
assert barcode_has_sample.all()

In [None]:
# The 20 most frequent HGVSp_Short values among the NF-gene mutation rows.
nf_hgvsp_counts = df_dme_nf["HGVSp_Short"].value_counts()
nf_hgvsp_counts.head(20)

In [None]:
# Attach the clinical-sample columns to each mutation row (curated columns
# only), matching Tumor_Sample_Barcode against SAMPLE_ID. Inner join is the
# pandas default and is spelled out here for clarity.
df_mrg = df_dme[dme_cols].merge(
    df_dcs,
    how="inner",
    left_on="Tumor_Sample_Barcode",
    right_on="SAMPLE_ID",
)

In [None]:
# Display the merged mutation + clinical-sample DataFrame.
df_mrg

# What are the oncotree codes for NF-related genes?

In [None]:
# Oncotree code frequencies among mutation rows for NF1.
df_mrg.loc[
    df_mrg["Hugo_Symbol"].isin(["NF1"]),
    "ONCOTREE_CODE",
].value_counts()

In [None]:
# Oncotree code frequencies among mutation rows for NF2.
df_mrg.loc[
    df_mrg["Hugo_Symbol"].isin(["NF2"]),
    "ONCOTREE_CODE",
].value_counts()

In [None]:
# Oncotree code frequencies among mutation rows for SMARCB1.
df_mrg.loc[
    df_mrg["Hugo_Symbol"].isin(["SMARCB1"]),
    "ONCOTREE_CODE",
].value_counts()

In [None]:
# Oncotree code frequencies among mutation rows for LZTR1.
df_mrg.loc[
    df_mrg["Hugo_Symbol"].isin(["LZTR1"]),
    "ONCOTREE_CODE",
].value_counts()

# Which genes are mutated in samples with NF oncotree codes?

In [None]:
# Gene-symbol frequencies among mutation rows whose sample carries one of the
# oncotree codes of interest.
df_mrg.loc[
    df_mrg["ONCOTREE_CODE"].isin(oncotree_codes),
    "Hugo_Symbol",
].value_counts()

# Random

In [None]:
# Number of mutation rows per tumor sample, sorted ascending.
mutation_rows_per_sample = df_dme.groupby("Tumor_Sample_Barcode").size()
mutation_rows_per_sample.sort_values()

In [None]:
# First 20 mutation rows (curated columns) for one specific sample.
df_dme.loc[
    df_dme["Tumor_Sample_Barcode"] == "GENIE-UHN-692643-ARC2",
    dme_cols,
].head(20)

In [None]:
# Fetch the copy-number (CNA) file for the selected release and parse it.
syn_file = get_dataset(
    SYNIDS[genie_dataset_version]["data_CNA"]
)
df_dcna = read_cna(syn_file.path)

In [None]:
# Display the copy-number DataFrame (last expression of the cell).
df_dcna

In [None]:
# Count the non-missing copy-number entries in one sample's column.
df_dcna["GENIE-DFCI-009184-6622"].notna().sum()

In [None]:
# Copy-number rows for the four genes of interest, selected by index label
# (assumes df_dcna is indexed by gene symbol — TODO confirm in read_cna).
df_dcna.loc[["NF1", "NF2", "SMARCB1", "LZTR1"], :]

In [None]:
# Flatten the copy-number table to a 1-D array of its non-NaN entries.
cna_values = df_dcna.values
vals = cna_values[~np.isnan(cna_values)]