# Write Open Cravat Input and Read Open Cravat Output


In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
from hack4nf.synapse import get_dataset
from hack4nf.genie_utils import (
    read_clinical_patient, 
    read_clinical_sample, 
    read_mutations_extended,
    read_cna,
    read_cna_seg,
    SYNIDS,
    dme_to_cravat,
    get_cna_norms,
    get_melted_cna,
)

In [None]:
#from IPython.display import display, HTML
#display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Let wide frames show up to 200 columns before pandas elides the middle ones.
pd.options.display.max_columns = 200

In [None]:
# GENIE release to work with. The value is used as a key into SYNIDS and as the
# prefix of the Open Cravat file names used in this notebook.
# The commented-out value is presumably an alternative release - confirm it is a
# key of SYNIDS before switching.
#genie_dataset_version = "genie_12.0"
genie_dataset_version = "genie_13.3"

# Synapse File Paths

If you are not using the Python client to sync Synapse data, replace these file paths with the corresponding file paths on your local system.

In [None]:
# Files needed from the selected GENIE release, in the order they are fetched.
dataset_keys = (
    'data_clinical_patient',
    'data_clinical_sample',
    'data_mutations_extended',
    'data_CNA',
    'data_cna_hg19_seg',
)

# Map each file key to the `.path` of the object get_dataset returns for its
# Synapse ID. To use files already on disk, assign a plain dict with these same
# keys and your own paths instead.
syn_file_paths = {
    key: get_dataset(SYNIDS[genie_dataset_version][key]).path
    for key in dataset_keys
}
syn_file_paths

In [None]:
# Path of the Open Cravat output file (name ends in `.variant.tsv`) that is
# loaded with pd.read_csv further down. The file name is prefixed with the
# selected GENIE release.
cravat_file_path = f"../data/open-cravat/challenge-1/run1/{genie_dataset_version}-oc-input.txt.variant.tsv"

# GENIE - Data Mutations Extended

In [None]:
# Load the mutations table and derive the Open Cravat input columns from it.
df_dme = read_mutations_extended(syn_file_paths['data_mutations_extended'])
df_cravat = dme_to_cravat(df_dme)

# Number the rows 0..n-1; this id is the key used later to join the Open Cravat
# output back onto the mutation rows.
df_cravat['INDIVIDUAL'] = range(len(df_dme))

# Put the Open Cravat columns next to the original mutation columns.
df_dme = pd.concat([df_dme, df_cravat], axis='columns')
df_dme

In [None]:
# Show the Open Cravat input frame (dme_to_cravat output plus the INDIVIDUAL row id).
df_cravat

In [None]:
# only need to write once
#df_cravat.to_csv(f'{genie_dataset_version}-oc-input.txt', sep='\t', index=False)

# Read Open Cravat Output

In [None]:
# Load the tab-separated Open Cravat output. Lines that start with '#' are
# skipped entirely.
# NOTE(review): pandas' `comment` option also drops the remainder of a data row
# from the first '#' onwards - confirm no column value can contain '#'.
df_crv_unique = pd.read_csv(
    cravat_file_path,
    sep='\t',
    comment='#',
    low_memory=False,
)

In [None]:
# Show the Open Cravat table as loaded, before the `samples` column is split.
df_crv_unique

In [None]:
def _parse_sample_ids(raw):
    """Return the sample ids of one `samples` cell as a list of ints.

    A cell that is already a list (this notebook cell was run before on the
    same frame) is returned unchanged, so re-running the cell no longer fails
    with ``AttributeError: 'list' object has no attribute 'split'``.
    """
    if isinstance(raw, list):
        return raw
    return [int(sample_id) for sample_id in raw.split(';')]


# `samples` holds ';'-separated integer ids. Parse them in place, then emit one
# row per id so that each row can be matched to a single INDIVIDUAL value.
df_crv_unique['samples'] = df_crv_unique['samples'].apply(_parse_sample_ids)
df_crv = df_crv_unique.explode('samples')

In [None]:
# Left-join the Open Cravat rows onto the mutation rows: every df_dme row is
# kept, and rows without a matching `samples` id get NaN in the joined columns.
df = df_dme.merge(
    df_crv,
    how='left',
    left_on='INDIVIDUAL',
    right_on='samples',
)

In [None]:
# Show the mutation rows joined with the Open Cravat output columns.
df

In [None]:
# Shape of the pre-merge frame; presumably a sanity check that the left merge
# did not add rows - compare with the row count of `df`.
df_dme.shape

In [None]:
# Replace missing disease names with an empty string so the `.str` matching
# done below yields booleans for every row rather than NaN.
disease_col = 'clinvar.disease_names'
df[disease_col] = df[disease_col].fillna('')

In [None]:
# Search phrases for the ClinVar disease-name filter. They are compared against
# the lowercased `clinvar.disease_names` column, so they must be lowercase.
# NOTE(review): the second phrase presumably omits a leading "cafe-"/"café-" to
# avoid accent or encoding variants - confirm.
clinvar_disease_names = [
    'neurofibromatosis',
    'au-lait macules with pulmonary stenosis',
]

In [None]:
# Count how often each distinct ClinVar disease-name string occurs among the
# rows that mention either search phrase.

# Lowercase the column once and reuse it for both phrase checks.
disease_names_lower = df['clinvar.disease_names'].str.lower()

# regex=False: the phrases are literal text, so match them as plain substrings
# rather than as regular expressions. na=False: a missing value never matches,
# which keeps the masks strictly boolean.
bmask0 = disease_names_lower.str.contains(clinvar_disease_names[0], regex=False, na=False)
bmask1 = disease_names_lower.str.contains(clinvar_disease_names[1], regex=False, na=False)

# Select rows and column in one .loc call instead of chained df[mask][col].
dns = df.loc[bmask0 | bmask1, 'clinvar.disease_names'].value_counts()
dns