# Preprocessing 
- Prepare SMR files for analysis
    - Create a DataFrame containing only known genes from NDD-related tissues/omics and from NDD GWAS

In [1]:
import glob
import pandas as pd
import biomart as bm
import json
import pyensembl
import sys
import subprocess
import os

In [2]:
# create dfs for each omic and concat into one big df
def central_df(omic, df):
    """Read every result file listed in ``df`` and stack them into one frame.

    Parameters
    ----------
    omic : str
        Label written into the ``Omic`` column of every returned row.
    df : pandas.DataFrame
        One row per result file. Each row must expose ``Path`` (a
        tab-separated file readable by ``pd.read_csv``) and ``Disease``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, with ``Omic`` and ``Disease``
        inserted as the first two columns. The per-file indices are kept.
        An empty frame is returned when ``df`` has no rows.
    """
    print(f'Creating {omic} main df')

    # collect the per-file frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop re-copies all earlier rows each time
    frames = []
    for _, row in df.iterrows():
        result = pd.read_csv(row.Path, sep='\t')
        result.insert(0, 'Omic', omic)
        result.insert(1, 'Disease', row.Disease)
        frames.append(result)

    # pd.concat raises on an empty list, so handle "no files" explicitly
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)


# Rename result files in intermediate_results via IPython shell escapes: each
# loop moves every file whose name contains pQTL_<tissue> to the same name with
# pQTL_atlas_<tissue> substituted (brain, csf and plasma respectively).
# NOTE(review): once renamed, the glob patterns presumably no longer match, so
# re-running this cell would hand the literal pattern to mv — confirm before re-running.
!for f in /data/CARD_AA/projects/omicSynth/v8/intermediate_results/*pQTL_brain*; do mv "$f" "$(echo "$f" | sed s/pQTL_brain/pQTL_atlas_brain/)"; done
!for f in /data/CARD_AA/projects/omicSynth/v8/intermediate_results/*pQTL_csf*; do mv "$f" "$(echo "$f" | sed s/pQTL_csf/pQTL_atlas_csf/)"; done
!for f in /data/CARD_AA/projects/omicSynth/v8/intermediate_results/*pQTL_plasma*; do mv "$f" "$(echo "$f" | sed s/pQTL_plasma/pQTL_atlas_plasma/)"; done

### Get relevant SMR result file paths

In [3]:
# collect the GTEx result files for each tissue group of interest
gtex_brain, gtex_liver, gtex_nerve, gtex_muscle, gtex_blood = (
    glob.glob(pattern)
    for pattern in (
        '/../omicSynth/GTEx/Brain*',
        '/../omicSynth/GTEx//Liver*',
        '/../omicSynth/GTEx/Nerve*',
        '/../omicSynth/GTEx/Muscle*',
        '/../omicSynth/GTEx/*Blood*',
    )
)

gtex_list = gtex_brain + gtex_liver + gtex_nerve + gtex_blood + gtex_muscle

# short name = file name with its last two dot-separated suffixes removed;
# dict.fromkeys drops repeats while keeping the first-seen order
gtex = list(dict.fromkeys(
    found.split('/')[-1].rsplit('.', 2)[0] for found in gtex_list
))

# normalise the one tissue label that carries a hyphen
gtex = [
    label.replace('Brain_Spinal_cord_cervical_c-1', 'Brain_Spinal_cord_cervical_c1')
    for label in gtex
]

# disease codes used to select result files
ndd_list = ['AD', 'ALS', 'FTD', 'LBD', 'PD', 'PSP']

# add non-GTEx xQTL sources
ndd_omic = gtex + [
    'Cerebellum_metaBrain',
    'Spinalcord_metaBrain',
    'brain_eMeta',
    'Cortex_metaBrain',
    'Basalganglia_metaBrain',
    'Hippocampus_metaBrain',
    'blood_eQTLgen',
    'brain_mMeta',
    'blood_Bryois',
    'blood_mcrae',
    'psychEncode_prefrontal_cortex',
    'atlas_csf',
    'atlas_plasma',
    'atlas_brain',
    'multiancestry',
    'pQTL_brain',
    'pQTL_csf',
    'pQTL_plasma',
]

In [4]:
# get all file paths needed: list every entry directly inside the
# intermediate_results directory (filtered by substring further down)
msmr_files_main = glob.glob("/../omicSynth/intermediate_results/*")

In [5]:
# metabrain files: every listed path containing '.csv'
meta_files = list(filter(lambda p: '.csv' in p, msmr_files_main))

# all other msmr files: every listed path containing '.msmr'
msmr_files = list(filter(lambda p: '.msmr' in p, msmr_files_main))

# drop the unlifted metabrain msmr files and the old methylation files
# (updated csv files exist for the latter)
msmr_files_clean = [
    p for p in msmr_files
    if not ('metaBrain' in p or 'methylation' in p)
]

# final list of paths: csv files first, then the cleaned msmr files,
# leaving out the old FTD msmr files
final_msmr_paths = [
    p for p in meta_files + msmr_files_clean
    if 'FTDold' not in p
]

In [6]:
# only pull NDD relevant omics
# any() keeps each path at most once; appending inside a nested loop would add
# the same path again for every additional omic label it happens to contain
final_paths = [
    path for path in final_msmr_paths
    if any(label in path for label in ndd_omic)
]

In [7]:
# only pull NDD relevant diseases
# substring match on the whole path; any() keeps each path at most once even
# when more than one disease code occurs in it
final_paths2 = [
    path for path in final_paths
    if any(code in path for code in ndd_list)
]

### Create Central Data Frame

In [8]:
# extract disease names: the file-name part before the first underscore,
# de-duplicated in first-seen order
diseases = list(dict.fromkeys(
    p.rsplit('/')[-1].rsplit('_')[0] for p in final_paths2
))

all_df = pd.DataFrame()
data_type = []
disease = []
chr_num = []
omic = []
source = []
exclude = ['allChrs']
for file in final_paths2:
    name = file.rsplit('/')[-1]          # file name without directories
    stem = name.rsplit('.')[0]           # file name up to its first dot
    fields = name.split('_')
    is_pottier = 'pottier' in file

    # data type for each path (one field further right for 'pottier' files)
    data_type.append(fields[2] if is_pottier else fields[1])

    # extract disease name ('pottier' names keep everything left of the
    # last six underscores)
    disease.append(name.rsplit('_', 6)[0] if is_pottier else fields[0])

    # extract chromosome number, with 0 standing for the all-chromosome files;
    # 'lifted' files carry the chrN token one field earlier
    if any(tag in file for tag in exclude):
        chr_num.append(0)
    else:
        chr_field = stem.rsplit('_')[-2 if 'lifted' in file else -1]
        chr_num.append(int(chr_field.rsplit('chr')[-1]))

    # extract source name + omic label; the layout depends on the data source
    after_two = stem.split('_', 2)[-1]   # stem with its first two fields removed
    if 'metaBrain' in file:
        source.append(after_two.rsplit('_', 2)[0].split('_')[-2])
        omic.append(after_two.rsplit('_', 3)[0])
    elif 'GTEx' in file:
        source.append(file.rsplit('_SMR')[0].split('_')[-1])
        omic.append(name.split('_', 2)[-1].split('.')[0])
    elif "psychEncode" in file:
        trimmed = after_two.rsplit('_', 2)[0]
        source.append(trimmed.split("_")[0])
        omic.append(trimmed)
    elif 'pQTL' in file:
        tail = file.split("pQTL_")[-1]
        source.append(tail.split("_")[0])
        omic.append(tail.split("_SMR")[0])
    else:
        source.append(file.rsplit('_SMR')[0].split('_')[-1])
        omic.append(after_two.rsplit('_', 2)[0])

# one row per result file, columns added in a fixed order
for column, values in (
    ('Disease', disease),
    ('Chromosome', chr_num),
    ('Data', data_type),
    ('Path', final_paths2),
    ('Omic', omic),
    ('Source', source),
):
    all_df[column] = values

all_df = all_df.sort_values('Disease')

In [9]:
# count how many rows of all_df (one row per result file) carry each Omic label
all_df.Omic.value_counts()

Cerebellum_metaBrain                     132
Spinalcord_metaBrain                     132
Hippocampus_metaBrain                    132
Basalganglia_metaBrain                   132
blood_mcrae                              132
Cortex_metaBrain                         132
blood_eQTLgen                              6
Brain_Anterior_cingulate_cortex_BA24       6
blood_Bryois                               6
atlas_plasma                               6
brain_eMeta                                6
Brain_Putamen_basal_ganglia                6
Brain_Amygdala                             6
Brain_Hypothalamus                         6
Whole_Blood                                6
Brain_Cerebellum                           6
Liver                                      6
Brain_Hippocampus                          6
Brain_Substantia_nigra                     6
multiancestry                              6
Muscle_Skeletal                            6
Nerve_Tibial                               6
Brain_Caud

In [12]:
# keep only rows whose Disease is one of the codes in ndd_list
all_df = all_df[all_df['Disease'].isin(ndd_list)]

In [13]:
# extract unique sources
omics_list = list(all_df.Omic.unique())

# Build one frame per omic and concatenate once at the end: calling pd.concat
# inside the loop re-copies every previously loaded row on each iteration.
# The loop variable is deliberately not called `omic`, so the `omic` list
# built while parsing the paths is not overwritten.
omic_frames = []
for omic_name in omics_list:
    # rows of all_df (one per result file) that belong to this omic; a boolean
    # mask avoids interpolating the label into a query string
    tmp = all_df[all_df['Omic'] == omic_name]

    # read paths in tmp df and concat results into an omic specific df
    tmp_df = central_df(omic_name, tmp)

    # metaBrain results are restricted to this fixed set of columns
    if 'metaBrain' in omic_name:
        tmp_df = tmp_df[['Omic', 'Disease', 'probeID', 'ProbeChr', 'Gene', 'Probe_bp', 'topSNP',
       'topSNP_chr', 'topSNP_bp', 'A1', 'A2', 'Freq', 'b_GWAS', 'se_GWAS',
       'p_GWAS', 'b_eQTL', 'se_eQTL', 'p_eQTL', 'b_SMR', 'se_SMR', 'p_SMR', 'p_SMR_multi', 'p_HEIDI', 'nsnp_HEIDI']]

    omic_frames.append(tmp_df)

# pd.concat raises on an empty list, so fall back to an empty frame
centraldf = pd.concat(omic_frames) if omic_frames else pd.DataFrame()

Creating Cerebellum_metaBrain main df
Creating Basalganglia_metaBrain main df
Creating Spinalcord_metaBrain main df
Creating Hippocampus_metaBrain main df
Creating brain_mMeta main df
Creating blood_mcrae main df
Creating Cortex_metaBrain main df
Creating atlas_brain main df
Creating Brain_Frontal_Cortex_BA9 main df
Creating Brain_Cerebellar_Hemisphere main df
Creating psychEncode_prefrontal_cortex main df
Creating Brain_Cortex main df
Creating Brain_Caudate_basal_ganglia main df
Creating Nerve_Tibial main df
Creating Muscle_Skeletal main df
Creating Brain_Hippocampus main df
Creating multiancestry main df
Creating Brain_Substantia_nigra main df
Creating atlas_csf main df
Creating Brain_Hypothalamus main df
Creating Liver main df
Creating Brain_Anterior_cingulate_cortex_BA24 main df
Creating blood_Bryois main df
Creating atlas_plasma main df
Creating blood_eQTLgen main df
Creating Brain_Putamen_basal_ganglia main df
Creating Brain_Amygdala main df
Creating brain_eMeta main df
Creating 

In [None]:
# display the combined results frame
centraldf

### Format Central dataframe and make any needed manual annotations

In [15]:
# format df column types
centraldf = centraldf.astype({'p_GWAS': float, 'se_eQTL': float, 'p_eQTL': float, 'se_SMR': float, 'p_SMR': float, 'p_SMR_multi':float, 'p_HEIDI': float, 'nsnp_HEIDI': float})

# fill any missing fields in the gene column with the placeholder 'Novel_or_none'
centraldf['Gene'] = centraldf['Gene'].fillna('Novel_or_none')

# fill any other missing values with sentinel value -9999
centraldf.fillna(int(-9999), inplace = True)

# topRSID: for colon-delimited SNP ids keep the second-to-last field, otherwise
# keep the value unchanged. The isinstance check matters because the fillna
# above turns a missing topSNP into the integer -9999, and `':' in <int>`
# raises TypeError.
toprsid = [
    snp.rsplit(':')[-2] if isinstance(snp, str) and ':' in snp else snp
    for snp in centraldf['topSNP']
]

centraldf['topRSID'] = toprsid

In [16]:
# reformat some omic names (hyphens removed or replaced)
centraldf['Omic'] = centraldf['Omic'].replace({
    'Brain_Spinal_cord_cervical_c-1': 'Brain_Spinal_cord_cervical_c1',
    'Cells_EBV-transformed_lymphocytes': 'Cells_EBV_transformed_lymphocytes',
})

# cleaning: drop every row whose gene label contains the substring "HLA"
centraldf = centraldf[~centraldf['Gene'].str.contains("HLA")]

# fix gene names that are a list: collapse ';'-separated labels to their unique
# names joined by ','. dict.fromkeys keeps the first-seen order, so the result
# is the same on every run; joining a set gives an order that depends on
# per-process string hashing and is therefore not reproducible.
# NOTE(review): the manual annotation keys that follow are order-sensitive
# (some pairs are listed in both orders) — confirm they cover first-seen order.
tmp = [
    ','.join(dict.fromkeys(x.split(';'))) if ';' in x else x
    for x in centraldf['Gene']
]

centraldf['Gene'] = tmp

In [16]:
# manual annotations
centraldf = centraldf.replace({'MAPT,LOC100130148' : 'MAPT',
                        'LOC100128977,MAPT' : 'MAPT',
                        'DKFZp686A1627' : 'PHF2P1',
                        'LOC285456,RPL34' : 'RPL34-DT',
                        'FAM119B' : 'EEF1AKMT3', 
                        'FAM119B,METTL1' : 'METTL1',
                        'METTL1,FAM119B':'METTL1',
                        'FAM119B,TSFM': 'TSFM', 
                        'TSFM,FAM119B' : 'TSFM',
                        'FAM109A': 'PHETA1', 
                        'FAM195A':'MCRIP2',
                        'WDR90,FAM195A': 'WDR90',
                        'FAM64A' : 'PIMREG',
                        'FAM173A': 'ANTKMT',
                        'FAM195B,DYSFIP1' : 'DYSFIP1',
                        'FAM83G,SLC5A10' : 'SLC5A10',
                        'MIR1182,FAM89A': 'MIR1182',
                        'FAM66D,USP17L2' : 'USP17L2',
                        'PRUNE,FAM63A' : 'PRUNE',
                        'FAM63A,PRUNE' : 'PRUNE',
                        'FAM83A,LOC100131726' : 'MGC14128',
                        'FAM120AOS,FAM120A': 'KIAA0183',
                        'FAM120A,FAM120AOS' : 'KIAA0183',
                        'FAM196A,DOCK1' : 'DOCK1',
                        'C8orf12,FAM167A' : 'C8orf12',
                        'FAM167A,C8orf12' : 'C8orf12',
                        'FAM114A1,MIR574' : 'MIR574',
                        'FAM83E,SPACA4' : 'SPACA4',
                        'COX6B2,FAM71E2' : 'COX6B2',
                        'FAM71E2,COX6B2' : 'COX6B2',
                        'PIP5K1B,FAM122A' : 'PIP5K1B',
                        'FAM122A,PIP5K1B' : 'PIP5K1B',
                        'MFAP3,FAM114A2' : 'MFAP3',
                        'FAM114A2,MFAP3' : 'MFAP3',
                        'SCAMP3,FAM189B' : 'SCAMP3',
                        'FAM189B,SCAMP3' : 'SCAMP3',
                        'MIR921,FAM78B' : 'MIR921',
                        'FAM78B,MIR921' : 'MIR921',
                        'ACOT11,FAM151A' : 'ACOT11',
                        'LOC646471,FAM54B' : 'MTFR1L',
                        'FAM104A,C17orf80' : 'C17orf80',
                        'C17orf80,FAM104A' : 'C17orf80',
                        'FAM126B,NDUFB3' : 'NDUFB3',
                        'LOC150776,FAM128A' : 'MZT2A',
                        'FAM128B,SMPD4' : 'SMPD4',
                        'FAM196B,DOCK2' : 'DOCK2',
                        'MIR2277,FAM172A' : 'FAM172A',
                        'FAM45A,FAM45B' : 'DENND10',
                        'C22orf32,FAM109B' : 'C22orf32',
                        'FAM109B,C22orf32' : 'C22orf32',
                        'BET3L,FAM26D' : 'BET3L',
                        'FAM26E,BET3L' : 'CALHM5',
                        'FAM162A,CCDC58' : 'CCDC58',
                        'CCDC58,FAM162A' : 'CCDC58',
                        'SNX1,FAM96A' : 'SNX1',
                        'ANKS1B,FAM71C' : 'ANKS1B',
                        'LOC100233209,FAM113B' : 'PCED1B',
                        'FAM113A,VPS16' : 'VPS16',
                        'VPS16,FAM113A' : 'VPS16',
                        'CEP57,FAM76B' : 'CEP57',
                        'HCCA2,FAM99B' : 'HCCA2',
                        'FAM99B,HCCA2' : 'HCCA2',
                        'FAM99A,HCCA2' : 'HCCA2',
                        'C14orf86,FAM181A' : 'C14orf86',
                        'COQ6,FAM161B' : 'COQ6',
                        'FAM161B,COQ6' : 'COQ6',
                        'KLHL28,FAM179B' : 'KLHL28',
                        'CES2,FAM96B' : 'CES2',
                        'FAM96B,CES2' : 'CES2',
                        'MIR574,FAM114A1': 'MIR574',
                        'FAM192A,RSPRY1' : 'RSPRY1',
                        'FAM26D,BET3L' : 'CALHM4',
                        'FAM25G,FAM25B,FAM25C' : 'FAM25G',
                        'SLC5A10,FAM83G': 'SLC5A10',
                        'C22orf45,UPB1' : 'UPB1',
                        'C21orf122,ADARB1' : 'ADARB1',
                        'C22orf29,GNB1L' : 'GNB1L',
                        'C22orf46,NHP2L1' : 'NHP2L1',
                        'C21orf119,URB1' : 'URB1',
                        'KRTAP12-2,C21orf29' : 'KRTAP12-2',
                        'NCRNA00112,C21orf129' : 'NCRNA00112',
                        'KRTAP12-3,C21orf29' : 'KRTAP12-3',
                        'KRTAP12-4,C21orf29' : 'KRTAP12-4',
                        'UMODL1,C21orf128' : 'UMODL1',
                        'C21orf66,C21orf49' : 'PAXBP1',
                        'C21orf49,C21orf66' : 'PAXBP1',
                        'KRTAP12-1,C21orf29' : 'KRTAP12-1',
                        'C21orf29,KRTAP10-1' : 'KRTAP10-1',
                        'KRTAP10-3,C21orf29' : 'KRTAP10-3',
                        'KRTAP10-11,C21orf29' : 'KRTAP10-11',
                        'KRTAP10-10,C21orf29' : 'KRTAP10-10',
                        'C21orf49,C21orf62' : 'C21orf62',
                        'KRTAP10-4,C21orf29': 'KRTAP10-4',
                        'KRTAP10-6,C21orf29' : 'KRTAP10-6',
                        'KRTAP10-7,C21orf29' : 'KRTAP10-7',
                        'C21orf29,KRTAP10-8' : 'KRTAP10-8',
                        'KRTAP10-9,C21orf29' : 'KRTAP10-9',
                        'KRTAP10-12,C21orf29' : 'KRTAP10-12',
                        'C21orf70,C21orf67' : 'SLX9',
                        'C21orf57,MCM3AP' : 'MCM3AP',
                        'C20orf201,OPRL1' : 'OPRL1',
                        'KRTAP10-2,C21orf29' : 'KRTAP10-2',
                        'C21orf29,KRTAP10-5' : 'KRTAP10-5',
                        'MAK16,C8orf41' : 'MAK16',
                        'C11orf21,TSPAN32': 'TSPAN32',
                        'C6orf122,C6orf208' : 'C6orf122',
                        'CHRNE,C17orf107': 'CHRNE',
                        'GPR146,C7orf50' : 'GPR146',
                        'CTBP1,C4orf42' : 'CTBP1',
                        'C8orf56,BAALC' : 'BAALC',
                        'C7orf50,GPER' : 'GPER',
                        'TRIM61,C4orf39' : 'TRIM61',
                        'DHRS4,C14orf167' : 'DHRS4',
                        'C5orf55,EXOC3' : 'EXOC3',
                        'PRAC,C17orf93' : 'PRAC',
                        'MRPS18B,C6orf134' : 'MRPS18B',
                        'SURF4,C9orf96' : 'SURF4',
                        'C1orf86,PRKCZ' : 'PRKCZ',
                        'C6orf52,PAK1IP1' : 'PAK1IP1',
                        'C22orf26,LOC150381' : 'C22orf26',
                        'C20orf134,NECAB3' : 'NECAB3',
                        'C20orf200,C20orf166' : 'CRMA',
                        'C1orf77,S100A13' : 'CHTOP',
                        'IDI2,C10orf110' : 'IDI2',
                        'C8orf51,RHPN1' : 'RHPN1',
                        'C19orf73,PPFIA3' : 'PPFIA3',
                        'C17orf102,TMEM132E' : 'TMEM132E',        
                        'C20orf199,SNORD12C,ZNFX1' : 'ZNFX1',        
                        'C6orf114,GFOD1' : 'GFOD1',
                        'C6orf201,C6orf146' : 'TEX56P',
                        'C3orf30,IGSF11' : 'IGSF11',
                        'C6orf41,LOC100270746' : 'C6orf41',
                        'CRNKL1,C20orf26' : 'CRNKL1',
                        'C1orf126,TMEM51' : 'TMEM51',
                        'MIR1539,C18orf32' : 'MIR1539',
                        'NEBL,C10orf113' : 'NEBL',
                        'C17orf106,ACOX1' : 'ACOX1',
                        'C17orf46,LOC100133991,MAP3K14' : 'MAP3K14',
                        'C19orf30,MIR7-3' : 'MIR7-3',
                        'POMGNT1,C1orf190' : 'POMGNT1',
                        'C18orf16,AQP4' : 'AQP4',
                        'KTN1,C14orf33' : 'KTN1',
                        'C17orf90,CCDC137' : 'CCDC137',
                        'C13orf31,CCDC122' : 'CCDC122',
                        'GUSBL1,C6orf41' : 'GUSBL1',
                        'C6orf26,MSH5' : 'MSH5',
                        'C9orf24,C9orf25' : 'C9orf24',
                        'GGT1,C22orf36' : 'GGT1',
                        'C8orf40,SLC20A2' : 'SLC20A2',
                        'C1orf86,LOC100128003' : 'C1orf86',
                        'C18orf18,LOC339290' : 'C18orf18',
                        'C4orf10,NOP14': 'NOP14',
                        'C10orf10,RASSF4' : 'RASSF4',
                        'GJB7,C6orf162' : 'GJB7',
                        'C7orf50,MIR339' : 'MIR339',
                        'C9orf68,PPAPDC2': 'PPAPDC2',
                        'C1orf105,PIGC' : 'PIGC',
                        'C1orf174,LOC100133612' : 'C1orf174',
                        'C20orf166,MIR133A2' : 'MIR133A2',
                        'MGC14436,C12orf34' : 'MGC14436',
                        'C11orf92,C11orf93' : 'C11orf92',
                        'C11orf36,MRGPRG' :'MRGPRG',
                        'ST5,C11orf17' : 'ST5',
                        'C10orf116,AGAP11' : 'AGAP11',
                        'KCNJ5,C11orf45': 'KCNJ5',
                        'SNW1,C14orf178': 'SNW1',
                        'C19orf76,CPT1C': 'CPT1C',
                        'C19orf40,CCDC123': 'CCDC123',
                        'SERTAD4,C1orf133': 'SERTAD4',
                        'DDX20,C1orf183': 'DDX20',
                        'ST20,C15orf37': 'ST20',
                        'GHRLOS,C3orf42': 'GHRLOS',
                        'C19orf34,CSNK1G2': 'CSNK1G2',
                        'C1orf194,KIAA1324': 'KIAA1324',
                        'C19orf71,C19orf28': 'C19orf71',
                        'CDH23,C10orf54': 'CDH23',
                        'RMND1,C6orf211': 'RMND1',
                        'C1orf94,CSMD2': 'CSMD2',
                        'CALR3,C19orf44': 'CALR3',
                        'C1orf125,NPHS2': 'NPHS2',
                        'RNMT,C18orf19' : 'RNMT',
                        'PIK3CD,C1orf200': 'PIK3CD',
                        'MIR548G,FILIP1L,C3orf26' : 'MIR548G',
                        'ASB16,C17orf65' : 'ASB16',
                        'C11orf83,C11orf48': 'UQCC3',
                        'YIF1B,C19orf33': 'YIF1B',
                        'C6orf48,SNORD52': 'SNORD52',
                        'CDH23,C10orf105': 'CDH23',
                        'C19orf55,HSPB6': 'HSPB6',
                        'CCT3,C1orf182': 'CCT3',
                        'C14orf169,HEATR4': 'HEATR4',
                        'C1orf109,CDCA8': 'CDCA8',
                        'SNRPA,C19orf54': 'SNRPA',
                        'YWHAH,C22orf24': 'YWHAH',
                        'C1orf204,VSIG8': 'VSIG8',
                        'C10orf25,ZNF22': 'ZNF2',
                        'C10orf79,MIR609': 'MIR609',
                        'C2orf63,RPS27A': 'RPS27A',
                        'MED31,C17orf100': 'MED31',
                        'PSORS1C1,C6orf15': 'PSORS1C',
                        'C19orf48,SNORD88C' : 'SNORD88C',
                        'GCNT7,C20orf43' : 'GCNT7',
                        'GNPAT,C1orf131' : 'GNPAT',
                        'TSGA10,C2orf15' : 'TSGA10',
                        'GLIS3,C9orf70' : 'GLIS3',
                        'FUT7,C9orf139' : 'FUT7',
                        'USP20,C9orf78' : 'USP20',
                        'C17orf86,SCARNA16' : 'SCARNA16',
                        'BTG4,C11orf88,MIR34C' : 'BTG4',
                        'CHMP1A,C16orf55' : 'CHMP1A',
                        'KIAA1328,C18orf10' : 'KIAA1328',
                        'NOP14,C4orf10' : 'NOP14', 
                        'WDFY3,C4orf12' : 'WDFY3',
                        'C17orf72,ICAM2' : 'ICAM2',
                        'CENPN,C16orf61': 'CENPN',
                        'C14orf159,SNORA11B': 'SNORA11B',
                        'DOCK8,C9orf66': 'DOCK8',
                        'C19orf48,SNORD88B': 'SNORD88B',
                        'C4orf10,MFSD10': 'MFSD10',
                        'C9orf6,IKBKAP': 'IKBKAP',
                        'PLD3,C19orf47': 'PLD3',
                        'MVP,C16orf53': 'MVP',
                        'C2orf77,KLHL23,PHOSPHO2': 'KLHL23',
                        'C13orf34,C13orf37': 'MZT1',
                        'C17orf101,HEXDC': 'HEXDC',
                        'S1PR3,C9orf47': 'S1PR3',
                        'CC2D1A,C19orf57': 'CC2D1A',
                        'C5orf36,MIR1974': 'MIR1974',
                        'C7orf29,LRRC61': 'LRRC61',
                        'FOXL2,C3orf72': 'FOXL2',
                        'C17orf77,CD300LD': 'CD300LD',
                        'C15orf56,PAK6': 'PAK6',
                        'C1orf43,UBAP2L': 'UBAP2L',
                        'C20orf166,MIR1-1': 'MIR1-1',
                        'SNORD48,C6orf48': 'SNORD48',
                        'C1orf70,SSU72': 'SSU72',
                        'C17orf105,DUSP3': 'DUSP3',
                        'C1orf213,ZNF436': 'ZNF436',
                        'C20orf199,MIR1259,SNORD12,SNORD12B': 'MIR1259',
                        'C7orf13,RNF32': 'RNF32',
                        'ABCA2,C9orf139': 'ABCA2',
                        'C18orf56,TYMS': 'TYMS',
                        'DBI,C2orf76': 'DBI',
                        'C9orf110,C9orf109': 'C9orf110',
                        'C12orf32,FOXM1': 'FOXM1',
                        'C14orf45,ENTPD5': 'ENTPD5',
                        'ZNF276,C16orf7': 'ZNF276',
                        'TTLL5,C14orf1': 'TTLL5',
                        'C3orf62,USP4': 'USP4',
                        'C1orf84,MED8': 'MED8',
                        'WBP11,C12orf60': 'WBP11',
                        'C10orf55,PLAU': 'PLAU',
                        'C4orf14,POLR2B': 'POLR2B',
                        'C12orf49,RNFT2': 'RNFT2',
                        'GAS8,C16orf3': 'GAS8',
                        'C9orf7,ADAMTS13': 'ADAMTS13',
                        'C17orf46,LOC100133991': 'SPATA32',
                        'C11orf54,TAF1D': 'TAFD1',
                        'C10orf57,LOC219347': 'C10orf57',
                        'RHPN1,C8orf51': 'RHPN1',
                        'C20orf141,LOC100288797': 'C20orf141',
                        'WWC2,C4orf38': 'WWC2',
                        'C1orf187,MAD2L2': 'MAD2L2',
                        'C15orf28,ANP32A': 'ANP32A',
                        'DKFZP434K028,C11orf9': 'MYRF',
                        'C16orf79,PGP': 'PGP',
                        'SCLT1,C4orf33': 'SCLT1',
                        'C15orf33,FGF7': 'FGF7',
                        'SNX5,C20orf72': 'SNX5',
                        'C11orf45,KCNJ5': 'KCNJ5',
                        'MIR185,C22orf25': 'MIR185',
                        'TM7SF2,C11orf2': 'TM7SF2',
                        'MRPL43,C10orf2': 'MRPL43',
                        'C17orf49,RNASEK': 'RNASEK',
                        'ZNF219,C14orf176': 'ZNF219',
                        'C20orf199,SNORD12': 'SNORD12',
                        'CIB1,C15orf58': 'CIBI',
                        'C17orf76,NCRNA00188': 'LRRC75A',
                        'DIP2C,C10orf108': 'DIP2C',
                        'C6orf204,BRD7P3': 'BRD7P3',
                        'MIR22,C17orf91': 'MIR22',
                        'FILIP1L,C3orf26': 'FILPIP1L',
                        'C2orf47,C2orf60': 'MAIP1',
                        'C5orf44,TRIM23': 'TRIM23',
                        'C11orf68,DRAP1': 'DRAP1',
                        'GNPTG,C16orf42': 'GNPTG',
                        'C1orf228,TMEM53': 'TMEM53',
                        'C9orf163,SEC16A': 'SEC16A',
                        'C16orf73,FAHD1': 'FAHD1',
                        'BAG5,C14orf153': 'BAG5',
                        'POLE3,C9orf43': 'POLE3',
                        'SEC16A,C9orf163': 'SEC16A',
                        'WDR27,C6orf120': 'WDR27',
                        'TDH,C8orf12': 'TDH',
                        'C16orf86,C16orf48': 'ENKD1',
                        'C7orf54,SND1': 'SND1',
                        'C2orf28,SLC5A6': 'SLC5A6',
                        'ZNF252,C8orf77': 'ZNF252',
                        'MIR548G,C3orf26': 'MIR548G',
                        'KIAA0408,C6orf174': 'KIAA0408',
                        'RSF1,C11orf67': 'RSF1',
                        'PRRT2,C16orf53': 'PRRT2',
                        'RNASEK,C17orf49': 'RNASEK',
                        'C14orf21,DHRS1': 'DHRS1',
                        'TYMS,C18orf56': 'TYMS',
                        'C1orf226,C1orf111': 'SPATA46',
                        'C1orf126,KIAA1026': 'KIAA1026',
                        'TMEM38A,C19orf42': 'TMEM38A',
                        'C1orf43,C1orf189': 'CFAP141',
                        'C9orf95,OSTF1': 'OSTF1',
                        'C5orf22,RNASEN': 'RNASEN',
                        'LY6G6C,C6orf25': 'LY6G6C',
                        'PIGX,C3orf34': 'PIGX',
                        'DULLARD,C17orf81': 'DULLARD',
                        'C3orf51,ERC2': 'ERC2',
                        'C12orf26,CCDC59': 'CCDC59',
                        'C9orf129,WNK2': 'WNK2',
                        'C6orf182,SESN1': 'SESN1',
                        'C11orf10,MIR611,FEN1': 'FEN1',
                        'YRDC,C1orf122': 'YRDC',
                        'C7orf40,SNORA9': 'SNORA9',
                        'C19orf24,CIRBP': 'CIRBP',
                        'CEBPZ,C2orf56': 'CEBPZ',
                        'RPP38,C10orf111': 'RPP38',
                        'IQCK,C16orf88': 'IQCK',
                        'TUBE1,C6orf225': 'TUBE1',
                        'C17orf81,DULLARD': 'DULLARD',
                        'LOC100272146,C17orf57': 'EFCAB13',
                        'C20orf165,NEURL2': 'NEURL2',
                        'MMEL1,C1orf93': 'MMEL1',
                        'LOC441177,C6orf176': 'LOC441177',
                        'ISG20L2,C1orf66': 'ISG20L2',
                        'DNAI1,C9orf25': 'DNAI1',
                        'C9orf116,MRPS2': 'MRPS2',
                        'BAIAP3,C16orf42': 'BAIAP3',
                        'C14orf43,PNMA1': 'PNMA1',
                        'C5orf39,LOC153684': 'ANXA2R',
                        'JKAMP,C14orf149': 'JKAMP',
                        'C9orf45,MIR600': 'MIR600',
                        'C6orf81,LOC285847': 'ARMC12',
                        'C17orf106,CDK3': 'CDK3',
                        'MIR1259,SNORD12C,SNORD12B,C20orf199,ZNFX1': 'MIR1259',
                        'C19orf76,PRMT1': 'PRMT1',
                        'C19orf23,CIRBP': 'CIRBP',
                        'C20orf177,PPP1R3D': 'PPP1R3D',
                        'C17orf89,C17orf56': 'C17orf89',
                        'C1orf58,AIDA': 'AIDA',
                        'MAPKAPK5,C12orf47': 'MAPKAPK5',
                        'MKKS,C20orf94': 'MKKS',
                        'C6orf167,MIR548H3': 'MIR548H3',
                        'C16orf57,ZNF319': 'ZNF319',
                        'C14orf156,ALKBH1': 'ALKBH1',
                        'C11orf31,TMX2': 'TMX2',
                        'C10orf108,DIP2C': 'DIP2C',
                        'KIAA1984,LOC100131193,C9orf86': 'KIAA1984',
                        'C11orf71,RBM7': 'RBM7',
                        'C2orf34,PREPL': 'PREPL',
                        'SNORA9,C7orf40': 'SNORA9',
                        'C3orf47,H1FX': 'H1FX',
                        'C6orf153,KLHDC3': 'KLHDC3',
                        'C3orf24,FANCD2': 'FANCD2',
                        'TNNI3,C19orf51': 'TNNI3',
                        'C1orf156,C1orf112': 'METTL18',
                        'MIR1259,SNORD12B,SNORD12,C20orf199,ZNFX1': 'MIR1259',
                        'C17orf69,MGC57346': ',MGC5734',
                        'C5orf40,CYFIP2': 'CYFIP2',
                        'MIR22,C17orf91,WDR81': 'MIR22',
                        'C12orf24,GPN3': 'GPN3',
                        'C17orf48,SCO1': 'SCO1',
                        'MIR1910,C16orf74': 'MIR1910',
                        'MORG1,C19orf56': 'MORG1',
                        'ESF1,C20orf7': 'ESF1',
                        'C12orf69,C12orf60': 'SMCO3',
                        'DDX54,C12orf52': 'DDX54',
                        'C12orf10,PFDN5': 'PFDN5',
                        'C11orf57,PIH1D2': 'PIH1D2',
                        'C11orf20,KCNK4': 'KCNK4',
                        'C3orf52,MIR567': 'MIR567',
                        'MIR32,C9orf5': 'MIR32',
                        'C9orf102,C9orf130': 'ERCC6L2',
                        'MIR24-1,C9orf3': 'MIR24-1',
                        'C1orf50,LEPRE1': 'LEPRE1',
                        'TSPAN32,C11orf21': 'TSPAN32',
                        'C6orf208,C6orf122': 'LINC00574',
                        'UPB1,C22orf45': 'UPB1',
                        'C17orf107,CHRNE': 'CHRNE',
                        'C7orf50,GPR146': 'GPR146',
                        'C4orf39,TRIM61': 'TRIM61',
                        'C21orf128,UMODL1': 'UMODL1',
                        'EXOC3,C5orf55': 'EXOC3',
                        'C20orf166,C20orf200': 'CRMA',
                        'NECAB3,C20orf134': 'NECAB3',
                        'LOC150381,C22orf26': 'C22orf26',
                        'OPRL1,C20orf201': 'OPRL1',
                        'C10orf110,IDI2': 'IDI2',
                        'IGSF11,C3orf30': 'IGSF11',
                        'LOC100270746,C6orf41': 'LINC00240',
                        'C20orf26,CRNKL1': 'CRNKL1',
                        'TMEM51,C1orf126': 'TMEM51',
                        'CCDC122,C13orf31': 'CCDC122',
                        'ADARB1,C21orf122': 'ADARB1',
                        'LOC100133612,C1orf174': 'C1orf174',
                        'MIR133A2,C20orf166': 'MIR133A2',
                        'PIGC,C1orf105': 'PIGC',
                        'C12orf34,MGC14436': 'MGC14436',
                        'MRGPRG,C11orf36': 'MRGPRG',
                        'AQP4,C18orf16': 'AQP4',
                        'AGAP11,C10orf116': 'AGAP11',
                        'CPT1C,C19orf76': 'CPT1C',
                        'C1orf133,SERTAD4': 'SERTAD4',
                        'C15orf37,ST20': 'ST20',
                        'C10orf54,CDH23': 'CDH23',
                        'PPAPDC2,C9orf68': 'PPAPDC2',
                        'C6orf211,RMND1': 'RMND1',
                        'C19orf44,CALR3': 'CALR3',
                        'C11orf48,C11orf83': 'UQCC3',
                        'FILIP1L,C3orf26,MIR548G': 'FILIP1L',
                        'C19orf33,YIF1B': 'YIF1B',
                        'C10orf105,CDH23': 'CDH23',
                        'C14orf167,DHRS4': 'DHRS4',
                        'C1orf182,CCT3': 'CCT3',
                        'C19orf54,SNRPA': 'SNRPA',
                        'C22orf24,YWHAH': 'YWAH',
                        'MIR609,C10orf79': 'MIR609',
                        'C17orf100,MED31': 'MED31',
                        'C9orf139,FUT7': 'FUT7',
                        'C6orf15,PSORS1C1': 'PSORS1C1',
                        'C21orf129,NCRNA00112': 'NCRNA00112',
                        'MIR34C,C11orf88,BTG4': 'MIR34C',
                        'SNORA11B,C14orf159':'SNORA11B',
                        'C9orf66,DOCK8': 'DOCK8',
                        'MFSD10,C4orf10': 'MFSD10',
                        'C21orf29,KRTAP12-4': 'KRTAP12-4',
                        'PHOSPHO2,KLHL23,C2orf77': 'PHOSPHO2',
                        'C13orf37,C13orf34': 'MZT1',
                        'C21orf29,KRTAP10-11': 'KRTAP10-11',
                        'LRRC61,C7orf29': 'LRRC61',
                        'MIR1974,C5orf36': 'MIR1974',
                        'CD300LD,C17orf77': 'CD300LD',
                        'C21orf29,KRTAP10-7': 'KRTAP10-7',
                        'C6orf48,SNORD48': 'SNORD48',
                        'SSU72,C1orf70': 'SSU72',
                        'DUSP3,C17orf105': 'DUSP3',
                        'KRTAP10-1,C21orf29': 'KRTAP10-1',
                        'C17orf65,ASB16': 'ASB16',
                        'URB1,C21orf119': 'URB1',
                        'C9orf109,C9orf110': 'FAM225A',
                        'C9orf139,ABCA2': 'ABCA2',
                        'FOXM1,C12orf32': 'FOXM1',
                        'RNF32,C7orf13': 'RNF32',
                        'C2orf76,DBI': 'DBI',
                        'ENTPD5,C14orf45': 'ENTPD5',
                        'POLR2B,C4orf14': 'POLR2B',
                        'PLAU,C10orf55': 'PLAU',
                        'C16orf3,GAS8': 'GAS8',
                        'TAF1D,C11orf54': 'TAF1D',
                        'LOC100288797,C20orf141': 'C20orf141',
                        'ANP32A,C15orf28': 'ANP32A',
                        'C11orf9,DKFZP434K028': 'MYRF',
                        'FGF7,C15orf33': 'FGF7',
                        'PGP,C16orf79': 'PGP',
                        'C14orf176,ZNF219': 'ZNF219',
                        'HEXDC,C17orf101': 'HEXDC',
                        'LOC100133991,C17orf46': 'SPATA32',
                        'C2orf60,C2orf47': 'TYW5',
                        'C17orf91,MIR22': 'MIRR22',
                        'DRAP1,C11orf68': 'DRAP1',
                        'KRTAP10-8,C21orf29': 'KRTAP10-8',
                        'C16orf48,C16orf86': 'ENKD1',
                        'C6orf120,WDR27': 'WDR27',
                        'C14orf153,BAG5': 'BAG5',
                        'SLC5A6,C2orf28': 'SLC5A6',
                        'C6orf174,KIAA0408': 'KIAA0408',
                        'C3orf26,MIR548G': 'MIR548G',
                        'C11orf67,RSF1': 'RSF1',
                        'C8orf77,ZNF252': 'ZNF252',
                        'C3orf34,PIGX': 'PIGX',
                        'WNK2,C9orf129': 'WNK2',
                        'RNASEN,C5orf22': 'RNASEN',
                        'CCDC59,C12orf26': 'CCDC59',
                        'C1orf111,C1orf226': 'SPATA46',
                        'C20orf72,SNX5': 'SNX5',
                        'ERC2,C3orf51': 'ERC2',
                        'DHRS1,C14orf21': 'DHRS1',
                        'C6orf25,LY6G6C': 'LY6G6C',
                        'SESN1,C6orf182': 'SESN1',
                        'RNFT2,C12orf49': 'RNFT2',
                        'C16orf88,IQCK': 'IQCK',
                        'C6orf225,TUBE1': 'TUBE1',
                        'CIRBP,C19orf24': 'CIRBP',
                        'C17orf57,LOC100272146': 'EFCAB13',
                        'C9orf25,DNAI1':'DNAI1',
                        'C1orf93,MMEL1': 'MMEL1',
                        'NHP2L1,C22orf46': 'NHP2L1',
                        'FAM181A,C14orf86': 'FAM181A',
                        'LOC285847,C6orf81': 'ARMC12',
                        'PRMT1,C19orf76': 'PRMT1',
                        'PPP1R3D,C20orf177': 'PPP1R3D',
                        'SNORD12C,C20orf199,ZNFX1,MIR1259,SNORD12B': 'SNORD12C',
                        'AIDA,C1orf58': 'AIDA',
                        'ALKBH1,C14orf156': 'ALKBH1',
                        'RBM7,C11orf71': 'RBM7',
                        'C9orf86,LOC100131193,KIAA1984': 'KIAA1984',
                        'CYFIP2,C5orf40': 'CYFIP2',
                        'FANCD2,C3orf24': 'FANCD2',
                        'C1orf112,C1orf156': 'METTL18',
                        'C12orf47,MAPKAPK5': 'MAPKAPK5',
                        'SNORD12,C20orf199,ZNFX1,MIR1259,SNORD12B': 'SNORD12',
                        'C9orf5,MIR32': 'MIR32',
                        'SCO1,C17orf48': 'SCO1',
                        'C16orf74,MIR1910': 'MIR1910',
                        'ICAM2,C17orf72': 'ICAM2',
                        'C12orf52,DDX54': 'DDX54',
                        'PIH1D2,C11orf57': 'PIH1D2',
                        'C18orf10,KIAA1328': 'KIAA1328',
                        'C20orf7,ESF1': 'ESF1',
                        'C17orf91,WDR81,MIR22': 'WDR81',
                        'NPHS2,C1orf125': 'NPHS2',
                        'C6orf41,GUSBL1': 'GUSBL1',
                        'C22orf36,GGT1': 'GGT1',
                        'C10orf113,NEBL': 'NEBL',
                        'ACOX1,C17orf106': 'ACOX1',
                        'TMEM53,C1orf228': 'TMEM53',
                        'LOC100133991,MAP3K14,C17orf46': 'MAP3K14',
                        'MIR7-3,C19orf30': 'MIR4-3',
                        'PPFIA3,C19orf73': 'PPFIA3',
                        'C21orf29,KRTAP10-2': 'KRTAP10-2',
                        'GFOD1,C6orf114': 'GFOD1',
                        'C17orf93,PRAC': 'PRAC',
                        'KRTAP10-5,C21orf29': 'KRTAP10-5',
                        'GPER,C7orf50': 'GPER',
                        'S100A13,C1orf77': 'S100A13',
                        'C14orf33,KTN1': 'KTN1',
                        'RASSF4,C10orf10': 'RASSF4',
                        'MIR339,C7orf50': 'MIR339',
                        'C11orf93,C11orf92': 'AKIP1',
                        'C11orf17,ST5': 'ST5',
                        'C1orf183,DDX20': 'DDX20',
                        'C3orf42,GHRLOS': 'GHRLOS',
                        'KIAA1324,C1orf194': 'KIAA1324',
                        'C19orf28,C19orf71': 'MFSD12',
                        'CSMD2,C1orf94': 'CSMD2',
                        'C21orf29,KRTAP12-1': 'KRTAP12-1',
                        'MIR548G,C3orf26,FILIP1L': 'MIR548G',
                        'HSPB6,C19orf55': 'HSPB6',
                        'CDCA8,C1orf109': 'CDCA8',
                        'ZNF22,C10orf25': 'ZNF22',
                        'MIR34C,BTG4,C11orf88': 'BTG4',
                        'C16orf55,CHMP1A': 'CHMP1A',
                        'C16orf61,CENPN': 'CENPN',
                        'C2orf77,PHOSPHO2,KLHL23': 'PHOSPHO2',
                        'C19orf57,CC2D1A': 'CC2DIA',
                        'UBAP2L,C1orf43': 'UBAP2L',
                        'MIR1-1,C20orf166': 'MIR1-1',
                        'ZNF436,C1orf213': 'ZNF436',
                        'SNORD12B,SNORD12,C20orf199,MIR1259': 'SNORD12B',
                        'C14orf1,TTLL5': 'TTLL5',
                        'USP4,C3orf62': 'USP4',
                        'C12orf60,WBP11': 'WBP11',
                        'ADAMTS13,C9orf7': 'ADAMTS13',
                        'MAD2L2,C1orf187': 'MAD2L2',
                        'SNORD12,C20orf199': 'SNORD12',
                        'NCRNA00188,C17orf76': 'SNGH29',
                        'C15orf58,CIB1': 'CIB1',
                        'BRD7P3,C6orf204': 'BRD7P3',
                        'C3orf26,FILIP1L': 'FILIP1L',
                        'C16orf42,GNPTG': 'GNPTG',
                        'C9orf43,POLE3': 'POLE3',
                        'FAHD1,C16orf73': 'FAHD1',
                        'MCM3AP,C21orf57': 'MCM3AP',
                        'C21orf62,C21orf49': 'C21orf62',
                        'KIAA1026,C1orf126': 'KIAA1026',
                        'C1orf189,C1orf43': 'CFAP141',
                        'GNB1L,C22orf29': 'GNB1L',
                        'C18orf19,RNMT': 'RNMT',
                        'C8orf41,MAK16': 'MAK16',
                        'FEN1,MIR611,C11orf10': 'FEN1',
                        'C1orf122,YRDC': 'YRDC',
                        'C21orf67,C21orf70': 'SLX9',
                        'C6orf176,LOC441177': 'C6orf176',
                        'C1orf66,ISG20L2': 'ISG20L2',
                        'C16orf42,BAIAP3': 'BAIAP3',
                        'LOC339290,C18orf18': 'LINC00256',
                        'PNMA1,C14orf43': 'PNMA1',
                        'LOC153684,C5orf39': 'ANXA2R',
                        'C14orf149,JKAMP': 'JKAMP',
                        'MIR600,C9orf45': 'MIR600',
                        'SNORD12B,C20orf199,ZNFX1,MIR1259,SNORD12C': 'SNORD12B',
                        'C20orf94,MKKS': 'MKKS',
                        'MIR548H3,C6orf167': 'MIR548H3',
                        'ZNF319,C16orf57': 'ZNF319',
                        'C19orf51,TNNI3': 'TNNI3',
                        'MGC57346,C17orf69': 'MGC57346',
                        'SNORD12B,C20orf199,SNORD12,ZNFX1,MIR1259': 'SNORD12B',
                        'H1FX,C3orf47': 'H1FX',
                        'C9orf130,C9orf102': 'ERCC6L2',
                        'C21orf29,KRTAP10-12': 'KRTAP10-12',
                        'KCNK4,C11orf20': 'KCNK4',
                        'MIR567,C3orf52': 'MIR567',
                        'C9orf70,GLIS3': 'GLIS3',
                        'MSH5,C6orf26': 'MSH5',
                        'C9orf25,C9orf24': 'FAM219A',
                        'C18orf32,MIR1539': 'MIR1539',
                        'SNORD12C,C20orf199,ZNFX1': 'SNORD12C',
                        'TMEM132E,C17orf102': 'TMEM132E',
                        'C6orf134,MRPS18B': 'MRPS18B',
                        'C9orf96,SURF4': 'SURF4',
                        'PAK1IP1,C6orf52': 'PAK1IP1',
                      'LOC100130148,MAPT': 'MAPT', 'MAPT,LOC100128977' : 'MAPT', 'NNAT,BLCAP' : 'NNAT',
                       'TCF19,CCHCR1' : 'TCF19', 'TAPBP,ZBTB22' : 'TAPBP',
                       'LOC84740,AFAP1' : 'AFAP1', 'SLC22A18,SLC22A18AS': 'SLC22A18',
                       'IFT140,TMEM204' : 'IFT140', 'CHKB-CPT1B,CPT1B' : 'CHKB-CPT1B', 
                       'GNASAS,GNAS' : 'GNASAS','MATN4,RBPJL' : 'MATN4',
                       'TYW3,CRYZ' : 'TYW3', 'IGF2,INS-IGF2' : 'IGF2',
                       'LOC100170939,LOC653391': 'GUSBP14', 'RPPH1,PARP2' : 'RPPH1',
                       'AAMP,PNKD' : 'AAMP', 'SNAR-B2,SNAR-B1' : 'SNAR-B2', 
                       'IPCEF1,OPRM1' : 'IPCEF1,OPRM1',
                       'MIR1224,VWA5B2': 'MIR1224',
                         'STAG3,GPC2': 'STAG3',
                         'CDSN,PSORS1C1':'CDSN',
                         'PSORS1C2,PSORS1C1': 'PSORS1C1',
                         'PIGL,NCOR1': 'PIGL',
                         'DYDC1,DYDC2': 'DYDC1',
                         'TAP1,PSMB9': 'PSMB9',
                         'FBXL19,NCRNA00095': 'FBXL19',
                         'ZKSCAN3,ZNF323': 'ZSCAN3',
                         'NDUFS2,ADAMTS4': 'NDUFS2',
                         'ADAM15,EFNA4': 'ADAM15',
                         'FAM76B,CEP57': 'CEP57',
                         'GPSM3,PBX2': 'GPSM3',
                         'SETD1A,HSD3B7': 'HSD3B7',
                         'SLC25A11,RNF167': 'SLC25A11',
                         'TPCN1,IQCD': 'TPCN1',
                         'GTF2H4,VARS2': 'GTF2H4',
                         'HCG18,TRIM39': 'HCG18',
                         'ADAT3,SCAMP4': 'ADAT3',
                         'RAD51C,TEX14': 'RAD51C',
                         'LOC100128977,IMP5': 'IMP5',
                         'NRBP1,KRTCAP3': 'NRBP1',
                         'TRIM72,PYDC1': 'TRIM72',
                         'HIRIP3,INO80E': 'INO80E',
                         'SLC39A7,RXRB': 'RXRB',
                         'RNF5,RNF5P1,AGPAT1': 'RNF5',
                         'BAT4,CSNK2B': 'BAT4',
                         'MEPCE,ZCWPW1': 'MEPCE',
                         'NFKBIL1,ATP6V1G2': 'NFKBIL1',
                         'AKT3,SDCCAG8': 'AKT3',
                         'BCL7C,MIR762': 'BCL7C',
                         'IER3,FLOT1': 'IER3',
                         'SLC16A6,ARSG': 'SLC16A6',
                         'CCDC47,DDX42': 'CCDC47',
                         'PILRB,PMS2L1': 'PILRB',
                         'PFDN2,NIT1': 'PFDN2',
                         'CCR10,CNTNAP1': 'CCR10',
                         'SEMA4G,MIR608': 'SEMA4G',
                         'ALS2CR8,WDR12': 'ALS2CR8',
                         'LACTB2,XKR9': 'LACTB2',
                         'INCA1,CAMTA2': 'INCA1',
                         'STH,MAPT': 'MAPT',
                         'GNGT2,ABI3': 'GNGT2',
                         'SKA2,MIR301A': 'SKA2',
                         'MIR1909,REXO1': 'MIR1909',
                         'ERCC2,KLC3': 'ERCC2',
                         'SLC26A1,IDUA': 'SLC26A1',
                         'EFCAB5,SSH2': 'EFCAB5',
                         'CDIPT,LOC440356': 'CDIPT',
                         'SEZ6L2,ASPHD1': 'SEZ6L2',
                         'CORO1A,LOC606724': 'CORO1A',
                         'CPSF3,ITGB1BP1': 'CPSF3',
                         'INHA,OBSL1': 'INHA',
                         'NDUFS2,FCER1G': 'NDUFS2',
                         'TSNAX-DISC1,DISC1': 'TSNAX-DISC1',
                         'INCA1,KIF1C': 'INCA1',
                         'LOC100132724,AP4E1': 'AP4E1',
                       'DOC2A,INO80E': 'INO80E', 'LOC100130148,MAPT,LOC100128977' : 'MAPT',
                       'TNXB,CYP21A2' : 'TNXB', 
                       'BLCAP,NNAT' : 'BLCAP',
                        'AFAP1,LOC84740' : 'AFAP1',
                        'ZBTB22,TAPBP' : 'ZBTB22',
                        'RIBC2,SMC1B' : 'RIBC2',
                        'CPT1B,CHKB-CPT1B' : 'CPT1B',
                        'LOC100130148,LOC100128977,MAPT': 'MAPT',
                         'VWA5B2,MIR1224': 'VWA5B2',
                         'GPC2,STAG3': 'STAG3',
                         'PSORS1C1,CDSN': 'PSORS1C1',
                         'PSORS1C1,PSORS1C2': 'PSORS1C1',
                         'NCOR1,PIGL': 'NCOR1',
                         'PSMB9,TAP1': 'PSMB9',
                         'CCHCR1,TCF19': 'CCHCR1',
                         'KRTCAP3,NRBP1': 'KRTCAP3',
                         'SCAMP4,ADAT3': 'SCAMP4',
                         'HSD3B7,SETD1A': 'HSD3B7',
                         'RNF167,SLC25A11': 'RNF167',
                         'VARS2,GTF2H4': 'VARS2',
                         'IMP5,LOC100128977': 'IMP5',
                         'TEX14,RAD51C': 'TEX14',
                         'KLC3,ERCC2': 'ERCC2',
                         'DDX42,CCDC47': 'DDX42',
                         'PMS2L1,PILRB': 'PMS2L1',
                         'RXRB,SLC39A7': 'RXRB',
                         'CSNK2B,BAT4': 'CSNK2B',
                         'ATP6V1G2,NFKBIL1': 'ATP6V1G2',
                         'ABI3,GNGT2': 'AB13',
                         'CNTNAP1,CCR10': 'CNTNAP1',
                         'MIR608,SEMA4G': 'MIR608',
                         'SSH2,EFCAB5': 'SSH2',
                         'MIR301A,SKA2': 'MIR301A',
                         'AP4E1,LOC100132724': 'AP4E1',
                         'IQCD,TPCN1': 'IQCD',
                         'ITGB1BP1,CPSF3': 'ITGBIBP1',
                         'LOC606724,CORO1A': 'CORO1A',
                         'LOC440356,CDIPT': 'CDIPT',
                         'C10orf2,MRPL43': 'MRPL43',
                         'DISC1,TSNAX-DISC1': 'DISC1'})                  

In [46]:
# rows whose Gene annotation still holds several comma-separated symbols
multi_symbol_mask = centraldf['Gene'].str.contains(',')
fix = centraldf.loc[multi_symbol_mask]

fix

Unnamed: 0,Omic,Disease,probeID,ProbeChr,Gene,Probe_bp,topSNP,topSNP_chr,topSNP_bp,A1,...,b_eQTL,se_eQTL,p_eQTL,b_SMR,se_SMR,p_SMR,p_SMR_multi,p_HEIDI,nsnp_HEIDI,topRSID
110,brain_mMeta,AD,cg25101291,1,"MIR200A,MIR429",1103235,rs9660710,1,1099342,C,...,-0.482462,0.080237,1.821528e-09,-0.013058,0.033027,0.692571,0.475570,0.091273,4.0,rs9660710
266,brain_mMeta,AD,cg19018051,1,"CDK11B,LOC728661",1625395,rs34645159,1,1724366,A,...,0.393627,0.061115,1.188428e-10,-0.013719,0.021193,0.517432,0.756812,0.325488,14.0,rs34645159
1259,brain_mMeta,AD,cg22792644,1,"NOL9,TAS1R1",6614718,rs10864625,1,6614230,G,...,-0.541522,0.053649,5.882408e-24,-0.018282,0.022418,0.414782,0.611251,0.658456,13.0,rs10864625
2921,brain_mMeta,AD,cg25775721,1,"SNHG3,SNHG3-RCC1",28831874,rs41270837,1,28833834,T,...,-0.464158,0.074007,3.568545e-10,0.149518,0.059295,0.011683,0.011543,0.052456,5.0,rs41270837
2922,brain_mMeta,AD,cg20985980,1,"SNHG3,SNHG3-RCC1",28832113,rs58663247,1,28832095,C,...,-0.979163,0.064582,6.349198e-52,0.069753,0.026144,0.007630,0.004595,0.000151,5.0,rs58663247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4795,blood_mcrae,PSP,cg14416726,6,"C6orf211,RMND1",151773293,rs6916340,6,151779064,C,...,-1.007460,0.026204,0.000000e+00,-0.010720,0.054693,0.844607,0.285167,0.094692,17.0,rs6916340
4796,blood_mcrae,PSP,cg17179660,6,"C6orf211,RMND1",151773299,rs6916340,6,151779064,C,...,-0.837161,0.028950,7.261755e-184,-0.012901,0.065819,0.844608,0.260737,0.101026,17.0,rs6916340
5162,blood_mcrae,PSP,cg04387240,6,"C6orf176,LOC441177",166402699,rs480621,6,166419693,T,...,0.340789,0.052964,1.239922e-10,-0.128525,0.267481,0.630870,0.459848,0.541598,3.0,rs480621
5794,blood_mcrae,PSP,cg08611833,6,"PSMB1,TBP",170862455,rs4428484,6,170902109,A,...,0.650768,0.084763,1.621864e-14,0.074988,0.196626,0.702925,0.702925,-9999.000000,-9999.0,rs4428484


In [19]:
# table of protein-coding gene symbols to check annotations against
coding_path = '/../omicSynth/proteincoding_genesym.csv'
coding = pd.read_csv(coding_path)

In [47]:
# Multi-symbol Gene strings (comma-separated) that need to be reduced to a
# single symbol.
fix_genes = list(fix.Gene.value_counts().index)

# Build the lookup once: a set gives O(1) membership tests instead of
# scanning the whole Gene_symbol column for every candidate symbol.
coding_symbols = set(coding['Gene_symbol'])

# Map each multi-symbol string to the first of its symbols that is protein
# coding; when none of them is, fall back to the first symbol listed.
fix_dict = {}
for gene in fix_genes:
    symbols = gene.split(',')
    fix_dict[gene] = next(
        (symbol for symbol in symbols if symbol in coding_symbols),
        symbols[0],
    )
             

In [48]:
# number of multi-symbol Gene strings that were assigned a replacement symbol
len(fix_dict)

223

In [49]:
# number of distinct multi-symbol Gene strings in `fix`; compare with len(fix_dict)
fix['Gene'].nunique()

223

In [50]:
# Collapse each multi-symbol annotation to the single symbol chosen in fix_dict.
# Only the Gene column is rewritten: a flat dict passed to DataFrame.replace
# would be matched against every cell of every column.
centraldf2 = centraldf.assign(Gene=centraldf['Gene'].replace(fix_dict))

In [51]:
# sanity check: list any rows that still carry a comma-separated Gene value
remaining_mask = centraldf2['Gene'].str.contains(',')
fix2 = centraldf2.loc[remaining_mask]
fix2

Unnamed: 0,Omic,Disease,probeID,ProbeChr,Gene,Probe_bp,topSNP,topSNP_chr,topSNP_bp,A1,...,b_eQTL,se_eQTL,p_eQTL,b_SMR,se_SMR,p_SMR,p_SMR_multi,p_HEIDI,nsnp_HEIDI,topRSID


In [52]:
# number of unique top-SNP rsIDs before any rows are dropped
print(centraldf2['topRSID'].nunique(dropna=False))

314224


In [53]:
# remove every row whose Gene is the 'Novel_or_none' placeholder
keep_rows = centraldf2['Gene'] != 'Novel_or_none'
red_df = centraldf2.loc[keep_rows]

# number of unique top-SNP rsIDs left after the drop
print(red_df['topRSID'].nunique(dropna=False))

311382


In [54]:
# export the filtered table as comma-separated text without the row index
# (index is documented as a bool, so pass False rather than relying on None)
red_df.to_csv('/../omicSynth/NDD_SMR_genes.csv', index=False, sep=',')