##### TODO:
- Separate array values
- Add functionality to the utils module

In [1]:
import numpy as np
import pandas as pd

from utils.utils import get_sample_data

## Default Data Cleanup

#### Load a single dataframe

In [2]:
EE_default_015 = get_sample_data("data/EE_015/", ["default"])
EE_default_050 = get_sample_data("data/EE_050/", ["default"])
EE_default_069 = get_sample_data("data/EE_069/", ["default"])
EE_default = pd.concat([EE_default_015, EE_default_050, EE_default_069], ignore_index=True, axis=0)

#### Drop ID

In [3]:
EE_default.drop("ID", axis=1, inplace=True)

#### Split FILTER into separate columns (omit PASS and FAIL)

https://docs.varsome.com/en/variant-calling-and-quality-filters

In [4]:
unique_filters = set(';'.join(EE_default['FILTER']).split(';'))
unique_filters = [f for f in unique_filters if f != "PASS" and f != "FAIL"]
unique_filters

['base_qual',
 'fragment',
 'multiallelic',
 'clustered_events',
 'weak_evidence',
 'slippage',
 'haplotype',
 'germline',
 'strand_bias',
 'map_qual']

#### Create new columns names

In [5]:
filter_cols = ["FILTER_" + f for f in unique_filters]
filter_cols

['FILTER_base_qual',
 'FILTER_fragment',
 'FILTER_multiallelic',
 'FILTER_clustered_events',
 'FILTER_weak_evidence',
 'FILTER_slippage',
 'FILTER_haplotype',
 'FILTER_germline',
 'FILTER_strand_bias',
 'FILTER_map_qual']

In [6]:
for f_col, f_val in zip(filter_cols, unique_filters):
    EE_default[f_col] = EE_default['FILTER'].apply(lambda x: 1 if f_val in x else 0)

EE_default_clean = EE_default.drop("FILTER", axis=1)
EE_default_clean.head()

Unnamed: 0,#CHROM,POS,REF,ALT,QUAL,FILTER_base_qual,FILTER_fragment,FILTER_multiallelic,FILTER_clustered_events,FILTER_weak_evidence,FILTER_slippage,FILTER_haplotype,FILTER_germline,FILTER_strand_bias,FILTER_map_qual
0,chr1,15820,G,T,.,0,0,0,0,0,0,0,0,0,0
1,chr1,17385,G,A,.,0,0,0,0,0,0,0,0,0,0
2,chr1,17697,G,C,.,0,0,0,0,0,0,0,0,0,0
3,chr1,133129,G,A,.,0,0,0,0,0,0,0,0,0,0
4,chr1,183629,G,A,.,0,0,0,0,0,0,0,0,0,0


In [7]:
EE_default_clean[filter_cols].sum(axis=0)

FILTER_base_qual             562
FILTER_fragment              315
FILTER_multiallelic         3614
FILTER_clustered_events    20008
FILTER_weak_evidence        1423
FILTER_slippage             6497
FILTER_haplotype           14501
FILTER_germline             1075
FILTER_strand_bias          1032
FILTER_map_qual            10247
dtype: int64

## CSQ Data Cleanup

#### Load a single dataframe

In [8]:
EE_csq_015 = get_sample_data("data/EE_015/", ["csq"])
EE_csq_050 = get_sample_data("data/EE_050/", ["csq"])
EE_csq_069 = get_sample_data("data/EE_069/", ["csq"])
EE_csq = pd.concat([EE_csq_015, EE_csq_050, EE_csq_069], ignore_index=True, axis=0)

In [9]:
EE_csq.columns

Index(['Allele', 'Consequence', 'IMPACT', 'SYMBOL', 'Gene', 'Feature_type',
       'Feature', 'BIOTYPE', 'EXON', 'INTRON', 'HGVSc', 'HGVSp',
       'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids',
       'Codons', 'Existing_variation', 'DISTANCE', 'STRAND', 'FLAGS',
       'SYMBOL_SOURCE', 'HGNC_ID', 'CANONICAL', 'MANE_SELECT',
       'MANE_PLUS_CLINICAL', 'TSL', 'APPRIS', 'CCDS', 'ENSP', 'SWISSPROT',
       'TREMBL', 'UNIPARC', 'UNIPROT_ISOFORM', 'REFSEQ_MATCH', 'SOURCE',
       'REFSEQ_OFFSET', 'GIVEN_REF', 'USED_REF', 'BAM_EDIT', 'SIFT',
       'PolyPhen', 'DOMAINS', 'HGVS_OFFSET', 'AF', 'AFR_AF', 'AMR_AF',
       'EAS_AF', 'EUR_AF', 'SAS_AF', 'gnomADe_AF', 'gnomADe_AFR_AF',
       'gnomADe_AMR_AF', 'gnomADe_ASJ_AF', 'gnomADe_EAS_AF', 'gnomADe_FIN_AF',
       'gnomADe_NFE_AF', 'gnomADe_OTH_AF', 'gnomADe_SAS_AF', 'gnomADg_AF',
       'gnomADg_AFR_AF', 'gnomADg_AMI_AF', 'gnomADg_AMR_AF', 'gnomADg_ASJ_AF',
       'gnomADg_EAS_AF', 'gnomADg_FIN_AF', 'gnomADg_MID_AF', 

In [10]:
EE_csq.shape

(444444, 96)

#### Drop uninteresting columns

In [11]:
drop_columns = ["TSL", "APPRIS", "CCDS", "ENSP", "SWISSPROT", "TREMBL", "UNIPARC", "UNIPROT_ISOFORM", "REFSEQ_MATCH", "Gene",
                "SOURCE", "REFSEQ_OFFSET", "GIVEN_REF", "USED_REF", "BAM_EDIT", "DOMAINS", "HGVS_OFFSET", "AF", "AFR_AF",
                "AMR_AF", "EAS_AF", "EUR_AF", "SAS_AF", "cDNA_position", "CDS_position", "Protein_position", "HGVSp", "TRANSCRIPTION_FACTORS"]

drop_columns_gnomad = [c for c in EE_csq if c.startswith("gnom") and c != "gnomADe_AF" and c!= "gnomADg_AF"]
drop_columns.extend(drop_columns_gnomad)
EE_potential_csq = EE_csq.drop(drop_columns, axis=1)
EE_potential_csq.shape

(444444, 50)

In [12]:
# These columns need further discussion

drop_columns2 = ["SOMATIC", "PHENO", "CLIN_SIG", "Consequence", "HGNC_ID"]
EE_potential_csq.drop(drop_columns2, axis=1, inplace=True)

In [13]:
len(drop_columns)

46

In [14]:
EE_potential_csq.columns

Index(['Allele', 'IMPACT', 'SYMBOL', 'Feature_type', 'Feature', 'BIOTYPE',
       'EXON', 'INTRON', 'HGVSc', 'Amino_acids', 'Codons',
       'Existing_variation', 'DISTANCE', 'STRAND', 'FLAGS', 'SYMBOL_SOURCE',
       'CANONICAL', 'MANE_SELECT', 'MANE_PLUS_CLINICAL', 'SIFT', 'PolyPhen',
       'gnomADe_AF', 'gnomADg_AF', 'PUBMED', 'VAR_SYNONYMS', 'MOTIF_NAME',
       'MOTIF_POS', 'HIGH_INF_POS', 'MOTIF_SCORE_CHANGE', 'EVE_CLASS',
       'EVE_SCORE', 'CADD_PHRED', 'CADD_RAW', 'SpliceAI_pred_DP_AG',
       'SpliceAI_pred_DP_AL', 'SpliceAI_pred_DP_DG', 'SpliceAI_pred_DP_DL',
       'SpliceAI_pred_DS_AG', 'SpliceAI_pred_DS_AL', 'SpliceAI_pred_DS_DG',
       'SpliceAI_pred_DS_DL', 'SpliceAI_pred_SYMBOL', 'LOEUF', 'PHENOTYPES',
       'NMD'],
      dtype='object')

#### Drop potentially interesting columns

In [15]:
potential_drop_columns = ["Consequence", "IMPACT", "CANONICAL", "MANE_SELECT", "MANE_PLUS_CLINICAL", "SIFT", "PolyPhen",
                          "CLIN_SIG", "EVE_CLASS", "EVE_SCORE", "CADD_PHRED", "CADD_RAW", "LOEUF", "NMD", "SpliceAI_pred_DP_AG",
                          "SpliceAI_pred_DP_AL", "SpliceAI_pred_DP_DG", "SpliceAI_pred_DP_DL", "SpliceAI_pred_DS_AG",
                          "SpliceAI_pred_DS_AL", "SpliceAI_pred_DS_DG", "SpliceAI_pred_DS_DL", "SpliceAI_pred_SYMBOL", "FLAGS"]

potential_drop_columns = [c for c in potential_drop_columns if c in EE_potential_csq.columns]

EE_important_csq = EE_potential_csq.drop(potential_drop_columns, axis=1)
EE_important_csq.shape

(444444, 23)

In [16]:
len(potential_drop_columns)

22

#### Separate array values

In [17]:
separable_csq_columns = ["PHENOTYPES", "CLIN_SIG", "SOMATIC", "DOMAINS", "PUBMED", "TREMBL", "Consequence", "HGVSp",
                         "FLAGS", "PHENO", "Existing_variation", "SWISSPROT", "TRANSCRIPTION_FACTORS", "VAR_SYNONYMS"]
# Filter out discarded columns
separable_csq_columns = [c for c in separable_csq_columns if c in EE_potential_csq.columns]

# Filter out discarded columns
potentially_separable_columns = ["CDS_position", "cDNA_position", "HGNC_ID", "SIFT", "PolyPhen", "Protein_position"]
potentially_separable_columns = [c for c in potentially_separable_columns if c in EE_potential_csq.columns]

check_correctness = ["Codons"]

In [18]:
for col in separable_csq_columns:
    print(col)
    print("Num uniq:", len(EE_potential_csq[col].unique()))
    max_len = np.argmax(list(map(len, map(str, EE_potential_csq[col]))))
    print("Argmax:",max_len)
    print(EE_potential_csq[col][max_len])
    print(EE_potential_csq[col].unique()[:10])
    print()

PHENOTYPES
Num uniq: 54932
Argmax: 54168
FASTING_PLASMA_GLUCOSE_LEVEL_QUANTITATIVE_TRAIT_LOCUS_5+MIM_morbid+ENSG00000084734&ClinVar:_phenotype_not_specified+ClinVar+rs1260326&FASTING_PLASMA_GLUCOSE_LEVEL_QUANTITATIVE_TRAIT_LOCUS_5+ClinVar+rs1260326&Cholesterol+dbGaP+rs1260326&Cholesterol+dbGaP+rs1260326&Triglycerides+dbGaP+rs1260326&Triglycerides+dbGaP+rs1260326&Fasting_Glucose+MAGIC+rs1260326&1-carboxyethylisoleucine_levels+NHGRI-EBI_GWAS_catalog+rs1260326&1-carboxyethylleucine_levels+NHGRI-EBI_GWAS_catalog+rs1260326&1-carboxyethylphenylalanine_levels+NHGRI-EBI_GWAS_catalog+rs1260326&1-carboxyethylvaline_levels+NHGRI-EBI_GWAS_catalog+rs1260326&1-myristoyl-2-arachidonoyl-GPC_14:0/20:4_levels+NHGRI-EBI_GWAS_catalog+rs1260326&1-myristoyl-2-docosahexaenoyl-GPC_14:0/22:6_levels+NHGRI-EBI_GWAS_catalog+rs1260326&1-myristoyl-2-linoleoyl-GPC_14:0/18:2_levels+NHGRI-EBI_GWAS_catalog+rs1260326&1-myristoylglycerol_14:0_levels+NHGRI-EBI_GWAS_catalog+rs1260326&1-myristoyl-GPC_14:0_levels+NHGRI-EBI_G

In [19]:
# PHENOTYPES split - test if a value exist in the cell
phenotypes_split_example = [x.split("+") for x in EE_potential_csq["PHENOTYPES"].iloc[24931].split("&")]

# phenotypes_unique = set([])
# for c in EE_potential_csq["PHENOTYPES"].unique():
#     if not isinstance(c, float):
#         for _c in c.split("&"):
#             pc = _c.split("+")[0]
#             phenotypes_unique.add(pc)
#     else:
#         phenotypes_unique.add(c)

EE_potential_csq["PHENOTYPES_exist"] = EE_potential_csq["PHENOTYPES"].apply(lambda x: 1 if pd.notna(x) else 0)
EE_potential_csq.drop("PHENOTYPES", axis=1, inplace=True)
EE_potential_csq["PHENOTYPES_exist"].head(5)

0    0
1    0
2    0
3    0
4    0
Name: PHENOTYPES_exist, dtype: int64

In [20]:
# CLIN_SIG split - skip for now POTENTIALLY INTERESTING - DISCUSS
# clin_sig_unique = set([])
# for c in EE_potential_csq["CLIN_SIG"].unique():
#     if not isinstance(c, float):
#         for _c in c.split("&"):
#             clin_sig_unique.add(_c)
#     else:
#         clin_sig_unique.add(c)

# clin_sig_unique

In [21]:
# unequal_rows = EE_potential_csq[EE_potential_csq['SOMATIC'].notna() & EE_potential_csq['PHENO'].notna() & (EE_potential_csq['SOMATIC'] != EE_potential_csq['PHENO'])]
# unequal_rows[["Existing_variation", "SOMATIC", "PHENO", "CLIN_SIG"]].iloc[10:15]

In [22]:
# SOMATIC split - skip for now
# EE_potential_csq[["Existing_variation", "SOMATIC", "PHENO", "CLIN_SIG"]].iloc[202:208]

In [23]:
# Exising_variation - split on: exists in COSV/rs

existing_var_unique = set([])
for c in EE_potential_csq["Existing_variation"].unique():
    if not isinstance(c, float):
        for _c in c.split("&"):
            existing_var_unique.add("".join([x for x in _c if not x.isnumeric()]))
    # else:
    #     existing_var_unique.add(c)

existing_var_cols = ["Existing_variation_" + x for x in list(existing_var_unique)]
print(existing_var_cols)

for prefix, col in zip(existing_var_unique, existing_var_cols):
    EE_potential_csq[col] = EE_potential_csq["Existing_variation"].apply(lambda x: 1 if prefix in str(x) else 0)

EE_potential_csq.drop("Existing_variation", axis=1, inplace=True)
EE_potential_csq[existing_var_cols].head(5)

['Existing_variation_rs', 'Existing_variation_HM', 'Existing_variation_CX', 'Existing_variation_CS', 'Existing_variation_CR', 'Existing_variation_CD', 'Existing_variation_BM', 'Existing_variation_CM', 'Existing_variation_CP', 'Existing_variation_CI', 'Existing_variation_COSV']


Unnamed: 0,Existing_variation_rs,Existing_variation_HM,Existing_variation_CX,Existing_variation_CS,Existing_variation_CR,Existing_variation_CD,Existing_variation_BM,Existing_variation_CM,Existing_variation_CP,Existing_variation_CI,Existing_variation_COSV
0,1,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0


In [24]:
# Consequence split - skip for now POTENTIALLY INTERESTING - DISCUSS
# consequence_unique = set([])
# for c in EE_potential_csq["Consequence"].unique():
#     if not isinstance(c, float):
#         for _c in c.split("&"):
#             consequence_unique.add(_c)
#     else:
#         consequence_unique.add(c)

# consequence_unique

In [25]:
# FLAGS split
flags = ['cds_start_NF', 'cds_end_NF']

EE_potential_csq["FLAGS_start"] = EE_potential_csq["FLAGS"].apply(lambda x: 1 if flags[0] in str(x) else 0)
EE_potential_csq["FLAGS_start"] = EE_potential_csq["FLAGS"].apply(lambda x: 1 if flags[1] in str(x) else 0)
EE_potential_csq.drop("FLAGS", axis=1, inplace=True)

In [26]:
# VAR_SYNONYMS split - skip for now POTENTIALLY INTERESTING - DISCUSS how to split
# ...

In [27]:
for col in potentially_separable_columns:
    print(col)
    print("Num uniq:", len(EE_potential_csq[col].unique()))
    max_len = np.argmax(list(map(len, map(str, EE_potential_csq[col]))))
    print("Argmax:",max_len)
    print(EE_potential_csq[col][max_len])
    print(EE_potential_csq[col].unique()[:10])
    print()

SIFT
Num uniq: 205
Argmax: 569
deleterious_low_confidence(0.01)
[nan 'tolerated(0.55)' 'tolerated(0.14)' 'tolerated(1)' 'tolerated(0.24)'
 'tolerated(0.08)' 'tolerated(0.32)' 'deleterious_low_confidence(0)'
 'tolerated_low_confidence(0.22)' 'tolerated_low_confidence(0.06)']

PolyPhen
Num uniq: 912
Argmax: 62
possibly_damaging(0.737)
[nan 'benign(0)' 'possibly_damaging(0.737)' 'benign(0.049)'
 'benign(0.068)' 'probably_damaging(0.991)' 'benign(0.014)'
 'benign(0.048)' 'possibly_damaging(0.646)' 'benign(0.04)']



In [28]:
# HGNC_ID split - skip for now POTENTIALLY INTERESTING - DISCUSS

In [29]:
# SIFT split
sift_unique = set([])
for c in EE_potential_csq["SIFT"].unique():
    if not isinstance(c, float):        
        sift_unique.add(c.split("(")[0])
    else:
        sift_unique.add(c)

# sift_unique

EE_potential_csq['SIFT_class'] = EE_potential_csq['SIFT'].str.extract(r'([^\(]+)')
EE_potential_csq['SIFT_pred'] = EE_potential_csq['SIFT'].str.extract(r'\(([^)]+)\)').astype(float)
EE_potential_csq.drop("SIFT", axis=1, inplace=True)

In [30]:
EE_potential_csq[EE_potential_csq["SIFT_class"].notna()][["SIFT_class", "SIFT_pred"]].iloc[:10]

Unnamed: 0,SIFT_class,SIFT_pred
43,tolerated,0.55
62,tolerated,0.14
64,tolerated,1.0
65,tolerated,0.24
109,tolerated,0.08
120,tolerated,0.32
133,tolerated,1.0
190,deleterious_low_confidence,0.0
213,tolerated_low_confidence,0.22
295,tolerated_low_confidence,0.06


In [31]:
# PolyPhen split        
polyphen_unique = set([])
for c in EE_potential_csq["PolyPhen"].unique():
    if not isinstance(c, float):        
        polyphen_unique.add(c.split("(")[0])
    else:
        polyphen_unique.add(c)

# polyphen_unique
EE_potential_csq['PolyPhen_class'] = EE_potential_csq['PolyPhen'].str.extract(r'([^\(]+)')
EE_potential_csq['PolyPhen_pred'] = EE_potential_csq['PolyPhen'].str.extract(r'\(([^)]+)\)').astype(float)
EE_potential_csq.drop("PolyPhen", axis=1, inplace=True)

In [32]:
EE_potential_csq[EE_potential_csq["PolyPhen_class"].notna()][["PolyPhen_class", "PolyPhen_pred"]].iloc[:10]

Unnamed: 0,PolyPhen_class,PolyPhen_pred
43,benign,0.0
62,possibly_damaging,0.737
64,benign,0.0
65,benign,0.0
109,benign,0.049
120,benign,0.068
133,benign,0.0
190,probably_damaging,0.991
213,benign,0.014
295,benign,0.048


## Genotype Data Cleanup

#### Load EE_15 with EE_50 and EE_69 separately

In [33]:
EE_genotype_015 = get_sample_data("data/EE_015/", ["genotype"])
EE_genotype_050 = get_sample_data("data/EE_050/", ["genotype"])
EE_genotype_069 = get_sample_data("data/EE_069/", ["genotype"])

#### Remove excess columns from EE_069

In [34]:
EE_genotype_069.shape

(140174, 46)

In [35]:
excess_columns = set(EE_genotype_069.columns).difference(set(EE_genotype_050.columns))
excess_columns

{'AC',
 'AF',
 'AN',
 'BaseQRankSum',
 'ClippingRankSum',
 'ExcessHet',
 'FS',
 'MLEAC',
 'MLEAF',
 'MQ',
 'MQRankSum',
 'QD',
 'ReadPosRankSum',
 'SOR'}

In [36]:
cut_EE_genotype_069 = EE_genotype_069.drop(excess_columns, axis=1)
cut_EE_genotype_069.shape

(140174, 32)

#### Delete uninteresting columns from EE_069

In [37]:
drop_columns_acmg =  [c for c in cut_EE_genotype_069.columns if c.startswith("ACMG") and c!="ACMG_class"]
drop_columns_gnomad = [c for c in cut_EE_genotype_069 if c.startswith("gnom") and c != "gnomadExomes_AF" and c!= "gnomadGenomes_AF"]

EE_potential_genotype_069 = cut_EE_genotype_069.drop(drop_columns_acmg, axis=1)
EE_potential_genotype_069.drop(drop_columns_gnomad, axis=1, inplace=True)
EE_potential_genotype_069.drop("DP", axis=1, inplace=True)
EE_potential_genotype_069.drop("Gene", axis=1, inplace=True)
EE_potential_genotype_069.drop("hgvs", axis=1, inplace=True)
EE_potential_genotype_069.columns

Index(['ACMG_class', 'CGDinheritance', 'ClinVarClass', 'ClinVarDisease',
       'DANN_score', 'MutationTaster_pred', 'MutationTaster_score',
       'SIFT_score', 'coding_impact', 'function', 'gnomadExomes_AF',
       'gnomadGenomes_AF'],
      dtype='object')

In [39]:
# These columns need further discussion
skip_columns = ["MutationTaster_score", "MutationTaster_pred", "ClinVarClass"]

EE_potential_genotype_069.drop(skip_columns, axis=1, inplace=True)

#### Delete potentially interesting columns from EE_069

In [41]:
potential_drop_columns = ["ClinVarClass", "ClinVarDisease", "DANN_score", "MutationTaster_pred", "MutationTaster_score", "SIFT_score"]
potential_drop_columns = [c for c in potential_drop_columns if c in EE_potential_genotype_069.columns]

EE_important_genotype_069 = EE_potential_genotype_069.drop(potential_drop_columns, axis=1)
EE_important_genotype_069.columns

Index(['ACMG_class', 'CGDinheritance', 'coding_impact', 'function',
       'gnomadExomes_AF', 'gnomadGenomes_AF'],
      dtype='object')

#### Combine EE_015 and EE_050

In [42]:
EE_genotype_015_050 = pd.concat([EE_genotype_015, EE_genotype_050], ignore_index=True, axis=0)

#### Delete uninteresting columns from EE_050 and EE_015

In [43]:
drop_columns_acmg_amp =  [c for c in EE_genotype_015_050.columns if (c.startswith("ACMG") or c.startswith("AMP")) and c!="ACMG_class"]
drop_columns_gnomad = [c for c in EE_genotype_015_050 if c.startswith("gnom") and c != "gnomadExomes_AF" and c!= "gnomadGenomes_AF"]

EE_potential_genotype_015_050 = EE_genotype_015_050.drop(drop_columns_acmg_amp, axis=1)
EE_potential_genotype_015_050.drop(drop_columns_gnomad, axis=1, inplace=True)
EE_potential_genotype_015_050.drop("DP", axis=1, inplace=True)
EE_potential_genotype_015_050.drop("MMQ", axis=1, inplace=True)
EE_potential_genotype_015_050.drop("Gene", axis=1, inplace=True)
EE_potential_genotype_015_050.drop("hgvs", axis=1, inplace=True)
EE_potential_genotype_015_050.columns

Index(['ACMG_class', 'AS_FilterStatus', 'AS_SB_TABLE', 'CGDinheritance',
       'ClinVarClass', 'ClinVarDisease', 'DANN_score', 'ECNT', 'GERMQ', 'MBQ',
       'MFRL', 'MPOS', 'MutationTaster_pred', 'MutationTaster_score', 'POPAF',
       'RPA', 'RU', 'SIFT_score', 'STR', 'STRQ', 'TLOD', 'coding_impact',
       'cosmicFathMMPrediction', 'cosmicFathMMScore', 'function',
       'gnomadExomes_AF', 'gnomadGenomes_AF'],
      dtype='object')

In [44]:
# These columns need further discussion
skip_columns = ["MutationTaster_score", "MutationTaster_pred", "ClinVarClass", "AS_SB_TABLE"]

EE_potential_genotype_015_050.drop(skip_columns, axis=1, inplace=True)

#### Delete potentially interesting columns from EE_050 and EE_015

In [46]:
potential_drop_columns = ["ClinVarClass", "ClinVarDisease", "DANN_score", "MutationTaster_pred", "MutationTaster_score", "SIFT_score"]
potential_drop_columns = [c for c in potential_drop_columns if c in EE_potential_genotype_015_050.columns]

EE_important_genotype_015_050 = EE_potential_genotype_015_050.drop(potential_drop_columns, axis=1)
EE_important_genotype_015_050.columns

Index(['ACMG_class', 'AS_FilterStatus', 'CGDinheritance', 'ECNT', 'GERMQ',
       'MBQ', 'MFRL', 'MPOS', 'POPAF', 'RPA', 'RU', 'STR', 'STRQ', 'TLOD',
       'coding_impact', 'cosmicFathMMPrediction', 'cosmicFathMMScore',
       'function', 'gnomadExomes_AF', 'gnomadGenomes_AF'],
      dtype='object')

#### Propose two combinations of EE_015 EE_050 with EE_069

In [47]:
# Concatenate with all columns from EE_015 EE_050
EE_potential_genotype_all = pd.concat([EE_potential_genotype_015_050, EE_potential_genotype_069], ignore_index=True, axis=0)
EE_important_genotype_all = pd.concat([EE_important_genotype_015_050, EE_important_genotype_069], ignore_index=True, axis=0)

# Concatenate with only common columns
different_potential_columns = set(EE_potential_genotype_015_050.columns).difference(set(EE_potential_genotype_069.columns))
EE_important_genotype_common = pd.concat([EE_potential_genotype_015_050.drop(different_potential_columns, axis=1), EE_potential_genotype_069], ignore_index=True, axis=0)

different_important_columns = set(EE_important_genotype_015_050.columns).difference(set(EE_important_genotype_069.columns))
EE_important_genotype_common = pd.concat([EE_important_genotype_015_050.drop(different_potential_columns, axis=1), EE_important_genotype_069], ignore_index=True, axis=0)

In [48]:
EE_potential_genotype_all.columns

Index(['ACMG_class', 'AS_FilterStatus', 'CGDinheritance', 'ClinVarDisease',
       'DANN_score', 'ECNT', 'GERMQ', 'MBQ', 'MFRL', 'MPOS', 'POPAF', 'RPA',
       'RU', 'SIFT_score', 'STR', 'STRQ', 'TLOD', 'coding_impact',
       'cosmicFathMMPrediction', 'cosmicFathMMScore', 'function',
       'gnomadExomes_AF', 'gnomadGenomes_AF'],
      dtype='object')

#### Separate array values

In [49]:
object_columns = set(EE_potential_genotype_all.select_dtypes(object).columns)
object_columns = list(object_columns)
# object_columns

array_columns = [c for c in object_columns if c!="RU" and c !="cosmicFathMMPrediction" and c!="ACMG_class"]

In [50]:
for col in array_columns:
    print(col)
    print("Num uniq:", len(EE_potential_genotype_all[col].unique()))
    max_len = np.argmax(list(map(len, map(str, EE_potential_genotype_all[col]))))
    print("Argmax:",max_len)
    print(EE_potential_genotype_all[col][max_len])
    if col == "ClinVarClass":
        print(EE_potential_genotype_all[col].unique())
    else:
        print(EE_potential_genotype_all[col].unique()[:10])
    print()

SIFT_score
Num uniq: 7860
Argmax: 170703
0.586%3B0.344%3B.%3B0.628%3B0.393%3B0.655%3B0.586%3B0.586%3B0.586%3B0.393%3B0.586%3B0.381%3B0.586%3B0.6%3B0.586%3B0.586%3B0.6%3B0.344%3B0.597%3B0.586%3B0.393%3B0.455%3B0.344%3B0.586%3B0.6%3B0.327%3B0.597%3B0.678
['0' '1.0' '0.224%3B0.245%3B0.217%3B.' '0.024%3B.' '1.0%3B.' '0.236%3B.'
 '0.018%3B0.018' '0.413%3B0.397%3B0.397%3B0.399'
 '0.094%3B0.094%3B0.095%3B0.097%3B0.1%3B0.096%3B0.188'
 '0.109%3B0.109%3B0.154%3B0.102']

coding_impact
Num uniq: 33
Argmax: 30133
in%40frame,splice%40junction%40loss,start%40loss
['0' 'missense' 'synonymous' 'synonymous,missense' 'in%40frame'
 'missense,synonymous' 'splice%40junction%40loss,in%40frame' 'frameshift'
 'missense,start%40loss' 'nonsense']

AS_FilterStatus
Num uniq: 64
Argmax: 170785
base_qual%2Cweak_evidence|base_qual%2Cweak_evidence
['SITE' 'SITE|SITE' 'SITE|base_qual' 'SITE|base_qual%2Cweak_evidence'
 'SITE|SITE|SITE' 'SITE|strand_bias' 'SITE|map_qual%2Cweak_evidence'
 'SITE|strand_bias|SITE' 'SITE|wea

In [51]:
# AS_FilterStatus split - Discuss correctness

as_filter_status_unique = set([])
for c in EE_potential_genotype_all["AS_FilterStatus"].unique():
    if not isinstance(c, float):        
        for _c in c.split("|"):
            for x in _c.split("%2C"):
                as_filter_status_unique.add(x)
    else:
        as_filter_status_unique.add(c)

as_filter_status_unique = [x for x in as_filter_status_unique if not pd.isna(x)]

as_filter_status_cols = ["AS_FilterStatus_" + c for c in as_filter_status_unique]

for filt, col in zip(as_filter_status_unique, as_filter_status_cols):
    EE_potential_genotype_all[col] = EE_potential_genotype_all["AS_FilterStatus"].apply(lambda x: 1 if filt in str(x) else 0)

EE_potential_genotype_all.drop("AS_FilterStatus", axis=1, inplace=True)
EE_potential_genotype_all[as_filter_status_cols].head(5)

Unnamed: 0,AS_FilterStatus_base_qual,AS_FilterStatus_weak_evidence,AS_FilterStatus_strand_bias,AS_FilterStatus_map_qual,AS_FilterStatus_SITE
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1


In [52]:
# AS_SB_TABLE split - skip for now POTENTIALLY INTERESTING - DISCUSS how to split

# as_sb_table_unique = set([])
# for c in EE_potential_genotype_all["AS_SB_TABLE"].unique():
#     if not isinstance(c, float):        
#         for _c in c.split("|"):
#             for x in _c.split("%2C"):
#                 as_sb_table_unique.add(x)
#     else:
#         as_sb_table_unique.add(c)

# list(as_sb_table_unique)[:10]

In [53]:
# MutationTaster_pred split - skip for now POTENTIALLY INTERESTING - DISCUSS how to split

# mutation_tester_pred_unique = set([])
# for c in EE_potential_genotype_all["MutationTaster_pred"].unique():
#     if not isinstance(c, float):        
#         for _c in c.split("%3B"):
#             mutation_tester_pred_unique.add(_c)
#     else:
#         mutation_tester_pred_unique.add(c)

# mutation_tester_pred_unique

In [54]:
# MutationTaster_score split - skip for now POTENTIALLY INTERESTING - DISCUSS how to split
# Corresponding to the MutationTaster_pred split

In [55]:
# ClinVarClass split - skip for now POTENTIALLY INTERESTING - DISCUSS how to split

# clin_var_class_unique = set([])
# for c in EE_potential_genotype_all["ClinVarClass"].unique():
#     if not isinstance(c, float):        
#         # for _c in c.split("|"):
#         clin_var_class_unique.add(c)
#     else:
#         clin_var_class_unique.add(c)

# clin_var_class_unique

In [56]:
# function split
functions = ["0", "NMD", "3'utr", "5'utr", "3'flank", "5'flank", "coding", "non-coding%40exon", "intronic", "splicing", "splicing-ACMG"]
function_cols = ["function_" + c for c in functions]

for function, col in zip(functions, function_cols):
    EE_potential_genotype_all[col] = EE_potential_genotype_all["function"].apply(lambda x: 1 if function in str(x) else 0)

EE_potential_genotype_all.drop("function", axis=1, inplace=True)
EE_potential_genotype_all[function_cols].head(5)

Unnamed: 0,function_0,function_NMD,function_3'utr,function_5'utr,function_3'flank,function_5'flank,function_coding,function_non-coding%40exon,function_intronic,function_splicing,function_splicing-ACMG
0,1,0,0,0,0,0,1,1,0,0,0
1,1,0,0,0,0,0,1,1,1,0,0
2,1,0,0,0,0,1,1,1,0,0,0
3,1,0,0,0,0,0,1,1,1,0,0
4,0,0,0,0,0,0,0,0,1,0,0


In [57]:
# CGDinheritance split

cgd_inheritance_unique = set([])
for c in EE_potential_genotype_all["CGDinheritance"].unique():
    if not isinstance(c, float):        
        # for _c in c.split("|"):
        clean_elem = c.split("%40")[0]
        for _c in clean_elem.split("/"):
            for __c in _c.split("|"):
                cgd_inheritance_unique.add(__c)

    else:
        cgd_inheritance_unique.add(c)

cgd_inheritance_cols = ["CGDinheritance_" + c for c in cgd_inheritance_unique]

for element, col in zip(cgd_inheritance_unique, cgd_inheritance_cols):
    EE_potential_genotype_all[col] = EE_potential_genotype_all["CGDinheritance"].apply(lambda x: 1 if element in str(x) else 0)

EE_potential_genotype_all.drop("CGDinheritance", axis=1, inplace=True)
EE_potential_genotype_all[cgd_inheritance_cols].head(5)    

Unnamed: 0,CGDinheritance_YL,CGDinheritance_Maternal,CGDinheritance_AR,CGDinheritance_Methylation,CGDinheritance_BG,CGDinheritance_Multigenic,CGDinheritance_AD,CGDinheritance_XL,CGDinheritance_Digenic,CGDinheritance_0
0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,1


In [58]:
# coding_impact split

coding_impact_unique = set([])
for c in EE_potential_genotype_all["coding_impact"].unique():
    if not isinstance(c, float):        
        for _c in c.split(","):
            coding_impact_unique.add(_c)
    else:
        coding_impact_unique.add(c)

# coding_impact_unique

coding_impact_cols = ["coding_impact_" + c for c in coding_impact_unique]

for element, col in zip(cgd_inheritance_unique, coding_impact_cols):
    EE_potential_genotype_all[col] = EE_potential_genotype_all["coding_impact"].apply(lambda x: 1 if element in str(x) else 0)

EE_potential_genotype_all.drop("coding_impact", axis=1, inplace=True)
EE_potential_genotype_all[coding_impact_cols].head(5)   

Unnamed: 0,coding_impact_start%40loss,coding_impact_synonymous,coding_impact_missense,coding_impact_frameshift,coding_impact_stopLoss,coding_impact_non%40coding,coding_impact_nonsense,coding_impact_in%40frame,coding_impact_splice%40junction%40loss,coding_impact_0
0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,1
