##### TODO:
- Separate array values
- Add functionality to the utils module

In [1]:
import numpy as np
import pandas as pd

from utils.utils import get_data

## Default Data Cleanup

#### Load a single dataframe

In [2]:
EE_default_015 = get_data("data/EE_015/", ["default"])
EE_default_050 = get_data("data/EE_050/", ["default"])
EE_default_069 = get_data("data/EE_069/", ["default"])
EE_default = pd.concat([EE_default_015, EE_default_050, EE_default_069], ignore_index=True, axis=0)

#### Drop ID

In [3]:
EE_default.drop("ID", axis=1, inplace=True)

#### Split FILTER into separate columns (omit PASS and FAIL)

https://docs.varsome.com/en/variant-calling-and-quality-filters

In [4]:
unique_filters = set(';'.join(EE_default['FILTER']).split(';'))
unique_filters = [f for f in unique_filters if f != "PASS" and f != "FAIL"]
unique_filters

['multiallelic',
 'germline',
 'fragment',
 'base_qual',
 'weak_evidence',
 'map_qual',
 'strand_bias',
 'slippage',
 'haplotype',
 'clustered_events']

#### Create new columns names

In [5]:
filter_cols = ["FILTER_" + f for f in unique_filters]
filter_cols

['FILTER_multiallelic',
 'FILTER_germline',
 'FILTER_fragment',
 'FILTER_base_qual',
 'FILTER_weak_evidence',
 'FILTER_map_qual',
 'FILTER_strand_bias',
 'FILTER_slippage',
 'FILTER_haplotype',
 'FILTER_clustered_events']

In [6]:
for f_col, f_val in zip(filter_cols, unique_filters):
    EE_default[f_col] = EE_default['FILTER'].apply(lambda x: 1 if f_val in x else 0)

EE_default_clean = EE_default.drop("FILTER", axis=1)
EE_default_clean.head()

Unnamed: 0,#CHROM,POS,REF,ALT,QUAL,FILTER_multiallelic,FILTER_germline,FILTER_fragment,FILTER_base_qual,FILTER_weak_evidence,FILTER_map_qual,FILTER_strand_bias,FILTER_slippage,FILTER_haplotype,FILTER_clustered_events
0,chr1,15820,G,T,.,0,0,0,0,0,0,0,0,0,0
1,chr1,17385,G,A,.,0,0,0,0,0,0,0,0,0,0
2,chr1,17697,G,C,.,0,0,0,0,0,0,0,0,0,0
3,chr1,133129,G,A,.,0,0,0,0,0,0,0,0,0,0
4,chr1,183629,G,A,.,0,0,0,0,0,0,0,0,0,0


In [7]:
EE_default_clean[filter_cols].sum(axis=0)

FILTER_multiallelic         3614
FILTER_germline             1075
FILTER_fragment              315
FILTER_base_qual             562
FILTER_weak_evidence        1423
FILTER_map_qual            10247
FILTER_strand_bias          1032
FILTER_slippage             6497
FILTER_haplotype           14501
FILTER_clustered_events    20008
dtype: int64

## CSQ Data Cleanup

#### Load a single dataframe

In [8]:
EE_csq_015 = get_data("data/EE_015/", ["csq"])
EE_csq_050 = get_data("data/EE_050/", ["csq"])
EE_csq_069 = get_data("data/EE_069/", ["csq"])
EE_csq = pd.concat([EE_csq_015, EE_csq_050, EE_csq_069], ignore_index=True, axis=0)

In [9]:
EE_csq.columns

Index(['Allele', 'Consequence', 'IMPACT', 'SYMBOL', 'Gene', 'Feature_type',
       'Feature', 'BIOTYPE', 'EXON', 'INTRON', 'HGVSc', 'HGVSp',
       'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids',
       'Codons', 'Existing_variation', 'DISTANCE', 'STRAND', 'FLAGS',
       'SYMBOL_SOURCE', 'HGNC_ID', 'CANONICAL', 'MANE_SELECT',
       'MANE_PLUS_CLINICAL', 'TSL', 'APPRIS', 'CCDS', 'ENSP', 'SWISSPROT',
       'TREMBL', 'UNIPARC', 'UNIPROT_ISOFORM', 'REFSEQ_MATCH', 'SOURCE',
       'REFSEQ_OFFSET', 'GIVEN_REF', 'USED_REF', 'BAM_EDIT', 'SIFT',
       'PolyPhen', 'DOMAINS', 'HGVS_OFFSET', 'AF', 'AFR_AF', 'AMR_AF',
       'EAS_AF', 'EUR_AF', 'SAS_AF', 'gnomADe_AF', 'gnomADe_AFR_AF',
       'gnomADe_AMR_AF', 'gnomADe_ASJ_AF', 'gnomADe_EAS_AF', 'gnomADe_FIN_AF',
       'gnomADe_NFE_AF', 'gnomADe_OTH_AF', 'gnomADe_SAS_AF', 'gnomADg_AF',
       'gnomADg_AFR_AF', 'gnomADg_AMI_AF', 'gnomADg_AMR_AF', 'gnomADg_ASJ_AF',
       'gnomADg_EAS_AF', 'gnomADg_FIN_AF', 'gnomADg_MID_AF', 

In [10]:
EE_csq.shape

(444444, 96)

#### Drop uninteresting columns

In [11]:
drop_columns = ["TSL", "APPRIS", "CCDS", "ENSP", "SWISSPROT", "TREMBL", "UNIPARC", "UNIPROT_ISOFORM", "REFSEQ_MATCH",
                "SOURCE", "REFSEQ_OFFSET", "GIVEN_REF", "USED_REF", "BAM_EDIT", "DOMAINS", "HGVS_OFFSET", "AF", "AFR_AF",
                "AMR_AF", "EAS_AF", "EUR_AF", "SAS_AF", "cDNA_position", "CDS_position", "Protein_position"]
drop_columns_gnomad = [c for c in EE_csq if c.startswith("gnom") and c != "gnomADe_AF" and c!= "gnomADg_AF"]
drop_columns.extend(drop_columns_gnomad)
EE_potential_csq = EE_csq.drop(drop_columns, axis=1)
EE_potential_csq.shape

(444444, 56)

In [12]:
EE_potential_csq.columns

Index(['Allele', 'Consequence', 'IMPACT', 'SYMBOL', 'Gene', 'Feature_type',
       'Feature', 'BIOTYPE', 'EXON', 'INTRON', 'HGVSc', 'HGVSp',
       'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids',
       'Codons', 'Existing_variation', 'DISTANCE', 'STRAND', 'FLAGS',
       'SYMBOL_SOURCE', 'HGNC_ID', 'CANONICAL', 'MANE_SELECT',
       'MANE_PLUS_CLINICAL', 'SIFT', 'PolyPhen', 'gnomADe_AF', 'gnomADg_AF',
       'CLIN_SIG', 'SOMATIC', 'PHENO', 'PUBMED', 'VAR_SYNONYMS', 'MOTIF_NAME',
       'MOTIF_POS', 'HIGH_INF_POS', 'MOTIF_SCORE_CHANGE',
       'TRANSCRIPTION_FACTORS', 'EVE_CLASS', 'EVE_SCORE', 'CADD_PHRED',
       'CADD_RAW', 'SpliceAI_pred_DP_AG', 'SpliceAI_pred_DP_AL',
       'SpliceAI_pred_DP_DG', 'SpliceAI_pred_DP_DL', 'SpliceAI_pred_DS_AG',
       'SpliceAI_pred_DS_AL', 'SpliceAI_pred_DS_DG', 'SpliceAI_pred_DS_DL',
       'SpliceAI_pred_SYMBOL', 'LOEUF', 'PHENOTYPES', 'NMD'],
      dtype='object')

#### Drop potentially interesting columns

In [13]:
potential_drop_columns = ["Consequence", "IMPACT", "CANONICAL", "MANE_SELECT", "MANE_PLUS_CLINICAL", "SIFT", "PolyPhen",
                          "CLIN_SIG", "EVE_CLASS", "EVE_SCORE", "CADD_PHRED", "CADD_RAW", "LOEUF", "NMD", "SpliceAI_pred_DP_AG",
                          "SpliceAI_pred_DP_AL", "SpliceAI_pred_DP_DG", "SpliceAI_pred_DP_DL", "SpliceAI_pred_DS_AG",
                          "SpliceAI_pred_DS_AL", "SpliceAI_pred_DS_DG", "SpliceAI_pred_DS_DL", "SpliceAI_pred_SYMBOL"]

EE_important_csq = EE_potential_csq.drop(potential_drop_columns, axis=1)
EE_important_csq.shape

(444444, 33)

#### Separate array values

In [14]:
separable_csq_columns = ["PHENOTYPES", "CLIN_SIG", "SOMATIC", "DOMAINS", "PUBMED", "TREMBL", "Consequence", "HGVSp",
                         "FLAGS", "PHENO", "Existing_variation", "SWISSPROT", "TRANSCRIPTION_FACTORS", "VAR_SYNONYMS"]
# Filter out discarded columns
separable_csq_columns = [c for c in separable_csq_columns if c in EE_potential_csq.columns]

# Filter out discarded columns
potentially_separable_columns = ["CDS_position", "cDNA_position", "HGNC_ID", "SIFT", "PolyPhen", "Protein_position"]
potentially_separable_columns = [c for c in potentially_separable_columns if c in EE_potential_csq.columns]

check_correctness = ["Codons"]

In [15]:
EE_potential_csq[["Existing_variation", "PHENO"]].iloc[200:205]

Unnamed: 0,Existing_variation,PHENO
200,rs2281171,
201,rs66653340,
202,rs3817856&COSV55802795,0&1
203,rs34869725&COSV55803457,0&1
204,rs1474648&COSV55804435,0&1


In [16]:
for col in separable_csq_columns:
    print(col)
    print("Num uniq:", len(EE_potential_csq[col].unique()))
    max_len = np.argmax(list(map(len, map(str, EE_potential_csq[col]))))
    print("Argmax:",max_len)
    print(EE_potential_csq[col][max_len])
    print(EE_potential_csq[col].unique()[:10])
    print()

PHENOTYPES
Num uniq: 54932
Argmax: 54168
FASTING_PLASMA_GLUCOSE_LEVEL_QUANTITATIVE_TRAIT_LOCUS_5+MIM_morbid+ENSG00000084734&ClinVar:_phenotype_not_specified+ClinVar+rs1260326&FASTING_PLASMA_GLUCOSE_LEVEL_QUANTITATIVE_TRAIT_LOCUS_5+ClinVar+rs1260326&Cholesterol+dbGaP+rs1260326&Cholesterol+dbGaP+rs1260326&Triglycerides+dbGaP+rs1260326&Triglycerides+dbGaP+rs1260326&Fasting_Glucose+MAGIC+rs1260326&1-carboxyethylisoleucine_levels+NHGRI-EBI_GWAS_catalog+rs1260326&1-carboxyethylleucine_levels+NHGRI-EBI_GWAS_catalog+rs1260326&1-carboxyethylphenylalanine_levels+NHGRI-EBI_GWAS_catalog+rs1260326&1-carboxyethylvaline_levels+NHGRI-EBI_GWAS_catalog+rs1260326&1-myristoyl-2-arachidonoyl-GPC_14:0/20:4_levels+NHGRI-EBI_GWAS_catalog+rs1260326&1-myristoyl-2-docosahexaenoyl-GPC_14:0/22:6_levels+NHGRI-EBI_GWAS_catalog+rs1260326&1-myristoyl-2-linoleoyl-GPC_14:0/18:2_levels+NHGRI-EBI_GWAS_catalog+rs1260326&1-myristoylglycerol_14:0_levels+NHGRI-EBI_GWAS_catalog+rs1260326&1-myristoyl-GPC_14:0_levels+NHGRI-EBI_G

In [17]:
# PHENOTYPES split
phenotypes_split_example = [x.split("+") for x in EE_potential_csq["PHENOTYPES"].iloc[24931].split("&")]

phenotypes_unique = set([])
for c in EE_potential_csq["PHENOTYPES"].unique():
    if not isinstance(c, float):
        for _c in c.split("&"):
            pc = _c.split("+")[0]
            phenotypes_unique.add(pc)
    else:
        phenotypes_unique.add(c)

len(phenotypes_unique)

20043

In [18]:
# CLIN_SIG split
clin_sig_unique = set([])
for c in EE_potential_csq["CLIN_SIG"].unique():
    if not isinstance(c, float):
        for _c in c.split("&"):
            clin_sig_unique.add(_c)
    else:
        clin_sig_unique.add(c)

# clin_sig_unique

In [19]:
unequal_rows = EE_potential_csq[EE_potential_csq['SOMATIC'].notna() & EE_potential_csq['PHENO'].notna() & (EE_potential_csq['SOMATIC'] != EE_potential_csq['PHENO'])]
unequal_rows[["Existing_variation", "SOMATIC", "PHENO", "CLIN_SIG"]].iloc[10:15]

Unnamed: 0,Existing_variation,SOMATIC,PHENO,CLIN_SIG
92,rs397834473&COSV65068887,0&1,1&1,benign
93,rs147681220&COSV101038847,0&1,1&1,benign
98,rs3121561&COSV65069616,0&1,1&1,benign
99,rs2465136&COSV65069776,0&1,1&1,benign
133,rs307377&CM098260&COSV59564749,0&0&1,0&1&1,


In [20]:
# SOMATIC split
EE_potential_csq[["Existing_variation", "SOMATIC", "PHENO", "CLIN_SIG"]].iloc[202:208]

Unnamed: 0,Existing_variation,SOMATIC,PHENO,CLIN_SIG
202,rs3817856&COSV55802795,0&1,0&1,
203,rs34869725&COSV55803457,0&1,0&1,
204,rs1474648&COSV55804435,0&1,0&1,
205,rs1474649,,,
206,rs2377208,,,
207,rs2889577,,,


In [21]:
# Consequence split
consequence_unique = set([])
for c in EE_potential_csq["Consequence"].unique():
    if not isinstance(c, float):
        for _c in c.split("&"):
            consequence_unique.add(_c)
    else:
        consequence_unique.add(c)

consequence_unique

{'3_prime_UTR_variant',
 '5_prime_UTR_variant',
 'NMD_transcript_variant',
 'TF_binding_site_variant',
 'coding_sequence_variant',
 'downstream_gene_variant',
 'frameshift_variant',
 'inframe_deletion',
 'inframe_insertion',
 'intergenic_variant',
 'intron_variant',
 'mature_miRNA_variant',
 'missense_variant',
 'non_coding_transcript_exon_variant',
 'non_coding_transcript_variant',
 'protein_altering_variant',
 'regulatory_region_variant',
 'splice_acceptor_variant',
 'splice_donor_5th_base_variant',
 'splice_donor_region_variant',
 'splice_donor_variant',
 'splice_polypyrimidine_tract_variant',
 'splice_region_variant',
 'start_lost',
 'start_retained_variant',
 'stop_gained',
 'stop_lost',
 'stop_retained_variant',
 'synonymous_variant',
 'upstream_gene_variant'}

In [22]:
# FLAGS split
flags = ['cds_start_NF', 'cds_end_NF']

In [23]:
# TRANSCRIPTION_FACTORS split
consequence_unique = set([])
for c in EE_potential_csq["TRANSCRIPTION_FACTORS"].unique():
    if not isinstance(c, float):
        for _c in c.split("&"):
            consequence_unique.add(_c)
    else:
        consequence_unique.add(c)

consequence_unique

{'CTCF',
 'ETV2::RFX5',
 'GCM1::MAX',
 'MEF2A',
 'MEF2B',
 'MEF2D',
 'MYBL1::MAX',
 'SOX6::TBX21',
 'TFAP2C::MAX',
 nan}

In [24]:
# VAR_SYNONYMS split
# ...

In [25]:
for col in potentially_separable_columns:
    print(col)
    print("Num uniq:", len(EE_potential_csq[col].unique()))
    max_len = np.argmax(list(map(len, map(str, EE_potential_csq[col]))))
    print("Argmax:",max_len)
    print(EE_potential_csq[col][max_len])
    print(EE_potential_csq[col].unique()[:10])
    print()

CDS_position
Num uniq: 6519
Argmax: 43820
11857-11858
[nan '1182' '918' '898' '1863' '1988' '1896' '1216' '1192' '841']

cDNA_position
Num uniq: 14066
Argmax: 2503
11335-11336
[nan '2105' '2100' '301' '251' '257' '2664' '2653' '1198' '934']

HGNC_ID
Num uniq: 22706
Argmax: 0
HGNC:37102
['HGNC:37102' 'HGNC:48835' 'HGNC:49954' nan 'HGNC:50701' 'HGNC:49377'
 'HGNC:32337' 'HGNC:24517' 'HGNC:24023' 'HGNC:28208']

SIFT
Num uniq: 205
Argmax: 569
deleterious_low_confidence(0.01)
[nan 'tolerated(0.55)' 'tolerated(0.14)' 'tolerated(1)' 'tolerated(0.24)'
 'tolerated(0.08)' 'tolerated(0.32)' 'deleterious_low_confidence(0)'
 'tolerated_low_confidence(0.22)' 'tolerated_low_confidence(0.06)']

PolyPhen
Num uniq: 912
Argmax: 62
possibly_damaging(0.737)
[nan 'benign(0)' 'possibly_damaging(0.737)' 'benign(0.049)'
 'benign(0.068)' 'probably_damaging(0.991)' 'benign(0.014)'
 'benign(0.048)' 'possibly_damaging(0.646)' 'benign(0.04)']

Protein_position
Num uniq: 3401
Argmax: 192717
13558-13559
[nan '394' '3

In [26]:
# SIFT split
sift_unique = set([])
for c in EE_potential_csq["SIFT"].unique():
    if not isinstance(c, float):        
        sift_unique.add(c.split("(")[0])
    else:
        sift_unique.add(c)

sift_unique

{'deleterious',
 'deleterious_low_confidence',
 nan,
 'tolerated',
 'tolerated_low_confidence'}

In [27]:
# PolyPhen split        
polyphen_unique = set([])
for c in EE_potential_csq["PolyPhen"].unique():
    if not isinstance(c, float):        
        polyphen_unique.add(c.split("(")[0])
    else:
        polyphen_unique.add(c)

polyphen_unique

{'benign', nan, 'possibly_damaging', 'probably_damaging', 'unknown'}

## Genotype Data Cleanup

#### Load EE_15 with EE_50 and EE_69 separately

In [29]:
EE_genotype_015 = get_data("data/EE_015/", ["genotype"])
EE_genotype_050 = get_data("data/EE_050/", ["genotype"])
EE_genotype_069 = get_data("data/EE_069/", ["genotype"])

#### Remove excess columns from EE_069

In [30]:
EE_genotype_069.shape

(140174, 46)

In [31]:
excess_columns = set(EE_genotype_069.columns).difference(set(EE_genotype_050.columns))
excess_columns

{'AC',
 'AF',
 'AN',
 'BaseQRankSum',
 'ClippingRankSum',
 'ExcessHet',
 'FS',
 'MLEAC',
 'MLEAF',
 'MQ',
 'MQRankSum',
 'QD',
 'ReadPosRankSum',
 'SOR'}

In [32]:
cut_EE_genotype_069 = EE_genotype_069.drop(excess_columns, axis=1)
cut_EE_genotype_069.shape

(140174, 32)

#### Delete uninteresting columns from EE_069

In [33]:
drop_columns_acmg =  [c for c in cut_EE_genotype_069.columns if c.startswith("ACMG") and c!="ACMG_class"]
drop_columns_gnomad = [c for c in cut_EE_genotype_069 if c.startswith("gnom") and c != "gnomadExomes_AF" and c!= "gnomadGenomes_AF"]

EE_potential_genotype_069 = cut_EE_genotype_069.drop(drop_columns_acmg, axis=1)
EE_potential_genotype_069 = EE_potential_genotype_069.drop(drop_columns_gnomad, axis=1)
EE_potential_genotype_069 = EE_potential_genotype_069.drop("DP", axis=1)
EE_potential_genotype_069.columns

Index(['ACMG_class', 'CGDinheritance', 'ClinVarClass', 'ClinVarDisease',
       'DANN_score', 'Gene', 'MutationTaster_pred', 'MutationTaster_score',
       'SIFT_score', 'coding_impact', 'function', 'gnomadExomes_AF',
       'gnomadGenomes_AF', 'hgvs'],
      dtype='object')

#### Delete potentially interesting columns from EE_069

In [34]:
potential_drop_columns = ["ClinVarClass", "ClinVarDisease", "DANN_score", "MutationTaster_pred", "MutationTaster_score", "SIFT_score"]

EE_important_genotype_069 = EE_potential_genotype_069.drop(potential_drop_columns, axis=1)
EE_important_genotype_069.columns

Index(['ACMG_class', 'CGDinheritance', 'Gene', 'coding_impact', 'function',
       'gnomadExomes_AF', 'gnomadGenomes_AF', 'hgvs'],
      dtype='object')

#### Combine EE_015 and EE_050

In [35]:
EE_genotype_015_050 = pd.concat([EE_genotype_015, EE_genotype_050], ignore_index=True, axis=0)

#### Delete uninteresting columns from EE_050 and EE_015

In [36]:
drop_columns_acmg_amp =  [c for c in EE_genotype_015_050.columns if (c.startswith("ACMG") or c.startswith("AMP")) and c!="ACMG_class"]
drop_columns_gnomad = [c for c in EE_genotype_015_050 if c.startswith("gnom") and c != "gnomadExomes_AF" and c!= "gnomadGenomes_AF"]

EE_potential_genotype_015_050 = EE_genotype_015_050.drop(drop_columns_acmg_amp, axis=1)
EE_potential_genotype_015_050 = EE_potential_genotype_015_050.drop(drop_columns_gnomad, axis=1)
EE_potential_genotype_015_050 = EE_potential_genotype_015_050.drop("DP", axis=1)
EE_potential_genotype_015_050 = EE_potential_genotype_015_050.drop("MMQ", axis=1)
EE_potential_genotype_015_050.columns

Index(['ACMG_class', 'AS_FilterStatus', 'AS_SB_TABLE', 'CGDinheritance',
       'ClinVarClass', 'ClinVarDisease', 'DANN_score', 'ECNT', 'GERMQ', 'Gene',
       'MBQ', 'MFRL', 'MPOS', 'MutationTaster_pred', 'MutationTaster_score',
       'POPAF', 'RPA', 'RU', 'SIFT_score', 'STR', 'STRQ', 'TLOD',
       'coding_impact', 'cosmicFathMMPrediction', 'cosmicFathMMScore',
       'function', 'gnomadExomes_AF', 'gnomadGenomes_AF', 'hgvs'],
      dtype='object')

#### Delete potentially interesting columns from EE_050 and EE_015

In [37]:
potential_drop_columns = ["ClinVarClass", "ClinVarDisease", "DANN_score", "MutationTaster_pred", "MutationTaster_score", "SIFT_score"]

EE_important_genotype_015_050 = EE_potential_genotype_015_050.drop(potential_drop_columns, axis=1)
EE_important_genotype_015_050.columns

Index(['ACMG_class', 'AS_FilterStatus', 'AS_SB_TABLE', 'CGDinheritance',
       'ECNT', 'GERMQ', 'Gene', 'MBQ', 'MFRL', 'MPOS', 'POPAF', 'RPA', 'RU',
       'STR', 'STRQ', 'TLOD', 'coding_impact', 'cosmicFathMMPrediction',
       'cosmicFathMMScore', 'function', 'gnomadExomes_AF', 'gnomadGenomes_AF',
       'hgvs'],
      dtype='object')

#### Propose two combinations of EE_015 EE_050 with EE_069

In [38]:
# Concatenate with all columns from EE_015 EE_050
EE_potential_genotype_all = pd.concat([EE_potential_genotype_015_050, EE_potential_genotype_069], ignore_index=True, axis=0)
EE_important_genotype_all = pd.concat([EE_important_genotype_015_050, EE_important_genotype_069], ignore_index=True, axis=0)

# Concatenate with only common columns
different_potential_columns = set(EE_potential_genotype_015_050.columns).difference(set(EE_potential_genotype_069.columns))
EE_important_genotype_common = pd.concat([EE_potential_genotype_015_050.drop(different_potential_columns, axis=1), EE_potential_genotype_069], ignore_index=True, axis=0)

different_important_columns = set(EE_important_genotype_015_050.columns).difference(set(EE_important_genotype_069.columns))
EE_important_genotype_common = pd.concat([EE_important_genotype_015_050.drop(different_potential_columns, axis=1), EE_important_genotype_069], ignore_index=True, axis=0)

#### Separate array values

In [76]:
object_columns = set(EE_potential_genotype_all.select_dtypes(object).columns)
object_columns = list(object_columns)
# object_columns

array_columns = [c for c in object_columns if c!="RU" and c !="cosmicFathMMPrediction" and c!="ACMG_class"]

In [77]:
for col in array_columns:
    print(col)
    print("Num uniq:", len(EE_potential_genotype_all[col].unique()))
    max_len = np.argmax(list(map(len, map(str, EE_potential_genotype_all[col]))))
    print("Argmax:",max_len)
    print(EE_potential_genotype_all[col][max_len])
    if col == "ClinVarClass":
        print(EE_potential_genotype_all[col].unique())
    else:
        print(EE_potential_genotype_all[col].unique()[:10])
    print()

MutationTaster_pred
Num uniq: 107
Argmax: 170703
N%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN%3BN
['0' 'P' 'P%3BP%3BP' 'P%3BP' 'P%3BP%3BP%3BP%3BP'
 'D%3BD%3BD%3BD%3BD%3BD%3BD' 'N%3BN%3BN%3BN%3BN%3BN%3BN'
 'P%3BP%3BP%3BP%3BP%3BP%3BP%3BP' 'P%3BP%3BP%3BP' 'D%3BD%3BD%3BD']

SIFT_score
Num uniq: 7860
Argmax: 170703
0.586%3B0.344%3B.%3B0.628%3B0.393%3B0.655%3B0.586%3B0.586%3B0.586%3B0.393%3B0.586%3B0.381%3B0.586%3B0.6%3B0.586%3B0.586%3B0.6%3B0.344%3B0.597%3B0.586%3B0.393%3B0.455%3B0.344%3B0.586%3B0.6%3B0.327%3B0.597%3B0.678
['0' '1.0' '0.224%3B0.245%3B0.217%3B.' '0.024%3B.' '1.0%3B.' '0.236%3B.'
 '0.018%3B0.018' '0.413%3B0.397%3B0.397%3B0.399'
 '0.094%3B0.094%3B0.095%3B0.097%3B0.1%3B0.096%3B0.188'
 '0.109%3B0.109%3B0.154%3B0.102']

AS_FilterStatus
Num uniq: 64
Argmax: 170785
base_qual%2Cweak_evidence|base_qual%2Cweak_evidence
['SITE' 'SITE|SITE' 'SITE|base_qual' 'SITE|base_qual%2Cweak_evidence'
 'SITE|SITE|SITE' 'SITE|strand_bias' 'SITE|ma

In [54]:
# MutationTaster_pred split

mutation_tester_pred_unique = set([])
for c in EE_potential_genotype_all["MutationTaster_pred"].unique():
    if not isinstance(c, float):        
        for _c in c.split("%3B"):
            mutation_tester_pred_unique.add(_c)
    else:
        mutation_tester_pred_unique.add(c)

mutation_tester_pred_unique

{'0', 'A', 'D', 'N', 'P'}

In [62]:
# MutationTaster_score split
# Corresponding to the MutationTaster_pred split

In [65]:
# CGDinheritance split
# Leave alone maybe try to delete additional comments

In [66]:
# function split
functions = ["0", "NMD", "3'utr", "5'utr", "3'flank", "5'flank", "coding", "non-coding%40exon", "intronic", "splicing", "splicing-ACMG"]

In [67]:
# hgvs split
# No idea yet

In [74]:
# ClinVarClass split
# No idea yet

In [78]:
# Gene split
# We can skip it - Gene ID

In [79]:
# coding_impact split

coding_impact_unique = set([])
for c in EE_potential_genotype_all["coding_impact"].unique():
    if not isinstance(c, float):        
        for _c in c.split(","):
            coding_impact_unique.add(_c)
    else:
        coding_impact_unique.add(c)

coding_impact_unique

{'0',
 'frameshift',
 'in%40frame',
 'missense',
 'non%40coding',
 'nonsense',
 'splice%40junction%40loss',
 'start%40loss',
 'stopLoss',
 'synonymous'}