<h1>landscape of STXBP1-related disorders </h1>
<p>Extract the clinical data from <a href="https://pubmed.ncbi.nlm.nih.gov/35190816/"target="__blank">Xian et al. (2022) Assessing the landscape of STXBP1-related disorders in 534 individuals. Brain.</a>.<p>

In [34]:
import phenopackets as php
from google.protobuf.json_format import MessageToDict, MessageToJson
from google.protobuf.json_format import Parse, ParseDict
import pandas as pd
from csv import DictReader
pd.set_option('display.max_colwidth', None) # show entire column contents, important!
from collections import defaultdict
import re
from pyphetools.creation import *
# last tested with pyphetools version 0.2.20

In [2]:
parser = HpoParser()
hpo_cr = parser.get_hpo_concept_recognizer()
hpo_version = parser.get_version()
metadata = MetaData(created_by="ORCID:0000-0002-0736-9199")
metadata.default_versions_with_hpo(version=hpo_version)

In [5]:
clinical_df = pd.read_table("input/brain-2021-00642-File011.tsv");
genotype_df = pd.read_table("input/brain-2021-00642-File011-genotype.tsv");

In [6]:
clinical_df.head()

Unnamed: 0,PatID,Source_Journal,Source_PMID*,Year,Sex,Phenotypic_group**,age_onset_m,age_offset_m,age_eval_y,Base_HPO***,HPO_term,Notes
0,STX_18469812_Subject_11,Nat Genet,18469812,2008.0,M,EOEE,2.0,,8.0,HP:0003593,Infantile onset,
1,STX_18469812_Subject_11,Nat Genet,18469812,2008.0,M,EOEE,2.0,,8.0,HP:0010818,Generalized tonic seizures,
2,STX_18469812_Subject_11,Nat Genet,18469812,2008.0,M,EOEE,2.0,,8.0,HP:0002069,Generalized tonic-clonic seizures,
3,STX_18469812_Subject_11,Nat Genet,18469812,2008.0,M,EOEE,2.0,,8.0,HP:0010851,EEG with burst suppression,
4,STX_18469812_Subject_11,Nat Genet,18469812,2008.0,M,EOEE,2.0,,8.0,HP:0002521,Hypsarrhythmia,


In [7]:
patient_d = defaultdict(list)

In [17]:
class HpoTerm:
    def __init__(self, row):
        try: 
            self.age_onset_m = int(row["age_onset_m"])
        except:
            self.age_onset_m = None
        try:
            self.age_offset_m = int(row["age_offset_m"])
        except:
            self.age_offset_m = None
        try:
            self.age_eval_y = int(row["age_eval_y"])
        except:
            self.age_eval_y = None
        self.hpo_id = row["Base_HPO***"]
        self.hpo_label = row["HPO_term"]




class PatientRow:
    def __init__(self, row):
        self.patID = row["PatID"]
        self.sex = row["Sex"]
        self.phenogroup = row["Phenotypic_group**"]
        self.hpo_term = HpoTerm(row=row)
        
    

In [18]:
with open("input/brain-2021-00642-File011.tsv") as f:
    reader = DictReader(f, delimiter="\t")
    for row in reader:
        prow = PatientRow(row=row)
        patient_d[prow.patID].append(prow)
print(f"We extracted data on {len(patient_d)} individuals")

We extracted data on 534 individuals


In [19]:
genotype_df.head()

Unnamed: 0,PatID,Chr,Start,End,Ref,Alt,Func.refGeneWithVer,Gene.refGeneWithVer,GeneDetail.refGeneWithVer,ExonicFunc.refGeneWithVer,...,Otherinfo3,Otherinfo4,Otherinfo5,Otherinfo7,Otherinfo8,Otherinfo9,Otherinfo10,Otherinfo11,bed,Unnamed: 134
0,STX_18469812_Subject_11,9.0,130422313.0,130422313.0,T,A,exonic,STXBP1,.,nonsynonymous SNV,...,.,9,130422313,T,A,.,PASS,.,Name=70.695764689,
1,STX_18469812_Subject_3,9.0,130444768.0,130444768.0,G,A,exonic,STXBP1,.,nonsynonymous SNV,...,.,9,130444768,G,A,.,PASS,.,Name=75.862552050,
2,STX_18469812_Subject_6,9.0,130425593.0,130425593.0,G,A,exonic,STXBP1,.,nonsynonymous SNV,...,.,9,130425593,G,A,.,PASS,.,Name=89.193399398,
3,STX_18469812_Subject_7,9.0,130439001.0,130439001.0,T,G,exonic,STXBP1,.,nonsynonymous SNV,...,.,9,130439001,T,G,.,PASS,.,Name=20.833326089,
4,STX_19557857_Patient_1,9.0,130416076.0,130416076.0,G,A,splicing,STXBP1,NM_001032221.3:exon3:c.169+1G>A;NM_003165.3:exon3:c.169+1G>A,.,...,.,9,130416076,G,A,.,PASS,.,.,


In [43]:
class GenotypeEntry:
    def __init__(self, row):
        self.patID = row["PatID"]
        self.chrom = row["Chr"]
        self.start = row["Start"]
        self.end = row["End"]
        self.ref = row["Ref"]
        self.alt = row["Alt"]
        func = row["Func.refGeneWithVer"]
        genenot = row["Gene.refGeneWithVer"]
        aachange = row["AAChange.refGeneWithVer"]
        if func == "exonic":
            fields = aachange.split(":")
            index = 0
            i = 0
            for f in fields:
                if f == "NM_001032221.3":
                    index = i
                    break
                i += 1
            if (i + 2) < len(fields):
                variant = fields[i+2]
            else:
                raise ValueError(f"Could not get variant from row: {row}")
            regex_del = r"c.\d+_\d+del"
            regex_dup = r"c.(\d+)dup([A-Z]+)"
            regex_sub = r"c.([A-Z]+)(\d+)([A-Z]+)"
            result = re.search(regex_sub, variant)
            result_dup = re.search(regex_dup, variant)
            if re.match(regex_del, variant):
                self.hgvs = variant
            elif result:
                ref = result.group(1)
                position = result.group(2)
                alt = result.group(3)
                hgvs = f"c.{position}{ref}>{alt}"
                self.hgvs = hgvs
            elif result_dup:
                position=result_dup.group(1)
                hgvs = f"c.{position}dup"
                self.hgvs = hgvs
            else:
                raise ValueError(f"Could not parse variant {variant}")
                    

In [44]:
cc= 0
with open("input/brain-2021-00642-File011-genotype.tsv") as f:
    reader = DictReader(f, delimiter="\t")
    for row in reader:
        #print(row)
        ge = GenotypeEntry(row=row)
        cc += 1
        if cc > 50:
            break

ValueError: Could not get variant from row: {'PatID': 'STX_21770924_Patient_3', 'Chr': '9', 'Start': '130446664', 'End': '130446664', 'Ref': 'A', 'Alt': 'C', 'Func.refGeneWithVer': 'exonic', 'Gene.refGeneWithVer': 'STXBP1', 'GeneDetail.refGeneWithVer': '.', 'ExonicFunc.refGeneWithVer': 'nonsynonymous SNV', 'AAChange.refGeneWithVer': 'STXBP1:NM_003165.3:exon19:c.A1720C:p.T574P', 'Func.ensGene': 'exonic', 'Gene.ensGene': 'ENSG00000136854', 'GeneDetail.ensGene': '.', 'ExonicFunc.ensGene': 'nonsynonymous SNV', 'AAChange.ensGene': 'ENSG00000136854:ENST00000373302:exon19:c.A1720C:p.T574P', 'Gnomad_exome_AF': '.', 'Gnomad_exome_AF_popmax': '.', 'Gnomad_exome_AF_male': '.', 'Gnomad_exome_AF_female': '.', 'Gnomad_exome_AF_raw': '.', 'Gnomad_exome_AF_afr': '.', 'Gnomad_exome_AF_sas': '.', 'Gnomad_exome_AF_amr': '.', 'Gnomad_exome_AF_eas': '.', 'Gnomad_exome_AF_nfe': '.', 'Gnomad_exome_AF_fin': '.', 'Gnomad_exome_AF_asj': '.', 'Gnomad_exome_AF_oth': '.', 'non_topmed_Gnomad_exome_AF_popmax': '.', 'non_neuro_Gnomad_exome_AF_popmax': '.', 'non_cancer_Gnomad_exome_AF_popmax': '.', 'controls_Gnomad_exome_AF_popmax': '.', 'PopFreqMax': '.', '1000G_ALL': '.', '1000G_AFR': '.', '1000G_AMR': '.', '1000G_EAS': '.', '1000G_EUR': '.', '1000G_SAS': '.', 'ExAC_ALL': '.', 'ExAC_AFR': '.', 'ExAC_AMR': '.', 'ExAC_EAS': '.', 'ExAC_FIN': '.', 'ExAC_NFE': '.', 'ExAC_OTH': '.', 'ExAC_SAS': '.', 'ESP6500siv2_ALL': '.', 'ESP6500siv2_AA': '.', 'ESP6500siv2_EA': '.', 'CG46': '.', 'CLNALLELEID': '.', 'CLNDN': '.', 'CLNDISDB': '.', 'CLNREVSTAT': '.', 'CLNSIG': '.', 'DamagePredCount': '16.19', 'SIFT_pred': 'D', 'SIFT4G_pred': 'D', 'Polyphen2_HDIV_pred': 'D', 'Polyphen2_HVAR_pred': 'D', 'LRT_pred': 'D', 'MutationTaster_pred': 'D', 'MutationAssessor_pred': '.', 'FATHMM_pred': 'T', 'PROVEAN_pred': 'D', 'VEST4_score': '0.879', 'MetaSVM_pred': 'D', 'MetaLR_pred': 'D', 'M-CAP_pred': 'D', 'REVEL_score': '0.918', 'MutPred_score': '0.652', 'MVP_score': '0.877', 'MPC_score': '2.626', 'PrimateAI_pred': 'T', 'DEOGEN2_pred': 'T', 'BayesDel_addAF_pred': 'D', 'BayesDel_noAF_pred': 'D', 'ClinPred_pred': 'D', 'LIST-S2_pred': 'D', 'CADD_raw': '3.958', 'CADD_phred': '27.3', 'DANN_score': '0.996', 'fathmm-MKL_coding_pred': 'D', 'fathmm-XF_coding_pred': 'D', 'Eigen-raw_coding': '0.847', 'Eigen-phred_coding': '9.779', 'Eigen-PC-raw_coding': '0.799', 'Eigen-PC-phred_coding': '10.102', 'GenoCanyon_score': '1', 'integrated_fitCons_score': '0.497', 'GM12878_fitCons_score': '0.59', 'H1-hESC_fitCons_score': '0.545', 'HUVEC_fitCons_score': '0.492', 'LINSIGHT': '.', 'GERP++_NR': '5.27', 'GERP++_RS': '5.27', 'phyloP100way_vertebrate': '8.914', 'phyloP30way_mammalian': '1.312', 'phyloP17way_primate': '0.756', 'phastCons100way_vertebrate': '1', 'phastCons30way_mammalian': '0.994', 'phastCons17way_primate': '0.994', 'bStatistic': '301', 'Interpro_domain': '.', 'GTEx_V8_gene': '.', 'GTEx_V8_tissue': '.', 'dbscSNV_ADA_SCORE': '.', 'dbscSNV_RF_SCORE': '.', 'score_zscore_gene': '11;0.89;STXBP3:STXBP1:STXBP2', 'hgmd_class': 'NA', 'hgmd_prot': 'NA', 'hgmd_phen': 'NA', 'hgmd_rankscore': 'NA', 'Otherinfo1': '.', 'Otherinfo2': '.', 'Otherinfo3': '.', 'Otherinfo4': '9', 'Otherinfo5': '130446664', 'Otherinfo7': 'A', 'Otherinfo8': 'C', 'Otherinfo9': '.', 'Otherinfo10': 'PASS', 'Otherinfo11': '.', 'bed': 'Name=98.760813298', '': ''}