<h1>Genotype–phenotype correlation at codon 1740 of SETD2</h1>
<p>Generate phenopackets from the data reported in <a href="https://pubmed.ncbi.nlm.nih.gov/33766796/">Chen et al. (2021)</a>.</p>

In [None]:
# Imports and display configuration for the whole notebook.
# NOTE(review): `php`, `MessageToDict`, `Parse`, `ParseDict`, `defaultdict`, `np` and
# `PhenopacketTable` are not referenced in the cells visible in this review.
import phenopackets as php
from google.protobuf.json_format import MessageToDict, MessageToJson
from google.protobuf.json_format import Parse, ParseDict
import pandas as pd
pd.set_option('display.max_colwidth', None) # None = no width limit, so long cell contents are displayed in full
from collections import defaultdict
import numpy as np
import pyphetools
# NOTE(review): the star import presumably provides the unqualified names used in later
# cells (HpoParser, MetaData, CohortEncoder, ...) — confirm; explicit imports would be clearer.
from pyphetools.creation import *
from pyphetools.output import PhenopacketTable
# Record the pyphetools version in the notebook output.
print(f"pyphetools version {pyphetools.__version__}")

In [None]:
# Citation details for the source publication (Chen et al, 2021).
pmid = "PMID:33766796"
title = "Mutation pattern and genotype-phenotype correlations of SETD2 in neurodevelopmental disorders"

# Obtain the HPO concept recognizer and the HPO version string from a single parser.
parser = HpoParser()
hpo_cr = parser.get_hpo_concept_recognizer()
hpo_version = parser.get_version()

# Metadata shared by every phenopacket created in this notebook; the HPO version
# reported by the parser is passed on so the metadata records it.
metadata = MetaData(
    created_by="ORCID:0000-0002-0736-9199",
    pmid=pmid,
    pubmed_title=title,
)
metadata.default_versions_with_hpo(version=hpo_version)

In [None]:
# Load the supplementary table, forcing every cell to a string.
# NOTE(review): the file is parsed as tab-delimited (same default as pd.read_table)
# although its extension is .csv — confirm the delimiter against the input file.
df = pd.read_csv('./input/chen21_setd2.csv', sep='\t').astype(str)
df

In [None]:
# Reorient the table: after transposing, the first row holds the column headers and
# every remaining row is addressed by its index label, which is copied into
# 'patient_id' for use as the individual identifier column.
transposed = df.transpose()
header_row = transposed.iloc[0]
dft = transposed.iloc[1:].copy()  # everything except the header row
dft.columns = header_row
dft['patient_id'] = dft.index
dft.head()

In [None]:
# Fetch the HPO concept recognizer from the parser.
# NOTE(review): `hpo_cr` is already assigned by the identical call in the setup cell
# near the top of the notebook, so this cell looks redundant — confirm before removing.
hpo_cr = parser.get_hpo_concept_recognizer()

In [None]:
# Maps each phenotype column header of the table to [HPO label, HPO id].
items = {
    "Speech delay": ["Delayed speech and language development", "HP:0000750"],
    "Motor delay": ["Motor delay", "HP:0001270"],
    "Intellectual disability": ["Intellectual disability", "HP:0001249"],
    "Macrocephaly": ["Macrocephaly", "HP:0000256"],
    "ASD": ["Autism", "HP:0000717"],
    "Recurrent otitis media": ["Recurrent otitis media", "HP:0000403"],
    "Seizure": ["Seizure", "HP:0001250"],
    "Facial deformity": ["Abnormal facial shape", "HP:0001999"],
    "Hypotonia": ["Hypotonia", "HP:0001252"],
    "Accelerated osseous maturation": ["Accelerated skeletal maturation", "HP:0005616"],
    "Anxiety": ["Anxiety", "HP:0000739"],
    "ADHD": ["Attention deficit hyperactivity disorder", "HP:0007018"],
    "Obsessive behavior": ["Compulsive behaviors", "HP:0000722"],
    "Aggressive behavior": ["Aggressive behavior", "HP:0000718"],
    "Self-injury behavior": ["Self-injurious behavior", "HP:0100716"],
}

# Build the column mappers from the map above; in the table '+' marks an observed
# feature and '-' an excluded one.
item_column_mapper_d = hpo_cr.initialize_simple_column_maps(
    column_name_to_hpo_label_map=items,
    observed="+",
    excluded="-",
)
print(f"We created {len(item_column_mapper_d)} simple column mappers")

<h2>Transcript/Variant mapping</h2>

In [None]:
# Settings used to interpret the 'Variant' column: SETD2 transcript, genome
# assembly, and the genotype assumed for every variant.
setd2_transcript = "NM_014159.7"
genome = "hg38"
default_genotype = "heterozygous"

varMapper = VariantColumnMapper(
    assembly=genome,
    column_name="Variant",
    transcript=setd2_transcript,
    default_genotype=default_genotype,
)

In [None]:
# Ages are not available in the source table, so only a sex mapper is configured
# here. The 'Sex' column encodes sex as 'male' / 'female'.
sexMapper = SexColumnMapper(
    column_name="Sex",
    male_symbol="male",
    female_symbol="female",
)
# To inspect the mapping interactively: sexMapper.preview_column(dft['Sex'])

In [None]:
# Source publication identifier (Chen et al, 2021); same value as in the setup cell,
# repeated here so this cell does not rely on it.
pmid = "PMID:33766796"

# Combine the transposed table with the phenotype, sex and variant mappers.
# Ages are explicitly marked as not provided.
encoder = CohortEncoder(
    df=dft,
    hpo_cr=hpo_cr,
    column_mapper_d=item_column_mapper_d,
    individual_column_name="patient_id",
    agemapper=AgeColumnMapper.not_provided(),
    sexmapper=sexMapper,
    variant_mapper=varMapper,
    metadata=metadata,
    pmid=pmid,
)

# Disease set on the encoder for this cohort.
encoder.set_disease(disease_id="OMIM:616831", label="Luscan-Lumish syndrome")

In [None]:
# Ask the encoder for the individuals it builds from the table
# (presumably one per row of `dft` — TODO confirm).
individuals = encoder.get_individuals()

In [None]:
# Preview: convert the first individual to a GA4GH phenopacket and print it as JSON,
# as a spot check before any files are written.
# NOTE(review): assumes `individuals` is a non-empty sequence; `individuals[0]` fails otherwise.
i1 = individuals[0]
phenopacket1 = i1.to_ga4gh_phenopacket(metadata=metadata.to_ga4gh())
# MessageToJson serializes the protobuf message to a JSON string.
json_string = MessageToJson(phenopacket1)
print(json_string)

In [None]:
# Write all individuals as phenopackets into the output directory, using the same
# publication id and GA4GH metadata as the preview above.
output_directory = "phenopackets"
Individual.output_individuals_as_phenopackets(
    individual_list=individuals,
    metadata=metadata.to_ga4gh(),
    pmid=pmid,
    outdir=output_directory,
)