# This notebook leads you through generating NDD free controls
### Use this notebook to update your control groups whenever there is a new data release.

In [None]:
from datetime import datetime
import os 
import pandas as pd
import pyspark
import dxpy
import dxdata 
import numpy as np

In [None]:
# Reuse the running SparkContext when one already exists. A bare
# SparkContext() raises if this cell is executed a second time in a live
# kernel, whereas getOrCreate() makes the cell safe to re-run.
sc = pyspark.SparkContext.getOrCreate()
# Session handle built on top of that context.
spark = pyspark.sql.SparkSession(sc)

In [None]:
# Setup: look up the project's database and dataset objects by glob pattern,
# searching from the project root folder.
# The database lookup asks for the describe payload and keeps only its name.
dispensed_database_name = dxpy.find_one_data_object(
    classname="database",
    name="app*",
    folder="/",
    name_mode="glob",
    describe=True,
)["describe"]["name"]
# The dataset lookup keeps only the object id, which load_dataset() needs later.
dispensed_dataset_id = dxpy.find_one_data_object(
    typename="Dataset",
    name="app*.dataset",
    folder="/",
    name_mode="glob",
)["id"]

In [None]:
# Load the dataset found above and keep a handle on its "participant" entity;
# every retrieve_fields() call later in the notebook goes through `participant`.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset["participant"]

In [None]:
# Be sure to create this cohort first -- filter: "Age at recruitment" is not null
#cohort = dxdata.load_cohort('All_UKB_no_age_missing')

In [None]:
# going to filter on these fields for an overall 'healthy' NDD free control cohort
# Date G10 first reported (huntington's disease), Date G11 first reported (hereditary ataxia), Date G12 first reported (spinal muscular atrophy and related syndromes),
# Date G13 first reported (systemic atrophies primarily affecting central nervous system in diseases classified elsewhere), Date G14 first reported (postpolio syndrome),
# Date G20 first reported (parkinson's disease), Date G21 first reported (secondary parkinsonism), Date G22 first reported (parkinsonism in diseases classified elsewhere),
# Date G23 first reported (other degenerative diseases of basal ganglia), Date G24 first reported (dystonia), Date G25 first reported (other extrapyramidal and movement disorders),
# Date G30 first reported (alzheimer's disease), Date G31 first reported (other degenerative diseases of nervous system, not elsewhere classified),
# Date G32 first reported (other degenerative disorders of nervous system in diseases classified elsewhere), Date G35 first reported (multiple sclerosis),
# Date G36 first reported (other acute disseminated demyelination), Date G37 first reported (other demyelinating diseases of central nervous system),
# Date G45 first reported (transient cerebral ischaemic attacks and related syndromes), Date G46 first reported (vascular syndromes of brain in cerebrovascular diseases),
# Date G50 first reported (disorders of trigeminal nerve), Date G52 first reported (disorders of other cranial nerves), Date G53 first reported (cranial nerve disorders in diseases classified elsewhere),
# Date G54 first reported (nerve root and plexus disorders), Date G55 first reported (nerve root and plexus compressions in diseases classified elsewhere),
# Date G56 first reported (mononeuropathies of upper limb), Date G57 first reported (mononeuropathies of lower limb), Date G58 first reported (other mononeuropathies),
# Date G59 first reported (mononeuropathy in diseases classified elsewhere), Date G60 first reported (hereditary and idiopathic neuropathy),
# Date G61 first reported (inflammatory polyneuropathy), Date G62 first reported (other polyneuropathies), Date G63 first reported (polyneuropathy in diseases classified elsewhere),
# Date G64 first reported (other disorders of peripheral nervous system), Date G70 first reported (myasthenia gravis and other myoneural disorders), Date G71 first reported (primary disorders of muscles),
# Date G72 first reported (other myopathies), Date G73 first reported (disorders of myoneural junction and muscle in diseases classified elsewhere), Date G80 first reported (infantile cerebral palsy),
# Date G81 first reported (hemiplegia), Date G82 first reported (paraplegia and tetraplegia), Date G83 first reported (other paralytic syndromes), Date G90 first reported (disorders of autonomic nervous system)
# Date G91 first reported (hydrocephalus), Date G92 first reported (toxic encephalopathy), Date G93 first reported (other disorders of brain), Date G94 first reported (other disorders of brain in diseases classified elsewhere),
# Date G96 first reported (other disorders of central nervous system), Date G97 first reported (postprocedural disorders of nervous system, not elsewhere classified), 
# Date G98 first reported (other disorders of nervous system, not elsewhere classified), Date G99 first reported (other disorders of nervous system in diseases classified elsewhere),
#  Date of all cause dementia report, Date of alzheimer's disease report,
# Date of vascular dementia report, Date of frontotemporal dementia report, Date of motor neurone disease report, Date of all cause parkinsonism report, Date of parkinson's disease report,
# Date of progressive supranuclear palsy report, Date of multiple system atrophy report, Genetic ethnic grouping, Age at recruitment, Townsend deprivation index at recruitment,
# Sex, Genetic Principal components | Array 1, Genetic Principal components | Array 2, Genetic Principal components | Array 3, Genetic Principal components | Array 4, Genetic Principal components | Array 5

In [None]:
# Fields requested from the participant entity, grouped by role.
#
# The previous version of this list repeated p131012, p131042 and p131070
# where the field description list has G11, G36 and G55, and had no entry for
# G25. Within the otherwise regular sequence of IDs those four are p131014,
# p131044, p131072 and p131032.
# TODO confirm these four IDs against the UK Biobank Showcase before running.

# "Date Gxx first reported" fields; trailing comments give the ICD-10 codes in
# the same order as the field description list.
first_occurrence_fields = [
    'p131012', 'p131014', 'p131016', 'p131018', 'p131020',             # G10-G14
    'p131022', 'p131024', 'p131026', 'p131028', 'p131030', 'p131032',  # G20-G25
    'p131036', 'p131038', 'p131040',                                   # G30-G32
    'p131042', 'p131044', 'p131046',                                   # G35-G37
    'p131056', 'p131058',                                              # G45-G46
    'p131062', 'p131066', 'p131068',                                   # G50, G52, G53
    'p131070', 'p131072', 'p131074', 'p131076', 'p131078', 'p131080',  # G54-G59
    'p131082', 'p131084', 'p131086', 'p131088', 'p131090',             # G60-G64
    'p131092', 'p131094', 'p131096', 'p131098',                        # G70-G73
    'p131100', 'p131102', 'p131104', 'p131106',                        # G80-G83
    'p131108', 'p131110', 'p131112', 'p131114', 'p131116',             # G90-G94
    'p131120', 'p131122', 'p131124', 'p131126',                        # G96-G99
]

# Report dates: all-cause dementia, Alzheimer's, vascular dementia,
# frontotemporal dementia, motor neurone disease, all-cause parkinsonism,
# Parkinson's, progressive supranuclear palsy, multiple system atrophy.
outcome_report_fields = [
    'p42018', 'p42020', 'p42022', 'p42024', 'p42028',
    'p42030', 'p42032', 'p42034', 'p42036',
]

# Covariates and bookkeeping columns carried through to the export.
covariate_fields = [
    'p22006',                                                       # genetic ethnic grouping
    'p21022',                                                       # age at recruitment
    'p22189',                                                       # Townsend deprivation index
    'p31',                                                          # sex
    'p22009_a1', 'p22009_a2', 'p22009_a3', 'p22009_a4', 'p22009_a5',  # genetic PCs 1-5
    'p34', 'p40000_i0', 'p20110_i0',
]

field_names = ['eid'] + first_occurrence_fields + outcome_report_fields + covariate_fields

# A repeated ID means a diagnosis field was typed twice instead of its neighbour.
assert len(field_names) == len(set(field_names)), "duplicate field id in field_names"

In [None]:
# Fetch the requested fields for all participants through a fresh dxdata
# connection.
df = participant.retrieve_fields(
    names=field_names,
    coding_values="replace",
    engine=dxdata.connect(),
)
# Variant restricted to the saved cohort (needs `cohort` to be loaded first):
#df = participant.retrieve_fields(names=field_names, filter_sql=cohort.sql, coding_values='replace',engine=dxdata.connect())

In [None]:
# Convert the retrieved result to a pandas DataFrame; all filtering below is
# done in pandas.
df_pandas = df.toPandas()
# Bare last expression: shows the frame as the cell output.
df_pandas

In [None]:
# Column dtypes and non-null counts for the full, unfiltered table.
df_pandas.info()

In [None]:
# Keep only participants with no recorded neurological diagnosis: every
# diagnosis-date column that was retrieved must be empty for a row to survive.
#
# The columns are picked by prefix instead of being typed out one by one, so
# the filter always covers exactly the date fields present in df_pandas:
#   p131* -> "Date Gxx first reported" fields
#   p420* -> dementia / MND / parkinsonism / PSP / MSA report dates
# None of the covariate columns (p22006, p21022, p22189, p31, p22009_*, p34,
# p40000_i0, p20110_i0) start with either prefix.
#
# A boolean column mask is used for the selection so that it still works if a
# column label happens to appear more than once in df_pandas.
diagnosis_date_mask = [
    str(column).startswith(("p131", "p420")) for column in df_pandas.columns
]
# Fail loudly rather than return everyone as "healthy" if nothing matched.
assert any(diagnosis_date_mask), "no diagnosis-date columns found in df_pandas"

no_diagnosis_recorded = df_pandas.loc[:, diagnosis_date_mask].isnull().all(axis=1)
healthy = df_pandas[no_diagnosis_recorded]
healthy.info()

In [None]:
#Illness of mother 20110/ Illness of father 20107/ 4 instances
# Remove the proxy cases for PD and AD; the saved cohort may need to be rebuilt each time the data is updated

In [None]:
# Load the saved cohort named 'AD_and_PD_proxies'; it must already exist in the
# project, and its SQL filter is used by the retrieve_fields() call below.
cohort = dxdata.load_cohort('AD_and_PD_proxies')

In [None]:
# Only the participant id is needed for the proxy cohort.
# NOTE(review): this rebinds `field_names`, replacing the long list defined
# earlier; re-running the earlier retrieve cell after this one would request
# only 'eid'.
field_names = ['eid']

In [None]:
# Fetch the requested fields for the members of the proxy cohort only, using
# the cohort's SQL as the row filter.
dfp = participant.retrieve_fields(
    names=field_names,
    filter_sql=cohort.sql,
    coding_values='replace',
    engine=dxdata.connect(),
)

In [None]:
# Convert the proxy cohort result to pandas.
proxies = dfp.toPandas()
# Bare last expression: shows the frame as the cell output.
proxies

In [None]:
# Participant ids of the proxy cohort, used below to drop them from `healthy`.
prox_remove = proxies['eid']

In [None]:
# Flag the rows whose eid appears in the proxy list, then keep the rest.
is_proxy = healthy['eid'].isin(prox_remove)
healthy_no_proxies = healthy.loc[~is_proxy]
healthy_no_proxies.info()

In [None]:
# Tally of the genetic ethnic grouping values (p22006) among the remaining
# controls; value_counts() leaves missing values out by default.
healthy_no_proxies['p22006'].value_counts()

In [None]:
# Keep only the rows that have a genetic ethnic grouping (p22006) recorded.
healthy_no_proxies_cauc = healthy_no_proxies.dropna(subset=['p22006'])
healthy_no_proxies_cauc.info()

In [None]:
# now we filter for age

In [None]:
# Controls aged 45 or older at recruitment (p21022).
aged_45_plus = healthy_no_proxies_cauc['p21022'].ge(45)
healthy_no_proxies_cauc_45 = healthy_no_proxies_cauc.loc[aged_45_plus]
healthy_no_proxies_cauc_45.info()

In [None]:
# Controls aged 60 or older at recruitment (p21022).
aged_60_plus = healthy_no_proxies_cauc['p21022'].ge(60)
healthy_no_proxies_cauc_60 = healthy_no_proxies_cauc.loc[aged_60_plus]
healthy_no_proxies_cauc_60.info()

## Done, but remember to remove related people when you have finished building your disease cohort

In [None]:
# Export layout for the 45+ controls: field id -> output column header, in
# output order.
# NOTE(review): the field description list calls p31 "Sex"; confirm that the
# GENETIC_SEX header is the intended label.
export_columns_45 = {
    'eid': 'eid',
    'p21022': 'AGE_OF_RECRUIT',
    'p22189': 'TOWNSEND',
    'p31': 'GENETIC_SEX',
    'p22009_a1': 'PC1',
    'p22009_a2': 'PC2',
    'p22009_a3': 'PC3',
    'p22009_a4': 'PC4',
    'p22009_a5': 'PC5',
    'p34': 'BIRTH_YEAR',
    'p22006': 'ETHNICITY',
    'p40000_i0': 'date_of_death',
}
# Select the columns in that order, then relabel them.
healthy_45 = healthy_no_proxies_cauc_45[list(export_columns_45)].rename(columns=export_columns_45)
healthy_45.head()

In [None]:
# Export layout for the 60+ controls: field id -> output column header, in
# output order.
# NOTE(review): the field description list calls p31 "Sex"; confirm that the
# GENETIC_SEX header is the intended label.
export_columns_60 = {
    'eid': 'eid',
    'p21022': 'AGE_OF_RECRUIT',
    'p22189': 'TOWNSEND',
    'p31': 'GENETIC_SEX',
    'p22009_a1': 'PC1',
    'p22009_a2': 'PC2',
    'p22009_a3': 'PC3',
    'p22009_a4': 'PC4',
    'p22009_a5': 'PC5',
    'p34': 'BIRTH_YEAR',
    'p22006': 'ETHNICITY',
    'p40000_i0': 'date_of_death',
}
# Select the columns in that order, then relabel them.
healthy_60 = healthy_no_proxies_cauc_60[list(export_columns_60)].rename(columns=export_columns_60)
healthy_60.head()

In [None]:
# Write the 45+ control table to the working directory, without the pandas
# row index.
healthy_45.to_csv("NDD_free_controls_45.csv", index=False)

In [None]:
# Write the 60+ control table to the working directory, without the pandas
# row index.
healthy_60.to_csv("NDD_free_controls_60.csv", index=False)

In [None]:
# Upload the 45+ control file to data/controls/ with the dx command-line tool.
# NOTE(review): no matching upload of NDD_free_controls_60.csv is visible in
# this part of the notebook -- confirm that file is uploaded as well.
!dx upload NDD_free_controls_45.csv --path data/controls/NDD_free_controls_45.csv