















































































































































































































# Generate NDD CSVs













































## Setup -- You need a Spark session!
(The default/smallest one should be fine.)

In [1]:
#setup - packages & env
import pyspark
import dxdata
import dxpy
# Reuse the active Spark context when one already exists. A bare
# `pyspark.SparkContext()` raises if this cell is re-run in the same kernel,
# because only one context may be active at a time; `getOrCreate()` returns the
# existing context instead, so the cell is safe to execute repeatedly.
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

In [2]:
# setup - locating the dataset - make sure the most recent dataset is the one matched here
# Both lookups glob on names starting with "app" inside folder "/".
database_match = dxpy.find_one_data_object(
    classname="database",
    name="app*",
    folder="/",
    name_mode="glob",
    describe=True,
)
dispensed_database_name = database_match["describe"]["name"]

dataset_match = dxpy.find_one_data_object(
    typename="Dataset",
    name="app*.dataset",
    folder="/",
    name_mode="glob",
)
dispensed_dataset_id = dataset_match["id"]

In [3]:
# Load the dataset located in the previous cell by its ID, then take its
# "participant" entry -- the object the field retrieval below is run against.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset["participant"]

# List of ICD10 codes / UKB Field IDs

See this Google Doc for more info: https://docs.google.com/document/d/1AebkQ-Nxrk63jhsDzZpn5QD-7EK4unsykHVj-saEm3U/edit?usp=sharing

In [4]:
# Pull down the fields we need.
# The groups below are concatenated in the same order the columns are renamed
# in the next-but-one cell: covariates, per-disease date fields, PCs, death date.
covariate_fields = ["eid", "p31", "p34", "p22189", "p22006", "p21022"]
ndd_date_fields = ["p42028", "p42020", "p42018", "p131042", "p42032", "p42022", "p42024"]
pc_fields = [f"p22009_a{component}" for component in range(1, 6)]
death_fields = ["p40000_i0"]

field_names = covariate_fields + ndd_date_fields + pc_fields + death_fields

# coding_values="replace" is passed through unchanged from the original call.
df = participant.retrieve_fields(
    names=field_names,
    coding_values="replace",
    engine=dxdata.connect(),
)

In [5]:
# Send to Pandas.
# `df` is rebound to the converted frame, so on a second run of this cell it is
# already a pandas DataFrame (which has no `toPandas`) and the bare call would
# raise AttributeError. Only convert while `df` still offers `toPandas`.
if hasattr(df, "toPandas"):
    df = df.toPandas()

In [6]:
# Human readable columns please.
# Maps each retrieved field name to the label used in the exported CSVs.
column_labels = {
    "eid": "ID",
    "p31": "GENETIC_SEX",
    "p34": "BIRTH_YEAR",
    "p22189": "TOWNSEND",
    "p22006": "ETHNICITY",
    "p21022": "AGE_OF_RECRUIT",
    "p42028": "ALS_DATE",
    "p42020": "AD_DATE",
    "p42018": "DEM_DATE",
    "p131042": "MS_DATE",
    "p42032": "PD_DATE",
    "p42022": "VAS_DATE",
    "p42024": "FTD_DATE",
    "p22009_a1": "PC1",
    "p22009_a2": "PC2",
    "p22009_a3": "PC3",
    "p22009_a4": "PC4",
    "p22009_a5": "PC5",
    "p40000_i0": "date_of_death",
}
df = df.rename(columns=column_labels)

df

Unnamed: 0,ID,GENETIC_SEX,BIRTH_YEAR,TOWNSEND,ETHNICITY,AGE_OF_RECRUIT,ALS_DATE,AD_DATE,DEM_DATE,MS_DATE,PD_DATE,VAS_DATE,FTD_DATE,PC1,PC2,PC3,PC4,PC5,date_of_death
0,2899510,Male,1949,-4.62,Caucasian,59,,,,,,,,-12.03870,5.65683,-3.148140,2.253370,3.377710,
1,1690778,Female,1957,-0.42,Caucasian,52,,,,,,,,-12.76440,4.47954,-3.146240,5.171100,0.421126,
2,4514243,Male,1946,-4.35,Caucasian,61,,,,,,,,-17.06920,3.13966,0.018041,-0.113465,3.599410,2012-06-03
3,1715646,Female,1950,-3.06,Caucasian,57,,,,,,,,-9.58077,3.08286,-2.366970,2.159380,9.835700,
4,2039968,Female,1956,8.22,Caucasian,51,,,,,,,,-10.86490,3.73989,-2.732980,4.830110,3.736770,2021-09-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502350,2096169,Female,1955,-1.34,,55,,,,,,,,-2.63018,7.20726,1.402660,-1.203050,-1.728970,
502351,2658168,Female,1956,-3.68,Caucasian,52,,,,,,,,-11.31880,2.94186,-0.853824,-1.185850,1.930060,
502352,5942859,Female,1942,-4.29,Caucasian,68,,,,,,,,-13.23150,3.77631,-4.230280,2.986840,0.342256,
502353,2873193,Female,1952,-4.41,Caucasian,56,,,,,,,,-11.40520,2.38744,1.454070,7.047850,3.866110,


In [7]:
# Write one CSV per NDD containing only the participants that have a date
# recorded for it. The case count and the `date` label go into the file name.
date = 'SEPT_2023'
ndd_list = ['ALS', 'AD', 'DEM', 'MS', 'PD', 'VAS', 'FTD']

# Columns written before and after the disease-specific date column.
leading_columns = ['ID', 'GENETIC_SEX', 'BIRTH_YEAR', 'TOWNSEND', 'ETHNICITY', 'AGE_OF_RECRUIT']
trailing_columns = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'date_of_death']

for ndd in ndd_list:
    date_column = f'{ndd}_DATE'
    export_columns = leading_columns + [date_column] + trailing_columns
    # Keep only rows whose date for this NDD is not missing.
    cases = df.loc[df[date_column].notna(), export_columns]
    n_cases = len(cases)
    print(ndd, n_cases)
    cases.to_csv(f'{ndd}_cases_n{n_cases}_{date}.csv', header=True, index=False)

ALS 753
AD 4447
DEM 10043
MS 2595
PD 4413
VAS 2182
FTD 319


In [None]:
# Sanity check: read the ALS export back in and display it.
import pandas as pd

# Rebuild the file name the same way the export loop does (non-missing ALS_DATE
# count plus the `date` label) instead of hard-coding the count, so this cell
# still finds the file after the dataset or `date` changes.
als_case_count = int(df['ALS_DATE'].notna().sum())
test = pd.read_csv(f'ALS_cases_n{als_case_count}_{date}.csv')
test

# Save all files

In [None]:
# Print the `dx upload` command for every exported case file; nothing is
# uploaded by this cell itself.
for ndd in ndd_list:
    # Number of participants with a non-missing date for this NDD, matching
    # the count embedded in the exported file name.
    n_cases = int(df[f'{ndd}_DATE'].notna().sum())
    file_name = f'{ndd}_cases_n{n_cases}_{date}.csv'
    print(f'dx upload {file_name} --path /data/NDD_cases/{file_name}')

In [None]:
%%bash
# Upload every exported case file to /data/NDD_cases/ under its own name.
# The file names (case counts and date label) are fixed here and must match
# the files the export loop actually wrote.
for csv_file in \
    ALS_cases_n753_SEPT_2023.csv \
    AD_cases_n4447_SEPT_2023.csv \
    DEM_cases_n10043_SEPT_2023.csv \
    MS_cases_n2595_SEPT_2023.csv \
    PD_cases_n4413_SEPT_2023.csv \
    VAS_cases_n2182_SEPT_2023.csv \
    FTD_cases_n319_SEPT_2023.csv
do
    dx upload "$csv_file" --path "/data/NDD_cases/$csv_file"
done

In [None]:
# Uploads the ALS case file once more, this time without a --path argument.
# NOTE(review): the %%bash cell above already uploads this same file to
# /data/NDD_cases/ -- this looks like a leftover duplicate; confirm it is
# still wanted (and where it lands without --path) before running.
! dx upload ALS_cases_n753_SEPT_2023.csv