# This notebook leads you through generating NDD free controls at age 60+ and various case cohorts.
### Use this notebook to update your controls groups when there are new data releases.

A few changes being made here:
1. Using current age for the cut-off. After discussion with eur collaborators, they feel UKB is updated often enough that we can fairly safely use censor age (this does introduce a small risk that some controls have a slight lag in their health information being updated on DNAnexus, so keep this in mind)
2. Less filtering for the controls - because we are not doing the same level of filtering in our cases, I will be only excluding things that are an NDD or in the icd10 code family
3. More filtering for cases - this can be the choice of the researcher but since we are using fairly healthy ndd-free controls, we need to unbias our cases (remove PD from AD cases etc.)

# notes on NDD cohort formation in the UKB
https://docs.google.com/document/d/1AebkQ-Nxrk63jhsDzZpn5QD-7EK4unsykHVj-saEm3U/edit?usp=sharing

# Files for the codes pulled in this notebook


https://docs.google.com/spreadsheets/d/1O2DqCu-tVGgV-SRvNAcS5rM3P0Dvlmt_G-CktUaxR9o/edit?usp=sharing

In [None]:
from datetime import datetime
import os 
import pandas as pd
import pyspark
import dxpy
import dxdata 
import numpy as np

In [None]:
# Reuse the running SparkContext when this cell is re-executed; calling
# pyspark.SparkContext() a second time in the same kernel raises a ValueError.
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

In [None]:
# Find the project's database object (glob 'app*' in the root folder) and keep its name.
database_record = dxpy.find_one_data_object(
    classname='database',
    name='app*',
    folder='/',
    name_mode='glob',
    describe=True,
)
dispensed_database_name = database_record['describe']['name']

# Find the Dataset object (glob 'app*.dataset' in the root folder) and keep its id.
dataset_record = dxpy.find_one_data_object(
    typename='Dataset',
    name='app*.dataset',
    folder='/',
    name_mode='glob',
)
dispensed_dataset_id = dataset_record['id']


In [None]:
# Load the dataset located above; all fields below are pulled from its 'participant' entity.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset['participant']

In [None]:
# going to filter on these fields for an overall 'healthy' NDD free control cohort (see file for codes pulled above)

## Date G12 first reported (spinal muscular atrophy and related syndromes),
## Date G20 first reported (parkinson's disease),
## Date G21 first reported (secondary parkinsonism),
## Date G22 first reported (parkinsonism in diseases classified elsewhere),
## Date G23 first reported (other degenerative diseases of basal ganglia),
## Date G24 first reported (dystonia),
## Date G25 first reported (other extrapyramidal and movement disorders),
## Date G30 first reported (alzheimer's disease), 
## Date G31 first reported (other degenerative diseases of nervous system, not elsewhere classified),
## Date G32 first reported (other degenerative disorders of nervous system in diseases classified elsewhere),
## Date G35 first reported (multiple sclerosis)
## Date G36 first reported (other acute disseminated demyelination), 
## Date G37 first reported (other demyelinating diseases of central nervous system),
## Date G45 first reported (transient cerebral ischaemic attacks and related syndromes),
## Date G46 first reported (vascular syndromes of brain in cerebrovascular diseases),
## Date of all cause dementia report,
## Date of alzheimer's disease report,
## Date of vascular dementia report, 
## Date of frontotemporal dementia report,
## Date of motor neurone disease report,
## Date of all cause parkinsonism report,
## Date of parkinson's disease report,
## Date of progressive supranuclear palsy report,
## Date of multiple system atrophy report,
## Genetic ethnic grouping,
## Age at recruitment,
## Townsend deprivation index at recruitment,
## Sex, 
## Genetic Principal components | Array 1,
## Genetic Principal components | Array 2,
## Genetic Principal components | Array 3,
## Genetic Principal components | Array 4,
## Genetic Principal components | Array 5,
## birth month,
## birth year
## Date of death | Instance 0
## Date of death | Instance 1

In [None]:
# "Date <ICD10 code> first reported" fields (G12 ... G46), in the order listed above.
first_reported_ids = (131016, 131022, 131024, 131026, 131028, 131030, 131032, 131036,
                      131038, 131040, 131042, 131044, 131046, 131056, 131058)

# "Date of ... report" outcome fields (all-cause dementia ... multiple system atrophy).
outcome_report_ids = (42018, 42020, 42022, 42024, 42028, 42030, 42032, 42034, 42036)

# Genetic grouping, recruitment age, Townsend index, sex, PCs 1-5, birth month/year, death dates.
covariate_fields = ['p22006', 'p21022', 'p22189', 'p31',
                    'p22009_a1', 'p22009_a2', 'p22009_a3', 'p22009_a4', 'p22009_a5',
                    'p52', 'p34', 'p40000_i0', 'p40000_i1']

field_names = (
    ['eid']
    + ['p' + str(field_id) for field_id in first_reported_ids]
    + ['p' + str(field_id) for field_id in outcome_report_ids]
    + covariate_fields
)
print(len(field_names))

In [None]:
# Pull the requested fields as a Spark DataFrame; coding_values='replace' is passed so coded
# values are returned as their labels (later cells compare against labels such as 'Male').
spark_engine = dxdata.connect(dialect="hive+pyspark")
df = participant.retrieve_fields(
    names=field_names,
    coding_values='replace',
    engine=spark_engine,
)

In [None]:
# Convert the Spark DataFrame to pandas; every later cell works on df_pandas.
df_pandas = df.toPandas()
df_pandas.head(5)

In [None]:
df_pandas.info()

# Create controls

In [None]:
# ALL co-conditions removed: a participant is kept only if every one of the
# NDD-related date fields below is empty.
ndd_exclusion_fields = [
    # "Date Gxx first reported" fields
    'p131016', 'p131022', 'p131024', 'p131026', 'p131028',
    'p131030', 'p131032', 'p131036', 'p131038', 'p131040',
    'p131042', 'p131044', 'p131046', 'p131056', 'p131058',
    # "Date of ... report" outcome fields
    'p42018', 'p42020', 'p42022', 'p42024', 'p42028',
    'p42030', 'p42032', 'p42034', 'p42036',
]
no_ndd_record = df_pandas[ndd_exclusion_fields].isnull().all(axis=1)
healthy = df_pandas[no_ndd_record]
healthy.info()

In [None]:
# Keep only Caucasians: rows where the genetic ethnic grouping field (p22006) is populated.
has_genetic_grouping = healthy['p22006'].notna()
healthy_cauc = healthy.loc[has_genetic_grouping]
healthy_cauc.info()

In [None]:
# Start with the controls DataFrame
controls = healthy_cauc.copy()

# Controls are coded PHENO = 1
controls['PHENO'] = 1

# Administrative censoring date for this release -- update on each data refresh
release_censor_date = pd.Timestamp('2025-06-01')

# Parse both death-date instances explicitly so date_censor is guaranteed to be
# datetime64; writing the raw field values into a datetime column can silently
# degrade it to object dtype and break the date arithmetic below.
death_i0 = pd.to_datetime(controls['p40000_i0'])
death_i1 = pd.to_datetime(controls['p40000_i1'])

# Censor at death when a death date exists (instance 1 takes precedence over
# instance 0, as before); otherwise censor at the release date.
controls['date_censor'] = death_i1.fillna(death_i0).fillna(release_censor_date)

# Construct birth date from year (p34) and month name (p52); the day defaults to the 1st
born = pd.to_datetime(controls['p34'].astype(str) + '-' + controls['p52'].astype(str), format='%Y-%B')

# Age at censoring in years, rounded to the nearest whole year
age_at_censor = controls['date_censor'] - born
controls['censor_age'] = (age_at_censor / np.timedelta64(1, 'D') / 365.25).round()

# Controls have no onset; '-9' is the placeholder (kept as a string)
controls['AAO'] = '-9'

# Preview
controls.head()

In [None]:
# Spot-check censor_age for rows whose censor date is not the default release date
# (i.e. rows where a death date replaced it).
censored_at_death = controls['date_censor'] != '2025-06-01'
test = controls[censored_at_death]
test.head()

In [None]:
# Source column -> output column, in output order.
control_column_map = {
    'eid': 'eid', 'PHENO': 'PHENO',
    'p31': 'sex', 'p21022': 'age_at_recruitment',
    'AAO': 'AAO', 'date_censor': 'date_censor', 'censor_age': 'censor_age',
    'p34': 'birth_year', 'p40000_i0': 'date_of_death1', 'p40000_i1': 'date_of_death2',
    'p22189': 'townsend',
    'p22009_a1': 'PC1', 'p22009_a2': 'PC2', 'p22009_a3': 'PC3', 'p22009_a4': 'PC4', 'p22009_a5': 'PC5',
}
controls = controls[list(control_column_map)].rename(columns=control_column_map)

# Recode sex: 'Male' -> 1, anything else -> 2
controls.loc[:, 'sex'] = np.where(controls['sex'] == 'Male', 1, 2)
print(len(controls))
controls.head()

In [None]:
#save if you want ALL controls
#controls.to_csv('NDD_healthy_controls_june_2025.txt', sep='\t', index=False)

In [None]:
# Keeps controls whose censor_age is strictly greater than 60.
# NOTE(review): censor_age was rounded to the nearest year upstream, so this drops
# everyone whose rounded age is exactly 60. If the intended cohort is "at least 60",
# both the rounding and this comparison need revisiting -- confirm before changing.
older_than_60 = controls['censor_age'].gt(60)
controls_60 = controls.loc[older_than_60]
print(len(controls_60))
controls_60.info()

In [None]:
# Write the 60+ controls as a tab-separated file without the pandas index.
controls_60.to_csv(
    'NDD_healthy_controls_current_age_60_june_2025.txt',
    index=False,
    sep='\t',
)

In [None]:
#!dx upload NDD_healthy_controls.txt
!dx upload NDD_healthy_controls_current_age_60_june_2025.txt --path data/june_2025_controls/NDD_healthy_controls_current_age_60_june_2025.txt

# controls done, now do cases

# Alzheimers 
## removing - ALS, FTD, parkinsonism, PD ,PSP, MSA

In [None]:
# Everyone with a populated "Date of alzheimer's disease report" field (p42020).
has_ad_report = df_pandas['p42020'].notna()
AD = df_pandas.loc[has_ad_report]
print(len(AD))
AD.head()

In [None]:
# Only select Caucasians: genetic ethnic grouping (p22006) must be populated.
ad_has_grouping = AD['p22006'].notna()
AD_cauc = AD.loc[ad_has_grouping]
AD_cauc.head()

In [None]:
#p42024 - ftd
#p42028 - als
#p42030 - parkinsonism
#42032 - pd
#42034 - psp
#42036 - msa

In [None]:
# Filter out AD cases that also carry any of the NDDs listed above
# (FTD, ALS, parkinsonism, PD, PSP, MSA).
ad_exclusion_fields = ['p42024', 'p42028', 'p42030', 'p42032', 'p42034', 'p42036']
ad_without_other_ndd = AD_cauc[ad_exclusion_fields].isnull().all(axis=1)
AD_filt = AD_cauc[ad_without_other_ndd]
AD_filt.info()

In [None]:
# Drop rows whose diagnosis date is the 'Date is unknown' placeholder before parsing.
# The dementia, vascular, ALS and MS cohorts in this notebook already do this; without
# it the strict pd.to_datetime below raises as soon as a release contains the placeholder.
# Copying after the filter keeps the column assignments off a slice of AD_filt.
ad_date_known = AD_filt['p42020'] != 'Date is unknown'
ad_cases = AD_filt[ad_date_known].copy()

# Cases are coded PHENO = 2 and are censored at their diagnosis date
ad_cases['PHENO'] = 2
ad_cases['date_censor'] = pd.to_datetime(ad_cases['p42020'])

# Build birthdate from year (p34) and month name (p52); the day defaults to the 1st
born = pd.to_datetime(ad_cases['p34'].astype(str) + '-' + ad_cases['p52'].astype(str), format='%Y-%B')

# Age at censoring in years, rounded to the nearest whole year
age_at_censor = ad_cases['date_censor'] - born
ad_cases['censor_age'] = (age_at_censor / np.timedelta64(1, 'D') / 365.25).round()

# Age at onset (AAO) from the same diagnosis date; unparseable values become NaN
onset_date = pd.to_datetime(ad_cases['p42020'], format='%Y-%m-%d', errors='coerce')
age_at_onset = onset_date - born
ad_cases['AAO'] = (age_at_onset / np.timedelta64(1, 'D') / 365.25).round()

# Preview
print(len(ad_cases))
ad_cases.head()


In [None]:
# Source column -> output column, in output order.
ad_column_map = {
    'eid': 'eid', 'PHENO': 'PHENO', 'p42020': 'AD_date',
    'p31': 'sex', 'p21022': 'age_at_recruitment',
    'AAO': 'AAO', 'date_censor': 'date_censor', 'censor_age': 'censor_age',
    'p34': 'birth_year', 'p40000_i0': 'date_of_death1', 'p40000_i1': 'date_of_death2',
    'p22189': 'townsend',
    'p22009_a1': 'PC1', 'p22009_a2': 'PC2', 'p22009_a3': 'PC3', 'p22009_a4': 'PC4', 'p22009_a5': 'PC5',
}
ad_cases = ad_cases[list(ad_column_map)].rename(columns=ad_column_map)

# Recode sex: 'Male' -> 1, anything else -> 2
ad_cases.loc[:, 'sex'] = np.where(ad_cases['sex'] == 'Male', 1, 2)
print(len(ad_cases))
ad_cases.head()

In [None]:
# Order the AD cases by diagnosis date (ascending) and display them.
ad_cases = ad_cases.sort_values('AD_date', ascending=True)
ad_cases

In [None]:
# Write the AD cases as a tab-separated file without the pandas index.
ad_cases.to_csv(
    'AD_cases_june_2025.txt',
    index=False,
    sep='\t',
)

In [None]:
!dx upload AD_cases_june_2025.txt --path data/june_2025_NDDs/AD_cases_june_2025.txt

# PD all - this is PD keeping all cause dementia codes - will remove ftd, vad, ad, als, psp, msa

In [None]:
# Everyone with a populated "Date of parkinson's disease report" field (p42032).
has_pd_report = df_pandas['p42032'].notna()
PD_all = df_pandas.loc[has_pd_report]
PD_all.head()

In [None]:
# Keep only rows where the genetic ethnic grouping field (p22006) is populated.
pd_has_grouping = PD_all['p22006'].notna()
PD_cauc = PD_all.loc[pd_has_grouping]
PD_cauc.head()

In [None]:
#p42024 - ftd
#p42028 - als
#42034 - psp
#42036 - msa
#42022 - vad
#42020 - ad

In [None]:
# Drop PD cases that also have FTD, ALS, PSP, MSA, vascular dementia or AD recorded.
# All-cause dementia (p42018) is deliberately NOT excluded in this cohort.
pd_exclusion_fields = ['p42024', 'p42028', 'p42034', 'p42036', 'p42022', 'p42020']
pd_without_other_ndd = PD_cauc[pd_exclusion_fields].isnull().all(axis=1)
PD_filt = PD_cauc[pd_without_other_ndd]
PD_filt.info()

In [None]:
# Drop rows whose diagnosis date is the 'Date is unknown' placeholder before parsing.
# The dementia, vascular, ALS and MS cohorts in this notebook already do this; without
# it the strict pd.to_datetime below raises as soon as a release contains the placeholder.
# Copying after the filter keeps the column assignments off a slice of PD_filt.
pd_date_known = PD_filt['p42032'] != 'Date is unknown'
pd_cases = PD_filt[pd_date_known].copy()

# Cases are coded PHENO = 2 and are censored at their diagnosis date
pd_cases['PHENO'] = 2
pd_cases['date_censor'] = pd.to_datetime(pd_cases['p42032'])

# Build birthdate from year (p34) and month name (p52); the day defaults to the 1st
born = pd.to_datetime(pd_cases['p34'].astype(str) + '-' + pd_cases['p52'].astype(str), format='%Y-%B')

# Age at censoring in years, rounded to the nearest whole year
age_at_censor = pd_cases['date_censor'] - born
pd_cases['censor_age'] = (age_at_censor / np.timedelta64(1, 'D') / 365.25).round()

# Age at onset from the diagnosis date (p42032); unparseable values become NaN
onset_date = pd.to_datetime(pd_cases['p42032'], format='%Y-%m-%d', errors='coerce')
age_at_onset = onset_date - born
pd_cases['AAO'] = (age_at_onset / np.timedelta64(1, 'D') / 365.25).round()

# Preview result
print(len(pd_cases))
pd_cases.head()


In [None]:
# Source column -> output column, in output order.
pd_column_map = {
    'eid': 'eid', 'PHENO': 'PHENO', 'p42032': 'PD_date',
    'p31': 'sex', 'p21022': 'age_at_recruitment',
    'AAO': 'AAO', 'date_censor': 'date_censor', 'censor_age': 'censor_age',
    'p34': 'birth_year', 'p40000_i0': 'date_of_death1', 'p40000_i1': 'date_of_death2',
    'p22189': 'townsend',
    'p22009_a1': 'PC1', 'p22009_a2': 'PC2', 'p22009_a3': 'PC3', 'p22009_a4': 'PC4', 'p22009_a5': 'PC5',
}
pd_cases = pd_cases[list(pd_column_map)].rename(columns=pd_column_map)

# Recode sex: 'Male' -> 1, anything else -> 2
pd_cases.loc[:, 'sex'] = np.where(pd_cases['sex'] == 'Male', 1, 2)
pd_cases.head()

In [None]:
# Order the PD cases by diagnosis date (ascending) and display them.
pd_cases = pd_cases.sort_values('PD_date', ascending=True)
pd_cases

In [None]:
# Write the PD (dementia allowed) cases as a tab-separated file without the pandas index.
pd_cases.to_csv(
    'PD_cases_with_dementia_june_2025.txt',
    index=False,
    sep='\t',
)

In [None]:
!dx upload PD_cases_with_dementia_june_2025.txt --path data/june_2025_NDDs/PD_cases_with_dementia_june_2025.txt

# PD strict - this is PD removing ftd, vad, ad, als, psp, msa AND all cause dem

In [None]:
#42024 - ftd
#42028 - als
#42032 - pd
#42034 - psp
#42036 - msa
#42022 - vascular
#42018 - all cause dem
#42020 ad

In [None]:
# Everyone with a populated "Date of parkinson's disease report" field (p42032).
has_pd_report = df_pandas['p42032'].notna()
PD_strict = df_pandas.loc[has_pd_report]
PD_strict.info()

In [None]:
# Strict PD: drop anyone with FTD, ALS, PSP, MSA, vascular dementia, AD or
# all-cause dementia recorded.
# Note: this rebinds PD_filt, replacing the frame built in the "PD all" section.
strict_exclusion_fields = ['p42024', 'p42028', 'p42034', 'p42036', 'p42022', 'p42020', 'p42018']
pd_without_ndd_or_dementia = PD_strict[strict_exclusion_fields].isnull().all(axis=1)
PD_filt = PD_strict[pd_without_ndd_or_dementia]
PD_filt.info()

In [None]:
# Keep only rows where the genetic ethnic grouping field (p22006) is populated.
strict_has_grouping = PD_filt['p22006'].notna()
PD_cauc = PD_filt.loc[strict_has_grouping]
PD_cauc.info()

In [None]:
# Drop rows whose diagnosis date is the 'Date is unknown' placeholder before parsing.
# The dementia, vascular, ALS and MS cohorts in this notebook already do this; without
# it the strict pd.to_datetime below raises as soon as a release contains the placeholder.
# Copying after the filter keeps the column assignments off a slice of PD_cauc.
pd_date_known = PD_cauc['p42032'] != 'Date is unknown'
pd_cases = PD_cauc[pd_date_known].copy()

# Cases are coded PHENO = 2 and are censored at their diagnosis date
pd_cases['PHENO'] = 2
pd_cases['date_censor'] = pd.to_datetime(pd_cases['p42032'])

# Build birthdate from year (p34) and month name (p52); the day defaults to the 1st
born = pd.to_datetime(pd_cases['p34'].astype(str) + '-' + pd_cases['p52'].astype(str), format='%Y-%B')

# Age at censoring in years, rounded to the nearest whole year
age_at_censor = pd_cases['date_censor'] - born
pd_cases['censor_age'] = (age_at_censor / np.timedelta64(1, 'D') / 365.25).round()

# Age at onset from the diagnosis date (p42032); unparseable values become NaN
onset_date = pd.to_datetime(pd_cases['p42032'], format='%Y-%m-%d', errors='coerce')
age_at_onset = onset_date - born
pd_cases['AAO'] = (age_at_onset / np.timedelta64(1, 'D') / 365.25).round()

# Preview result
print(len(pd_cases))
pd_cases.head()


In [None]:
# Source column -> output column, in output order.
pd_strict_column_map = {
    'eid': 'eid', 'PHENO': 'PHENO', 'p42032': 'PD_date',
    'p31': 'sex', 'p21022': 'age_at_recruitment',
    'AAO': 'AAO', 'date_censor': 'date_censor', 'censor_age': 'censor_age',
    'p34': 'birth_year', 'p40000_i0': 'date_of_death1', 'p40000_i1': 'date_of_death2',
    'p22189': 'townsend',
    'p22009_a1': 'PC1', 'p22009_a2': 'PC2', 'p22009_a3': 'PC3', 'p22009_a4': 'PC4', 'p22009_a5': 'PC5',
}
pd_cases = pd_cases[list(pd_strict_column_map)].rename(columns=pd_strict_column_map)

# Recode sex: 'Male' -> 1, anything else -> 2
pd_cases.loc[:, 'sex'] = np.where(pd_cases['sex'] == 'Male', 1, 2)
pd_cases.head()

In [None]:
# Order the strict PD cases by diagnosis date (ascending) and display them.
pd_cases = pd_cases.sort_values('PD_date', ascending=True)
pd_cases

In [None]:
# Write the strict (no dementia) PD cases as a tab-separated file without the pandas index.
pd_cases.to_csv(
    'PD_cases_strict_no_dementia_june_2025.txt',
    index=False,
    sep='\t',
)

In [None]:
!dx upload PD_cases_strict_no_dementia_june_2025.txt --path data/june_2025_NDDs/PD_cases_strict_no_dementia_june_2025.txt

# all dementia

In [None]:
#p42024 - ftd
#p42028 - als
#p42030 - parkinsonism
#42032 - pd
#42034 - psp
#42036 - msa
#42022 - vascular
# 42018 - all cause dem
# p42020 ad

In [None]:
# Everyone with a populated "Date of all cause dementia report" field (p42018).
has_dementia_report = df_pandas['p42018'].notna()
dem = df_pandas.loc[has_dementia_report]
#dem.info()

In [None]:
# Keep only rows where the genetic ethnic grouping field (p22006) is populated.
dem_has_grouping = dem['p22006'].notna()
dem_cauc = dem.loc[dem_has_grouping]
#dem_cauc.info()

In [None]:
# this is a more general category - keeping all cases of dementia despite comorbidities vs healthy controls
dem_filt = dem_cauc
#dem_filt.info()

In [None]:
dem_filt.head()

In [None]:
# Take an independent copy of the rows whose dementia date is not the
# 'Date is unknown' placeholder.
dementia_date_known = dem_filt['p42018'].ne('Date is unknown')
dem_cases = dem_filt.loc[dementia_date_known].copy()

# Cases are coded PHENO = 2 and censored at the dementia report date
dem_cases['PHENO'] = 2
dem_cases['date_censor'] = pd.to_datetime(dem_cases['p42018'])

# Birthdate from year (p34) and month name (p52); the day defaults to the 1st
birth_label = dem_cases['p34'].astype(str) + '-' + dem_cases['p52'].astype(str)
born = pd.to_datetime(birth_label, format='%Y-%B')

one_day = np.timedelta64(1, 'D')

# Age at censoring in years, rounded to the nearest whole year
dem_cases['censor_age'] = ((dem_cases['date_censor'] - born) / one_day / 365.25).round()

# Age at onset from p42018; unparseable dates become NaN
onset_date = pd.to_datetime(dem_cases['p42018'], format='%Y-%m-%d', errors='coerce')
dem_cases['AAO'] = ((onset_date - born) / one_day / 365.25).round()

# Preview result
print(len(dem_cases))
dem_cases.head()


In [None]:
# Source column -> output column, in output order.
dem_column_map = {
    'eid': 'eid', 'PHENO': 'PHENO', 'p42018': 'Dementia_date',
    'p31': 'sex', 'p21022': 'age_at_recruitment',
    'AAO': 'AAO', 'date_censor': 'date_censor', 'censor_age': 'censor_age',
    'p34': 'birth_year', 'p40000_i0': 'date_of_death1', 'p40000_i1': 'date_of_death2',
    'p22189': 'townsend',
    'p22009_a1': 'PC1', 'p22009_a2': 'PC2', 'p22009_a3': 'PC3', 'p22009_a4': 'PC4', 'p22009_a5': 'PC5',
}
dem_cases = dem_cases[list(dem_column_map)].rename(columns=dem_column_map)

# Recode sex: 'Male' -> 1, anything else -> 2
dem_cases.loc[:, 'sex'] = np.where(dem_cases['sex'] == 'Male', 1, 2)
dem_cases.head()

In [None]:
# Order the dementia cases by report date (ascending) and display them.
dem_cases = dem_cases.sort_values('Dementia_date', ascending=True)
dem_cases

In [None]:
# Write all dementia cases as a tab-separated file without the pandas index.
dem_cases.to_csv(
    'dementia_cases_ALL_june_2025.txt',
    index=False,
    sep='\t',
)

In [None]:
!dx upload dementia_cases_ALL_june_2025.txt --path data/june_2025_NDDs/dementia_cases_ALL_june_2025.txt

# vascular - remove AD, parkinsonism, pd, ftd, msa, psp, als

In [None]:
#p42024 - ftd
#p42028 - als
#42032 - pd
#p42030 - parkinsonism
#42034 - psp
#42036 - msa
#42022 - vascular
# 42018 - all cause dem
# p42020 ad

In [None]:
# Everyone with a populated "Date of vascular dementia report" field (p42022).
has_vascular_report = df_pandas['p42022'].notna()
vas = df_pandas.loc[has_vascular_report]
#vas.info()

In [None]:
# Drop vascular dementia cases that also have PD, parkinsonism, AD, FTD, MSA, PSP or ALS recorded.
vas_exclusion_fields = ['p42032', 'p42030', 'p42020', 'p42024', 'p42036', 'p42034', 'p42028']
vas_without_other_ndd = vas[vas_exclusion_fields].isnull().all(axis=1)
vas_filt = vas[vas_without_other_ndd]
vas_filt.info()

In [None]:
# Keep only rows where the genetic ethnic grouping field (p22006) is populated.
vas_has_grouping = vas_filt['p22006'].notna()
vas_cauc = vas_filt.loc[vas_has_grouping]
vas_cauc.info()

In [None]:
# Take an independent copy of the rows whose vascular dementia date is not the
# 'Date is unknown' placeholder.
vascular_date_known = vas_cauc['p42022'].ne('Date is unknown')
vas_cases = vas_cauc.loc[vascular_date_known].copy()

# Cases are coded PHENO = 2 and censored at the diagnosis date
vas_cases['PHENO'] = 2
vas_cases['date_censor'] = pd.to_datetime(vas_cases['p42022'])

# Birthdate from year (p34) and month name (p52); the day defaults to the 1st
birth_label = vas_cases['p34'].astype(str) + '-' + vas_cases['p52'].astype(str)
born = pd.to_datetime(birth_label, format='%Y-%B')

one_day = np.timedelta64(1, 'D')

# Age at censoring in years, rounded to the nearest whole year
vas_cases['censor_age'] = ((vas_cases['date_censor'] - born) / one_day / 365.25).round()

# Age at onset from p42022; unparseable dates become NaN
onset_date = pd.to_datetime(vas_cases['p42022'], format='%Y-%m-%d', errors='coerce')
vas_cases['AAO'] = ((onset_date - born) / one_day / 365.25).round()

# Preview
print(len(vas_cases))
vas_cases.head()


In [None]:
# Source column -> output column, in output order.
vas_column_map = {
    'eid': 'eid', 'PHENO': 'PHENO', 'p42022': 'vascular_dem_date',
    'p31': 'sex', 'p21022': 'age_at_recruitment',
    'AAO': 'AAO', 'date_censor': 'date_censor', 'censor_age': 'censor_age',
    'p34': 'birth_year', 'p40000_i0': 'date_of_death1', 'p40000_i1': 'date_of_death2',
    'p22189': 'townsend',
    'p22009_a1': 'PC1', 'p22009_a2': 'PC2', 'p22009_a3': 'PC3', 'p22009_a4': 'PC4', 'p22009_a5': 'PC5',
}
vas_cases = vas_cases[list(vas_column_map)].rename(columns=vas_column_map)

# Recode sex: 'Male' -> 1, anything else -> 2
vas_cases.loc[:, 'sex'] = np.where(vas_cases['sex'] == 'Male', 1, 2)
vas_cases.head()

In [None]:
# Order the vascular dementia cases by diagnosis date (ascending) and display them.
vas_cases = vas_cases.sort_values('vascular_dem_date', ascending=True)
vas_cases

In [None]:
# Write the vascular dementia cases as a tab-separated file without the pandas index.
vas_cases.to_csv(
    'vascular_dementia_cases_june_2025.txt',
    index=False,
    sep='\t',
)

In [None]:
! dx upload vascular_dementia_cases_june_2025.txt --path data/june_2025_NDDs/vascular_dementia_cases_june_2025.txt

# als only has 753 people - potentially skip for now?

In [None]:
#p42024 - ftd
#p42028 - als
#p42030 - parkinsonism
#42032 - pd
#42034 - psp
#42036 - msa
#42022 - vascular
# 42018 - all cause dem
# p42020 ad

In [None]:
# Everyone with a populated "Date of motor neurone disease report" field (p42028).
has_als_report = df_pandas['p42028'].notna()
als = df_pandas.loc[has_als_report]
print(len(als))
#als.info()

In [None]:
# Drop ALS cases that also have FTD, PD, PSP, MSA, vascular dementia, AD,
# all-cause dementia or parkinsonism recorded.
als_exclusion_fields = ['p42024', 'p42032', 'p42034', 'p42036',
                        'p42022', 'p42020', 'p42018', 'p42030']
als_without_other_ndd = als[als_exclusion_fields].isnull().all(axis=1)
als_filt = als[als_without_other_ndd]
als_filt.info()

In [None]:
# Keep only rows where the genetic ethnic grouping field (p22006) is populated.
als_has_grouping = als_filt['p22006'].notna()
als_cauc = als_filt.loc[als_has_grouping]
als_cauc.info()

In [None]:
# Take an independent copy of the rows whose ALS date is not the
# 'Date is unknown' placeholder.
als_date_known = als_cauc['p42028'].ne('Date is unknown')
als_cases = als_cauc.loc[als_date_known].copy()

# Cases are coded PHENO = 2 and censored at the diagnosis date
als_cases['PHENO'] = 2
als_cases['date_censor'] = pd.to_datetime(als_cases['p42028'])

# Birthdate from year (p34) and month name (p52); the day defaults to the 1st
birth_label = als_cases['p34'].astype(str) + '-' + als_cases['p52'].astype(str)
born = pd.to_datetime(birth_label, format='%Y-%B')

one_day = np.timedelta64(1, 'D')

# Age at censoring in years, rounded to the nearest whole year
als_cases['censor_age'] = ((als_cases['date_censor'] - born) / one_day / 365.25).round()

# Age at onset from p42028; unparseable dates become NaN
onset_date = pd.to_datetime(als_cases['p42028'], format='%Y-%m-%d', errors='coerce')
als_cases['AAO'] = ((onset_date - born) / one_day / 365.25).round()

# Preview
print(len(als_cases))
als_cases.head()

In [None]:
# Source column -> output column, in output order.
als_column_map = {
    'eid': 'eid', 'PHENO': 'PHENO', 'p42028': 'ALS_date',
    'p31': 'sex', 'p21022': 'age_at_recruitment',
    'AAO': 'AAO', 'date_censor': 'date_censor', 'censor_age': 'censor_age',
    'p34': 'birth_year', 'p40000_i0': 'date_of_death1', 'p40000_i1': 'date_of_death2',
    'p22189': 'townsend',
    'p22009_a1': 'PC1', 'p22009_a2': 'PC2', 'p22009_a3': 'PC3', 'p22009_a4': 'PC4', 'p22009_a5': 'PC5',
}
als_cases = als_cases[list(als_column_map)].rename(columns=als_column_map)

# Recode sex: 'Male' -> 1, anything else -> 2
als_cases.loc[:, 'sex'] = np.where(als_cases['sex'] == 'Male', 1, 2)
als_cases.head()

In [None]:
# Order the ALS cases by diagnosis date (ascending) and display them.
als_cases = als_cases.sort_values('ALS_date', ascending=True)
als_cases

In [None]:
# Write the ALS cases as a tab-separated file without the pandas index.
als_cases.to_csv(
    'ALS_cases_june_2025.txt',
    index=False,
    sep='\t',
)

In [None]:
!dx upload ALS_cases_june_2025.txt --path data/june_2025_NDDs/ALS_cases_june_2025.txt

# MS p131042

In [None]:
# Everyone with a populated "Date G35 first reported (multiple sclerosis)" field (p131042).
has_ms_report = df_pandas['p131042'].notna()
ms = df_pandas.loc[has_ms_report]
ms.info()

In [None]:
# Drop MS cases that also have FTD, PD, PSP, MSA, vascular dementia, AD,
# all-cause dementia or parkinsonism recorded.
# NOTE(review): ALS (p42028) is not in this exclusion list, although the ALS cohort
# excludes every other outcome field -- confirm whether that is intentional.
ms_exclusion_fields = ['p42024', 'p42032', 'p42034', 'p42036',
                       'p42022', 'p42020', 'p42018', 'p42030']
ms_without_other_ndd = ms[ms_exclusion_fields].isnull().all(axis=1)
ms_filt = ms[ms_without_other_ndd]
ms_filt.info()

In [None]:
# Keep only rows where the genetic ethnic grouping field (p22006) is populated.
ms_has_grouping = ms_filt['p22006'].notna()
ms_cauc = ms_filt.loc[ms_has_grouping]
ms_cauc.info()

In [None]:
# Take an independent copy of the rows whose MS date is a real date, i.e. neither
# of the two placeholder labels below.
ms_date = ms_cauc['p131042']
ms_date_usable = (
    ms_date.ne('Date is unknown')
    & ms_date.ne("Code has event date matching participant's date of birth")
)
ms_cases = ms_cauc.loc[ms_date_usable].copy()

# Cases are coded PHENO = 2 and censored at the first-reported date
ms_cases['PHENO'] = 2
ms_cases['date_censor'] = pd.to_datetime(ms_cases['p131042'])

# Birthdate from year (p34) and month name (p52); the day defaults to the 1st
birth_label = ms_cases['p34'].astype(str) + '-' + ms_cases['p52'].astype(str)
born = pd.to_datetime(birth_label, format='%Y-%B')

one_day = np.timedelta64(1, 'D')

# Age at censoring in years, rounded to the nearest whole year
ms_cases['censor_age'] = ((ms_cases['date_censor'] - born) / one_day / 365.25).round()

# Age at onset from p131042; unparseable dates become NaN
onset_date = pd.to_datetime(ms_cases['p131042'], format='%Y-%m-%d', errors='coerce')
ms_cases['AAO'] = ((onset_date - born) / one_day / 365.25).round()

# Preview
print(len(ms_cases))
ms_cases.head()


In [None]:
# Source column -> output column, in output order.
# 'censor_age' keeps its name here (it was previously exported as 'current_age'),
# so the MS file has the same header as the control and other case cohort files.
ms_column_map = {
    'eid': 'eid', 'PHENO': 'PHENO', 'p131042': 'MS_date',
    'p31': 'sex', 'p21022': 'age_at_recruitment',
    'AAO': 'AAO', 'date_censor': 'date_censor', 'censor_age': 'censor_age',
    'p34': 'birth_year', 'p40000_i0': 'date_of_death1', 'p40000_i1': 'date_of_death2',
    'p22189': 'townsend',
    'p22009_a1': 'PC1', 'p22009_a2': 'PC2', 'p22009_a3': 'PC3', 'p22009_a4': 'PC4', 'p22009_a5': 'PC5',
}
ms_cases = ms_cases[list(ms_column_map)].rename(columns=ms_column_map)

# Recode sex: 'Male' -> 1, anything else -> 2
ms_cases.loc[:, 'sex'] = np.where(ms_cases['sex'] == 'Male', 1, 2)
ms_cases.head()

In [None]:
# Order the MS cases by first-reported date (ascending) and display them.
ms_cases = ms_cases.sort_values('MS_date', ascending=True)
ms_cases

In [None]:
# Write the MS cases as a tab-separated file without the pandas index.
ms_cases.to_csv(
    'MS_cases_june_2025.txt',
    index=False,
    sep='\t',
)

In [None]:
!dx upload MS_cases_june_2025.txt --path data/june_2025_NDDs/MS_cases_june_2025.txt