## Selection of participants for puberty classification

**Input:**

- ABCD pubertal development data (youth-reported) (abcd_ypdms01.txt)
        
**Output:**

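- **relevantMenarcheData2year.csv:** female participants at the 2-year follow-up whose menarche answer (`pds_f5_y`) is 1 (No) or 4 (Yes)
- **meta_df_menarche.csv:** per-event summary table (number of datapoints, mean age, sex counts, menarche answer counts)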

In [None]:
import pandas as pd
import numpy as np
import seaborn as sn
from matplotlib import pyplot as plt 
import os

In [None]:
os.getcwd()
os.chdir('ABCDTabular\\')

# Load the youth-reported pubertal development table.
# The separator is a regular expression, so it is written as a raw string:
# a plain '\s+' contains an invalid escape sequence and triggers a warning
# on recent Python versions.
abcd_y_pds = pd.read_csv('abcd_ypdms01.txt', sep=r'\s+')

In [None]:
## save and delete fulltext variable names
# The first row is kept aside in dicttPDS and then removed (by its index
# label 0) so that it is not treated as a data row further down.
dicttPDS = abcd_y_pds.iloc[0, :]
abcd_y_pds = abcd_y_pds.drop(index=0)
dicttPDS

In [None]:
## start creating df for metainformation about sample
# One row per distinct measurement event found in the data.
meta_df = pd.DataFrame({'eventname': abcd_y_pds['eventname'].unique()})
meta_df

In [None]:
## count datapoints in each measurement
# Count the rows of every event in the same order as meta_df['eventname'],
# so each count lines up with its event regardless of how many events the
# file contains or in which order they first appear.
datapoints = [
    np.count_nonzero(abcd_y_pds.eventname == event)
    for event in meta_df['eventname']
]
meta_df['datapoints'] = datapoints

In [None]:
## get mean age at each measurement
# interview_age must be numeric before it can be averaged.
abcd_y_pds['interview_age'] = pd.to_numeric(abcd_y_pds['interview_age'])

# Per-event mean, divided by 12 (presumably months -> years; confirm the
# unit of interview_age against the data dictionary).
meanAge = (
    abcd_y_pds.groupby('eventname')['interview_age']
    .mean()
    .div(12)
    .reset_index(name='meanAge')
)
meta_df = pd.merge(meta_df, meanAge, on='eventname', how='outer')

In [None]:
## count nr of female subjects in each measurement
# Rows per (event, sex) pair; every row whose sex is not 'M' is kept and its
# count is stored under the column name 'female'.
fcount = abcd_y_pds.groupby(['eventname', 'sex']).size().reset_index(name='female')
fcount = fcount.loc[fcount['sex'].ne('M')]
meta_df = pd.merge(meta_df, fcount[['eventname', 'female']], on='eventname', how='outer')

In [None]:
# Male count per event = all datapoints minus the female count.
meta_df['male'] = meta_df['datapoints'].sub(meta_df['female'])

In [None]:
# Display the summary table built so far.
meta_df

### PDS coding
Coding of the menarche item (`pds_f5_y`): 4 = Yes; 1 = No; 999 = I don't know; 777 = Refuse to answer

In [None]:
## replace coding in menarche variable to string from float
# Inspection only: the result is not the cell's last expression, so it is
# neither displayed nor stored.
abcd_y_pds['pds_f5_y'].unique()
# Map each float answer code to its integer string form, e.g. 4.0 -> '4'.
d = {code: str(int(code)) for code in (999.0, 4.0, 1.0, 777.0)}
abcd_y_pds['pds_f5_y'] = abcd_y_pds['pds_f5_y'].replace(d)

In [None]:
## count menarche distribution in each measurement
# Number of rows for every (event, menarche answer) combination.
mcount = abcd_y_pds.groupby(['eventname', 'pds_f5_y']).size().reset_index(name='count')

# One table per answer code. rename() returns a new frame here instead of
# modifying a filtered slice of mcount in place, which avoids
# SettingWithCopyWarning.
count1 = mcount[mcount.pds_f5_y == '1'].rename(columns={'count': 'pre-menarche'})
count4 = mcount[mcount.pds_f5_y == '4'].rename(columns={'count': 'post-menarche'})
count777 = mcount[mcount.pds_f5_y == '777'].rename(columns={'count': 'refused-answer'})
count999 = mcount[mcount.pds_f5_y == '999'].rename(columns={'count': 'dont-know'})

# Outer merges keep events for which a given answer never occurred.
meta_df = meta_df.merge(count1[['eventname', 'pre-menarche']], on='eventname', how='outer')
meta_df = meta_df.merge(count4[['eventname', 'post-menarche']], on='eventname', how='outer')
meta_df = meta_df.merge(count777[['eventname', 'refused-answer']], on='eventname', how='outer')
meta_df = meta_df.merge(count999[['eventname', 'dont-know']], on='eventname', how='outer')

# Assign the filled column back: calling fillna(inplace=True) on a selected
# column is chained assignment and does not update meta_df under pandas
# Copy-on-Write.
meta_df['dont-know'] = meta_df['dont-know'].fillna(0)

In [None]:
## Table of info about data
meta_df.set_index('eventname', inplace = True)
meta_df.head()
# eventname now lives in the index, so the index has to be written out;
# with index=False the saved table would lose the event labels entirely.
meta_df.to_csv('..\\processedData\\meta_df_menarche.csv', index = True)

In [None]:
# Restrict to rows whose sex is 'F'.
menarche_df = abcd_y_pds.loc[abcd_y_pds['sex'] == 'F']

# Of those, keep the 2-year follow-up measurement.
menarche_df_2year = menarche_df.loc[menarche_df['eventname'] == '2_year_follow_up_y_arm_1']

## Exclude subjects with answers 777 or 999
# Only rows answered '1' or '4' are kept; every other value is dropped.
menarche_df_2year = menarche_df_2year.loc[menarche_df_2year['pds_f5_y'].isin(['1', '4'])]

menarche_df_2year.to_csv('..\\processedData\\relevantMenarcheData2year.csv', index = False)