In [14]:
import pandas as pd
import json
from collections import Counter

# CSV tables to combine and the data directory (2017_2020_NHANES) that holds them
filenames = ['P_ALQ.csv', 'P_DPQ.csv', 'P_SLQ.csv', 'P_SMQ.csv', 'P_DEMO.csv']
filedirectory = '../../data/2017_2020_NHANES/'

# Start from the first table, then join every remaining table onto it by SEQN.
# pd.merge defaults to how='inner', so only SEQN values present in every file survive.
firstFilename, *otherFilenames = filenames
df = pd.read_csv(filedirectory + firstFilename)
for filename in otherFilenames:
    df = df.merge(pd.read_csv(filedirectory + filename), on='SEQN')

# Columns the study relies on; a row must have a value in every one of them
studyColumns = [
    'SLD012',    # Sleep hours - weekdays or workdays
    'SLD013',    # Sleep hours - weekends
    'ALQ121',    # Past 12 mo how often drink alcoholic beverages
    'DPQ020',    # Feeling down, depressed, or hopeless
    'DPQ030',    # Trouble sleeping or sleeping too much
    'DPQ060',    # Feeling bad about yourself
    'SMQ040',    # Do you now smoke cigarettes?
    'RIDAGEYR',  # Age in years at screening; individuals 80 and over are topcoded at 80
]
# NOTE(review): dropna only removes NaN. If these questionnaire columns also use
# numeric "refused"/"don't know" codes, those rows are kept - confirm against the codebook.
df = df.dropna(subset=studyColumns)

print("Data size: ",df.shape)

df.head()

Data size:  (3266, 73)


Unnamed: 0,SEQN,ALQ111,ALQ121,ALQ130,ALQ142,ALQ270,ALQ280,ALQ290,ALQ151,ALQ170,...,FIAINTRP,MIALANG,MIAPROXY,MIAINTRP,AIALANGA,WTINTPRP,WTMECPRP,SDMVPSU,SDMVSTRA,INDFMPIR
1,109271.0,1.0,0.0,,,,,,1.0,,...,2.0,1.0,2.0,2.0,1.0,8481.589837,8658.732873,1.0,167.0,
2,109273.0,1.0,0.0,,,,,,2.0,,...,2.0,1.0,2.0,2.0,1.0,20171.847767,22163.59685,1.0,155.0,0.83
4,109282.0,1.0,0.0,,,,,,2.0,,...,2.0,1.0,2.0,2.0,,28363.015286,31816.3465,2.0,164.0,3.61
17,109307.0,1.0,9.0,1.0,0.0,,,,2.0,0.0,...,2.0,1.0,2.0,1.0,3.0,15248.7439,16989.215538,1.0,160.0,1.38
22,109317.0,1.0,3.0,3.0,4.0,5.0,6.0,8.0,2.0,10.0,...,,1.0,2.0,2.0,1.0,13686.835274,14593.735207,2.0,154.0,


In [16]:
# Persist the merged, filtered frame as CSV; index=False leaves the pandas row index out of the file
mergedCsvPath = '../../data/2017_2020_NHANES/merged_data.csv'
df.to_csv(mergedCsvPath, index=False)

# count how many participants fall into each average nightly sleep duration
# Avg hour = (5*SLD012 + 2*SLD013) / 7, i.e. 5 weekday/workday nights and 2 weekend nights, rounded to the nearest half hour
def round_to_nearest_half(x):
    """Snap ``x`` to the closest multiple of 0.5.

    The value is doubled, rounded to an integer with the built-in ``round``
    (so an exact tie goes to the even integer), and then halved again.
    """
    half_steps = round(x * 2)
    return half_steps / 2

# Weighted weekly mean (5 x SLD012 + 2 x SLD013, over 7 nights), snapped to half-hour steps
weeklyMeanSleep = (5*df['SLD012'] + 2*df['SLD013']) / 7
df['AvgSleep'] = weeklyMeanSleep.apply(round_to_nearest_half)

# Histogram of the rounded durations, ordered by hour, stored as {'records': [...]}
avgSleepMap = dict(sorted(Counter(df['AvgSleep']).items()))
totalCnt = sum(avgSleepMap.values())

# One record per distinct duration. 'percent' holds the fraction count/total
# (it is not multiplied by 100).
avgSleepList = []
for sleepHour, participantCnt in avgSleepMap.items():
    avgSleepList.append({'hour': sleepHour, 'count': participantCnt, 'percent': participantCnt/totalCnt})

dataDict = {'records': avgSleepList}
with open('../../data/2017_2020_NHANES/avgSleepMap.json', 'w') as f:
    json.dump(dataDict, f)
    

3266


In [12]:
# Mean of the rounded average sleep ('AvgSleep') per age group, exported as JSON.
# Requires the 'AvgSleep' column, which is added to df by the cell that computes
# the weekly average sleep duration - run that cell first.
ageGroups = [10, 20, 30, 40, 50, 60, 70, 80]
ageGroupAvgSleep = {}

# Original author's note: the minimum age is 18 because of privacy reasons.
lastGroupIdx = len(ageGroups) - 2
for i in range(len(ageGroups) - 1):
    lower, upper = ageGroups[i], ageGroups[i+1]
    ages = df['RIDAGEYR']
    if i == lastGroupIdx:
        # RIDAGEYR is topcoded at 80, so the final group must include its upper
        # edge; with a strict '<' every participant recorded as 80 would be
        # left out of all groups.
        inGroup = (ages >= lower) & (ages <= upper)
    else:
        inGroup = (ages >= lower) & (ages < upper)

    groupMean = df.loc[inGroup, 'AvgSleep'].mean()
    # The mean of an empty group is NaN, which json.dump would emit as the
    # non-standard token NaN; store None (JSON null) instead.
    ageGroupAvgSleep[str(lower) + '-' + str(upper)] = None if pd.isna(groupMean) else float(groupMean)

with open('../../data/2017_2020_NHANES/ageGroupAvgSleep.json', 'w') as f:
    json.dump(ageGroupAvgSleep, f)

In [13]:
# Export one JSON record per participant with the sleep-related fields.

# Output field name -> source column in df. Order here is the key order in each record.
# 'alcholFreq' is spelled exactly as in the existing export; renaming it would
# change the JSON schema.
sleepFieldColumns = {
    'id': 'SEQN',
    'sleepWeekdays': 'SLD012',
    'sleepWeekends': 'SLD013',
    'avgSleep': 'AvgSleep',
    'alcholFreq': 'ALQ121',
    'depressed': 'DPQ020',
    'troubleSleeping': 'DPQ030',
    'feelingBad': 'DPQ060',
    'smoking': 'SMQ040',
    'age': 'RIDAGEYR',
}

# Values are read from each iterrows() row, so they are exported as-is.
personSleepInfo = [
    {field: row[column] for field, column in sleepFieldColumns.items()}
    for _, row in df.iterrows()
]

with open('../../data/2017_2020_NHANES/personSleepInfo.json', 'w') as f:
    json.dump(personSleepInfo, f)