In [1]:
# all imports

import pandas as pd
import numpy as np
import re

# Parsing SAS File

In [2]:
def handle_special_cases(value):
    """Strip a leading ``$`` or ``Z`` marker from a metadata value.

    Rules, applied in order:

    * a leading ``$`` is removed;
    * a leading ``Z`` is removed when it is immediately followed by a digit
      (e.g. ``"Z5."`` becomes ``"5."``);
    * any other value is returned unchanged.

    Args:
        value: Text that follows the variable name on a metadata line.

    Returns:
        ``value`` without its leading marker, or ``value`` itself when no
        marker is present.
    """
    if value.startswith("$"):
        return value[1:]
    # Require a digit after "Z": a bare "Z" or a word such as "ZIP" is not a
    # width specification and is passed through instead of raising on int().
    if value.startswith("Z") and value[1:2].isdigit():
        return value[1:]
    return value

def parse_sas(filepath, skip_lines=5):
    """Extract per-variable metadata from a SAS program file.

    The file is read after discarding its first ``skip_lines`` lines and is
    then split into sections on ``;``. A section whose first line is a single
    word is named by that word; the sections ``LENGTH``, ``FORMAT``, ``INPUT``
    and ``LABEL`` contribute metadata. Every later line of a section that has
    at least two whitespace-separated tokens is read as
    ``<variable> <value...>``.

    Args:
        filepath: Path of the SAS program text file.
        skip_lines: Number of leading lines to discard before parsing.
            Defaults to 5.

    Returns:
        dict mapping variable name to a dict that may contain the keys
        ``"length"``, ``"format"``, ``"input_start"``, ``"input_end"`` and
        ``"label"``. A variable seen only in an unrecognised section is still
        present, with an empty dict.

    Raises:
        ValueError: If the last token of an ``INPUT`` line is not an integer
            or an ``int-int`` range.
    """
    section_pattern = r'^\s*(\w+)\s*$'
    variable_metadata = {}

    with open(filepath, "r") as file:
        # next() with a default so a file shorter than the header does not
        # raise StopIteration.
        for _ in range(skip_lines):
            next(file, None)

        content = file.read()

    for section in re.split(r'\s*;\s*', content):
        lines = section.strip().split('\n')

        # Only a single-word first line names the section.
        match = re.match(section_pattern, lines[0])
        current_section = match.group(1) if match else None

        for line in lines[1:]:
            line = line.strip()

            if not line or line == "***":
                continue

            parts = line.split()
            if len(parts) < 2:
                continue

            var_name = parts[0]
            var_value = handle_special_cases(" ".join(parts[1:]))

            # The entry is created for every section, recognised or not.
            entry = variable_metadata.setdefault(var_name, {})

            if current_section == "LENGTH":
                entry["length"] = var_value
            elif current_section == "FORMAT":
                entry["format"] = var_value
            elif current_section == "INPUT":
                # The column range is always the last token on the line.
                input_range = parts[-1]
                if "-" in input_range:
                    input_start, input_end = map(int, input_range.split("-"))
                else:
                    input_start = input_end = int(input_range)

                entry["input_start"] = input_start
                entry["input_end"] = input_end
            elif current_section == "LABEL":
                # Split on the first "=" only, so a label that itself contains
                # "=" is kept whole; with no "=" at all, use the full value.
                _, separator, label_text = var_value.partition('=')
                if not separator:
                    label_text = var_value
                entry["label"] = label_text.strip().strip('"')

    return variable_metadata

In [3]:
# Parse the SAS program of each dataset into a per-variable metadata dict.
adult_metadata = parse_sas("/kaggle/input/nhanes3-raw/adult.sas.txt")
youth_metadata = parse_sas("/kaggle/input/nhanes3-raw/youth.sas.txt")
lab_metadata = parse_sas("/kaggle/input/nhanes3-raw/lab.sas.txt")
exam_metadata = parse_sas("/kaggle/input/nhanes3-raw/exam.sas.txt")

# Tabulate each dict: one row per variable, one column per metadata field.
adult_metadata_df = pd.DataFrame.from_dict(adult_metadata, orient="index")
youth_metadata_df = pd.DataFrame.from_dict(youth_metadata, orient="index")
lab_metadata_df = pd.DataFrame.from_dict(lab_metadata, orient="index")
exam_metadata_df = pd.DataFrame.from_dict(exam_metadata, orient="index")

In [4]:
# Write each metadata table to a CSV file in the working directory.
for metadata_frame, metadata_csv in (
    (adult_metadata_df, 'adult_metadata.csv'),
    (youth_metadata_df, 'youth_metadata.csv'),
    (lab_metadata_df, 'lab_metadata.csv'),
    (exam_metadata_df, 'exam_metadata.csv'),
):
    metadata_frame.to_csv(metadata_csv)

# Parsing DAT File

In [5]:
def parse_dat(df, filepath):
    """Read a fixed-width data file using column positions from ``df``.

    Args:
        df: Metadata frame with one row per variable, indexed by variable
            name, holding ``input_start`` and ``input_end`` columns. The
            positions are treated as 1-based and inclusive. The frame is not
            modified.
        filepath: Path of the fixed-width data file.

    Returns:
        DataFrame with one column per row of ``df``, named after ``df``'s
        index, in the same order.

    Raises:
        ValueError: If a start or end position is missing (NaN) or cannot be
            converted to an integer.
    """
    starts = df['input_start'].astype(int)
    ends = df['input_end'].astype(int)

    # read_fwf takes half-open, 0-based [from, to) intervals: a 1-based
    # inclusive range start..end becomes (start - 1, end). Explicit colspecs
    # stay aligned even when the layout leaves gaps between fields, which a
    # plain list of widths cannot express.
    colspecs = list(zip((starts - 1).tolist(), ends.tolist()))

    df_data = pd.read_fwf(filepath, header=None, colspecs=colspecs)
    df_data.columns = df.index.tolist()
    return df_data

In [6]:
# Read each fixed-width data file using the layout from its metadata table.
adult_data_df = parse_dat(
    df=adult_metadata_df,
    filepath="/kaggle/input/nhanes3-raw/adult.dat",
)
youth_data_df = parse_dat(
    df=youth_metadata_df,
    filepath="/kaggle/input/nhanes3-raw/youth.dat",
)
lab_data_df = parse_dat(
    df=lab_metadata_df,
    filepath="/kaggle/input/nhanes3-raw/lab.dat",
)
exam_data_df = parse_dat(
    df=exam_metadata_df,
    filepath="/kaggle/input/nhanes3-raw/exam.dat",
)

In [7]:
# Write each parsed data table to a CSV file in the working directory.
for data_frame, data_csv in (
    (adult_data_df, 'adult_data.csv'),
    (youth_data_df, 'youth_data.csv'),
    (lab_data_df, 'lab_data.csv'),
    (exam_data_df, 'exam_data.csv'),
):
    data_frame.to_csv(data_csv)