### Loads the separate XPT files from the raw data folder, merges them on SEQN, and writes one merged CSV file per target.
### Only needs to be run once; the other algorithms use the CSV files.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import os

In [3]:
data_folder = 'raw_data'
# Each entry is (feature sub-folder, target XPT file). Both are written
# relative to ``data_folder`` with a leading slash, which is stripped before
# the paths are joined.
raw_data_paths = [
    ('/depression', '/targets/DPQ_L_Target_Depression.xpt'),
    ('/insomnia', '/targets/SLQ_L_Target_Insomnia.xpt'),
]


def merge_feature_files(targets, feature_dir):
    """Left-join every XPT table found in ``feature_dir`` onto ``targets``.

    Tables are joined on the ``SEQN`` column, keeping every row of
    ``targets``. A file without a ``SEQN`` column is reported on stdout and
    skipped.

    Args:
        targets: DataFrame containing a ``SEQN`` column.
        feature_dir: Directory whose entries are all expected to be XPT files.

    Returns:
        The merged DataFrame (``targets`` itself if nothing was merged).

    Raises:
        ValueError: If an entry in ``feature_dir`` does not end in ``.xpt``
            (compared case-insensitively).
    """
    merged = targets
    # os.listdir() returns entries in arbitrary order; sorting keeps the merge
    # order, and therefore the CSV column order, reproducible between runs.
    for file_name in sorted(os.listdir(feature_dir)):
        # Compare case-insensitively so files saved as ``.XPT`` are accepted.
        if not file_name.lower().endswith('.xpt'):
            raise ValueError("Unsupported file format")

        file_path = os.path.join(feature_dir, file_name)
        features = pd.read_sas(file_path, format='xport')

        if 'SEQN' not in features.columns:
            print("Missing SEQN column in file: " + file_path)
            continue

        merged = pd.merge(merged, features, how="left", on="SEQN")

    return merged


def round_float_columns(frame):
    """Round every float64 column of ``frame`` and store it as nullable Int64.

    The frame is modified in place and also returned. Missing values stay
    missing (``<NA>``); columns of any other dtype are left untouched.
    """
    float_cols = frame.select_dtypes(include=['float64']).columns
    for col in float_cols:
        # "Int64" (capital I) is pandas' nullable integer dtype, so NaNs
        # introduced by the left joins survive the conversion.
        frame[col] = frame[col].round().astype("Int64")
    return frame


# In a notebook or when run as a script ``__name__`` is "__main__", so the
# conversion still runs; the guard only keeps an ``import`` of this file from
# touching the disk. The loop stays at module scope so its variables remain
# visible to later cells.
if __name__ == "__main__":
    os.makedirs('processed_data', exist_ok=True)

    for feature_subdir, target_file in raw_data_paths:
        dataset_name = feature_subdir.strip('/')
        target_path = os.path.join(data_folder, *target_file.strip('/').split('/'))

        data = pd.read_sas(target_path, format='xport')

        # Remove rows with any missing values in the targets
        data = data.dropna()

        data = merge_feature_files(data, os.path.join(data_folder, dataset_name))
        data = round_float_columns(data)

        data.to_csv(
            os.path.join('processed_data', dataset_name + '_data.csv'),
            index=False,
        )