In [None]:
import os
import pandas as pd
import time
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

# Load every SAS transport (.XPT) file in the data directory and outer-join them
# on the respondent identifier column 'SEQN' into a single wide frame.
data_dir_1 = '/workspaces/Final-Year-Project/Data/2023/'

# os.listdir returns entries in arbitrary order. Sorting makes the merge order --
# and therefore which file's copy of a clashing column keeps the unsuffixed name --
# reproducible from one run/machine to the next.
file_list = sorted(os.listdir(data_dir_1))
sas_files = [file for file in file_list if file.endswith(('.XPT', '.xpt'))]

if not sas_files:
    # Fail with an explicit message instead of an IndexError on sas_files[0].
    raise FileNotFoundError(f"No .XPT/.xpt files found in {data_dir_1}")

merged_df = None
for file_name in sas_files:
    file_path = os.path.join(data_dir_1, file_name)
    df = pd.read_sas(file_path, format='xport', encoding='iso-8859-1')

    # Every file, including the first one, must carry the join key.
    if 'SEQN' not in df.columns:
        del df
        continue  # Skip files missing SEQN

    if merged_df is None:
        # First usable file becomes the base of the merge.
        merged_df = df
    else:
        # Columns already present keep their name; the incoming duplicate is
        # suffixed with the file it came from.
        merged_df = pd.merge(merged_df, df, on='SEQN', how='outer', suffixes=('', f'_{file_name}'))
    del df  # Free memory

if merged_df is None:
    raise ValueError(f"None of the .XPT files in {data_dir_1} contains a 'SEQN' column")

# Keep only float64 columns for the downstream numeric processing.
merged_df = merged_df.select_dtypes(include=['float64'])

In [None]:
# Keep only rows that have at least one of the two lab values used for labelling;
# rows where both LBXGH and LBXGLU are missing cannot be classified.
has_any_lab_value = merged_df[['LBXGH', 'LBXGLU']].notna().any(axis=1)
merged_df_cleaned = merged_df.loc[has_any_lab_value].copy()

def categorize_diabetes(row, hba1c_threshold=5.7, glucose_threshold=100):
    """Label a record as 1 (Diabetes / At Risk) or 0 (No Diabetes).

    A record is flagged when either lab value is present and reaches its
    cut-off. A missing value (NaN/None) never triggers the flag on its own.

    Args:
        row: Mapping-like record (e.g. a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) with 'LBXGH' and 'LBXGLU' entries.
        hba1c_threshold: Inclusive cut-off applied to 'LBXGH'. Defaults to 5.7,
            the value that was previously hard-coded.
        glucose_threshold: Inclusive cut-off applied to 'LBXGLU'. Defaults to
            100, the value that was previously hard-coded.

    Returns:
        int: 1 if either measurement is at or above its cut-off, otherwise 0.
    """
    hba1c = row['LBXGH']
    # The null check also protects the comparison from a None value.
    if pd.notnull(hba1c) and hba1c >= hba1c_threshold:
        return 1  # Diabetes / At Risk

    # Only looked up when the first measurement did not already decide the label.
    glucose = row['LBXGLU']
    if pd.notnull(glucose) and glucose >= glucose_threshold:
        return 1  # Diabetes / At Risk

    return 0  # No Diabetes

# Derive the binary target from the lab values, one row at a time.
merged_df_cleaned['Diabetes Status'] = merged_df_cleaned.apply(categorize_diabetes, axis=1)

# Remove every column that encodes the measurements the label was derived from,
# otherwise the target leaks into the features. LBDGLUSI is the same fasting
# glucose result as LBXGLU expressed in mmol/L (per the NHANES fasting glucose
# codebook), so it has to go as well. errors='ignore' keeps this working for
# data releases that do not ship the SI-unit column.
label_source_columns = ['LBXGH', 'LBXGLU', 'LBDGLUSI']
merged_df_cleaned = merged_df_cleaned.drop(columns=label_source_columns, errors='ignore')

# Independent working copy for the cleaning steps that follow.
df = merged_df_cleaned.copy()

In [None]:
# Share of missing values per column, expressed as a percentage.
missing_percentage = df.isna().mean().mul(100)

# Columns that are more than 60% empty are discarded.
too_sparse = missing_percentage > 60
columns_to_drop = missing_percentage.loc[too_sparse].index
df = df.drop(columns=columns_to_drop)

# A row survives only if it has non-null values in at least 80% of the
# remaining columns.
min_non_null = len(df.columns) * 0.8
df = df.dropna(thresh=min_non_null)

# Drop any row that has no label.
df = df.dropna(subset=['Diabetes Status'])

In [None]:
# Split the frame by label.
status_labels = df['Diabetes Status']
no_diabetes = df.loc[status_labels == 0]
diabetes = df.loc[status_labels == 1]

# Both classes are cut down to the size of the smaller one.
minority_class_size = min(no_diabetes.shape[0], diabetes.shape[0])

# Identical sampling settings for both classes: without replacement, fixed seed.
downsample_options = dict(replace=False, n_samples=minority_class_size, random_state=42)
no_diabetes_downsampled = resample(no_diabetes, **downsample_options)
diabetes_downsampled = resample(diabetes, **downsample_options)

# Stack the two classes, shuffle the rows with a fixed seed, and renumber the index.
df_balanced = (
    pd.concat([no_diabetes_downsampled, diabetes_downsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Pairwise correlation matrix of the balanced data.
df_corr = df_balanced.corr()

# Strength (absolute value) of each column's correlation with the label.
abs_target_corr = df_corr['Diabetes Status'].abs()

# Keep columns whose absolute correlation with the label exceeds 0.15.
df_top_filtered = abs_target_corr[abs_target_corr > 0.15]

columns_to_keep = df_top_filtered.index
df_filtered = df_balanced.loc[:, columns_to_keep]

In [None]:
# Hold out 10% of the filtered data as a validation set; the seed keeps the
# split reproducible.
df_train_test, df_validate = train_test_split(df_filtered, test_size=0.1, random_state=42)

# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before writing (no-op when it is already there).
cleaned_data_dir = '/workspaces/Final-Year-Project/Cleaned Data'
os.makedirs(cleaned_data_dir, exist_ok=True)

# Persist the full filtered set and both partitions, without the index column.
df_filtered.to_csv(os.path.join(cleaned_data_dir, 'DataTopCorrelationData.csv'), index=False)
df_validate.to_csv(os.path.join(cleaned_data_dir, 'ValidationData.csv'), index=False)
df_train_test.to_csv(os.path.join(cleaned_data_dir, 'TrainTestData.csv'), index=False)

# Summary of the exported data: class counts and column overview.
print("Final Diabetes Status Distribution:\n", df_filtered['Diabetes Status'].value_counts())
print("Final Dataset Info:\n")
df_filtered.info()

Final Diabetes Status Distribution:
 Diabetes Status
0    2357
1    2357
Name: count, dtype: int64
Final Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4714 entries, 0 to 4713
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   BPQ020           4714 non-null   float64
 1   BPQ080           4714 non-null   float64
 2   BPQ101D          4714 non-null   float64
 3   RHQ031           2585 non-null   float64
 4   RHD280           2577 non-null   float64
 5   RIDAGEYR         4714 non-null   float64
 6   HUQ010           4714 non-null   float64
 7   FNQ440           4714 non-null   float64
 8   LBDGLUSI         2585 non-null   float64
 9   OCD150           4714 non-null   float64
 10  RXQ033           4714 non-null   float64
 11  RXQ050           3298 non-null   float64
 12  Diabetes Status  4714 non-null   int64  
dtypes: float64(12), int64(1)
memory usage: 478.9 KB
