In [32]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Define data directory
data_dir_1 = '/workspaces/Final-Year-Project/Data/2023/'

# List all files in the directory
file_list = os.listdir(data_dir_1)

# Filter out non-SAS files if needed (only include .XPT and .xpt files)
sas_files = [file for file in file_list if file.endswith('.XPT') or file.endswith('.xpt')]

# Create a dictionary to hold the dataframes
data_frames = {}

# Loop through each file and read them
for file_name in sas_files:
    file_path = os.path.join(data_dir_1, file_name)
    
    # Read the SAS file
    df = pd.read_sas(file_path, format='xport', encoding='iso-8859-1')
    
    # Add the dataframe to the dictionary with the file name as the key
    data_frames[file_name] = df

# Merge all dataframes on the 'SEQN' (common identifier)
merged_df = data_frames[sas_files[0]]  # Start with the first dataset

for file_name in sas_files[1:]:  # Skip the first file as it's already merged
    merged_df = pd.merge(merged_df, data_frames[file_name], on='SEQN', how='outer')  # Merge based on SEQN

# Display info about the merged dataset
merged_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11933 entries, 0 to 11932
Columns: 302 entries, SEQN to RHQ332
dtypes: float64(294), object(8)
memory usage: 27.5+ MB


In [33]:
# Keep only float columns
df_float = merged_df.select_dtypes(include=['float64'])

merged_df_cleaned = df_float
merged_df_cleaned.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11933 entries, 0 to 11932
Columns: 294 entries, SEQN to RHQ332
dtypes: float64(294)
memory usage: 26.8 MB


In [34]:
merged_df_cleaned = merged_df.dropna(subset=['LBXGH', 'LBXGLU'], how='all')

def categorize_diabetes(row):
    if (pd.notnull(row['LBXGH']) and row['LBXGH'] >= 6.5) or (pd.notnull(row['LBXGLU']) and row['LBXGLU'] >= 126):
        return 2
    
    elif (pd.notnull(row['LBXGH']) and 5.7 <= row['LBXGH'] < 6.5) or (pd.notnull(row['LBXGLU']) and 100 <= row['LBXGLU'] < 126):
        return 1
    else:
        return 0

merged_df_cleaned['Diabetes Status'] = merged_df_cleaned.apply(categorize_diabetes, axis=1)

merged_df_cleaned = merged_df_cleaned.drop(columns=['LBXGH', 'LBXGLU'])

df = merged_df_cleaned
df.info()



<class 'pandas.core.frame.DataFrame'>
Index: 6717 entries, 0 to 11932
Columns: 301 entries, SEQN to Diabetes Status
dtypes: float64(292), int64(1), object(8)
memory usage: 15.5+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_cleaned['Diabetes Status'] = merged_df_cleaned.apply(categorize_diabetes, axis=1)


In [38]:
# Keep only float columns
df_float = merged_df.select_dtypes(include=['float64'])

df = df_float

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11933 entries, 0 to 11932
Columns: 294 entries, SEQN to RHQ332
dtypes: float64(294)
memory usage: 26.8 MB


In [35]:
df = df.dropna(subset=['Diabetes Status'])

missing_percentage = df.isnull().mean() * 100

columns_to_drop = missing_percentage[missing_percentage > 10].index

df = df.drop(columns=columns_to_drop)

df = df.dropna()

df.info()





<class 'pandas.core.frame.DataFrame'>
Index: 4446 entries, 0 to 11932
Data columns (total 85 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   SEQN             4446 non-null   float64
 1   IMQ011           4446 non-null   float64
 2   OHQ845           4446 non-null   float64
 3   OHQ620           4446 non-null   float64
 4   OCD150           4446 non-null   float64
 5   WHD010           4446 non-null   float64
 6   WHD020           4446 non-null   float64
 7   WHD050           4446 non-null   float64
 8   WHQ070           4446 non-null   float64
 9   HOD051           4446 non-null   float64
 10  MCQ010           4446 non-null   float64
 11  AGQ030           4446 non-null   float64
 12  MCQ053           4446 non-null   float64
 13  HUQ010           4446 non-null   float64
 14  HUQ030           4446 non-null   float64
 15  HUQ055           4446 non-null   float64
 16  HUQ090           4446 non-null   float64
 17  SLQ300           4

In [21]:
df['Diabetes Status'].value_counts()

Diabetes Status
0    2177
1    1708
2     561
Name: count, dtype: int64

In [22]:


plt.figure(figsize=(16, 9))

corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f")

ValueError: could not convert string to float: '21:30'

<Figure size 1600x900 with 0 Axes>