## SQL Implementation of our database

# Import

In [3]:
import duckdb
from pathlib import Path

# Query

In [5]:
# Locate the CSV files relative to the project root, which is taken to be the
# parent of the current working directory.
cwd = Path().cwd()
project = cwd.parent
train = Path('data/train_data.csv')
test = Path('data/test_data_competition.csv')

# NOTE: despite the `_df` suffix, these two names are filesystem paths, not
# DataFrames; the DuckDB tables created below reuse the same names.
train_df = project / train
test_df = project / test

# Fail early and report exactly which file(s) could not be found.
missing_files = [str(path) for path in (train_df, test_df) if not path.exists()]
if missing_files:
    raise FileNotFoundError(f"Data files not found: {', '.join(missing_files)}")

con = duckdb.connect(':memory:')

# Load each CSV into its own table. The path is embedded as a SQL string
# literal, so any single quote in it is doubled to keep the statement valid
# (e.g. a project directory named "o'brien").
for table_name, csv_path in (("train_df", train_df), ("test_df", test_df)):
    csv_literal = str(csv_path).replace("'", "''")
    con.execute(
        f"CREATE TABLE {table_name} AS SELECT * FROM read_csv_auto('{csv_literal}')"
    )

# Preview the data
print("Training data:")
print(con.execute("SELECT * FROM train_df LIMIT 5").df())
print("\nTest data:")
print(con.execute("SELECT * FROM test_df LIMIT 5").df())

Training data:
                                           Person ID  Gender  Age  \
0  d29d53701d3c859e29e1b90028eec1ca8e2f29439198b6...    Male   62   
1  71a1c003a2b855d85582c8f6c7648c49d3fe836408a7e1...  Female   35   
2  86e50149658661312a9e0b35558d84f6c6d3da797f552a...  Female   50   
3  093434a3ee9e0a010bb2c2aae06c2614dd24894062a1ca...    Male   38   
4  e3d6c4d4599e00882384ca981ee287ed961fa5f3828e2a...  Female   34   

      Occupation  Sleep Duration (hours)  Quality of Sleep (scale: 1-10)  \
0        Student                5.802690                        9.684249   
1   Manual Labor                7.928079                        4.589872   
2  Office Worker                4.425595                        6.809697   
3  Office Worker                9.031985                        4.682931   
4        Student                8.719110                        6.729776   

   Physical Activity Level (minutes/day)  Stress Level (scale: 1-10)  \
0                              26.361925 

In [6]:
# Filter by Sleep Disorder (returns as pandas DataFrames).
# A single query template is reused for all three labels; the label is bound as
# a prepared-statement parameter instead of being pasted into the SQL text.
DISORDER_QUERY = 'SELECT * FROM train_df WHERE "Sleep Disorder" = ?'

no_dis = con.execute(DISORDER_QUERY, ['No Disorder']).df()
insomnia = con.execute(DISORDER_QUERY, ['Insomnia']).df()
apnea = con.execute(DISORDER_QUERY, ['Sleep Apnea']).df()

In [7]:
# Structural summary of the "No Disorder" subset. DataFrame.info() writes its
# report to stdout itself and returns None, so it must not be wrapped in
# print() (that would emit a stray "None" line after the report).
no_dis.info()
# Descriptive statistics for every column, numeric and non-numeric alike.
print(no_dis.describe(include='all'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 13 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Person ID                              70 non-null     object 
 1   Gender                                 70 non-null     object 
 2   Age                                    70 non-null     int64  
 3   Occupation                             70 non-null     object 
 4   Sleep Duration (hours)                 70 non-null     float64
 5   Quality of Sleep (scale: 1-10)         70 non-null     float64
 6   Physical Activity Level (minutes/day)  70 non-null     float64
 7   Stress Level (scale: 1-10)             70 non-null     float64
 8   BMI Category                           70 non-null     object 
 9   Blood Pressure (systolic/diastolic)    70 non-null     object 
 10  Heart Rate (bpm)                       70 non-null     float64
 11  Daily St