## SQL Implementation of our database

# Import

In [2]:
import duckdb
from pathlib import Path

# Query

In [19]:
# Resolve the CSV locations relative to the project root, which is taken to be
# the parent of the current working directory.
# NOTE(review): this assumes the kernel is started from a sub-folder of the
# project (e.g. notebooks/) - confirm.
cwd = Path.cwd()
project = cwd.parent
train = Path('data/train_data.csv')
test = Path('data/test_data_competition.csv')

# NOTE: despite the `_df` suffix these are filesystem paths, not DataFrames.
train_df = project / train
test_df = project / test

# Fail fast, naming every missing file so the problem can be fixed in one go
# instead of guessing which path (or which working directory) was wrong.
missing = [str(path) for path in (train_df, test_df) if not path.exists()]
if missing:
    raise FileNotFoundError("Data files not found: " + ", ".join(missing))

# Open an in-memory DuckDB database for this session.
con = duckdb.connect(':memory:')

# (table name, CSV path) pairs, loaded in this order: train first, then test.
csv_sources = (('train', train_df), ('test', test_df))

# The table name is formatted into the statement; the CSV path is passed as a
# bound parameter to read_csv_auto.
load_sql = """
    CREATE TABLE IF NOT EXISTS {table} AS
    SELECT * FROM read_csv_auto(?)
"""

# Create and populate one table per CSV file.
for table_name, csv_path in csv_sources:
    con.execute(load_sql.format(table=table_name), [str(csv_path)])

# Print a heading followed by the first five rows of each loaded table.
for heading, table_name in (("Training data:", "train"), ("\nTest data:", "test")):
    print(heading)
    print(con.execute(f"SELECT * FROM {table_name} LIMIT 5").df())

Training data:
                                           Person ID  Gender  Age  \
0  d29d53701d3c859e29e1b90028eec1ca8e2f29439198b6...    Male   62   
1  71a1c003a2b855d85582c8f6c7648c49d3fe836408a7e1...  Female   35   
2  86e50149658661312a9e0b35558d84f6c6d3da797f552a...  Female   50   
3  093434a3ee9e0a010bb2c2aae06c2614dd24894062a1ca...    Male   38   
4  e3d6c4d4599e00882384ca981ee287ed961fa5f3828e2a...  Female   34   

      Occupation  Sleep Duration (hours)  Quality of Sleep (scale: 1-10)  \
0        Student                5.802690                        9.684249   
1   Manual Labor                7.928079                        4.589872   
2  Office Worker                4.425595                        6.809697   
3  Office Worker                9.031985                        4.682931   
4        Student                8.719110                        6.729776   

   Physical Activity Level (minutes/day)  Stress Level (scale: 1-10)  \
0                              26.361925 

In [17]:
# Create new tables inside the same DB
con.execute("""
    CREATE TABLE IF NOT EXISTS no_dis AS
    SELECT *
    FROM train
    WHERE "Sleep Disorder" = 'No Disorder';
""")

con.execute("""
    CREATE TABLE IF NOT EXISTS insomnia AS
    SELECT *
    FROM train
    WHERE "Sleep Disorder" = 'Insomnia';
""")

con.execute("""
    CREATE TABLE IF NOT EXISTS apnea AS
    SELECT *
    FROM train
    WHERE "Sleep Disorder" = 'Sleep Apnea';
""")

<_duckdb.DuckDBPyConnection at 0x275d8340cf0>

In [13]:

# Print the first five rows of the `no_dis` table under its heading.
print("No Disorder:")
no_dis_head = con.execute("SELECT * FROM no_dis LIMIT 5").df()
print(no_dis_head)

No Disorder:
                                           Person ID  Gender  Age  \
0  d29d53701d3c859e29e1b90028eec1ca8e2f29439198b6...    Male   62   
1  86e50149658661312a9e0b35558d84f6c6d3da797f552a...  Female   50   
2  093434a3ee9e0a010bb2c2aae06c2614dd24894062a1ca...    Male   38   
3  b4944c6ff08dc6f43da2e9c824669b7d927dd1fa976fad...    Male   18   
4  5426d2ca50f244fb43fe9eafc82da08f33f3b4f8d91408...  Female   25   

      Occupation  Sleep Duration (hours)  Quality of Sleep (scale: 1-10)  \
0        Student                5.802690                        9.684249   
1  Office Worker                4.425595                        6.809697   
2  Office Worker                9.031985                        4.682931   
3   Manual Labor                6.279765                        8.170468   
4        Student                5.016662                        5.829710   

   Physical Activity Level (minutes/day)  Stress Level (scale: 1-10)  \
0                              26.361925   

In [14]:
# Print the first five rows of the `insomnia` table under its heading.
print("\nInsomnia:")
insomnia_head = con.execute("SELECT * FROM insomnia LIMIT 5").df()
print(insomnia_head)


Insomnia:
                                           Person ID  Gender  Age  \
0  71a1c003a2b855d85582c8f6c7648c49d3fe836408a7e1...  Female   35   
1  e3d6c4d4599e00882384ca981ee287ed961fa5f3828e2a...  Female   34   
2  44cb730c420480a0477b505ae68af508fb90f96cf0ec54...  Female   42   
3  2c4cf657337835125bc4258d0e2e546af4185bdb70f64e...  Female   43   
4  6ea2fdb3399f4d2e806beb01e9a3371bd622bed6a409ac...    Male   61   

      Occupation  Sleep Duration (hours)  Quality of Sleep (scale: 1-10)  \
0   Manual Labor                7.928079                        4.589872   
1        Student                8.719110                        6.729776   
2  Office Worker                9.639597                        8.597408   
3  Office Worker                4.500696                        3.168051   
4  Office Worker                8.966661                        6.830468   

   Physical Activity Level (minutes/day)  Stress Level (scale: 1-10)  \
0                             111.302978     

In [15]:
# Print the first five rows of the `apnea` table under its heading.
print("\nSleep Apnea:")
apnea_head = con.execute("SELECT * FROM apnea LIMIT 5").df()
print(apnea_head)


Sleep Apnea:
                                           Person ID  Gender  Age  \
0  efd96aedf377e20afd95285a7c751a864260bd6a149656...    Male   22   
1  3c1b7053f0edd447b778edbc0ad8359b0fa892d69857d9...  Female   36   
2  6208ef0f7750c111548cf90b6ea1d0d0a66f6bff40dbef...    Male   90   
3  9d693eeee1d1899cbc50b6d45df953d3835acf28ee8698...  Female   27   
4  9400f1b21cb527d7fa3d3eabba93557a18ebe7a2ca4e47...  Female   26   

      Occupation  Sleep Duration (hours)  Quality of Sleep (scale: 1-10)  \
0        Retired                4.365188                        6.339010   
1        Student                4.361516                        8.170927   
2        Student               10.544222                        5.072626   
3  Office Worker               10.910968                        4.367647   
4   Manual Labor                9.353351                        6.690117   

   Physical Activity Level (minutes/day)  Stress Level (scale: 1-10)  \
0                              34.339509  