In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import numpy as np
import math
import torch.optim as optim

In [2]:
# Integer class index assigned to each textual rating level.
d = dict(good=0, neutral=1, bad=2)


def encoding(label):
    """Translate a textual rating into its integer class index.

    Args:
        label: A key of ``d`` ("good", "neutral" or "bad").

    Returns:
        The integer stored in ``d`` for ``label``.

    Raises:
        KeyError: If ``label`` is not a key of ``d``.
    """
    code = d[label]
    return code

In [3]:
class PatientDataset(Dataset):
    """Map-style dataset that pairs each feature entry with its label.

    Indexing returns ``(data[idx], labels[idx])`` unchanged; no conversion or
    windowing is performed here. ``seq_len`` is stored on the instance but is
    not used by any method of this class.
    """

    def __init__(self, df_as_np, labels, seq_len):
        """Keep references to the feature container, labels and sequence length."""
        self.data, self.labels, self.seq_len = df_as_np, labels, seq_len

    def __len__(self):
        """Number of entries along the first axis of the feature container."""
        n_samples = len(self.data)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target

In [4]:
def load_patient_data(df_as_np, labels, seq_len, batch_size=50, seed=None):
    """Wrap the arrays in a ``PatientDataset`` and split it 80/10/10 into loaders.

    Args:
        df_as_np: Feature container; its first axis indexes the samples.
        labels: Label container indexed in parallel with ``df_as_np``.
        seq_len: Forwarded to ``PatientDataset`` (stored there, not used for
            slicing).
        batch_size: Batch size used by all three loaders.
        seed: Optional integer. When given, the random split and the shuffling
            of the training loader are driven by a dedicated
            ``torch.Generator`` seeded with this value, so repeated calls
            produce the same partition. When ``None`` (default) the global
            torch RNG is used, exactly as before.

    Returns:
        ``(trainloader, valloader, testloader)``. Only the training loader
        shuffles its samples.
    """
    dataset = PatientDataset(df_as_np, labels, seq_len)
    n_total = len(dataset)
    train_size = int(0.8 * n_total)
    val_size = int(0.1 * n_total)
    # The test split absorbs the rounding remainder so the sizes sum to n_total.
    test_size = n_total - train_size - val_size

    generator = None
    split_kwargs = {}
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)
        split_kwargs["generator"] = generator
    train_dataset, val_dataset, test_dataset = random_split(
        dataset, [train_size, val_size, test_size], **split_kwargs
    )

    # generator=None is DataLoader's default, so the unseeded path is unchanged.
    trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=generator)
    valloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    testloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return trainloader, valloader, testloader

In [5]:
class RecurrentNetwork(nn.Module):
    """ReLU ``nn.RNN`` followed by a linear classifier over all time steps.

    The RNN output for every time step is flattened and fed to a single
    ``nn.Linear`` layer, so the model only accepts sequences of exactly
    ``seq_length`` steps.

    Args:
        seq_length: Number of time steps in every input sequence.
        hidden_size: Width of each recurrent layer.
        num_layers: Number of stacked recurrent layers.
        input_size: Number of features per time step (default 7, the value
            that was previously hard-coded).
        num_classes: Number of output logits (default 3, the value that was
            previously hard-coded).
    """

    def __init__(self, seq_length, hidden_size, num_layers, input_size=7, num_classes=3):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            nonlinearity='relu',
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            # Flatten turns (batch, seq_length, hidden_size) into
            # (batch, seq_length * hidden_size) before the linear layer.
            nn.Linear(seq_length * hidden_size, num_classes),
        )

    def forward(self, x):
        """Map a ``(batch, seq_length, input_size)`` tensor to ``(batch, num_classes)`` logits."""
        x, _ = self.rnn(x)  # the final hidden state is discarded
        return self.classifier(x)

In [6]:
def train(dataloader, seq_length, lr, epochs, hidden_size=3, num_layers=10):
    """Train a fresh ``RecurrentNetwork`` with Adam and cross-entropy loss.

    Args:
        dataloader: Iterable yielding ``(seq, label)`` batches. ``seq`` is cast
            to float and ``label`` to long before use.
        seq_length: Sequence length passed to ``RecurrentNetwork``.
        lr: Learning rate for Adam (``weight_decay`` is fixed at 0.0001).
        epochs: Number of passes over ``dataloader``.
        hidden_size: Hidden width of the network (default 3, the value that was
            previously hard-coded).
        num_layers: Number of recurrent layers (default 10, the value that was
            previously hard-coded).

    Returns:
        The trained model.

    Side effects:
        Prints ``epoch step loss`` after every optimisation step. ``step`` is a
        running counter that is not reset between epochs.
    """
    model = RecurrentNetwork(seq_length=seq_length, hidden_size=hidden_size, num_layers=num_layers)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.0001)
    model.train()
    criterion = nn.CrossEntropyLoss()
    step = 0  # global batch counter across all epochs
    for epoch in range(epochs):
        for seq, label in dataloader:
            optimizer.zero_grad()
            outputs = model(seq.float())
            loss = criterion(outputs, label.long())
            loss.backward()
            optimizer.step()
            print(epoch, step, loss.item())
            step += 1
    return model

In [7]:
def select_target(df_features_as_np, df_labels_as_np, i, seq_len):
    """Build train/val/test loaders for a single label column.

    Column ``i`` of the 2-D ``df_labels_as_np`` array is used as the target and
    handed, together with the features, to ``load_patient_data`` with a fixed
    batch size of 500.

    Returns:
        ``(train_loader, val_loader, test_loader)``.
    """
    target_column = df_labels_as_np[:, i]
    splits = load_patient_data(df_features_as_np, target_column, seq_len=seq_len, batch_size=500)
    train_split, val_split, test_split = splits
    return train_split, val_split, test_split

In [8]:
def test(model, dataloader):
    for seq, labels in dataloader:
        output = model(seq.float())
        pred_labels = torch.argmax(output, dim=1)
        acc = (pred_labels == labels).float().mean().item()
        print(acc)
        print(pred_labels)
        print("##########################")
        print(labels)
        break

In [9]:
def train_and_test(csv_features, csv_labels, feature_cols_to_drop, label_cols_to_drop, features_range, features_shape, seq_len):
    """Train and evaluate one model per label column of a features/labels CSV pair.

    Args:
        csv_features: Path of the CSV holding one feature row per time step.
        csv_labels: Path of the CSV holding one row of textual ratings per
            sequence.
        feature_cols_to_drop: Columns removed from the feature table before it
            is converted to an array.
        label_cols_to_drop: Columns removed from the label table; every
            remaining column is passed through ``encoding``.
        features_range: Number of leading feature rows to keep, so the table
            can be reshaped to ``features_shape``.
        features_shape: Target shape of the feature array.
        seq_len: Sequence length forwarded to the loaders and the model.

    Side effects:
        Prints the array shapes, the per-step training loss and the test output
        for every label column.
    """
    df_features = pd.read_csv(csv_features)
    df_labels = pd.read_csv(csv_labels)
    df_features = df_features.drop(columns=feature_cols_to_drop)
    df_labels = df_labels.drop(columns=label_cols_to_drop)
    for column in df_labels.columns:
        df_labels[column] = df_labels[column].apply(encoding)
    df_features_as_np = df_features.to_numpy()[:features_range, :]
    df_features_as_np = df_features_as_np.reshape(features_shape)
    df_labels_as_np = df_labels.to_numpy()

    print("pre_train shapes")
    print(df_features_as_np.shape)
    print(df_labels_as_np.shape)

    # Iterate over however many label columns remain instead of assuming six,
    # which would raise IndexError for fewer columns and skip any extra ones.
    for i in range(df_labels_as_np.shape[1]):
        print(f"############### LABEL {i} #################")
        train_loader, val_loader, test_loader = select_target(df_features_as_np, df_labels_as_np, i, seq_len)
        model = train(dataloader=train_loader, lr=0.05, epochs=4, seq_length=seq_len)
        print("################# TESTING ##############################")
        test(model, test_loader)

Youcef's minute data training

In [10]:
# Load the minute-data feature CSV and print it for a quick visual check.
df = pd.read_csv("minute_data_007_youcef/encoded.csv")
print(df)

        Unnamed: 0.2  Unnamed: 0.1  Unnamed: 0                     date  \
0                  0             0       39524  2024-08-20 00:00:34:000   
1                  1             1       39525  2024-08-20 00:01:34:000   
2                  2             2       39526  2024-08-20 00:02:34:000   
3                  3             3       39527  2024-08-20 00:03:34:000   
4                  4             4       39528  2024-08-20 00:04:34:000   
...              ...           ...         ...                      ...   
180967        180967        180967      220491  2025-02-28 02:57:05:000   
180968        180968        180968      220492  2025-02-28 02:58:05:000   
180969        180969        180969      220493  2025-02-28 02:59:05:000   
180970        180970        180970      220494  2025-02-28 03:00:05:000   
180971        180971        180971      220495  2025-02-28 03:01:05:000   

        Accelerometer_X  Accelerometer_Y  Accelerometer_Z  Light_Lux  \
0              0.585266    

In [11]:
# Load the matching label CSV and list its columns (note: this rebinds `df`).
df = pd.read_csv("minute_data_007_youcef/labels.csv")
print(df.columns)

Index(['Unnamed: 0', 'average', 'phq_9', 'cgis', 'gad_7', 'wsas', 'qids'], dtype='object')


In [12]:
# features_range = 3619 * 50 keeps only as many rows as reshape to
# (3619, 50, 7) needs.
train_and_test(
    csv_features="minute_data_007_youcef/encoded.csv",
    csv_labels="minute_data_007_youcef/labels.csv",
    feature_cols_to_drop=['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'date', 'date_only', 'time'],
    label_cols_to_drop=['Unnamed: 0'],
    features_range=180950,
    features_shape=(3619, 50, 7),
    seq_len=50,
)

pre_train shapes
(3619, 50, 7)
(3619, 6)
############### LABEL 0 #################
0 0 0.993659257888794
0 1 2.410212516784668
0 2 0.9701600074768066
0 3 1.0059473514556885
0 4 1.1866576671600342
0 5 1.1545220613479614
1 6 0.9572981595993042
1 7 0.8860386610031128
1 8 0.9355542659759521
1 9 0.9098655581474304
1 10 0.9092230796813965
1 11 0.9424866437911987
2 12 0.9237146377563477
2 13 0.9036576151847839
2 14 0.8924536108970642
2 15 0.88327956199646
2 16 0.8606400489807129
2 17 0.9290419220924377
3 18 0.8801220655441284
3 19 0.9265136122703552
3 20 0.9154288172721863
3 21 0.8806639313697815
3 22 0.8788085579872131
3 23 0.8792601227760315
################# TESTING ##############################
0.6611570119857788
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

2s data, seq len of 10, randomly sampled, months 2-7

In [13]:
# Load the sample_1 (seq len 10) feature CSV and print it for a quick visual check.
df = pd.read_csv("cleaned_parquet/007/sample_1_seq_len_10/sample_features.csv")
print(df)

        Unnamed: 0.1  Unnamed: 0                date_time         A         B  \
0                  0     7919121  2024-08-22 09:13:30:420 -1.720211 -0.154510   
1                  1     7919221  2024-08-22 09:13:32:420 -0.985059  0.213792   
2                  2     7919321  2024-08-22 09:13:34:420 -1.312712  0.336491   
3                  3     7919421  2024-08-22 09:13:36:420 -1.224800  0.336491   
4                  4     7919521  2024-08-22 09:13:38:420 -1.129024  0.737608   
...              ...         ...                      ...       ...       ...   
239995         39995    92838308  2025-02-20 23:56:11:160  0.432339  0.637942   
239996         39996    92838408  2025-02-20 23:56:13:160  0.759266  0.531946   
239997         39997    92838508  2025-02-20 23:56:15:160  0.551981  0.580930   
239998         39998    92838608  2025-02-20 23:56:17:160  0.663672  0.605423   
239999         39999    92838708  2025-02-20 23:56:19:160  0.639824  0.523713   

               C         D 

In [14]:
# Load the sample_1 (seq len 10) label CSV and print it (note: this rebinds `df`).
df = pd.read_csv("cleaned_parquet/007/sample_1_seq_len_10/sample_labels.csv")
print(df)

       Unnamed: 0.1  Unnamed: 0  average    phq_9     cgis    gad_7     wsas  \
0                 0           0      bad  neutral  neutral      bad  neutral   
1                 1           1      bad  neutral  neutral      bad  neutral   
2                 2           2      bad  neutral  neutral      bad  neutral   
3                 3           3      bad  neutral  neutral      bad  neutral   
4                 4           4      bad  neutral  neutral      bad  neutral   
...             ...         ...      ...      ...      ...      ...      ...   
23995          3995        3995  neutral  neutral     good  neutral  neutral   
23996          3996        3996      bad      bad  neutral      bad  neutral   
23997          3997        3997  neutral  neutral  neutral  neutral  neutral   
23998          3998        3998  neutral  neutral     good  neutral  neutral   
23999          3999        3999  neutral  neutral     good  neutral  neutral   

          qids  
0          bad  
1    

In [15]:
# features_range = 24000 * 10 matches the (24000, 10, 7) reshape below.
train_and_test(
    csv_features="cleaned_parquet/007/sample_1_seq_len_10/sample_features.csv",
    csv_labels="cleaned_parquet/007/sample_1_seq_len_10/sample_labels.csv",
    feature_cols_to_drop=['Unnamed: 0.1', 'Unnamed: 0', 'date_time', "date", "E"],
    label_cols_to_drop=['Unnamed: 0.1', 'Unnamed: 0'],
    features_range=240000,
    features_shape=(24000, 10, 7),
    seq_len=10,
)

pre_train shapes
(24000, 10, 7)
(24000, 6)
############### LABEL 0 #################
0 0 1.1223045587539673
0 1 1.0691578388214111
0 2 1.0323941707611084
0 3 0.9960426688194275
0 4 1.0679757595062256
0 5 1.031273603439331
0 6 0.9965734481811523
0 7 1.0624762773513794
0 8 1.0354710817337036
0 9 1.030466079711914
0 10 1.044063925743103
0 11 1.0314885377883911
0 12 0.9984816312789917
0 13 1.012941837310791
0 14 1.0090672969818115
0 15 1.0358754396438599
0 16 1.048802137374878
0 17 1.0368834733963013
0 18 1.040433406829834
0 19 1.0206513404846191
0 20 1.0334970951080322
0 21 1.0479601621627808
0 22 1.0298885107040405
0 23 1.009189248085022
0 24 1.0328153371810913
0 25 1.0138578414916992
0 26 1.0315897464752197
0 27 1.0235002040863037
0 28 1.0156983137130737
0 29 1.0477365255355835
0 30 1.0051816701889038
0 31 1.0177674293518066
0 32 1.029474139213562
0 33 0.9974966049194336
0 34 1.0478599071502686
0 35 1.015384316444397
0 36 1.0542447566986084
0 37 1.0086897611618042
0 38 1.042601346969604

2s data, seq len of 50, randomly sampled, months 2-7

In [16]:
# Load the sample_2 (seq len 50) feature CSV and print it for a quick visual check.
df = pd.read_csv("cleaned_parquet/007/sample_2_seq_len_50/sample_features.csv")
print(df)

         Unnamed: 0.1  Unnamed: 0                date_time         A  \
0                   0     6135382  2024-08-21 23:18:55:640  1.268186   
1                   1     6135432  2024-08-21 23:18:56:640  1.292180   
2                   2     6135482  2024-08-21 23:18:57:640  1.324038   
3                   3     6135532  2024-08-21 23:18:58:640  1.308109   
4                   4     6135582  2024-08-21 23:18:59:640  1.316175   
...               ...         ...                      ...       ...   
1199995        199995    37440911  2025-02-08 04:10:23:220 -0.093925   
1199996        199996    37440961  2025-02-08 04:10:24:220 -0.093925   
1199997        199997    37441011  2025-02-08 04:10:25:220 -0.085975   
1199998        199998    37441061  2025-02-08 04:10:26:220 -0.093925   
1199999        199999    37441111  2025-02-08 04:10:27:220 -0.101874   

                B         C         D    E         F  date_encoded  \
0        1.711662 -0.461736 -0.221601  0.0 -1.041534      0.28571

In [17]:
# Load the sample_2 (seq len 50) label CSV and print it (note: this rebinds `df`).
df = pd.read_csv("cleaned_parquet/007/sample_2_seq_len_50/sample_labels.csv")
print(df)

       Unnamed: 0.1  Unnamed: 0  average    phq_9     cgis    gad_7     wsas  \
0                 0           0      bad  neutral  neutral      bad  neutral   
1                 1           1      bad  neutral  neutral      bad  neutral   
2                 2           2      bad  neutral  neutral      bad  neutral   
3                 3           3      bad  neutral  neutral      bad  neutral   
4                 4           4      bad  neutral  neutral      bad  neutral   
...             ...         ...      ...      ...      ...      ...      ...   
23995          3995        3995  neutral  neutral     good  neutral  neutral   
23996          3996        3996  neutral  neutral     good  neutral  neutral   
23997          3997        3997  neutral  neutral     good  neutral  neutral   
23998          3998        3998      bad      bad  neutral      bad  neutral   
23999          3999        3999     good     good  neutral  neutral  neutral   

          qids  
0          bad  
1    

In [18]:
# features_range = 24000 * 50 matches the (24000, 50, 7) reshape below.
train_and_test(
    csv_features="cleaned_parquet/007/sample_2_seq_len_50/sample_features.csv",
    csv_labels="cleaned_parquet/007/sample_2_seq_len_50/sample_labels.csv",
    feature_cols_to_drop=['Unnamed: 0.1', 'Unnamed: 0', 'date_time', "date", "E"],
    label_cols_to_drop=['Unnamed: 0.1', 'Unnamed: 0'],
    features_range=1200000,
    features_shape=(24000, 50, 7),
    seq_len=50,
)

pre_train shapes
(24000, 50, 7)
(24000, 6)
############### LABEL 0 #################
0 0 1.0584776401519775
0 1 2.032606601715088
0 2 1.0281473398208618
0 3 1.1488077640533447
0 4 1.2677961587905884
0 5 1.2492400407791138
0 6 1.1026818752288818
0 7 1.0508472919464111
0 8 1.0421334505081177
0 9 1.0165932178497314
0 10 1.0133987665176392
0 11 1.0127286911010742
0 12 1.0416202545166016
0 13 1.041266918182373
0 14 1.049554467201233
0 15 1.0297755002975464
0 16 1.0249072313308716
0 17 1.016404390335083
0 18 1.0314388275146484
0 19 1.0010641813278198
0 20 1.0765533447265625
0 21 1.0396772623062134
0 22 0.9913254976272583
0 23 1.0350291728973389
0 24 1.0364025831222534
0 25 1.0289894342422485
0 26 1.0349119901657104
0 27 1.0230776071548462
0 28 1.008071780204773
0 29 1.038006067276001
0 30 1.028470754623413
0 31 1.0226069688796997
0 32 1.0416008234024048
0 33 1.0311223268508911
0 34 1.020322561264038
0 35 1.0269675254821777
0 36 1.036236047744751
0 37 1.0216538906097412
0 38 1.068256855010986

2s data, seq len of 10, randomly sampled, months 3, 4, and 6. I'll only train on the "average" column.

In [19]:
def train_and_test_only_avg(par_dir, features_arr_shape, labels_arr_shape, seq_len, months=(3, 4, 6)):
    """Train and evaluate a model on the "average" label using per-month CSV files.

    For every month ``m`` in ``months`` the files ``month_<m>.csv`` (features)
    and ``month_<m>_labels.csv`` (labels) are read from
    ``cleaned_parquet/007/<par_dir>/`` and concatenated in the given order.

    Args:
        par_dir: Sub-directory of ``cleaned_parquet/007`` holding the month files.
        features_arr_shape: Shape the concatenated feature table is reshaped to.
        labels_arr_shape: Shape the encoded "average" column is reshaped to;
            it must be 2-D because column 0 is selected as the target.
        seq_len: Sequence length forwarded to the loaders and the model.
        months: Month numbers to load. Defaults to ``(3, 4, 6)``, the months
            that were previously hard-coded.

    Side effects:
        Prints the per-step training loss and the test output.
    """
    base_dir = f"cleaned_parquet/007/{par_dir}"
    feature_frames = []
    label_frames = []
    for month in months:
        feature_frames.append(pd.read_csv(f"{base_dir}/month_{month}.csv"))
        label_frames.append(pd.read_csv(f"{base_dir}/month_{month}_labels.csv"))

    df_feature_rows_all = pd.concat(feature_frames)
    df_all_label_rows = pd.concat(label_frames)

    df_feature_rows_all = df_feature_rows_all.drop(columns=['Unnamed: 0', 'date_time', "date", "E"])

    # Only the "average" rating is used as a target; map it to class indices.
    encoded_average = df_all_label_rows["average"].apply(encoding)

    df_as_np = df_feature_rows_all.to_numpy().reshape(features_arr_shape)
    labels = encoded_average.to_numpy().reshape(labels_arr_shape)

    train_loader, val_loader, test_loader = select_target(df_features_as_np=df_as_np, df_labels_as_np=labels, i=0, seq_len=seq_len)
    model = train(dataloader=train_loader, lr=0.05, epochs=4, seq_length=seq_len)
    test(model, dataloader=test_loader)

In [20]:
# Months 3, 4 and 6 of sample_1: 12000 sequences of 10 steps with 7 features each.
train_and_test_only_avg(
    par_dir="sample_1_seq_len_10",
    features_arr_shape=(12000, 10, 7),
    labels_arr_shape=(12000, 1),
    seq_len=10,
)

0 0 1.1538660526275635
0 1 1.116295576095581
0 2 1.1293838024139404
0 3 1.0941518545150757
0 4 1.1017155647277832
0 5 1.1030552387237549
0 6 1.0950766801834106
0 7 1.1060898303985596
0 8 1.1018568277359009
0 9 1.1014206409454346
0 10 1.099120855331421
0 11 1.0990945100784302
0 12 1.0987277030944824
0 13 1.099334716796875
0 14 1.0986086130142212
0 15 1.0980411767959595
0 16 1.098451018333435
0 17 1.1007755994796753
0 18 1.101486325263977
0 19 1.0957454442977905
1 20 1.1004573106765747
1 21 1.0961257219314575
1 22 1.099479079246521
1 23 1.0987234115600586
1 24 1.0976454019546509
1 25 1.1007750034332275
1 26 1.0978796482086182
1 27 1.0989290475845337
1 28 1.1008702516555786
1 29 1.0985116958618164
1 30 1.0984441041946411
1 31 1.098205804824829
1 32 1.0995855331420898
1 33 1.096897840499878
1 34 1.09757661819458
1 35 1.1003830432891846
1 36 1.0992070436477661
1 37 1.1002641916275024
1 38 1.0995069742202759
1 39 1.0969127416610718
2 40 1.096835732460022
2 41 1.0983858108520508
2 42 1.099147

2s data, seq len of 50, randomly sampled, months 3, 4, and 6. I'll only train on the "average" column.

In [21]:
# Months 3, 4 and 6 of sample_2: 12000 sequences of 50 steps with 7 features each.
train_and_test_only_avg(
    par_dir="sample_2_seq_len_50",
    features_arr_shape=(12000, 50, 7),
    labels_arr_shape=(12000, 1),
    seq_len=50,
)

0 0 1.0973550081253052
0 1 1.098651647567749
0 2 1.0997881889343262
0 3 1.0995265245437622
0 4 1.099698543548584
0 5 1.0989173650741577
0 6 1.1000720262527466
0 7 1.0991930961608887
0 8 1.0986700057983398
0 9 1.0986205339431763
0 10 1.1005780696868896
0 11 1.0993447303771973
0 12 1.0985233783721924
0 13 1.098670482635498
0 14 1.099918246269226
0 15 1.100693702697754
0 16 1.0959781408309937
0 17 1.1002018451690674
0 18 1.100597620010376
0 19 1.0972657203674316
1 20 1.1027390956878662
1 21 1.09779691696167
1 22 1.0979505777359009
1 23 1.0991469621658325
1 24 1.0987643003463745
1 25 1.0979726314544678
1 26 1.1000295877456665
1 27 1.0995970964431763
1 28 1.0991696119308472
1 29 1.0986415147781372
1 30 1.0986360311508179
1 31 1.0988928079605103
1 32 1.1010991334915161
1 33 1.0991913080215454
1 34 1.1015105247497559
1 35 1.0977054834365845
1 36 1.1000919342041016
1 37 1.09816312789917
1 38 1.098265290260315
1 39 1.0997756719589233
2 40 1.1013280153274536
2 41 1.09904944896698
2 42 1.09952986