In [15]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import numpy as np
import math
import torch.optim as optim

In [16]:
# Lookup table from a textual rating to the integer class id used as target.
d = dict(good=0, neutral=1, bad=2)


def encoding(label):
    """Translate a textual label into its integer class id.

    The label is looked up in the module-level mapping ``d``; a label that is
    not one of its keys raises ``KeyError``.
    """
    class_id = d[label]
    return class_id

In [17]:
class PatientDataset(Dataset):
    """Map-style dataset pairing each entry of ``df_as_np`` with its label.

    ``seq_len`` is kept on the instance but is not used by the dataset itself.
    """

    def __init__(self, df_as_np, labels, seq_len):
        self.seq_len = seq_len
        self.labels = labels
        self.data = df_as_np

    def __len__(self):
        # The feature container alone determines how many items exist.
        n_items = len(self.data)
        return n_items

    def __getitem__(self, idx):
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target

In [18]:
def load_patient_data(df_as_np, labels, seq_len, batch_size=50, seed=None):
    """Wrap the data in a ``PatientDataset`` and split it into three loaders.

    The dataset is split randomly into 80% train, 10% validation and the
    remaining items as test.

    Args:
        df_as_np: Indexable container of samples (one entry per item).
        labels: Indexable container of targets, aligned with ``df_as_np``.
        seq_len: Forwarded to ``PatientDataset``.
        batch_size: Batch size used by all three loaders.
        seed: Optional integer. When given, the split is drawn from a
            dedicated ``torch.Generator`` seeded with it, so the same items
            land in train/val/test on every run. When ``None`` (default) the
            global torch RNG is used, exactly as before. The per-epoch shuffle
            order of the train loader is not affected by this argument.

    Returns:
        ``(trainloader, valloader, testloader)``; only the train loader
        shuffles.
    """
    dataset = PatientDataset(df_as_np, labels, seq_len)
    n_total = len(dataset)
    train_size = int(0.8 * n_total)
    val_size = int(0.1 * n_total)
    # Whatever is left after rounding goes to the test split so sizes add up.
    test_size = n_total - train_size - val_size

    split_kwargs = {}
    if seed is not None:
        split_kwargs["generator"] = torch.Generator().manual_seed(seed)
    train_dataset, val_dataset, test_dataset = random_split(
        dataset, [train_size, val_size, test_size], **split_kwargs
    )

    trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    testloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return trainloader, valloader, testloader

In [19]:
class RecurrentNetwork(nn.Module):
    """ReLU ``nn.RNN`` followed by a linear classifier over all time steps.

    The RNN output for every time step is flattened and fed to a single
    linear layer, so the classifier input width is
    ``seq_length * hidden_size``.

    Args:
        seq_length: Number of time steps in every input sequence.
        hidden_size: Width of the RNN hidden state.
        num_layers: Number of stacked RNN layers.
        input_size: Number of features per time step (default 7).
        num_classes: Number of output logits (default 3).
    """

    def __init__(self, seq_length, hidden_size, num_layers, input_size=7, num_classes=3):
        super(RecurrentNetwork, self).__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            nonlinearity='relu',
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            # Width must follow hidden_size; a fixed "3" only worked when
            # hidden_size happened to be 3.
            nn.Linear(seq_length * hidden_size, num_classes),
        )

    def forward(self, x):
        """Return class logits for a batch shaped (batch, seq_length, input_size)."""
        # Keep the per-step outputs and discard the final hidden state.
        x, _ = self.rnn(x)
        return self.classifier(x)

In [20]:
def train(dataloader, seq_length, lr, epochs):
    """Train a fresh ``RecurrentNetwork`` on ``dataloader`` and return it.

    The network is built with ``hidden_size=3`` and ``num_layers=10`` and
    optimised with Adam (weight decay 0.0001) against a cross-entropy loss.
    One line ``epoch step loss`` is printed per batch; ``step`` counts
    batches across all epochs and is not reset when a new epoch starts.

    Args:
        dataloader: Iterable yielding ``(sequence, label)`` batches.
        seq_length: Sequence length passed on to the network.
        lr: Learning rate for Adam.
        epochs: Number of passes over ``dataloader``.

    Returns:
        The trained model.
    """
    model = RecurrentNetwork(seq_length=seq_length, hidden_size=3, num_layers=10)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.0001)
    criterion = nn.CrossEntropyLoss()
    model.train()

    step = 0  # running batch index over the whole training run
    for epoch in range(epochs):
        for seq, label in dataloader:
            inputs = seq.float()
            targets = label.long()

            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()

            print(epoch, step, loss.item())
            step += 1
    return model

In [21]:
def select_target(df_features_as_np, df_labels_as_np, i, seq_len):
    """Build train/val/test loaders that use label column ``i`` as the target.

    Column ``i`` of the 2-D label array is selected and handed to
    ``load_patient_data`` with a batch size of 500.
    """
    target_column = df_labels_as_np[:, i]
    loaders = load_patient_data(df_features_as_np, target_column, seq_len=seq_len, batch_size=500)
    train_loader, val_loader, test_loader = loaders
    return train_loader, val_loader, test_loader

In [22]:
def test(model, dataloader):
    for seq, labels in dataloader:
        output = model(seq.float())
        pred_labels = torch.argmax(output, dim=1)
        acc = (pred_labels == labels).float().mean().item()
        print(acc)
        print(pred_labels)
        print("##########################")
        print(labels)
        break

In [23]:
def train_and_test(csv_features, csv_labels, feature_cols_to_drop, label_cols_to_drop, features_range, features_shape, seq_len):
    """Train and evaluate one model per label column from two CSV files.

    Steps:
      1. Read both CSVs and drop the requested columns.
      2. Encode every remaining label column with ``encoding``.
      3. Keep the first ``features_range`` feature rows and reshape them to
         ``features_shape``.
      4. For label columns 0..5, build loaders, train for 4 epochs at
         ``lr=0.05`` and print first-batch test results.

    The loop always covers exactly six label columns.
    """
    raw_features = pd.read_csv(csv_features)
    raw_labels = pd.read_csv(csv_labels)
    features_frame = raw_features.drop(columns=feature_cols_to_drop)
    labels_frame = raw_labels.drop(columns=label_cols_to_drop)

    # Turn textual ratings into integer class ids, column by column.
    for name in labels_frame.columns:
        labels_frame[name] = labels_frame[name].apply(encoding)

    feature_rows = features_frame.to_numpy()[:features_range, :]
    features_np = feature_rows.reshape(features_shape)
    labels_np = labels_frame.to_numpy()

    print("pre_train shapes")
    print(features_np.shape)
    print(labels_np.shape)

    for i in range(6):
        print(f"############### LABEL {i} #################")
        train_loader, _, test_loader = select_target(features_np, labels_np, i, seq_len)
        model = train(dataloader=train_loader, lr=0.05, epochs=4, seq_length=seq_len)
        print("################# TESTING ##############################")
        test(model, test_loader)

Youcef's minute data training

In [None]:
# Preview the minute-level feature CSV that the training cell below consumes.
df = pd.read_csv("minute_data_007_youcef/encoded.csv")
print(df)

In [None]:
# Load the matching label CSV (rebinding `df`) and list its column names.
df = pd.read_csv("minute_data_007_youcef/labels.csv")
print(df.columns)

In [None]:
# 180950 feature rows = 3619 sequences x 50 steps, each step with 7 features.
train_and_test(
    csv_features="minute_data_007_youcef/encoded.csv",
    csv_labels="minute_data_007_youcef/labels.csv",
    feature_cols_to_drop=['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'date', 'date_only', 'time'],
    label_cols_to_drop=['Unnamed: 0'],
    features_range=180950,
    features_shape=(3619, 50, 7),
    seq_len=50,
)

2s data, seq len of 10, randomly sampled, months 2-7

In [None]:
# Preview the sampled feature CSV from the sequence-length-10 sample directory.
df = pd.read_csv("cleaned_parquet/007/sample_1_seq_len_10/sample_features.csv")
print(df)

In [None]:
# Preview the label CSV from the same sample directory (rebinds `df`).
df = pd.read_csv("cleaned_parquet/007/sample_1_seq_len_10/sample_labels.csv")
print(df)

In [None]:
# 240000 feature rows = 24000 sequences x 10 steps, each step with 7 features.
train_and_test(
    csv_features="cleaned_parquet/007/sample_1_seq_len_10/sample_features.csv",
    csv_labels="cleaned_parquet/007/sample_1_seq_len_10/sample_labels.csv",
    feature_cols_to_drop=['Unnamed: 0.1', 'Unnamed: 0', 'date_time', "date", "E"],
    label_cols_to_drop=['Unnamed: 0.1', 'Unnamed: 0'],
    features_range=240000,
    features_shape=(24000, 10, 7),
    seq_len=10,
)

2s data, seq len of 50, randomly sampled, months 2-7

In [None]:
# Preview the sampled feature CSV from the sequence-length-50 sample directory.
df = pd.read_csv("cleaned_parquet/007/sample_2_seq_len_50/sample_features.csv")
print(df)

In [None]:
# Preview the label CSV from the same sample directory (rebinds `df`).
df = pd.read_csv("cleaned_parquet/007/sample_2_seq_len_50/sample_labels.csv")
print(df)

In [None]:
# 1200000 feature rows = 24000 sequences x 50 steps, each step with 7 features.
train_and_test(
    csv_features="cleaned_parquet/007/sample_2_seq_len_50/sample_features.csv",
    csv_labels="cleaned_parquet/007/sample_2_seq_len_50/sample_labels.csv",
    feature_cols_to_drop=['Unnamed: 0.1', 'Unnamed: 0', 'date_time', "date", "E"],
    label_cols_to_drop=['Unnamed: 0.1', 'Unnamed: 0'],
    features_range=1200000,
    features_shape=(24000, 50, 7),
    seq_len=50,
)

2s data, seq len of 10, randomly sampled, months 3, 4, 6. I'll only train on the "average" column.

In [80]:
def train_and_test_only_avg(par_dir, features_arr_shape, labels_arr_shape, seq_len, months=(3, 4, 6)):
    """Train and evaluate a single model on the "average" label column.

    For every month in ``months`` the files
    ``cleaned_parquet/007/{par_dir}/month_{m}.csv`` and
    ``cleaned_parquet/007/{par_dir}/month_{m}_labels.csv`` are read and the
    months are concatenated in the given order.

    Args:
        par_dir: Sub-directory of ``cleaned_parquet/007`` holding the CSVs.
        features_arr_shape: Shape the concatenated feature rows are reshaped to.
        labels_arr_shape: Shape the encoded "average" labels are reshaped to.
            It must be 2-D, because column 0 is selected as the target.
        seq_len: Sequence length passed to the loaders and the model.
        months: Month numbers to load. Defaults to ``(3, 4, 6)``, the set that
            was previously hard-coded.
    """
    base_dir = f"cleaned_parquet/007/{par_dir}"

    feature_frames = []
    label_frames = []
    for month in months:
        feature_frames.append(pd.read_csv(f"{base_dir}/month_{month}.csv"))
        label_frames.append(pd.read_csv(f"{base_dir}/month_{month}_labels.csv"))

    df_feature_rows_all = pd.concat(feature_frames)
    df_all_label_rows = pd.concat(label_frames)

    # Drop index/timestamp-like columns so only model features remain.
    df_feature_rows_all = df_feature_rows_all.drop(columns=['Unnamed: 0', 'date_time', "date", "E"])

    # Turn textual ratings into integer class ids.
    average_encoded = df_all_label_rows["average"].apply(encoding)

    df_as_np = df_feature_rows_all.to_numpy().reshape(features_arr_shape)
    labels = average_encoded.to_numpy().reshape(labels_arr_shape)

    # The label array has a single column, hence i=0.
    train_loader, val_loader, test_loader = select_target(df_features_as_np=df_as_np, df_labels_as_np=labels, i=0, seq_len=seq_len)
    model = train(dataloader=train_loader, lr=0.05, epochs=4, seq_length=seq_len)
    test(model, dataloader=test_loader)

In [None]:
# Months 3, 4 and 6 combined: 12000 sequences of 10 steps with 7 features each.
train_and_test_only_avg(
    par_dir="sample_1_seq_len_10",
    features_arr_shape=(12000, 10, 7),
    labels_arr_shape=(12000, 1),
    seq_len=10,
)

2s data, seq len of 50, randomly sampled, months 3, 4, 6. I'll only train on the "average" column.

In [84]:
# Months 3, 4 and 6 combined: 12000 sequences of 50 steps with 7 features each.
train_and_test_only_avg(
    par_dir="sample_2_seq_len_50",
    features_arr_shape=(12000, 50, 7),
    labels_arr_shape=(12000, 1),
    seq_len=50,
)

0 0 1.2231528759002686
0 1 4.316273212432861
0 2 3.7468833923339844
0 3 1.5085508823394775
0 4 1.127094030380249
0 5 1.1511147022247314
0 6 1.2539817094802856
0 7 1.1142102479934692
0 8 1.1036556959152222
0 9 1.1060819625854492
0 10 1.0972312688827515
0 11 1.1003899574279785
0 12 1.1003321409225464
0 13 1.099125623703003
0 14 1.1011409759521484
0 15 1.1040444374084473
0 16 1.1022144556045532
0 17 1.1009526252746582
0 18 1.0998716354370117
0 19 1.0987277030944824
1 20 1.0997847318649292
1 21 1.100624918937683
1 22 1.0998942852020264
1 23 1.097243070602417
1 24 1.0991166830062866
1 25 1.0989850759506226
1 26 1.0999258756637573
1 27 1.0978045463562012
1 28 1.0979042053222656
1 29 1.0991344451904297
1 30 1.0987861156463623
1 31 1.0989739894866943
1 32 1.0989238023757935
1 33 1.09808349609375
1 34 1.0982314348220825
1 35 1.0990158319473267
1 36 1.0994009971618652
1 37 1.0977013111114502
1 38 1.100135087966919
1 39 1.098105788230896
2 40 1.0970005989074707
2 41 1.0978676080703735
2 42 1.0990