## Some experiments to check data distributions and model performance

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from pytorch_lightning import Trainer, LightningModule
from torch.utils.data import random_split
from pytorch_lightning.callbacks import EarlyStopping
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import numpy as np

import pandas as pd

from model import GRUD
from data import TimeSeriesDataset

from datetime import datetime
import argparse

In [2]:
seed = 42
seq_len = 100
step_size = 10

def split_data(
    df,
    start_test="2023-06-14 07:36:33.297403",
    end_test="2023-06-14 08:03:30.700492",
):
    """The timestamps are in column 'Time (s)"""
    # Definne the test set as the rows in between the start_test and end_test timestamps
    test_df = df[(df["Time (s)"] >= start_test) & (df["Time (s)"] <= end_test)]
    # Define the train set as the rows before the start_test timestamp and after the end_test timestamp
    train_df = df[(df["Time (s)"] < start_test) | (df["Time (s)"] > end_test)]
    return train_df, test_df

df = pd.read_csv('data.csv')

# Split the dataframe into train, validation, and test
train_df, test_df = split_data(df)
# create the training dataset
train_dataset = TimeSeriesDataset(
    train_df
)

# create the test dataset using the scaler from the training dataset
test_dataset = TimeSeriesDataset(
    test_df,
    scaler=train_dataset.get_scaler(),
    label_encoder=train_dataset.get_label_encoder(),
)

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def metrics(labels, predictions):
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    f1 = f1_score(labels, predictions, average='weighted')
    
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    
    cm = confusion_matrix(labels, predictions)
    class_labels = test_dataset.get_label_encoder().classes_
    
    sns.set_style("whitegrid")
    sns.set_context("paper")
    plt.figure(figsize=(6, 6))
    sns.set(font_scale=1.5)
    sns.heatmap(cm, 
                annot=True, 
                cmap="Blues", 
                fmt="d", 
                xticklabels=class_labels, 
                yticklabels=class_labels, 
                cbar=False
    )
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.savefig("confusion_matrix.pdf", bbox_inches='tight')
    plt.show()
    
    return accuracy, precision, recall, f1

# Load the model from grud\models\model_42_20230622-181251.pt
checkpoint = torch.load('models/model_42_20230622-181251.pt')
model = GRUD(input_size=17, hidden_size=17, output_size=3)
print(checkpoint['state_dict'].keys())
model.load_state_dict(checkpoint['state_dict'])


# Make predictions on the test set
predictions = model.predict(test_dataset)

# Get the labels from the test set with the label encoder
labels = test_dataset.get_labels()

# Calculate the metrics
metrics(labels, predictions)


KeyError: 'state_dict'

In [None]:
# Print the shapes of the datasets
print(f"Training set shape: {train_dataset.x.shape}")
# print(f"Validation set shape: {val_dataset.x.shape}")
print(f"Test set shape: {test_dataset.x.shape}")

# # Print the first few labels in each dataset
# print("First few labels:")
# print("Training set:", train_dataset.y[:50].numpy())
# # print("Validation set:", val_dataset.y[:10].numpy())
# print("Test set:", test_dataset.y[:10].numpy())


# Print the distributions of the labels in each dataset
print("Label distributions:")
print("Training set:", np.bincount(train_dataset.y.numpy()))
# print("Validation set:", np.bincount(val_dataset.y.numpy()))
print("Test set:", np.bincount(test_dataset.y.numpy()))

# Print the datasets as sets
print("Training set:", set(train_dataset.y.numpy()))
# print("Validation set:", set(val_dataset.y.numpy()))
print("Test set:", set(test_dataset.y.numpy()))


Training set shape: torch.Size([3215, 100, 17])
Test set shape: torch.Size([1607, 100, 17])
Label distributions:
Training set: [2294    7  100   84  193  537]
Test set: [979  26 118 123 258 103]
Training set: {0, 1, 2, 3, 4, 5}
Test set: {0, 1, 2, 3, 4, 5}


NameError: name 'val_dataset' is not defined

In [None]:


# Analyze the datasets
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate label distribution
train_label_distribution = np.bincount(train_dataset.y.flatten()) / len(train_dataset.y.flatten())
val_label_distribution = np.bincount(val_dataset.y.flatten()) / len(val_dataset.y.flatten())
test_label_distribution = np.bincount(test_dataset.y.flatten()) / len(test_dataset.y.flatten())

# Plot label distribution
plt.figure(figsize=(12, 6))
plt.bar(np.arange(6), train_label_distribution, alpha=0.5, label='Train')
plt.bar(np.arange(6), val_label_distribution, alpha=0.5, label='Validation')
plt.bar(np.arange(6), test_label_distribution, alpha=0.5, label='Test')
plt.xlabel('Label')
plt.ylabel('Frequency')
plt.title('Label Distribution')
plt.legend()
plt.show()


# Calculate missing values
missing_values_train = np.sum(~train_dataset.mask.numpy().astype(bool)) / np.prod(train_dataset.mask.shape)
missing_values_val = np.sum(~val_dataset.mask.numpy().astype(bool)) / np.prod(val_dataset.mask.shape)
missing_values_test = np.sum(~test_dataset.mask.numpy().astype(bool)) / np.prod(test_dataset.mask.shape)

print(f"Missing values in training data: {missing_values_train*100:.2f}%")
print(f"Missing values in validation data: {missing_values_val*100:.2f}%")
print(f"Missing values in testing data: {missing_values_test*100:.2f}%")

# Plot feature distribution
plt.figure(figsize=(12, 6))
for i in range(train_dataset.x.shape[2]):
    sns.histplot(train_dataset.x[:, :, i].numpy().flatten(), bins=50, kde=True)
plt.xlabel('Feature Value')
plt.ylabel('Frequency')
plt.title('Feature Distribution in Training Data')
plt.show()

