In [48]:

import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq
from dataclasses import dataclass
import hvplot.pandas 
from tqdm import tqdm
# hv.renderer('bokeh').theme = 'dark_minimal'

In [49]:



# dataset_root = Path(r"C:\Users\Raffael\Documents\Datasets\alpiq_2023") # Raw string works without escaping \
dataset_root = Path(r"C:/Users/jadbh\Documents/Swisse/EPFL/courses/Fall 2024/Machine Learning for Predictive Maintenance/project/Dataset")
dataset_root = Path(r"C:\Users\jadbh\Documents\Swisse\EPFL\courses\Fall 2024\Machine Learning for Predictive Maintenance\project\team repo\Machine-Learning-for-Predictive-Maintenance-project\Dataset")


@dataclass
class Case():
    info: pd.DataFrame
    measurements: pd.DataFrame


class RawDataset():

    def __init__(self, root, unit = "VG4", load_training=False, load_synthetic=False) -> None:
        
        
        read_pq_file = lambda f: pq.read_table(root / f).to_pandas()
        
        
        cases = {
            "test": [f"{unit}_generator_data_testing_real_measurements.parquet", root / f"{unit}_generator_data_testing_real_info.csv" ], 
        }
        
        if load_training:
            cases = {
                **cases,
                "train": [f"{unit}_generator_data_training_measurements.parquet", root / f"{unit}_generator_data_training_info.csv" ], 
            }
        
        if load_synthetic:
            cases = {
                **cases,
                "test_s01": [f"{unit}_generator_data_testing_synthetic_01_measurements.parquet", root / f"{unit}_generator_data_testing_synthetic_01_info.csv"], 
                "test_s02": [f"{unit}_generator_data_testing_synthetic_02_measurements.parquet", root / f"{unit}_generator_data_testing_synthetic_02_info.csv"]
            }
        
        
        self.data_dict = dict()
        
        for id_c, c in cases.items():
            # if you need to verify the parquet header:
            # pq_rows = RawDataset.read_parquet_schema_df(root / c[0])
            info = pd.read_csv(c[1])
            measurements = read_pq_file(c[0])
            self.data_dict[id_c] = Case(info, measurements)
            
        
        
    @staticmethod
    def read_parquet_schema_df(uri: str) -> pd.DataFrame:
        """Return a Pandas dataframe corresponding to the schema of a local URI of a parquet file.

        The returned dataframe has the columns: column, pa_dtype
        """
        # Ref: https://stackoverflow.com/a/64288036/
        schema = pq.read_schema(uri, memory_map=True)
        schema = pd.DataFrame(({"column": name, "pa_dtype": str(pa_dtype)} for name, pa_dtype in zip(schema.names, schema.types)))
        schema = schema.reindex(columns=["column", "pa_dtype"], fill_value=pd.NA)  # Ensures columns in case the parquet file has an empty dataframe.
        return schema
    

rds_u4 = RawDataset(dataset_root, "VG4", load_synthetic=False, load_training=True)
rds_u5 = RawDataset(dataset_root, "VG5", load_synthetic=True, load_training=True)
rds_u6 = RawDataset(dataset_root, "VG6", load_synthetic=True)

In [50]:
vg5_train_meas = rds_u5.data_dict["train"].measurements
vg5_train_info = rds_u5.data_dict["train"].info
vg5_test_meas = rds_u5.data_dict["test"].measurements
vg5_test_info = rds_u5.data_dict["test"].info

In [51]:
vg5_train_filt = vg5_train_meas [ (vg5_train_meas['equilibrium_turbine_mode'] == True) |
                                           ((vg5_train_meas['equilibrium_pump_mode'] == True) & (vg5_train_meas['short_circuit_mode'] == False)) |
                                           ((vg5_train_meas['equilibrium_pump_mode'] == True) & (vg5_train_meas['short_circuit_mode'] == True) & (vg5_train_meas['equilibrium_short_circuit_mode'] == True)) ]

In [52]:
vg5_s1 = rds_u5.data_dict["test_s01"].measurements          # synthetic testing
vg5_s1_filt = vg5_s1 [ (vg5_s1['equilibrium_turbine_mode'] == True) |        # filtered synthetic testing
                    ((vg5_s1['equilibrium_pump_mode'] == True) & (vg5_s1['short_circuit_mode'] == False)) |
                    ((vg5_s1['equilibrium_pump_mode'] == True) & (vg5_s1['short_circuit_mode'] == True) & (vg5_s1['equilibrium_short_circuit_mode'] == True)) ]

# ACTUAL WORK NOW

In [53]:
vg5_train_meas.reset_index(inplace=True)
vg5_train_filt = vg5_train_meas [ (vg5_train_meas['equilibrium_turbine_mode'] == True) |
                                           ((vg5_train_meas['equilibrium_pump_mode'] == True) & (vg5_train_meas['short_circuit_mode'] == False)) |
                                           ((vg5_train_meas['equilibrium_pump_mode'] == True) & (vg5_train_meas['short_circuit_mode'] == True) & (vg5_train_meas['equilibrium_short_circuit_mode'] == True)) ]

In [54]:
# summary of VG5 useful variables

vg5_train_info
vg5_train_filt      # equilibirum
vg5_train_meas

vg5_test_info
vg5_test_meas

vg5_s1
vg5_s1_filt         # equilibrium

print()




In [55]:
df = vg5_train_filt
df.columns

Index(['index', 'tot_activepower', 'ext_tmp', 'plant_tmp', 'charge',
       'coupler_position', 'injector_01_opening', 'injector_02_opening',
       'injector_03_opening', 'injector_04_opening', 'injector_05_opening',
       'pump_calculated_flow', 'pump_pressure_diff', 'pump_rotspeed',
       'tot_current', 'tot_effectivepower', 'tot_reactivepower',
       'turbine_pressure', 'turbine_rotspeed', 'water_primary_pump_01_opening',
       'water_primary_pump_02_opening', 'air_circ_cold_01_tmp',
       'air_circ_cold_02_tmp', 'air_circ_cold_03_tmp', 'air_circ_cold_04_tmp',
       'air_circ_cold_05_tmp', 'air_circ_cold_06_tmp', 'air_circ_hot_01_tmp',
       'air_circ_hot_02_tmp', 'air_circ_hot_03_tmp', 'air_circ_hot_04_tmp',
       'air_circ_hot_05_tmp', 'air_circ_hot_06_tmp', 'elec_freq',
       'exc_current', 'exc_voltage', 'mid_voltage', 'neutral_current',
       'ph01_current', 'ph01_voltage', 'ph12_voltage', 'ph02_current',
       'ph02_voltage', 'ph23_voltage', 'ph03_current', 'ph03_v

In [56]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

class SlidingWindowDataset(Dataset):
    def __init__(self, dataframe, feature_columns, window_size, step_size=1):
        """
        Args:
            dataframe (pd.DataFrame): The dataframe containing sensor data.
            feature_columns (list): List of column names for features.
            window_size (int): The number of timesteps in each sliding window.
            step_size (int): The step size to slide the window.
        """
        self.features = dataframe[feature_columns].values
        self.window_size = window_size
        self.step_size = step_size
        self.num_samples = (len(self.features) - window_size) // step_size + 1

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        start_idx = idx * self.step_size
        end_idx = start_idx + self.window_size
        x = torch.tensor(self.features[start_idx:end_idx], dtype=torch.float32)
        return x

# Example usage
# Replace 'feature_columns' with your list of feature column names
feature_columns = ["ext_tmp", "injector_01_opening"]  # Replace with your column names
window_size = 10  # Number of timesteps per window
step_size = 1  # Step size for sliding window

# Create Dataset and DataLoader
dataset = SlidingWindowDataset(df, feature_columns, window_size, step_size)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Example: Iterating through the dataloader
for batch in dataloader:
    sliding_windows = batch
    print(sliding_windows.shape)  # Replace with your training loop logic
    break

torch.Size([32, 10, 2])


In [57]:
df2 = df.copy()
df2['index_num'] = df2.index
df2.sample(2)

Unnamed: 0,index,tot_activepower,ext_tmp,plant_tmp,charge,coupler_position,injector_01_opening,injector_02_opening,injector_03_opening,injector_04_opening,...,machine_off,turbine_mode,all,equilibrium_turbine_mode,dyn_only_on,pump_mode,short_circuit_mode,equilibrium_pump_mode,equilibrium_short_circuit_mode,index_num
827755,2020-10-15 18:32:00+02:00,115.738297,14.947145,18.891874,113.121992,9.12835,64.437325,64.074158,64.318302,64.157388,...,False,True,True,True,False,False,False,False,False,827755
688909,2020-08-28 10:12:00+02:00,6.962749,17.676562,18.058655,4.627991,9.1945,0.0,16.343939,0.0,0.0,...,False,True,True,True,False,False,False,False,False,688909


In [58]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

class SlidingWindowDataset(Dataset):
    def __init__(self, dataframe, feature_columns, window_size, step_size=1, max_gap=10, stride = 2):
        """
        Args:
            dataframe (pd.DataFrame): The dataframe containing sensor data.
            feature_columns (list): List of column names for features.
            window_size (int): The number of timesteps in each sliding window.
            step_size (int): The step size to slide the window.
            max_gap (int): Maximum allowed gap between consecutive indices for grouping.
        """
        self.features = dataframe[feature_columns].values
        self.indices = dataframe.index.values
        self.window_size = window_size
        self.step_size = step_size
        self.max_gap = max_gap
        self.stride = stride

        # Identify groups based on index gaps
        self.groups = self._identify_groups()
        # print(self.groups[0:50])
        # print(len(self.groups[500]))
        self.valid_windows = self._generate_valid_windows()

    def _identify_groups(self):
        """
        Identify groups of rows based on the max_gap condition.
        """
        groups = []
        current_group = [0]  # Start with the first row
        for i in range(1, len(self.indices)):
            if self.indices[i] - self.indices[i - 1] > self.max_gap:
                groups.append(current_group)
                current_group = [i]
            else:
                current_group.append(i)
        groups.append(current_group)  # Add the last group
        return groups

    def _generate_valid_windows(self):
        """
        Generate valid sliding windows based on groups.
        """
        valid_windows = []
        for group in self.groups:
            
            for start in range(0, len(group) - self.window_size + 1, self.stride):
                valid_windows.append(group[start : start + self.window_size])

            if (len(group) - self.window_size)%self.stride != 0 and len(group)> self.window_size:
                # print(len(group[len(group) - self.window_size : len(group)]))
                valid_windows.append(group[len(group) - self.window_size : len(group)])

        # print(valid_windows[-1])
        return valid_windows

    def __len__(self):
        return len(self.valid_windows)

    def __getitem__(self, idx):
        """
        Retrieve a sliding window by index.
        """
        window_indices = self.valid_windows[idx]
        x = torch.tensor(self.features[window_indices], dtype=torch.float32)
        return x

# Example usage
# Replace 'feature_columns' with your list of feature column names
feature_columns = ["ext_tmp", "tot_activepower"]  # Replace with your column names
window_size = 10    # Number of timesteps per window
stride = 3    # Step size for sliding window
max_gap = 10   # Maximum allowed gap for grouping

# Create Dataset and DataLoader
dataset = SlidingWindowDataset(df2, feature_columns, window_size, step_size, max_gap, stride = stride)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

maxes = []
# Example: Iterating through the dataloader
# for i, batch in enumerate(dataloader):
#     sliding_windows = batch
#     diff = sliding_windows[0,1:10,1] - sliding_windows[0,0:9,1]
#     maxes.append(diff.max().item())
#     # print(maxes)  # Replace with your training loop logi
#     # if i>100:
#     #     break
# # plt.hist(maxes)
# pd_maxes = pd.DataFrame(maxes)

In [59]:
# pd_maxes.value_counts()

In [60]:
dataset

<__main__.SlidingWindowDataset at 0x21db2e78c50>

In [61]:
from torch import nn

class DenseAutoencoder(nn.Module):

    def __init__(self, input_dim):
        super(DenseAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 32),
            )
        
        # TODO : write a symetrical neural network to the encoder to reconstuct the input           
        # ===================================
        # IMPLEMENT YOUR CODE HERE                                  
        self.decoder = nn.Sequential(
            nn.Linear(32,128),
            nn.ReLU(),
            nn.Linear(128, 512),
            nn.ReLU(),
            nn.Linear(512, input_dim)
            # nn.Sigmoid()
            )
        # ===================================

    def forward(self, x):
        feature = self.encoder(x)
        reconstruction = self.decoder(feature)
        return reconstruction,feature

In [65]:
def train_loop(dataloader, model, loss_fn, optimizer, print_every=10):
    loss_evol = []
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    loss_running = 0
    for batch, x in enumerate(tqdm(dataloader)):
        # Compute prediction and loss
        x = x.flatten(start_dim=1)
        optimizer.zero_grad()
        x_pred , _ = model(x)
        
        # TODO : write the loss function, compute the gradient, and update the model parameters
        # ===================================
        # IMPLEMENT YOUR CODE HERE
        loss = loss_fn(x_pred, x)
        loss.backward()
        optimizer.step()
        # ===================================
        
        # Back Propagation
        loss_running += loss.item()
    # print(f"loss: {loss_running/len(dataloader):>7f}")
    loss_evol.append(loss.item())
    print(f"loss: {loss_running/len(dataloader)}")
    return loss_evol

model = DenseAutoencoder(window_size * len(feature_columns))
epochs = 5
optimizer = torch.optim.AdamW(model.parameters(), lr = 0.001) # slightly better convergence for pump
loss_fn = nn.MSELoss()

# %%time
for epoch in range(epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    loss_evol = train_loop(dataloader, model, loss_fn, optimizer, print_every=1);


Epoch 1
-------------------------------


100%|██████████| 4721/4721 [00:37<00:00, 125.24it/s]


loss: 143.20048843629405
Epoch 2
-------------------------------


100%|██████████| 4721/4721 [00:37<00:00, 125.25it/s]


loss: 96.45691745621569
Epoch 3
-------------------------------


100%|██████████| 4721/4721 [00:40<00:00, 115.32it/s]


loss: 62.40443716563633
Epoch 4
-------------------------------


100%|██████████| 4721/4721 [00:37<00:00, 124.63it/s]


loss: 59.316910387936716
Epoch 5
-------------------------------


100%|██████████| 4721/4721 [00:39<00:00, 119.13it/s]

loss: 25.85276252691518





In [67]:
import matplotlib.pyplot as plt
print(loss_evol)

[8.992798805236816]
