In [7]:
# general imports
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
from matplotlib.colors import ListedColormap
import seaborn as sns

# from our documents
import preprocessing_functions as pf

# from Scikit Learn library
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import random

# Fix the seed of every random number generator used in this notebook so runs are repeatable
seed = 42
random.seed(seed)     # Python's built-in generator
np.random.seed(seed)  # NumPy's global generator


The following pipeline aims to automate the preprocessing of streaming time-series data.

We assume that part of the dataset is already available; it is used as historical data to fill missing values and to study seasonality and stationarity.

In particular, this notebook shows what to do whenever new data arrives in the system.

# Initial setup

In [8]:
# Relative path of the pickled dataset (resolved against the current working directory)
file_path = 'smart_app_data.pkl'
# NOTE(review): unpickling can execute arbitrary code stored in the file —
# only load this pickle if it comes from a trusted source.
df = pd.read_pickle(file_path)

In [None]:

# Ensure 'time' column is in datetime format
df['time'] = pd.to_datetime(df['time'])

# Sort the dataframe by 'time'
df = df.sort_values(by='time')

# Cutoff between historical and incoming data.
# It is built in the timezone of the 'time' column (tz is None for naive data),
# because pandas raises a TypeError when tz-aware and tz-naive datetimes are compared.
cutoff_date = pd.Timestamp('2024-10-01', tz=df['time'].dt.tz)

# Split on the single cutoff defined above instead of repeating it as year/month arithmetic:
#   historical -> strictly before the cutoff
#   incoming   -> from the cutoff onward (October 2024 itself is included)
# Rows whose 'time' is NaT satisfy neither comparison and end up in neither part.
historical_data = df[df['time'] < cutoff_date]
incoming_data = df[df['time'] >= cutoff_date]

# Check the first few rows of each dataset to confirm
print("Historical Data (before October 2024):")
print(historical_data.head())

print("\nIncoming Data (from October 2024 onward):")
print(incoming_data.head())

In [14]:
# Sanity check of the split: report the size of each part ...
print(f"\nNumber of rows in historical data: {historical_data.shape[0]}")
print(f"Number of rows in incoming data: {incoming_data.shape[0]}")

# ... and the time span each part covers (the frames are already sorted by 'time')
print(f"First row in historical data: {historical_data['time'].iloc[0]}")
print(f"Last row in historical data: {historical_data['time'].iloc[-1]}")

print(f"\nFirst row in incoming data: {incoming_data['time'].iloc[0]}")
print(f"Last row in incoming data: {incoming_data['time'].iloc[-1]}")



Number of rows in historical data: 47936
Number of rows in incoming data: 4256
First row in historical data: 2024-03-01 00:00:00+00:00
Last row in historical data: 2024-09-30 00:00:00+00:00

First row in incoming data: 2024-10-01 00:00:00+00:00
Last row in incoming data: 2024-10-19 00:00:00+00:00


In [15]:
# One time series exists per (machine, KPI) pair, so collect the distinct
# values of both columns, in order of first appearance.
machines = df['name'].drop_duplicates().tolist()
print("All machines are: ", machines, sep="\n")

kpis = df['kpi'].drop_duplicates().tolist()
print("All KPIs are: ", kpis, sep="\n")

All machines are: 
['Large Capacity Cutting Machine 1', 'Assembly Machine 2', 'Laser Cutter', 'Riveting Machine', 'Assembly Machine 3', 'Medium Capacity Cutting Machine 1', 'Medium Capacity Cutting Machine 2', 'Low Capacity Cutting Machine 1', 'Testing Machine 3', 'Medium Capacity Cutting Machine 3', 'Testing Machine 1', 'Testing Machine 2', 'Laser Welding Machine 1', 'Laser Welding Machine 2', 'Large Capacity Cutting Machine 2', 'Assembly Machine 1']
All KPIs are: 
['working_time', 'consumption_idle', 'consumption_working', 'power', 'consumption', 'offline_time', 'idle_time', 'cost', 'cost_working', 'cost_idle', 'cycles', 'good_cycles', 'bad_cycles', 'average_cycle_time']


In [None]:
# TODO: dictionary holding the KPI-specific preprocessing settings.
# It will store the kwargs that are passed to the pipelines below.
# The idea is to implement it in a separate module and import it here.

# Preprocessing pipeline

The pipeline receives as input the new incoming data for a specific machine and KPI. To perform the preprocessing, it also needs a fixed-size batch of past data and the information on how to handle that KPI for that machine (given by `kwargs`).

In [None]:
def cleaning_pipeline(batch, new_input, kwargs):
    """Clean newly arrived data and roll the historical batch forward.

    Parameters
    ----------
    batch : pandas.DataFrame
        Window of past observations for one machine/KPI time series.
    new_input : pandas.DataFrame
        Newly arrived observations for the same time series.
    kwargs : dict
        KPI-specific preprocessing options. Keys read here:

        - ``'resample'`` (default ``False``): call ``pf.resample_data``.
        - ``'smooth'`` (default ``False``): call ``pf.smooth_data``.
        - ``'time_column'`` (default ``'timestamp'``): column used to order
          the rows when the batch is rolled forward.

    Returns
    -------
    new_batch : pandas.DataFrame
        ``batch`` extended with ``new_input``, sorted by the time column and
        trimmed back to ``len(batch)`` rows (the oldest rows are dropped).
    new_input : pandas.DataFrame
        The (possibly resampled/smoothed) new observations.
    """

    ####### Preprocessing of the data

    # Resampling (if needed)
    if kwargs.get('resample', False):
        new_input = pf.resample_data(batch, new_input, kwargs)  # not implemented yet

    # Smoothing (if needed, based on kwargs)
    if kwargs.get('smooth', False):
        new_input = pf.smooth_data(batch, new_input, kwargs)  # not implemented yet

    ### DATA CLEANING

    ## Data type standardization

    # function present preprocessing_functions

    ## Check for inconsistencies

    # function present preprocessing_functions

    ## Fill missing values

    # function present preprocessing_functions

    ### FEATURE ENGINEERING
    time_column = kwargs.get('time_column', 'timestamp')
    new_batch = pd.concat([batch, new_input]).sort_values(by=time_column)

    # Keep the window at its original size: drop as many of the oldest rows as
    # new rows came in. Dropping exactly one row would let the batch grow when
    # several rows arrive at once and shrink when no row arrives.
    new_batch = new_batch.tail(len(batch))

    return new_batch, new_input


In [None]:
def feature_engineering_pipeline(dataframe, kwargs):
    """Apply stationarity handling, detrending/deseasonalizing, scaling and encoding.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data of one time series to transform.
    kwargs : dict
        KPI-specific options. Keys read here:

        - ``'period_of_observation'`` (required): passed to
          ``pf.seasonal_additive_decomposition``.
        - ``'need_stationarity'`` (default ``False``): call ``pf.make_stationary``
          when the ADF test does not report the data as stationary.
        - ``'detrend'`` (default ``False``): call ``pf.rest_trend``.
        - ``'deseasonalize'`` (default ``False``): call ``pf.rest_seasonality``.
        - ``'encoder'`` (default ``False``): one-hot encode the object-dtype columns.
        - ``'scaler'`` (default ``True``): robust-scale all other columns.

    Returns
    -------
    The output of ``ColumnTransformer.fit_transform`` for the selected
    transformers. Columns not covered by a selected transformer are dropped
    (the transformer's default ``remainder``). When both ``'encoder'`` and
    ``'scaler'`` are falsy, no transformer is applied and
    ``dataframe.to_numpy()`` is returned.

    Raises
    ------
    ValueError
        If ``'period_of_observation'`` is missing from ``kwargs``.
    """
    # Validate the required option first so a misconfigured KPI fails
    # before any statistical test is run.
    period_of_observation = kwargs.get('period_of_observation', None)
    if period_of_observation is None:
        raise ValueError("Period of observation must be provided in kwargs.")

    ## Check stationarity
    # False if not stationary, True if it is, None if test couldn't be applied
    is_stationary = pf.adf_test(dataframe.dropna())

    ## Check seasonality
    # NOTE(review): the decomposition components are not consumed yet; the call
    # is kept so that its behaviour (including any error it raises) is unchanged.
    trend, seasonality, residual = pf.seasonal_additive_decomposition(dataframe, period_of_observation)

    # Make data stationary / Detrend / Deseasonalize (if needed)
    make_stationary = kwargs.get('need_stationarity', False)
    detrend = kwargs.get('detrend', False)
    deseasonalize = kwargs.get('deseasonalize', False)

    # NOTE(review): a None result (test not applicable) is treated like
    # "not stationary" here — confirm that this is intended.
    if make_stationary and not is_stationary:
        dataframe = pf.make_stationary(dataframe, kwargs)  # not implemented yet

    # The two steps are independent; the trend is removed first when both are requested.
    if detrend:
        dataframe = pf.rest_trend(dataframe, kwargs)  # not implemented yet
    if deseasonalize:
        dataframe = pf.rest_seasonality(dataframe, kwargs)  # not implemented yet

    # Normalize data and apply encoding (if needed)
    categorical_columns = dataframe.select_dtypes(include=['object']).columns

    use_encoder = kwargs.get('encoder', False)
    use_scaler = kwargs.get('scaler', True)

    # Each flag enables its own transformer, so scaler=False is honoured even
    # when the encoder is disabled as well.
    transformers = []
    if use_scaler:
        transformers.append(('num', RobustScaler(), ~dataframe.columns.isin(categorical_columns)))
    if use_encoder:
        transformers.append(('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns))

    if not transformers:
        # Nothing was requested: hand the values back untouched, as an array
        # like the transformed output.
        return dataframe.to_numpy()

    preprocessor = ColumnTransformer(transformers=transformers)

    # Fit and transform the data
    dataframe_transformed = preprocessor.fit_transform(dataframe)

    return dataframe_transformed

For the ML algorithms part, should we write a dedicated function for drift detection (separate from the ML function)?

Should we write two different functions for training and usage, or is it better to have a single function?

In [None]:
def features_extraction_pipeline(time_series, kwargs):
    """Extract features from a time series, to be exposed as new KPIs.

    Placeholder: the body is not implemented yet. The comments below list the
    planned stages. Once implemented, the function is meant to return the
    extracted features as new KPIs.

    Parameters
    ----------
    time_series
        Time series to extract features from.
    kwargs : dict
        KPI-specific options.

    Raises
    ------
    NotImplementedError
        Always, until the pipeline is implemented.
    """

    ## Definition of metrics for goodness of model

    ## Drift detection algorithms

    ### ML ALGORITHMS (this should be divided on training phase and prediction phase)

    ## Check Outliers

    ## Feature selection

    ## Parameters setting

    ## Feature extraction algorithm (maybe more than one, include selection)

    ## Models comparison (if needed)

    # Fail explicitly instead of returning a name that was never assigned.
    raise NotImplementedError("features_extraction_pipeline is not implemented yet.")
    

In [None]:
def forecasting_pipeline(time_series, kwargs):
    """Forecast a time series, exposing the prediction as a new KPI.

    Placeholder: the body is not implemented yet. The comments below list the
    planned stages. Once implemented, the function is meant to return the
    forecasting prediction as a new KPI.

    Parameters
    ----------
    time_series
        Time series to forecast.
    kwargs : dict
        KPI-specific options.

    Raises
    ------
    NotImplementedError
        Always, until the pipeline is implemented.
    """

    ## Definition of metrics for goodness of model

    ## Drift detection algorithms

    ### ML ALGORITHMS (this should be divided on training phase and prediction phase)

    ## Check Outliers

    ## Feature selection

    ## Parameters setting

    ## Forecasting algorithm (maybe more than one, include selection)

    ## Models comparison (if needed)

    # Fail explicitly instead of returning a name that was never assigned.
    raise NotImplementedError("forecasting_pipeline is not implemented yet.")

In [None]:
def anomalies_detection_pipeline(time_series, kwargs):
    """Detect anomalies in a time series, exposing them as a new KPI.

    Placeholder: the body is not implemented yet. The comments below list the
    planned stages. Once implemented, the function is meant to return the
    detected anomalies as a new KPI.

    Parameters
    ----------
    time_series
        Time series to inspect.
    kwargs : dict
        KPI-specific options.

    Raises
    ------
    NotImplementedError
        Always, until the pipeline is implemented.
    """

    ## Definition of metrics for goodness of model

    ## Drift detection algorithms

    ### ML ALGORITHMS (this should be divided on training phase and prediction phase)

    ## Check Outliers

    ## Feature selection

    ## Parameters setting

    ## Anomalies detection algorithm (maybe more than one, include selection)

    ## Models comparison (if needed)

    # Fail explicitly instead of returning a name that was never assigned.
    raise NotImplementedError("anomalies_detection_pipeline is not implemented yet.")

# Implementation

## Historical data

### Preprocessing historical data

### Training on historical data

## Incoming data