In [None]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import feature_engineering_functions as fef
import machine_learning_functions as mlf

# Read the historical KPI dataset from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
file_path = 'smart_app_data.pkl'
historical_data = pd.read_pickle(filepath_or_buffer=file_path)

In [None]:
# Distinct KPI identifiers found in the 'kpi' column.
kpis = historical_data['kpi'].unique()
print("------------ kpis are ----------------", kpis, sep="\n")

# Distinct machine names found in the 'name' column.
machines = historical_data['name'].unique()
print("------------ machines are ----------------", machines, sep="\n")

# The operations listing is disabled for now: it is kept as an inert string literal.
'''operations = historical_data['operation'].unique()
print("------------ operations are ----------------")
print(operations)'''


In [None]:
def extract_features(kpi_name, machine_name, operation_name, data):
    """Return the time-ordered rows of ``data`` for one machine and one KPI.

    Parameters
    ----------
    kpi_name :
        Value the ``"kpi"`` column must equal.
    machine_name :
        Value the ``"name"`` column must equal.
    operation_name :
        Currently unused; reserved for the planned filter on an
        ``"operation"`` column (see the TODO below).
    data : pandas.DataFrame
        Source frame; must provide the ``"name"``, ``"kpi"`` and ``"time"``
        columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, with ``"time"`` converted
        through ``pd.to_datetime`` and the rows sorted by ascending ``"time"``.
        All other columns (and the original index labels) are kept.
    """
    selection = (data["name"] == machine_name) & (data["kpi"] == kpi_name)
    # TODO: later extend the mask with
    #   & (data["operation"] == operation_name)

    # Take an explicit copy: assigning a column on the bare boolean-indexed
    # result is chained assignment and triggers SettingWithCopyWarning.
    filtered_data = data.loc[selection].copy()
    filtered_data['time'] = pd.to_datetime(filtered_data['time'])

    return filtered_data.sort_values(by='time')



In [None]:
# The input dataframe is the dataset already filtered for a given machine, KPI and operation,
# so it contains 9 columns and as many rows as there are entries in the selected time range.

def feature_engineering_pipeline(dataframe, kwargs=None):
    """Pre-process the KPI statistic columns of an already-filtered time series.

    For each of the columns ``sum``, ``avg``, ``min``, ``max`` and ``var`` that
    exists in ``dataframe`` and holds at least one non-missing value, the
    function prints stationarity / seasonality diagnostics, requests an
    additive decomposition and then applies the enabled transformations in this
    order: make stationary, detrend, deseasonalize, keep residuals, standardize.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Time series to process. It is modified in place: every processed
        column is overwritten with its transformed values.
    kwargs : dict, optional
        Boolean switches ``make_stationary``, ``detrend``, ``deseasonalize``,
        ``get_residuals`` and ``scaler``. A missing switch counts as False;
        ``None`` or an empty dict disables every transformation.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.
    """
    # The switches do not depend on the feature, so read them once.
    options = kwargs or {}
    make_stationary = options.get('make_stationary', False)
    detrend = options.get('detrend', False)
    deseasonalize = options.get('deseasonalize', False)
    get_residuals = options.get('get_residuals', False)
    scaler = options.get('scaler', False)

    features = ['sum', 'avg', 'min', 'max', 'var']
    for feature_name in features:
        # Statistics that are not part of this dataframe are simply skipped.
        if feature_name not in dataframe.columns:
            continue

        print("-------------------- Results for " + str(feature_name))
        feature = dataframe[feature_name]
        if feature.empty or feature.isna().all():
            print("Feature is empty (no data).")
            continue

        ## Check stationarity
        # (False if not stationary, True if it is, None if the test couldn't be applied)
        is_stationary = fef.adf_test(feature.dropna())
        print('Output is stationary? ' + str(is_stationary))

        ## Check seasonality
        # (period of the seasonality, None if no seasonality was detected)
        seasonality_period = fef.detect_seasonality_acf(feature)
        print('Seasonality period is? ' + str(seasonality_period))

        # Second attempt, for patterns too complex for the ACF-based detection.
        if seasonality_period is None:
            seasonality_period = fef.detect_seasonality_fft(feature)
            print('Recomputed seasonality period is? ' + str(seasonality_period))

        # ([trend, seasonal, residual], or None if there isn't sufficient data
        # or some error occurs)
        decompositions = fef.seasonal_additive_decomposition(feature, seasonality_period)
        # Identity check: `!= None` would be evaluated element-wise (and then be
        # ambiguous in an `if`) should the helper ever return an array-like.
        has_decomposition = decompositions is not None

        # A None result from the ADF test is treated like "not stationary" here.
        if make_stationary and (not is_stationary):
            if has_decomposition:
                feature = fef.make_stationary_decomp(feature, decompositions)
                is_stationary = fef.adf_test(feature.dropna())
                print('Is stationary after trying to make it stationary? ' + str(is_stationary))
                if not is_stationary:
                    # Fall back to differencing with the default weekly period.
                    feature = fef.make_stationary_diff(feature, seasonality_period=[7])
                    is_stationary = fef.adf_test(feature.dropna())
                    print('Is stationary after re-trying to make it stationary? ' + str(is_stationary))
            else:
                feature = fef.make_stationary_diff(feature, seasonality_period=[7])  # default weekly
                is_stationary = fef.adf_test(feature.dropna())
                print('Is stationary after trying to make it stationary? ' + str(is_stationary))

        if detrend:
            if has_decomposition:
                feature = fef.rest_trend(feature, decompositions)
            else:
                feature = fef.make_stationary_diff(feature)

        if deseasonalize:
            if has_decomposition:
                feature = fef.rest_seasonality(feature, decompositions)
            else:
                feature = fef.make_stationary_diff(feature, seasonality_period=[7])  # default weekly

        if get_residuals:
            if has_decomposition:
                feature = fef.get_residuals(feature, decompositions)
            else:
                feature = fef.make_stationary_diff(feature)
                feature = fef.make_stationary_diff(feature, seasonality_period=[7])  # default weekly

        if scaler:
            # Standardization (z-score scaling).
            mean = np.mean(feature)
            std = np.std(feature)
            if std > 0:
                feature = (feature - mean) / std
            else:
                # Constant series: dividing by a zero deviation would turn every
                # value into NaN, so only centre the data.
                feature = feature - mean

        dataframe[feature_name] = feature

    return dataframe

# IMPLEMENTATION: How should this work?

First of all, we define the dictionaries that tell us how to preprocess the data for each purpose.

In [None]:
# Pre-processing switches for each ML use case.
# A switch that is left out is read as False by feature_engineering_pipeline.
ML_algorithms_config = dict(
    # Forecasting with the feed-forward network: every transformation enabled.
    forecasting_ffnn=dict(
        make_stationary=True,
        detrend=True,
        deseasonalize=True,
        get_residuals=True,
        scaler=True,
    ),
    # Anomaly detection: the series is used as-is.
    anomaly_detection=dict(
        make_stationary=False,
        detrend=False,
        deseasonalize=False,
        get_residuals=False,
        scaler=False,
    ),
)


Then, for each machine, KPI and operation:

In [None]:
# Example of usage.
# `operation` keeps the value set here; `kpi` and `machine` are rebound by the
# loop variables below whenever the loops run.
kpi = 'good_cycles'
machine = 'Large Capacity Cutting Machine 1'
operation = "working"

for machine in machines:
    for kpi in kpis:
        # A third loop level (`for operation in operations:`) is planned but not enabled yet.

        # Extract the series of interest from the historical data and renumber
        # its rows from 0. (extract_features is meant to move into the
        # feature-engineering functions module.)
        feature_extracted = extract_features(
            kpi, machine, operation, historical_data
        ).reset_index(drop=True)

        # Pre-processing settings for the task to run; this example assumes
        # anomaly detection. This should be selected at the start of the given
        # operation (forecasting or anomaly detection).
        processing_config = ML_algorithms_config['anomaly_detection']

        # Check for the presence of drift at the last time point
        # (the algorithm lives in machine_learning_functions).
        drift_presence = mlf.ADWIN_drift(feature_extracted)

        # Transform the data into a suitable input for the ML algorithm.
        # The output has the same format as feature_extracted, i.e. the dataset
        # filtered for machine, KPI and (once enabled) operation.
        transformed_time_series = feature_engineering_pipeline(feature_extracted, processing_config)

        if drift_presence:
            # Placeholder: retrain the model here.
            pass

        # Placeholder: model usage goes here.

print('hi')
