In [54]:
import numpy as np
import pandas as pd
import sys
sys.path.append('C:\\Users\\mcapo\\data-preprocessing-\\data-preprocessing-')

from dataprocessing_functions import machine

#code to assign anomaly
file_path = 'smart_app_data.pkl'
df = pd.read_pickle(file_path)

asset_ids = df['asset_id'].unique().tolist()


In [55]:
for a in asset_ids:
    key = [key for key, val in machine.items() if a in val]
    df.loc[df['asset_id'] == a, 'name'] = key[0]


In [56]:
df['operation']=np.nan

In [57]:
df.loc[df['kpi']=='working_time', 'operation']='working'
df.loc[df['kpi']=='working_time', 'kpi']='time'
df.loc[df['kpi']=='idle_time', 'operation']='idle'
df.loc[df['kpi']=='idle_time', 'kpi']='time'
df.loc[df['kpi']=='offline_time', 'operation']='offline'
df.loc[df['kpi']=='offline_time', 'kpi']='time'

  df.loc[df['kpi']=='working_time', 'operation']='working'


In [58]:
# Corrected code to drop rows where 'kpi' is 'cost_working' or 'cost_idle'
df.drop(df[df['kpi'] == 'cost_working'].index, inplace=True)
df.drop(df[df['kpi'] == 'cost_idle'].index, inplace=True)


In [59]:
df.loc[df['kpi']=='consumption', 'operation']='working'
df.loc[df['kpi']=='consumption_idle', 'operation']='offline'
df.loc[df['kpi']=='consumption_working', 'operation']='idle'
df.loc[df['kpi']=='consumption_idle', 'kpi']='consumption'
df.loc[df['kpi']=='consumption_working', 'kpi']='consumption'

In [60]:
df.loc[df['kpi']=='power', 'operation']='independent'
df.loc[df['kpi']=='cost', 'operation']='independent'
df.loc[df['kpi']=='cycles', 'operation']='working'
df.loc[df['kpi']=='good_cycles', 'operation']='working'
df.loc[df['kpi']=='bad_cycles', 'operation']='working'
df.loc[df['kpi']=='average_cycle_time', 'operation']='working'

In [68]:
from dataprocessing_functions import fields, features, identity, check_f_consistency, kpi, get_batch, update_counter, imputer, get_counter, faulty_aq_tol, update_batch
from datetime import datetime 
from collections import OrderedDict

def validate(x):

    for f in fields:
        x.setdefault(f, np.nan) #if some fields is missing from the expected ones, put a nan
    x = dict(OrderedDict((key, x[key]) for key in fields)) # order the fields of the datapoint

    # Ensure the reliability of the field time
    if pd.isna(x['time']):
        x['time'] = datetime.now()

    # Check that there is no missing information in the identity of the datapoint, otherwise we store in the database, labelled 'Corrupted'.
    if any(pd.isna(x.get(key)) for key in identity):
        update_counter(x)
        x['status']='Corrupted'
        return x
    # Check if all the features that the datapoint has are nan or missing.
    elif all(pd.isna(x.get(key)) for key in features):
        update_counter(x)
        x['status']='Corrupted'
        return x
    
    #if the datapoint comes here it means that it didn't miss any information about the identity and at least one feature that is not nan.

    x=check_range(x) # the flag is to take trace if the datapoint has naturally nans or nans are the result of validation checks.

    #if the datapoint comes here it means that at least one feature value is respecting the range constraint for the specific kpi.
    if x:
        # Check if the features (min, max, sum, avg) satisfy the basic logic rule min<=avg<=max<=sum
        cc=check_f_consistency(x)
        if all(not c for c in cc): #meaning that no feature respect the logic rule
            update_counter(x)
            x['status']='Corrupted'
            return x
        elif all(c for c in cc): #the datapoint verifies the logic rule.
                            #if now there is a nan it could be either the result of the range check or that the datapoint intrinsically has these nans.
            any_nan=False
            for f in features:
                if np.isnan(x[f]):
                    any_nan=True
                    if all(np.isnan(get_batch(x, f))):
                        pass
                    else:
                        update_counter(x)
                        break
            if any_nan==False:
                                 #it means that the datapoint is consistent and it doesn't have nan values --> it is perfect.
                update_counter(x, True) #reset the counter.
        else: #it means that some feature are consistent and some not. Put at nan the not consistent ones.
            for f, c in zip(features, cc):
                if c==False:
                    x[f]=np.nan
            update_counter(x)
        x['status']='A/N'
        return x



def check_range(x):

    #Retrieve the specific range for the kpi that we are dealing with
    l_thr=kpi[x['kpi']][0][0]
    h_thr=kpi[x['kpi']][0][1]

    for k in features:
        if x[k]<l_thr:
            x[k]=np.nan
        if k in ['avg', 'max', 'min', 'var'] and x[k]>h_thr:
            x[k]=np.nan

    # if after checking the range all features are nan --> corrupted
    if all(np.isnan(value) for value in [x.get(key) for key in features]):
        update_counter(x)
        x['status']='Corrupted'
    return x

def check_range_ai(x):
    flag=True #takes trace of: has the datapoint passed the range check without being changed?
    l_thr=kpi[x['kpi']][0][0]
    h_thr=kpi[x['kpi']][0][1]

    for k in features:
        if x[k]<l_thr:
            flag=False
        if k in ['avg', 'max', 'min', 'var'] and x[k]>h_thr:
            flag=False
    return flag
from statsmodels.tsa.holtwinters import ExponentialSmoothing
def predict_missing(batch):
    seasonality=7
    cleaned_batch= [x for x in batch if not np.isnan(x)]
    if not(all(pd.isna(x) for x in batch)) and batch:
        if len(cleaned_batch)>2*seasonality:
            model = ExponentialSmoothing(cleaned_batch, seasonal='add', trend='add', seasonal_periods=seasonality)
            model_fit = model.fit()
            prediction = model_fit.forecast(steps=1)[0]
        else:
            prediction=np.nanmean(batch)
        return prediction
    else: 
        return np.nan # Leave the feature as nan since we don't have any information in the batch to make the imputation. If the datapoint has a nan because the feature is not definable for it, it will be leaved as it is from the imputator.

# ______________________________________________________________________________________________
# This function is the one managing the imputation for all the features of the data point  receives as an input the new data point, extracts the information

def imputer(x):
    if x:
        if isinstance(x, tuple):
            x = x[0]
            #Because the validated datapoint may exit in the check range with 2 returned values.

        # Try imputation with mean or the HWES model.
        for f in features:
            batch = get_batch(x, f)
            if pd.isna(x[f]):
                    x[f]=predict_missing(batch)

        # Check again the consistency of features and the range.
        if check_f_consistency(x) and check_range_ai(x):
            pass
        else:  # It means that the imputed data point has not passed the check on the features and on their expected range.
            # In this case we use the LVCF as a method of imputation since it ensures the respect of these conditiono (the last point in the batch has been preiovusly checked)
            for f in features:
                batch = get_batch(x, f)
                x[f]=batch[-1]
        
        # In the end update batches with the new data point
        for f in features:
            update_batch(x, f, x[f])

        return x



In [None]:
df['status']=np.nan
cleaned_df=df.copy()

In [69]:
for i in range(5332, df.shape[0]):
    if i%100==0:
        print(f'{i}/{df.shape[0]}')
    datapoint=df.iloc[i].to_dict()
    old_counter=get_counter(datapoint)
    #print(f'original datapoint: {datapoint}')
    datapoint=validate(datapoint)
    new_counter=get_counter(datapoint)
    if new_counter==old_counter+1 and new_counter>=faulty_aq_tol:
        id = {key: datapoint[key] for key in identity if key in datapoint}
        f"It has been {new_counter} days (from {datapoint['time']} that {id['name']} - {id['asset_id']} returns NaN values in {id['kpi']} - {id['operation']}. Possible malfunctioning either in the acquisition system or in the machine!"
    if datapoint['status']!='Corrupted':
        cleaned_datapoint=imputer(datapoint)
    cleaned_df.iloc[i]=cleaned_datapoint


5400/44736
5500/44736


  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.no

5600/44736


  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.no

5700/44736


  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.no

5800/44736


  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.no

5900/44736


  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.no

6000/44736


  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.no

6100/44736


  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.no

6200/44736
6300/44736




6400/44736




6500/44736




6600/44736
6700/44736




6800/44736




6900/44736




7000/44736




7100/44736




7200/44736
7300/44736
7400/44736
7500/44736


  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.no

7600/44736


  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)


7700/44736




7800/44736




7900/44736




8000/44736


KeyboardInterrupt: 

In [70]:
print(i)

8052


In [73]:
cleaned_df.iloc[8051]

time         2024-03-05T00:00:00Z
asset_id         ast-xpimckaf3dlf
name                laser_cutting
kpi            average_cycle_time
sum                           NaN
avg                           0.0
min                           0.0
max                           0.0
operation                 working
status                        A/N
Name: 17445, dtype: object

In [79]:
import json
data=[cleaned_df.to_dict(), i]
with open('C:\\Users\\mcapo\\data-preprocessing-\\data-preprocessing-\\initialization\\transformation_interrupted.json', "w") as json_file:
    json.dump(data, json_file, indent=1) 