In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
import joblib 
import os
import logging
import json
import time
import matplotlib.pyplot as plt
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from sklearn.metrics import accuracy_score

In [2]:
#load data
df = pd.read_csv('sensor.csv').drop('Unnamed: 0', axis=1)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.set_index('timestamp')


In [3]:
#missing values
percentage_missing = df.isnull().sum().sort_values(ascending=False)/len(df)*100
percentage_missing.head() # show 5 largest missing %

sensor_15    100.000000
sensor_50     34.956881
sensor_51      6.982117
sensor_00      4.633261
sensor_07      2.474129
dtype: float64

In [4]:
#drop low quality columns
df.drop(['sensor_15', 'sensor_50'],inplace = True,axis=1)


In [5]:
#split: Training,testing,validation
train_df = df.loc['2018-04-01':'2018-06-30']
test_df = df.loc['2018-07-01':'2018-07-31']
valid_df = df.loc['2018-08-01':'2018-08-31']

In [6]:
valid_df.reset_index(inplace=True)
test_df.reset_index(inplace=True)

In [7]:
#save to csv file:
train_df.to_csv('train.csv', index=False)
valid_df.to_csv('valid.csv', index=False)
test_df.to_csv('test.csv', index=False)

In [8]:
valid_df.head(5)

Unnamed: 0,timestamp,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,...,sensor_42,sensor_43,sensor_44,sensor_45,sensor_46,sensor_47,sensor_48,sensor_49,sensor_51,machine_status
0,2018-08-01 00:00:00,2.455556,46.657986,51.08507,44.661457,641.435181,81.67928,14.5761,16.24711,15.65393,...,35.15625,38.28125,37.32639,36.16898,46.58565,48.03241,135.9954,46.2963,184.8958,NORMAL
1,2018-08-01 00:01:00,2.455556,46.657986,51.08507,44.661457,641.435181,81.67928,14.5761,16.24711,15.65393,...,35.15625,38.28125,37.32639,36.16898,46.58565,48.03241,135.9954,46.2963,184.8958,NORMAL
2,2018-08-01 00:02:00,2.449653,46.657986,50.99826,44.661457,654.28241,80.48033,14.60503,16.24711,15.56713,...,35.15625,38.28125,37.90509,36.16898,46.00694,50.34722,142.3611,46.00694,185.1852,NORMAL
3,2018-08-01 00:03:00,2.455556,46.657986,50.99826,44.661457,641.898132,81.66726,14.6412,16.13136,15.61777,...,35.15625,39.58333,37.90509,36.45833,45.13889,53.24074,149.0162,46.00694,187.2106,NORMAL
4,2018-08-01 00:04:00,2.454572,46.657986,50.95486,44.66146,635.300903,81.66495,14.59057,16.13136,15.65393,...,35.15625,40.625,38.19444,36.458332,45.13889,55.55556,154.5139,46.00694,189.5255,NORMAL


In [9]:
broken_rows = train_df[train_df['machine_status']=='BROKEN']
recovery_rows = train_df[train_df['machine_status']=='RECOVERING']
normal_rows = train_df[train_df['machine_status']=='NORMAL']
#use mean of the column to handle missing values and remove label in feature matrix X
m, n = train_df.shape
X = train_df.iloc[:,:n-1] #ignore machine status columns
X = X.fillna(X.mean())
X.shape
X

Unnamed: 0_level_0,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09,...,sensor_41,sensor_42,sensor_43,sensor_44,sensor_45,sensor_46,sensor_47,sensor_48,sensor_49,sensor_51
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-04-01 00:00:00,2.465394,47.092010,53.211800,46.310760,634.375000,76.459750,13.411460,16.131360,15.567130,15.053530,...,30.98958,31.770832,41.92708,39.641200,65.68287,50.92593,38.194440,157.98610,67.708340,201.3889
2018-04-01 00:01:00,2.465394,47.092010,53.211800,46.310760,634.375000,76.459750,13.411460,16.131360,15.567130,15.053530,...,30.98958,31.770832,41.92708,39.641200,65.68287,50.92593,38.194440,157.98610,67.708340,201.3889
2018-04-01 00:02:00,2.444734,47.352430,53.211800,46.397570,638.888900,73.545980,13.324650,16.037330,15.617770,15.010130,...,30.46875,31.770830,41.66666,39.351852,65.39352,51.21528,38.194443,155.96060,67.129630,203.7037
2018-04-01 00:03:00,2.460474,47.092010,53.168400,46.397568,628.125000,76.988980,13.317420,16.247110,15.697340,15.082470,...,30.46875,31.510420,40.88541,39.062500,64.81481,51.21528,38.194440,155.96060,66.840280,203.1250
2018-04-01 00:04:00,2.445718,47.135410,53.211800,46.397568,636.458300,76.588970,13.353590,16.210940,15.697340,15.082470,...,30.98958,31.510420,41.40625,38.773150,65.10416,51.79398,38.773150,158.27550,66.550930,201.3889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-06-30 23:55:00,2.357252,36.024300,38.888885,35.460068,3.336227,99.999878,12.984942,15.601139,14.995619,14.642531,...,23.17708,23.437500,26.30208,26.620371,27.77778,27.48843,28.935184,27.19907,28.067130,1000.0000
2018-06-30 23:56:00,2.357252,36.024303,38.888890,35.460068,3.145254,99.999878,12.984942,15.601139,14.995619,14.642531,...,22.65625,23.437500,26.56250,26.620371,27.48843,27.48843,28.935184,27.48843,27.777780,1000.0000
2018-06-30 23:57:00,2.357252,36.024303,38.845490,35.460070,3.451967,99.999878,12.984942,15.601139,14.995619,14.642531,...,22.65625,23.437500,26.30208,26.620371,27.48843,27.19907,28.935184,27.48843,28.067130,1000.0000
2018-06-30 23:58:00,2.357252,36.024303,38.888890,35.416660,3.029514,99.999878,12.984942,15.601139,14.995619,14.642531,...,22.65625,23.437500,26.30208,26.620371,27.77778,27.48843,28.935180,27.19907,27.777780,1000.0000


In [10]:
#perform data transformations
scaler = StandardScaler()
X = scaler.fit_transform(X)

outliers_fraction = 1 - (len(normal_rows)/(len(train_df)))
clf = IsolationForest(contamination=outliers_fraction, n_jobs = -1)
clf.fit(X)
y_pred = clf.fit(X).predict(X)
    
train_df['IsolationForest'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['IsolationForest'] = y_pred


In [11]:
#saving model
joblib.dump(clf, 'model.joblib')
joblib.dump(scaler, 'scaler.joblib')


['scaler.joblib']

In [12]:
train_df

Unnamed: 0_level_0,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09,...,sensor_43,sensor_44,sensor_45,sensor_46,sensor_47,sensor_48,sensor_49,sensor_51,machine_status,IsolationForest
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-04-01 00:00:00,2.465394,47.092010,53.211800,46.310760,634.375000,76.459750,13.41146,16.13136,15.56713,15.05353,...,41.92708,39.641200,65.68287,50.92593,38.194440,157.98610,67.708340,201.3889,NORMAL,1
2018-04-01 00:01:00,2.465394,47.092010,53.211800,46.310760,634.375000,76.459750,13.41146,16.13136,15.56713,15.05353,...,41.92708,39.641200,65.68287,50.92593,38.194440,157.98610,67.708340,201.3889,NORMAL,1
2018-04-01 00:02:00,2.444734,47.352430,53.211800,46.397570,638.888900,73.545980,13.32465,16.03733,15.61777,15.01013,...,41.66666,39.351852,65.39352,51.21528,38.194443,155.96060,67.129630,203.7037,NORMAL,1
2018-04-01 00:03:00,2.460474,47.092010,53.168400,46.397568,628.125000,76.988980,13.31742,16.24711,15.69734,15.08247,...,40.88541,39.062500,64.81481,51.21528,38.194440,155.96060,66.840280,203.1250,NORMAL,1
2018-04-01 00:04:00,2.445718,47.135410,53.211800,46.397568,636.458300,76.588970,13.35359,16.21094,15.69734,15.08247,...,41.40625,38.773150,65.10416,51.79398,38.773150,158.27550,66.550930,201.3889,NORMAL,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-06-30 23:55:00,,36.024300,38.888885,35.460068,3.336227,99.999878,,,,,...,26.30208,26.620371,27.77778,27.48843,28.935184,27.19907,28.067130,1000.0000,RECOVERING,-1
2018-06-30 23:56:00,,36.024303,38.888890,35.460068,3.145254,99.999878,,,,,...,26.56250,26.620371,27.48843,27.48843,28.935184,27.48843,27.777780,1000.0000,RECOVERING,-1
2018-06-30 23:57:00,,36.024303,38.845490,35.460070,3.451967,99.999878,,,,,...,26.30208,26.620371,27.48843,27.19907,28.935184,27.48843,28.067130,1000.0000,RECOVERING,1
2018-06-30 23:58:00,,36.024303,38.888890,35.416660,3.029514,99.999878,,,,,...,26.30208,26.620371,27.77778,27.48843,28.935180,27.19907,27.777780,1000.0000,RECOVERING,1


In [13]:
class FileProcessor:
    def __init__(self, config_path):
        #load configuration
        with open(config_path, 'r') as file:
            self.config = json.load(file)
        
        self.input_directory = self.config['input_directory']
        self.output_directory = self.config['output_directory']
        self.image_directory = self.config['image_directory']
        self.model_path = self.config['model_path']
        self.scaler_path = self.config['scaler_path']
        self.sensors_to_draw = self.config['sensors_to_draw']
        self.check_interval = self.config['check_interval']
        
        #load model and scaler
        self.model = joblib.load(self.model_path)
        self.scaler = joblib.load(self.scaler_path)
        
        #set up logging
        logging.basicConfig(filename='file_processor.log', level=logging.INFO,
                            format='%(asctime)s %(levelname)s %(message)s')
    
    def process_file(self, file):
        file_path = os.path.join(self.input_directory, file)
        try:
            df = pd.read_csv(file_path)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df.set_index('timestamp', inplace=True)
            logging.info(f"Loaded the file: {file}")
            
            #data cleaning and preprocessing
            df.drop(['Unnamed: 0'], axis=1, inplace=True, errors='ignore')
            mapping = {'BROKEN': -1, 'RECOVERING': -1, 'NORMAL': 1}
            df['machine_status'] = df['machine_status'].map(mapping)

            df.fillna(df.mean(), inplace=True)
            if 'sensor_15' in df.columns:
                df.drop('sensor_15', axis=1, inplace=True)
            
            #transform the data
            m, n = df.shape
            X = df.iloc[:, :n-1]  #ignore machine status column
            X_scaled = self.scaler.transform(X)
            logging.info("Transformed data")
            
            #predictions
            predictions = self.model.predict(X_scaled)
            df['anomaly'] = predictions
            logging.info("Generated predictions")
            
            #save predictions
            output_file_path = os.path.join(self.output_directory, file)
            df.to_csv(output_file_path, index=False)
            logging.info(f"Saved predictions to {output_file_path}")
            
            #save images
            for sensor in self.sensors_to_draw:
                if sensor in df.columns:
                    broken_rows = df[df['anomaly'] == -1]
                    recovery_rows = df[df['anomaly'] == -1]
                    normal_rows = df[df['anomaly'] == 1]
                    
                    plt.figure(figsize=(25, 3))
                    plt.plot(df.index.to_numpy(), df[sensor].values, color='grey')  # Use .values to convert to numpy array
                    plt.plot(broken_rows.index.to_numpy(), broken_rows[sensor].values, linestyle='none', marker='o', color='yellow', markersize=5, label='recovering', alpha=0.5)
                    plt.plot(recovery_rows.index.to_numpy(), recovery_rows[sensor].values, linestyle='none', marker='X', color='red', markersize=20, label='broken')
                    plt.plot(normal_rows.index.to_numpy(), normal_rows[sensor].values, linestyle='none', marker='X', color='blue', markersize=4, label='anomaly predicted', alpha=0.1)
                    
                    plt.title(f"{sensor} over time")
                    plt.legend()
                    
                    image_path = os.path.join(self.image_directory, f"{file}-{sensor}.png")
                    plt.savefig(image_path)
                    plt.close()
                    
                    logging.info(f"Saved custom image {image_path}")
            
            #remove the original file
            os.remove(file_path)
            logging.info(f"Removed original file {file}")
            logging.info("Resuming listening")
        except FileNotFoundError:
            logging.error(f"File not found: {file_path}")
        except pd.errors.EmptyDataError:
            logging.error(f"Empty data file: {file_path}")
        except Exception as e:
            logging.error(f"Error processing file {file}: {str(e)}")
    
    def logging_info(self):
        start_time = time.time()
        while True:
            elapsed_time = time.time() - start_time
            if elapsed_time > 120:
                logging.info("Stopping after 120 seconds")
                break
            try:
                files = [f for f in os.listdir(self.input_directory) if os.path.isfile(os.path.join(self.input_directory, f))]
                for file in files:
                    try:
                        logging.info(f"Found new data file: {file}")
                        self.process_file(file)
                    except Exception as e:
                        logging.error(f"Error processing file {file}: {str(e)}")
                
                time.sleep(self.check_interval)
            except Exception as e:
                logging.error(f"Error accessing input directory: {str(e)}")

#Example usage
if __name__ == "__main__":
    processor = FileProcessor('application.json')
    processor.logging_info()
