# Week 2.7: SE4ML (2)  
Author: Juana Karina Diaz Barba

### Step 1: getting and transforming the data

In [1]:
import pandas as pd
import yaml
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.ensemble import IsolationForest

from joblib import dump, load
import matplotlib.pyplot as plt
import os
from datetime import datetime
import json



In [2]:
def get_config():
    '''Setting the config file'''
    with open('../config_prog2.yaml', 'r') as stream:
        config = yaml.safe_load(stream)
        return config

config = get_config()
sensor_path = (config['sensor'])

# Creating a data frame with the data
data_df = pd.read_csv(sensor_path)
print(data_df.shape)
data_df.head(3)


(220320, 55)


Unnamed: 0.1,Unnamed: 0,timestamp,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,...,sensor_43,sensor_44,sensor_45,sensor_46,sensor_47,sensor_48,sensor_49,sensor_50,sensor_51,machine_status
0,0,2018-04-01 00:00:00,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,...,41.92708,39.6412,65.68287,50.92593,38.19444,157.9861,67.70834,243.0556,201.3889,NORMAL
1,1,2018-04-01 00:01:00,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,...,41.92708,39.6412,65.68287,50.92593,38.19444,157.9861,67.70834,243.0556,201.3889,NORMAL
2,2,2018-04-01 00:02:00,2.444734,47.35243,53.2118,46.39757,638.8889,73.54598,13.32465,16.03733,...,41.66666,39.351852,65.39352,51.21528,38.194443,155.9606,67.12963,241.3194,203.7037,NORMAL


In [3]:
# Dropping non needed and low quality columns
# 'sensor_15' and 'sensor_50' have a lot of missing data values compared with 
# the other sensors
data_df.drop(['Unnamed: 0','sensor_15', 'sensor_50'], axis=1, inplace=True)
# Convert timestamp to datetime and set it as index
data_df['timestamp'] = pd.to_datetime(data_df['timestamp'])
data_df.set_index('timestamp', inplace=True)

In [4]:
data_df.head()

Unnamed: 0_level_0,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09,...,sensor_42,sensor_43,sensor_44,sensor_45,sensor_46,sensor_47,sensor_48,sensor_49,sensor_51,machine_status
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-04-01 00:00:00,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,15.56713,15.05353,...,31.770832,41.92708,39.6412,65.68287,50.92593,38.19444,157.9861,67.70834,201.3889,NORMAL
2018-04-01 00:01:00,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,15.56713,15.05353,...,31.770832,41.92708,39.6412,65.68287,50.92593,38.19444,157.9861,67.70834,201.3889,NORMAL
2018-04-01 00:02:00,2.444734,47.35243,53.2118,46.39757,638.8889,73.54598,13.32465,16.03733,15.61777,15.01013,...,31.77083,41.66666,39.351852,65.39352,51.21528,38.194443,155.9606,67.12963,203.7037,NORMAL
2018-04-01 00:03:00,2.460474,47.09201,53.1684,46.397568,628.125,76.98898,13.31742,16.24711,15.69734,15.08247,...,31.51042,40.88541,39.0625,64.81481,51.21528,38.19444,155.9606,66.84028,203.125,NORMAL
2018-04-01 00:04:00,2.445718,47.13541,53.2118,46.397568,636.4583,76.58897,13.35359,16.21094,15.69734,15.08247,...,31.51042,41.40625,38.77315,65.10416,51.79398,38.77315,158.2755,66.55093,201.3889,NORMAL


We're going to **train** the model on the months **April, May, and June** and then use the trained model to **predict** the anomalies of the months **July and August**. 

In [5]:
train_data = data_df.loc[(data_df.index >= '2018-04-01')
                     & (data_df.index < '2018-06-30')]

july_data = data_df.loc[(data_df.index >= '2018-07-01')
                     & (data_df.index < '2018-07-31')]

august_data = data_df.loc[(data_df.index >= '2018-08-01')
                     & (data_df.index < '2018-08-31')]

print(f'Train data size: {train_data.shape}')
print(f'July data size: {july_data.shape}')
print(f'August data size: {august_data.shape}')
# # Create files of the data split
# train_data.to_csv('Week2.7_sensor_train_data.csv')
# train_data.to_csv('Week2.7_sensor_july_data.csv')
# train_data.to_csv('Week2.7_sensor_august_data.csv')


Train data size: (129600, 51)
July data size: (43200, 51)
August data size: (43200, 51)


### Step 2: create the model and the drawer and Step4 listing for new data

In [6]:

class Data_transformation:
    '''Class that perform the data transformation including the data division 
    based on the sensor status, the calculation of outliers fraction, the data 
    imputation and the data scaling'''

    # Divide the data
    def sensor_status(self, data_df):
        '''Divide the sensor data based on its machine status'''
        broken_rows = data_df[data_df['machine_status']=='BROKEN']
        recovery_rows = data_df[data_df['machine_status']=='RECOVERING']
        normal_rows = data_df[data_df['machine_status']=='NORMAL']
        return  broken_rows, recovery_rows, normal_rows

    def calculate_outliers_fraction(self, normal_rows, data_df):
        '''To calculate the fraction of outliers on the dataset'''
        outliers_fraction = 1 - (len(normal_rows)/(len(data_df)))
        return outliers_fraction

    # Preprocessing
    def data_imputation(self, data_df):
        '''Function to impute the missing values in the data frame'''
        # Use mean of the column to handle missing values and remove label in feature matrix X
        m, n = data_df.shape
        # Ignore machine status columns (last column in the dataframe)
        X = data_df.iloc[:,:n-1] 
        X = X.fillna(X.mean())
        return X

    def data_scaling(self, data_df):
        '''Function to scale the data'''
        # Standardize features by removing the mean and scaling to unit variance.
        scaler = StandardScaler()
        # Fit to data, then transform it.
        X = scaler.fit_transform(data_df)
        return X


In [7]:
class MachineLearningModeling:
    '''Class to perform the machine learning modeling'''
    def svm_training(self, X, datatrans_df):
        self.clf = svm.OneClassSVM(nu=0.058)
        y_pred = self.clf.fit(X).predict(X)
        datatrans_df['svc'] = y_pred
        return datatrans_df

    def persist_model(self):
        # Persist the model on the local file system
        file_name = 'filename.joblib'
        dump(self.clf, file_name) 

In [17]:
class Plotter:
    '''Class to plot a sensor and an algorithm over time. Normal (grey), 
    recovery(yellow) and borken(red), predicted anomaly (blue)'''
    def plot_sensor_anomalies(self, sensor,recovery_rows, broken_rows, data):
        anomaly_rows = data[data['svc'] == -1]
        f = plt.figure(figsize=(30,3))
        axarr = f.add_subplot(1,1,1)
        
        plt.plot(data[sensor], color='grey', label='Normal')
        plt.plot(recovery_rows[sensor], linestyle='none', marker='o', 
                        color='yellow', markersize=5, label='Recovering')
        plt.plot(broken_rows[sensor], linestyle='none', marker='X', 
                        color='red', markersize=20, label='Broken')
        plt.plot(anomaly_rows[sensor], linestyle='none', marker='X', 
                        color='blue', markersize=4, label='Predicted anomaly',
                        alpha = 0.1)
        plt.title(sensor)
        plt.legend()
        return f


In [26]:


class FileManager:
    '''Class that looks at a specific directory'''
    def __init__(self, input_path, output_path, img_path, sensor_name, transformer):
        self.input_path = input_path
        self.output_path = output_path
        self.img_path = img_path
        self.sensor_name = sensor_name
        self.transformer = transformer # Data_transformation object

    # Found new data file
    def load_file(self, file_name):
        file_path = os.path.join(self.input_path, file_name)
        # Read the file on a pandas dataframe
        self.data_df = pd.read_csv(file_path)
        print('File loaded')

    def preprocess_data(self):
        self.datatrans_df = self.data_df
        # Convert timestamp to datetime and set it as index
        self.datatrans_df['timestamp'] = pd.to_datetime(self.datatrans_df['timestamp'])
        self.datatrans_df.set_index('timestamp', inplace=True)
        # Impute data
        self.X = self.transformer.data_imputation(self.datatrans_df)
        # Scale data 
        self.X = self.transformer.data_scaling(self.X)
        print('Data preprocessed')
        print(len(self.datatrans_df))
        return self.X
    
    def sensor_rows(self):
        # Function to define the rows that are broken, normal or transform
        broken_rows, recovery_rows, normal_rows = self.transformer.sensor_status(self.datatrans_df)
        return broken_rows, recovery_rows, normal_rows, self.datatrans_df
    
    def data_prediction(self, model):
        # Model and create the predictions dataframe
        self.predictions_df = model.svm_training(self.X, self.datatrans_df)
        print('Received predictions')

    def save_data(self):
        # Save predictions to the output directory
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        output_path = os.path.join(self.output_path)
        (self.predictions_df).to_csv(output_path, index=False)
        print('Saving predictions in directory')

    def create_plot_images(self, f):
        # Save the plot into the folder
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        image_path = os.path.join(self.img_path, timestamp)
        f.savefig(image_path)
        plt.close(f)
        print('Image saved')
        
    def remove_file(self, remove_file_path):
        # removing the file 
        os.remove(remove_file_path)
        print('File removed')

    def log_file(self, error_message):
        # create a file with an error message
        now = datetime.now()
        log_entry = '{}  {}\n'.format(now.strftime('%Y-%m-%d %H:%M:%S'), error_message)
        with open('log_file.txt', 'a') as log_file:
            log_file.write(log_entry)
        print('log_file created')




#### Main

In [28]:
# def main():

# Get the parameters from the json file
application_json_path = "/homes/jkdiazbarba/Documents/Programming/DSLS_Prog2/Programming2/Week2.7/application.json"
with open(application_json_path, 'r') as json_file:
    parameters = json.load(json_file)

# Get the paths of every parameter from the json file
input_path = parameters['input_directory']
output_path = parameters['output_directory']
img_path = parameters['img_directory']
sensor_name = parameters['sensor_names']
interval = parameters['interval']
file_name = parameters['file_name']


##### Loading the file #####
# Create an object of the class Data_transformation
transformer = Data_transformation()
# Create an instance of the FileManager class to pass the paths 
file_object = FileManager(input_path, output_path, img_path, sensor_name, transformer)
# Loading the file
file_object.load_file(file_name)

##### Preprocessing the data #####
# Preprocessing the data and getting the data transformed
X = file_object.preprocess_data()

##### Modeling #####
# Model object
model = MachineLearningModeling()
# Making the predictions
file_object.data_prediction(model)

##### Plots #####
# Getting the rows of every sensor
broken_rows, recovery_rows, normal_rows, data = file_object.sensor_rows()
#plotter object
plotter = Plotter()
# creating the image
image = plotter.plot_sensor_anomalies(sensor_name,recovery_rows, broken_rows, data)
# Saving the image on the folder
file_object.create_plot_images(image)

#### Errors ####
# Creating an error message
file_object.log_file('error_message')

# if __name__=='__main__':
#     main()


File loaded
Data preprocessed
129600
Received predictions
Image saved
log_file created
