Author: Juana Karina Diaz Barba

### Step 1: getting and transforming the data

In [60]:
import pandas as pd
import yaml
from sklearn.preprocessing import StandardScaler
from sklearn import svm


In [2]:
def get_config():
    '''Setting the config file'''
    with open('config_prog2.yaml', 'r') as stream:
        config = yaml.safe_load(stream)
        return config

config = get_config()
sensor_path = (config['sensor'])

# Creating a data frame with the data
data_df = pd.read_csv(sensor_path)
print(data_df.shape)
data_df.head(3)


(220320, 55)


Unnamed: 0.1,Unnamed: 0,timestamp,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,...,sensor_43,sensor_44,sensor_45,sensor_46,sensor_47,sensor_48,sensor_49,sensor_50,sensor_51,machine_status
0,0,2018-04-01 00:00:00,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,...,41.92708,39.6412,65.68287,50.92593,38.19444,157.9861,67.70834,243.0556,201.3889,NORMAL
1,1,2018-04-01 00:01:00,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,...,41.92708,39.6412,65.68287,50.92593,38.19444,157.9861,67.70834,243.0556,201.3889,NORMAL
2,2,2018-04-01 00:02:00,2.444734,47.35243,53.2118,46.39757,638.8889,73.54598,13.32465,16.03733,...,41.66666,39.351852,65.39352,51.21528,38.194443,155.9606,67.12963,241.3194,203.7037,NORMAL


In [3]:
# Dropping non needed and low quality columns
# 'sensor_15' and 'sensor_50' have a lot of missing data values compared with 
# the other sensors
data_df.drop(['Unnamed: 0','sensor_15', 'sensor_50'], axis=1, inplace=True)

We're going to **train** the model on the months **April, May, and June** and then use the trained model to **predict** the anomalies of the months **July and August**. 

In [47]:
train_data = data_df.loc[(data_df['timestamp'] >= '2018-04-01')
                     & (data_df['timestamp'] < '2018-06-30')]

july_data = data_df.loc[(data_df['timestamp'] >= '2018-07-01')
                     & (data_df['timestamp'] < '2018-07-31')]

august_data = data_df.loc[(data_df['timestamp'] >= '2018-08-01')
                     & (data_df['timestamp'] < '2018-08-31')]

print(f'Train data size: {train_data.shape}')
print(f'July data size: {july_data.shape}')
print(f'August data size: {august_data.shape}')

# Create files of the data split
# train_data.to_csv('Week2.7_sensor_train_data.csv')
# train_data.to_csv('Week2.7_sensor_july_data.csv')
# train_data.to_csv('Week2.7_sensor_august_data.csv')

Train data size: (129600, 52)
July data size: (43200, 52)
August data size: (43200, 52)


### Step 2: create the model and the drawer

In [55]:
# Divide the data
def sensor_status(data_df):
    '''Divide the sensor data based on its machine status'''
    broken_rows = data_df[data_df['machine_status']=='BROKEN']
    recovery_rows = data_df[data_df['machine_status']=='RECOVERING']
    normal_rows = data_df[data_df['machine_status']=='NORMAL']
    return  broken_rows, recovery_rows, normal_rows

def calculate_outliers_fraction(normal_rows, data_df):
    '''To calculate the fraction of outliers on the dataset'''
    outliers_fraction = 1 - (len(normal_rows)/(len(data_df)))
    return outliers_fraction

In [49]:
# Preprocessing
def data_imputation(data_df):
    '''Function to imputate the missing values in the data frame'''
    # Set datetime as index
    data_df.set_index('timestamp', inplace=True)
    # Use mean of the column to handle missing values and remove label in feature matrix X
    m, n = data_df.shape
    # Ignore machine status columns (last column in the dataframe)
    X = data_df.iloc[:,:n-1] 
    X = X.fillna(X.mean())
    # Matrix with the data
    print(X.shape)
    return X

def data_scaling(data_df):
    '''Function to scale the data'''
    # Standardize features by removing the mean and scaling to unit variance.
    scaler = StandardScaler()
    # Fit to data, then transform it.
    X = scaler.fit_transform(data_df)
    return X

In [56]:

train_broken_rows, train_recovery_rows, train_normal_rows = sensor_status(train_data)
outliers_fraction = calculate_outliers_fraction(train_normal_rows, train_data)

In [54]:
X_train = data_imputation(train_data)
X_train = data_scaling(X_train)


(129600, 50)


In [61]:
# 3m  53 sesin
# Training
def svm_train(outliers_fraction, X):
    clf = svm.OneClassSVM(nu=outliers_fraction)
    y_pred = clf.fit(X).predict(X)
    print('OneClassSVM')
    print('-'*100)
    # print(f'Number of anomalies detected')
    # print(data_df[f'{name}'].value_counts())
    return y_pred

y_train = svm_train(outliers_fraction, X_train)
y_train 

OneClassSVM
----------------------------------------------------------------------------------------------------


array([ 1,  1,  1, ..., -1, -1, -1])