In [1]:
# Load the previously saved preprocessing artefacts from the local `encoders/`
# directory: a scaler, a Keras model, a label encoder and a one-hot encoder.
try:
    # Older scikit-learn releases ship a vendored joblib; keep using it when
    # it is available so existing environments behave exactly as before.
    from sklearn.externals import joblib
except ImportError:
    # `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
    # removed in 0.23; the standalone package provides the same `load` API.
    import joblib
from keras.models import load_model

# NOTE: joblib.load unpickles its input, which can execute arbitrary code.
# Only load artefacts that come from a trusted source.
min_max_scaler = joblib.load('encoders/Minmax_scaler')
autoencoder = load_model('encoders/encoder_1layer_75dims')
label_encoder = joblib.load('encoders/label_encoder')
onehot_encoder = joblib.load('encoders/onehot_encoder')

Using TensorFlow backend.


In [19]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

# Load the two input tables: `pm` from the network data file and `alarm`
# from the label file.
pm = pd.read_parquet(path='data/Europe_Network_data.parquet')
alarm = pd.read_parquet(path='data/v2_Europe_network_labels.parquet')

In [13]:
# Prepare the alarm labels:
#   1. remove records whose description contains 'Demo for Josh';
#   2. floor each alarm time to its calendar day and move it one day back;
#   3. drop the descriptive columns and mark every remaining row with ALARM = 1.
#
# `na=False` treats a missing description as "not a demo record" so the boolean
# mask never contains NaN (which would break the `~` negation); `regex=False`
# matches the phrase literally. `.copy()` makes the filtered frame independent
# of the original so the column assignments below do not write into a slice.
alarm = alarm.loc[~alarm['description'].str.contains('Demo for Josh', regex=False, na=False)].copy()
alarm['TIME'] = pd.to_datetime(alarm['time'], infer_datetime_format=True).dt.floor('1D')
alarm = alarm.drop(['description','time','timestamp','extraAttributes'], axis=1).rename({'category':'ALARM'}, axis=1)
alarm['TIME'] = alarm['TIME'] - pd.DateOffset(1)
# The original category value is intentionally overwritten: only the presence
# of an alarm is kept.
alarm['ALARM'] = 1
# After flooring to the day, several alarms of one ID can share the same TIME.
# Keep a single row per (ID, TIME) so the later left merge on these keys cannot
# duplicate PM rows.
# NOTE(review): assumes `ID` is a regular column of `alarm` - confirm.
alarm = alarm.drop_duplicates(subset=['ID', 'TIME'])

In [15]:
# Filter and scale the PM table:
#   * keep rows whose LABEL is 'IS' or 'n/a', then drop the LABEL column;
#   * discard rows whose GROUPBYKEY is one of the device types in `dev_list`;
#   * replace missing values with 0;
#   * order rows by (ID, TIME) and rebuild a fresh positional index;
#   * min-max scale every column from position 3 onwards.
pm = pm.loc[pm['LABEL'].isin(['IS','n/a'])]
dev_list = ['CHMON', 'STM64', 'OC192', 'STTP', 'STM4', 'STM16', 'NMCMON', 'OC48', 'OC12', 'OC3', 'FLEX', 'RAMAN']
pm = pm.loc[~pm['GROUPBYKEY'].isin(dev_list)]
pm = pm.fillna(0)
pm = pm.drop('LABEL',axis=1)
# After reset_index the first two columns are ID and TIME.
# NOTE(review): the positional slice below presumes column 2 is GROUPBYKEY and
# that everything after it is a numeric feature - confirm against the data.
pm = pm.set_index(['ID','TIME']).sort_index().reset_index()
try:
    pm.iloc[:, 3:] = min_max_scaler.transform(pm.iloc[:, 3:])
except ValueError as err:
    # The scaler only reports the offending value (e.g. "could not convert
    # string to float: 'Pre-FEC Signal Fail'"). Re-raise the same exception
    # type with the names of the text columns so the schema problem can be
    # located without further digging.
    text_cols = pm.iloc[:, 3:].select_dtypes(include='object').columns.tolist()
    raise ValueError(
        'min_max_scaler could not convert the feature columns to float ({}); '
        'object-dtype columns from position 3 onwards: {}'.format(err, text_cols)
    ) from err

ValueError: could not convert string to float: 'Pre-FEC Signal Fail'

In [None]:
# Build the feature table: run the columns from position 3 onwards through the
# loaded Keras model, one-hot encode GROUPBYKEY, and place both results side by
# side next to the ID and TIME columns. The new feature columns are named by
# their integer position.
tmp_pm = autoencoder.predict(pm.iloc[:,3:].values)
# LabelEncoder works on 1-D input, so the column values are passed as they are
# (a column vector only triggers a conversion warning and is flattened again).
GBK = label_encoder.transform(pm['GROUPBYKEY'].values)
# OneHotEncoder expects a 2-D array with one column per encoded feature.
GBK = onehot_encoder.transform(np.reshape(GBK, [-1, 1]))
GBK = GBK.toarray()
# Reuse pm's index for the feature frame so the column-wise concat pairs rows
# by position even if pm does not carry a plain 0..n-1 index.
pm = pd.concat(
    [pm[['ID','TIME']], pd.DataFrame(np.concatenate([tmp_pm, GBK], axis=1), index=pm.index)],
    axis=1,
)
del tmp_pm, GBK

In [None]:
# Split the rows into runs of consecutive days and collect the runs that are
# too short to be used.
# Assumes pm is ordered by ID and then TIME.
day = pd.Timedelta('1d')
# A new run starts when the step from the previous row is not exactly one day
# OR when the ID changes. Without the ID check, the last day of one ID and the
# first day of the next ID could be exactly one day apart and would be chained
# into a single run.
breaks = (pm['TIME'].diff() != day) | (pm['ID'] != pm['ID'].shift())
groups = breaks.cumsum()

# Run id -> number of rows, restricted to runs with fewer than 4 rows.
# NOTE(review): the original comment spoke of "consecutive 3 days" while the
# threshold keeps only runs of 4 or more rows - confirm which is intended.
tmp = groups.value_counts()
tmp = tmp[tmp<4]

In [None]:
# Remove every row whose run id is listed in `tmp`, then release the helper
# series that are no longer needed.
mask = groups.index[groups.isin(tmp.index.tolist()).values]
pm = pm.drop(index=mask)
del groups, breaks

In [None]:
# Give every run of consecutive daily rows its own GROUP id; this column is the
# groupby key for the time-window construction.
# A run is also cut whenever the ID changes, so rows of two different IDs whose
# dates happen to be one day apart never end up in the same GROUP.
# Assumes pm is ordered by ID and then TIME.
pm['GROUP'] = ((pm['TIME'].diff() != day) | (pm['ID'] != pm['ID'].shift())).cumsum()

In [None]:
# Build the 3-step window: next to its own columns, each row receives the
# values of the previous row and of the row before that, taken within its GROUP.
pm = pm.set_index(['ID', 'TIME'])
tmp = pm.groupby(['GROUP'])
# The first two rows of every group have no complete history (their shifted
# values are NaN), so their index labels are collected for removal.
mask = tmp.head(2).index
pm = pd.concat([pm] + [tmp.shift(lag) for lag in (1, 2)], axis=1).drop(mask)

In [None]:
# Left-join the alarm labels onto the windowed PM rows using ID and TIME as
# keys; PM rows without a matching alarm are kept.
tmp = pm.merge(alarm, how='left', on=['ID', 'TIME'])
# Free the inputs of the join.
del mask, alarm, pm

In [None]:
# ATTENTION: precision is lowered to 32-bit floats to reduce RAM usage.
# The key and grouping columns are removed first; afterwards the label column
# gets 0 where no alarm was matched and is stored as an 8-bit integer.
tmp = tmp.drop(columns=['ID', 'TIME', 'GROUP']).astype(np.float32)
tmp['ALARM'] = tmp['ALARM'].fillna(0).astype(np.int8)


In [None]:
# Construct the model input and label ndarrays.
# The time-window cell concatenates three whole frames side by side, so every
# row holds three consecutive blocks of 86 values: the row itself, the row
# shifted by 1 and the row shifted by 2. Because reshape reads values in row
# order, the step axis must come before the feature axis: reshaping to
# (-1, 86, 3, 1) and swapping the axes afterwards would instead cut each row
# into 86 triples of neighbouring features and mix features across steps.
# The resulting shape, (samples, 3, 86, 1), is the same as before.
# NOTE(review): block 0 is the current row, block 1 the previous one, block 2
# the one before that - confirm the consumer expects this step order.
pm = tmp.drop(columns='ALARM').values.reshape(-1, 3, 86, 1)
alarm = tmp['ALARM'].values
del tmp

In [None]:
# Write the feature array and the label array to .npy files in the working
# directory.
np.save(file='86_3_pm.npy', arr=pm)
np.save(file='alarm.npy', arr=alarm)