In [6]:
# Load the previously saved preprocessing objects from the local `encoders/` directory.
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load trusted files.
try:
    # Older scikit-learn releases ship a vendored joblib under sklearn.externals.
    from sklearn.externals import joblib
except ImportError:
    # sklearn.externals.joblib was removed in scikit-learn 0.23; fall back to the
    # standalone joblib package so the cell still runs on current releases.
    import joblib
from keras.models import load_model
min_max_scaler = joblib.load('encoders/Minmax_scaler')
autoencoder = load_model('encoders/encoder_1layer_75dims')
label_encoder = joblib.load('encoders/label_encoder')
onehot_encoder = joblib.load('encoders/onehot_encoder')

Using TensorFlow backend.


In [2]:
# Third-party libraries used by the rest of the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

# Load the European data set: `pm` from the network-data file, `alarm` from the labels file.
pm, alarm = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'data/Europe_network_data.parquet',
        'data/Europe_network_labels.parquet',
    )
)

In [3]:
# Build the label table: drop demo entries and unused columns, and key every
# remaining alarm by the day *before* it was raised.
#
# * `na=False` treats a missing description as "not a demo row"; without it the
#   mask contains NaN and the `~` negation breaks.
# * `regex=False` matches the text literally.
# * `.copy()` makes the filtered frame independent, so the column assignments
#   below do not raise SettingWithCopyWarning.
alarm = alarm.loc[
    ~alarm['description'].str.contains('Demo for Josh', na=False, regex=False)
].copy()
# Truncate the alarm time to the start of its day.
# (`infer_datetime_format` is deprecated in recent pandas and is not needed here.)
alarm['TIME'] = pd.to_datetime(alarm['time']).dt.floor('1D')
alarm = alarm.drop(['description','time','timestamp','extraAttributes'], axis=1).rename({'category':'ALARM'}, axis=1)
# Shift the key one day back so the label lines up with the previous day's row.
alarm['TIME'] = alarm['TIME'] - pd.DateOffset(1)
# Every remaining row is a positive label.
alarm['ALARM'] = 1

In [4]:
# Row filter: keep LABEL in {'IS', 'n/a'} and discard the GROUPBYKEY values listed
# in `dev_list`; then zero-fill gaps, drop LABEL and order rows by (ID, TIME).
dev_list = [
    'CHMON', 'STM64', 'OC192', 'STTP', 'STM4', 'STM16',
    'NMCMON', 'OC48', 'OC12', 'OC3', 'FLEX', 'RAMAN',
]
wanted_label = pm['LABEL'].isin(['IS','n/a'])
excluded_device = pm['GROUPBYKEY'].isin(dev_list)
pm = (
    pm.loc[wanted_label & ~excluded_device]
    .fillna(0)
    .drop(columns='LABEL')
    .set_index(['ID','TIME'])
    .sort_index()
    .reset_index()
)
# Columns 0-2 are ID, TIME and GROUPBYKEY; everything after them goes through the scaler.
pm.iloc[:, 3:] = min_max_scaler.transform(pm.iloc[:, 3:])

In [5]:
# Preview the first rows of the filtered, scaled frame.
pm.head()

Unnamed: 0,ID,TIME,GROUPBYKEY,BBE-RS,CV-OTU,CV-S,DROPGAINAVG-OTS,DROPGAINMAX-OTS_DROPGAINMIN-OTS_-,E-CV,E-ES,E-INFRAMESERR_E-INFRAMES_/,E-OUTFRAMESERR_E-OUTFRAMES_/,E-UAS,ES-OTU,ES-RS,ES-S,OCH-OPRAVG,OCH-OPRMAX_OCH-OPRMIN_-,OCH-SPANLOSSAVG,OCH-SPANLOSSMAX_OCH-SPANLOSSMIN_-,OPINAVG-OTS,OPINMAX-OTS_OPINMIN-OTS_-,OPOUTAVG-OTS,OPOUTAVG-OTS_OPINAVG-OTS_-,OPOUTMAX-OTS_OPOUTMIN-OTS_-,OPRAVG-OCH,OPRAVG-OTS,OPRMAX-OCH_OPRMIN-OCH_-,OPRMAX-OTS_OPRMIN-OTS_-,OPTAVG-OCH,OPTAVG-OTS,OPTMAX-OCH_OPTMIN-OCH_-,OPTMAX-OTS_OPTMIN-OTS_-,ORLAVG-OTS,ORLMIN-OTS,OTU-CV,OTU-ES,OTU-QAVG,OTU-QSTDEV,PCS-CV,PCS-ES,PCS-UAS,QAVG-OTU,QSTDEV-OTU,RS-BBE,RS-ES,S-CV,S-ES
0,Device100005,2019-01-27,ETH10G,0.0,0.0,0.0,0.841868,0.0,0.0,0.0,0.0,0.0,0.999942,0.0,0.0,0.0,0.946844,0.0,0.0,0.0,0.839793,0.0,0.681106,0.420274,0.0,0.887208,0.684211,0.0,0.004319,0.8,0.829646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Device100005,2019-01-28,ETH10G,0.0,0.0,0.0,0.841868,0.0,0.0,0.0,0.0,0.0,0.739839,0.0,0.0,0.0,0.946844,0.0,0.0,0.0,0.839793,0.0,0.681106,0.420274,0.0,0.887208,0.684211,0.0,0.004319,0.8,0.829646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Device100005,2019-01-30,ETH10G,0.0,0.0,0.0,0.841868,0.0,0.0,0.0,0.0,0.0,0.000393,0.0,0.0,0.0,0.946844,0.0,0.0,0.0,0.839793,0.0,0.681106,0.420274,0.0,0.887208,0.684211,0.0,0.004319,0.8,0.829646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Device100005,2019-01-31,ETH10G,0.0,0.0,0.0,0.841868,0.0,0.0,0.0,0.0,0.0,0.000162,0.0,0.0,0.0,0.946844,0.0,0.0,0.0,0.839793,0.0,0.681106,0.420274,0.0,0.887208,0.684211,0.0,0.004319,0.8,0.829646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Device100005,2019-02-09,ETH10G,0.0,0.0,0.0,0.841868,0.0,0.0,4.6e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.946844,0.0,0.0,0.0,0.839793,0.0,0.681106,0.420274,0.0,0.887208,0.684211,0.0,0.004319,0.8,0.829646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Encode the rows: run the scaled numeric columns through the loaded Keras model
# and one-hot encode GROUPBYKEY, then keep only ID, TIME and the encoded features.
tmp_pm = autoencoder.predict(pm.iloc[:, 3:].values)
# LabelEncoder expects a 1-D array; passing a column vector is what triggers the
# `column_or_1d` DataConversionWarning.
GBK = label_encoder.transform(pm['GROUPBYKEY'].values)
# The one-hot encoder, in contrast, wants a 2-D column of integer codes.
GBK = onehot_encoder.transform(np.reshape(GBK, [-1, 1]))
GBK = pd.DataFrame(GBK.toarray())
# `pm` carries a fresh 0..n-1 index at this point, so the positional frame built
# from the encoded arrays lines up row by row.
pm = pd.concat([pm[['ID','TIME']],pd.DataFrame(np.concatenate([tmp_pm, GBK], axis=1))], axis=1)
del tmp_pm, GBK

  y = column_or_1d(y, warn=True)


In [8]:
# Number the runs of consecutive daily rows and collect the runs that are too short.
day = pd.Timedelta('1d')
# A new run starts whenever the step to the previous row is not exactly one day
# OR the row belongs to a different device. Without the ID check, a device whose
# first day happens to follow the previous device's last day by exactly one day
# would be glued onto that device's run.
breaks = (pm['TIME'].diff() != day) | (pm['ID'] != pm['ID'].shift())
groups = breaks.cumsum()

# Run ids with fewer than 4 rows.
# NOTE(review): the heading says "consecutive 3 days", yet runs of exactly 3 rows
# are discarded as well -- TODO confirm the intended threshold.
tmp = groups.value_counts()
tmp = tmp[tmp<4]

In [9]:
# Remove every row that belongs to one of the too-short runs listed in `tmp`.
mask = groups.index[groups.isin(tmp.index)]
pm = pm.drop(index=mask)
# The helper series are no longer needed.
del breaks, groups

In [10]:
# Re-number the surviving runs; GROUP is the key for the per-run shifts that follow.
# A run is broken by a gap in TIME or by a change of device ID, so that a run
# never spans two devices.
pm['GROUP'] = ((pm['TIME'].diff() != day) | (pm['ID'] != pm['ID'].shift())).cumsum()

In [11]:
# Build the 3-step window: each row is extended with the values of the previous
# one and two rows of the same GROUP.
pm = pm.set_index(['ID','TIME'])
tmp = pm.groupby(['GROUP'])
# The first two rows of each group have no complete history (their shifted
# values are NaN); remember their labels so they can be removed afterwards.
mask = tmp.head(2).index
# Resulting column layout: [current row | shifted by 1 | shifted by 2].
pm = pd.concat([pm, *(tmp.shift(lag) for lag in (1, 2))], axis=1).drop(mask)

In [12]:
# Attach the labels to the windows with a left join on (ID, TIME).
# Alarm times were floored to the day, so several alarms of one device can share
# the same key; de-duplicate them first, otherwise the join would emit the same
# window once per alarm.
tmp = pd.merge(
    pm,
    alarm.drop_duplicates(subset=['ID','TIME']),
    on=['ID','TIME'],
    how='left',
)
del pm, alarm, mask

In [13]:
# NOTE: everything is stored as 32-bit floats to keep memory usage down.
# The join keys and the run id are dropped before the cast.
tmp = tmp.drop(columns=['ID','TIME', 'GROUP']).astype('float32')
# Rows without a matching alarm got NaN from the left join -> label 0.
alarm_flag = tmp['ALARM'].fillna(0)
tmp['ALARM'] = alarm_flag.astype('int8')
del alarm_flag


In [14]:
# Convert to ndarrays: `pm` becomes (samples, 3, 86, 1), `alarm` the label vector.
# Each row holds three consecutive 86-value blocks [current | shifted 1 | shifted 2],
# so the row has to be split as 3 x 86. The earlier reshape(-1, 86, 3, 1) followed
# by swapaxes produced the same shape but interleaved the values, mixing different
# features into the "time" axis.
pm = tmp.iloc[:, :-1].values.reshape(-1, 3, 86, 1)
alarm = tmp['ALARM'].values
del tmp

In [17]:
# Writing the European arrays to disk is currently disabled; uncomment to save them.
# np.save('86_3_pm.npy', pm)
# np.save('alarm.npy', alarm)

In [20]:
import pandas as pd

# Load the Tokyo data and the European data; the latter serves as the reference
# for column names and column order.
data = pd.read_parquet('data/Tokyo_Network_Data_1Day.parquet')
data_eu = pd.read_parquet('data/Europe_network_data.parquet')

# --- replace labels ---
# Mapping from Tokyo column names to the European naming scheme.
rep_list = {
    'CV-E':'E-CV',
    'CV-PCS': 'PCS-CV',
    'INFRAMESERR-E_INFRAMES-E_/': 'E-INFRAMESERR_E-INFRAMES_/',
    'ES-PCS': 'PCS-ES',
    'UAS-PCS': 'PCS-UAS',
    'SPANLOSSMAX-OCH_SPANLOSSMIN-OCH_-': 'OCH-SPANLOSSMAX_OCH-SPANLOSSMIN_-',
    'UAS-E': 'E-UAS',
    # Was mapped to 'S-ES'; every other '*-E' counter maps to 'E-*', and the
    # European frame has a separate 'E-ES' column.
    'ES-E': 'E-ES',
    'OUTFRAMESERR-E_OUTFRAMES-E_/': 'E-OUTFRAMESERR_E-OUTFRAMES_/',
    'SPANLOSSAVG-OCH': 'OCH-SPANLOSSAVG'
}
# rename the columns
data = data.rename(columns = rep_list)

# --- Unify the columns ---
# Give the Tokyo frame exactly the European columns in the European order:
# columns missing in Tokyo are created (filled with NaN), extra ones are dropped.
data = data.reindex(columns=data_eu.columns)

In [22]:
# Run the Tokyo frame through the same preprocessing as the European data.
del data_eu
pm = data

# label:['IS','n/a'], drop devices, fillna, min_max transform
pm = pm.loc[pm['LABEL'].isin(['IS','n/a'])]
dev_list = ['CHMON', 'STM64', 'OC192', 'STTP', 'STM4', 'STM16', 'NMCMON', 'OC48', 'OC12', 'OC3', 'FLEX', 'RAMAN']
pm = pm.loc[~pm['GROUPBYKEY'].isin(dev_list)]
pm = pm.fillna(0)
pm = pm.drop('LABEL',axis=1)
pm = pm.set_index(['ID','TIME']).sort_index().reset_index()
# Columns 0-2 are ID, TIME and GROUPBYKEY; the rest goes through the scaler.
pm.iloc[:, 3:] = min_max_scaler.transform(pm.iloc[:, 3:])

# other transform
tmp_pm = autoencoder.predict(pm.iloc[:,3:].values)
# LabelEncoder expects a 1-D array; a column vector triggers the
# `column_or_1d` DataConversionWarning.
GBK = label_encoder.transform(pm['GROUPBYKEY'].values)
GBK = np.reshape(GBK, [-1, 1])
GBK = onehot_encoder.transform(GBK)
GBK = pd.DataFrame(GBK.toarray())
pm = pd.concat([pm[['ID','TIME']],pd.DataFrame(np.concatenate([tmp_pm, GBK], axis=1))], axis=1)
del tmp_pm, GBK

# make groups when it has consecutive 3 days
# A run is broken by a step other than exactly one day OR by a change of device
# ID, so that a run never continues from one device into the next.
day = pd.Timedelta('1d')
breaks = (pm['TIME'].diff() != day) | (pm['ID'] != pm['ID'].shift())
groups = breaks.cumsum()

# NOTE(review): runs of exactly 3 rows are discarded too -- TODO confirm threshold.
tmp = groups.value_counts()
tmp = tmp[tmp<4]

# drop discrete rows
mask = groups.loc[groups.isin(tmp.index.tolist())].index
pm = pm.drop(mask)
del breaks, groups

# use this group to exec groupby
pm['GROUP'] = ((pm['TIME'].diff() != day) | (pm['ID'] != pm['ID'].shift())).cumsum()
# construct time window
pm = pm.set_index(['ID','TIME'])
tmp = pm.groupby(['GROUP'])
## drop first 2 rows with nan
mask = tmp.head(2).index
# Column layout after the concat: [current row | shifted by 1 | shifted by 2].
pm = pd.concat([pm, tmp.shift(1), tmp.shift(2)], axis=1)
pm = pm.drop(mask)

  y = column_or_1d(y, warn=True)


In [28]:
# Convert the Tokyo windows to ndarrays and save them.
# The preparation steps were previously commented out, so this cell only worked
# with state left over from an earlier manual run; they are executed here so the
# notebook also runs on a fresh kernel.
pm = pm.drop('GROUP', axis=1).astype('float32')
# Placeholder label -1 for every sample (no label table is joined for this data set).
alarm = np.full(len(pm), -1, dtype='int8')
# Each row holds three consecutive 86-value blocks [current | shifted 1 | shifted 2],
# so it is split as 3 x 86. reshape(-1, 86, 3, 1) + swapaxes gave the same shape
# but interleaved the values across the two axes.
pm = pm.values.reshape(-1, 3, 86, 1)

np.save('tokyo_pm_86_3.npy', pm)
np.save('tokyo_alarm.npy', alarm)