In [37]:
import numpy as np
import pandas as pd
from copy import copy
import sys
sys.path.append('/home/ndsviriden/MinMax94/src/utils') 
from constants import data_directory, MmxColumns
from interpolation import interpolate_mmx, create_patterns
from converters import convert_raw_to_mmx
from preprocessing import get_clean_data
from loaders import load_mm94_stations, select_mm94_features
from geographical import find_nearest_wmo_station, add_solar_angles, add_coordinates, add_road_id
from sklearn.ensemble import IsolationForest
from score import get_labels
from score import calc_precision, calc_recall, calc_f1_score
import gc
from tqdm import tqdm_notebook
from functools import reduce

## Loading data

In [38]:
%%time
generate_station_id = [615, 618, 620, 624, 626, 627, 635, 702, 
                       704, 862, 874, 888, 1808, 1813, 1826]

raw = load_mm94_stations(generate_station_id)
raw = select_mm94_features(raw, ['t_air', 't_road', 't_underroad', 'pressure', 'dampness'])

mmx_rwis = convert_raw_to_mmx(raw)
mmx_rwis_interpolated = interpolate_mmx(mmx_rwis)
data = create_patterns(mmx_rwis_interpolated)
data['data_solar_azimuth'], data['data_solar_altitude'] = add_solar_angles(data)
data['data_latitude'], data['data_longitude'] = add_coordinates(data)
data['data_road'] = add_road_id(data)
data = data.dropna(subset=('data_t_road', ))
data_clean = get_clean_data(data)
data_clean = data_clean.reset_index(drop=True)

gc.collect()

  solar_azimuth = np.arccos(cos_az) * np.sign(h_rad)


CPU times: user 31.8 s, sys: 3.29 s, total: 35.1 s
Wall time: 35.1 s


## Generated anomalies

### Single outlier

In [39]:
def generate_single_anomaly(df, index, col=MmxColumns.ROAD_TEMPERATURE):
    sign = np.random.choice([1, -1])
    series_adding = sign * np.random.uniform(2, 5)
    
    perturbated_series = df.loc[index, col] + series_adding
    return perturbated_series

np.random.seed(42)
test = copy(data_clean)
test['label_true'] = False

number_of_anomalies = 30 * len(generate_station_id)
col = MmxColumns.ROAD_TEMPERATURE
index_list = np.random.choice(test.index, number_of_anomalies, replace=False)

for index in index_list:
    perturbated_series = generate_single_anomaly(test, index)
    test.loc[index, col] += perturbated_series
    test.loc[index, 'label_true'] = True

### Short-term anomaly

In [40]:
def generate_short_term_anomaly(df, index, col=MmxColumns.ROAD_TEMPERATURE):
    series_duration = np.random.randint(3, 12)
    sign = np.random.choice([1, -1])
    
    series_adding = sign * np.random.exponential(2, series_duration)
    perturbation = np.cumsum(series_adding)
    
    perturbated_series = df.loc[index: (index+series_duration-1), col] + perturbation
    return perturbated_series


number_of_anomalies = 20 * len(generate_station_id)
col = MmxColumns.ROAD_TEMPERATURE
index_list = np.random.choice(test.index, number_of_anomalies, replace=False)

for index in index_list:
    perturbated_series = generate_short_term_anomaly(test, index)
    series_duration = len(perturbated_series)
    test.loc[index: (index + series_duration - 1), col] += perturbated_series
    test.loc[index: (index + series_duration - 1), 'label_true'] = True

### Long term anomaly

In [41]:
def generate_long_term_anomaly(df, index, col=MmxColumns.ROAD_TEMPERATURE):
    series_duration = np.random.randint(300)
    multiplier = np.random.uniform(1.5, 2)
    perturbation = np.random.normal(0, 5, series_duration)
    #print(series_duration, multiplier, index)
    
    perturbated_series = df.loc[index: (index + series_duration - 1), 'data_t_road'] * multiplier + perturbation
    return perturbated_series

number_of_anomalies = 3 * len(generate_station_id)
col = MmxColumns.ROAD_TEMPERATURE
index_list = np.random.choice(test.index, number_of_anomalies, replace=False)

for index in index_list:
    perturbated_series = generate_long_term_anomaly(test, index)
    series_duration = len(perturbated_series)
    test.loc[index: (index + series_duration - 1), col] += perturbated_series
    test.loc[index: (index + series_duration - 1), 'label_true'] = True

## Feature selection

In [42]:
%%time

from preprocessing import create_feature_df


time = 0
lag_list=(1, 2, 3)
diff_list=((1, 2), (1, 3))
post_process = False
regression_mode = False
variables = ['data_t_road']

df_test = create_feature_df(test, coordinates=False,
                            winter_period=False, time=time, lag_list=lag_list, road_id=False,
                            diff_list=diff_list, post_process=post_process, variables=variables)

features = [col for col in df_test if col.startswith('data_')]
target = [col for col in df_test if col.startswith('target_')]

X_test = np.array(df_test[features + target])

CPU times: user 1.05 s, sys: 97.3 ms, total: 1.15 s
Wall time: 1.15 s


## Fit_Predict

In [31]:
from sklearn.covariance import EllipticEnvelope

thresh_list = []
contamination = len(df_test[df_test['label_true']]) / len(test)

for cont in np.linspace(contamination / 4, contamination * 2, 20):
    
    clf = EllipticEnvelope(contamination=cont)
    clf.fit(X_test)
    df_test['label_predict'] = (1 - clf.predict(X_test)) / 2 
    df_test['decision_function'] =  clf.decision_function(X_test)
    
    window = pd.Timedelta('4h')
    st_id = generate_station_id

    recall = calc_recall(df_test, st_id, window)
    precision = calc_precision(df_test, st_id, window)
    f1_score = calc_f1_score(precision, recall)
    
    thresh_list.append({'Threshold': cont, 'Recall': recall, 'Precision': precision, 'F1': f1_score})
    
    print("Contamination: {0:.4f}".format(cont))
    print("Recall: {0:.3f} \nPrecision: {1:.3f} \nF1:{2:.3f}".format(recall, precision, f1_score))
    print("------------------------------------------------")



Contamination: 0.0022
Recall: 0.844 
Precision: 0.987 
F1:0.910
------------------------------------------------
Contamination: 0.0030
Recall: 0.918 
Precision: 0.979 
F1:0.947
------------------------------------------------
Contamination: 0.0038
Recall: 0.936 
Precision: 0.973 
F1:0.955
------------------------------------------------
Contamination: 0.0046
Recall: 0.959 
Precision: 0.965 
F1:0.962
------------------------------------------------
Contamination: 0.0054
Recall: 0.968 
Precision: 0.949 
F1:0.958
------------------------------------------------
Contamination: 0.0062
Recall: 0.975 
Precision: 0.927 
F1:0.950
------------------------------------------------
Contamination: 0.0070
Recall: 0.984 
Precision: 0.902 
F1:0.941
------------------------------------------------




Contamination: 0.0078
Recall: 0.987 
Precision: 0.871 
F1:0.925
------------------------------------------------
Contamination: 0.0085
Recall: 0.988 
Precision: 0.836 
F1:0.905
------------------------------------------------
Contamination: 0.0093
Recall: 0.990 
Precision: 0.806 
F1:0.889
------------------------------------------------
Contamination: 0.0101
Recall: 0.992 
Precision: 0.773 
F1:0.869
------------------------------------------------
Contamination: 0.0109
Recall: 0.994 
Precision: 0.742 
F1:0.849
------------------------------------------------
Contamination: 0.0117
Recall: 0.995 
Precision: 0.717 
F1:0.833
------------------------------------------------




Contamination: 0.0125
Recall: 0.995 
Precision: 0.692 
F1:0.816
------------------------------------------------




Contamination: 0.0133
Recall: 0.996 
Precision: 0.673 
F1:0.803
------------------------------------------------
Contamination: 0.0141
Recall: 0.997 
Precision: 0.651 
F1:0.787
------------------------------------------------
Contamination: 0.0149
Recall: 0.997 
Precision: 0.634 
F1:0.775
------------------------------------------------




Contamination: 0.0157
Recall: 0.997 
Precision: 0.616 
F1:0.761
------------------------------------------------
Contamination: 0.0165
Recall: 0.997 
Precision: 0.599 
F1:0.749
------------------------------------------------




Contamination: 0.0173
Recall: 0.997 
Precision: 0.580 
F1:0.734
------------------------------------------------


In [32]:
thresh_list

[{'F1': 0.910119249872985,
  'Precision': 0.9872821479039096,
  'Recall': 0.8441436138905238,
  'Threshold': 0.0021659204715816956},
 {'F1': 0.9472042611604105,
  'Precision': 0.9786501377410468,
  'Recall': 0.9177163037080636,
  'Threshold': 0.0029638911716381097},
 {'F1': 0.9545650418488019,
  'Precision': 0.9734129137276181,
  'Recall': 0.9364331959976456,
  'Threshold': 0.003761861871694524},
 {'F1': 0.9619838061963599,
  'Precision': 0.9653088630259624,
  'Recall': 0.9586815773984697,
  'Threshold': 0.004559832571750938},
 {'F1': 0.9582385703097678,
  'Precision': 0.9491428571428572,
  'Recall': 0.9675103001765745,
  'Threshold': 0.005357803271807352},
 {'F1': 0.9504445684931457,
  'Precision': 0.9270557029177718,
  'Recall': 0.9750441436138906,
  'Threshold': 0.006155773971863766},
 {'F1': 0.9408118056153845,
  'Precision': 0.9016585938646705,
  'Recall': 0.9835197174808711,
  'Threshold': 0.0069537446719201805},
 {'F1': 0.9251828277782999,
  'Precision': 0.8707044107965767,
  'R

## Result

In [21]:
window = pd.Timedelta('4h')
st_id = generate_station_id

recall = calc_recall(df_test, st_id, window)
precision = calc_precision(df_test, st_id, window)
f1_score = calc_f1_score(precision, recall)

print("Recall: {0:.3f} \nPrecision: {1:.3f} \nF1:{2:.3f}".format(recall, precision, f1_score))

Recall: 0.989 
Precision: 0.831 
F1:0.903
----------------------------------------


import pickle

pickle.dump(clf, open('/mnt/HARD/MinMax94/models/pickle/elliptic_envelope_artificial.pickle', "wb"))

## Real data

In [45]:
%%time
test_station_id = [114, 117, 119, 302, 303, 307, 393, 442, 503, 504, 
                   511, 516, 1838, 1896]



raw = load_mm94_stations(test_station_id)
raw = select_mm94_features(raw, ['t_air', 't_road', 't_underroad', 'pressure', 'dampness'])

mmx_rwis = convert_raw_to_mmx(raw)
mmx_rwis_interpolated = interpolate_mmx(mmx_rwis)
data = create_patterns(mmx_rwis_interpolated)

data['data_solar_azimuth'], data['data_solar_altitude'] = add_solar_angles(data)
data['data_latitude'], data['data_longitude'] = add_coordinates(data)
data['data_road'] = add_road_id(data)
data = data.dropna(subset=('data_t_road', ))

del data[MmxColumns.ID_AIR_TEMPERATURE], data[MmxColumns.ID_UNDERGROUND_TEMPERATURE], \
    data[MmxColumns.ID_PRESSURE], data[MmxColumns.ID_HUMIDITY]

test_real = data[data['station_id'].isin(test_station_id)]
test_real = test_real.reset_index(drop=True)
test_real['label_true'] = get_labels(test_real, labels_type='true')

del data, raw, mmx_rwis, mmx_rwis_interpolated
gc.collect()

  solar_azimuth = np.arccos(cos_az) * np.sign(h_rad)


CPU times: user 24.8 s, sys: 1.91 s, total: 26.7 s
Wall time: 26.7 s


In [46]:
%%time

from preprocessing import create_feature_df


time = 0
lag_list=(1, 2, 3)
diff_list=((1, 2), (1, 3))
post_process = False
regression_mode = False
variables = ['data_t_road']

df_test_real = create_feature_df(test_real, coordinates=False,
                            winter_period=False, time=time, lag_list=lag_list, road_id=False,
                            diff_list=diff_list, post_process=post_process, variables=variables)

features = [col for col in df_test_real if col.startswith('data_')]
target = [col for col in df_test_real if col.startswith('target_')]

X_test_r = np.array(df_test_real[features + target])

CPU times: user 851 ms, sys: 31.9 ms, total: 882 ms
Wall time: 883 ms


In [47]:
from sklearn.covariance import EllipticEnvelope

thresh_list = []
#contamination = len(df_test[df_test['label_true']]) / len(test)

for cont in np.linspace(0.0021659204715816956, 0.017327363772653565, 20):
    
    clf = EllipticEnvelope(contamination=cont)
    clf.fit(X_test)
    df_test_real['label_predict'] = (1 - clf.predict(X_test_r)) / 2 
    df_test_real['decision_function'] =  clf.decision_function(X_test_r)
    
    window = pd.Timedelta('4h')
    st_id = generate_station_id

    recall = calc_recall(df_test, st_id, window)
    precision = calc_precision(df_test, st_id, window)
    f1_score = calc_f1_score(precision, recall)
    
    thresh_list.append({'Threshold': cont, 'Recall': recall, 'Precision': precision, 'F1': f1_score})
    
    print("Contamination: {0:.4f}".format(cont))
    print("Recall: {0:.3f} \nPrecision: {1:.3f} \nF1:{2:.3f}".format(recall, precision, f1_score))
    print("------------------------------------------------")



KeyError: 'label_predict'