<img src="Swisens_logo.png" width="240" height="240" align="left"/>
<div style="text-align: right">
    SwisensDataAnalyzer Introduction
    <br>Machine Learning Model Validation
    <br>Author: <a href="mailto:yanick.zeder@swisens.ch">Yanick Zeder</a>
    <br> Copyright 2021, Swisens AG
    <br> <a href="mailto:yanick.zeder@swisens.ch"> Support </a>
    <br><br>
    <b>Adapted and modified by MeteoSwiss.</b>
</div>

## Time Series Comparison

#### Imports

In [1]:
# run this if you made changes to the poleno-ml code 
# NB: Those changes must have been made to the /tf/tmp/poleno-ml repository to have an effect on this notebook's code.
# NB: However, changes made to the tmp repository are temporary and will be rolled back when Docker VM will be shutdown.
#     If you want to make them permanent, dupplicate them to /tf/home/dependencies/poleno-ml.
!pip install /tf/tmp/poleno-ml

Processing /tf/tmp/poleno-ml
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: poleno-ml
  Building wheel for poleno-ml (setup.py) ... [?25ldone
[?25h  Created wheel for poleno-ml: filename=poleno_ml-0.1.0-py3-none-any.whl size=16657 sha256=91d3188ca906b75af7efc1be07bf59341d70e8fca274355b774a7ffd327bb7c6
  Stored in directory: /root/.cache/pip/wheels/36/27/94/c36c0ca182dfe6d14b2ad2190409db7ec462f251c1019d9266
Successfully built poleno-ml
Installing collected packages: poleno-ml
  Attempting uninstall: poleno-ml
    Found existing installation: poleno-ml 0.1.0
    Uninstalling poleno-ml-0.1.0:
      Successfully uninstalled poleno-ml-0.1.0
Successfully installed poleno-ml-0.1.0
[0m

In [2]:
%load_ext autoreload
%autoreload 2
import copy
import datetime
import json
import matplotlib.pyplot as plt
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
import numpy as np
import operator as op
import os
import pandas as pd
from poleno_db_interface.database.filter import AndClause, OrClause, ConditionClause, DataColumn
from poleno_db_interface.database.query_utils import DataColumn, finalize_query
from poleno_ml.database.query_interface_ml import QueryInterfaceML, DatasetPipeline
import poleno_db_interface.database.model.poleno_data_model as pdm
import tensorflow as tf
import tensorflow.keras as keras
import time
from tqdm.notebook import tqdm
from uuid import UUID
import uuid

### Parameters

In [3]:
# Name of the trained model to validate (used to build the `models/<model_name>/...` paths).
model_name = 'real1_bis'

# Evaluated period.
start_date = datetime.date(year=2020, month=2, day=19)  # February 19th 2020
end_date = datetime.date(year=2021, month=11, day=1)    # November 1st 2021

# CSV file holding the Hirst reference measurements.
hirst_file_path = os.path.join('validation_input', 'hirst_pay_19022020-01112021.csv')

# Aggregation frequency of the compared time series (pandas offset alias).
agg_freq = '1D'  # 24H

# Device whose events are evaluated.
device_name = "poleno-5"

In [17]:
# Chunk size handed to the database-backed dataset (`db_chunksize` argument).
db_chunksize = 64
# Batch size used when the dataset is re-batched for prediction.
pred_batch_size = 64
# The prediction batch must hold at least one full database chunk.
assert not (pred_batch_size < db_chunksize), 'Predictions are way slower if pred_batch_size is smaller than db_chunksize.'

In [5]:
# Location of the saved model and of the JSON file describing it.
model_path = os.path.join('models', model_name, 'model')
model_info_file_path = os.path.join(model_path, 'model_info.json')

# Location of the dataset cache; its name encodes the device and the evaluated period.
eval_cache_name = f'cache_{device_name}_{start_date}-{end_date}'
eval_cache_path = os.path.join('validation_input', eval_cache_name)

#### Prepare the tf pipeline
Similar to training and testing, we set up a tf dataset pipeline. Here we use the more flexible function `query_interface_ml.prepare_tf_dataset_from_event_filter`, which accepts an arbitrary filter. We use it to get all the events measured by one SwisensPoleno device in a defined time range.

Additionally, we filter out all events that do not comply with the minimal size and solidity conditions.

In [6]:
# Read the JSON metadata stored alongside the trained model.
with open(model_info_file_path, 'r') as info_file:
    model_info = json.load(info_file)

# Restore the trained model; it is only used for inference, so it is loaded without compiling it.
model = keras.models.load_model(model_path, compile=False)

In [7]:
# Evaluation artefacts go to `models/<model name>/eval`; the directory is created if needed.
eval_dir = os.path.join('models', model_info['model_name'], 'eval')
os.makedirs(eval_dir, exist_ok=True)

# CSV file receiving the predictions; its name encodes the device and the period (ddmmYYYY).
poleno_file_name = f'{device_name}_{start_date.strftime("%d%m%Y")}-{end_date.strftime("%d%m%Y")}.csv'
poleno_file_path = os.path.join(eval_dir, poleno_file_name)

### Get new model's predictions

#### Pull raw data

In [8]:
# NOTE(review): this import sits outside the notebook's import cell; consider moving it there.
import myloginpath
# Read the connection settings from the `client` section of the login-path file.
db_config = myloginpath.parse('client', path='/tf/.mylogin.cnf')

# Connect to the database and create an interface instance
query_interface_ml = QueryInterfaceML(**db_config)

In [9]:
# Bounds of the evaluated period as POSIX timestamps.
# NOTE(review): time.mktime interprets the dates in the machine's local time zone — confirm that
# this matches the convention of the timestamps stored in the database.
period_start_ts = time.mktime(start_date.timetuple())
period_end_ts = time.mktime(end_date.timetuple())

# Raw data of `device_name` measured strictly inside the period, restricted to events whose two
# images ("img0" / "img1") satisfy the minimal area (>= 625) and solidity (>= 0.9) conditions.
event_conditions = [
    ConditionClause(pdm.Event.timestamp, op.gt, period_start_ts),
    ConditionClause(pdm.Event.timestamp, op.lt, period_end_ts),
    ConditionClause(pdm.Event.device_id_str, op.eq, device_name),
    ConditionClause(pdm.ImageAnalysis.particleArea, op.ge, 625, "img0"),
    ConditionClause(pdm.ImageAnalysis.particleArea, op.ge, 625, "img1"),
    ConditionClause(pdm.ImageAnalysis.particleSolidity, op.ge, 0.9, "img0"),
    ConditionClause(pdm.ImageAnalysis.particleSolidity, op.ge, 0.9, "img1"),
    ConditionClause(pdm.ImageAnalysis.ImageData_id, op.eq, 0, "img0"),
    ConditionClause(pdm.ImageAnalysis.ImageData_id, op.eq, 1, "img1"),
]
filter_ = AndClause(*event_conditions)

# Build the dataset with the features and batch size the model was trained with;
# the features are deep-copied before being handed over, and timestamps are requested too.
timeseries_dataset = query_interface_ml.prepare_tf_dataset_from_event_filter(
    filter_=filter_,
    db_chunksize=db_chunksize,
    batch_size=model_info['batch_size'],
    model_features=copy.deepcopy(model_info['model_features']),
    include_timestamps=True,
)
timeseries_dataset.dataset_length

 received 2963000unique evet list finished. Calling prepare_tf_dataset function 1.8073307898239188


2963832

In [None]:
# Enable the dataset cache at `eval_cache_path`.
# NOTE(review): presumably `prepare=True` fills the cache right away (a full pass over the data) — confirm in poleno-ml.
timeseries_dataset.enable_cache(eval_cache_path, prepare=True)

In [18]:
# Re-batch the pipeline to the prediction batch size, then let tf.data prefetch the next batches.
rebatched_dataset = timeseries_dataset.tf_dataset.unbatch().batch(pred_batch_size)
timeseries_dataset.tf_dataset = rebatched_dataset.prefetch(tf.data.AUTOTUNE)

#### Predict

In [None]:
# Get the model's prediction for each event and save them, indexed by event time, to a CSV file.
labels = np.array(model_info['classes'])  # class names, looked up by the model's output position
list_batch_preds = []

# Number of batches, rounded UP so the progress bar also counts a last, incomplete batch
# (floor division would stop one short whenever the length is not a multiple of the batch size).
n_batches = (timeseries_dataset.dataset_length + pred_batch_size - 1) // pred_batch_size

for id_batch, feature_batch in tqdm(timeseries_dataset.get_data_pipeline(with_id=True),
                                    total=n_batches, leave=False):
    #feature_batch['input_1'] = feature_batch.pop('rec0') # only uncomment this if you need backward compatibility with the old model's architecture
    #feature_batch['input_2'] = feature_batch.pop('rec1') # only uncomment this if you need backward compatibility with the old model's architecture
    # compute predictions
    # NOTE(review): Keras recommends `model(x, training=False)` or `predict_on_batch` when
    # predicting many small batches in a loop — consider switching if this loop is slow.
    preds = model.predict(feature_batch, verbose=False)
    # predicted class = position of the highest score; certainty = that highest score
    y_pred = np.argmax(preds, axis=-1)
    certainties = np.max(preds, axis=-1)
    list_batch_preds.append(pd.DataFrame({
        'event_id': [id_.decode() for id_ in id_batch["id"].numpy()],
        'pred_class': labels[y_pred],
        'pred_certainty': certainties,
        'event_timestamp': list(id_batch["timestamp"].numpy()),
    }))

# one DataFrame for all batches, with a fresh 0..n-1 index
df_poleno = pd.concat(list_batch_preds, ignore_index=True)
del list_batch_preds
# convert timestamp from double (seconds since the epoch) to datetime and use it as the index
df_poleno['event_timestamp'] = pd.to_datetime(df_poleno['event_timestamp'].apply(float), unit="s")
df_poleno = df_poleno.set_index('event_timestamp')
df_poleno.to_csv(poleno_file_path) # save to csv

  0%|          | 0/1447 [00:00<?, ?it/s]

In [None]:
# Roll back the database session held by the query interface.
# NOTE(review): presumably this releases the transaction left open by the long-running read above — confirm.
query_interface_ml.session.rollback()

### Get Hirst data

In [None]:
# Load the Hirst reference measurements and keep one column per class known to the model.
df_hirst = pd.read_csv(hirst_file_path)
timestamp_cols = ['Year', 'Month', 'Day', 'Hour', 'Minute']
# assemble the date/time columns into a single timestamp, then drop them
df_hirst['event_timestamp'] = pd.to_datetime(df_hirst[timestamp_cols])
df_hirst = df_hirst.drop(timestamp_cols, axis=1)
# the value 32767 is treated as "missing" and replaced by NaN
df_hirst = df_hirst.replace(32767, np.nan)
df_hirst.index = df_hirst.event_timestamp
df_hirst = df_hirst[[c for c in df_hirst.columns if c in list(model_info['classes'])]]

# TODO: decide how missing Hirst values must be handled (drop, interpolate, mask the whole bin, ...).
# The former unconditional `raise Exception('deal with NaN')` stopped every "Run All" at this cell
# and made the preview below unreachable; the open issue is reported explicitly instead.
n_missing = int(df_hirst.isna().sum().sum())
if n_missing > 0:
    print(f'WARNING: {n_missing} missing Hirst values were replaced by NaN; '
          'sum() aggregations skip NaN, so they contribute 0 to the aggregated series.')
df_hirst.head(3)

### Plots

In [None]:
# temporary cell: maximum per-bin totals of both data sources
# Add one 0/1 indicator column per predicted class to a copy of the predictions.
new__ = df_poleno.copy()
for class_name in df_poleno['pred_class'].unique():
    is_class = df_poleno['pred_class'] == class_name
    new__[class_name] = is_class.astype(int)

poleno_bin_max = new__.resample(agg_freq).sum().max()
hirst_bin_max = df_hirst.resample(agg_freq).sum().max()
poleno_bin_max, hirst_bin_max

In [None]:
# Class compared between the two data sources in the plots below.
label_to_plot = 'Fraxinus'
# Predictions are kept only when their certainty is strictly above this value.
threshold = 0.9

In [None]:
# poleno: number of events predicted as `label_to_plot` with a certainty above `threshold`,
# counted per `agg_freq` bin
is_selected = (df_poleno.pred_class == label_to_plot) & (df_poleno.pred_certainty > threshold)
df_poleno_to_plot = (
    df_poleno[is_selected]
    .rename(columns={'event_id': 'poleno'})
    .resample(agg_freq)  # frequency conversion and resampling of time series
    .count()
)
if df_poleno_to_plot.empty: # if no event has been found, create "empty" (all-zero) time series
    # Build a DatetimeIndex named 'event_timestamp' and binned with `agg_freq`, i.e. shaped like
    # the index the resampling produces, so the series can still be merged with the Hirst data
    # on 'event_timestamp' (an unnamed PeriodIndex cannot).
    ix_ = pd.date_range(df_poleno.index.min().normalize(), df_poleno.index.max(),
                        freq=agg_freq, name='event_timestamp')
    df_poleno_to_plot = pd.DataFrame({'poleno': np.zeros(len(ix_))}, index=ix_)
df_poleno_to_plot.head(3)

In [None]:
# hirst: total of the `label_to_plot` column per `agg_freq` bin, as a one-column frame
hirst_binned = df_hirst[label_to_plot].resample(agg_freq).sum()
df_hirst_to_plot = hirst_binned.to_frame(name='hirst')
df_hirst_to_plot.head(3)

In [None]:
# Align both series on their timestamps; the outer join keeps bins present in only one of them.
df_final_plot = df_poleno_to_plot.merge(df_hirst_to_plot, on='event_timestamp', how='outer')[['poleno', 'hirst']]
# restrict the comparison to the period 2020-02-19 .. 2020-07-01
df_final_plot = df_final_plot['2020-02-19':'2020-07-01']
print(df_final_plot.corr())

# Explicit figure/axes: DataFrame.plot returns an Axes (not a Figure), and calling `.plot()` on it
# without arguments draws nothing, so the frame is plotted once onto `ax`.
fig, ax = plt.subplots(figsize=(5, 3))
df_final_plot.plot(ax=ax, title=label_to_plot, ylabel='concentration')
plt.show()

In [None]:
# Correlation between the poleno and Hirst series (off-diagonal entry of the correlation matrix).
df_final_plot.corr().loc['hirst', 'poleno']

In [None]:
## 1.2 read hirst df
# NOTE(review): this cell overwrites the `df_hirst` built earlier in the notebook (different
# input file, index named 'timestamp' instead of 'event_timestamp') — confirm it is still needed.
print('read hirst data...')
hirstfile = "hirst_pay_19022020-24052020.csv"
df_hirst = pd.read_csv(hirstfile)  # read_csv already returns a DataFrame
# create timestamp index
# Vectorised assembly of the date/time columns: a row-wise `apply` hands float values to
# datetime.datetime (TypeError) as soon as the row is upcast to float, and is much slower.
df_hirst['timestamp'] = pd.to_datetime(df_hirst[['Year', 'Month', 'Day', 'Hour', 'Minute']])
# filter the data: keep only the columns named after one of the model's classes
df_hirst = df_hirst.set_index('timestamp')[[lbl for lbl in labels if lbl in df_hirst.columns]]
#df_hirst.columns = labels
# the value 32767 is treated as "missing" and replaced by NaN
df_hirst = df_hirst.replace(32767,np.nan)
df_hirst.head()

In [None]:
## 3.3 resample hirst data
print('resample hirst...')
agg_time = pd.date_range(start=start_date, end=end_date, freq=agg_freq)
df_hirst_agg = pd.DataFrame(index=agg_time, columns=df_hirst.columns)

# Average the Hirst values falling into the window attached to each aggregation time `t`:
#   * agg_freq >= 1 day: forward-looking window  [t, t + agg_freq)
#   * agg_freq <  1 day: backward-looking window (t - agg_freq, t]
agg_delta = pd.Timedelta(agg_freq)  # loop invariant, computed once
for t in tqdm(agg_time, desc='resample hirst', leave=False):
    if agg_delta >= pd.Timedelta('1d'):
        tmp_bool = (df_hirst.index >= t) & (df_hirst.index < (t + agg_delta))
    else:
        tmp_bool = (df_hirst.index > (t - agg_delta)) & (df_hirst.index <= t)

    # Iterate over the columns actually present in the Hirst data (a model class that is absent
    # from the file would raise a KeyError), and write with a single `.loc[row, column]`:
    # the chained form `df.loc[row][column] = ...` may assign to a temporary copy.
    for label in df_hirst_agg.columns:
        # NaN-skipping mean; an empty or all-NaN window yields NaN
        df_hirst_agg.loc[t, label] = df_hirst.loc[tmp_bool, label].mean()

# NOTE(review): `abc_switch` and `output_path` are not defined in the visible part of the notebook.
if abc_switch:
    df_hirst_agg['abc'] = df_hirst_agg[['alnus','betula','corylus']].sum(axis=1)
# ---

labels = df_hirst_agg.columns
df_hirst_agg.to_csv(output_path+"hirst.csv")

### ==========================================
def _pair_corr(series_a, series_b):
    """Return the correlation between two series, aligned on their index and cast to float."""
    return pd.concat([series_a, series_b], axis=1).astype(float).corr().to_numpy()[0][1]


# NOTE(review): `seuils`, `pswitch` and the `df_p*_agg` frames are not defined in the visible part
# of the notebook; nothing computed inside the loop depends on `seuil`, so every threshold gets
# the same correlations unless those frames are recomputed per threshold elsewhere — confirm.
df_corr_all = pd.DataFrame(columns=labels)
df_corr_all['seuil']=np.nan
corr_frames = [df_corr_all]  # one table per threshold ("seuil"), concatenated once at the end
for seuil in seuils:
    ### 4. correlation ==============================
    print('computing correlations...')
    df_corr = pd.DataFrame(columns=labels, index=['hirst-p2','hirst-p4','hirst-p5','p2-p4','p2-p5','p4-p5','avg_hirst','avg_p2','avg_p4','avg_p5'])
    for label in labels:
        # Single `.loc[row, column]` assignments: the chained form `df.loc[row][column] = ...`
        # may write to a temporary copy and leave `df_corr` unchanged.
        df_corr.loc['avg_hirst', label] = df_hirst_agg[label].mean()
        # `pswitch` enables the devices p2, p4 and p5 (in that order); a device's frame is only
        # accessed when its switch is on.
        if pswitch[0] == 1:
            df_corr.loc['hirst-p2', label] = _pair_corr(df_hirst_agg[label], df_p2_agg[label])
            df_corr.loc['avg_p2', label] = df_p2_agg[label].mean()
        if pswitch[1] == 1:
            df_corr.loc['hirst-p4', label] = _pair_corr(df_hirst_agg[label], df_p4_agg[label])
            df_corr.loc['avg_p4', label] = df_p4_agg[label].mean()
        if pswitch[2] == 1:
            df_corr.loc['hirst-p5', label] = _pair_corr(df_hirst_agg[label], df_p5_agg[label])
            df_corr.loc['avg_p5', label] = df_p5_agg[label].mean()
        # device-to-device correlations, only when both devices are enabled
        if pswitch[0] + pswitch[1] == 2:
            df_corr.loc['p2-p4', label] = _pair_corr(df_p2_agg[label], df_p4_agg[label])
        if pswitch[0] + pswitch[2] == 2:
            df_corr.loc['p2-p5', label] = _pair_corr(df_p2_agg[label], df_p5_agg[label])
        if pswitch[1] + pswitch[2] == 2:
            df_corr.loc['p4-p5', label] = _pair_corr(df_p4_agg[label], df_p5_agg[label])

    #df_corr.to_csv(output_path+seuillabel+"_correlations.csv")
    df_corr['seuil']=seuil
    corr_frames.append(df_corr)

# concatenate once instead of growing the frame inside the loop
df_corr_all = pd.concat(corr_frames)

#### Old Prepare poleno data