In [1]:
import io
import numpy as np
import os
import pandas as pd
from pathlib import Path
import requests
import zipfile

from datetime import datetime
from sklearn import datasets, ensemble

from evidently.pipeline.column_mapping import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
from evidently.metric_preset import TargetDriftPreset
from evidently.metric_preset import DataQualityPreset
from evidently.metric_preset.regression_performance import RegressionPreset

from evidently.metrics import (
    RegressionQualityMetric,
    RegressionPredictedVsActualScatter,
    RegressionPredictedVsActualPlot,
    RegressionErrorPlot,
    RegressionAbsPercentageErrorPlot,
    RegressionErrorDistribution,
    RegressionErrorNormality,
    RegressionTopErrorMetric,
    RegressionErrorBiasTable,

    DatasetSummaryMetric,
    ColumnSummaryMetric,
    DatasetMissingValuesMetric,
    DatasetCorrelationsMetric
)

In [2]:
raw_data = pd.read_csv('data/data4.csv',sep=',',parse_dates=['time'],index_col='time')

In [3]:
raw_data.head()

Unnamed: 0_level_0,sensor1_air_hum,sensor1_air_temp,sensor2_air_hum,sensor2_air_temp,agromet_main_air_temp1,agromet_main_air_temp2
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-10-01 00:00:00,77.947,15.193,80.198,15.041,12.1,14.4
2022-10-01 00:10:00,79.257,14.939,80.963,14.843,11.933333,14.316667
2022-10-01 00:20:00,80.14,14.787,81.747,14.646,11.766667,14.233333
2022-10-01 00:30:00,80.908,14.56,82.914,14.36,11.6,14.15
2022-10-01 00:40:00,80.258,14.814,82.415,14.491,11.433333,14.066667


In [4]:
# Get weeks number
days = len(raw_data.index.unique())
weeks = days / 7

print(f'days = {days}; weeks = {weeks}')

days = 3359; weeks = 479.85714285714283


In [5]:
raw_data.index[0]

Timestamp('2022-10-01 00:00:00')

In [6]:
raw_data.index[-1]

Timestamp('2022-10-31 10:00:00')

In [7]:
raw_data.columns

Index(['sensor1_air_hum', 'sensor1_air_temp', 'sensor2_air_hum',
       'sensor2_air_temp', 'agromet_main_air_temp1', 'agromet_main_air_temp2'],
      dtype='object')

In [8]:
REF_TIME_START = '2022-10-01 00:00:00'
REF_TIME_END = '2022-10-15 00:00:00'

CURRENT_TIME_START = '2022-10-16 00:00:00'
CURRENT_TIME_END = '2022-10-31 00:00:00'

target = "sensor1_air_temp"
prediction = "prediction"
numerical_features = ['sensor1_air_hum', 'sensor1_air_temp', 'sensor2_air_hum',
       'sensor2_air_temp', 'agromet_main_air_temp1', 'agromet_main_air_temp2']

reports_dir = Path('reports') / f"{CURRENT_TIME_START}_{CURRENT_TIME_END}"
reports_dir.mkdir(exist_ok=True)

In [9]:
reference = raw_data.loc[REF_TIME_START:REF_TIME_END]
current = raw_data.loc[CURRENT_TIME_START:CURRENT_TIME_END]

In [10]:
reference.head()

Unnamed: 0_level_0,sensor1_air_hum,sensor1_air_temp,sensor2_air_hum,sensor2_air_temp,agromet_main_air_temp1,agromet_main_air_temp2
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-10-01 00:00:00,77.947,15.193,80.198,15.041,12.1,14.4
2022-10-01 00:10:00,79.257,14.939,80.963,14.843,11.933333,14.316667
2022-10-01 00:20:00,80.14,14.787,81.747,14.646,11.766667,14.233333
2022-10-01 00:30:00,80.908,14.56,82.914,14.36,11.6,14.15
2022-10-01 00:40:00,80.258,14.814,82.415,14.491,11.433333,14.066667


In [11]:
regressor = ensemble.RandomForestRegressor(random_state = 0, n_estimators = 50)

In [12]:
regressor.fit(reference[numerical_features], reference[target])

In [13]:
ref_prediction = regressor.predict(reference[numerical_features])
current_prediction = regressor.predict(current[numerical_features])

In [14]:
reference['prediction'] = ref_prediction
current['prediction'] = current_prediction

## DATA DRIFT

In [15]:
column_mapping = ColumnMapping()
column_mapping.numerical_features = numerical_features

In [16]:
data_drift_report = Report(metrics=[DataDriftPreset()])
data_drift_report.run(
    reference_data=reference,
    current_data=current.loc[CURRENT_TIME_START:CURRENT_TIME_END],
    column_mapping=column_mapping
)

In [17]:
data_drift_report_path = reports_dir / 'data_drift.html'
data_drift_report.save_html(data_drift_report_path)

## TARGET DRIFT

In [18]:
target_drift_report = Report(metrics=[TargetDriftPreset()])
target_drift_report.run(
    reference_data=reference,
    current_data=current.loc[CURRENT_TIME_START:CURRENT_TIME_END],
    column_mapping=column_mapping
)

In [19]:
target_drift_report_path = reports_dir / 'target_drift.html'
target_drift_report.save_html(target_drift_report_path)

## MODEL PERFORMANCE

In [20]:
column_mapping = ColumnMapping()

column_mapping.target = target
column_mapping.prediction = prediction
column_mapping.numerical_features = numerical_features

In [21]:
regression_performance_report = Report(metrics=[RegressionPreset()])

regression_performance_report.run(
    reference_data=reference,
    current_data=current.loc[CURRENT_TIME_START:CURRENT_TIME_END],
    column_mapping=column_mapping
)

In [22]:
model_performance_report_path = reports_dir / 'model_performance.html'
regression_performance_report.save_html(model_performance_report_path)

## DATA QUALITY

In [23]:
column_mapping = ColumnMapping()
column_mapping.numerical_features = numerical_features

In [24]:
data_quality_report = Report(metrics=[DataQualityPreset()])
data_quality_report.run(
    reference_data=reference,
    current_data=current.loc[CURRENT_TIME_START:CURRENT_TIME_END],
    column_mapping=column_mapping
)


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



In [25]:
data_quality_report_path = reports_dir / 'data_quality.html'
data_quality_report.save_html(data_quality_report_path)