## Import libs

In [382]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from math import sqrt
from fedot.api.main import Fedot
from fedot.core.data.data import InputData
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum, TsForecastingParams
from fedot.core.pipelines.pipeline import Pipeline
from fedot.core.pipelines.node import PrimaryNode, SecondaryNode
import logging
import warnings
logging.raiseExceptions = False
warnings.filterwarnings('ignore')

## Load data

In [403]:
data = pd.read_parquet('data/AllYearsES1.parquet')
data = data[data['Estacao'] == 'Vila Velha - Ibes']
data['datetime'] = pd.to_datetime(data['Data'] + ' ' + data['Hora'], format='mixed', errors='coerce')
data.set_index('datetime', inplace=True)
data = data.sort_index()

In [404]:
data[['Poluente', 'Valor']].groupby(['Poluente']).count()

Unnamed: 0_level_0,Valor
Poluente,Unnamed: 1_level_1
CO,68393
MP10,64656
MP2.5,47085
NO,9552
NO2,42648
O3,55188
PTS,26182
SO2,42065


In [405]:
mp_10 = data[data['Poluente'] == 'MP10']
mp_10.head()

Unnamed: 0_level_0,Data,Hora,Estacao,Codigo,Poluente,Valor,Unidade,Tipo
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-01-01 00:30:00,2015-01-01,00:30,Vila Velha - Ibes,ES07,MP10,14.0,ug/m3,automatica
2015-01-01 01:30:00,2015-01-01,01:30,Vila Velha - Ibes,ES07,MP10,37.0,ug/m3,automatica
2015-01-01 02:30:00,2015-01-01,02:30,Vila Velha - Ibes,ES07,MP10,23.0,ug/m3,automatica
2015-01-01 03:30:00,2015-01-01,03:30,Vila Velha - Ibes,ES07,MP10,10.0,ug/m3,automatica
2015-01-01 04:30:00,2015-01-01,04:30,Vila Velha - Ibes,ES07,MP10,5.0,ug/m3,automatica


In [6]:
fig = px.line(mp_10, x=mp_10.index, y=['Valor'],
              labels={'value': 'Концентрация (ug/m3)', 'datetime': 'Время'},
              title='MP10')
fig.show()

In [406]:
def get_longest_continuous_segment(df, max_delta_minutes=60):
    df = df.copy()
    time_index = df.index.to_series()

    delta = time_index.diff().fillna(pd.Timedelta(seconds=0))
    gap = delta > pd.Timedelta(minutes=max_delta_minutes)

    group_id = gap.cumsum()

    longest_group = group_id.value_counts().idxmax()

    return df[group_id == longest_group]


mp_10_longest = get_longest_continuous_segment(mp_10)

In [407]:
print(mp_10_longest.shape)
print(mp_10_longest.isna().sum())
mp_10_longest.head()

(1366, 8)
Data        0
Hora        0
Estacao     0
Codigo      0
Poluente    0
Valor       0
Unidade     0
Tipo        0
dtype: int64


Unnamed: 0_level_0,Data,Hora,Estacao,Codigo,Poluente,Valor,Unidade,Tipo
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-19 10:30:00,2017-01-19,10:30,Vila Velha - Ibes,ES07,MP10,47.0,ug/m3,automatica
2017-01-19 10:30:00,2017-01-19,10:30,Vila Velha - Ibes,ES07,MP10,47.0,ug/m3,automatica
2017-01-19 11:30:00,2017-01-19,11:30,Vila Velha - Ibes,ES07,MP10,26.0,ug/m3,automatica
2017-01-19 11:30:00,2017-01-19,11:30,Vila Velha - Ibes,ES07,MP10,26.0,ug/m3,automatica
2017-01-19 12:30:00,2017-01-19,12:30,Vila Velha - Ibes,ES07,MP10,23.0,ug/m3,automatica


## Single-option

### Baseline

In [408]:
def create_windows(data, window_size):
    
    num_rows = data.shape[0]
    
    X = []
    y = []
    
    for i in range(num_rows - window_size):
        window = data[i:i + window_size]
        target = data[i + window_size]
        X.append(window)
        y.append(target)
    
    return np.array(X), np.array(y)


In [661]:
window_size = 49
X, y = create_windows(mp_10_longest.Valor.values, window_size)

In [662]:
def train_test_split_ordered(X, y, test_ratio=0.2):
    split_idx = int(len(X) * (1 - test_ratio))
    
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split_ordered(X, y)

In [489]:
def moving_average_forecast(X):
    predictions = []
    for x in X:
        yhat = np.mean(x)
        predictions.append(yhat)
    return predictions

baseline_full_predictions = moving_average_forecast(X)
baseline_test_predictions = moving_average_forecast(X_test)

In [490]:
mp_10_ma = mp_10['Valor'].rolling(window=window_size).mean()

fig = px.line(title='Time-Series of MP10 with moving average forecast')
fig.add_scatter(y=y_test, mode='lines', name='Original')
fig.add_scatter(y=baseline_test_predictions, mode='lines', name=f'MA ({window_size}h)')
fig.update_layout(xaxis_title='Data', yaxis_title='MP10 values, ug/m3')
fig.show()

In [491]:
mae_baseline = mean_absolute_error(y_test, baseline_test_predictions)
rmse_baseline = np.sqrt(mean_squared_error(y_test, baseline_test_predictions))
print("Baseline (moving_average): MAE =", mae_baseline, "RMSE =", rmse_baseline)

Baseline (moving_average): MAE = 7.216450216450216 RMSE = 10.734408026602926


## Linear Regression

In [676]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [677]:
fig = px.line(title='Time-Series of MP10 with Linear Regression forecast')
fig.add_scatter(y=y_test, mode='lines', name='Original')
fig.add_scatter(y=y_pred, mode='lines', name=f'Linear Regression ({window_size}h)')
fig.update_layout(xaxis_title='Data', yaxis_title='MP10 values, ug/m3')
fig.show()

In [494]:
mae_linreg = mean_absolute_error(y_test, y_pred)
rmse_linreg = np.sqrt(mean_squared_error(y_test, y_pred))
print("Linear Regression: MAE =", mae_linreg, "RMSE =", rmse_linreg)

Linear Regression: MAE = 3.990367066793272 RMSE = 7.320427883534277


### FEDOT

In [757]:
def make_forecast(train_data, len_forecast: int, window_size: int):
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(train_data)),
                            features=train_data,
                            target=train_data,
                            task=task,
                            data_type=DataTypesEnum.ts)

    start_forecast = len(train_data)
    end_forecast = start_forecast + len_forecast
    forecast_idx = np.arange(start_forecast, end_forecast)
    predict_input = InputData(idx=forecast_idx,
                              features=train_data,
                              target=train_data,
                              task=task,
                              data_type=DataTypesEnum.ts)
    
    # First level
    node_lagged_1 = PrimaryNode('lagged')
    node_lagged_1.parameters = {'window_size': window_size}

    # Second level
    node_final = SecondaryNode('ridge', nodes_from=[node_lagged_1])

    pipeline = Pipeline(node_final)
    pipeline.fit(train_input)

    predicted_output = pipeline.predict(predict_input)
    forecast = np.ravel(np.array(predicted_output.predict))

    return forecast

In [758]:
len_forecast = len(y_test)
predicts = []
for i in range(len_forecast):
    train_data = mp_10_longest.Valor.values[:(-len_forecast+i)]
    
    predicts.append(make_forecast(train_data, 1, window_size)[0])

predicts = np.array(predicts)

In [759]:
fig = px.line(title='Time-Series of MP10 with FEDOT forecast')
fig.add_scatter(y=y_test, mode='lines', name='Original')
fig.add_scatter(y=predicts, mode='lines', name=f'Fedot ({window_size}h)')
fig.update_layout(xaxis_title='Data', yaxis_title='MP10 values, ug/m3')
fig.show()

In [760]:
mae_fedot = mean_absolute_error(y_test, predicts)
rmse_fedot = np.sqrt(mean_squared_error(y_test, predicts))
print(f"MAE: {mae_fedot:.4f}, RMSE: {rmse_fedot:.4f}")

MAE: 3.7459, RMSE: 7.1042


### Comparison

In [761]:
fig_compare = go.Figure()
fig_compare.add_trace(go.Scatter(y=predicts, mode='lines', name='FEDOT'))
fig_compare.add_trace(go.Scatter(y=y_test, mode='lines', name='Test Data'))
fig_compare.add_trace(go.Scatter(y=baseline_test_predictions, mode='lines', name='Baseline (MA)'))
fig_compare.add_trace(go.Scatter(y=y_pred, mode='lines', name='Linear Regression'))
fig_compare.update_layout(title='Forecasting', xaxis_title='Data', yaxis_title='MP10, ug/m3')
fig_compare.show()

# Вывод метрик для обоих методов
metrics_df = pd.DataFrame({
    'Metrics': ['MAE', 'RMSE'],
    'Baseline (MA)': [mae_baseline, rmse_baseline],
    'Linear Regression': [mae_linreg, rmse_linreg],
    'FEDOT': [mae_fedot, rmse_fedot]
})
print(metrics_df)

  Metrics  Baseline (MA)  Linear Regression     FEDOT
0     MAE      14.693187           3.990367  3.745932
1    RMSE      21.212531           7.320428  7.104185


## Multivariate

In [762]:
all_polutant_data = data.pivot_table(index='datetime', columns='Poluente', values='Valor')
all_polutant_data = all_polutant_data[all_polutant_data.index < '2016-01-01']
all_polutant_data.head()

Poluente,CO,MP10,MP2.5,NO,NO2,O3,PTS,SO2
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-01-01 00:30:00,305.53,14.0,8.0,1.69,5.3,30.59,18.0,
2015-01-01 01:30:00,428.31,37.0,10.0,2.28,9.04,28.06,39.0,1.07
2015-01-01 02:30:00,330.15,23.0,22.0,2.35,6.61,30.01,26.0,2.17
2015-01-01 03:30:00,226.61,10.0,8.0,0.64,5.01,31.93,13.0,1.24
2015-01-01 04:30:00,156.58,5.0,5.0,0.51,2.97,35.81,7.0,1.88


In [763]:
all_polutant_data = all_polutant_data.sort_index()
print(all_polutant_data.isna().sum())

Poluente
CO         24
MP10      324
MP2.5     178
NO       7373
NO2      7373
O3         21
PTS       643
SO2       134
dtype: int64


In [764]:
all_polutant_filtered = all_polutant_data[['CO', 'MP2.5', 'MP10', 'SO2']]
all_polutant_filtered.dropna(inplace=True)
all_polutant_longest = get_longest_continuous_segment(all_polutant_filtered)

In [765]:
print(all_polutant_longest.shape)
print(all_polutant_longest.isna().sum())

(208, 4)
Poluente
CO       0
MP2.5    0
MP10     0
SO2      0
dtype: int64


In [766]:
for column in all_polutant_longest.columns:
    fig = px.line(all_polutant_longest, y=column, title=f'{column} over time')
    fig.show()

In [767]:
def extract_features(df: pd.DataFrame, window_size: int, target_column: str):
    stats_functions = {
        'mean': np.mean,
        'std': np.std,
        'min': np.min,
        'max': np.max,
        'median': np.median,
        'skew': lambda x: pd.Series(x).skew(),
        'kurt': lambda x: pd.Series(x).kurt()
    }

    feature_rows = []

    for i in range(len(df) - window_size):
        window = df.iloc[i:i+window_size]
        target_idx = i + window_size

        if target_idx >= len(df):
            break
        
        features = {}
        for col in df.columns:
            for stat_name, func in stats_functions.items():
                features[f"{col}_{stat_name}"] = func(window[col].values)
        
        features['target'] = df.iloc[target_idx][target_column]
        feature_rows.append(features)

    return pd.DataFrame(feature_rows)

In [800]:
all_polutant_stats_features = extract_features(all_polutant_longest, 49, 'MP10')

X_train, X_test, y_train, y_test = train_test_split_ordered(all_polutant_stats_features.drop('target', axis=1),
                                                            all_polutant_stats_features['target'])

### Baseline

In [801]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [810]:
fig = px.line(title='Multivariative Time-Series of MP10 with Linear Regression forecast')
fig.add_scatter(y=y_test, mode='lines', name='Original')
fig.add_scatter(y=y_pred, mode='lines', name=f'Linear regression ({window_size}h)')
fig.update_layout(xaxis_title='Data', yaxis_title='MP10 values, ug/m3')
fig.show()

In [803]:
mae_baseline = mean_absolute_error(y_test, y_pred)
rmse_baseline = np.sqrt(mean_squared_error(y_test, y_pred))
print("Baseline (moving_average): MAE =", mae_baseline, "RMSE =", rmse_baseline)

Baseline (moving_average): MAE = 14.693187424426792 RMSE = 21.212531100779078


### FEDOT

In [804]:
def make_forecast(train_data, target, len_forecast: int, window_size: int):
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(train_data)),
                            features=train_data,
                            target=target,
                            task=task,
                            data_type=DataTypesEnum.ts)

    start_forecast = len(train_data)
    end_forecast = start_forecast + len_forecast
    forecast_idx = np.arange(start_forecast, end_forecast)
    predict_input = InputData(idx=forecast_idx,
                              features=train_data,
                              target=target,
                              task=task,
                              data_type=DataTypesEnum.ts)
    
    # First level
    node_lagged_1 = PrimaryNode('lagged')
    node_lagged_1.parameters = {'window_size': window_size}

    # Second level
    node_final = SecondaryNode('ridge', nodes_from=[node_lagged_1])

    pipeline = Pipeline(node_final)
    pipeline.fit(train_input)

    predicted_output = pipeline.predict(predict_input)
    forecast = np.ravel(np.array(predicted_output.predict))

    return forecast

In [805]:
len_forecast = len(y_test)
predicts = []
for i in range(len_forecast):
    train_data = all_polutant_stats_features.drop('target', axis=1).values[:(-len_forecast+i)]
    target = all_polutant_stats_features.target
    
    predicts.append(make_forecast(train_data, target, 1, window_size)[0])

predicts = np.array(predicts)

In [809]:
fig = px.line(title='Multivariative Time-Series of MP10 with Linear Regression forecast')
fig.add_scatter(y=y_test, mode='lines', name='Original')
fig.add_scatter(y=predicts, mode='lines', name=f'Fedot ({window_size}h)')
fig.update_layout(xaxis_title='Data', yaxis_title='MP10 values, ug/m3')
fig.show()

In [807]:
mae_fedot = mean_absolute_error(y_test, predicts)
rmse_fedot = np.sqrt(mean_squared_error(y_test, predicts))
print("Fedot: MAE =", mae_fedot, "RMSE =", rmse_fedot)

Fedot: MAE = 13.968173938745181 RMSE = 18.372518928221513


### Comparison

In [808]:
fig_compare = go.Figure()
fig_compare.add_trace(go.Scatter(y=predicts, mode='lines', name='FEDOT'))
fig_compare.add_trace(go.Scatter(y=y_test, mode='lines', name='Test Data'))
fig_compare.add_trace(go.Scatter(y=y_pred, mode='lines', name='Linear Regression'))
fig_compare.update_layout(title='Forecasting', xaxis_title='Data', yaxis_title='MP10, ug/m3')
fig_compare.show()

# Вывод метрик для обоих методов
metrics_df = pd.DataFrame({
    'Metrics': ['MAE', 'RMSE'],
    'Linear Regression': [mae_baseline, rmse_baseline],
    'FEDOT': [mae_fedot, rmse_fedot]
})
print(metrics_df)

  Metrics  Linear Regression      FEDOT
0     MAE          14.693187  13.968174
1    RMSE          21.212531  18.372519
