Import and load dataset

In [1]:
import pandas as pd

df = pd.read_csv("prepared_data/final_dataset.csv")

In [2]:
df.head()

Unnamed: 0,city_latitude,city_longitude,day_tempmax,day_tempmin,day_temp,day_precipcover,day_moonphase,hour_datetimeEpoch,hour_temp,hour_humidity,...,yar,year,yet,yevgeny,york,youth,zaporizhzhia,zelensky,znpp,zone
0,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645653600,2.4,89.18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49.2336,28.4486,5.0,0.7,2.8,4.17,0.77,1645653600,2.1,91.76,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,48.5085,32.2656,6.2,-1.3,2.2,0.0,0.77,1645653600,0.0,82.64,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,49.4168,26.9743,4.7,0.2,2.3,8.33,0.77,1645653600,2.2,88.52,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,47.8289,35.1626,8.0,-2.0,3.3,0.0,0.77,1645653600,1.0,80.38,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, root_mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA

In [4]:
X_regression = df.drop(["alarms_start_epoch", "alarms_end_epoch", "is_alarm"], axis=1)
y_regression = df["alarms_start_epoch"]

X_classification = df.drop(["alarms_start_epoch", "alarms_end_epoch", "is_alarm"], axis=1)
y_classification = df["is_alarm"]

num_attribs = X_regression.select_dtypes(include='number').columns.tolist()
cat_attribs = ["hour_conditions", "region_x"]
list_attribs = ["hour_preciptype"]

In [5]:
class MultiLabelTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlb = MultiLabelBinarizer()

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            X_list = X.iloc[:, 0].tolist()
        elif isinstance(X, pd.Series):
            X_list = X.tolist()
        else:
            X_list = X

        X_list = [x if isinstance(x, list) else [x] if x is not None else [] for x in X_list]
        self.mlb.fit(X_list)
        return self

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X_list = X.iloc[:, 0].tolist()
        elif isinstance(X, pd.Series):
            X_list = X.tolist()
        else:
            X_list = X

        X_list = [x if isinstance(x, list) else [x] if x is not None else [] for x in X_list]
        return self.mlb.transform(X_list)


num_regression_pipeline = Pipeline([
    ("std_scaler", StandardScaler()),
])

full_regression_pipeline = ColumnTransformer([
    ("num", num_regression_pipeline, num_attribs),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
    ("list", MultiLabelTransformer(), list_attribs)
])
regression_pipeline = Pipeline([
    ('preprocessor', full_regression_pipeline),
    ('model', LinearRegression())
])


In [11]:
num_classification_pipeline = Pipeline([
    ("std_scaler", StandardScaler()),
    ("pca", PCA())
])

full_classification_pipeline = ColumnTransformer([
    ("num", num_classification_pipeline, num_attribs),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
    ("list", MultiLabelTransformer(), list_attribs)
])
classification_pipeline = Pipeline([
    ('preprocessor', full_classification_pipeline),
    ('model', LogisticRegression())
])

In [7]:
import numpy as np

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []

for train_index, test_index in tscv.split(X_regression):
    X_train, X_test = X_regression.iloc[train_index], X_regression.iloc[test_index]
    y_train, y_test = y_regression.iloc[train_index], y_regression.iloc[test_index]

    regression_pipeline.fit(X_train, y_train)
    y_pred = regression_pipeline.predict(X_test)

    rmse_scores.append(root_mean_squared_error(y_test, y_pred) / 3600)

print(f"RMSE scores: {rmse_scores}")
print(f"Mean RMSE: {np.mean(rmse_scores):.4f}")



RMSE scores: [3.2326445734970277, 2.4131722314526916, 3.1908234543348324, 4.572515921690248, 4.677813742375525]
Mean RMSE: 3.6174


In [8]:
param_grid = {
    'model__fit_intercept': [True, False],
}

In [9]:
grid_search = GridSearchCV(
    regression_pipeline,
    param_grid,
    cv=tscv,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_regression, y_regression)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best RMSE: {-grid_search.best_score_:.4f}")

best_regression_model = grid_search.best_estimator_

Fitting 5 folds for each of 2 candidates, totalling 10 fits


8 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\misha\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\misha\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\misha\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 652, in fit
    Xt = self._fit(X, y, routed_params, raw_par

KeyboardInterrupt: 

It is too big to run for now, so we left as it is

In [49]:
for train_index, test_index in tscv.split(X_regression):
    X_train, X_test = X_regression.iloc[train_index], X_regression.iloc[test_index]
    y_train, y_test = y_regression.iloc[train_index], y_regression.iloc[test_index]

    best_regression_model.fit(X_train, y_train)
    y_pred = best_regression_model.predict(X_test)

    rmse_scores.append(root_mean_squared_error(y_test, y_pred) / 3600)

print(f"RMSE scores: {rmse_scores}")
print(f"Mean RMSE: {np.mean(rmse_scores):.4f}")



RMSE scores: [3.2340757803424034, 2.0521664316204986, 1.9762866799585281, 3.2221608300296745, 2.770249515575807, 3.234075180667554, 2.051468359280797, 1.9740742870406656, 3.2221171008311593, 2.7699321191399773]
Mean RMSE: 2.6507


The same results as we trained at the beggining

In [12]:
tscv = TimeSeriesSplit(n_splits=5)
accuracy_scores = []

for train_index, test_index in tscv.split(X_classification):
    X_train, X_test = X_classification.iloc[train_index], X_classification.iloc[test_index]
    y_train, y_test = y_classification.iloc[train_index], y_classification.iloc[test_index]

    classification_pipeline.fit(X_train, y_train)
    y_pred = classification_pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

print(f"Accuracy scores: {accuracy_scores}")
print(f"Mean accuracy: {np.mean(accuracy_scores):.4f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy scores: [0.8007020797456617, 0.6458849800351986, 0.6230058853585149, 0.6251821433302425, 0.6202145979599947]
Mean accuracy: 0.6630


In [85]:
# param_grid = {
#     # 'preprocessor__num__pca__n_components': [0.5, 0.2, 0.01, 0.95, 0.9, 0.8, 1, 5, 10, 25, 50, 70]
# }

In [86]:
grid_search = GridSearchCV(
    classification_pipeline,
    param_grid,
    cv=tscv,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_classification, y_classification)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best accuracy: {grid_search.best_score_:.4f}")

best_classification_model = grid_search.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'preprocessor__num__pca__n_components': 70}
Best accuracy: -0.7649
