In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
import pickle

In [2]:
df = pd.read_parquet('../data/all_data_preprocessed/all_merged.parquet')

In [3]:
X = df.drop(columns=[
    'event_all_region','alarms_in_regions',
    'event_2h_ago'])
y = df['event_all_region']

In [4]:
X.rename(columns={"event_1h_ago": "status"}, inplace=True)

In [5]:
tscv = TimeSeriesSplit(n_splits=5)
splits = list(tscv.split(X))
train_idx, test_idx = splits[-1]
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

In [6]:
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    eval_metric='logloss'
)


In [9]:
xgb_model.fit(X_train_scaled, y_train)

In [10]:
y_pred = xgb_model.predict(X_test_scaled)

In [11]:
accuracy_score(y_test, y_pred)

0.919181699044095

In [12]:
confusion_matrix(y_test, y_pred)

array([[81635,  4119],
       [ 4268, 13754]])

In [13]:
f1_score(y_test, y_pred)

0.7663462877838139

In [14]:
print(
classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.95      0.95      0.95     85754
         1.0       0.77      0.76      0.77     18022

    accuracy                           0.92    103776
   macro avg       0.86      0.86      0.86    103776
weighted avg       0.92      0.92      0.92    103776



In [15]:
importance = xgb_model.feature_importances_
feature_names = X_train.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})

In [16]:
importance_df = importance_df.sort_values(by='Importance', ascending=False)

In [17]:
top_20_features = importance_df.head(20)
print(top_20_features)

                   Feature  Importance
1038                status    0.370034
1037  event_lastDay_region    0.009986
600                    563    0.004193
366                    329    0.003898
728                    691    0.003762
618                    581    0.003654
159                    122    0.003302
656                    619    0.003111
179                    142    0.002988
405                    368    0.002983
424                    387    0.002695
819                    782    0.002530
169                    132    0.002521
877                    840    0.002468
495                    458    0.002379
441                    404    0.002365
403                    366    0.002363
836                    799    0.002335
213                    176    0.002331
272                    235    0.002265


In [18]:
with open('../src/our_models/3_Xgboost_1hour.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

In [19]:
X_train

Unnamed: 0,region_id,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,...,994,995,996,997,998,999,event_lastDay_region,status,ru_holiday,ua_holiday
0,1.0,1.645740e+09,6.0,-1.0,2.6,-0.4,81.900002,0.0,0.0,126.199997,...,0.521,0.503,0.524,0.504,0.477,0.611,0.0,0.0,0.0,0.0
1,1.0,1.645740e+09,6.0,-1.0,2.6,-0.4,81.900002,0.0,0.0,126.199997,...,0.521,0.503,0.524,0.504,0.477,0.611,0.0,0.0,0.0,0.0
2,1.0,1.645740e+09,6.0,-1.0,2.6,-0.4,81.900002,0.0,0.0,126.199997,...,0.521,0.503,0.524,0.504,0.477,0.611,2.0,0.0,0.0,0.0
3,1.0,1.645740e+09,6.0,-1.0,2.6,-0.4,81.900002,0.0,0.0,126.199997,...,0.521,0.503,0.524,0.504,0.477,0.611,2.0,0.0,0.0,0.0
4,1.0,1.645740e+09,6.0,-1.0,2.6,-0.4,81.900002,0.0,0.0,126.199997,...,0.521,0.503,0.524,0.504,0.477,0.611,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518875,21.0,1.740780e+09,4.9,-6.9,-0.9,-5.9,71.300003,0.0,0.0,140.100006,...,0.525,0.608,0.604,0.556,0.555,0.734,10.0,0.0,0.0,0.0
518876,21.0,1.740780e+09,4.9,-6.9,-0.9,-5.9,71.300003,0.0,0.0,140.100006,...,0.525,0.608,0.604,0.556,0.555,0.734,10.0,1.0,0.0,0.0
518877,21.0,1.740780e+09,4.9,-6.9,-0.9,-5.9,71.300003,0.0,0.0,140.100006,...,0.525,0.608,0.604,0.556,0.555,0.734,10.0,1.0,0.0,0.0
518878,21.0,1.740780e+09,4.9,-6.9,-0.9,-5.9,71.300003,0.0,0.0,140.100006,...,0.525,0.608,0.604,0.556,0.555,0.734,10.0,1.0,0.0,0.0


In [21]:
X

Unnamed: 0,region_id,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,...,994,995,996,997,998,999,event_lastDay_region,status,ru_holiday,ua_holiday
0,1,1645740000,6.0,-1.0,2.6,-0.4,81.9,0.0,0.00,126.2,...,0.521,0.503,0.524,0.504,0.477,0.611,0,0,0,0
1,1,1645740000,6.0,-1.0,2.6,-0.4,81.9,0.0,0.00,126.2,...,0.521,0.503,0.524,0.504,0.477,0.611,0,0,0,0
2,1,1645740000,6.0,-1.0,2.6,-0.4,81.9,0.0,0.00,126.2,...,0.521,0.503,0.524,0.504,0.477,0.611,2,0,0,0
3,1,1645740000,6.0,-1.0,2.6,-0.4,81.9,0.0,0.00,126.2,...,0.521,0.503,0.524,0.504,0.477,0.611,2,0,0,0
4,1,1645740000,6.0,-1.0,2.6,-0.4,81.9,0.0,0.00,126.2,...,0.521,0.503,0.524,0.504,0.477,0.611,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
622651,25,1740780000,0.6,-2.4,-0.7,-2.8,86.3,2.0,4.17,41.0,...,0.525,0.608,0.604,0.556,0.555,0.734,6,0,0,0
622652,25,1740780000,0.6,-2.4,-0.7,-2.8,86.3,2.0,4.17,41.0,...,0.525,0.608,0.604,0.556,0.555,0.734,6,0,0,0
622653,25,1740780000,0.6,-2.4,-0.7,-2.8,86.3,2.0,4.17,41.0,...,0.525,0.608,0.604,0.556,0.555,0.734,6,0,0,0
622654,25,1740780000,0.6,-2.4,-0.7,-2.8,86.3,2.0,4.17,41.0,...,0.525,0.608,0.604,0.556,0.555,0.734,6,0,0,0
