In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
import pickle
import joblib

In [2]:
df = pd.read_parquet('../data/all_data_preprocessed/all_merged.parquet')

In [3]:
X = df.drop(columns=[
    'event_all_region','alarms_in_regions',"event_lastDay_region",
    'event_1h_ago',"event_lastDay_region"])
y = df['event_all_region']

In [4]:
X.rename(columns={"event_2h_ago": "status"}, inplace=True)

In [5]:
tscv = TimeSeriesSplit(n_splits=5)
splits = list(tscv.split(X))
train_idx, test_idx = splits[-1]
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

In [6]:
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    eval_metric='aucpr',
)


In [9]:
xgb_model.fit(X_train_scaled, y_train)

In [10]:
y_pred = xgb_model.predict(X_test_scaled)

In [11]:
accuracy_score(y_test, y_pred)

0.863513721862473

In [12]:
confusion_matrix(y_test, y_pred)

array([[81525,  4229],
       [ 9935,  8087]])

In [13]:
f1_score(y_test, y_pred)

0.5331267717054519

In [14]:
print(
classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.89      0.95      0.92     85754
         1.0       0.66      0.45      0.53     18022

    accuracy                           0.86    103776
   macro avg       0.77      0.70      0.73    103776
weighted avg       0.85      0.86      0.85    103776



In [15]:
importance = xgb_model.feature_importances_
feature_names = X_train.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})

In [16]:
importance_df = importance_df.sort_values(by='Importance', ascending=False)

In [17]:
top_20_features = importance_df.head(20)
print(top_20_features)

                 Feature  Importance
1037              status    0.054991
740                  703    0.005753
0              region_id    0.004578
321                  284    0.003682
28       hour_visibility    0.003659
12           day_sunrise    0.003457
567                  530    0.003276
148                  111    0.003215
380                  343    0.003176
885                  848    0.003103
808                  771    0.003032
16    hour_datetimeEpoch    0.002887
721                  684    0.002884
985                  948    0.002823
346                  309    0.002732
701                  664    0.002699
369                  332    0.002682
399                  362    0.002581
497                  460    0.002569
705                  668    0.002517


In [18]:
with open('../src/our_models/3_Xgboost_2hour.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

In [19]:
X_train

Unnamed: 0,region_id,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,...,993,994,995,996,997,998,999,status,ru_holiday,ua_holiday
0,1.0,1.645740e+09,6.0,-1.0,2.6,-0.4,81.900002,0.0,0.0,126.199997,...,0.534,0.534,0.502,0.529,0.501,0.466,0.621,0.0,0.0,0.0
1,1.0,1.645740e+09,6.0,-1.0,2.6,-0.4,81.900002,0.0,0.0,126.199997,...,0.534,0.534,0.502,0.529,0.501,0.466,0.621,0.0,0.0,0.0
2,1.0,1.645740e+09,6.0,-1.0,2.6,-0.4,81.900002,0.0,0.0,126.199997,...,0.534,0.534,0.502,0.529,0.501,0.466,0.621,0.0,0.0,0.0
3,1.0,1.645740e+09,6.0,-1.0,2.6,-0.4,81.900002,0.0,0.0,126.199997,...,0.534,0.534,0.502,0.529,0.501,0.466,0.621,0.0,0.0,0.0
4,1.0,1.645740e+09,6.0,-1.0,2.6,-0.4,81.900002,0.0,0.0,126.199997,...,0.534,0.534,0.502,0.529,0.501,0.466,0.621,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518875,21.0,1.740780e+09,4.9,-6.9,-0.9,-5.9,71.300003,0.0,0.0,140.100006,...,0.643,0.552,0.598,0.612,0.565,0.558,0.738,0.0,0.0,0.0
518876,21.0,1.740780e+09,4.9,-6.9,-0.9,-5.9,71.300003,0.0,0.0,140.100006,...,0.643,0.552,0.598,0.612,0.565,0.558,0.738,0.0,0.0,0.0
518877,21.0,1.740780e+09,4.9,-6.9,-0.9,-5.9,71.300003,0.0,0.0,140.100006,...,0.643,0.552,0.598,0.612,0.565,0.558,0.738,1.0,0.0,0.0
518878,21.0,1.740780e+09,4.9,-6.9,-0.9,-5.9,71.300003,0.0,0.0,140.100006,...,0.643,0.552,0.598,0.612,0.565,0.558,0.738,1.0,0.0,0.0


In [20]:
X

Unnamed: 0,region_id,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,...,993,994,995,996,997,998,999,status,ru_holiday,ua_holiday
0,1,1645740000,6.0,-1.0,2.6,-0.4,81.9,0.0,0.00,126.2,...,0.534,0.534,0.502,0.529,0.501,0.466,0.621,0,0,0
1,1,1645740000,6.0,-1.0,2.6,-0.4,81.9,0.0,0.00,126.2,...,0.534,0.534,0.502,0.529,0.501,0.466,0.621,0,0,0
2,1,1645740000,6.0,-1.0,2.6,-0.4,81.9,0.0,0.00,126.2,...,0.534,0.534,0.502,0.529,0.501,0.466,0.621,0,0,0
3,1,1645740000,6.0,-1.0,2.6,-0.4,81.9,0.0,0.00,126.2,...,0.534,0.534,0.502,0.529,0.501,0.466,0.621,0,0,0
4,1,1645740000,6.0,-1.0,2.6,-0.4,81.9,0.0,0.00,126.2,...,0.534,0.534,0.502,0.529,0.501,0.466,0.621,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
622651,25,1740780000,0.6,-2.4,-0.7,-2.8,86.3,2.0,4.17,41.0,...,0.643,0.552,0.598,0.612,0.565,0.558,0.738,0,0,0
622652,25,1740780000,0.6,-2.4,-0.7,-2.8,86.3,2.0,4.17,41.0,...,0.643,0.552,0.598,0.612,0.565,0.558,0.738,0,0,0
622653,25,1740780000,0.6,-2.4,-0.7,-2.8,86.3,2.0,4.17,41.0,...,0.643,0.552,0.598,0.612,0.565,0.558,0.738,0,0,0
622654,25,1740780000,0.6,-2.4,-0.7,-2.8,86.3,2.0,4.17,41.0,...,0.643,0.552,0.598,0.612,0.565,0.558,0.738,0,0,0


In [21]:
joblib.dump(scaler, '../src/our_models/scaler_2hour.pkl')

['../src/our_models/scaler_2hour.pkl']