In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, make_scorer
from xgboost import XGBClassifier
import pickle
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_parquet('../../data/all_data_preprocessed/all_merged_tfidf.parquet')

In [3]:
X = df.drop(columns=[
    'event_all_region','alarms_in_regions', 'event_1h_ago',
    'event_2h_ago'])
y = df['event_all_region']

In [4]:
#X.rename(columns={"event_1h_ago": "status"}, inplace=True)

In [5]:
tscv = TimeSeriesSplit(n_splits=5)
splits = list(tscv.split(X))
train_idx, test_idx = splits[-1]
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

In [6]:
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
neg_pos_ratio = sum(y_train == 0) / sum(y_train == 1)
neg_pos_ratio

2.78725174626114

In [9]:
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [6, 8],
    'learning_rate': [0.2, 0.4],
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.6, 0.8],
    'min_child_weight': [1, 3, 5],
    'gamma': [0.5],
}

In [10]:
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='aucpr',
    random_state=42,
    tree_method='hist',
    subsample=0.8,
    n_estimators=150,
    min_child_weight=3,
    max_depth=6,
    learning_rate=0.4,
    gamma=0.5,
    colsample_bytree=0.8,
    n_jobs=-1
)

In [11]:
# f1_scorer = make_scorer(f1_score)

In [12]:
# random_search = RandomizedSearchCV(
#     estimator=xgb_model,
#     param_distributions=param_grid,
#     n_iter=6,
#     scoring=f1_scorer,
#     cv=tscv,
#     verbose=2,
#     random_state=42,
#     n_jobs=1
# )

In [13]:
# random_search.fit(X_train_scaled, y_train)

In [14]:
# print(random_search.best_params_)
# print(random_search.best_score_)
# {'subsample': 0.8, 'n_estimators': 150, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.4, 'gamma': 0.5, 'colsample_bytree': 0.8}
# 0.7994637983532699


In [15]:
# xgb_model = XGBClassifier(
#     **random_search.best_params_,
#     objective='binary:logistic',
#     random_state=42,
#     eval_metric='aucpr',
#     device='cpu',
#     n_jobs=-1
# )

In [16]:
xgb_model.fit(X_train_scaled, y_train)

In [17]:
y_pred = xgb_model.predict(X_test_scaled)

In [18]:
accuracy_score(y_test, y_pred)

0.9034169750231268

In [19]:
confusion_matrix(y_test, y_pred)

array([[83076,  2678],
       [ 7345, 10677]])

In [20]:
f1_score(y_test, y_pred)

0.6805621952385505

In [21]:
print(
classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.92      0.97      0.94     85754
         1.0       0.80      0.59      0.68     18022

    accuracy                           0.90    103776
   macro avg       0.86      0.78      0.81    103776
weighted avg       0.90      0.90      0.90    103776



In [22]:
importance = xgb_model.feature_importances_
feature_names = X_train.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})

In [23]:
importance_df = importance_df.sort_values(by='Importance', ascending=False)

In [24]:
top_20_features = importance_df.head(20)
print(top_20_features)

                   Feature  Importance
1037  event_lastDay_region    0.029217
223                    186    0.007774
318                    281    0.004956
762                    725    0.004906
671                    634    0.004813
391                    354    0.004737
834                    797    0.004607
325                    288    0.004477
464                    427    0.004364
873                    836    0.004229
780                    743    0.004115
359                    322    0.003960
580                    543    0.003690
640                    603    0.003590
734                    697    0.003589
618                    581    0.003487
761                    724    0.003296
216                    179    0.003279
579                    542    0.003275
1        day_datetimeEpoch    0.003183


In [26]:
with open('../../src/our_models/3__Xgboost__v2.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

In [26]:
X_train

Unnamed: 0,region_id,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,...,993,994,995,996,997,998,999,event_lastDay_region,ru_holiday,ua_holiday
0,1.0,1.645740e+09,6.0,-1.0,2.6,-0.4,81.900002,0.0,0.0,126.199997,...,0.527,0.533,0.503,0.519,0.501,0.457,0.617,0.0,0.0,0.0
1,1.0,1.645740e+09,6.0,-1.0,2.6,-0.4,81.900002,0.0,0.0,126.199997,...,0.527,0.533,0.503,0.519,0.501,0.457,0.617,0.0,0.0,0.0
2,1.0,1.645740e+09,6.0,-1.0,2.6,-0.4,81.900002,0.0,0.0,126.199997,...,0.527,0.533,0.503,0.519,0.501,0.457,0.617,2.0,0.0,0.0
3,1.0,1.645740e+09,6.0,-1.0,2.6,-0.4,81.900002,0.0,0.0,126.199997,...,0.527,0.533,0.503,0.519,0.501,0.457,0.617,2.0,0.0,0.0
4,1.0,1.645740e+09,6.0,-1.0,2.6,-0.4,81.900002,0.0,0.0,126.199997,...,0.527,0.533,0.503,0.519,0.501,0.457,0.617,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518875,21.0,1.740780e+09,4.9,-6.9,-0.9,-5.9,71.300003,0.0,0.0,140.100006,...,0.659,0.560,0.639,0.610,0.601,0.572,0.756,10.0,0.0,0.0
518876,21.0,1.740780e+09,4.9,-6.9,-0.9,-5.9,71.300003,0.0,0.0,140.100006,...,0.659,0.560,0.639,0.610,0.601,0.572,0.756,10.0,0.0,0.0
518877,21.0,1.740780e+09,4.9,-6.9,-0.9,-5.9,71.300003,0.0,0.0,140.100006,...,0.659,0.560,0.639,0.610,0.601,0.572,0.756,10.0,0.0,0.0
518878,21.0,1.740780e+09,4.9,-6.9,-0.9,-5.9,71.300003,0.0,0.0,140.100006,...,0.659,0.560,0.639,0.610,0.601,0.572,0.756,10.0,0.0,0.0


In [27]:
X

Unnamed: 0,region_id,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,...,993,994,995,996,997,998,999,event_lastDay_region,ru_holiday,ua_holiday
0,1,1645740000,6.0,-1.0,2.6,-0.4,81.9,0.0,0.00,126.2,...,0.527,0.533,0.503,0.519,0.501,0.457,0.617,0,0,0
1,1,1645740000,6.0,-1.0,2.6,-0.4,81.9,0.0,0.00,126.2,...,0.527,0.533,0.503,0.519,0.501,0.457,0.617,0,0,0
2,1,1645740000,6.0,-1.0,2.6,-0.4,81.9,0.0,0.00,126.2,...,0.527,0.533,0.503,0.519,0.501,0.457,0.617,2,0,0
3,1,1645740000,6.0,-1.0,2.6,-0.4,81.9,0.0,0.00,126.2,...,0.527,0.533,0.503,0.519,0.501,0.457,0.617,2,0,0
4,1,1645740000,6.0,-1.0,2.6,-0.4,81.9,0.0,0.00,126.2,...,0.527,0.533,0.503,0.519,0.501,0.457,0.617,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
622651,25,1740780000,0.6,-2.4,-0.7,-2.8,86.3,2.0,4.17,41.0,...,0.659,0.560,0.639,0.610,0.601,0.572,0.756,6,0,0
622652,25,1740780000,0.6,-2.4,-0.7,-2.8,86.3,2.0,4.17,41.0,...,0.659,0.560,0.639,0.610,0.601,0.572,0.756,6,0,0
622653,25,1740780000,0.6,-2.4,-0.7,-2.8,86.3,2.0,4.17,41.0,...,0.659,0.560,0.639,0.610,0.601,0.572,0.756,6,0,0
622654,25,1740780000,0.6,-2.4,-0.7,-2.8,86.3,2.0,4.17,41.0,...,0.659,0.560,0.639,0.610,0.601,0.572,0.756,6,0,0


In [27]:
joblib.dump(scaler, '../../src/our_models/scaler_v2.pkl')

['../../src/our_models/scaler_v2.pkl']