In [13]:
import pickle

import joblib
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc, precision_recall_curve, confusion_matrix
from sklearn.metrics import classification_report

In [14]:
# Read the parquet file into the working frame `df` used by the cells below.
MERGED_DATA_PATH = '../../data/all_data_preprocessed/all_merged.parquet'
df = pd.read_parquet(MERGED_DATA_PATH)

In [15]:
# Feature matrix: every column of `df` except the target and three columns
# that are excluded together with it.
# NOTE(review): presumably the excluded columns would leak the label — confirm.
X = df.drop(columns=[
    'event_all_region', 'alarms_in_regions', 'event_1h_ago',
    'event_2h_ago'])

# Target vector.
y = df['event_all_region']

# The original cell ended with an in-place rename of 'event_1h_ago' to
# 'status'. That column is already removed by the drop above, and
# DataFrame.rename ignores labels it cannot find by default, so the call never
# changed anything and has been removed.
# NOTE(review): if the intent was to KEEP 'event_1h_ago' as a feature named
# 'status', take it out of the drop list and rename it there instead.

In [16]:
# Ordered 5-fold splitter: each fold's test indices come after its training
# indices, so no later row is used to predict an earlier one.
tscv = TimeSeriesSplit(n_splits=5)

# Materialise the (train_indices, test_indices) pairs so they can be indexed.
splits = [fold for fold in tscv.split(X)]

In [17]:
# Use only the final fold for the train/test partition.
train_idx, test_idx = splits[-1]

# Positional selection of the feature rows for each side of the split.
X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]

# Matching target rows.
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

In [18]:
# Cast both feature partitions to 32-bit floats.
X_train, X_test = (
    features.astype('float32') for features in (X_train, X_test)
)

In [19]:
# Load the previously saved scaler object from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code, so
# only load artifacts that come from a trusted source.
scaler = joblib.load('../our_models/scaler_v1.pkl')

In [20]:
# Apply the persisted scaler exactly as it was loaded. The original cell called
# fit_transform on the training fold, which re-estimated the scaler's
# statistics and overwrote the loaded ones in memory; because the refit scaler
# is not written back in the visible cells, the saved model would then be
# paired with a scaler that differs from the file on disk.
# NOTE(review): this assumes scaler_v1.pkl was fitted on the same feature
# columns and without test-period rows — confirm. If a fresh scaler is wanted
# instead, create and persist a new one rather than refitting this artifact.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [21]:
# Hand-configured random forest used when the hyper-parameter search is off.
rf_model = RandomForestClassifier(
    # Re-weight classes inversely to their frequency in the training labels.
    class_weight='balanced',
    # Use every available core; the seed keeps the forest reproducible.
    n_jobs=-1,
    random_state=42,
    # Tree count and growth limits (trees are grown without a depth cap).
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
)

In [22]:
# Disabled: candidate hyper-parameter grid for a randomized search over the
# random forest. Left commented out; nothing in this cell runs.
# param = {
#     'n_estimators': [100, 200],
#     'max_depth': [10, 20, None],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2],
#     'max_features': ['sqrt', 'log2', None]
# }

In [23]:
# Disabled: randomized hyper-parameter search (10 sampled candidates, scored
# by weighted F1, cross-validated with the time-series splitter). Left
# commented out; nothing in this cell runs.
# random_search = RandomizedSearchCV(
#     rf_model,
#     param_distributions=param,
#     n_iter=10,
#     cv=tscv,
#     scoring='f1_weighted',
#     random_state=42,
#     n_jobs=-1,
#     verbose=2
# )

In [24]:
# Disabled: would run the randomized search on the scaled training fold.
# random_search.fit(X_train_scaled, y_train)

In [25]:
# With the search disabled, the hand-configured forest is used directly.
# `best_rf` is an alias for the same object as `rf_model`, not a copy.
# best_rf = random_search.best_estimator_
best_rf = rf_model

In [27]:
# Train the forest on the scaled training fold.
best_rf.fit(X_train_scaled, y_train)

In [28]:
# Hard class predictions for the scaled test fold.
y_pred = best_rf.predict(X_test_scaled)

In [29]:
# Fraction of test rows whose predicted label equals the true label.
ac = accuracy_score(y_test, y_pred)

In [30]:
# Show the accuracy as the cell output.
ac

0.898059281529448

In [31]:
# F1 averaged across classes with each class weighted by its number of test
# rows, so the class with more rows dominates this single figure; check the
# per-class report for the less frequent class.
f1 = f1_score(y_test, y_pred, average='weighted')
f1

0.8873141204738819

In [32]:
# Confusion matrix for the test fold; in scikit-learn's layout rows are the
# true labels and columns are the predicted labels.
cm = confusion_matrix(y_test, y_pred)
cm

array([[84084,  1670],
       [ 8909,  9113]])

In [33]:
# Per-class precision, recall, F1 and support; the report is a plain string,
# so it is printed to keep its column alignment.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.90      0.98      0.94     85754
         1.0       0.85      0.51      0.63     18022

    accuracy                           0.90    103776
   macro avg       0.87      0.74      0.79    103776
weighted avg       0.89      0.90      0.89    103776



In [34]:
# Persist the fitted forest to disk with the standard-library pickler.
model_path = '../our_models/3__RF__v1.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(best_rf, model_file)