In [1]:
import pickle

import joblib
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc, precision_recall_curve, confusion_matrix
from sklearn.metrics import classification_report

In [2]:
df = pd.read_parquet('../../data/all_data_preprocessed/all_merged_tfidf.parquet')

In [3]:
X = df.drop(columns=[
    'event_all_region', 'alarms_in_regions', 'event_1h_ago',
    'event_2h_ago'])
y = df['event_all_region']
X.rename(columns={"event_1h_ago": "status"}, inplace=True)

In [4]:
tscv = TimeSeriesSplit(n_splits=5)
splits = list(tscv.split(X))

In [5]:
train_idx, test_idx = splits[-1]
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

In [6]:
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [7]:
scaler = joblib.load('../our_models/scaler_v2.pkl')

In [8]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced'
)

In [10]:
# param = {
#     'n_estimators': [100, 200],
#     'max_depth': [10, 20, None],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2],
#     'max_features': ['sqrt', 'log2', None]
# }

In [11]:
# random_search = RandomizedSearchCV(
#     rf_model,
#     param_distributions=param,
#     n_iter=10,
#     cv=tscv,
#     scoring='f1_weighted',
#     random_state=42,
#     n_jobs=-1,
#     verbose=2
# )

In [12]:
# random_search.fit(X_train_scaled, y_train)

In [13]:
# best_rf = random_search.best_estimator_
best_rf = rf_model

In [14]:
best_rf.fit(X_train_scaled, y_train)

In [15]:
y_pred = best_rf.predict(X_test_scaled)

In [16]:
ac = accuracy_score(y_test, y_pred)

In [17]:
ac

0.8957562442183163

In [18]:
f1 = f1_score(y_test, y_pred, average='weighted')
f1

0.884008658328307

In [19]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[84153,  1601],
       [ 9217,  8805]])

In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.90      0.98      0.94     85754
         1.0       0.85      0.49      0.62     18022

    accuracy                           0.90    103776
   macro avg       0.87      0.73      0.78    103776
weighted avg       0.89      0.90      0.88    103776



In [22]:
with open('../our_models/3__RF__v2.pkl', 'wb') as f:
    pickle.dump(best_rf, f)

FileNotFoundError: [Errno 2] No such file or directory: '../../our_models/3__RF__v2.pkl'