In [1]:
from undersampling import  DatabaseTables, FetchDBData
import pandas as pd


from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score


from sklearn.ensemble import RandomForestClassifier 
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer


from imblearn.under_sampling import NearMiss
from imblearn.pipeline import Pipeline

from utils.undersampling import *

# Data Combination and Undersampling

In [2]:
# Undersample the daily weather table with NearMiss (version 3). A float
# sampling_strategy is the requested minority/majority ratio after resampling.
# `undersample_daily_weather_data` is a project helper defined outside this notebook.
under_sampler = NearMiss(sampling_strategy=0.3, version=3)
weather_data, has_incident = undersample_daily_weather_data(
    under_sampler,
    DatabaseTables.daily_weather_with_incidents,
)
# Store the labels returned by the helper as a column on the returned frame.
weather_data.loc[:, "has_incident"] = has_incident

Ratio Weather:
has_incident
False    1139
True      516
Name: count, dtype: int64


In [3]:
# Undersample the grid table with NearMiss (version 3); sampling_strategy=1
# requests a 1:1 minority/majority ratio after resampling.
# `undersample_grid_data` is a project helper defined outside this notebook.
under_sampler = NearMiss(sampling_strategy=1, version=3)
grid_data, has_incident = undersample_grid_data(under_sampler)
# Store the labels returned by the helper as a column on the returned frame.
grid_data.loc[:, "has_incident"] = has_incident

Ratio of Grids:
has_incident
0    1243
1    1243
Name: count, dtype: int64


In [4]:
# Load the `count_tree_incidents_per_grid_date` table through the project's
# FetchDBData helper.
incidents_fetcher = FetchDBData(DatabaseTables.count_tree_incidents_per_grid_date)
incidents_grid_date = incidents_fetcher.get_database_data()

In [5]:
# Keep only the text before the first space of `dt_iso` so it can be matched
# against the string `date` column of the incidents table.
# NOTE(review): presumably this strips a time-of-day suffix - confirm the raw format.
dt_iso_parts = weather_data["dt_iso"].astype(str).str.split(' ')
weather_data["dt_iso"] = dt_iso_parts.apply(lambda parts: parts[0])

In [6]:
# Cast the incident date to text so it is compared as a string with
# `weather_data["dt_iso"]` in the merges below.
incident_date_text = incidents_grid_date["date"].astype(str)
incidents_grid_date["date"] = incident_date_text

In [7]:
# Weather rows labelled as incident days, inner-joined to the incident rows
# that share the same date string.
weather_on_incident_days = weather_data.query("`has_incident` == 1")
weather_data_incidents = pd.merge(
    weather_on_incident_days,
    incidents_grid_date,
    left_on="dt_iso",
    right_on="date",
)

In [8]:
# Positive samples: incident-day weather joined to the grid in which the
# incident was recorded. The weather-level label and the incident bookkeeping
# columns are dropped first, so the only `has_incident` left is the grid's.
incident_weather_features = weather_data_incidents.drop(
    columns=[
        "count_incidents_y",
        "count_incidents_x",
        "max_incident_priority",
        "has_incident",
    ]
)
weather_grid_has_incident = incident_weather_features.merge(grid_data, on="grid_id")

In [9]:
# Negative samples: every grid labelled incident-free is paired (cross join)
# with every incident-day weather row. The grid-side id and label are kept;
# the weather-side duplicates and the incident bookkeeping columns are dropped.
grids_without_incident = grid_data.query("`has_incident` == 0")
weather_grid_no_incident = (
    grids_without_incident
    .merge(weather_data_incidents, how="cross")
    .drop(
        columns=[
            "count_incidents_y",
            "count_incidents_x",
            "max_incident_priority",
            "has_incident_y",
            "grid_id_y",
        ]
    )
    .rename(columns={"grid_id_x": "grid_id", "has_incident_x": "has_incident"})
)

In [10]:
# Weather rows labelled as incident-free, inner-joined to the incidents table
# on the date string.
# NOTE(review): the inner join keeps only incident-free days that have a row in
# `incidents_grid_date`; confirm the table contains such dates, otherwise this
# frame (and the cross join below) is empty.
weather_data_no_incidents = weather_data.query("`has_incident` == 0").merge(
    incidents_grid_date,
    left_on="dt_iso",
    right_on="date",
)

# Negative samples: every incident-free grid paired with every incident-free
# weather row. This must cross with `weather_data_no_incidents`; crossing with
# `weather_data_incidents` would reproduce `weather_grid_no_incident` exactly
# and duplicate those rows when the frames are concatenated.
weather_grid_weather_no_incidents = (
    grid_data.query("`has_incident` == 0")
    .merge(weather_data_no_incidents, how="cross")
    .drop(
        columns=[
            "count_incidents_y",
            "count_incidents_x",
            "max_incident_priority",
            "has_incident_y",
            "grid_id_y",
        ]
    )
    .rename(columns={"grid_id_x": "grid_id", "has_incident_x": "has_incident"})
)

In [11]:
# Restrict and reorder the columns to match the positive-sample frame so the
# frames line up when concatenated.
positive_sample_columns = weather_grid_has_incident.columns
weather_grid_weather_no_incidents = weather_grid_weather_no_incidents.loc[:, positive_sample_columns]

In [12]:
# Restrict and reorder the columns to match the positive-sample frame so the
# frames line up when concatenated.
positive_sample_columns = weather_grid_has_incident.columns
weather_grid_no_incident = weather_grid_no_incident.loc[:, positive_sample_columns]

In [13]:
# Stack the positive and negative sample frames row-wise, then remove the
# columns that are not used as model inputs.
sample_frames = [
    weather_grid_has_incident,
    weather_grid_weather_no_incidents,
    weather_grid_no_incident,
]
combined_data = pd.concat(sample_frames).drop(
    columns=["hour", "date", "total_incident_duration"]
)

In [14]:
# Drop the identifier and categorical columns so only numeric predictors and
# the target remain.
# NOTE: this rebinds `combined_data`; re-running the cell without re-running
# the concat cell raises KeyError because the columns are already gone.
non_feature_cols = ["grid_id", "weather_main", "dt_iso"]
combined_data = combined_data.drop(columns=non_feature_cols)

# Modeling

In [15]:
# Column roles used by the modelling cells.
target_col = 'has_incident'
exclude_cols = ["dt_iso", "grid_id"]
one_hot_cols = ["weather_main"]

# Every column except the target is a predictor; numeric columns are the
# predictors that are neither excluded nor one-hot encoded.
predictor_cols = [col for col in combined_data.columns if col != target_col]
non_numeric_cols = exclude_cols + one_hot_cols
numeric_cols = [col for col in predictor_cols if col not in non_numeric_cols]

X = combined_data.loc[:, predictor_cols]
y = combined_data.loc[:, target_col]

# Stratified 80/20 split with a fixed seed so the class ratio is kept in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [16]:
# Building blocks: encoders, a sampler, a variance filter and the classifier.
# NOTE(review): `under_sampler` is created here but is not one of the pipeline steps.
under_sampler = NearMiss(sampling_strategy=0.01, version=3)
one_hot_enc = OneHotEncoder()
power_trans_enc = PowerTransformer()
variance_dropout = VarianceThreshold()
model = RandomForestClassifier()

# One-hot encode the categorical columns, power-transform the numeric ones and
# pass every other column through unchanged.
# NOTE(review): `one_hot_cols` names "weather_main", which an earlier cell drops
# from `combined_data`; fitting this transformer on that data would fail - confirm intent.
column_transformations = ColumnTransformer(
    transformers=[
        ("one_hot_enc", one_hot_enc, one_hot_cols),
        ("power", power_trans_enc, numeric_cols),
    ],
    remainder="passthrough",
)

# Preprocessing -> variance filter -> classifier.
pipeline = Pipeline(
    steps=[
        ("column_transformations", column_transformations),
        ("variance_dropout", variance_dropout),
        ("model", model),
    ]
)

In [ ]:
# Hyper-parameter search for the random forest, scored by F1 with 3-fold CV.
# The fitted search object is bound to `model`, which the cells below read.
#
# A fresh, seeded estimator is built here instead of wrapping the existing
# `model`: that keeps the cell re-runnable (wrapping an already-fitted search
# would fail on the unknown `class_weight` parameter) and makes the scores
# reproducible.
# NOTE(review): the search runs on the bare classifier, not on `pipeline`;
# confirm that skipping the preprocessing steps is intended.
class_weight_options = [{False: 0.01, True: 0.99}, {False: 0.2, True: 0.8}]
n_estimators_options = [100, 200, 500]

model = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=123),
    param_distributions={
        "class_weight": class_weight_options,
        "n_estimators": n_estimators_options,
    },
    # The space is a finite grid; cap n_iter at its size so the default of 10
    # does not trigger a "space smaller than n_iter" warning.
    n_iter=len(class_weight_options) * len(n_estimators_options),
    n_jobs=5,
    cv=3,
    random_state=123,
    scoring="f1",
).fit(X_train, y_train)

In [ ]:
# Mean cross-validated F1 of the best parameter setting found by the search.
best_cv_score = model.best_score_
print(best_cv_score)

In [ ]:
# Evaluate the refitted best estimator on the held-out test split.
# `predict` needs the feature matrix; calling it without arguments raises TypeError.
y_pred = model.best_estimator_.predict(X_test)

# Put actual and predicted labels side by side. `y_test` keeps the row labels
# of the concatenated frame, so its index is reset to line up positionally
# with the prediction array.
data = pd.DataFrame(
    {
        "actual": y_test.reset_index(drop=True),
        "predicted": y_pred,
    },
)

# Confusion table followed by F1, precision and recall for the positive class.
print(pd.crosstab(data.actual, data.predicted))
print(f1_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))