In [91]:
from undersampling import  DatabaseTables, FetchDBData
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score


from lightgbm import LGBMClassifier 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Data Combination and Undersampling

In [92]:
# Fetch the dataset registered as `DatabaseTables.daily_weather_with_incidents`.
# NOTE(review): later cells use `.columns` and column-list indexing on the result,
# so it is presumably a pandas DataFrame — confirm against `FetchDBData`.
weather_data = (
    FetchDBData(DatabaseTables.daily_weather_with_incidents)
    .get_database_data()
)

# Modeling

In [93]:
# Label the classifier is trained to predict.
target_col = 'has_incident'

# Columns kept out of the feature matrix.
# NOTE(review): presumably `dt_iso`/`grid_id` are identifiers and `count_incidents`
# would leak the label — confirm against the table definition.
exclude_cols = ["dt_iso", "grid_id", "count_incidents"]

# Everything that is neither the label nor explicitly excluded is a predictor;
# the original column order of `weather_data` is preserved.
predictor_cols = [
    col
    for col in weather_data.columns
    if col != target_col and col not in exclude_cols
]

# Categorical predictors that get one-hot encoded; the remaining predictors are
# passed to the model untouched.
one_hot_cols = ["weather_main"]
numeric_cols = [col for col in predictor_cols if col not in one_hot_cols]

X = weather_data[predictor_cols]
y = weather_data[target_col]

# 80/20 hold-out split, stratified on the label so both folds keep the same
# class ratio; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)

In [94]:
# Experiment: DART-boosted LightGBM with heavy regularisation
# (reg_alpha=10, reg_lambda=10) and 10_000 boosting rounds.

# handle_unknown="ignore": the train/test split is stratified on the label only,
# so a rare `weather_main` level may exist solely in the test fold. With the
# default ("error") predict() would raise on it; with "ignore" such a level is
# encoded as an all-zero row and prediction proceeds.
one_hot_enc = OneHotEncoder(handle_unknown="ignore")

# One-hot encode the categorical columns; every other predictor is forwarded
# to the model unchanged.
col_trans = ColumnTransformer(
    [("one_hot_encoder", one_hot_enc, one_hot_cols)],
    remainder="passthrough"
)

model = LGBMClassifier(
    metric='binary_error',
    boosting_type='dart',
    # The True class is weighted 19x heavier than the False class.
    class_weight={False: 0.05, True: 0.95},
    n_estimators=10_000,
    verbose=-1,
    reg_alpha=10,
    reg_lambda=10
)

model_pipeline = Pipeline(
    [
        ("onehot_encoder", col_trans),
        ("model", model)
    ]
)

model_pipeline.fit(X_train, y_train)

y_pred = model_pipeline.predict(X_test)

# Align labels and predictions positionally. `reset_index(drop=True)` discards
# the original row labels regardless of what the index is called, and avoids
# hard-coding the target column name.
data = pd.DataFrame(
    {
        "actual": y_test.reset_index(drop=True),
        "predicted": y_pred,
    },
)

# Confusion matrix (rows = actual, columns = predicted), then F1, precision
# and recall for the positive class.
print(pd.crosstab(data.actual, data.predicted))
print(f1_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

predicted  False  True 
actual                 
False        487    556
True          16     87
0.23324396782841822
0.13530326594090203
0.8446601941747572


In [95]:
# Experiment: same DART-boosted LightGBM pipeline, but with lighter
# regularisation (reg_alpha=1, reg_lambda=3) and 100_000 boosting rounds.

# handle_unknown="ignore": the train/test split is stratified on the label only,
# so a rare `weather_main` level may exist solely in the test fold. With the
# default ("error") predict() would raise on it; with "ignore" such a level is
# encoded as an all-zero row and prediction proceeds.
one_hot_enc = OneHotEncoder(handle_unknown="ignore")

# One-hot encode the categorical columns; every other predictor is forwarded
# to the model unchanged.
col_trans = ColumnTransformer(
    [("one_hot_encoder", one_hot_enc, one_hot_cols)],
    remainder="passthrough"
)

model = LGBMClassifier(
    metric='binary_error',
    boosting_type='dart',
    # The True class is weighted 19x heavier than the False class.
    class_weight={False: 0.05, True: 0.95},
    n_estimators=100_000,
    verbose=-1,
    reg_alpha=1,
    reg_lambda=3
)

model_pipeline = Pipeline(
    [
        ("onehot_encoder", col_trans),
        ("model", model)
    ]
)

model_pipeline.fit(X_train, y_train)

y_pred = model_pipeline.predict(X_test)

# Align labels and predictions positionally. `reset_index(drop=True)` discards
# the original row labels regardless of what the index is called, and avoids
# hard-coding the target column name.
data = pd.DataFrame(
    {
        "actual": y_test.reset_index(drop=True),
        "predicted": y_pred,
    },
)

# Confusion matrix (rows = actual, columns = predicted), then F1, precision
# and recall for the positive class.
print(pd.crosstab(data.actual, data.predicted))
print(f1_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

predicted  False  True 
actual                 
False        810    233
True          38     65
0.32418952618453867
0.2181208053691275
0.6310679611650486
