In [2]:
from undersampling import  DatabaseTables, FetchDBData
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import matplotlib.pyplot as plt

import lightgbm as lgb

import numpy as np

import joblib

# DATA Combination and Undersampling

In [3]:
weather_data = FetchDBData(DatabaseTables.full_sampled_weather_with_incidents).get_database_data()

# Modeling

In [7]:
exclude_cols = [
    "dt_iso",
    "grid_id",
    "count_incidents",
    "weather_main"
]

target_col = 'has_incident'

predictor_cols =[col for col in weather_data.columns if col not in [target_col] + exclude_cols]
one_hot_cols = ["weather_main"]
numeric_cols = [col for col in predictor_cols if col not in exclude_cols + one_hot_cols]

X = weather_data[predictor_cols]
y = weather_data[target_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [27]:
boundary = 0.5

params = {
    'boosting_type': 'dart',
    'num_boost_round': 15_000,
    'metric': 'binary_error',
    'reg_alpha': 5,
    'reg_lambda': 100,
    'verbose': 0
}

def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.where(y_hat < boundary, 0, 1)
    return 'f1', f1_score(y_true, y_hat), True

model = lgb.train(
    params, 
    lgb.Dataset(X_train, label=y_train), 
    feval=lgb_f1_score
)

y_pred = model.predict(X_test, num_iteration=model.best_iteration)

data = pd.DataFrame(
    {
        "actual": y_test.reset_index().drop("index", axis=1)["has_incident"],
        "predicted": np.where(y_pred <  boundary, 0, 1),
    }, 
)

print(pd.crosstab(data.actual, data.predicted))

f1 = f1_score(y_test, np.where(y_pred < boundary, 0, 1))
recall = recall_score(y_test, np.where(y_pred < boundary, 0, 1))
precision = precision_score(y_test, np.where(y_pred < boundary, 0, 1))
accuracy = accuracy_score(y_test, np.where(y_pred < boundary, 0, 1))
print(f"="*100)
print(f"Accuracy in testing: {accuracy}")
print(f"F1 Score in testing: {f1}")
print(f"Recall Score in testing: {recall}")
print(f"Precision in testing: {precision}")
print(f"="*100) 

predicted    0   1
actual            
False      935  13
True        70  33
Accuracy in testing: 0.9210275927687916
F1 Score in testing: 0.4429530201342282
Recall Score in testing: 0.32038834951456313
Precision in testing: 0.717391304347826


In [31]:
y_pred = model.predict(X_test)
print(pd.crosstab(data.actual, data.predicted))

f1 = f1_score(y_test, np.where(y_pred < 0.2, 0, 1))
recall = recall_score(y_test, np.where(y_pred < 0.2, 0, 1))
precision = precision_score(y_test, np.where(y_pred < 0.2, 0, 1))
accuracy = accuracy_score(y_test, np.where(y_pred < 0.2, 0, 1))
print(f"="*100)
print(f"Accuracy in testing: {accuracy}")
print(f"F1 Score in testing: {f1}")
print(f"Recall Score in testing: {recall}")
print(f"Precision in testing: {precision}")
print(f"="*100) 

predicted    0   1
actual            
False      935  13
True        70  33
Accuracy in testing: 0.8877259752616555
F1 Score in testing: 0.49572649572649574
Recall Score in testing: 0.5631067961165048
Precision in testing: 0.44274809160305345


In [22]:
`# joblib.dump(model, f"lightgbm_model_{f1:.2f}.joblib")