In [32]:
from undersampling import  DatabaseTables, FetchDBData
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score


from lightgbm import LGBMClassifier 
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbalancedPipeline

# Data Combination and Undersampling

In [2]:
# Load the `full_sampled_weather_with_incidents` table through the project's
# FetchDBData helper.
# NOTE(review): presumably this returns a pandas DataFrame (later cells read
# `.columns` and select columns by name) -- confirm against `undersampling`.
weather_data = (
    FetchDBData(DatabaseTables.full_sampled_weather_with_incidents)
    .get_database_data()
)

# Modeling

In [3]:
# Column roles used by every modelling cell below.
target_col = "has_incident"
exclude_cols = ["dt_iso", "grid_id", "count_incidents"]  # never fed to the model
one_hot_cols = ["weather_main"]

# Predictors are every column that is neither the target nor explicitly excluded;
# the original column order of `weather_data` is preserved.
non_predictors = [target_col, *exclude_cols]
predictor_cols = [name for name in weather_data.columns if name not in non_predictors]

# `predictor_cols` already omits `exclude_cols`, so only the one-hot columns
# still need to be filtered out here.
numeric_cols = [name for name in predictor_cols if name not in one_hot_cols]

X, y = weather_data[predictor_cols], weather_data[target_col]

# 80/20 hold-out split, stratified on the target so both parts keep the same
# class balance; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)

In [4]:
# Experiment: DART-boosted LightGBM with L1/L2 regularisation (reg_alpha /
# reg_lambda = 10) and a 0.05 / 0.95 class weighting, fitted on the training
# split with `weather_main` one-hot encoded.

# handle_unknown="ignore": the train/test split is stratified on the target
# only, so a rare `weather_main` value can land exclusively in X_test. With the
# default ("error") predict() would raise on such a row; with "ignore" it is
# encoded as all zeros instead.
one_hot_enc = OneHotEncoder(handle_unknown="ignore")

col_trans = ColumnTransformer(
    [("one_hot_encoder", one_hot_enc, one_hot_cols)],
    remainder="passthrough",  # all other predictors reach the model unchanged
)

model = LGBMClassifier(
    metric='binary_error',
    boosting_type='dart',
    class_weight={False: 0.05, True: 0.95},
    n_estimators=10_000,
    verbose=-1,
    reg_alpha=10,
    reg_lambda=10,
)

model_pipeline = Pipeline(
    [
        ("onehot_encoder", col_trans),
        ("model", model),
    ]
)

model_pipeline.fit(X_train, y_train)

y_pred = model_pipeline.predict(X_test)

# reset_index(drop=True) lines the true labels up positionally with y_pred
# without relying on the old index column being called "index" or on the
# target column's literal name.
data = pd.DataFrame(
    {
        "actual": y_test.reset_index(drop=True),
        "predicted": y_pred,
    },
)

# Confusion table followed by F1, precision and recall for the positive class.
print(pd.crosstab(data.actual, data.predicted))
print(f1_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

predicted  False  True 
actual                 
False        596    352
True          20     83
0.30855018587360594
0.19080459770114944
0.8058252427184466


In [5]:
# Experiment: same DART-boosted LightGBM and 0.05 / 0.95 class weighting as the
# regularised run, but without reg_alpha / reg_lambda.

# handle_unknown="ignore": the train/test split is stratified on the target
# only, so a `weather_main` value seen only in X_test would otherwise make
# predict() raise; unseen categories are encoded as all zeros instead.
one_hot_enc = OneHotEncoder(handle_unknown="ignore")

col_trans = ColumnTransformer(
    [("one_hot_encoder", one_hot_enc, one_hot_cols)],
    remainder="passthrough",  # all other predictors reach the model unchanged
)

model = LGBMClassifier(
    metric='binary_error',
    boosting_type='dart',
    class_weight={False: 0.05, True: 0.95},
    n_estimators=10_000,
    verbose=-1,
)

model_pipeline = Pipeline(
    [
        ("onehot_encoder", col_trans),
        ("model", model),
    ]
)

model_pipeline.fit(X_train, y_train)

y_pred = model_pipeline.predict(X_test)

# Drop the original row index so the true labels align positionally with
# y_pred; this avoids hard-coding the "index" and target column names.
data = pd.DataFrame(
    {
        "actual": y_test.reset_index(drop=True),
        "predicted": y_pred,
    },
)

# Confusion table followed by F1, precision and recall for the positive class.
print(pd.crosstab(data.actual, data.predicted))
print(f1_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

predicted  False  True 
actual                 
False        916     32
True          63     40
0.45714285714285713
0.5555555555555556
0.3883495145631068


In [13]:
# Experiment: plain GBDT boosting with 50,000 trees and a much more extreme
# class weighting (0.0001 / 0.9999) in favour of the positive class.

# handle_unknown="ignore": the train/test split is stratified on the target
# only, so a `weather_main` value seen only in X_test would otherwise make
# predict() raise; unseen categories are encoded as all zeros instead.
one_hot_enc = OneHotEncoder(handle_unknown="ignore")

col_trans = ColumnTransformer(
    [("one_hot_encoder", one_hot_enc, one_hot_cols)],
    remainder="passthrough",  # all other predictors reach the model unchanged
)

model = LGBMClassifier(
    metric='binary_error',
    boosting_type='gbdt',
    class_weight={False: 0.0001, True: 0.9999},
    n_estimators=50_000,
    verbose=-1,
)

model_pipeline = Pipeline(
    [
        ("onehot_encoder", col_trans),
        ("model", model),
    ]
)

model_pipeline.fit(X_train, y_train)

y_pred = model_pipeline.predict(X_test)

# Drop the original row index so the true labels align positionally with
# y_pred; this avoids hard-coding the "index" and target column names.
data = pd.DataFrame(
    {
        "actual": y_test.reset_index(drop=True),
        "predicted": y_pred,
    },
)

# Confusion table followed by F1, precision and recall for the positive class.
print(pd.crosstab(data.actual, data.predicted))
print(f1_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

predicted  False  True 
actual                 
False        871     77
True          52     51
0.44155844155844154
0.3984375
0.49514563106796117


In [16]:
# Experiment: GBDT with the 0.0001 / 0.9999 class weighting, now with every
# numeric predictor min-max scaled in addition to the one-hot encoding.

# handle_unknown="ignore": the train/test split is stratified on the target
# only, so a `weather_main` value seen only in X_test would otherwise make
# predict() raise; unseen categories are encoded as all zeros instead.
one_hot_enc = OneHotEncoder(handle_unknown="ignore")
min_max_scaler = MinMaxScaler()

col_trans = ColumnTransformer(
    [
        ("one_hot_encoder", one_hot_enc, one_hot_cols),
        ("min_max_scaler", min_max_scaler, numeric_cols),
    ],
    remainder="passthrough",
)

model = LGBMClassifier(
    metric='binary_error',
    boosting_type='gbdt',
    class_weight={False: 0.0001, True: 0.9999},
    n_estimators=50_000,
    verbose=-1,
)

# The step keeps its historical name "onehot_encoder" even though the
# transformer now also scales, so `named_steps` lookups keep working.
model_pipeline = Pipeline(
    [
        ("onehot_encoder", col_trans),
        ("model", model),
    ]
)

model_pipeline.fit(X_train, y_train)

y_pred = model_pipeline.predict(X_test)

# Drop the original row index so the true labels align positionally with
# y_pred; this avoids hard-coding the "index" and target column names.
data = pd.DataFrame(
    {
        "actual": y_test.reset_index(drop=True),
        "predicted": y_pred,
    },
)

# Confusion table followed by F1, precision and recall for the positive class.
print(pd.crosstab(data.actual, data.predicted))
print(f1_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

predicted  False  True 
actual                 
False        834    114
True          42     61
0.43884892086330934
0.3485714285714286
0.5922330097087378


In [23]:
# Experiment: GBDT with min-max scaled numerics, preceded by NearMiss
# under-sampling of the majority class (sampling_strategy=0.3, i.e. a target
# minority/majority ratio of 0.3 after resampling).

# handle_unknown="ignore": the train/test split is stratified on the target
# only, so a `weather_main` value seen only in X_test would otherwise make
# predict() raise; unseen categories are encoded as all zeros instead.
one_hot_enc = OneHotEncoder(handle_unknown="ignore")
min_max_scaler = MinMaxScaler()
undersampling = NearMiss(sampling_strategy=0.3)

col_trans = ColumnTransformer(
    [
        ("one_hot_encoder", one_hot_enc, one_hot_cols),
        ("min_max_scaler", min_max_scaler, numeric_cols),
    ],
    remainder="passthrough",
)

model = LGBMClassifier(
    metric='binary_error',
    boosting_type='gbdt',
    class_weight={False: 0.0001, True: 0.9999},
    n_estimators=50_000,
    verbose=-1,
)

# imblearn's Pipeline is required here: it applies the sampler while fitting
# only, so X_test is scored without being resampled.
model_pipeline = ImbalancedPipeline(
    [
        ("encoding", col_trans),
        ("undersampling", undersampling),
        ("model", model),
    ]
)

model_pipeline.fit(X_train, y_train)

y_pred = model_pipeline.predict(X_test)

# Drop the original row index so the true labels align positionally with
# y_pred; this avoids hard-coding the "index" and target column names.
data = pd.DataFrame(
    {
        "actual": y_test.reset_index(drop=True),
        "predicted": y_pred,
    },
)

# Confusion table followed by F1, precision and recall for the positive class.
print(pd.crosstab(data.actual, data.predicted))
print(f1_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

predicted  False  True 
actual                 
False        526    422
True          30     73
0.24414715719063546
0.14747474747474748
0.7087378640776699


In [26]:
# Experiment: GBDT with power-transformed numerics, preceded by SMOTE
# over-sampling of the minority class (sampling_strategy=0.3, i.e. a target
# minority/majority ratio of 0.3 after resampling).

# handle_unknown="ignore": the train/test split is stratified on the target
# only, so a `weather_main` value seen only in X_test would otherwise make
# predict() raise; unseen categories are encoded as all zeros instead.
one_hot_enc = OneHotEncoder(handle_unknown="ignore")
power_trans = PowerTransformer()
# SMOTE draws synthetic samples at random; seeding it (same seed as the
# train/test split) makes the metrics printed below repeatable across runs.
# NOTE(review): SMOTE interpolates between neighbours, so the one-hot columns
# of synthetic rows can take fractional values -- consider SMOTENC if that
# matters.
oversampling = SMOTE(sampling_strategy=0.3, random_state=123)

col_trans = ColumnTransformer(
    [
        ("one_hot_encoder", one_hot_enc, one_hot_cols),
        ("power_trans", power_trans, numeric_cols),
    ],
    remainder="passthrough",
)

model_2 = LGBMClassifier(
    metric='binary_error',
    boosting_type='gbdt',
    class_weight={False: 0.0001, True: 0.9999},
    n_estimators=50_000,
    verbose=-1,
)

# imblearn's Pipeline is required here: it applies the sampler while fitting
# only, so X_test is scored without being resampled.
model_pipeline_2 = ImbalancedPipeline(
    [
        ("encoding", col_trans),
        ("oversampling", oversampling),
        ("model", model_2),
    ]
)

model_pipeline_2.fit(X_train, y_train)

y_pred = model_pipeline_2.predict(X_test)

# Drop the original row index so the true labels align positionally with
# y_pred; this avoids hard-coding the "index" and target column names.
data = pd.DataFrame(
    {
        "actual": y_test.reset_index(drop=True),
        "predicted": y_pred,
    },
)

# Confusion table followed by F1, precision and recall for the positive class.
print(pd.crosstab(data.actual, data.predicted))
print(f1_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

predicted  False  True 
actual                 
False        847    101
True          44     59
0.44866920152091255
0.36875
0.5728155339805825


In [31]:
# Experiment: GBDT with power-transformed numerics and a milder 0.2 / 0.8 class
# weighting, trained on data that is first SMOTE over-sampled (ratio 0.3) and
# then NearMiss under-sampled (ratio 0.5).

# handle_unknown="ignore": the train/test split is stratified on the target
# only, so a `weather_main` value seen only in X_test would otherwise make
# predict() raise; unseen categories are encoded as all zeros instead.
one_hot_enc = OneHotEncoder(handle_unknown="ignore")
power_trans = PowerTransformer()
# SMOTE draws synthetic samples at random; seeding it (same seed as the
# train/test split) makes the metrics printed below repeatable across runs.
oversampling = SMOTE(sampling_strategy=0.3, random_state=123)
undersampling = NearMiss(sampling_strategy=0.5)

col_trans = ColumnTransformer(
    [
        ("one_hot_encoder", one_hot_enc, one_hot_cols),
        ("power_trans", power_trans, numeric_cols),
    ],
    remainder="passthrough",
)

# NOTE(review): this rebinds `model_2`, a name the SMOTE-only experiment also
# assigns; kept as-is so any later reference to `model_2` still resolves.
model_2 = LGBMClassifier(
    metric='binary_error',
    boosting_type='gbdt',
    class_weight={False: 0.2, True: 0.8},
    n_estimators=50_000,
    verbose=-1,
)

# imblearn's Pipeline is required here: it applies the samplers while fitting
# only, so X_test is scored without being resampled.
model_pipeline_3 = ImbalancedPipeline(
    [
        ("encoding", col_trans),
        ("oversampling", oversampling),
        ("undersampling", undersampling),
        ("model", model_2),
    ]
)

model_pipeline_3.fit(X_train, y_train)

y_pred = model_pipeline_3.predict(X_test)

# Drop the original row index so the true labels align positionally with
# y_pred; this avoids hard-coding the "index" and target column names.
data = pd.DataFrame(
    {
        "actual": y_test.reset_index(drop=True),
        "predicted": y_pred,
    },
)

# Confusion table followed by F1, precision and recall for the positive class.
print(pd.crosstab(data.actual, data.predicted))
print(f1_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

predicted  False  True 
actual                 
False        876     72
True          49     54
0.47161572052401746
0.42857142857142855
0.5242718446601942
