# Instructions

1. Run all cells in order, from top to bottom.

In [1]:
from collections import defaultdict

import pandas as pd
import numpy as np

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import GradientBoostingClassifier as Classifier
from sklearn.ensemble import GradientBoostingRegressor as Regressor

from sklearn.metrics import ndcg_score

from dataset import WorksitesDataset
from run import build_datasets

class DatasetConfig:
    """Configuration object passed to ``build_datasets`` in this notebook."""

    # Path to the worksites CSV (relative to the notebook's working directory).
    dataset_path = "./datasets/worksites.csv"

    # Value substituted for missing regression labels further down in this cell.
    # NOTE(review): 1460 == 4 * 365, presumably "four years in days" -- confirm.
    fill_missing_regression = 1460
    
# Label columns requested from the dataset builder. The code below reads the
# labels back by position: column 0 feeds the classifier target and column 1
# feeds the regressor target.
LABEL_COLUMNS = ["FUTURE_TOTAL_COUNT", "DAYS_UNTIL_NEXT_ACCIDENT"]

train_ds, val_ds, test_ds = build_datasets(DatasetConfig, LABEL_COLUMNS)

# The train and validation splits are stacked into a single fitting set.
train_features = pd.concat(
    [train_ds._features_df, val_ds._features_df]
).to_numpy()
train_labels = pd.concat([train_ds._labels_df, val_ds._labels_df])

# Binary target: 1 where the first label column is strictly positive, else 0.
train_classifier_labels = train_labels.iloc[:, 0].gt(0).astype(int).to_numpy()

# Regression target: second label column, with missing values replaced by the
# configured fill value.
train_regressor_labels = (
    train_labels.iloc[:, 1]
    .fillna(DatasetConfig.fill_missing_regression)
    .to_numpy()
)

test_features = test_ds._features_df.to_numpy()

# Columns needed to group and score the rankings, plus a REGION column derived
# from REGION_ID through the WorksitesDataset.REGIONAL_OFFICES mapping.
test_relevances = test_ds._df[
    ["YEAR", "MONTH", "REGION_ID", "ACCIDENTS_TWELVE_MONTHS"]
].assign(
    REGION=lambda frame: frame["REGION_ID"].map(WorksitesDataset.REGIONAL_OFFICES)
)

print(
    "Size of train/test:",
    len(train_ds) + len(val_ds),
    len(test_ds),
    end="\n\n",
)

Size of train/test: 153213 38303



In [2]:
def compute_ranking_metrics(relevances, criticality):
    """Score a criticality ranking with NDCG, averaged per region.

    Rows of ``relevances`` are grouped by ``(REGION, YEAR, MONTH)``. Inside
    each group, ``criticality`` is used as the ranking score and
    ``ACCIDENTS_TWELVE_MONTHS`` as the ground-truth relevance.

    Parameters
    ----------
    relevances : pandas.DataFrame
        Must contain the columns ``REGION``, ``YEAR``, ``MONTH`` and
        ``ACCIDENTS_TWELVE_MONTHS``. The frame itself is not modified.
    criticality : array-like
        One score per row of ``relevances``, in the same row order; a higher
        score means the row should be ranked earlier. A pandas Series is
        matched to rows by position, not by index label.

    Returns
    -------
    mean_ndcg : float
        Unweighted mean of the per-region means, or ``nan`` when no group
        could be scored.
    mean_ndcgs : dict
        Maps ``"<REGION>_NDCG"`` to the mean NDCG over that region's scored
        ``(YEAR, MONTH)`` groups.
    """
    # Assigning a Series to a DataFrame column aligns on index labels, which
    # silently scrambles (or NaN-fills) the scores when the two indexes differ.
    # Model predictions are positional, so drop the index first.
    if isinstance(criticality, pd.Series):
        criticality = criticality.to_numpy()

    relevances = relevances.copy()
    relevances["CRITICALITY"] = criticality

    ndcgs = defaultdict(list)

    for region_year_month, group_df in relevances.groupby(["REGION", "YEAR", "MONTH"]):
        true_relevance = group_df["ACCIDENTS_TWELVE_MONTHS"]
        # Groups without any accident are skipped instead of being scored.
        if true_relevance.sum() == 0:
            continue
        if len(group_df) > 1:
            score = ndcg_score(
                y_true=[true_relevance.to_numpy()],
                y_score=[group_df["CRITICALITY"].to_numpy()],
            )
        else:
            # Only one candidate: any ranking of it is trivially perfect.
            score = 1.0
        region = region_year_month[0]
        ndcgs[region].append(score)

    mean_ndcgs = {
        f"{region}_NDCG": np.mean(scores) for region, scores in ndcgs.items()
    }

    # np.mean([]) would emit a RuntimeWarning before returning nan; return the
    # same nan explicitly when nothing was scored.
    mean_ndcg = np.mean(list(mean_ndcgs.values())) if mean_ndcgs else np.nan
    return mean_ndcg, mean_ndcgs

# `GradientBoostingClassifier`

In [3]:
# Fit the classifier on the merged train+validation features against the
# binary target built in the first cell.
X = train_features
y = train_classifier_labels
clf = Classifier(random_state=42)
clf.fit(X, y)
# Score on the same data the model was fitted on (a training score, not a
# held-out estimate).
clf.score(X, y)

0.9666868999366894

In [4]:
# Use column 1 of predict_proba as the criticality score for ranking.
# NOTE(review): assumes column 1 corresponds to the positive class -- confirm via clf.classes_.
class_probabilities = clf.predict_proba(test_features)
test_predicted = class_probabilities[:, 1]
compute_ranking_metrics(test_relevances, test_predicted)

(0.8077466928989031,
 {'ANTOFAGASTA_NDCG': 0.6972892526742207,
  'ARICA_NDCG': 1.0,
  'ATACAMA_NDCG': 0.9445915959086405,
  'CENTRO_NDCG': 0.7844571514621748,
  'COQUIMBO_NDCG': 0.962655701131645,
  'MAGALLANES_NDCG': 0.8067428466747268,
  'MAULE_NDCG': 0.5739304356045668,
  "O'HIGGINS_NDCG": 0.9985106858769003,
  'SUR_NDCG': 0.6014953415798711,
  'TARAPACA_NDCG': 0.7077939180762846})

# `GradientBoostingRegressor`

In [5]:
# Fit the regressor on the same features against the regression target built
# in the first cell (second label column, missing values filled).
X = train_features
y = train_regressor_labels
est = Regressor(random_state=42)
est.fit(X, y)
# Score on the same data the model was fitted on (a training score, not a
# held-out estimate).
est.score(X, y)

0.6684511491963749

In [6]:
# compute_ranking_metrics treats larger scores as more critical, so the
# regressor output is negated before ranking.
# NOTE(review): presumably the target is days until the next accident, so a
# smaller prediction should rank higher -- confirm against the label column order.
test_predicted = np.negative(est.predict(test_features))
compute_ranking_metrics(test_relevances, test_predicted)

(0.8183341671733133,
 {'ANTOFAGASTA_NDCG': 0.7798033589853474,
  'ARICA_NDCG': 1.0,
  'ATACAMA_NDCG': 0.9186036076432735,
  'CENTRO_NDCG': 0.8049299243867941,
  'COQUIMBO_NDCG': 0.9585323278444451,
  'MAGALLANES_NDCG': 0.8763145341375751,
  'MAULE_NDCG': 0.49859291893397173,
  "O'HIGGINS_NDCG": 0.9985733649860123,
  'SUR_NDCG': 0.6052211551435673,
  'TARAPACA_NDCG': 0.7427704796721469})