# Instructions

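1. Restart the kernel and run all cells in order (Kernel → Restart & Run All). The notebook imports the `dataset` and `run` modules and reads `./datasets/worksites.csv`, so start it from the directory where those resolve.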

In [1]:
from collections import defaultdict

import pandas as pd
import numpy as np

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.tree import DecisionTreeClassifier as Classifier
from sklearn.tree import DecisionTreeRegressor as Regressor

from sklearn.metrics import ndcg_score

from dataset import WorksitesDataset
from run import build_datasets

class DatasetConfig:
    """Settings passed to ``build_datasets`` and reused when imputing regression labels."""

    # Relative path of the worksites CSV; presumably read by build_datasets — verify there.
    dataset_path = "./datasets/worksites.csv"

    # Value substituted for missing regression targets.
    # NOTE(review): 1460 equals 4 * 365, so this looks like a four-year cap in days — confirm.
    fill_missing_regression = 1460
    
# Label columns requested from build_datasets. The code below reads the label
# frame by position (column 0 for the classifier, column 1 for the regressor);
# presumably that order matches this list — verify against build_datasets.
LABEL_COLUMNS = ["FUTURE_TOTAL_COUNT", "DAYS_UNTIL_NEXT_ACCIDENT"]

train_ds, val_ds, test_ds = build_datasets(DatasetConfig, LABEL_COLUMNS)

# The models are fitted on the train and validation splits stacked together.
fit_splits = (train_ds, val_ds)
train_features = pd.concat([split._features_df for split in fit_splits]).to_numpy()
train_labels = pd.concat([split._labels_df for split in fit_splits])

# Classification target: 1 where the first label column is positive, otherwise 0.
first_label = train_labels.iloc[:, 0]
train_classifier_labels = (first_label > 0).astype(int).to_numpy()

# Regression target: second label column, with missing values replaced by the
# configured constant.
second_label = train_labels.iloc[:, 1]
train_regressor_labels = second_label.fillna(DatasetConfig.fill_missing_regression).to_numpy()

test_features = test_ds._features_df.to_numpy()

# Columns used to group and score the test rankings, plus a REGION column
# obtained by mapping REGION_ID through WorksitesDataset.REGIONAL_OFFICES.
relevance_columns = ["YEAR", "MONTH", "REGION_ID", "ACCIDENTS_TWELVE_MONTHS"]
test_relevances = test_ds._df[relevance_columns].assign(
    REGION=lambda frame: frame["REGION_ID"].map(WorksitesDataset.REGIONAL_OFFICES)
)

print("Size of train/test:", len(train_ds) + len(val_ds), len(test_ds), end="\n\n")

Size of train/test: 153213 38303



In [2]:
def compute_ranking_metrics(relevances, criticality):
    """Score a criticality ranking with NDCG, averaged per region.

    Rows are grouped by (REGION, YEAR, MONTH). Every group whose
    ACCIDENTS_TWELVE_MONTHS values do not sum to zero gets one NDCG score,
    with ACCIDENTS_TWELVE_MONTHS as the true relevance and ``criticality`` as
    the predicted score. Scores are averaged per region first, then across
    regions, so every region weighs the same in the overall figure.

    Args:
        relevances: DataFrame with REGION, YEAR, MONTH and
            ACCIDENTS_TWELVE_MONTHS columns. It is copied, not modified.
        criticality: One score per row of ``relevances``. It is stored through
            plain column assignment, so a pandas Series is aligned on its
            index while arrays and lists are taken positionally.

    Returns:
        Tuple ``(mean_ndcg, mean_ndcgs)``: ``mean_ndcgs`` maps
        ``"<REGION>_NDCG"`` to that region's mean NDCG, and ``mean_ndcg`` is
        the mean of those values. When no group qualifies, the result is
        ``(nan, {})``.
    """
    scored = relevances.copy()
    scored["CRITICALITY"] = criticality

    ndcgs = defaultdict(list)
    for (region, _year, _month), group_df in scored.groupby(["REGION", "YEAR", "MONTH"]):
        true_relevance = group_df["ACCIDENTS_TWELVE_MONTHS"]
        # Groups without any recorded accident are skipped rather than scored.
        if true_relevance.sum() == 0:
            continue
        if len(group_df) > 1:
            score = ndcg_score(
                y_true=[true_relevance],
                y_score=[group_df["CRITICALITY"]],
            )
        else:
            # A single-row group has only one possible ordering.
            score = 1
        ndcgs[region].append(score)

    mean_ndcgs = {f"{region}_NDCG": np.mean(scores) for region, scores in ndcgs.items()}

    # np.mean of an empty list emits a RuntimeWarning before returning NaN;
    # return NaN explicitly so the "nothing to score" case stays quiet.
    if not mean_ndcgs:
        return np.float64(np.nan), mean_ndcgs

    mean_ndcg = np.mean(list(mean_ndcgs.values()))
    return mean_ndcg, mean_ndcgs

# Classifier

In [3]:
# Fit the decision tree on the binary labels. The value displayed below is the
# accuracy on the same rows used for fitting, not a held-out estimate.
X = train_features
y = train_classifier_labels
clf = Classifier(random_state=42)
clf.fit(X, y)
clf.score(X, y)

0.9962470547538395

In [4]:
# Use the second predict_proba column as the criticality score; with 0/1
# training labels that column should be the positive class — check clf.classes_.
class_probabilities = clf.predict_proba(test_features)
test_predicted = class_probabilities[:, 1]
compute_ranking_metrics(test_relevances, criticality=test_predicted)

(0.517410303185587,
 {'ANTOFAGASTA_NDCG': 0.4840698526088696,
  'ARICA_NDCG': 0.6553433864871141,
  'ATACAMA_NDCG': 0.48739565031331794,
  'CENTRO_NDCG': 0.3938738685667898,
  'COQUIMBO_NDCG': 0.4068740162935044,
  'MAGALLANES_NDCG': 0.6956858216550444,
  'MAULE_NDCG': 0.6308970010421683,
  "O'HIGGINS_NDCG": 0.4981979873436856,
  'SUR_NDCG': 0.3918106991249038,
  'TARAPACA_NDCG': 0.5299547484204726})

# Regressor

In [5]:
# Fit the regression tree on the imputed regression labels. The value displayed
# below is R^2 on the same rows used for fitting, not a held-out estimate.
X = train_features
y = train_regressor_labels
est = Regressor(random_state=42)
est.fit(X, y)
est.score(X, y)

0.973046320745934

In [6]:
# Negate the prediction so that smaller predicted values rank as more critical.
# Presumably the target is DAYS_UNTIL_NEXT_ACCIDENT (second label column) — confirm.
predicted_target = est.predict(test_features)
test_predicted = np.negative(predicted_target)
compute_ranking_metrics(test_relevances, criticality=test_predicted)

(0.6977760794857507,
 {'ANTOFAGASTA_NDCG': 0.6448355660527673,
  'ARICA_NDCG': 0.9597210561410436,
  'ATACAMA_NDCG': 0.6642246238151508,
  'CENTRO_NDCG': 0.6817950590605372,
  'COQUIMBO_NDCG': 0.8294907265625361,
  'MAGALLANES_NDCG': 0.8546966607658227,
  'MAULE_NDCG': 0.5008720055319669,
  "O'HIGGINS_NDCG": 0.8570632060978465,
  'SUR_NDCG': 0.40737284849072053,
  'TARAPACA_NDCG': 0.5776890423391149})