In [1]:
from typing import Tuple
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
import pandas as pd

import numpy as np

import torch

from torch import nn
from torch.utils.data import Dataset, DataLoader

from xgboost import XGBClassifier

import wandb

In [2]:
# Earlier data source, kept for reference: the raw CSVs were concatenated into a single `df`.
# df = pd.concat([pd.read_csv("more-elements/more-elements.csv"), pd.read_csv("new-more-data/new-more-data.csv")])
# df
# Load the training and evaluation frames from pickles.
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
train_df = pd.read_pickle("half_data.pkl")
test_df = pd.read_pickle("test_proc.pkl")

In [3]:
# Keep only the rows whose 1001 spectrum columns ("0" .. "1000") are all strictly
# positive, printing the row count before and after the filter.
# NOTE(review): `df` is not defined by any visible cell (its construction is commented
# out in the loading cell), so this raises NameError on a fresh kernel. The cell also
# overwrites its own input, so it is not idempotent. Presumably the filter should be
# applied to train_df / test_df instead - confirm.
print(len(df))
df = df[(df[[str(i) for i in range(1001)]] > 0).all(axis=1)]
print(len(df))

183553
183553


In [4]:
# Fraction of rows in which at least one of the three element columns equals 0.
# NOTE(review): depends on `df`, which no visible cell defines.
len(df[(df["element_1"] == 0) | (df["element_2"] == 0) | (df["element_3"] == 0)]) / len(df)

0.11451733286843582

In [9]:
class CustomCatboostSpectraDataset(Dataset):
    """Map-style dataset pairing a log-spectrum with a multi-hot element label vector.

    The input frame must provide:
      * spectrum columns named "0" .. "1000",
      * an ``air_ratio`` column,
      * ``element_1`` .. ``element_3`` (integer element indices, ``-1`` = empty slot),
      * ``element_1_ratio`` .. ``element_3_ratio``.

    Rows whose spectrum contains a NaN are dropped. The same row mask is applied to
    every per-row array so that index ``idx`` always refers to one source row.

    Args:
        data: source frame with the columns listed above.
        device: accepted for interface compatibility; it is not used by this class.
    """

    def __init__(self, data: pd.DataFrame, device="cuda:0") -> None:
        self.data = data
        # The label vector length is the number of distinct values in `element_1`.
        # NOTE(review): this assumes every element index occurs in `element_1` and that
        # indices are 0-based and contiguous - confirm against the data.
        self.elements = self.data["element_1"].unique()

        spectras = self.data[[str(i) for i in range(1001)]].to_numpy(
            dtype=np.float64
        )
        air_ratios = data.air_ratio.to_numpy(dtype=np.float64)
        ratios = self.data[
            ["element_1_ratio", "element_2_ratio", "element_3_ratio"]
        ].to_numpy(dtype=np.float64)
        element_indices = self.data[
            ["element_1", "element_2", "element_3"]
        ].to_numpy(dtype=np.int32)

        # Drop NaN spectra from ALL arrays at once; filtering only the spectra would
        # shift them relative to their labels for every row after the first dropped one.
        valid_rows = ~np.isnan(spectras).any(axis=1)
        self.spectras = spectras[valid_rows]
        self.air_ratios = air_ratios[valid_rows]
        self.ratios = ratios[valid_rows]
        self.element_indices = element_indices[valid_rows]

    def __len__(self) -> int:
        """Return the number of rows that survived the NaN filter."""
        return len(self.spectras)

    def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
        """Return ``(log_spectrum, multi_hot_labels)`` for row ``idx``.

        A label is 1.0 when the element occupies a slot (index != -1) and the ratio
        stored in that same slot is positive; every other entry is 0.0.
        """
        spectra = np.log(self.spectras[idx])

        elements_distribution = np.zeros(
            [len(self.elements)], dtype=np.float64
        )
        indices = self.element_indices[idx, :]
        # Select ratios with the same mask as the indices so each element stays paired
        # with its own ratio even when an empty (-1) slot is not the trailing one.
        present = indices != -1
        elements_distribution[indices[present]] = np.where(
            self.ratios[idx][present] > 0, 1.0, 0.0
        )

        return spectra, elements_distribution

In [4]:
# train_df, val_df = train_test_split(df, test_size=0.15, random_state=42)

In [10]:
# Wrap both frames in the dataset class. The "validation" set is built from
# `test_df` (the separately loaded pickle), not from a split of `train_df`.
train_dataset = CustomCatboostSpectraDataset(train_df, device="cpu")
val_dataset = CustomCatboostSpectraDataset(test_df, device="cpu")

In [11]:
# Materialise the training set as dense arrays for the gradient-boosting model.
# Each sample is fetched once; previously the dataset was traversed twice, so every
# item (including its np.log) was computed two times.
train_samples = [train_dataset[i] for i in range(len(train_dataset))]
X_train = np.array([spectra for spectra, _ in train_samples])
y_train = np.array([labels for _, labels in train_samples])
del train_samples  # do not keep a second copy of the data alive in the kernel



In [12]:
# Multi-label model: MultiOutputClassifier fits one binary XGBoost classifier
# per label column of y_train.
gb = MultiOutputClassifier(
    estimator=XGBClassifier(
        objective="binary:logistic",
        n_estimators=1000,
        learning_rate=0.5,
        device="cuda",
    )
)

# Train on the materialised arrays; the fitted estimator is the cell's output.
gb.fit(X_train, y_train)

In [13]:
# Materialise the validation set as dense arrays.
# Each sample is fetched once; previously the dataset was traversed twice, so every
# item (including its np.log) was computed two times.
val_samples = [val_dataset[i] for i in range(len(val_dataset))]
X_val = np.array([spectra for spectra, _ in val_samples])
y_val = np.array([labels for _, labels in val_samples])
del val_samples  # do not keep a second copy of the data alive in the kernel

  spectra = np.log(self.spectras[idx])


In [34]:
# Spot-check a single validation sample.
i = 321
# predict() expects a 2-D (n_samples, n_features) array, so slice instead of indexing
# to keep the sample as a 1-row matrix, then unwrap the single prediction row.
pred = gb.predict(X_val[i : i + 1])[0]
# NOTE(review): `cross_entropy` is not defined or imported in any visible cell; it must
# be provided before this cell can run on a fresh kernel.
cross_entropy(pred[None, :], y_val[i][None, :])

1.473936293076028

In [14]:
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
)

In [15]:
# Macro-averaged metrics over all label columns, plus accuracy_score on the
# full label matrix (for multi-label targets this is exact-match accuracy).
# NOTE(review): the captured output contains the tail of an XGBoost device warning;
# presumably because `gb` was trained with device="cuda" while X_val is a NumPy
# (CPU) array - confirm.
pred = gb.predict(X_val)
print(f'F1: {f1_score(y_val, pred, average="macro")}')
print(f'precision: {precision_score(y_val, pred, average="macro")}')
print(f'recall: {recall_score(y_val, pred, average="macro")}')
print(f"accuracy: {accuracy_score(y_val, pred)}")

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




F1: 0.9681856093537644
precision: 0.9829342528927463
recall: 0.9556629237804803
accuracy: 0.8658137459089557


In [16]:
# Per-label scores: average=None returns one value per label column instead of a mean.
# Reuses `pred` from the preceding evaluation cell.
print(f'F1: {f1_score(y_val, pred, average=None)}')
print(f'precision: {precision_score(y_val, pred, average=None)}')
print(f'recall: {recall_score(y_val, pred, average=None)}')

F1: [0.99960344 0.77550416 0.77459213 0.99839647 0.99917023 0.99906735
 0.97212377 0.99086545 0.75066293 0.99097964 0.99539877 0.99533458
 0.998835   0.99848907 0.99896318 0.99867048 0.99791484 0.99933691
 0.99953725 0.99869909 0.99504638 0.99718744 0.99184856 0.99674007
 0.99167303]
precision: [1.         0.86574686 0.86656734 1.         1.         0.99993332
 0.99790107 0.99946197 0.84488476 0.99979925 0.99993301 0.99973223
 1.         1.         1.         0.99993344 1.         1.
 1.         0.9999332  0.99986637 1.         0.99986418 1.
 0.99979932]
recall: [0.99920719 0.70229877 0.70026756 0.99679808 0.99834185 0.99820288
 0.94764467 0.98241555 0.67534791 0.98231427 0.99090547 0.99097545
 0.99767272 0.9969827  0.9979285  0.9974107  0.99583837 0.99867471
 0.99907493 0.99746802 0.99027263 0.99439065 0.98396044 0.99350133
 0.98367777]


In [10]:
# Experiment log: "vanilla" run (presumably default XGBClassifier hyperparameters - confirm).
# NOTE(review): this cell only scores whatever `gb` currently holds; the corresponding
# fit is not part of the cell, so the recorded score is not reproducible from here.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.916144592122844

In [16]:
# Experiment log: macro-F1 of `gb`, presumably refit beforehand with
#   n_estimators=1000,
#   device="cuda",
#   learning_rate=0.3,
#   objective="binary:logistic",
# NOTE(review): the refit is not part of this cell; it scores whatever `gb` currently holds.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.9221136677387097

In [18]:
# Experiment log: macro-F1 of `gb`, presumably refit beforehand with
#   n_estimators=1000,
#   device="cuda",
#   learning_rate=0.1,
#   objective="binary:logistic",
# NOTE(review): the refit is not part of this cell; it scores whatever `gb` currently holds.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.9214788513615314

In [None]:
# Experiment: learning_rate=0.1 with stronger L2 regularisation on the leaf weights.
gb = MultiOutputClassifier(
    XGBClassifier(
        n_estimators=1000,
        device="cuda",
        learning_rate=0.1,
        objective="binary:logistic",
        # `reg_lambda` is the scikit-learn-API name of XGBoost's `lambda` parameter;
        # using it avoids smuggling a Python keyword through **kwargs.
        reg_lambda=10,
    )
)


# Refit, then report macro-F1 on the validation arrays.
gb.fit(X=X_train, Y=y_train)
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.9213703296883752

In [11]:
# Experiment log: configuration marked as the best one found so far ("BEST!!!"):
#   n_estimators=1000,
#   device="cuda",
#   learning_rate=0.5,
#   objective="binary:logistic",
# NOTE(review): the refit is not part of this cell; it scores whatever `gb` currently
# holds. The scores recorded below differ from the other learning_rate=0.5 evaluation
# output in this notebook, so they presumably come from a different run or dataset - confirm.
pred = gb.predict(X_val)
print(f'F1: {f1_score(y_val, pred, average="macro")}')
print(f'precision: {precision_score(y_val, pred, average="macro")}')
print(f'recall: {recall_score(y_val, pred, average="macro")}')
print(f"accuracy: {accuracy_score(y_val, pred)}")

F1: 0.924023939808274
precision: 0.9540807299030724
recall: 0.9031540752644299
accuracy: 0.7106744633712273


In [12]:
# Per-label scores for the current `pred` (average=None returns one value per label column).
print(f'F1: {f1_score(y_val, pred, average=None)}')
print(f'precision: {precision_score(y_val, pred, average=None)}')
print(f'recall: {recall_score(y_val, pred, average=None)}')

F1: [0.99774485 0.48822355 0.49029418 0.99686028 0.99649347 0.99920773
 0.89705103 0.9673476  0.39592253 0.97349794 0.98919877 0.99457448
 0.99600447 0.9958159  0.99593298 0.99745142 0.99415759 0.99760956
 0.99903754 0.99737208 0.99049104 0.99286959 0.98039216 0.99445588
 0.98259188]
precision: [1.         0.66757642 0.64986737 1.         1.         0.99968294
 0.98525989 0.99933533 0.55517439 0.9993241  0.99934853 1.
 1.         1.         1.         1.         1.         1.
 1.         1.         0.99874253 1.         0.99901736 1.
 0.99868938]
recall: [0.99549984 0.38483323 0.39363753 0.99374022 0.99301144 0.99873297
 0.82333874 0.93734414 0.30766793 0.94897304 0.97925311 0.98920752
 0.99204075 0.99166667 0.9918989  0.99491579 0.98838305 0.99523052
 0.99807692 0.99475794 0.98237477 0.98584015 0.96244872 0.9889729
 0.96700508]


In [23]:
# Experiment log: macro-F1 of `gb`, presumably refit beforehand with
#   n_estimators=1000,
#   device="cuda",
#   learning_rate=0.7,
#   objective="binary:logistic",
# NOTE(review): the refit is not part of this cell; it scores whatever `gb` currently holds.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.9230440875965293

In [25]:
# Experiment log: macro-F1 of `gb`, presumably refit beforehand with
#   n_estimators=1000,
#   device="cuda",
#   learning_rate=0.6,
#   objective="binary:logistic",
# NOTE(review): the refit is not part of this cell; it scores whatever `gb` currently holds.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.9236453189490572

In [29]:
# Experiment log: macro-F1 of `gb`, presumably refit beforehand with
#   n_estimators=1000,
#   device="cuda",
#   learning_rate=0.5,
#   subsample=0.75,
#   objective="binary:logistic",
# NOTE(review): the refit is not part of this cell; it scores whatever `gb` currently holds.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.9233216239966314

In [30]:
# Experiment log: macro-F1 of `gb`, presumably refit beforehand with
#   n_estimators=1000,
#   device="cuda",
#   learning_rate=0.5,
#   subsample=0.9,
#   objective="binary:logistic",
# NOTE(review): the recorded score is digit-for-digit identical to the subsample=0.75
# run, which suggests `gb` was not actually refit between the two evaluations - verify.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.9233216239966314

In [35]:
# Display the current prediction next to the target vector of validation sample `i`.
# NOTE(review): the captured output shows fractional predictions and targets, so it
# presumably stems from an earlier model / label encoding than the binary one built
# by the dataset class - confirm before relying on it.
pred, y_val[i]

(array([1.23883874e-06, 4.38560582e-03, 5.62113566e-03, 4.34981757e-04,
        1.37915759e-05, 1.60435102e-05, 6.15459019e-03, 5.16426915e-03,
        6.25508849e-03, 3.83966453e-03, 3.68702319e-04, 1.52557320e-04,
        1.44995356e-05, 1.01582786e-04, 1.56701221e-01, 3.78149110e-06,
        2.47053659e-01, 3.95344475e-04, 2.64859709e-04, 2.57998431e-05,
        2.71519559e-03, 2.26385942e-04, 3.70902956e-03, 1.02934236e-04,
        2.68017492e-02, 5.55836556e-01]),
 array([0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.19726331,
        0.        , 0.39952827, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.10320842,
        0.3       ]))

In [36]:
# Mean absolute error between `pred` and the target of sample `i`, restricted to the
# positions where the target is non-zero.
(
    np.abs(pred[y_val[i] != 0] - y_val[i][y_val[i] != 0]).sum()
    / np.count_nonzero(y_val[i])
)

0.1313199816354742

In [14]:
# Class-probability estimates for the whole validation set.
# NOTE(review): for a MultiOutputClassifier, predict_proba returns a list with one
# (n_samples, n_classes) array per output rather than a single 2-D array - check that
# the downstream consumer expects that layout.
pred_total = gb.predict_proba(X_val)

In [15]:
# Cross-entropy of the predicted probabilities against the validation labels.
# NOTE(review): `cross_entropy` is not defined or imported in any visible cell, so this
# raises NameError on a fresh kernel unless it is provided elsewhere.
cross_entropy(pred_total, y_val)

1.237707421082145