In [1]:
from typing import Tuple
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import pandas as pd

import numpy as np

import torch

from torch import nn
from torch.utils.data import Dataset, DataLoader

from catboost import CatBoostClassifier, MultiTargetCustomMetric

import wandb

In [2]:
# Load the working table from a local pickle and display it.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so only
# load files that come from a trusted source.
# Alternative loader kept for reference: it concatenates two CSV files instead.
# df = pd.concat([pd.read_csv("more-elements/more-elements.csv"), pd.read_csv("new-more-data/new-more-data.csv")])
# df
df = pd.read_pickle("half_data.pkl")
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1000,element_1,element_2,element_3,element_1_ratio,element_2_ratio,element_3_ratio,temp,pressure,air_ratio
0,1.402026e-21,1.650584e-21,1.910661e-21,2.264304e-21,2.829955e-21,3.673546e-21,4.745899e-21,5.773241e-21,6.446840e-21,6.741727e-21,...,3.466256e-20,17,23,-1,0.440157,0.559843,0.000000,283.0,0.775,0.0
1,4.102767e-22,4.371429e-22,4.698476e-22,5.105580e-22,5.626400e-22,6.315813e-22,7.266133e-22,8.610945e-22,1.048941e-21,1.323529e-21,...,2.481710e-21,0,19,-1,0.428047,0.571953,0.000000,303.0,0.800,0.6
2,3.610634e-23,2.739156e-23,1.628379e-23,1.195799e-23,1.069667e-23,1.058083e-23,1.121691e-23,1.274356e-23,1.515703e-23,1.882304e-23,...,2.756785e-21,2,11,13,0.342547,0.376666,0.280788,283.0,0.500,0.6
3,1.016487e-22,1.356304e-22,1.894228e-22,3.406116e-22,8.410447e-22,2.039525e-21,1.635235e-21,6.587985e-22,3.087047e-22,2.124933e-22,...,1.240249e-22,6,10,13,0.053299,0.321178,0.625524,293.0,0.900,0.6
4,1.637227e-21,2.113385e-21,2.633061e-21,3.098701e-21,3.509187e-21,3.569827e-21,3.557713e-21,3.356628e-21,3.389955e-21,3.744591e-21,...,3.614986e-23,9,18,24,0.470739,0.251465,0.277796,273.0,0.500,0.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183548,1.720379e-22,2.933964e-22,4.190668e-22,5.128756e-22,6.537258e-22,6.598291e-22,7.231058e-22,6.818061e-22,8.002978e-22,1.130003e-21,...,8.585612e-23,9,17,23,0.401527,0.454117,0.144356,313.0,0.500,0.6
183549,2.392508e-21,1.998411e-21,1.320957e-21,9.131813e-22,6.948151e-22,5.720024e-22,4.987314e-22,4.533583e-22,4.270046e-22,4.143369e-22,...,2.460996e-21,4,8,16,0.348675,0.389984,0.261341,323.0,1.000,0.6
183550,1.947286e-22,2.092204e-22,2.261748e-22,2.462494e-22,2.703056e-22,2.995569e-22,3.357735e-22,3.814871e-22,4.405433e-22,5.189812e-22,...,1.838447e-23,1,7,14,0.116401,0.430989,0.452610,313.0,0.100,0.6
183551,4.523992e-22,6.372981e-22,1.661855e-21,1.874233e-22,1.226359e-22,1.804612e-22,3.107570e-22,1.559673e-21,3.250324e-21,2.427107e-21,...,1.508687e-21,15,18,23,0.422131,0.441890,0.135979,283.0,0.200,0.3


In [3]:
# Keep only the rows whose 1001 spectrum columns ("0" ... "1000") are all strictly
# positive. The row count is printed before and after to show how many rows were dropped.
spectrum_columns = [str(i) for i in range(1001)]
print(len(df))
all_positive = (df[spectrum_columns] > 0).all(axis=1)
df = df[all_positive]
print(len(df))

183553
183553


In [4]:
# Fraction of rows in which element index 0 occupies any of the three element slots.
contains_element_zero = (
    (df["element_1"] == 0) | (df["element_2"] == 0) | (df["element_3"] == 0)
)
len(df[contains_element_zero]) / len(df)

0.11451733286843582

In [3]:
class CustomCatboostSpectraDataset(Dataset):
    """Dataset of log-spectra paired with multi-hot element-presence targets.

    Columns read from ``data``:

    * ``"0"`` ... ``"1000"``: the spectrum. Values must be > 0 because the natural
      log is taken in ``__getitem__``.
    * ``element_1`` ... ``element_3``: integer element indices; ``-1`` marks an
      unused slot.
    * ``element_1_ratio`` ... ``element_3_ratio``: ratio belonging to the element in
      the slot with the same number.
    * ``air_ratio``: stored in ``self.air_ratios`` but not used by ``__getitem__``.

    Args:
        data: Source table with the columns listed above.
        device: Not used by this class; kept so existing callers that pass it
            keep working.
        num_elements: Length of the target vector. When ``None`` it is inferred as
            ``max(element index) + 1`` over all three element columns. Pass the
            same value for every split so that train and validation targets have
            the same width even when a split lacks the highest element index.

    Raises:
        ValueError: if ``num_elements`` is too small to hold an index present in
            ``data``.
    """

    def __init__(self, data: pd.DataFrame, device="cuda:0", num_elements=None) -> None:
        self.data = data
        self.air_ratios = data.air_ratio.to_numpy(dtype=np.float64)

        self.spectras = self.data[[str(i) for i in range(1001)]].to_numpy(
            dtype=np.float64
        )

        self.ratios = self.data[
            ["element_1_ratio", "element_2_ratio", "element_3_ratio"]
        ].to_numpy(dtype=np.float64)
        self.element_indices = self.data[
            ["element_1", "element_2", "element_3"]
        ].to_numpy(dtype=np.int32)

        # Element indices that occur in ANY slot (the -1 placeholder excluded).
        # Looking only at element_1 under-counts whenever an index appears solely
        # in element_2 / element_3, which makes the target vector too short.
        present = self.element_indices[self.element_indices != -1]
        self.elements = np.unique(present)

        required = int(present.max()) + 1 if present.size else 0
        if num_elements is None:
            num_elements = required
        elif num_elements < required:
            raise ValueError(
                f"num_elements={num_elements} is too small: data contains element "
                f"index {required - 1}"
            )
        self.num_elements = int(num_elements)

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
        """Return ``(log(spectrum), target)`` for row ``idx``.

        ``target`` has length ``self.num_elements`` and holds 1.0 at every element
        index whose ratio in that row is > 0, and 0.0 elsewhere.
        """
        spectra = np.log(self.spectras[idx])

        elements_distribution = np.zeros(self.num_elements, dtype=np.float64)
        slot_indices = self.element_indices[idx, :]
        # Use one mask for indices and ratios so each ratio stays paired with the
        # element of its own slot, wherever the -1 placeholders are.
        used = slot_indices != -1
        elements_distribution[slot_indices[used]] = np.where(
            self.ratios[idx][used] > 0, 1.0, 0.0
        )

        return spectra, elements_distribution

In [4]:
# Hold out 15% of the rows for validation; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.15, "random_state": 42}
train_df, val_df = train_test_split(df, **split_options)

In [5]:
# Wrap each split in the dataset class; both are built with device="cpu".
train_dataset, val_dataset = (
    CustomCatboostSpectraDataset(split_df, device="cpu")
    for split_df in (train_df, val_df)
)

In [6]:
def _dataset_to_arrays(dataset):
    """Materialise a map-style dataset into ``(features, targets)`` NumPy arrays.

    Each sample is fetched exactly once. Indexing the dataset separately for the
    features and for the targets would run ``__getitem__`` (including its log
    transform) twice per row.
    """
    samples = [dataset[idx] for idx in range(len(dataset))]
    features = np.array([spectra for spectra, _ in samples])
    targets = np.array([target for _, target in samples])
    return features, targets


X_train, y_train = _dataset_to_arrays(train_dataset)

X_val, y_val = _dataset_to_arrays(val_dataset)

In [7]:
def cross_entropy(predictions, targets, epsilon=1e-12):
    """
    Computes cross entropy between targets (encoded as one-hot vectors)
    and predictions, averaged over the N rows.

    Only the ``targets * log(predictions)`` term is summed; there is no
    ``(1 - targets) * log(1 - predictions)`` term.

    Input: predictions (N, k) array-like of probabilities
           targets (N, k) array-like
           epsilon: predictions are clipped to [epsilon, 1 - epsilon] before the
                    log is taken, so log(0) never occurs
    Returns: scalar
    """
    # Clipping alone keeps the log finite. Adding a further constant inside the
    # log would override `epsilon` and push log(p) above 0 for p close to 1,
    # giving a negative loss for perfect predictions.
    predictions = np.clip(
        np.asarray(predictions, dtype=np.float64), epsilon, 1.0 - epsilon
    )
    targets = np.asarray(targets)
    n_rows = predictions.shape[0]
    return -np.sum(targets * np.log(predictions)) / n_rows

In [8]:
class CEMetric(MultiTargetCustomMetric):
    """Custom metric that scores approxes against targets with ``cross_entropy``.

    NOTE(review): no visible cell passes this metric to the model; confirm it is
    still needed.
    NOTE(review): ``evaluate`` hands the approxes to ``cross_entropy`` unchanged;
    confirm whether CatBoost supplies probabilities or raw scores at this point.
    """

    def is_max_optimal(self):
        # Lower cross-entropy is better, so this metric is minimised.
        return False

    @staticmethod
    def get_cross_entropy(y_true, y_pred):
        """Forward to ``cross_entropy``, which takes predictions first and targets second."""
        return cross_entropy(predictions=y_pred, targets=y_true)

    def evaluate(self, approxes, target, weight):
        """Return ``(score, 1)`` for a single approx dimension; ``weight`` is ignored.

        Asserts that exactly one approx sequence was supplied and that it has the
        same length as ``target``. Targets are cast to ``int`` before scoring.
        """
        assert len(approxes) == 1
        first_approx = approxes[0]
        assert len(target) == len(first_approx)
        labels = np.array(target).astype(int)
        return self.get_cross_entropy(labels, first_approx), 1

    def get_final_error(self, error, weight):
        """Return the accumulated error unchanged; ``weight`` is ignored."""
        return error

In [7]:
# Hyperparameters of the classifier, collected in one place so a run's settings
# can be read at a glance.
catboost_params = {
    "task_type": "GPU",
    "learning_rate": 0.5,
    "bootstrap_type": "No",
    "l2_leaf_reg": 100,
    "loss_function": "MultiCrossEntropy",
}
gb = CatBoostClassifier(**catboost_params)
# The validation split is used as the eval set; plot=True requests the training plot.
gb.fit(X=X_train, y=y_train, eval_set=(X_val, y_val), plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.3449290	test: 0.3453621	best: 0.3453621 (0)	total: 288ms	remaining: 4m 47s
1:	learn: 0.2427042	test: 0.2431588	best: 0.2431588 (1)	total: 530ms	remaining: 4m 24s
2:	learn: 0.1925064	test: 0.1928331	best: 0.1928331 (2)	total: 793ms	remaining: 4m 23s
3:	learn: 0.1690469	test: 0.1693741	best: 0.1693741 (3)	total: 1.04s	remaining: 4m 19s
4:	learn: 0.1517103	test: 0.1520756	best: 0.1520756 (4)	total: 1.32s	remaining: 4m 21s
5:	learn: 0.1419227	test: 0.1423951	best: 0.1423951 (5)	total: 1.56s	remaining: 4m 18s
6:	learn: 0.1311543	test: 0.1317432	best: 0.1317432 (6)	total: 1.81s	remaining: 4m 17s
7:	learn: 0.1229314	test: 0.1233947	best: 0.1233947 (7)	total: 2.06s	remaining: 4m 14s
8:	learn: 0.1174806	test: 0.1180781	best: 0.1180781 (8)	total: 2.3s	remaining: 4m 13s
9:	learn: 0.1130589	test: 0.1136630	best: 0.1136630 (9)	total: 2.48s	remaining: 4m 5s
10:	learn: 0.1076565	test: 0.1082633	best: 0.1082633 (10)	total: 2.72s	remaining: 4m 4s
11:	learn: 0.1045602	test: 0.1052173	best: 0

<catboost.core.CatBoostClassifier at 0x1af2a181cf0>

In [34]:
# Cross-entropy of the model's output for one validation sample.
# predict_proba is used rather than predict because cross_entropy expects
# probabilities, and a classifier's predict() returns hard class labels. This also
# matches how the full-validation cross-entropy is computed from predict_proba.
i = 321
pred = gb.predict_proba(X_val[i])
# [None, :] adds the leading batch axis that cross_entropy's (N, k) inputs require.
cross_entropy(pred[None, :], y_val[i][None, :])

1.473936293076028

In [9]:
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
)

In [19]:
# Macro-averaged F1 of the model's predictions on the validation split.
# NOTE(review): no hyperparameter tag was recorded for this run, and `pred` is a
# shared global that later cells read.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.8944643510188943

In [21]:
# Validation macro-F1 recorded for the run tagged: lr=0.001
# NOTE(review): `gb` has to be refit with that setting before this cell is run; the
# refit itself is not part of this cell.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.5869803074741877

In [24]:
# Validation macro-F1 recorded for the run tagged: lr=0.005
# NOTE(review): `gb` has to be refit with that setting before this cell is run; the
# refit itself is not part of this cell.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.7881139848690839

In [26]:
# Validation macro-F1 recorded for the run tagged: lr=0.1
# NOTE(review): `gb` has to be refit with that setting before this cell is run; the
# refit itself is not part of this cell.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.9045378519069314

In [28]:
# Validation macro-F1 recorded for the run tagged: lr=0.5
# NOTE(review): `gb` has to be refit with that setting before this cell is run; the
# refit itself is not part of this cell.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.9128630654610811

In [30]:
# Validation macro-F1 recorded for the run tagged: lr=0.5, l2_leaf_reg=10
# NOTE(review): `gb` has to be refit with those settings before this cell is run;
# the refit itself is not part of this cell.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.9137050602780659

In [32]:
# Validation macro-F1 recorded for the run tagged: lr=0.5, l2_leaf_reg=100
# NOTE(review): `gb` has to be refit with those settings before this cell is run;
# the refit itself is not part of this cell.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.913891723176625

In [34]:
# Validation macro-F1 recorded for the run tagged: depth=4, lr=0.5, l2_leaf_reg=100
# NOTE(review): `gb` has to be refit with those settings before this cell is run;
# the refit itself is not part of this cell.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.9110333349811157

In [36]:
# Validation macro-F1 recorded for the run tagged: min_data_in_leaf=2, lr=0.5, l2_leaf_reg=100
# NOTE(review): the recorded score is identical, to every digit, to the one recorded
# for "lr=0.5, l2_leaf_reg=100". Either min_data_in_leaf had no effect or `gb` was
# not refit before this cell ran; verify.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.913891723176625

In [38]:
# Validation macro-F1 recorded for the run tagged: bootstrap_type="Bayesian", lr=0.5, l2_leaf_reg=100
# NOTE(review): the recorded score is identical, to every digit, to the one recorded
# for "lr=0.5, l2_leaf_reg=100". Possibly `gb` was not refit before this cell ran;
# verify.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.913891723176625

In [40]:
# Validation macro-F1 recorded for the run tagged: bootstrap_type="Bernoulli", lr=0.5, l2_leaf_reg=100
# NOTE(review): the recorded score is identical, to every digit, to the F1 recorded
# for the bootstrap_type="No" run. Confirm the two runs really used different
# models.
pred = gb.predict(X_val)
f1_score(y_val, pred, average="macro")

0.9151949535677928

In [10]:
# Summary metrics for the configuration marked as best so far:
# bootstrap_type="No", lr=0.5, l2_leaf_reg=100
pred = gb.predict(X_val)
macro_scorers = {"F1": f1_score, "precision": precision_score, "recall": recall_score}
for metric_name, scorer in macro_scorers.items():
    print(f"{metric_name}: {scorer(y_val, pred, average='macro')}")
# accuracy_score takes no averaging argument here; on multi-label targets it counts
# a row as correct only when every label in it matches.
print(f"accuracy: {accuracy_score(y_val, pred)}")

F1: 0.9151949535677928
precision: 0.9609059716537451
recall: 0.8864482821325259
accuracy: 0.6662187193549559


In [17]:
# Per-class F1 (average=None returns one score per target column), computed from
# whatever `pred` currently holds.
f1_score(y_val, pred, average=None)

array([0.99661345, 0.45485961, 0.48228767, 0.99227251, 0.9924812 ,
       0.99634631, 0.85777207, 0.9419885 , 0.38819474, 0.95733646,
       0.98312784, 0.98862397, 0.9935918 , 0.99094437, 0.99527918,
       0.99440805, 0.99065124, 0.99488655, 0.99694583, 0.99426268,
       0.9831946 , 0.98709163, 0.96382008, 0.99029744, 0.97259608])

In [35]:
# Show the current prediction next to the target row of sample `i`.
# NOTE(review): this relies on `pred` and `i` left behind by an earlier cell. The
# recorded output shows 26 fractional values per vector, so it presumably comes
# from an earlier version of the targets/model; re-run to confirm.
pred, y_val[i]

(array([1.23883874e-06, 4.38560582e-03, 5.62113566e-03, 4.34981757e-04,
        1.37915759e-05, 1.60435102e-05, 6.15459019e-03, 5.16426915e-03,
        6.25508849e-03, 3.83966453e-03, 3.68702319e-04, 1.52557320e-04,
        1.44995356e-05, 1.01582786e-04, 1.56701221e-01, 3.78149110e-06,
        2.47053659e-01, 3.95344475e-04, 2.64859709e-04, 2.57998431e-05,
        2.71519559e-03, 2.26385942e-04, 3.70902956e-03, 1.02934236e-04,
        2.68017492e-02, 5.55836556e-01]),
 array([0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.19726331,
        0.        , 0.39952827, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.10320842,
        0.3       ]))

In [36]:
# Mean absolute error between the prediction and the target of sample `i`, taken
# only over the entries whose target value is non-zero.
# NOTE(review): `pred` must hold the prediction for that single sample when this runs.
target_row = y_val[i]
nonzero = target_row != 0
np.sum(np.abs(pred[nonzero] - target_row[nonzero])) / target_row[nonzero].shape[0]

0.1313199816354742

In [14]:
# Model output from predict_proba for the whole validation feature matrix.
pred_total = gb.predict_proba(X_val)

In [15]:
# Cross-entropy (as defined by cross_entropy) of the validation probabilities
# against the validation targets, averaged over rows.
cross_entropy(pred_total, y_val)

1.237707421082145