In [1]:
from typing import Tuple
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
import pandas as pd

import numpy as np

import torch

from torch import nn
from torch.utils.data import Dataset, DataLoader

from xgboost import XGBRegressor
import wandb

In [2]:
# df = pd.concat([pd.read_csv("more-elements/more-elements.csv"), pd.read_csv("new-more-data/new-more-data.csv")])
# df
# df = pd.read_pickle("half_data.pkl")
# df
train_df = pd.read_pickle("half_data.pkl")
test_df = pd.read_pickle("test_proc.pkl")

In [3]:
print(len(df))
df = df[(df[[str(i) for i in range(1001)]] > 0).all(axis=1)]
print(len(df))

183553
183553


In [4]:
len(df[(df["element_1"] == 0) | (df["element_2"] == 0) | (df["element_3"] == 0)]) / len(df)

0.11451733286843582

In [3]:
class CustomCatboostSpectraDataset(Dataset):
    def __init__(self, data: pd.DataFrame, device="cuda:0") -> None:
        self.data = data
        self.elements = self.data["element_1"].unique()
        self.air_ratios = data.air_ratio.to_numpy(dtype=np.float64)

        self.spectras = self.data[[str(i) for i in range(1001)]].to_numpy(
            dtype=np.float64
        )

        self.ratios = self.data[
            ["element_1_ratio", "element_2_ratio", "element_3_ratio"]
        ].to_numpy(dtype=np.float64)
        self.element_indices = self.data[
            ["element_1", "element_2", "element_3"]
        ].to_numpy(dtype=np.int32)

        self.spectras = self.spectras[~np.isnan(self.spectras).any(axis=1)]

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
        spectra = np.log(self.spectras[idx])

        elements_distribution = np.zeros(
            [len(self.elements) + 1], dtype=np.float64
        )
        indices = self.element_indices[idx, :]
        indices = indices[indices != -1]
        elements_distribution[indices] = self.ratios[idx][range(indices.shape[0])] * (1 - self.air_ratios[idx])
        elements_distribution[-1] = self.air_ratios[idx]

        return spectra, elements_distribution

In [30]:
# train_df, val_df = train_test_split(df, test_size=0.15, random_state=42)

In [5]:
train_dataset = CustomCatboostSpectraDataset(train_df)
val_dataset = CustomCatboostSpectraDataset(test_df)

In [6]:
X_train, y_train = np.array(
    [train_dataset[i][0] for i in range(len(train_dataset))]
), np.array([train_dataset[i][1] for i in range(len(train_dataset))])



In [7]:
gb = MultiOutputRegressor(
    XGBRegressor(
        n_estimators=1000,
        device="cuda",
        learning_rate=0.5,
        objective="reg:logistic",
    )
)

gb.fit(X=X_train, y=y_train)

In [8]:
X_val, y_val = np.array([val_dataset[i][0] for i in range(len(val_dataset))]), np.array(
    [val_dataset[i][1] for i in range(len(val_dataset))]
)

  spectra = np.log(self.spectras[idx])


In [9]:
from sklearn.metrics import (
    mean_absolute_error
)

In [10]:
def cross_entropy(predictions, targets, epsilon=1e-12):
    """
    Computes cross entropy between targets (encoded as one-hot vectors)
    and predictions.
    Input: predictions (N, k) ndarray
           targets (N, k) ndarray
    Returns: scalar
    """
    predictions = np.clip(predictions, epsilon, 1.0 - epsilon)
    N = predictions.shape[0]
    ce = -np.sum(targets * np.log(predictions + 1e-9)) / N
    return ce

In [11]:
# n_estimators=1000,                    # BEST!!!
# device="cuda",
# learning_rate=0.5,
# objective="reg:logistic"",
pred = gb.predict(X_val)
print(f"MAE: {mean_absolute_error(y_val, pred)}")
print(f"cross_entropy: {cross_entropy(y_val, pred)}")

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




MAE: 0.005210967055080564
cross_entropy: 1.326221330203524


In [13]:
# n_estimators=1000,                    # BEST!!!
# device="cuda",
# learning_rate=0.5,
# objective="reg:logistic"",
pred = gb.predict(X_val)
print(f"MAE: {mean_absolute_error(y_val, pred, multioutput='raw_values')}")
print(f"cross_entropy: {cross_entropy(y_val, pred)}")

MAE: [0.00145122 0.01157944 0.01144963 0.00139811 0.00131279 0.0018197
 0.00486937 0.00327795 0.01278296 0.00261025 0.00202304 0.0018851
 0.0021728  0.00205734 0.0009154  0.00238492 0.00118048 0.00163402
 0.00136707 0.0022042  0.00190258 0.00229062 0.00207384 0.00166191
 0.00187639 0.05530398]
cross_entropy: 1.326221330203524


In [119]:
dummy_pred = np.ones(y_val[0].shape[0])# np.random.rand(y_val[0].shape[0])
dummy_pred = dummy_pred / np.sum(dummy_pred)

mean_absolute_error(dummy_pred, y_val[1234])

0.06508875739644972

In [21]:
gb = MultiOutputRegressor(
    XGBRegressor(
        n_estimators=1000,
        device="cuda",
        learning_rate=0.1,
        objective="reg:logistic",
    )
)

gb.fit(X=X_train, y=y_train)
pred = gb.predict(X_val)
print(f"MAE: {mean_absolute_error(y_val, pred)}")
print(f"cross_entropy: {cross_entropy(y_val, pred)}")

MAE: 0.009336816871054577
cross_entropy: 2.000867025936605


In [22]:
gb = MultiOutputRegressor(
    XGBRegressor(
        n_estimators=1000,
        device="cuda",
        learning_rate=0.01,
        objective="reg:logistic",
    )
)

gb.fit(X=X_train, y=y_train)
pred = gb.predict(X_val)
print(f"MAE: {mean_absolute_error(y_val, pred)}")
print(f"cross_entropy: {cross_entropy(y_val, pred)}")

MAE: 0.013161710913389404
cross_entropy: 2.787678108017829


In [23]:
gb = MultiOutputRegressor(
    XGBRegressor(
        n_estimators=1000,
        device="cuda",
        learning_rate=0.7,
        objective="reg:logistic",
    )
)

gb.fit(X=X_train, y=y_train)
pred = gb.predict(X_val)
print(f"MAE: {mean_absolute_error(y_val, pred)}")
print(f"cross_entropy: {cross_entropy(y_val, pred)}")

MAE: 0.010297508163170335
cross_entropy: 1.5176706056273925


In [24]:
gb = MultiOutputRegressor(
    XGBRegressor(
        n_estimators=1000,
        device="cuda",
        learning_rate=0.7,
        objective="reg:logistic",
        **{"lambda": 10},
    )
)

gb.fit(X=X_train, y=y_train)
pred = gb.predict(X_val)
print(f"MAE: {mean_absolute_error(y_val, pred)}")
print(f"cross_entropy: {cross_entropy(y_val, pred)}")

MAE: 0.009804583388589295
cross_entropy: 1.5694708058239422


In [25]:
gb = MultiOutputRegressor(
    XGBRegressor(
        n_estimators=500,
        device="cuda",
        learning_rate=0.1,
        objective="reg:logistic",
    )
)

gb.fit(X=X_train, y=y_train)
pred = gb.predict(X_val)
print(f"MAE: {mean_absolute_error(y_val, pred)}")
print(f"cross_entropy: {cross_entropy(y_val, pred)}")

MAE: 0.010147301780857343
cross_entropy: 2.209400935655668


In [26]:
gb = MultiOutputRegressor(
    XGBRegressor(
        n_estimators=500,
        device="cuda",
        learning_rate=0.5,
        objective="reg:logistic",
    )
)

gb.fit(X=X_train, y=y_train)
pred = gb.predict(X_val)
print(f"MAE: {mean_absolute_error(y_val, pred)}")
print(f"cross_entropy: {cross_entropy(y_val, pred)}")

MAE: 0.009786270569905808
cross_entropy: 1.7560236509987506


In [28]:
gb = MultiOutputRegressor(
    XGBRegressor(
        n_estimators=500,
        device="cuda",
        learning_rate=0.5,
        objective="reg:logistic",
    )
)

gb.fit(X=X_train, y=y_train)
pred = gb.predict(X_val)
print(f"MAE: {mean_absolute_error(y_val, pred)}")
print(f"cross_entropy: {cross_entropy(y_val, pred)}")

MAE: 0.009786270569905808
cross_entropy: 1.7560236509987506


In [27]:
gb = MultiOutputRegressor(
    XGBRegressor(
        n_estimators=1000,
        device="cuda",
        learning_rate=0.5,
        objective="reg:logistic",
        **{"alpha": 10},
    )
)

gb.fit(X=X_train, y=y_train)
pred = gb.predict(X_val)
print(f"MAE: {mean_absolute_error(y_val, pred)}")
print(f"cross_entropy: {cross_entropy(y_val, pred)}")

MAE: 0.011086385435772832
cross_entropy: 2.2264376980979104


In [36]:
np.sum(np.abs(pred[y_val[i] != 0] - y_val[i][y_val[i] != 0])) / y_val[i][y_val[i] != 0].shape[0]

0.1313199816354742

In [14]:
pred_total = gb.predict_proba(X_val)

In [15]:
cross_entropy(pred_total, y_val)

1.237707421082145