In [1]:
import os

import numpy as np
import pandas as pd

# Root folder of the project data; the cells below read CSVs directly from it
# and cached arrays / submissions from its `intermediate/` and `submission/`
# sub-folders. Set the DS_RICHTER_DATA_DIR environment variable to point the
# notebook at another machine's copy; the fallback is the original location.
DATA_DIR = os.environ.get("DS_RICHTER_DATA_DIR", "/home/gangda/workspace/ds-richter/data")

In [2]:
# Fold count and model versions select which cached prediction files are read.
num_fold = 5
catboost_version = 1
lightgbm_version = 1

# The submission template supplies the building_id index that every test-set
# frame in this notebook is labelled with.
y = pd.read_csv(DATA_DIR + '/submission_format.csv', index_col='building_id')

# Cached test-set outputs of the two three-class models.
y_catboost_test = np.load(
    DATA_DIR + '/intermediate/catboost_{}fold_v{}_test.npy'.format(num_fold, catboost_version)
)
y_lightgbm_test = np.load(
    DATA_DIR + '/intermediate/lightgbm_{}fold_v{}_test.npy'.format(num_fold, lightgbm_version)
)

# One column per class label ("1", "2", "3"), one row per building_id.
y_catboost_df = pd.DataFrame(y_catboost_test, index=y.index, columns=["1", "2", "3"])
y_lightgbm_df = pd.DataFrame(y_lightgbm_test, index=y.index, columns=["1", "2", "3"])

y_lightgbm_df

Unnamed: 0_level_0,1,2,3
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
300051,0.000994,0.278134,0.720872
99355,0.001265,0.916416,0.082320
890251,0.012274,0.790961,0.196765
745817,0.678771,0.319503,0.001727
421793,0.000344,0.186396,0.813260
...,...,...,...
310028,0.066407,0.874372,0.059221
663567,0.001021,0.319639,0.679340
1049160,0.033148,0.833649,0.133203
442785,0.007747,0.843482,0.148771


In [179]:
# Test-set predictions of the two-class models: one pair for labels 1 vs 2,
# another pair for labels 2 vs 3.
y_catboost_1_2_test = np.load(DATA_DIR + '/intermediate/catboost_{}fold_v2_1_2_test.npy'.format(num_fold))
y_lightgbm_1_2_test = np.load(DATA_DIR + '/intermediate/lightgbm_{}fold_1_2_test.npy'.format(num_fold))

# The 2-vs-3 CatBoost predictions are read from a fixed file whose name does not
# contain the fold count, so no `.format(num_fold)` is applied to it.
# NOTE(review): the training-side 2-vs-3 CatBoost file loaded later in this
# notebook is the `catboost_{}fold_v2_2_3` one - confirm the two are meant to differ.
# y_catboost_2_3_test = np.load(DATA_DIR + '/intermediate/catboost_{}fold_v2_2_3_test.npy'.format(num_fold))
y_catboost_2_3_test = np.load(DATA_DIR + '/intermediate/catboost_gscv_test.npy')
y_lightgbm_2_3_test = np.load(DATA_DIR + '/intermediate/lightgbm_{}fold_2_3_test.npy'.format(num_fold))

# CatBoost arrays already have one column per label.
y_catboost_1_2_df = pd.DataFrame(data=y_catboost_1_2_test, columns=["1","2"], index=y.index)
# The LightGBM array is loaded as the single column "2"; column "1" is its complement.
y_lightgbm_1_2_df = pd.DataFrame(data=y_lightgbm_1_2_test, columns=["2"], index=y.index)
y_lightgbm_1_2_df["1"] = 1 - y_lightgbm_1_2_df["2"]
y_lightgbm_1_2_df = y_lightgbm_1_2_df[["1", "2"]]

y_catboost_2_3_df = pd.DataFrame(data=y_catboost_2_3_test, columns=["2","3"], index=y.index)
# Same construction for 2 vs 3: the array is column "3", column "2" is its complement.
y_lightgbm_2_3_df = pd.DataFrame(data=y_lightgbm_2_3_test, columns=["3"], index=y.index)
y_lightgbm_2_3_df["2"] = 1 - y_lightgbm_2_3_df["3"]
y_lightgbm_2_3_df = y_lightgbm_2_3_df[["2","3"]]

In [18]:
import optuna
from functools import partial
from sklearn.metrics import f1_score


class GeneralizedMeanBlender:
    """Blend several models' per-class scores with Optuna-tuned weights and exponent.

    The input ``X`` is a stack of shape ``(n_models, n_samples, n_classes)``.
    Each model's scores are transformed element-wise and combined as a weighted
    sum over the model axis:

    * ``p > 0``:  ``sum_i w_i * X_i ** p``
    * ``p <= 0``: ``sum_i w_i * log1p(X_i)`` (the value of ``p`` itself is not
      used in this branch)

    No final ``1/p`` root is taken, so the output is a score array to take an
    ``argmax`` over rather than a normalised probability.

    Parameters
    ----------
    p_range : tuple of (low, high)
        Bounds handed to ``trial.suggest_float`` for the exponent ``p``.
    seed : int or None, default None
        When given, the search uses ``optuna.samplers.TPESampler(seed=seed)``
        so that ``fit`` is repeatable. ``None`` leaves Optuna's default sampler
        in place.

    Attributes
    ----------
    p : float or None
        Exponent of the best trial; ``None`` until ``fit`` has run.
    weights : numpy.ndarray or None
        Per-model weights of the best trial, scaled to sum to 1; ``None`` until
        ``fit`` has run.
    """

    def __init__(self, p_range=(-2,2), seed=None):
        self.p_range = p_range
        self.seed = seed
        self.p = None
        self.weights = None

    @staticmethod
    def _blend(X, p, weights):
        """Return the weighted, transformed sum of ``X`` over its first (model) axis."""
        X = np.asarray(X)
        # One weight per model, broadcast over the sample and class axes.
        weights = np.reshape(weights, (X.shape[0], 1, 1))
        transformed = np.log1p(X) if p <= 0 else X**p
        return np.sum(transformed * weights, axis=0)

    def _store_best_params(self, best_params):
        """Copy ``p`` and the normalised ``w0..wN`` weights out of a best-params dict."""
        # Always take the exponent that belongs to these weights; keeping an
        # older ``p`` from a previous fit would pair it with unrelated weights.
        self.p = best_params["p"]
        n_weights = sum(1 for key in best_params if key.startswith("w"))
        # Look weights up by index so the result does not depend on dict order.
        weights = np.array([best_params[f"w{i}"] for i in range(n_weights)], dtype=float)
        total = weights.sum()
        if total > 0:
            self.weights = weights / total
        else:
            # All-zero weights cannot be normalised (0/0 -> NaN); use equal weights.
            self.weights = np.full(n_weights, 1.0 / n_weights)

    def _objective(self, trial, X, y):
        """Optuna objective: micro-averaged F1 of the blend for one sampled (p, weights).

        Predicted labels are ``argmax`` over the class axis plus 1, so ``y`` is
        expected to hold labels starting at 1.
        """
        # create hyperparameters
        p = trial.suggest_float("p", *self.p_range)
        weights = [trial.suggest_float(f"w{i}", 0, 1) for i in range(X.shape[0])]

        # blend predictions
        blend_preds = self._blend(X, p, weights)

        y_pred = blend_preds.argmax(axis=1) + 1
        return f1_score(y, y_pred, average='micro')

    def fit(self, X, y, n_trials=10):
        """Search for the ``p`` and weights that maximise the objective and store them.

        Parameters
        ----------
        X : array-like, shape (n_models, n_samples, n_classes)
            Stacked per-model scores.
        y : array-like
            True labels passed to ``f1_score``.
        n_trials : int, default 10
            Number of Optuna trials.
        """
        X = np.asarray(X)
        # optimize objective
        obj = partial(self._objective, X=X, y=y)
        # sampler=None lets Optuna pick its default sampler, as before.
        sampler = optuna.samplers.TPESampler(seed=self.seed) if self.seed is not None else None
        study = optuna.create_study(direction="maximize", sampler=sampler)
        study.optimize(obj, n_trials=n_trials)
        # extract best exponent and weights
        self._store_best_params(study.best_params)

    def transform(self, X):
        """Blend ``X`` with the fitted ``p`` and weights.

        Returns an array of shape ``(n_samples, n_classes)``. Raises
        ``AssertionError`` if called before ``fit``.
        """
        assert self.weights is not None and self.p is not None, \
            "Must call fit method before transform"
        return self._blend(X, self.p, self.weights)

    def fit_transform(self, X, y, **kwargs):
        """Run ``fit(X, y, **kwargs)`` and return ``transform(X)``."""
        self.fit(X, y, **kwargs)
        return self.transform(X)

In [176]:
# Training labels: the full set plus the 1_2 and 2_3 label files.
y_train = pd.read_csv(DATA_DIR + '/train_labels.csv', index_col='building_id')
y_train_1_2 = pd.read_csv(DATA_DIR + '/train_1_2_labels.csv', index_col='building_id')
y_train_2_3 = pd.read_csv(DATA_DIR + '/train_2_3_labels.csv', index_col='building_id')

# Cached training-side predictions of the CatBoost two-class models.
y_catboost_1_2_train = np.load(DATA_DIR + '/intermediate/catboost_{}fold_v2_1_2_train.npy'.format(num_fold))
y_catboost_2_3_train = np.load(DATA_DIR + '/intermediate/catboost_{}fold_v2_2_3_train.npy'.format(num_fold))

# Cached training-side predictions of the LightGBM two-class models.
y_lightgbm_1_2_train = np.load(DATA_DIR + '/intermediate/lightgbm_{}fold_1_2_train.npy'.format(num_fold))
y_lightgbm_2_3_train = np.load(DATA_DIR + '/intermediate/lightgbm_{}fold_2_3_train.npy'.format(num_fold))

# Expand each LightGBM array into two columns: (1 - value, value).
y_lightgbm_1_2_train = np.stack((1 - y_lightgbm_1_2_train, y_lightgbm_1_2_train), axis=1)
y_lightgbm_2_3_train = np.stack((1 - y_lightgbm_2_3_train, y_lightgbm_2_3_train), axis=1)

In [177]:
# Stack the two models' 1-vs-2 training predictions along a new leading axis
# (one slice per model), which is the layout the blender indexes by X.shape[0].
y_train_label = y_train_1_2
y_train_stack = np.stack((y_catboost_1_2_train, y_lightgbm_1_2_train), axis=0)

# Tune the blend exponent and per-model weights over 500 Optuna trials.
gmb = GeneralizedMeanBlender()
gmb.fit(X=y_train_stack, y=y_train_label, n_trials=500)

[32m[I 2023-04-21 01:55:06,862][0m A new study created in memory with name: no-name-a6bce937-31e8-491f-b287-9ea55325f4b1[0m
[32m[I 2023-04-21 01:55:06,909][0m Trial 0 finished with value: 0.9037506560620129 and parameters: {'p': -1.5666370029240864, 'w0': 0.589285206722376, 'w1': 0.13252796501716535}. Best is trial 0 with value: 0.9037506560620129.[0m
[32m[I 2023-04-21 01:55:06,949][0m Trial 1 finished with value: 0.903577628717925 and parameters: {'p': -0.8205385191218664, 'w0': 0.057009969827388374, 'w1': 0.6761952866698765}. Best is trial 0 with value: 0.9037506560620129.[0m
[32m[I 2023-04-21 01:55:06,988][0m Trial 2 finished with value: 0.9036180017648789 and parameters: {'p': -0.4526388914422239, 'w0': 0.26382989812603275, 'w1': 0.030798541898675147}. Best is trial 0 with value: 0.9037506560620129.[0m
[32m[I 2023-04-21 01:55:07,032][0m Trial 3 finished with value: 0.9042351326254593 and parameters: {'p': 1.3730972128739425, 'w0': 0.25863684622795824, 'w1': 0.21030922

In [148]:
# Same procedure for the 2-vs-3 models.
y_train_stack2 = np.stack((y_catboost_2_3_train, y_lightgbm_2_3_train), axis=0)
# Labels are shifted down by one because the blender predicts `argmax + 1`,
# i.e. 1 or 2 for a two-column input (presumably the file holds 2/3 - confirm).
y_train_label2 = y_train_2_3.sub(1)

gmb2 = GeneralizedMeanBlender()
gmb2.fit(X=y_train_stack2, y=y_train_label2, n_trials=500)

[32m[I 2023-04-21 01:26:28,477][0m A new study created in memory with name: no-name-cbe5f4b8-cf54-4424-b38c-f1739eaca111[0m
[32m[I 2023-04-21 01:26:28,540][0m Trial 0 finished with value: 0.7978231419629094 and parameters: {'p': -0.4133019325528151, 'w0': 0.8952897771625595, 'w1': 0.42748363380144416}. Best is trial 0 with value: 0.7978231419629094.[0m
[32m[I 2023-04-21 01:26:28,600][0m Trial 1 finished with value: 0.7977467013763553 and parameters: {'p': -0.7915821836156418, 'w0': 0.7342219100902562, 'w1': 0.5010847694616363}. Best is trial 0 with value: 0.7978231419629094.[0m
[32m[I 2023-04-21 01:26:28,663][0m Trial 2 finished with value: 0.7978146485644033 and parameters: {'p': 0.4620760424104793, 'w0': 0.7199019915350588, 'w1': 0.6050901510378445}. Best is trial 0 with value: 0.7978231419629094.[0m
[32m[I 2023-04-21 01:26:28,723][0m Trial 3 finished with value: 0.7978231419629094 and parameters: {'p': -1.3424344109675435, 'w0': 0.2781801878371797, 'w1': 0.229520914524

In [170]:
# Reference score without any tuning: equal-weight sum of log1p(predictions)
# over the model axis, then argmax + 1 as the predicted label.
y_prob = np.log1p(y_train_stack).sum(axis=0)
y_pred = np.argmax(y_prob, axis=1) + 1
f1_score(y_train_label, y_pred, average='micro')

0.9040851755939163

In [178]:
# Use the previous best submission as the three-class base prediction by
# turning its hard labels into one-hot rows.
mixture_labels = pd.read_csv(DATA_DIR + '/submission/mixture_v1.csv', index_col='building_id')
mixture_codes = mixture_labels.to_numpy().ravel() - 1  # labels 1..3 -> row numbers 0..2 of eye(3)

# The index is re-attached from `y`, not from the CSV, so this relies on both
# files listing buildings in the same order - NOTE(review): confirm.
y_mixture_test = pd.DataFrame(np.eye(3)[mixture_codes], index=y.index, columns=["1", "2", "3"])

# Base used downstream. The alternative kept below is an equal-weight average
# of the two three-class models; trailing numbers are the author's recorded scores.
y_weighted_test = y_mixture_test  # Replace by 0.7534
# y_weighted_test = y_catboost_df.multiply(0.5).add(y_lightgbm_df.multiply(0.5))  # 0.7533

y_weighted_test

Unnamed: 0_level_0,1,2,3
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
300051,0.0,0.0,1.0
99355,0.0,1.0,0.0
890251,0.0,1.0,0.0
745817,1.0,0.0,0.0
421793,0.0,0.0,1.0
...,...,...,...
310028,0.0,1.0,0.0
663567,0.0,0.0,1.0
1049160,0.0,1.0,0.0
442785,0.0,1.0,0.0


In [180]:
# 1 vs 2: blend CatBoost and LightGBM with the exponent/weights fitted in `gmb`.
stacked_1_2_test = np.stack([y_catboost_1_2_df.to_numpy(), y_lightgbm_1_2_df.to_numpy()])
y_weighted_1_2_test = pd.DataFrame(gmb.transform(stacked_1_2_test), index=y.index, columns=["1", "2"])
# Alternatives kept for reference:
# y_weighted_1_2_test = y_catboost_1_2_df.add(y_lightgbm_1_2_df)
# y_weighted_1_2_test = y_catboost_1_2_df.add(y_lightgbm_1_2_df).add(y_weighted_test.drop(columns="3",axis=1))


# 2 vs 3: plain element-wise sum of the two models (the `gmb2` blend is not applied here).
y_weighted_2_3_test = y_catboost_2_3_df + y_lightgbm_2_3_df
# Alternatives kept for reference:
# y_weighted_2_3_test = gmb2.transform(np.stack([y_catboost_2_3_df.to_numpy(), y_lightgbm_2_3_df.to_numpy()]))
# y_weighted_2_3_test = pd.DataFrame(data = y_weighted_2_3_test, columns = ["2","3"], index=y.index)
# y_weighted_2_3_test = y_catboost_2_3_df.add(y_lightgbm_2_3_df).add(y_weighted_test.drop(columns="1",axis=1))

y_weighted_2_3_test

Unnamed: 0_level_0,2,3
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1
300051,0.576232,1.423768
99355,1.855960,0.144040
890251,1.584992,0.415008
745817,1.995695,0.004305
421793,0.369443,1.630557
...,...,...
310028,1.935975,0.064025
663567,0.811214,1.188786
1049160,1.712600,0.287400
442785,1.672679,0.327321


In [181]:
from tqdm import tqdm

# Work on a copy so that filling in the submission does not also modify `y`.
y_submission = y.copy()

# Winning label per building for each predictor, computed once per frame
# instead of once per row. Only the three class columns are used for the base
# prediction, so this cell can be re-run after the pred_* columns written
# below have been added to y_weighted_test.
pred_3_all = y_weighted_test[["1", "2", "3"]].idxmax(axis=1)
pred_1_2_all = y_weighted_1_2_test.loc[pred_3_all.index].idxmax(axis=1)
pred_2_3_all = y_weighted_2_3_test.loc[pred_3_all.index].idxmax(axis=1)

conflicts = 0                       # base says "2", 1-vs-2 says "1" and 2-vs-3 says "3"
changes = {"1":{},"2":{},"3":{}}    # changes[base label][final label] -> count
final_preds = []

label_triples = zip(pred_3_all, pred_1_2_all, pred_2_3_all)
for pred_3, pred_1_2, pred_2_3 in tqdm(label_triples, total=len(pred_3_all)):
    if pred_3 == "1":
        # Base says 1: let the 1-vs-2 blend decide.
        pred = pred_1_2
    elif pred_3 == "3":
        # Base says 3: let the 2-vs-3 blend decide.
        pred = pred_2_3
    elif pred_1_2 == "1":
        if pred_2_3 == "3":
            # The two binary blends pull in opposite directions: keep "2".
            pred = "2"
            conflicts += 1
        else:
            pred = "1"
    else:
        pred = pred_2_3
    if pred != pred_3:
        changes[pred_3][pred] = changes[pred_3].get(pred, 0) + 1
    final_preds.append(pred)

# Write the three label columns in one step each (positional, same row order).
y_weighted_test["pred_1_2"] = pred_1_2_all.to_numpy()
y_weighted_test["pred_2_3"] = pred_2_3_all.to_numpy()
y_weighted_test["pred"] = final_preds

print(conflicts)
print(changes)

y_weighted_test

100%|██████████| 86868/86868 [00:32<00:00, 2653.05it/s]

1
{'1': {'2': 354}, '2': {'3': 665, '1': 260}, '3': {'2': 567}}





Unnamed: 0_level_0,1,2,3,pred_1_2,pred_2_3,pred
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
300051,0.0,0.0,1.0,2,3,3
99355,0.0,1.0,0.0,2,2,2
890251,0.0,1.0,0.0,2,2,2
745817,1.0,0.0,0.0,1,2,1
421793,0.0,0.0,1.0,2,3,3
...,...,...,...,...,...,...
310028,0.0,1.0,0.0,2,2,2
663567,0.0,0.0,1.0,2,3,3
1049160,0.0,1.0,0.0,2,2,2
442785,0.0,1.0,0.0,2,2,2


mix inf
0
{'1': {'2': 356}, '2': {'1': 253}, '3': {}}

lgbm 1.0: 7528
1
{'1': {'2': 241}, '2': {'3': 574, '1': 167}, '3': {'2': 395}}

lgbm 0.5: 7533
1
{'1': {'2': 241}, '2': {'3': 373, '1': 167}, '3': {'2': 273}}

cat 1.0 7522
1
{'1': {'2': 241}, '2': {'3': 592, '1': 167}, '3': {'2': 635}}

cat 0.5
1
{'1': {'2': 241}, '2': {'3': 397, '1': 167}, '3': {'2': 429}}

mix 1.0 7534
1
{'1': {'2': 241}, '2': {'3': 437, '1': 167}, '3': {'2': 381}}

mix 0.5 7533
1
{'1': {'2': 241}, '2': {'1': 167, '3': 314}, '3': {'2': 291}}

mix inf
2
{'1': {'2': 241}, '2': {'3': 660, '1': 166}, '3': {'2': 564}}

In [182]:
# Store the final label per building in the submission frame and save it as CSV.
submission_path = DATA_DIR + '/submission/final_{}.csv'.format('gmb_1_2_tuned_1_2')
y_submission["damage_grade"] = y_weighted_test["pred"]
y_submission.to_csv(submission_path)
y_submission

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,2
745817,1
421793,3
...,...
310028,2
663567,3
1049160,2
442785,2
