In [33]:
import numpy as np
import pandas as pd
import polars as pl

from sklearn.metrics import roc_curve, auc, roc_auc_score

from pathlib import Path

In [34]:
for a, b in set(
    [('model1', 'exp2'),
    ('model1', 'exp2'),
    ('model1', 'exp2'),
    ('model2', 'exp3')]
):
    print(a, b)

model2 exp3
model1 exp2


In [35]:
def score(solution: np.ndarray, submission: np.ndarray, min_tpr: float=0.80) -> float:
    v_gt = abs(solution-1)
    v_pred = np.array([1.0 - x for x in submission])
    max_fpr = abs(1-min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return(partial_auc)

In [36]:
base_path = Path('./')
def read_img_oof():
    oof_df_tsuma_eva_nes = pl.read_parquet(base_path / 'preds_eva_nes.parquet', columns=['isic_id', 'pred']).rename({'pred': 'pred_tsuma_eva_nes'})
    oof_df_tsuma_conv_nes = pl.read_parquet(base_path / 'preds_conv_nes.parquet', columns=['isic_id', 'pred']).rename({'pred': 'pred_tsuma_conv_nes'})
    oof_df_sub_71 = pl.concat([pl.read_csv(base_path / f'sub71/test_results_fold_{fold}.csv') for fold in range(5)]).select(['isic_id', 'pred']).rename({'pred': 'pred_sub_71'})
    oof_df_sub_73 = pl.concat([pl.read_csv(base_path / f'sub73/test_results_fold_{fold}.csv') for fold in range(5)]).select(['isic_id', 'pred']).rename({'pred': 'pred_sub_73'})
    oof_df_sub_75 = pl.concat([pl.read_csv(base_path / f'sub75/test_results_fold_{fold}.csv') for fold in range(5)]).select(['isic_id', 'pred']).rename({'pred': 'pred_sub_75'})
    oof_df_sub_77 = pl.concat([pl.read_csv(base_path / f'sub77/test_results_fold_{fold}.csv') for fold in range(5)]).select(['isic_id', 'pred']).rename({'pred': 'pred_sub_77'})

    # Merge the data
    # train_df = train_df.join(oof_df_tsuma_eva_nes, on='isic_id', how='left')
    train_df = oof_df_tsuma_eva_nes
    train_df = train_df.join(oof_df_tsuma_conv_nes, on='isic_id', how='left')
    train_df = train_df.join(oof_df_sub_71, on='isic_id', how='left')
    train_df = train_df.join(oof_df_sub_73, on='isic_id', how='left')
    train_df = train_df.join(oof_df_sub_75, on='isic_id', how='left')
    train_df = train_df.join(oof_df_sub_77, on='isic_id', how='left')
    train_df = train_df.to_pandas()

    return train_df
    

In [37]:
oof_df1 = pd.read_csv('ensemble_oof_df_20240904.csv')
oof_df2 = pd.read_csv('ensemble_oof_df_20240905.csv')
oof_kanna_attr = pd.read_csv('ensemble_oof_df_with_att.csv').rename(columns={'pred': 'pred_kanna_attr'})
oof_kanna_no_attr = pd.read_csv('ensemble_oof_df_without_att.csv').rename(columns={'pred': 'pred_kanna_no_attr'})
oof_tsuma_attr = pd.read_parquet('preds_tsuma_plain.parquet').rename(columns={'pred': 'pred_tsuma_attr'})
oof_tsuma_no_attr = pd.read_parquet('preds_tsuma_no_att.parquet').rename(columns={'pred': 'pred_tsuma_no_attr'})
img_oof_df = read_img_oof()

oof_df = pd.merge(oof_df1, oof_df2, on=['isic_id', 'target'], how='left')
oof_df = pd.merge(oof_df, img_oof_df, on=['isic_id'], how='left')
oof_df = pd.merge(oof_df, oof_kanna_attr, on=['isic_id', 'target'], how='left')
oof_df = pd.merge(oof_df, oof_kanna_no_attr, on=['isic_id', 'target'], how='left')
oof_df = pd.merge(oof_df, oof_tsuma_attr, on=['isic_id'], how='left')
oof_df = pd.merge(oof_df, oof_tsuma_no_attr, on=['isic_id'], how='left')


In [38]:
oof_df.head()

Unnamed: 0,isic_id,target,pred_xgb_exp68,pred_lgb_exp68,pred_xgb_exp69,pred_lgb_exp69,pred_xgb_exp70,pred_lgb_exp70,pred_xgb_exp71,pred_lgb_exp71,...,pred_tsuma_eva_nes,pred_tsuma_conv_nes,pred_sub_71,pred_sub_73,pred_sub_75,pred_sub_77,pred_kanna_attr,pred_kanna_no_attr,pred_tsuma_attr,pred_tsuma_no_attr
0,ISIC_0015670,0,1.5e-05,4.8e-05,1.2e-05,4.9e-05,1e-05,5e-05,1.2e-05,4.7e-05,...,0.017248,0.023749,3e-06,1.127719e-06,1e-05,1.729415e-07,1.7e-05,1.5e-05,0.000575,0.000756
1,ISIC_0015845,0,0.041703,0.139432,0.026189,0.139875,0.032432,0.100835,0.032224,0.101122,...,0.576921,0.142342,0.001004,0.0001837846,0.003136,0.0002251682,0.045575,0.040704,0.584633,0.660702
2,ISIC_0015864,0,5e-06,1.7e-05,5e-06,1.2e-05,4e-06,1.3e-05,3e-06,1.1e-05,...,0.004238,0.015459,2e-06,7.042271e-08,6e-06,6.3488e-09,9e-06,6e-06,0.000195,0.000253
3,ISIC_0015902,0,2.6e-05,9.2e-05,2.3e-05,9.8e-05,2.3e-05,8.4e-05,3.6e-05,8.6e-05,...,0.004798,0.016119,1e-06,3.237762e-06,1.4e-05,1.834289e-05,7.7e-05,8.3e-05,0.000148,0.000121
4,ISIC_0024200,0,3.7e-05,0.000171,7.9e-05,0.00018,4.3e-05,0.000136,6e-05,0.000135,...,0.073753,0.23749,6e-06,0.0001609308,2.2e-05,2.54135e-06,3.3e-05,3.1e-05,3.7e-05,6.2e-05


In [44]:
feature_cols1 = [
    'pred_xgb_exp69',
    'pred_xgb_exp70',
    'pred_xgb_exp78',
    'pred_lgb_exp78',
    'pred_xgb_exp79',
    'pred_xgb_exp85',
    'pred_xgb_exp86',
    'pred_xgb_exp92',
    'pred_xgb_exp94',
    'pred_lgb_exp94',
    
    'pred_xgb_exp100',
    'pred_xgb_exp104',
    'pred_lgb_exp104',
    'pred_xgb_exp109',
    'pred_xgb_exp116',
    'pred_lgb_exp116',
    'pred_xgb_exp120',

    'pred_tsuma_eva_nes',
    'pred_tsuma_conv_nes',
    'pred_sub_71',
    'pred_sub_73',
    'pred_sub_75',
    'pred_sub_77',

    # 'pred_kanna_attr',
    'pred_tsuma_attr',
]

feature_cols2 = [
    'pred_xgb_exp73',
    'pred_xgb_exp74',
    'pred_xgb_exp82',
    'pred_lgb_exp82',
    'pred_xgb_exp83',
    'pred_xgb_exp89',
    'pred_xgb_exp90',
    'pred_xgb_exp96',
    'pred_xgb_exp98',
    'pred_lgb_exp98',

    'pred_xgb_exp102',
    'pred_xgb_exp106',
    'pred_lgb_exp106',
    'pred_xgb_exp111',
    'pred_xgb_exp118',
    'pred_lgb_exp118',
    'pred_xgb_exp122',

    # 'pred_tsuma_eva_nes',
    # 'pred_tsuma_conv_nes',
    # 'pred_sub_71',
    # 'pred_sub_73',
    # 'pred_sub_75',
    # 'pred_sub_77',

    # 'pred_kanna_no_attr',
    'pred_tsuma_no_attr',
]

feature_cols = sorted(list(set(feature_cols1 + feature_cols2)))
# feature_cols = [col for col in feature_cols if 'xgb' in col]

In [45]:
def rank_averaging_normalized(df, column):
    """
    指定されたカラムに対してrank averagingを行い、最大値を1に正規化する。

    Parameters:
    df (pd.DataFrame): 対象のDataFrame
    column (str): ランク付けを行うカラム名

    Returns:
    pd.Series: 正規化されたランク
    """
    # 平均ランクを計算
    ranks = df[column].rank(method='average')

    # 最大値を1に正規化
    normalized_ranks = ranks / ranks.max()

    return normalized_ranks
    

In [46]:
for col in feature_cols:
    oof_df[col] = rank_averaging_normalized(oof_df, col)

In [47]:
import optuna
import numpy as np

# Optunaのログを無効化
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

def ensemble_auc(weights):
    weighted_preds = np.dot(oof_df[feature_cols], weights)
    pauc = score(oof_df['target'], weighted_preds)
    return -pauc

def objective(trial):
    # 重みのパラメータを作成
    weights = [trial.suggest_float(f"weight_{i}", 0, 1) for i in range(len(feature_cols))]
    
    # 重みの合計で各重みを正規化して合計が1になるように調整
    total_weight = sum(weights)
    normalized_weights = [w / total_weight for w in weights]
    
    return ensemble_auc(normalized_weights)

study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=500)
study.optimize(objective, n_trials=1000)

# 最適化された重みを取得して表示
optimized_weights = [study.best_trial.params[f"weight_{i}"] for i in range(len(feature_cols))]
total_weight = sum(optimized_weights)
normalized_optimized_weights = [w / total_weight for w in optimized_weights]

# 最適化されたスコア（目的関数の値）も表示
optimized_score = -study.best_trial.value

print("Optimized Weights:", normalized_optimized_weights)
print("Optimized Score:", optimized_score)


Optimized Weights: [0.0036466984358633926, 0.004211807972837416, 0.060230724296397246, 2.419774956887695e-05, 0.003181441323493762, 0.01000125082192251, 0.061639476667546864, 0.027484618771729644, 0.004012539685166852, 0.005660457751542802, 0.005927928092242226, 0.004078289375662666, 0.01174279699244935, 0.004111691337695684, 0.0014235356603581727, 0.0075436585080596915, 0.033011910123181526, 0.0002111190450706945, 0.044431298577526264, 0.04527743762802538, 0.060760886196883225, 0.029506920744128425, 0.04996731709093545, 0.035598958043055684, 0.04585339424396361, 0.017125064141501656, 0.006645900214912578, 0.001331573298129166, 0.027228086055112694, 0.06337763396412263, 0.047770082811900316, 0.016463283923177644, 0.011315935368265088, 0.0027871663198685615, 0.01636954066832886, 0.004470421529237921, 0.010176173737847561, 0.03528908879166972, 0.06013207964031161, 0.056604148152965605, 0.060050477086734375, 0.0033229891606062624]
Optimized Score: 0.1838425586284332


In [None]:
sorted(feature_cols)

In [None]:
a = 

In [None]:
result_w = (np.array(a) + np.array(b) + np.array(c)) / 3
result_w

In [48]:
for w, col in zip(normalized_optimized_weights, feature_cols):
    print(w, col)


0.0036466984358633926 pred_lgb_exp104
0.004211807972837416 pred_lgb_exp106
0.060230724296397246 pred_lgb_exp116
2.419774956887695e-05 pred_lgb_exp118
0.003181441323493762 pred_lgb_exp78
0.01000125082192251 pred_lgb_exp82
0.061639476667546864 pred_lgb_exp94
0.027484618771729644 pred_lgb_exp98
0.004012539685166852 pred_sub_71
0.005660457751542802 pred_sub_73
0.005927928092242226 pred_sub_75
0.004078289375662666 pred_sub_77
0.01174279699244935 pred_tsuma_attr
0.004111691337695684 pred_tsuma_conv_nes
0.0014235356603581727 pred_tsuma_eva_nes
0.0075436585080596915 pred_tsuma_no_attr
0.033011910123181526 pred_xgb_exp100
0.0002111190450706945 pred_xgb_exp102
0.044431298577526264 pred_xgb_exp104
0.04527743762802538 pred_xgb_exp106
0.060760886196883225 pred_xgb_exp109
0.029506920744128425 pred_xgb_exp111
0.04996731709093545 pred_xgb_exp116
0.035598958043055684 pred_xgb_exp118
0.04585339424396361 pred_xgb_exp120
0.017125064141501656 pred_xgb_exp122
0.006645900214912578 pred_xgb_exp69
0.0013315732

In [336]:
a = [0.06352687811380262, 0.02283959176887256, 0.06257184624403947, 0.005754200590161534, 0.08086339127137905, 0.0025267147184888455, 0.07155105453900618, 0.09501158498443575, 0.05350670164043835, 0.03061920961908998, 0.06352214433793489, 0.08602909363135221, 0.014415845685898107, 0.07082832457872726, 0.082427698721928, 0.06710373444687946, 0.04734685285731679, 0.0048184624740187434, 0.008542280567235874, 0.0018374644091975103, 0.016486611445669267, 0.00804025102540549, 0.01904550973208852, 0.020784552596633782]
# Optimized Score: 0.18375263329050767
b = [0.024961511769743833, 0.04351962470044531, 0.05639721330046738, 0.01609041376785471, 0.08905821177894489, 0.06116494665421514, 0.005378478803026167, 0.09825598365246625, 0.09792718182871639, 0.06793737833944119, 0.0076496655402115395, 0.10687111101074899, 0.018193269703697175, 0.07825039807755532, 0.01397199890511219, 0.052000706182423556, 0.10300372042635919, 0.016815631948737805, 0.010984509794901388, 0.001331914858524622, 0.009974580864481788, 4.720819791891561e-05, 0.008165447095982912, 0.012048892798023073]
# Optimized Score: 0.18382430657535356
c = [0.06873657053566712, 0.048556449561692, 0.046040723605825444, 0.008220297104795276, 0.11101150356051459, 0.009037389658546191, 0.04265601920277655, 0.05981707277179553, 0.10816582426368399, 0.018322234487586922, 0.018511388905386318, 0.08699223276381375, 0.005773344426873809, 0.0819680172879521, 0.09961666837679822, 0.040950410694635196, 0.09894378052427596, 3.2064072542117004e-05, 0.008515037952776285, 0.009911458975446718, 0.006465642326186605, 0.004670331765533566, 0.008690538500531723, 0.00839499867436401]
# Optimized Score: 0.1839025630467764

In [337]:
result_w = (np.array(a) + np.array(b) + np.array(c)) / 3
result_w

array([0.05240832, 0.03830522, 0.05500326, 0.01002164, 0.09364437,
       0.02424302, 0.03986185, 0.08436155, 0.08653324, 0.03895961,
       0.0298944 , 0.09329748, 0.01279415, 0.07701558, 0.06533879,
       0.05335162, 0.08309812, 0.00722205, 0.00934728, 0.00436028,
       0.01097561, 0.0042526 , 0.01196717, 0.01374281])

In [338]:
final_pred = np.zeros(len(oof_df))
for w, col in zip(result_w, feature_cols):
    final_pred += w * oof_df[col]
score(oof_df['target'], final_pred)
    

0.18384709560363163

In [315]:
a = [0.03417042449606698, 0.01150408747431351, 0.03738374796137772, 0.06063048724820239, 0.047850039054448575, 0.01249970532100278, 0.0003445341148379346, 0.07664088693599194, 0.09635888177359535, 0.01991076155450156, 0.02614658373490151, 0.04011790841491652, 0.007696151490654544, 0.0814682962748052, 0.0395516634381202, 0.11464587998039259, 0.11203945332446999, 0.013545755743810919, 0.0033677438455063154, 0.0026253746084748137, 0.002844093027485632, 0.015882583659727056, 0.019588938217354277, 0.11450689878685112, 0.008679119518190765]
# Optimized Score: 0.18399161960221724
b = [0.06843261196781632, 0.03496252494142428, 0.059854611041720684, 0.008308614163077864, 0.027800529572409693, 0.06478545223336332, 0.03259317635941183, 0.04884154461933212, 0.06329680373145112, 0.014457039187228378, 0.027204730549850273, 0.07668657856186602, 0.05125537194944305, 0.0731125742069751, 0.07762442419485975, 0.0704441747058285, 0.07315971310379578, 0.007587899371908108, 0.007186010540098669, 0.0018697830386352533, 0.014540648885001551, 0.013939079735409513, 0.003865354498103163, 0.07327248601528301, 0.004918262825706235]
# Optimized Score: 0.18386093515619645
c = [0.0876891214210251, 0.0114344147674044, 0.08988158372382228, 0.012522294004636132, 0.027526554278416555, 0.01270066180774577, 0.09030380200229599, 0.076010103735, 0.06842794816903655, 0.019581089894351967, 0.001847218793180227, 0.09480250649722231, 0.02514360259644122, 0.06334413552877768, 0.07885910815308099, 0.04404141881455133, 0.03856658759555437, 0.007499850884506298, 0.0019487988521012377, 0.0017501801377167601, 0.010982524838634666, 0.015498297286996279, 0.01410166274203436, 0.09057152061937027, 0.014965012856097176]
# Optimized Score: 0.18396830854235835


In [319]:
result_w = (np.array(a) + np.array(b) + np.array(c)) / 3
result_w

array([0.06343072, 0.01930034, 0.06237331, 0.0271538 , 0.03439237,
       0.02999527, 0.0410805 , 0.06716418, 0.07602788, 0.01798296,
       0.01839951, 0.07053566, 0.02803171, 0.07264167, 0.06534507,
       0.07637716, 0.07458858, 0.0095445 , 0.00416752, 0.00208178,
       0.00945576, 0.01510665, 0.01251865, 0.09278364, 0.0095208 ])

In [318]:
for w, col in zip(result_w, feature_cols):
    print(w, col)


0.06343071929496946 pred_xgb_exp69
0.01930034239438073 pred_xgb_exp70
0.06237331424230689 pred_xgb_exp78
0.02715379847197213 pred_lgb_exp78
0.03439237430175827 pred_xgb_exp79
0.02999527312070396 pred_xgb_exp85
0.041080504158848584 pred_xgb_exp86
0.06716417843010802 pred_xgb_exp92
0.076027877891361 pred_xgb_exp94
0.017982963545360636 pred_lgb_exp94
0.018399511025977335 pred_xgb_exp100
0.07053566449133496 pred_xgb_exp104
0.028031708678846273 pred_lgb_exp104
0.072641668670186 pred_xgb_exp109
0.06534506526202032 pred_xgb_exp116
0.0763771578335908 pred_lgb_exp116
0.07458858467460672 pred_xgb_exp120
0.009544502000075108 pred_tsuma_eva_nes
0.004167517745902074 pred_tsuma_conv_nes
0.002081779261608942 pred_sub_71
0.009455755583707282 pred_sub_73
0.015106653560710948 pred_sub_75
0.012518651819163935 pred_sub_77
0.09278363514050147 pred_kanna_attr
0.009520798399998058 pred_tsuma_attr
