# LB:(20240608_miyaki_5.json) 

In [10]:
import os 
import json
import warnings
import numpy as np
import pandas as pd
import multiprocessing
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoost,Pool

import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GroupShuffleSplit
from sklearn.metrics import recall_score, precision_score, roc_auc_score,confusion_matrix

CORE_NUM = multiprocessing.cpu_count()
device = "cuda" if torch.cuda.is_available() else "cpu"
warnings.simplefilter('ignore')
os.makedirs("../submit/", exist_ok=True)

print(f'multi:{CORE_NUM} | device:{device}')

multi:8 | device:cuda


In [7]:
df_master = pd.read_parquet('../test_data/cleaned_pid_to_info_all_v6.parquet')
df_master = df_master.rename(columns={'id':'paper_id'})
df_train_master = pd.read_parquet('../test_data/train_author.parquet')
df_test_master = pd.read_parquet('../test_data/ind_test_author_filter_public.parquet')
print(f'train:{df_train_master.shape} | test:{df_test_master.shape}')

train:(779, 4) | test:(515, 3)


In [8]:
def get_ids(df, proc):
    merge_keys = ['author_id','paper_id','label']
    if proc == 'train':
        df_normal = df.explode('normal_data')[['id','normal_data']]
        df_normal.columns = ['id','paper_id']
        df_normal['label'] = 1
        df_outliers = df.explode('outliers')[['id','outliers']]
        df_outliers.columns = ['id','paper_id']
        df_outliers['label'] = 0
        df = pd.concat([df_normal,df_outliers])
        df.columns = ['author_id','paper_id','label']
    elif proc == 'test':
        df = df.explode('papers')[['id','papers']]
        df['label'] = 0
        df.columns = ['author_id','paper_id','label']
    return df[merge_keys]
    
def get_feature(df, proc):
    merge_keys = ['author_id','paper_id','label']
    
    df_glm = pd.read_parquet(f'../test_feature/{proc}_glm.parquet')
    df = df.merge(df_glm, on=merge_keys, how='left')
    
    df_sci = pd.read_parquet(f'../test_feature/{proc}_scibert_nli.parquet')
    df = df.merge(df_sci, on=merge_keys, how='left')

    df_e5 = pd.read_parquet(f'../test_feature/{proc}_multilingual_e5_large.parquet')
    df = df.merge(df_e5, on=merge_keys, how='left')

    df_minilm = pd.read_parquet(f'../test_feature/{proc}_minilm.parquet')
    df = df.merge(df_minilm, on=merge_keys, how='left')

    df_deb = pd.read_parquet(f'../test_feature/{proc}_deberta.parquet')
    df = df.merge(df_deb, on=merge_keys, how='left')

    df_w2v = pd.read_parquet(f'../test_feature/{proc}_w2v.parquet')
    df = df.merge(df_w2v, on=merge_keys, how='left')

    df_oag = pd.read_parquet(f'../test_feature/{proc}_oag_bert.parquet')
    df = df.merge(df_oag, on=merge_keys, how='left')

    df_tfidf = pd.read_parquet(f'../test_feature/{proc}_tfidf.parquet')
    df = df.merge(df_tfidf, on=merge_keys, how='left')

    df_graph = pd.read_parquet(f'../test_feature/{proc}_graph.parquet')
    df = df.merge(df_graph, on=merge_keys, how='left')

    df_jaccard = pd.read_parquet(f'../test_feature/{proc}_jaccard.parquet')
    df = df.merge(df_jaccard, on=merge_keys, how='left')

    df_basic = pd.read_parquet(f'../test_feature/{proc}_basic.parquet')
    df = df.merge(df_basic, on=merge_keys, how='left')
    
    # label data
    label_col = [col for col in df.columns if '_label' in col]
    df[label_col] = df[label_col].replace(-1, 0)
    prefix_list = list(set([p.split('_')[0] for p in label_col]))
    for p in prefix_list:
        cols = [col for col in label_col if p in col]
        df[f'{p}_label_mean'] = df[cols].sum(axis=1)/len(cols)
        df[f'{p}_label_mean'] = df[f'{p}_label_mean'].apply(lambda x : 1 / (1 + np.exp(-x)))
    type_list = list(set([p.split('_')[1] for p in label_col]))
    for t in type_list:
        cols = [col for col in label_col if t in col]
        df[f'{t}_label_mean'] = df[cols].sum(axis=1)/len(cols)
        df[f'{t}_label_mean'] = df[f'{t}_label_mean'].apply(lambda x : 1 / (1 + np.exp(-x)))
    # pred data
    pred_col = [col for col in df.columns if '_pred' in col]
    suffix_col =  list(set([p.split('_')[-1] for p in pred_col]))     
    for p in prefix_list:
        for s in suffix_col:
            cols = [col for col in pred_col if p in col]
            cols = [col for col in cols if s in col]
            if len(cols)>1:
                df[f'{p}_pred_mean_{s}'] = df[cols].sum(axis=1)/len(cols)
                df[f'{p}_pred_mean_{s}'] = df[f'{p}_pred_mean_{s}'].apply(lambda x : 1 / (1 + np.exp(-x)))
    feature_col = [col for col in df.columns.tolist() if col not in merge_keys]    
    return df,feature_col
df_train = get_ids(df_train_master, 'train')
df_train, feature_col = get_feature(df_train, 'train')
df_test = get_ids(df_test_master, 'test')
df_test, _ = get_feature(df_test, 'test')
print(f'train:{df_train[feature_col].shape} | test:{df_test[feature_col].shape}')

train:(148309, 1158) | test:(116262, 1158)


In [4]:
# 無意味な特徴量の削除
def drop_noise(df, f_col):
    one_value_cols = [col for col in f_col if df[col].nunique() <= 1]
    many_null_cols = [col for col in f_col if df[col].isnull().sum() / df_train.shape[0] > 0.9]
    noise_cols = list(set(many_null_cols + one_value_cols))
    return noise_cols
train_noise_cols = drop_noise(df_train, feature_col)
test_noise_cols = drop_noise(df_test, feature_col)
noise_cols = list(set(train_noise_cols + test_noise_cols))
before_num = len(feature_col)
feature_col = [col for col in feature_col if col not in noise_cols]
after_num = len(feature_col)
print(f'削減前:{before_num} | 削減後:{after_num}')
print(f'削除：{noise_cols}')

削減前:1158 | 削減後:1144
削除：['deberta_title_cos_min', 'e5_venue_cos_min', 'mini_pred_mean_gm', 'deberta_venue_cos_min', 'deberta_all_cos_min', 'e5_all_cos_min', 'e5_abstract_cos_min', 'deberta_abstract_cos_min', 'e5_title_cos_min', 'deberta_keywords_cos_min', 'deberta_pred_mean_gm', 'sci_pred_mean_gm', 'e5_pred_mean_gm', 'e5_keywords_cos_min']


# Local Test

In [5]:
# 学習・検証・テストデータ作成
groups = df_train['author_id']
X = df_train.set_index(['author_id','paper_id'])
y = X['label']
X = X[feature_col]
# テストデータ作成
gss = GroupShuffleSplit(n_splits=1, test_size=0.3,random_state=85)
train_idxs,test_idxs = next(gss.split(X=X, y=y, groups=groups))
X, X_test, y, y_test = X.iloc[train_idxs],X.iloc[test_idxs], y.iloc[train_idxs],y.iloc[test_idxs]
# 学習・検証データ作成
X_train, X_valid, y_train, y_valid = train_test_split(
    X, 
    y, 
    test_size=0.1,
    random_state=1,
    shuffle=True,
    stratify=y
)
print(f'train:{X_train.shape} 負例:{len(y_train[y_train==0])} | valid:{X_valid.shape} 負例:{len(y_valid[y_valid==0])} | test:{X_test.shape} 負例:{len(y_test[y_test==0])}')

train:(97102, 1144) 負例:11320 | valid:(10790, 1144) 負例:1258 | test:(40417, 1144) 負例:4707


# 評価関数

In [6]:
# 評価関数
def get_evaluate(y_test, predict):
    auc = roc_auc_score(y_test, predict)
    predict = np.where(predict < 0.5, 0, 1)
    precision = precision_score(y_test, predict)
    recall = recall_score(y_test, predict)      
    return auc, precision, recall

def compute_score(target: np.ndarray,pred: np.ndarray,groups: np.ndarray) -> float:
    result_df = pd.DataFrame({"pred": pred, "target": target, "group": groups})
    total_errors = len(result_df[result_df["target"]==0])
    score = 0.0
    for _, gdf in result_df.groupby("group"):
        weight = len(gdf[gdf["target"]==0]) / total_errors
        auc = roc_auc_score(gdf["target"], gdf["pred"])
        score += auc * weight
    return score
    
def eval(y_pred, X_test, y_test):
    local_score = compute_score(y_test.values, np.array(y_pred), y_test.index.get_level_values('author_id').tolist())
    auc, precision, recall = get_evaluate(y_test, y_pred)
    print(f'LOCAL SCORE:{local_score}')
    print(f'AUC:{auc} | PRECISION:{precision} | RECALL:{recall}')
    
    y_pred = np.where(y_pred < 0.5, 0, 1)
    test = X_test.copy()
    test['pred'] = y_pred
    test['label'] = y_test.values
    test = test.reset_index()
    tmp = []
    for author_id, group in test.groupby('author_id'):
        group.loc[group['label'] == group['pred'], 'flag'] = 1
        group.loc[group['label'] != group['pred'], 'flag'] = 0
        error = (len(group[group['flag']==0])/len(group))*100
        group['error'] = error
        group = group[['author_id','paper_id','pred','label','flag','error']]
        tmp.append(group)
    test = pd.concat(tmp)
    test = test.merge(df_master,on='paper_id',how='left')
    test = test.sort_values('error',ascending=False)
    cm = confusion_matrix(y_test.values, y_pred)
    print(cm)
    return local_score, test

# モデル定義

In [7]:
def nn_train(n, X_train, X_valid, y_train, y_valid, X_test, y_test=None, log=False):    
    # 欠損値補完
    df_mean = X_train.mean(numeric_only=True)
    X_train_nn = X_train.fillna(df_mean)
    X_valid_nn = X_valid.fillna(df_mean)
    X_test_nn = X_test.fillna(df_mean)
    # 次元圧縮
    dim=512
    pca = PCA(n_components=dim)
    pca.fit(X_train_nn)
    X_train_nn = pca.transform(X_train_nn)
    X_valid_nn = pca.transform(X_valid_nn)
    X_test_nn = pca.transform(X_test_nn)
    # 標準化
    ss = StandardScaler()
    X_train_nn = ss.fit_transform(X_train_nn)
    X_valid_nn = ss.transform(X_valid_nn)
    X_test_nn = ss.transform(X_test_nn) 
    # モデルの構築
    model = Sequential([
        Dense(256, input_dim=X_train_nn.shape[1], activation='relu'),
        BatchNormalization(),
        Dropout(0.6),
        Dense(64, input_dim=X_train_nn.shape[1], activation='relu'),
        BatchNormalization(),
        Dropout(0.4),
        Dense(1, activation='sigmoid')
    ])
    if log:
        model.summary()
    early_stopping = EarlyStopping(monitor='val_auc', patience=10, mode='max', restore_best_weights=True)

    # モデルのコンパイル
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss=BinaryCrossentropy(),
                  metrics=[AUC(name='auc')])
    # モデルのトレーニング
    history = model.fit(X_train_nn, y_train,
                        epochs=1000,
                        batch_size=256,
                        validation_data=(X_valid_nn, y_valid),
                        verbose=10,
                        callbacks=[early_stopping]
                       )
    y_pred = model.predict(X_test_nn).flatten()
    del model
    torch.cuda.empty_cache()
    if y_test is not None:
        _, test = eval(y_pred, X_test, y_test)
        return y_pred
    else:
        return y_pred
nn_pred = nn_train(1, X_train, X_valid, y_train, y_valid, X_test, y_test, True)

2024-06-08 02:13:49.214681: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-08 02:13:49.224103: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-08 02:13:49.227081: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Epoch 1/1000


I0000 00:00:1717812831.648759    7533 service.cc:145] XLA service 0x7f8668004b40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1717812831.648792    7533 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2024-06-08 02:13:51.735622: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-06-08 02:13:52.072932: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8905
I0000 00:00:1717812835.081423    7533 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
LOCAL SCORE:0.7423929586535796
AUC:0.8814220400308246 | PRECISION:0.9169892812936188 | RECALL:0.9750490058807056
[[ 1555  3152]
 [  891 34819]]


In [8]:
def tab_train(n, X_train, X_valid, y_train, y_valid, X_test, y_test=None, log=False):
    # 事前学習
    # f_feature = [col for col in feature_col if col not in null_col]
    df_mean = X_train.mean(numeric_only=True)
    X_train_tab = X_train.fillna(df_mean)
    X_valid_tab = X_valid.fillna(df_mean)
    X_test_tab = X_test.fillna(df_mean)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    unsupervised_model = TabNetPretrainer(
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2),
        device_name = device,
        mask_type='entmax',
        verbose = 5,
    )
    unsupervised_model.fit(
        X_train_tab.values,
        eval_set=[X_valid_tab.values],
        batch_size = 128,
        pretraining_ratio=0.8,
        # num_workers=CORE_NUM-1,
    )
    # 本学習
    model = TabNetClassifier(
        # n_d=16,
        # n_a=16,
        # n_steps=4, 
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2),
        device_name = device,
        verbose = 10,
        seed = n
    )
    model.fit(
        X_train_tab.values, y_train.values,
        eval_set = [(X_valid_tab.values, y_valid.values)],
        eval_metric= ['auc'],
        eval_name = ['valid'],
        batch_size = 128,
        max_epochs = 100,
        patience = 5,
    num_workers=CORE_NUM-1,
        from_unsupervised=unsupervised_model
    )
    y_pred = model.predict_proba(X_test_tab.values)[:, 1]
    if y_test is not None:
        _, test = eval(y_pred, X_test, y_test)
        return y_pred
    else:
        return y_pred

tab_pred = tab_train(1, X_train, X_valid, y_train, y_valid, X_test, y_test, False)

epoch 0  | loss: 77206407075.53612| val_0_unsup_loss_numpy: 26116888.0|  0:00:20s
epoch 5  | loss: 3706268906.61633| val_0_unsup_loss_numpy: 18463060.0|  0:01:56s
epoch 10 | loss: 3176558105.04447| val_0_unsup_loss_numpy: 2006614.625|  0:03:32s
epoch 15 | loss: 1511897929.81026| val_0_unsup_loss_numpy: 77266784.0|  0:05:07s
epoch 20 | loss: 1598853398.83735| val_0_unsup_loss_numpy: 2119974528.0|  0:06:43s

Early stopping occurred at epoch 21 with best_epoch = 11 and best_val_0_unsup_loss_numpy = 613729.875
epoch 0  | loss: 0.33211 | valid_auc: 0.82717 |  0:00:16s
epoch 10 | loss: 0.14663 | valid_auc: 0.9492  |  0:02:59s

Early stopping occurred at epoch 18 with best_epoch = 13 and best_valid_auc = 0.95446
LOCAL SCORE:0.7579040097882414
AUC:0.8841916062857222 | PRECISION:0.9286144413423113 | RECALL:0.9500420050406049
[[ 2099  2608]
 [ 1784 33926]]


In [9]:
def rf_train(n, X_train, X_valid, y_train, y_valid, X_test, y_test=None, log=False):
    print(f'TRAIN SHAPE:{X_train.shape}')
    random_forest_model = RandomForestClassifier(n_estimators=150,random_state=42, max_depth=10)
    random_forest_model.fit(X_train[feature_col], y_train)
    y_pred = random_forest_model.predict_proba(X_test[feature_col])[:, 1]
    if y_test is not None:
        _, test = eval(y_pred, X_test, y_test)
        return y_pred
    else:
        return y_pred
rf_pred = rf_train(1, X_train, X_valid, y_train, y_valid, X_test, y_test, False)

TRAIN SHAPE:(97102, 1144)
LOCAL SCORE:0.7799972824759734
AUC:0.8911495667986638 | PRECISION:0.9105442880794702 | RECALL:0.9856622794735368
[[ 1249  3458]
 [  512 35198]]


In [10]:
def lgbm_train(n, X_train, X_valid, y_train, y_valid, X_test, y_test=None, log=False):
    print(f'TRAIN SHAPE:{X_train.shape}')
    lgb_train = lgb.Dataset(
        X_train, 
        y_train
    )
    lgb_eval = lgb.Dataset(
        X_valid, 
        y_valid, 
        reference=lgb_train,
    )
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'max_depth': 7,
        'num_leaves': 26,
        'learning_rate': 0.02,
        'verbosity': -1,
        'random_state': n,
        'n_jobs':CORE_NUM - 1
    }
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_eval],
        num_boost_round=10000,
        callbacks=[
            lgb.early_stopping(
                stopping_rounds=10,
                first_metric_only=True, 
                verbose=True),
            lgb.log_evaluation(1000)
        ]
    )
    importance = pd.DataFrame(model.feature_importance(), index=X_train.columns.tolist(), columns=['importance'])
    importance = importance.sort_values('importance',ascending=False)
    if log:
        print(f'NUM_OF_PARAMS:{len(model.params)}')
        print(f'PARAMS:{model.params}')
        display(importance.head())
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    if y_test is not None:
        _, test = eval(y_pred, X_test, y_test)
        return y_pred, importance, test, model.params, model
    else:
        return y_pred, importance

lgb_pred, lgb_imp, lgb_error, params, model = lgbm_train(1, X_train, X_valid, y_train, y_valid, X_test, y_test, False)

TRAIN SHAPE:(97102, 1144)
Training until validation scores don't improve for 10 rounds
[1000]	valid_0's auc: 0.976749
Early stopping, best iteration is:
[1448]	valid_0's auc: 0.978881
Evaluated only: auc
LOCAL SCORE:0.8026274278914615
AUC:0.9161377618979032 | PRECISION:0.9303740062956837 | RECALL:0.9766451974236908
[[ 2097  2610]
 [  834 34876]]


In [15]:
def catboost_train(n,X_train, X_valid, y_train, y_valid, X_test, y_test=None,log=False):
    train_pool = Pool(X_train, label=y_train)
    valid_pool = Pool(X_valid, label=y_valid)
    params = {
        # タスク設定と損失関数
        'loss_function': 'Logloss',
        'eval_metric':'AUC',
        'depth':7,
        'num_boost_round': 10000,
        'early_stopping_rounds': 10,
        'random_state':n,
        'thread_count':CORE_NUM-1,
        'use_best_model':True,
    }
    # モデルを学習する
    model = CatBoost(params)
    model.fit(
        train_pool,
        eval_set=valid_pool,
        verbose=1000,
    )
    importance = pd.DataFrame(
         model.get_feature_importance(type='PredictionValuesChange'),
         index=X_train.columns.tolist(),
         columns=['importance'])
    importance = importance.sort_values('importance',ascending=False)
    if log:
        print(f'NUM_OF_PARAMS:{len(model.get_params())}')
        print(f'PARAMS:{model.get_params()}')
        display(importance.head())
    y_pred = model.predict(X_test.values.tolist(),prediction_type='Probability')[:,1]
    if y_test is not None:
        _, test = eval(y_pred, X_test, y_test)
        return y_pred, importance, test
    else:
        return y_pred, importance
cat_pred, cat_imp, cat_error = catboost_train(1, X_train, X_valid, y_train, y_valid, X_test, y_test)

Learning rate set to 0.036037
0:	test: 0.8319194	best: 0.8319194 (0)	total: 249ms	remaining: 41m 30s
1000:	test: 0.9745314	best: 0.9745314 (1000)	total: 4m 15s	remaining: 38m 19s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.9750076222
bestIteration = 1060

Shrink model to first 1061 iterations.
LOCAL SCORE:0.7977686733292223
AUC:0.9142010650795834 | PRECISION:0.925352560704061 | RECALL:0.9775413049565947
[[ 1891  2816]
 [  802 34908]]


In [17]:
def xgbt_train(n,X_train, X_valid, y_train, y_valid, X_test, y_test=None,log=False):
    xgb_train = xgb.DMatrix(X_train,label=y_train)
    xgb_eval = xgb.DMatrix(X_valid,label=y_valid)
    xgb_test = xgb.DMatrix(X_test)
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth': 7,
        'max_leaves':26,
        'learning_rate': 0.02,
        'seed': n,
        # 'tree_method': 'gpu_hist',
    }
    model = xgb.train(
        xgb_params,
        xgb_train,
        num_boost_round=10000,
        evals=[(xgb_eval, 'eval')],
        early_stopping_rounds=10,
        verbose_eval=1000,
    )
    importance = pd.DataFrame(
        list(model.get_score(importance_type='gain').items()), 
        columns=['feature','importance']
    )
    importance = importance.sort_values('importance',ascending=False)
    if log:
        print(f'NUM_OF_PARAMS:{len(model.attributes())}')
        print(f'PARAMS:{model.attributes()}')
        display(importance.head())
    y_pred = model.predict(xgb_test,iteration_range=(0, model.best_iteration))
    if y_test is not None:
        _, test = eval(y_pred, X_test, y_test)
        return y_pred, importance, test
    else:
        return y_pred, importance
xgb_pred, xgb_imp, xgb_error = xgbt_train(1, X_train, X_valid, y_train, y_valid, X_test, y_test)

[0]	eval-auc:0.87724
[1000]	eval-auc:0.97205
[2000]	eval-auc:0.97746
[2071]	eval-auc:0.97769
LOCAL SCORE:0.7848442542246195
AUC:0.9118602679315357 | PRECISION:0.9272944614158587 | RECALL:0.9775413049565947
[[ 1970  2737]
 [  802 34908]]


In [18]:
# ミックス
# lgb + cat
mean_pred = np.mean([lgb_pred, cat_pred], axis=0)
_, _ = eval(mean_pred, X_test, y_test)
# lgb + xgb
mean_pred = np.mean([lgb_pred, xgb_pred], axis=0)
_, _ = eval(mean_pred, X_test, y_test)
# xgb + cat
mean_pred = np.mean([cat_pred, xgb_pred], axis=0)
_, _ = eval(mean_pred, X_test, y_test)
# lgb + xgb + cat
mean_pred = np.mean([lgb_pred,cat_pred,xgb_pred], axis=0)
_, _ = eval(mean_pred, X_test, y_test)
# lgb + xgb + cat + rf
mean_pred = np.mean([lgb_pred, cat_pred, xgb_pred, rf_pred], axis=0)
_, _ = eval(mean_pred, X_test, y_test)
# lgb + xgb + cat + rf + tab
mean_pred = np.mean([lgb_pred, cat_pred, xgb_pred, rf_pred, tab_pred], axis=0)
_, _ = eval(mean_pred, X_test, y_test)
# lgb + xgb + cat + rf + tab + nn
mean_pred = np.mean([lgb_pred, cat_pred, xgb_pred, rf_pred, tab_pred, nn_pred], axis=0)
_, _ = eval(mean_pred, X_test, y_test)
# xgb + cat + rf
mean_pred = np.mean([lgb_pred, cat_pred, xgb_pred, rf_pred], axis=0)
_, _ = eval(mean_pred, X_test, y_test)
# xgb + cat + rf + tab
mean_pred = np.mean([cat_pred, xgb_pred, rf_pred, tab_pred], axis=0)
_, _ = eval(mean_pred, X_test, y_test)
# xgb + cat + rf + tab + nn
mean_pred = np.mean([cat_pred, xgb_pred, rf_pred, tab_pred, nn_pred], axis=0)
_, _ = eval(mean_pred, X_test, y_test)

LOCAL SCORE:0.805540838181124
AUC:0.9176674432289427 | PRECISION:0.9278616218514188 | RECALL:0.977905348641837
[[ 1992  2715]
 [  789 34921]]
LOCAL SCORE:0.797009099509209
AUC:0.9161992687476013 | PRECISION:0.9286113253717052 | RECALL:0.977681321758611
[[ 2023  2684]
 [  797 34913]]
LOCAL SCORE:0.7949005400142309
AUC:0.9153444791110221 | PRECISION:0.9258640797285835 | RECALL:0.9781853822458695
[[ 1910  2797]
 [  779 34931]]
LOCAL SCORE:0.801151506780494
AUC:0.9173171483786041 | PRECISION:0.9269282814614344 | RECALL:0.9782973956874825
[[ 1953  2754]
 [  775 34935]]
LOCAL SCORE:0.8000017683267081
AUC:0.9132209534147709 | PRECISION:0.9246678464830027 | RECALL:0.9803136376365164
[[ 1855  2852]
 [  703 35007]]
LOCAL SCORE:0.796559869934678
AUC:0.9151171682135743 | PRECISION:0.9253904907894389 | RECALL:0.9805096611593391
[[ 1884  2823]
 [  696 35014]]
LOCAL SCORE:0.7922506659425376
AUC:0.9146048025019429 | PRECISION:0.9235306509374341 | RECALL:0.9821338560627275
[[ 1803  2904]
 [  638 35072]

# SUBMIT

In [19]:
i = 0
X = df_train[feature_col]
y = df_train['label']
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=92)
df_result = pd.DataFrame()
for train_index, valid_index in kf.split(X, y):
    
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = X.iloc[valid_index], y.iloc[valid_index]
    submit = df_test.copy()
    
    lgb_pred, _ = lgbm_train(i,X_train, X_valid, y_train, y_valid, submit[feature_col])
    submit[f'pred_l_{i}'] = lgb_pred
    
    cat_pred, _ = catboost_train(i,X_train, X_valid, y_train, y_valid, submit[feature_col])
    submit[f'pred_c_{i}'] = cat_pred
    
    xgb_pred, _ = xgbt_train(i,X_train, X_valid, y_train, y_valid, submit[feature_col])
    submit[f'pred_x_{i}'] = xgb_pred

    nn_pred = nn_train(i, X_train, X_valid, y_train, y_valid, submit[feature_col])
    submit[f'pred_n_{i}'] = nn_pred

    rf_pred = rf_train(i,X_train, X_valid, y_train, y_valid, submit[feature_col])
    submit[f'pred_r_{i}'] = rf_pred

    tab_pred = tab_train(1, X_train, X_valid, y_train, y_valid, submit[feature_col])
    submit[f'pred_t_{i}'] = tab_pred
    
    submit = submit[[
        'author_id',
        'paper_id', 
        f'pred_n_{i}', 
        f'pred_t_{i}',
        f'pred_c_{i}',
        f'pred_l_{i}',
        f'pred_x_{i}', 
        f'pred_r_{i}'
    ]]
    
    if i == 0:
        df_result = submit
    else:
        df_result = df_result.merge(submit, on=['author_id','paper_id'], how='left')
    i += 1

TRAIN SHAPE:(118647, 1144)
Training until validation scores don't improve for 10 rounds
[1000]	valid_0's auc: 0.971527
[2000]	valid_0's auc: 0.976126
Early stopping, best iteration is:
[2089]	valid_0's auc: 0.976359
Evaluated only: auc
Learning rate set to 0.037866
0:	test: 0.8244848	best: 0.8244848 (0)	total: 268ms	remaining: 44m 37s
1000:	test: 0.9703834	best: 0.9703834 (1000)	total: 4m 34s	remaining: 41m 7s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.9734196402
bestIteration = 1437

Shrink model to first 1438 iterations.
[0]	eval-auc:0.84732
[1000]	eval-auc:0.96642
[2000]	eval-auc:0.97345
[2580]	eval-auc:0.97520
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/100

In [23]:
df_result.head()

Unnamed: 0,author_id,paper_id,pred_n_0,pred_t_0,pred_c_0,pred_l_0,pred_x_0,pred_r_0,pred_n_1,pred_t_1,...,pred_c_3,pred_l_3,pred_x_3,pred_r_3,pred_n_4,pred_t_4,pred_c_4,pred_l_4,pred_x_4,pred_r_4
0,Fkb16wn7,0DchSY2n,0.999974,0.997906,0.998925,0.999752,0.999738,0.989472,0.999393,0.998577,...,0.998874,0.999777,0.999673,0.988388,0.999925,0.998548,0.998769,0.999799,0.999774,0.988611
1,Fkb16wn7,0Gw6iDes,0.999889,0.998098,0.998589,0.99863,0.998909,0.988811,0.999143,0.995871,...,0.99865,0.999652,0.999363,0.988001,0.999882,0.996312,0.998683,0.99929,0.999301,0.987937
2,Fkb16wn7,0PgoDSAP,0.999959,0.997839,0.996995,0.995894,0.99453,0.971817,0.999892,0.997278,...,0.994646,0.993036,0.996453,0.961597,0.999815,0.996407,0.993643,0.996104,0.995694,0.967773
3,Fkb16wn7,0S7g2B2l,0.936261,0.997916,0.995652,0.997748,0.997187,0.987839,0.996448,0.99369,...,0.996661,0.998762,0.998011,0.984952,0.998846,0.997943,0.990603,0.998547,0.997021,0.984736
4,Fkb16wn7,0YJjxtdf,0.998884,0.997115,0.994194,0.998507,0.996973,0.966986,0.998855,0.986124,...,0.995817,0.998226,0.997173,0.968581,0.998481,0.984952,0.994068,0.998569,0.99838,0.974821


In [22]:
use_col = [col for col in df_result.columns if '_x_' not in col]
use_col

['author_id',
 'paper_id',
 'pred_n_0',
 'pred_t_0',
 'pred_c_0',
 'pred_l_0',
 'pred_r_0',
 'pred_n_1',
 'pred_t_1',
 'pred_c_1',
 'pred_l_1',
 'pred_r_1',
 'pred_n_2',
 'pred_t_2',
 'pred_c_2',
 'pred_l_2',
 'pred_r_2',
 'pred_n_3',
 'pred_t_3',
 'pred_c_3',
 'pred_l_3',
 'pred_r_3',
 'pred_n_4',
 'pred_t_4',
 'pred_c_4',
 'pred_l_4',
 'pred_r_4']

In [24]:
p_dic = {}
for name, group in df_result[use_col].groupby('author_id'):
    p_dic[f'{name}'] = dict(zip(group['paper_id'].tolist(), group.mean(axis=1, numeric_only=True).tolist()))

In [25]:
tf = open("../submit/submit.json", "w")
json.dump(p_dic, tf)
tf.close()