In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


In [2]:
train_data = pd.read_csv('dataTrain.csv')
test_data = pd.read_csv('dataA.csv')
submission = pd.read_csv('submit_example_A.csv')
data_nolabel = pd.read_csv('dataNoLabel.csv')
print(f'train_data.shape = {train_data.shape}')
print(f'test_data.shape  = {test_data.shape}')
print(f'data_nolabel.shape  = {data_nolabel.shape}')

train_data.shape = (59872, 48)
test_data.shape  = (49858, 47)
data_nolabel.shape  = (39884, 47)


In [3]:
def func3(x, y):
    if x<y:
        return int(x/3)
    return int(y/3)

In [4]:
for col in ['f%d' %idx for idx in range(4, 9)]:
    train_data[col] = train_data[col].apply(lambda x: func3(x, 102))
    test_data[col] = test_data[col].apply(lambda x: func3(x, 102))
for col in ['f%d' %idx for idx in range(9, 14)]:
    train_data[col] = train_data[col].apply(lambda x: func3(x, 30))
    test_data[col] = test_data[col].apply(lambda x: func3(x, 30))
for col in ['f%d' %idx for idx in range(14, 19)]:
    train_data[col] = train_data[col].apply(lambda x: func3(x, 3))
    test_data[col] = test_data[col].apply(lambda x: func3(x, 3))

train_data['f19'] = train_data['f19'].apply(lambda x: func3(x, 30))
test_data['f19'] = test_data['f19'].apply(lambda x: func3(x, 30))

for col in ['f%d' %idx for idx in range(20, 42)]:
    train_data[col] = train_data[col].apply(lambda x: func3(x, 300))
    test_data[col] = test_data[col].apply(lambda x: func3(x, 300))

In [5]:
train_data['label'].value_counts()

0    44950
1    14922
Name: label, dtype: int64

In [6]:
train_data.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f38,f39,f40,f41,f42,f43,f44,f45,f46,label
0,81167,0,1,mid,0,0,0,34,0,0,...,0,0,0,0,0,0,0,624,1539,0
1,50408,1,1,mid,0,0,7,0,0,0,...,0,0,0,0,0,0,0,186,366,0
2,9114,0,0,high,12,12,34,0,0,0,...,0,0,0,0,0,0,0,24,48,1
3,53228,1,1,low,0,0,0,0,0,0,...,0,0,0,0,0,0,3,3,9,0
4,56280,1,1,mid,3,17,34,0,0,0,...,0,0,0,0,0,0,0,42,141,0


In [7]:
train_data['f47'] = train_data['f1'] * 10 + train_data['f2']
test_data['f47'] = test_data['f1'] * 10 + test_data['f2']

train_data['f48'] = train_data.apply(lambda x: "%02d%s" %(x['f47'], x['f3']), axis=1)
test_data['f48'] = test_data.apply(lambda x: "%02d%s" %(x['f47'], x['f3']), axis=1)
# 暴力Feature 位置
loc_f = ['f1', 'f2', 'f4', 'f5', 'f6']
for df in [train_data, test_data]:
    for i in range(len(loc_f)):
        for j in range(i + 1, len(loc_f)):
            df[f'{loc_f[i]}+{loc_f[j]}'] = df[loc_f[i]] + df[loc_f[j]]
            df[f'{loc_f[i]}-{loc_f[j]}'] = df[loc_f[i]] - df[loc_f[j]]
            df[f'{loc_f[i]}*{loc_f[j]}'] = df[loc_f[i]] * df[loc_f[j]]
            df[f'{loc_f[i]}/{loc_f[j]}'] = df[loc_f[i]] / (df[loc_f[j]]+0.000001)

# 暴力Feature 通话
com_f = ['f43', 'f44', 'f45', 'f46']
for df in [train_data, test_data]:
    for i in range(len(com_f)):
        for j in range(i + 1, len(com_f)):
            df[f'{com_f[i]}+{com_f[j]}'] = df[com_f[i]] + df[com_f[j]]
            df[f'{com_f[i]}-{com_f[j]}'] = df[com_f[i]] - df[com_f[j]]
            df[f'{com_f[i]}*{com_f[j]}'] = df[com_f[i]] * df[com_f[j]]
            df[f'{com_f[i]}/{com_f[j]}'] = df[com_f[i]] / (df[com_f[j]]+0.000001)

In [8]:
cat_columns = ['f3', 'f48']
data = pd.concat([train_data, test_data])

for col in cat_columns:
    lb = LabelEncoder()
    lb.fit(data[col])
    train_data[col] = lb.transform(train_data[col])
    test_data[col] = lb.transform(test_data[col])

In [9]:
for col in [f'f{idx}' for idx in range(1, 49)]:
    tmp = train_data.groupby([col]).label.agg(['mean'])
    tmp.columns = [f"{col}_{cols}" for cols in tmp.columns]
    tmp.reset_index(inplace=True)
    train_data = train_data.merge(tmp, on=[col], how="left")
    test_data = test_data.merge(tmp, on=[col], how="left")

In [10]:
train_data.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f39_mean,f40_mean,f41_mean,f42_mean,f43_mean,f44_mean,f45_mean,f46_mean,f47_mean,f48_mean
0,81167,0,1,2,0,0,0,34,0,0,...,0.249309,0.249257,0.249215,0.249194,0.234923,0.235792,0.153846,0.25,0.253594,0.231371
1,50408,1,1,2,0,0,7,0,0,0,...,0.249309,0.249257,0.249215,0.249194,0.234923,0.235792,0.303279,0.273381,0.06026,0.053836
2,9114,0,0,0,12,12,34,0,0,0,...,0.249309,0.249257,0.249215,0.249194,0.234923,0.235792,0.242804,0.250871,0.43396,0.582656
3,53228,1,1,1,0,0,0,0,0,0,...,0.249309,0.249257,0.249215,0.249194,0.234923,0.237388,0.23087,0.250597,0.06026,0.054207
4,56280,1,1,2,3,17,34,0,0,0,...,0.249309,0.249257,0.249215,0.249194,0.234923,0.235792,0.262376,0.284314,0.06026,0.053836


In [11]:
num_columns = [ col for col in train_data.columns if col not in ['id', 'label', 'f3', 'f48']]
feature_columns = num_columns + cat_columns
target = 'label'

train = train_data[feature_columns]
label = train_data[target]
test = test_data[feature_columns]
train = train[:50000]
label = label[:50000]

In [12]:
print(train.shape, label.shape)

(50000, 160) (50000,)


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    train, label, stratify=label, random_state=2022)

In [14]:
gbm = LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    num_leaves=2 ** 6, 
    max_depth=8,
    colsample_bytree=0.8,
    subsample_freq=1,
    max_bin=255,
    learning_rate=0.05, 
    n_estimators=100, 
    metrics='auc'
)

In [15]:
gbm.fit(X_train, y_train)
y_pred = gbm.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred)
print('auc = %.8f' % auc)

auc = 0.92066540


In [16]:
features = []
feature_importances = []
for col in feature_columns:
    x_test = X_test.copy()
    x_test[col] = 0
    auc1 = roc_auc_score(y_test, gbm.predict_proba(x_test)[:, 1])
    if auc1 < auc:
        features.append(col)
    feature_importances.append([col, auc1, auc1 - auc])

In [17]:
feature_importances.sort(key=lambda x: x[2])
for fi in feature_importances:
    print("| %10s | %.8f | %.8f |" % (fi[0], fi[1], fi[2]))

|   f46_mean | 0.89541724 | -0.02524817 |
|   f45_mean | 0.91767927 | -0.00298613 |
|      f1-f4 | 0.91788528 | -0.00278012 |
|   f44_mean | 0.91847787 | -0.00218753 |
|   f48_mean | 0.91925141 | -0.00141399 |
|      f2-f4 | 0.91928069 | -0.00138471 |
|   f20_mean | 0.91932153 | -0.00134388 |
|   f42_mean | 0.91962026 | -0.00104514 |
|      f4/f5 | 0.91969306 | -0.00097234 |
|   f21_mean | 0.91973302 | -0.00093238 |
|      f1/f6 | 0.91975189 | -0.00091351 |
|    f6_mean | 0.91979553 | -0.00086987 |
|      f2/f4 | 0.91996445 | -0.00070095 |
|   f39_mean | 0.91996972 | -0.00069569 |
|    f44/f46 | 0.92015059 | -0.00051482 |
|      f2/f5 | 0.92017986 | -0.00048554 |
|      f2/f6 | 0.92021005 | -0.00045535 |
|      f4-f5 | 0.92022347 | -0.00044194 |
|      f5-f6 | 0.92031431 | -0.00035109 |
|   f36_mean | 0.92032785 | -0.00033755 |
|   f37_mean | 0.92033395 | -0.00033145 |
|   f32_mean | 0.92034204 | -0.00032337 |
|    f8_mean | 0.92040687 | -0.00025854 |
|      f5+f6 | 0.92040832 | -0.000

In [18]:
gbm.fit(X_train[features], y_train)
y_pred = gbm.predict_proba(X_test[features])[:, 1]
auc = roc_auc_score(y_test, y_pred)
print('auc = %.8f' % auc)

auc = 0.92060014


In [19]:
train = train[features]
test = test[features]

In [20]:
def model_train(model, model_name, kfold=5):
    oof_preds = np.zeros((train.shape[0]))
    test_preds = np.zeros(test.shape[0])
    skf = StratifiedKFold(n_splits=kfold, shuffle=True)
    print(f"Model = {model_name}")
    for k, (train_index, test_index) in enumerate(skf.split(train, label)):
        x_train, x_test = train.iloc[train_index, :], train.iloc[test_index, :]
        y_train, y_test = label.iloc[train_index], label.iloc[test_index]

        model.fit(x_train,y_train)

        y_pred = model.predict_proba(x_test)[:,1]
        oof_preds[test_index] = y_pred.ravel()
        auc = roc_auc_score(y_test,y_pred)
        print("- KFold = %d, val_auc = %.4f" % (k, auc))
        test_fold_preds = model.predict_proba(test)[:, 1]
        test_preds += test_fold_preds.ravel()
        
#         val_fold_preds = model.predict_proba(val)[:, 1]
#         val_preds += val_fold_preds.ravel()
    print("Overall Model = %s, AUC = %.4f" % (model_name, roc_auc_score(label, oof_preds)))
    return test_preds / kfold #, val_preds / kfold

In [21]:
test_preds = model_train(gbm, 'LGBMClassifier', 10)

Model = LGBMClassifier
- KFold = 0, val_auc = 0.9086
- KFold = 1, val_auc = 0.9178
- KFold = 2, val_auc = 0.9084
- KFold = 3, val_auc = 0.9226
- KFold = 4, val_auc = 0.9228
- KFold = 5, val_auc = 0.9197
- KFold = 6, val_auc = 0.9155
- KFold = 7, val_auc = 0.9235
- KFold = 8, val_auc = 0.9178
- KFold = 9, val_auc = 0.9191
Overall Model = LGBMClassifier, AUC = 0.9175


In [22]:
submission['label'] = test_preds
submission.to_csv('submission.csv', index=False)

In [18]:
test[target] = test_preds
val[target] = val_preds

In [26]:
testp = test[ (test[target]<=0.01) | (test[target]>=0.99) ].copy()
testp.loc[ testp[target]>=0.5, target ] = 1
testp.loc[ testp[target]<0.5, target ] = 0

valp = val[ (val[target]<=0.01) | (val[target]>=0.99) ].copy()
valp.loc[ valp[target]>=0.5, target ] = 1
valp.loc[ valp[target]<0.5, target ] = 0

trainp = pd.concat([train, testp[ff], valp[ff]])
labelp = pd.concat([label, testp[target], valp[target]])
test = test[ff]

In [27]:
print(trainp.shape, labelp.shape, test.shape)

(69513, 71) (69513,) (49858, 71)


In [8]:
preds = model_train(gbm, 'gbm', 10)

Model = gbm
- KFold = 0, val_auc = 0.9066
- KFold = 1, val_auc = 0.9083
- KFold = 2, val_auc = 0.9156
- KFold = 3, val_auc = 0.9043
- KFold = 4, val_auc = 0.9065
- KFold = 5, val_auc = 0.9070
- KFold = 6, val_auc = 0.8993
- KFold = 7, val_auc = 0.9160
- KFold = 8, val_auc = 0.9155
- KFold = 9, val_auc = 0.9144
Overall Model = gbm, AUC = 0.9092


In [18]:
train.shape, test.shape, label.shape

((50000, 111), (49858, 111), (50000,))

In [21]:
import gc
import joblib
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Flatten, Input, Embedding, BatchNormalization
import warnings
warnings.filterwarnings("ignore")

In [22]:
def create_model(input_shape): 
    inputs = layers.Input(shape=(input_shape,))
    
    x = layers.BatchNormalization()(inputs)
    
    x = layers.Dense(500, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    
    x = layers.Dense(300, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    
    y = layers.Dense(2, activation="softmax")(x)

    model = Model(inputs=inputs, outputs=y)
    return model

In [23]:
oof_preds = np.zeros((trainp.shape[0]))
test_preds = np.zeros(test.shape[0])

input_shape = trainp.shape[1]

In [31]:
%%time
skf = StratifiedKFold(n_splits=10, shuffle=True)
k = 0
for k, (train_index, test_index) in enumerate(skf.split(trainp, labelp)):
    K.clear_session()
    x_train, x_test = trainp.iloc[train_index, :], trainp.iloc[test_index, :]
    y_train, y_test = labelp.iloc[train_index], labelp.iloc[test_index]
    
    model = create_model(input_shape)
    adam = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=[tf.keras.metrics.AUC()])
    lr = callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.1,
                                     patience=3, min_lr=1e-6, mode='max', verbose=0)
    es = callbacks.EarlyStopping(monitor='val_auc', min_delta=0.001, patience=5,
                                 verbose=0, mode='max', baseline=None, restore_best_weights=True)
    model.fit(x_train,
              utils.to_categorical(y_train),
              validation_data=(x_test, utils.to_categorical(y_test)),
              verbose=0,
              batch_size=2048,
              callbacks=[lr, es],
              epochs=100
             )
    valid_fold_preds = model.predict(x_test)[:, 1]
    test_fold_preds = model.predict(test)[:, 1]
    oof_preds[test_index] = valid_fold_preds.ravel()
    test_preds += test_fold_preds.ravel()
    print("- KFold = %d, val_auc = %.4f" % (k, metrics.roc_auc_score(y_test, valid_fold_preds)))
    k += 1
    K.clear_session()

- KFold = 0, val_auc = 0.9390
- KFold = 1, val_auc = 0.9400
- KFold = 2, val_auc = 0.9421
- KFold = 3, val_auc = 0.9339
- KFold = 4, val_auc = 0.9340
- KFold = 5, val_auc = 0.9286
- KFold = 6, val_auc = 0.9346
- KFold = 7, val_auc = 0.9372
- KFold = 8, val_auc = 0.9379
- KFold = 9, val_auc = 0.9383
CPU times: total: 23min 8s
Wall time: 4min 46s


In [32]:
print("Overall AUC={}".format(metrics.roc_auc_score(labelp.values, oof_preds)))

Overall AUC=0.9362616597524223


In [35]:
submission['label'] = test_preds / 10
submission.to_csv('submission.csv', index=False)

In [36]:
submission.head()

Unnamed: 0,id,label
0,1,0.021979
1,2,0.039847
2,3,0.065599
3,4,0.051095
4,5,0.025593
