In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')

In [2]:
train_data = pd.read_csv('data/dataTrain.csv')
test_data = pd.read_csv('data/dataB.csv')
submission = pd.read_csv('data/submit_example_B.csv')
data_nolabel = pd.read_csv('data/dataNoLabel.csv')
# f1 f2 f4 f5 f6位置类特征 f3 f7-f43互联网类特征 f43-46 通话类特征
display(train_data.head(2))
display(test_data.head(2))
display(train_data.shape)
test_data.shape

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f38,f39,f40,f41,f42,f43,f44,f45,f46,label
0,81167,0,1,mid,0,0,0,153,0,0,...,0,0,0,0,0,0,0,624,1539,0
1,50408,1,1,mid,0,0,21,0,0,0,...,0,0,0,0,0,0,0,186,366,0


Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46
0,1,1,0,low,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,36,60
1,2,0,1,mid,0,48,78,0,0,0,...,138,0,0,0,0,33,3,69,330,939


(59872, 48)

(49926, 47)

In [3]:
train_data['f47'] = train_data['f1'] * 10 + train_data['f2']
test_data['f47'] = test_data['f1'] * 10 + test_data['f2']
train_data['f3'] = train_data['f3'].map({'low': 0, 'mid': 1, 'high': 2})
test_data['f3'] = test_data['f3'].map({'low': 0, 'mid': 1, 'high': 2})
display(train_data.head(2))
display(test_data.head(2))

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f39,f40,f41,f42,f43,f44,f45,f46,label,f47
0,81167,0,1,1,0,0,0,153,0,0,...,0,0,0,0,0,0,624,1539,0,1
1,50408,1,1,1,0,0,21,0,0,0,...,0,0,0,0,0,0,186,366,0,11


Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,36,60,10
1,2,0,1,1,0,48,78,0,0,0,...,0,0,0,0,33,3,69,330,939,1


In [4]:
# 暴力Feature 位置
loc_f = ['f1', 'f2', 'f4', 'f5', 'f6']
for df in [train_data, test_data]:
    for i in range(len(loc_f)):
        for j in range(i + 1, len(loc_f)):
            df[f'{loc_f[i]}+{loc_f[j]}'] = df[loc_f[i]] + df[loc_f[j]]
            df[f'{loc_f[i]}-{loc_f[j]}'] = df[loc_f[i]] - df[loc_f[j]]
            df[f'{loc_f[i]}*{loc_f[j]}'] = df[loc_f[i]] * df[loc_f[j]]
            df[f'{loc_f[i]}/{loc_f[j]}'] = df[loc_f[i]] / (df[loc_f[j]]+1)

# 暴力Feature 通话
com_f = ['f43', 'f44', 'f45', 'f46']
for df in [train_data, test_data]:
    for i in range(len(com_f)):
        for j in range(i + 1, len(com_f)):
            df[f'{com_f[i]}+{com_f[j]}'] = df[com_f[i]] + df[com_f[j]]
            df[f'{com_f[i]}-{com_f[j]}'] = df[com_f[i]] - df[com_f[j]]
            df[f'{com_f[i]}*{com_f[j]}'] = df[com_f[i]] * df[com_f[j]]
            df[f'{com_f[i]}/{com_f[j]}'] = df[com_f[i]] / (df[com_f[j]]+1)
display(train_data.head(2))
display(test_data.head(2))

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f44*f45,f44/f45,f44+f46,f44-f46,f44*f46,f44/f46,f45+f46,f45-f46,f45*f46,f45/f46
0,81167,0,1,1,0,0,0,153,0,0,...,0,0.0,1539,-1539,0,0.0,2163,-915,960336,0.405195
1,50408,1,1,1,0,0,21,0,0,0,...,0,0.0,366,-366,0,0.0,552,-180,68076,0.506812


Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f44*f45,f44/f45,f44+f46,f44-f46,f44*f46,f44/f46,f45+f46,f45-f46,f45*f46,f45/f46
0,1,1,0,0,0,0,0,0,0,0,...,0,0.0,60,-60,0,0.0,96,-24,2160,0.590164
1,2,0,1,1,0,48,78,0,0,0,...,22770,0.208459,1008,-870,64791,0.073404,1269,-609,309870,0.351064


In [5]:
# 离散化            
all_f = [f'f{idx}' for idx in range(1, 47) if idx != 3]
for df in [train_data, test_data]:
    for col in all_f:
        df[f'{col}_log'] = df[col].apply(lambda x: int(np.log(x)) if x > 0 else 0)
# 特征交叉        
log_f = [f'f{idx}_log' for idx in range(1, 47) if idx != 3]
for df in [train_data, test_data]:
    for i in range(len(log_f)):
        for j in range(i + 1, len(log_f)):
            df[f'{log_f[i]}_{log_f[j]}'] = df[log_f[i]]*10000 + df[log_f[j]]
display(train_data.head(2))
display(test_data.head(2))

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f42_log_f43_log,f42_log_f44_log,f42_log_f45_log,f42_log_f46_log,f43_log_f44_log,f43_log_f45_log,f43_log_f46_log,f44_log_f45_log,f44_log_f46_log,f45_log_f46_log
0,81167,0,1,1,0,0,0,153,0,0,...,0,0,6,7,0,6,7,6,7,60007
1,50408,1,1,1,0,0,21,0,0,0,...,0,0,5,5,0,5,5,5,5,50005


Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f42_log_f43_log,f42_log_f44_log,f42_log_f45_log,f42_log_f46_log,f43_log_f44_log,f43_log_f45_log,f43_log_f46_log,f44_log_f45_log,f44_log_f46_log,f45_log_f46_log
0,1,1,0,0,0,0,0,0,0,0,...,0,0,3,4,0,3,4,3,4,30004
1,2,0,1,1,0,48,78,0,0,0,...,30001,30004,30005,30006,10004,10005,10006,40005,40006,50006


In [6]:
cat_columns = ['f3']
num_columns = [ col for col in train_data.columns if col not in ['id', 'label', 'f3']]
feature_columns = num_columns + cat_columns
target = 'label'

train = train_data[feature_columns]
label = train_data[target]
test = test_data[feature_columns]
display(train.head(2))
display(test.head(2))
display(label.head(2))


Unnamed: 0,f1,f2,f4,f5,f6,f7,f8,f9,f10,f11,...,f42_log_f44_log,f42_log_f45_log,f42_log_f46_log,f43_log_f44_log,f43_log_f45_log,f43_log_f46_log,f44_log_f45_log,f44_log_f46_log,f45_log_f46_log,f3
0,0,1,0,0,0,153,0,0,78,0,...,0,6,7,0,6,7,6,7,60007,1
1,1,1,0,0,21,0,0,0,0,0,...,0,5,5,0,5,5,5,5,50005,1


Unnamed: 0,f1,f2,f4,f5,f6,f7,f8,f9,f10,f11,...,f42_log_f44_log,f42_log_f45_log,f42_log_f46_log,f43_log_f44_log,f43_log_f45_log,f43_log_f46_log,f44_log_f45_log,f44_log_f46_log,f45_log_f46_log,f3
0,1,0,0,0,0,0,0,0,0,0,...,0,3,4,0,3,4,3,4,30004,0
1,0,1,0,48,78,0,0,0,0,0,...,30004,30005,30006,10004,10005,10006,40005,40006,50006,1


0    0
1    0
Name: label, dtype: int64

In [7]:
def model_train(model, model_name, kfold=5):
    oof_preds = np.zeros((train.shape[0]))
    test_preds = np.zeros(test.shape[0])
    skf = StratifiedKFold(n_splits=kfold)
    print(f"Model = {model_name}")
    for k, (train_index, test_index) in enumerate(skf.split(train, label)):
        x_train, x_test = train.iloc[train_index, :], train.iloc[test_index, :]
        y_train, y_test = label.iloc[train_index], label.iloc[test_index]

        model.fit(x_train,y_train)

        y_pred = model.predict_proba(x_test)[:,1]
        oof_preds[test_index] = y_pred.ravel()
        auc = roc_auc_score(y_test,y_pred)
        print("- KFold = %d, val_auc = %.4f" % (k, auc))
        test_fold_preds = model.predict_proba(test)[:, 1]
        test_preds += test_fold_preds.ravel()
    print("Overall Model = %s, AUC = %.4f" % (model_name, roc_auc_score(label, oof_preds)))
    return test_preds / kfold

In [8]:
#去除干扰数据
train = train[:50000]
label = label[:50000]

In [9]:
gbc = GradientBoostingClassifier(
    n_estimators=50, 
    learning_rate=0.1,
    max_depth=5
)
hgbc = HistGradientBoostingClassifier(
    max_iter=100,
    max_depth=5
)
xgbc = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=100, 
    max_depth=6, 
    learning_rate=0.1
)
gbm = LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    num_leaves=2 ** 6, 
    max_depth=8,
    colsample_bytree=0.8,
    subsample_freq=1,
    max_bin=255,
    learning_rate=0.05, 
    n_estimators=100, 
    metrics='auc'
)
cbc = CatBoostClassifier(
    iterations=210, 
    depth=6, 
    learning_rate=0.03, 
    l2_leaf_reg=1, 
    loss_function='Logloss', 
    verbose=0
)
LG = LogisticRegression(C = 0.1, penalty = 'l1',solver='liblinear')

In [10]:
estimators = [
    ('gbc', gbc),
    ('hgbc', hgbc),
    ('xgbc', xgbc),
    ('gbm', gbm),
    ('cbc', cbc),
    ('LG', LG)
]
clf = StackingClassifier(
    estimators=estimators, 
    final_estimator=LogisticRegression()
)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    train, label, stratify=label, random_state=2023)

In [12]:
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred)
print('auc = %.8f' % auc)

[LightGBM] [Info] Number of positive: 9352, number of negative: 28148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072957 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31812
[LightGBM] [Info] Number of data points in the train set: 37500, number of used features: 1098
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.249387 -> initscore=-1.101886
[LightGBM] [Info] Start training from score -1.101886
[LightGBM] [Info] Number of positive: 7482, number of negative: 22518
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30992
[LightGBM] [Info] Number of data points in the train set: 30000, number of used features: 1095
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.249400 -> initscore=-1.101815
[LightGBM] [Info] Start training from score -1.101815
[LightGB

In [13]:
ff = []
for col in feature_columns:
    x_test = X_test.copy()
    x_test[col] = 0
    auc1 = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])
    if auc1 < auc:
        ff.append(col)
    print('%5s | %.8f | %.8f' % (col, auc1, auc1 - auc))

   f1 | 0.90918300 | -0.00000595
   f2 | 0.90922994 | 0.00004099
   f4 | 0.90917719 | -0.00001176
   f5 | 0.90919897 | 0.00001002
   f6 | 0.90927038 | 0.00008143
   f7 | 0.90920252 | 0.00001357
   f8 | 0.90915849 | -0.00003046
   f9 | 0.90915371 | -0.00003524
  f10 | 0.90919808 | 0.00000913
  f11 | 0.90919367 | 0.00000472
  f12 | 0.90921698 | 0.00002803
  f13 | 0.90918987 | 0.00000092
  f14 | 0.90918673 | -0.00000222
  f15 | 0.90918851 | -0.00000044
  f16 | 0.90918895 | 0.00000000
  f17 | 0.90918813 | -0.00000082
  f18 | 0.90914164 | -0.00004731
  f19 | 0.90910848 | -0.00008047
  f20 | 0.90918731 | -0.00000164
  f21 | 0.90919288 | 0.00000393
  f22 | 0.90919015 | 0.00000120
  f23 | 0.90874551 | -0.00044344
  f24 | 0.90913337 | -0.00005558
  f25 | 0.90912711 | -0.00006184
  f26 | 0.90904471 | -0.00014424
  f27 | 0.90918895 | 0.00000000
  f28 | 0.90918512 | -0.00000383
  f29 | 0.90918820 | -0.00000075
  f30 | 0.90920126 | 0.00001231
  f31 | 0.90922649 | 0.00003753
  f32 | 0.90919196 | 0.0

In [14]:
clf.fit(X_train[ff], y_train)
y_pred = clf.predict_proba(X_test[ff])[:, 1]
auc = roc_auc_score(y_test, y_pred)
print('auc = %.8f' % auc)

[LightGBM] [Info] Number of positive: 9352, number of negative: 28148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037728 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20830
[LightGBM] [Info] Number of data points in the train set: 37500, number of used features: 579
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.249387 -> initscore=-1.101886
[LightGBM] [Info] Start training from score -1.101886
[LightGBM] [Info] Number of positive: 7482, number of negative: 22518
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032352 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20376
[LightGBM] [Info] Number of data points in the train set: 30000, number of used features: 579
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.249400 -> initscore=-1.101815
[LightGBM] [Info] Start training from score -1.101815
[LightGBM]

In [None]:
train = train[ff]
test = test[ff]

clf_test_preds = model_train(clf, "StackingClassifier", 10)

submission['label'] = clf_test_preds
submission.to_csv('submission.csv', index=False)

Model = StackingClassifier
[LightGBM] [Info] Number of positive: 11223, number of negative: 33777
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047442 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21254
[LightGBM] [Info] Number of data points in the train set: 45000, number of used features: 580
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.249400 -> initscore=-1.101815
[LightGBM] [Info] Start training from score -1.101815
[LightGBM] [Info] Number of positive: 8979, number of negative: 27021
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036910 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20800
[LightGBM] [Info] Number of data points in the train set: 36000, number of used features: 579
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.249417 -> initscore=-1.101726
[LightGBM] [Info] Start training fro