In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


In [2]:
# Read the four competition CSVs in one pass; the files are opened in the
# order listed, matching the names on the left-hand side.
train_data, test_data, submission, data_nolabel = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/dataTrain.csv',
        'data/dataA.csv',
        'data/submit_example_A.csv',
        'data/dataNoLabel.csv',
    )
)

In [3]:
# Report (rows, columns) for both frames, one per line.
print(
    f'train_data.shape = {train_data.shape}',
    f'test_data.shape  = {test_data.shape}',
    sep='\n',
)

train_data.shape = (59872, 48)
test_data.shape  = (49858, 47)


In [4]:
# Column groups used for brute-force features. The original comments label
# the first group "location" and the second "calls".
loc_f = ['f1', 'f2', 'f4', 'f5', 'f6']
com_f = ['f43', 'f44', 'f45', 'f46']
# Every raw feature except f3 is discretised; f3 is skipped here and handled
# separately as a categorical column.
all_f = [f'f{idx}' for idx in range(1, 47) if idx != 3]
log_f = [f'{col}_log' for col in all_f]


def _column_pairs(cols):
    """Return every pair (a, b) of `cols` with a before b, in nested-loop order."""
    return [(a, b) for pos, a in enumerate(cols) for b in cols[pos + 1:]]


def _truncated_log(values):
    """Vectorized equivalent of `int(np.log(x)) if x > 0 else 0`.

    Args:
        values: numeric Series.

    Returns:
        int64 Series holding the natural log truncated toward zero. Entries
        that are not strictly positive (including NaN) become 0, because they
        are replaced by 1 before the log is taken.
    """
    safe = values.where(values > 0, 1)
    return np.trunc(np.log(safe)).astype('int64')


def add_engineered_features(df):
    """Return a copy of `df` with all derived feature columns appended.

    Derived columns, in order:
      1. `f47` = f1 * 10 + f2.
      2. For each pair (a, b) within `loc_f`, then within `com_f`:
         `a+b`, `a-b`, `a*b` and `a/b` (the divisor is b + 1).
      3. `<col>_log` for every column in `all_f` (discretisation).
      4. `<a>_<b>` for every pair of log columns: a * 10000 + b
         (feature crossing).

    All new columns are collected first and attached with a single concat.
    Inserting ~1,100 columns one at a time fragments the frame, and the
    resulting pandas PerformanceWarning is hidden by the notebook's global
    warnings filter.

    Args:
        df: frame containing the raw columns f1..f46.

    Returns:
        New DataFrame; `df` itself is not modified.
    """
    derived = {'f47': df['f1'] * 10 + df['f2']}

    # Brute-force pairwise arithmetic inside each column group.
    for group in (loc_f, com_f):
        for a, b in _column_pairs(group):
            derived[f'{a}+{b}'] = df[a] + df[b]
            derived[f'{a}-{b}'] = df[a] - df[b]
            derived[f'{a}*{b}'] = df[a] * df[b]
            derived[f'{a}/{b}'] = df[a] / (df[b] + 1)

    # Discretisation: truncated natural log of each raw feature.
    for col in all_f:
        derived[f'{col}_log'] = _truncated_log(df[col])

    # Feature crossing: combine two discretised values into one integer code.
    for a, b in _column_pairs(log_f):
        derived[f'{a}_{b}'] = derived[a] * 10000 + derived[b]

    # Drop stale copies so re-running the cell overwrites instead of duplicating.
    base = df.drop(columns=[name for name in derived if name in df.columns])
    return pd.concat([base, pd.DataFrame(derived)], axis=1)


train_data = add_engineered_features(train_data)
test_data = add_engineered_features(test_data)

In [5]:
cat_columns = ['f3']
# Stack only the categorical columns of train and test. The encoder needs
# nothing else, and concatenating the full engineered frames would copy every
# column of both.
data = pd.concat([train_data[cat_columns], test_data[cat_columns]])

for col in cat_columns:
    # Fit on the union of train and test values so that transform() never
    # meets a category it has not seen.
    lb = LabelEncoder()
    lb.fit(data[col])
    train_data[col] = lb.transform(train_data[col])
    test_data[col] = lb.transform(test_data[col])

In [6]:
target = 'label'
# Everything except the row identifier and the target is a model input.
feature_columns = [col for col in train_data.columns if col not in ('id', 'label')]

# Only the first 50000 training rows are used for fitting.
# NOTE(review): presumably the remaining rows are left out on purpose — confirm.
train = train_data.loc[:, feature_columns].iloc[:50000]
label = train_data[target].iloc[:50000]
test = test_data.loc[:, feature_columns]

In [7]:
def model_train(model, model_name, kfold=10):
    """Cross-validate `model` and return fold-averaged predictions for `test`.

    Reads the module-level `train`, `label` and `test` objects. In every fold
    the model is refit on the fitting part, scored with ROC AUC on the
    held-out part (the score is printed) and then applied to `test`.

    Args:
        model: estimator exposing `fit` and `predict_proba`.
        model_name: label used in the printed progress lines.
        kfold: number of stratified folds.

    Returns:
        Array with one entry per row of `test`: the second `predict_proba`
        column summed over all folds and divided by `kfold`.
    """
    n_train, n_test = train.shape[0], test.shape[0]
    oof_preds = np.zeros(n_train)
    test_total = np.zeros(n_test)
    splitter = StratifiedKFold(n_splits=kfold)

    for fold_no, (fit_idx, val_idx) in enumerate(splitter.split(train, label)):
        model.fit(train.iloc[fit_idx, :], label.iloc[fit_idx])

        # Out-of-fold scores for the rows this fold did not train on.
        val_scores = model.predict_proba(train.iloc[val_idx, :])[:, 1]
        oof_preds[val_idx] = val_scores.ravel()
        fold_auc = roc_auc_score(label.iloc[val_idx], val_scores)
        print("Model = %s, KFold = %d, val_auc = %.4f" % (model_name, fold_no, fold_auc))

        # Accumulate this fold's test predictions; averaged on return.
        test_total += model.predict_proba(test)[:, 1].ravel()

    overall_auc = roc_auc_score(label, oof_preds)
    print("Overall Model = %s, AUC = %.4f" % (model_name, overall_auc))
    return test_total / kfold

In [8]:
# LightGBM classifier configuration, grouped by purpose.
gbm = LGBMClassifier(
    # Task and evaluation metric
    boosting_type='gbdt',
    objective='binary',
    metrics='auc',
    # Boosting schedule
    n_estimators=200,
    learning_rate=0.05,
    # Tree size
    max_depth=8,
    num_leaves=64,
    max_bin=255,
    # Sampling
    colsample_bytree=0.8,
    subsample_freq=1,
)

In [9]:
# 10-fold cross-validated training; `preds` holds the fold-averaged test predictions.
preds = model_train(model=gbm, model_name="LGBMClassifier", kfold=10)

Model = LGBMClassifier, KFold = 0, val_auc = 0.9060
Model = LGBMClassifier, KFold = 1, val_auc = 0.9079
Model = LGBMClassifier, KFold = 2, val_auc = 0.9146
Model = LGBMClassifier, KFold = 3, val_auc = 0.9054
Model = LGBMClassifier, KFold = 4, val_auc = 0.9049
Model = LGBMClassifier, KFold = 5, val_auc = 0.9056
Model = LGBMClassifier, KFold = 6, val_auc = 0.8995
Model = LGBMClassifier, KFold = 7, val_auc = 0.9154
Model = LGBMClassifier, KFold = 8, val_auc = 0.9146
Model = LGBMClassifier, KFold = 9, val_auc = 0.9123
Overall Model = LGBMClassifier, AUC = 0.9085


In [10]:
# Put the averaged predictions in the `label` column and write the file
# without the index column.
submission = submission.assign(label=preds)
submission.to_csv('submission.csv', index=False)