In [23]:
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from category_encoders import TargetEncoder 
from sklearn.model_selection import KFold
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import warnings
warnings.filterwarnings('ignore')
# 基本信息
base_info = pd.read_csv("./train/base_info.csv")

# id + 标签
entprise_info = pd.read_csv("./train/entprise_info.csv")

# id + 标签(Null) 
result = pd.read_csv("./entprise_evaluate.csv")

In [24]:
temp = pd.concat([entprise_info,result], ignore_index=True)

In [25]:
# 将base_info和label进行连接，label为空的是测试集
data = pd.merge(temp, base_info, on="id", how="left")

In [26]:
data['opfrom'] = pd.to_datetime(data['opfrom'],format='%Y-%m-%d')
data['opto'] = pd.to_datetime(data['opto'],format='%Y-%m-%d')
# 构造时间特征
data["time"] = (data["opto"] - data["opfrom"]).dt.days

In [27]:
# 使用目标编码对industryphy列进行训练 
enc = TargetEncoder()  
data["industryphy"] = enc.fit_transform(data["industryphy"], data["label"]) 

In [28]:
# 使用目标编码对opform列进行训练 
enc = TargetEncoder()  
data["opform"] = enc.fit_transform(data["opform"], data["label"]) 

In [29]:
# 使用目标编码对opform列进行训练
enc = TargetEncoder()  
data["oploc"] = enc.fit_transform(data["oploc"], data["label"]) 

In [30]:
train = data[data.label.notnull()].reset_index(drop=True)

In [31]:
test = data[data.label.isnull()].reset_index(drop=True)

In [32]:
label = train.label

In [11]:
train = train.drop(["id", "dom", "opscope", "opfrom", "opto", "label", "score"], axis=1)
test = test.drop(["id", "dom", "opscope", "opfrom", "opto", "label", "score"], axis=1)

In [12]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 1108
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test_pred = np.zeros(test_x.shape[0])
    test = np.zeros(test_x.shape[0])
    importance = np.zeros(train_x.columns.shape[0])
    
    cv_scores = []
    feature_names = train_x.columns.tolist()
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'tree_method':'gpu_hist',
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.05,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'tree_method':'gpu_hist',
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
            
        # importance += model.feature_importance() / 5
        
        train[valid_index] = val_pred
        test += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
    
    # df = pd.DataFrame({ 'column': feature_names, 'importance': importance}).sort_values(by='importance')           
    # df.to_csv("./importance.csv")
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return test

def lgb_model(x_train, y_train, x_test):
    lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_test

def cat_model(x_train, y_train, x_test):
    cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_test

In [13]:
xgb_test = xgb_model(train, label, test)

************************************ 1 ************************************
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.97961	eval-auc:0.97960
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.99553	eval-auc:0.99064
[400]	train-auc:0.99680	eval-auc:0.99042
Stopping. Best iteration:
[208]	train-auc:0.99559	eval-auc:0.99069

[0.9906932702188356]
************************************ 2 ************************************
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through t

In [14]:
lgb_test = lgb_model(train, label, test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.99681	valid_1's auc: 0.988942
Early stopping, best iteration is:
[100]	training's auc: 0.995189	valid_1's auc: 0.989863
[0.9898632230684478]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.996522	valid_1's auc: 0.990543
Early stopping, best iteration is:
[180]	training's auc: 0.996333	valid_1's auc: 0.99064
[0.9898632230684478, 0.9906401343396559]
************************************ 3 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.99603	valid_1's auc: 0.993519
[400]	training's auc: 0.997676	valid_1's auc: 0.993462
Early stopping, best iteration is:
[337]	training's auc: 0.997324	valid_1's auc: 0.993543
[0.9898632230684478, 0.99064013433965

In [15]:
# cat_test = cat_model(train, label, test)

In [16]:
# (cat_test < 0).sum()

In [17]:
print(xgb_test.sum())
print(lgb_test.sum())
# print(cat_test.sum())

925.4969567938506
928.8016930435757


In [18]:
rh_test = lgb_test * 0.4 + xgb_test * 0.6
rh_test

array([0.01458281, 0.6710442 , 0.00120874, ..., 0.00175307, 0.00118242,
       0.81647762])

In [36]:
result['score'] = rh_test

In [37]:
result

Unnamed: 0,id,score
0,82750f1b9d1223508ee329d47e27d35176c93eb9f35e9c1a,0.014583
1,f000950527a6feb670cc1c87c2025f3922aaa4a0206a0a33,0.671044
2,e9f7b28ec10e04700ef4db75a494f9a1e8e8b09555e6afa1,0.001209
3,beb4aaaa89e0a0ae9d77bd5d7665be6342f552f51840cf19,0.000353
4,e9f7b28ec10e0470ee4172cec0133b6826c34f27d3dff204,0.001669
...,...,...
9995,f000950527a6feb6b9e9c5a82689e87ee128abcf72ca7b96,0.155470
9996,d8071a739aa75a3bb98b032a18ae492bb8cf7ad9e0c23acd,0.036022
9997,f000950527a6feb63ae3783e4b82cbd8da7b3eaf43624866,0.001753
9998,d8071a739aa75a3bf8557cd0432d5c04e2241aee9f422220,0.001182


In [38]:
result[['id','score']].to_csv('result.csv', index=False)