In [1]:
# 导包
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from category_encoders import TargetEncoder 
from sklearn.model_selection import KFold
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 年报基本信息
annual_report_info = pd.read_csv("./train/annual_report_info.csv")

# 年报基本信息进行去重操作
annual_report_info = annual_report_info.drop_duplicates()

# 年报信息提取出特征
annual_report_feature = annual_report_info.id.drop_duplicates().reset_index(drop=True)

# 根据id进行分组计算FUNDAM资金数额的均值
temp = annual_report_info[["id", "FUNDAM"]].groupby("id").mean()

temp = temp.rename(columns={"FUNDAM":"id_FUNDAM_mean"})

annual_report_feature = pd.merge(annual_report_feature,temp,how="left",on="id")

# 删除 成员人数 字段
annual_report_info["MEMNUM"].isnull().sum()

# 删除 农民人数 字段
annual_report_info["FARNUM"].isnull().sum()

# 删除 本年度新增成员人数 字段
annual_report_info["ANNNEWMEMNUM"].isnull().sum()

# 删除 本年度退出成员人数 字段
annual_report_info["ANNREDMEMNUM"].isnull().sum()

# 根据id进行分组计算EMPNUM从业人数的均值
temp = annual_report_info[["id", "EMPNUM"]].groupby("id").mean()

temp = temp.rename(columns={"EMPNUM":"id_EMPNUM_mean"})

annual_report_feature = pd.merge(annual_report_feature,temp,how="left",on="id")

# 取值为1和2的进行分箱
def binning_2(x):
    if x>=1.5 and x<=2:
        return 2.0
    elif x<1.5 and x>=1:
        return 1.0
    
# 根据id进行分组计算 EMPNUMSIGN:从业人数是否公示 的平均取值，再进行数据分箱
temp = annual_report_info[["id", "EMPNUMSIGN"]].groupby("id").mean()

temp = temp.rename(columns={"EMPNUMSIGN":"id_EMPNUMSIGN_mean"})

annual_report_feature = pd.merge(annual_report_feature,temp,how="left",on="id")

annual_report_feature["id_EMPNUMSIGN_mean"] = annual_report_feature["id_EMPNUMSIGN_mean"].apply(binning_2)

# 先对经营状态(BUSSTNAME)进行编码，然后根据id分组计算经营状态的均值，然后再进行分箱
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(list(annual_report_info["BUSSTNAME"].astype(str).values))
annual_report_info["BUSSTNAME"] = le.transform(annual_report_info["BUSSTNAME"].astype(str).values)

# 分箱函数
def Binning_4(x):
    if x < 0.5:
        return 
    elif x < 1.5:
        return 1.0
    elif x<2.5:
        return 2.0
    elif x<3.5:
        return 3.0
    else :
        return 4.0

temp = annual_report_info[["id", "BUSSTNAME"]].groupby("id").mean()

temp = temp.rename(columns={"BUSSTNAME":"id_BUSSTNAME_mean"})

annual_report_feature = pd.merge(annual_report_feature,temp,how="left",on="id")

annual_report_feature["id_BUSSTNAME_mean"] = annual_report_feature["id_BUSSTNAME_mean"].apply(Binning_4)

# 根据id进行分组计算 COLGRANUM:其中高校毕业生人数经营者 的均值
temp = annual_report_info[["id", "COLGRANUM"]].groupby("id").mean()

temp = temp.rename(columns={"COLGRANUM":"id_COLGRANUM_mean"})

annual_report_feature = pd.merge(annual_report_feature,temp,how="left",on="id")

# 根据id进行分组计算 RETSOLNUM:其中退役士兵人数经营者 的均值
temp = annual_report_info[["id", "RETSOLNUM"]].groupby("id").mean()

temp = temp.rename(columns={"RETSOLNUM":"id_RETSOLNUM_mean"})

annual_report_feature = pd.merge(annual_report_feature,temp,how="left",on="id")

# 根据id进行分组计算 DISPERNUM:其中残疾人人数经营者 的均值
temp = annual_report_info[["id", "DISPERNUM"]].groupby("id").mean()

temp = temp.rename(columns={"DISPERNUM":"id_DISPERNUM_mean"})

annual_report_feature = pd.merge(annual_report_feature,temp,how="left",on="id")

# 根据id进行分组计算 UNENUM:其中下岗失业人数经营者 的均值
temp = annual_report_info[["id", "UNENUM"]].groupby("id").mean()

temp = temp.rename(columns={"UNENUM":"id_UNENUM_mean"})

annual_report_feature = pd.merge(annual_report_feature,temp,how="left",on="id")

# 根据id进行分组计算 COLEMPLNUM:其中高校毕业生人数雇员 的均值
temp = annual_report_info[["id", "COLEMPLNUM"]].groupby("id").mean()

temp = temp.rename(columns={"COLEMPLNUM":"id_COLEMPLNUM_mean"})

annual_report_feature = pd.merge(annual_report_feature,temp,how="left",on="id")

# 根据id进行分组计算 RETEMPLNUM:其中退役士兵人数雇员 的均值
temp = annual_report_info[["id", "RETEMPLNUM"]].groupby("id").mean()

temp = temp.rename(columns={"RETEMPLNUM":"id_RETEMPLNUM_mean"})

annual_report_feature = pd.merge(annual_report_feature,temp,how="left",on="id")

# 根据id进行分组计算 DISEMPLNUM:其中残疾人人数雇员 的均值
temp = annual_report_info[["id", "DISEMPLNUM"]].groupby("id").mean()

temp = temp.rename(columns={"DISEMPLNUM":"id_DISEMPLNUM_mean"})

annual_report_feature = pd.merge(annual_report_feature,temp,how="left",on="id")

# 根据id进行分组计算 UNEEMPLNUM:其中下岗失业人数雇员 的均值
temp = annual_report_info[["id", "UNEEMPLNUM"]].groupby("id").mean()

temp = temp.rename(columns={"UNEEMPLNUM":"id_UNEEMPLNUM_mean"})

annual_report_feature = pd.merge(annual_report_feature,temp,how="left",on="id")

# 根据id进行分组计算 WEBSITSIGN:是否有网站标志 的均值  再分箱
temp = annual_report_info[["id", "WEBSITSIGN"]].groupby("id").mean()

temp = temp.rename(columns={"WEBSITSIGN":"id_WEBSITSIGN_mean"})

annual_report_feature = pd.merge(annual_report_feature,temp,how="left",on="id")

annual_report_feature["id_WEBSITSIGN_mean"] = annual_report_feature["id_WEBSITSIGN_mean"].apply(binning_2)

# 根据id进行分组计算 FORINVESTSIGN:是否有对外投资企业标志 的均值    再分箱
temp = annual_report_info[["id", "FORINVESTSIGN"]].groupby("id").mean()

temp = temp.rename(columns={"FORINVESTSIGN":"id_FORINVESTSIGN_mean"})

annual_report_feature = pd.merge(annual_report_feature,temp,how="left",on="id")

annual_report_feature["id_FORINVESTSIGN_mean"] = annual_report_feature["id_FORINVESTSIGN_mean"].apply(binning_2)

# 根据id进行分组计算 STOCKTRANSIGN:有限责任公司本年度是否发生股东股权转让标志 的均值  再分箱
temp = annual_report_info[["id", "STOCKTRANSIGN"]].groupby("id").mean()

temp = temp.rename(columns={"STOCKTRANSIGN":"id_STOCKTRANSIGN_mean"})

annual_report_feature = pd.merge(annual_report_feature,temp,how="left",on="id")

annual_report_feature["id_STOCKTRANSIGN_mean"] = annual_report_feature["id_STOCKTRANSIGN_mean"].apply(binning_2)

def binning_3(x):
    if x>=1 and x<1.5:
        return 1.0
    elif x>=1.5 and x<2.5:
        return 2.0
    elif x>=2.5 and x<=3:
        return 3.0
    
# 根据id进行分组计算 PUBSTATE:公示状态 的均值  再分箱
temp = annual_report_info[["id", "PUBSTATE"]].groupby("id").mean()

temp = temp.rename(columns={"PUBSTATE":"id_PUBSTATE_mean"})

annual_report_feature = pd.merge(annual_report_feature,temp,how="left",on="id")

annual_report_feature["id_STOCKTRANSIGN_mean"] = annual_report_feature["id_STOCKTRANSIGN_mean"].apply(binning_3)

print("年报特征提取完毕！")

年报特征提取完毕！


In [3]:
annual_report_feature

Unnamed: 0,id,id_FUNDAM_mean,id_EMPNUM_mean,id_EMPNUMSIGN_mean,id_BUSSTNAME_mean,id_COLGRANUM_mean,id_RETSOLNUM_mean,id_DISPERNUM_mean,id_UNENUM_mean,id_COLEMPLNUM_mean,id_RETEMPLNUM_mean,id_DISEMPLNUM_mean,id_UNEEMPLNUM_mean,id_WEBSITSIGN_mean,id_FORINVESTSIGN_mean,id_STOCKTRANSIGN_mean,id_PUBSTATE_mean
0,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,3.50,6.000000,,,0.00,0.0,0.0,0.00,0.00,0.00,0.0,0.00,2.0,,,3.00
1,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,,3.500000,2.0,2.0,2.00,0.0,0.0,0.00,1.50,0.00,0.0,0.00,2.0,2.0,2.0,3.00
2,9c7fa510616a68309e4badf2a7a3123c0462fb85bf28ef17,13.50,16.000000,,,0.00,0.0,0.0,0.00,0.00,0.00,0.0,0.00,2.0,,,3.00
3,755db3b5c5f74eb48564a8be9d4a9d7038ed96bc2eea645c,2.00,1.000000,,,0.00,0.0,0.0,0.00,0.00,0.00,0.0,0.00,2.0,,,3.00
4,e9f7b28ec10e0470287f274dd5a327519e74d2eb9506faad,,5.666667,2.0,2.0,,,,,,,,,2.0,2.0,2.0,3.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8932,59b38c56de38368333bc0aea6c88cd7dae33b1c5cf9e5cc5,,1.000000,2.0,3.0,0.00,0.0,0.0,0.00,0.00,0.00,0.0,0.00,2.0,2.0,,3.00
8933,f000950527a6feb6de489447885cd6d18f593ec2674174ac,,4.750000,2.0,2.0,2.00,0.0,0.0,1.00,2.50,0.00,0.0,1.50,2.0,2.0,2.0,2.75
8934,f1c1045b13d18329a2bd99d2a7e2227688c0d69bf1d1e325,4.25,2.250000,,,0.00,0.0,0.0,0.00,0.00,0.00,0.0,0.00,2.0,,,3.00
8935,f000950527a6feb6bde38216d7cbbf32e66d3a3a96d4dbda,,2.250000,2.0,2.0,0.75,0.0,0.0,0.25,0.75,0.25,0.0,0.25,2.0,2.0,,2.25


In [4]:
# 读取纳税信息
tax_info = pd.read_csv("./train/tax_info.csv")

# 去除（29195 - 24614）条重复数据
tax_info = tax_info.drop_duplicates()

# 提取tax表格中的id
tax_feature = tax_info.id.drop_duplicates().reset_index(drop=True)

# 将提取出来的id转换成dataframe
tax_feature = pd.DataFrame(tax_feature)

# 将有id的部分赋值为1
tax_feature["is_tax_exist"] = 1

# # 连接上其余特征
# data = pd.merge(data, tax_feature, on="id", how="left")
# # 将没有tax信息的id填充为0
# data["is_tax_exist"].fillna(0,inplace=True)

print("纳税特征提取完成！")

纳税特征提取完成！


In [5]:
tax_feature

Unnamed: 0,id,is_tax_exist
0,f000950527a6feb6c2f40c9d8477e73a439dfa0897830397,1
1,d8071a739aa75a3b9f23966f8dae78fd226c272515b9c255,1
2,d8071a739aa75a3b8beaa7f2ea3a364a1bf8faefec72f871,1
3,f000950527a6feb6207093f8cac7a11cc2abd1763a264757,1
4,f000950527a6feb6f97af739bb95531db891a11df80bdb8b,1
...,...,...
803,47645761dc56bb8cf147c0f51d60cfe28fd995aaca7693d9,1
804,f000950527a6feb6bd25a1d6ac6f6463fa2d6e21e0d2861b,1
805,d8071a739aa75a3b6860158ec0cc8ba7972fb14ba37b9e0a,1
806,f000950527a6feb6cb8976eb56233ede461cb23103f85f32,1


In [6]:
# 读取变更信息
change_info = pd.read_csv("./train/change_info.csv")

# 45940 - 45216 去重
change_info = change_info.drop_duplicates()

# 添加一列计数
change_info["cnt"] = 1

# 提取变更信息的次数
temp = change_info[["id", "cnt"]].groupby("id").count()

temp = temp.rename(columns={"cnt":"count_change"})

def change_fea(x):
    if x>5:
        return 2
    else:
        return 1

# 对变更信息计数特征进行分箱
temp["count_change"] = temp["count_change"].apply(change_fea)

change_fea = change_info.id.drop_duplicates().reset_index(drop=True)

change_fea = pd.merge(change_fea,temp,how="left",on="id")

print("变更信息提取完成！")

变更信息提取完成！


In [7]:
change_fea

Unnamed: 0,id,count_change
0,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,1
1,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,1
2,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,1
3,216bd2aaf4d07924b4a106be25791281e2a6d9e54eaee13b,1
4,743e550a617316d5772a00182284976e17d42b6f0ca6d374,1
...,...,...
8721,d8071a739aa75a3b2cf30bec1c008a658963648897cb375b,1
8722,f000950527a6feb6de489447885cd6d18f593ec2674174ac,1
8723,f1c1045b13d18329a2bd99d2a7e2227688c0d69bf1d1e325,2
8724,da8691b210adb3f65b43370d3a362f4aa1d3b16b5ba0c9d7,1


In [8]:
# 读取新闻信息
news_info = pd.read_csv("./train/news_info.csv")

# 对新闻进行特征进行映射编码
def encode(x):
    if x=="积极":
        return 1
    if x=="中立":
        return 0
    else:
        return -1
news_info["positive_negtive"] = news_info["positive_negtive"].apply(encode)

# 统计不同信息的
temp = news_info[["id", "positive_negtive"]].groupby("id").sum()

temp = temp.rename(columns={"positive_negtive":"news_count"})

# 进行分箱
def news_fea(x):
    if x>0:
        return 1
    elif x<0:
        return -1
    else:
        return 0

# 对计数特征进行分箱
temp["news_count"] = temp["news_count"].apply(news_fea)

# 提取新闻数据id
news_fea = news_info.id.drop_duplicates().reset_index(drop=True)

news_fea = pd.merge(news_fea,temp,how="left",on="id")

print("新闻特征提取完成！")

新闻特征提取完成！


In [9]:
news_fea

Unnamed: 0,id,news_count
0,f000950527a6feb62669d6a175fe6fdccd1eb4f7ca8e5016,1
1,f000950527a6feb6e8bd9919e2ca363359bcfa997a0f9de7,-1
2,d8071a739aa75a3bcf6fb0041ee883243251d30025ab9d45,0
3,f000950527a6feb6d71de3382afa0bc5ff87bb65477f698a,1
4,f000950527a6feb65929509d9be855bf75b7337d4465843e,1
...,...,...
922,d8071a739aa75a3be6f3e200fd5532cb96764b8f4623c75a,0
923,f000950527a6feb69ea351e48351a711fb09bf1b83f04dfc,0
924,e9f7b28ec10e047005eec1a07b716d63fac8742cbdeacd46,0
925,d8071a739aa75a3b6860158ec0cc8ba7972fb14ba37b9e0a,1


In [10]:
# 其他消息的读取
other_info = pd.read_csv("./train/other_info.csv")

# 删除4条错误数据
other_info = other_info.drop(index=(other_info.loc[(other_info["id"] == "f000950527a6feb63702b1f6c1dabe5ea196d320bbbff425")].index))

other_info = other_info.drop(index=(other_info.loc[(other_info["id"] == "e9f7b28ec10e04707ba878b89e6c2d362b107a817342f9c6")].index))

other_fea = other_info

print("其他信息提取完成！")

其他信息提取完成！


In [11]:
other_fea

Unnamed: 0,id,legal_judgment_num,brand_num,patent_num
0,f000950527a6feb6d340f91da09e61347d8200cd2f0d1602,4.0,,
1,f000950527a6feb608dd9322b74a99f60851207f36a3c94c,1.0,,
2,d8071a739aa75a3b9f23966f8dae78fd226c272515b9c255,2.0,,
3,216bd2aaf4d079242209b1496f81a36c7abed9dd0bb65ed3,,1.0,
4,e9f7b28ec10e0470de9631c789f49acdd4e7cf9ed6db094b,,2.0,
...,...,...,...,...
1885,47645761dc56bb8cf147c0f51d60cfe28fd995aaca7693d9,6.0,,
1886,f000950527a6feb69ea351e48351a711fb09bf1b83f04dfc,1.0,,
1887,d8071a739aa75a3b39130af3718b2f261b57833a6a58ba55,2.0,1.0,
1888,d8071a739aa75a3b6860158ec0cc8ba7972fb14ba37b9e0a,1.0,,


In [12]:
# 基本信息
base_info = pd.read_csv("./train/base_info.csv")

# id + 标签
entprise_info = pd.read_csv("./train/entprise_info.csv")

# id + 标签(Null) 
result = pd.read_csv("./entprise_evaluate.csv")

In [13]:
temp = pd.concat([entprise_info,result], ignore_index=True)

In [14]:
# 将base_info和label进行连接，label为空的是测试集
data = pd.merge(temp, base_info, on="id", how="left")

In [15]:
# 连接变更信息特征
data = pd.merge(data, change_fea, on="id", how="left")

In [16]:
# 连接税务特征
data = pd.merge(data, tax_feature, on="id", how="left")
# 空值用0填充
data["is_tax_exist"].fillna(0,inplace=True)

In [17]:
# 连接新闻特征
data = pd.merge(data, news_fea, on="id", how="left")

In [18]:
# 连接其他特征
data = pd.merge(data, other_fea, on="id", how="left")

In [19]:
# 连接年报特征
data = pd.merge(data, annual_report_feature, on="id", how="left")

In [20]:
data['opfrom'] = pd.to_datetime(data['opfrom'],format='%Y-%m-%d')
data['opto'] = pd.to_datetime(data['opto'],format='%Y-%m-%d')
# 构造时间特征
data["time"] = (data["opto"] - data["opfrom"]).dt.days

In [21]:
# 使用目标编码对industryphy列进行训练 
enc = TargetEncoder()  
data["industryphy"] = enc.fit_transform(data["industryphy"], data["label"]) 

In [22]:
# 使用目标编码对opform列进行训练 
enc = TargetEncoder()  
data["opform"] = enc.fit_transform(data["opform"], data["label"]) 

In [23]:
# 使用目标编码对opform列进行训练
enc = TargetEncoder()  
data["oploc"] = enc.fit_transform(data["oploc"], data["label"]) 

In [24]:
train = data[data.label.notnull()].reset_index(drop=True)
test = data[data.label.isnull()].reset_index(drop=True)

In [25]:
label = train.label

In [26]:
train = train.drop(["id", "dom", "opscope", "opfrom", "opto", "label", "score"], axis=1)
test = test.drop(["id", "dom", "opscope", "opfrom", "opto", "label", "score"], axis=1)

In [27]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 1024
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test_pred = np.zeros(test_x.shape[0])
    test = np.zeros(test_x.shape[0])
    importance = np.zeros(train_x.columns.shape[0])
    
    cv_scores = []
    feature_names = train_x.columns.tolist()
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'tree_method':'gpu_hist',
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.05,
                'seed': 1024,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'tree_method':'gpu_hist',
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 1024,
                      'nthread': 36,
                      "silent": True,
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
            
        # importance += model.feature_importance() / 5
        
        train[valid_index] = val_pred
        test += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
    
    # df = pd.DataFrame({ 'column': feature_names, 'importance': importance}).sort_values(by='importance')           
    # df.to_csv("./importance.csv")
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return test

def lgb_model(x_train, y_train, x_test):
    lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_test

def cat_model(x_train, y_train, x_test):
    cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_test

In [28]:
xgb_test = xgb_model(train, label, test)

************************************ 1 ************************************
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.98397	eval-auc:0.97425
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.99567	eval-auc:0.99137
[400]	train-auc:0.99733	eval-auc:0.99169
Stopping. Best iteration:
[347]	train-auc:0.99701	eval-auc:0.99179

[0.9917938387500198]
************************************ 2 ************************************
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through t

In [29]:
lgb_test = lgb_model(train, label, test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.997156	valid_1's auc: 0.991331
[400]	training's auc: 0.998769	valid_1's auc: 0.991546
[600]	training's auc: 0.999288	valid_1's auc: 0.991548
[800]	training's auc: 0.999543	valid_1's auc: 0.991606
Early stopping, best iteration is:
[671]	training's auc: 0.999388	valid_1's auc: 0.99177
[0.9917700929253274]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.997528	valid_1's auc: 0.987523
Early stopping, best iteration is:
[108]	training's auc: 0.996014	valid_1's auc: 0.988037
[0.9917700929253274, 0.9880373319212512]
************************************ 3 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.996733	valid_1's auc: 0.994
Early stopping, be

In [30]:
print(xgb_test.sum())
print(lgb_test.sum())

923.5839730163598
929.6832208554812


In [31]:
rh_test = lgb_test * 0.5 + xgb_test * 0.5

In [32]:
result['score'] = rh_test

In [33]:
result

Unnamed: 0,id,score
0,82750f1b9d1223508ee329d47e27d35176c93eb9f35e9c1a,0.019012
1,f000950527a6feb670cc1c87c2025f3922aaa4a0206a0a33,0.702373
2,e9f7b28ec10e04700ef4db75a494f9a1e8e8b09555e6afa1,0.001019
3,beb4aaaa89e0a0ae9d77bd5d7665be6342f552f51840cf19,0.000536
4,e9f7b28ec10e0470ee4172cec0133b6826c34f27d3dff204,0.001479
...,...,...
9995,f000950527a6feb6b9e9c5a82689e87ee128abcf72ca7b96,0.108808
9996,d8071a739aa75a3bb98b032a18ae492bb8cf7ad9e0c23acd,0.064366
9997,f000950527a6feb63ae3783e4b82cbd8da7b3eaf43624866,0.001795
9998,d8071a739aa75a3bf8557cd0432d5c04e2241aee9f422220,0.001072


In [34]:
result[['id','score']].to_csv('result.csv', index=False)