In [2]:
import os, re, json, gc, itertools
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

import joblib
import xgboost as xgb

  from pandas import MultiIndex, Int64Index


In [79]:
# load data
train = pd.read_csv('data/first_round_training_data.csv')
test = pd.read_csv('data/first_round_testing_data.csv')

In [80]:
# train 分离 target
target = train.pop('Quality_label')

In [81]:
# target 编码
target = (target.astype("category")
          .cat.set_categories(["Fail", "Pass", "Good", "Excellent"])
          .cat.codes)
target = target.rename('target')

In [82]:
# train 分离 parameter 和 attribute
col_attr = [f'Attribute{x}' for x in range(1, 11)]
attr = train[col_attr]
train = train.drop(columns=col_attr)

In [83]:
# test 分离 group
group = test.pop('Group')
group = group.rename('group')

In [84]:
# test 调整参数顺序，与 train 保持一致
col_test = [f"Parameter{x}" for x in range(1, 11)]
test = test[col_test]

In [85]:
# 简化列名
colnames = [f'p{i}' for i in range(1, 11)]
train.columns = colnames
test.columns = colnames
colnames = [f'a{i}' for i in range(1, 11)]
attr.columns = colnames

In [86]:
# 统一小数
round_fmt = {1: 11, 2: 11, 3: 11, 4: 11,
             5: 8, 6: 8, 7: 8, 8: 8, 9: 8, 10: 8}
for i, r in round_fmt.items():
    train[f'p{i}'] = train[f'p{i}'].round(r)
    test[f'p{i}'] = test[f'p{i}'].round(r)
# transform log2
# 对数转换和归一化后存在负数，影响加减乘除的结果
# 需要测试对数转换前的效果
train = train.apply(np.log2)
attr = attr.apply(np.log2)
test = test.apply(np.log2)

### A4

In [87]:
attribute='a4'
df = train.join(attr[attribute])

In [88]:
#使用统一参数
best_hp={'max_depth': 5,
 'learning_rate': 0.1,
 'n_estimators': 571,
 'verbosity': 1,
 'objective': 'reg:squarederror',
 'booster': 'dart',
 'n_jobs': -1,
 'gamma': 0,
 'min_child_weight': 1,
 'max_delta_step': 0,
 'subsample': 1,
 'colsample_bytree': 1,
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'scale_pos_weight': 1,
 'base_score': 0.5,
 'importance_type': 'gain',
 'num_round': 223}

In [89]:
x_trn,x_val,y_trn,y_val = train_test_split(train,attr.a4,test_size=1000)
m = xgb.XGBRegressor().set_params(**best_hp)
m.fit(x_trn,y_trn)

Parameters: { "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=571, n_jobs=-1,
             num_parallel_tree=1, num_round=223, predictor='auto',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=1)

In [90]:
# 保存模型
joblib.dump(m,f'model/tp3_{attribute}.m')

['model/tp3_a4.m']

### A5

In [91]:
attribute = 'a5'
df = train.join(attr[attribute])

In [92]:
x_trn,x_val,y_trn,y_val = train_test_split(train,attr.a5,test_size=1000)
m = xgb.XGBRegressor().set_params(**best_hp)
m.fit(x_trn,y_trn)

Parameters: { "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=571, n_jobs=-1,
             num_parallel_tree=1, num_round=223, predictor='auto',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=1)

In [93]:
# 保存模型
joblib.dump(m,f'model/tp3_{attribute}.m')

['model/tp3_a5.m']

### A6

In [94]:
attribute = 'a6'
df = train.join(attr[attribute])
x_trn, x_val, y_trn, y_val = train_test_split(train, attr.a6, test_size=1000)
m = xgb.XGBRegressor().set_params(**best_hp)
m.fit(x_trn, y_trn)

Parameters: { "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=571, n_jobs=-1,
             num_parallel_tree=1, num_round=223, predictor='auto',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=1)

In [95]:
# 保存模型
joblib.dump(m, f'model/tp3_{attribute}.m')

['model/tp3_a6.m']

### A7

In [96]:
attribute = 'a7'
df = train.join(attr[attribute])
x_trn, x_val, y_trn, y_val = train_test_split(train, attr.a7, test_size=1000)
m = xgb.XGBRegressor().set_params(**best_hp)
m.fit(x_trn, y_trn)

Parameters: { "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=571, n_jobs=-1,
             num_parallel_tree=1, num_round=223, predictor='auto',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=1)

In [97]:
# 保存模型
joblib.dump(m, f'model/tp3_{attribute}.m')

['model/tp3_a7.m']

### A8

In [98]:
attribute = 'a8'
df = train.join(attr[attribute])
x_trn, x_val, y_trn, y_val = train_test_split(train, attr.a8, test_size=1000)
m = xgb.XGBRegressor().set_params(**best_hp)
m.fit(x_trn, y_trn)

Parameters: { "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=571, n_jobs=-1,
             num_parallel_tree=1, num_round=223, predictor='auto',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=1)

In [99]:
# 保存模型
joblib.dump(m, f'model/tp3_{attribute}.m')

['model/tp3_a8.m']

### A9

In [100]:
attribute = 'a9'
df = train.join(attr[attribute])
x_trn, x_val, y_trn, y_val = train_test_split(train, attr.a9, test_size=1000)
m = xgb.XGBRegressor().set_params(**best_hp)
m.fit(x_trn, y_trn)

Parameters: { "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=571, n_jobs=-1,
             num_parallel_tree=1, num_round=223, predictor='auto',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=1)

In [101]:
# 保存模型
joblib.dump(m, f'model/tp3_{attribute}.m')

['model/tp3_a9.m']

### A10

In [102]:
attribute = 'a10'
df = train.join(attr[attribute])
x_trn, x_val, y_trn, y_val = train_test_split(train, attr.a10, test_size=1000)
m = xgb.XGBRegressor().set_params(**best_hp)
m.fit(x_trn, y_trn)

Parameters: { "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=571, n_jobs=-1,
             num_parallel_tree=1, num_round=223, predictor='auto',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=1)

In [103]:
# 保存模型
joblib.dump(m, f'model/tp3_{attribute}.m')

['model/tp3_a10.m']

In [104]:
# stacking 建立 attr 特征
cv = KFold(6,shuffle=True,random_state=1)
attrs = [f'a{i}' for i in range(4, 11)]
trn_res = None
tst_res = None
for attribute in attrs:
    print(attribute)
    m = joblib.load(f'model/tp3_{attribute}.m')
    trn_attr = None
    tst_attr = None
    for index, (trn_idx, val_idx) in enumerate(cv.split(train, attr[attribute])):
        x_trn, x_val = train.iloc[trn_idx], train.iloc[val_idx]
        y_trn, y_val = attr[attribute].iloc[trn_idx], attr[attribute].iloc[val_idx]
        m.fit(x_trn, y_trn)
        if trn_attr is None:
            trn_attr = np.c_[val_idx, m.predict(x_val)]
            tst_attr = m.predict(test)/cv.n_splits
        else:
            trn_attr = np.r_[trn_attr, np.c_[val_idx, m.predict(x_val)]]
            tst_attr += m.predict(test)/cv.n_splits
    trn_attr = pd.DataFrame(trn_attr).set_index(0)
    trn_attr.index = trn_attr.index.astype('int')
    trn_attr.columns = [attribute]
    if trn_res is None:
        trn_res = trn_attr
        tst_res = tst_attr
    else:
        trn_res = trn_res.join(trn_attr)
        tst_res = np.c_[tst_res,tst_attr]

a4
Parameters: { "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_round" } might not be used.

 

In [105]:
train = train.join(trn_res)
tst_res = pd.DataFrame(tst_res)
tst_res.columns = [f'a{i}' for i in range(4,11)]
test = test.join(tst_res)

In [106]:
# save data
save_path = 'data/tp6.h5'
train.to_hdf(save_path,'train')
target.to_hdf(save_path,'target')
test.to_hdf(save_path,'test')
attr.to_hdf(save_path,'attr')
group.to_hdf(save_path,'group')

In [3]:
# load data
load_path = 'data/tp6.h5'
train = pd.read_hdf(load_path,'train')
target = pd.read_hdf(load_path,'target')
test = pd.read_hdf(load_path,'test')
attr = pd.read_hdf(load_path,'attr')
group = pd.read_hdf(load_path,'group')

In [117]:
# def create_uid(df,cols,uid_name):
#     df[uid_name]=df[cols[0]].astype(str)
#     if len(cols)>1:
#         for c in cols[1:]:
#             df[uid_name]=df[uid_name]+df[c].astype(str)
#     return df
#
# def uid_aggregation(train,test,main_cols,uid,agg_method):
#     for main_col in main_cols:
#         new_col_name=uid+'_'+main_col+'_'+agg_method
#         train[new_col_name]=train.groupby([uid])[main_col].transform(agg_method)
#         test[new_col_name]=test.groupby([uid])[main_col].transform(agg_method)
#
# def frequency_encoding(train,test,cols):
#     for c in cols:
#         new_col_name=c+'_freq'
#         train[new_col_name] = train[c].map(train[c].value_counts())
#         test[new_col_name] = test[c].map(test[c].value_counts())
#
# def nunique_encoding(train,test,main_cols,col):
#     for main_col in main_cols:
#         new_col_name=main_col+'_'+col+'_nunique'
#         train[new_col_name]= train[col].map(train.groupby(col)[main_col].nunique())
#         test[new_col_name]= test[col].map(test.groupby(col)[main_col].nunique())

In [4]:
for c in range(4,11):
    col='a'+str(c)
    train[col]=train[col].round(3)
    test[col]=test[col].round(3)

# a4+a5+a6
train['a4_plus_a5_plus_a6']=train['a4']+train['a5']+train['a6']
test['a4_plus_a5_plus_a6']=test['a4']+test['a5']+test['a6']

# a4+a5
train['a4_plus_a5']=train['a4']+train['a5']
test['a4_plus_a5']=test['a4']+test['a5']

# a5+a6
train['a5_plus_a6']=train['a5']+train['a6']
test['a5_plus_a6']=test['a5']+test['a6']

# drop_cols
drop_cols=['p1','p2','p3','p4','p5','p6','p7','p8','p10',]
train =train.drop(columns=drop_cols)
test =test.drop(columns=drop_cols)
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6000 entries, 0 to 5999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   p9                  6000 non-null   float64
 1   a4                  6000 non-null   float64
 2   a5                  6000 non-null   float64
 3   a6                  6000 non-null   float64
 4   a7                  6000 non-null   float64
 5   a8                  6000 non-null   float64
 6   a9                  6000 non-null   float64
 7   a10                 6000 non-null   float64
 8   a4_plus_a5_plus_a6  6000 non-null   float64
 9   a4_plus_a5          6000 non-null   float64
 10  a5_plus_a6          6000 non-null   float64
dtypes: float64(11)
memory usage: 562.5 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6000 entries, 0 to 5999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   p9 

In [5]:
best_hp={'max_depth': 3,
 'learning_rate': 0.1,
 'n_estimators': 203,
 'verbosity': 1,
 'objective': 'multi:softmax',
 'booster': 'dart',
 'n_jobs': -1,
 'gamma': 0,
 'min_child_weight': 1,
 'max_delta_step': 0,
 'subsample': 1,
 'colsample_bytree': 1,
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 # 'reg_alpha': 6.736871781492226,
 # 'reg_lambda': 0.073512560432491,
 'scale_pos_weight': 1,
 'base_score': 0.5,
 'num_class': 4,
 'num_round': 295}

In [6]:
from sklearn.metrics import log_loss, accuracy_score

oof = np.zeros((train.shape[0], 4))
prediction = np.zeros((test.shape[0], 4))
seeds = [1, 2020, 66666]
num_model_seed = 3
for model_seed in range(num_model_seed):
    print("开始训练第%d个模型：" % (model_seed + 1))
    oof_cat = np.zeros((train.shape[0], 4))
    prediction_cat = np.zeros((test.shape[0], 4))
    skf = StratifiedKFold(n_splits=5, random_state=seeds[model_seed], shuffle=True)#分割
    for index, (train_index, test_index) in enumerate(skf.split(train, target)):
        print("正在训练第%d折" % (index + 1))
        train_x, test_x, train_y, test_y = train.iloc[train_index], train.iloc[test_index], target.iloc[train_index], \
                                           target.iloc[test_index]
        gc.collect()
        m = xgb.XGBClassifier().set_params(**best_hp).set_params(n_jobs=-1,learning_rate=0.05)
        m.fit(train_x, train_y) # xgb需要接受的是字典
        oof_cat[test_index] += m.predict_proba(test_x)
        prediction_cat += m.predict_proba(test) / skf.n_splits
        gc.collect()
    oof += oof_cat / num_model_seed
    prediction += prediction_cat / num_model_seed
    print('logloss', log_loss(pd.get_dummies(target).values, oof_cat))
    print('ac', accuracy_score(target, np.argmax(oof_cat, axis=1)))
print("*" * 30)
print('mean_logloss', log_loss(pd.get_dummies(target).values, oof))
print('mean_ac', accuracy_score(target, np.argmax(oof, axis=1)))

group_size=50
mname='CCF_model_xgb'
res_prob = pd.DataFrame(np.c_[group,prediction])
res_prob.columns=['Group', 'Fail ratio', 'Pass ratio', 'Good ratio', 'Excellent ratio']
res_prob = res_prob[['Group', 'Excellent ratio','Good ratio', 'Pass ratio','Fail ratio']]
res_prob =(res_prob.groupby('Group').sum()/group_size).reset_index()
res_prob.to_csv(f'submission/{mname}_prob.csv',index=False,encoding='utf-8')

开始训练第1个模型：
正在训练第1折
Parameters: { "num_round", "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


正在训练第2折
Parameters: { "num_round", "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


正在训练第3折
Parameters: { "num_round", "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


正在训练第4折
Parameters: { "num_round", "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issu

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


正在训练第2折
Parameters: { "num_round", "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


正在训练第3折
Parameters: { "num_round", "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


正在训练第4折
Parameters: { "num_round", "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issu

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


正在训练第2折
Parameters: { "num_round", "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


正在训练第3折
Parameters: { "num_round", "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


正在训练第4折
Parameters: { "num_round", "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issu

In [155]:
len(train.columns)

11

In [156]:
train.columns

Index(['p9', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'a10', 'a4_plus_a5_plus_a6',
       'a4_plus_a5', 'a5_plus_a6'],
      dtype='object')