## 1. 数据读取和预处理

给定数据是csv的表格，包含54列数据以及标签和ID。使用pandas进行数据读取，并放在`data`中，为了避免对标签操作，先删除了标签和ID列，后续再加回来。

In [None]:
# 读取数据
train = pd.read_csv('/work/data/train_base_data.csv')
test = pd.read_csv('/work/data/test_data.csv')
data = pd.concat([train, test]).reset_index(drop=True)

y_label = ['ID', 'CHANNEL_A', 'CHANNEL_B', 'CHANNEL_C']
y_data = data[y_label]
data = data.drop(columns=y_label)

首先，通过观察确定了数据集中的字符型特征列，具体包括COL3、COL4、COL5和COL19。对于这些字符型特征，实施了`Label Encoding`，这是一种将类别标签转换为连续数值的技术，以便于后续的数值分析。

对于数据集中的缺失值，在尝试了均值填充和众数填充之后，决定采用更为保守的策略，即直接将缺失值填充为`-2`。

最后，将列中的最大值大于1000的加入`num_f`作为代表性的数值型特征。

为了有效减少无用的数据，对方差较大和较小的进行了删除。

In [None]:
def data_preprocessing(data, mode='train'):
    ff = [i for i in data.columns if i not in ['ID', 'CHANNEL_A', 'CHANNEL_B', 'CHANNEL_C']]
    # 需要特别处理的类别特征的列名
    cat_f = ['COL3', 'COL4', 'COL5', 'COL19']
    num_f = []  # 存储数值型特征的列名
    for f in tqdm(ff):
        data[f] = data[f].fillna(-2)
        data[f] = data[f].astype('str')
        data[f] = data[f].apply(lambda x: x.replace(' ', '-1'))
        if f not in cat_f:
            data[f] = data[f].astype('float')
        else:
            data[f] = data[f].astype('str')
            # 对类别特征进行Label Encoding
            if data[f].dtype == 'object':
                lb = LabelEncoder()
                data[f] = lb.fit_transform(data[f])
            else:
                grade_dict = {chr(i): i-96 for i in range(97, 123)}
                data[f] = data[f].map(grade_dict)
        if data[f].max()>1000 and mode != 'test':
            num_f.append(f)

    # 去掉低方差、高方差特征
    if mode != 'test':
        data, num_f, ff, cat_f = remove_features(data, num_f, ff, cat_f)
    
    return data, num_f, ff, cat_f

## 2. 特征工程

数据预处理后，下一步进行特征工程。分别尝试了：
1. 偏离值特征
2. 数值和类别特征交叉
3. 数值特征加减乘除交叉
4. 数值特征做 max, min, mean, std

实验发现，偏离值特征的作用不大，而数值特征的两个处理作用相似，故最终保留了计算量较小的“数值特征做 max, min, mean, std”，最终的特征工程如下：

In [None]:
def feature_engineering(data, num_f, ff, cat_f, mode='train'):
    print('the num of ff num_f cat_f ', len(ff), len(num_f), len(cat_f))  # 33 0 1
    # for group in tqdm(cat_f):
    #     for i, feature in enumerate(ff, start=1):  # 添加序号计数，从1开始
    #         if feature not in cat_f:
    #             tmp = data.groupby(group)[feature].agg(['mean', 'std', 'max', 'min', 'sum']).reset_index()
    #             tmp = pd.merge(data, tmp, on=group, how='left')
    #             # 创建新的特征，表示相对于组内统计的偏差
    #             data['{}-mean_gb_{}'.format(feature, group)] = data[feature] - tmp['mean']
    #             data['{}-min_gb_{}'.format(feature, group)] = data[feature] - tmp['min']
    #             data['{}-max_gb_{}'.format(feature, group)] = data[feature] - tmp['max']
    #             data['{}/sum_gb_{}'.format(feature, group)] = data[feature] / tmp['sum']   
    
    # 数值型特征和类别特征之间的交叉
    for i in tqdm(range(len(num_f))):
        for j in range(i + 1, len(num_f)):
            for cat in cat_f[1:]:
                f1 = ff[i]
                f2 = ff[j]
                data[f'{f1}_{f2}_log_{cat}'] = (np.log1p(data[f1]) - np.log1p(data[f2])) * data[cat]
                data[f'{f1}+{f2}_log_{cat}'] = (np.log1p(data[f1]) + np.log1p(data[f2])) * data[cat]
                data[f'{f1}*{f2}_log_{cat}'] = (np.log1p(data[f1]) * np.log1p(data[f2])) * data[cat]
                data[f'{f1}/{f2}_log_{cat}'] = (np.log1p(data[f1]) / np.log1p(data[f2])) * data[cat]
                data[f'{f2}/{f1}_log_{cat}'] = (np.log1p(data[f2]) / np.log1p(data[f1])) * data[cat]

                data[f'{f1}_{f2}_log_{cat}_'] = (np.log1p(data[f1]) - np.log1p(data[f2])) / data[cat]
                data[f'{f1}+{f2}_log_{cat}_'] = (np.log1p(data[f1]) + np.log1p(data[f2])) / data[cat]
                data[f'{f1}*{f2}_log_{cat}_'] = (np.log1p(data[f1]) * np.log1p(data[f2])) / data[cat]
                data[f'{f1}/{f2}_log_{cat}_'] = (np.log1p(data[f1]) / np.log1p(data[f2])) / data[cat]
                data[f'{f2}/{f1}_log_{cat}_'] = (np.log1p(data[f2]) / np.log1p(data[f1])) / data[cat]

    # # 数值型特征之间的加减乘除交叉
    # for i in tqdm(range(len(num_f))):
    #     for j in range(i + 1, len(num_f)):
    #         f1 = ff[i]
    #         f2 = ff[j]
    #         data[f'{f1}_{f2}'] = data[f1] - data[f2]
    #         data[f'{f1}+{f2}'] = data[f1] + data[f2]
    #         data[f'{f1}*{f2}'] = data[f1] * data[f2]
    #         data[f'{f1}/{f2}'] = data[f1] / data[f2]
    #         data[f'{f2}/{f1}'] = data[f2] / data[f1]

    # 数值特征做 max, min, mean, std
    for i in tqdm(range(len(num_f))):
        f = ff[i]
        data[f'{f}_max'] = data[f].max()
        data[f'{f}_min'] = data[f].min()
        mean_series = data[f].mean()
        std_series = data[f].std()
        ptp_series = data[f].max() - data[f].min()  # 计算峰峰值
        data[f'{f}_mean'] = mean_series
        data[f'{f}_std'] = std_series
        data[f'{f}_ptp'] = ptp_series

    return data

## 3. 模型全特征训练

数据和标签准备好之后，开始进行模型的训练。我们的任务是3个二分类问题，故构建三个模型，选择`lightgbm`作为模型，使用5折交叉验证。boosting类型选择`gbdt`，目标函数选择`binary`，评价指标选择`auc`。为了方便后续的特征选择，我们需要保存每个特征的重要性，以便后续的特征选择。

In [None]:
# 训练模型
def train_model(X_train, X_test, features, y, threshold_, params, seed=2024, save_model=False, model_path='model.txt', kf=10):
    feat_imp_df = pd.DataFrame({'feat': features, 'imp': 0})    # 存储特征名称和它们的重要性
    KF = StratifiedKFold(n_splits=kf, random_state=seed, shuffle=True)   # 5折交叉验证
    # 初始化保存每个折的分数列表
    score_lists = []
    
    oof_lgb = np.zeros(len(X_train))    # 初始化1个任务的oof预测结果
    predictions_lgb = np.zeros(len(X_test))  # 测试集的预测结果，1个任务

    for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train.values, y.values)):
        print("[fold n°{}]".format(fold_ + 1))
        trn_data = lgb.Dataset(X_train.iloc[trn_idx][features], label=y.iloc[trn_idx])
        val_data = lgb.Dataset(X_train.iloc[val_idx][features], label=y.iloc[val_idx])

        clf = lgb.train(params, 
                        trn_data, 
                        valid_sets=[trn_data, val_data], 
                        verbose_eval=100)
        
        # model_lgb = lgb.LGBMClassifier(objective='binary', max_depth=3, num_leaves=50,
        #                     n_estimators=5000,
        #                     min_child_samples=18, min_child_weight=0.001,
        #                     feature_fraction=0.6, bagging_fraction=0.5,
        #                     metric='auc', )
        # params_test={
        #         'learning_rate=': [0.5, 0.1, 0.05, 0.01],
        #     }
        # gsearch = GridSearchCV(estimator=model_lgb, param_grid=params_test, scoring='f1_micro', cv=5, verbose=1, n_jobs=4)
        # gsearch.fit(X_train.iloc[trn_idx][features], y.iloc[trn_idx])
        # print(gsearch.best_params_, gsearch.best_score_)

        oof_lgb[val_idx] = clf.predict(X_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
        predictions_lgb += clf.predict(X_test[features], num_iteration=clf.best_iteration) / KF.n_splits
        feat_imp_df['imp'] += clf.feature_importance() / KF.n_splits
        score_lists.append(f1_score(y.iloc[val_idx], [1 if i >= threshold_ else 0 for i in oof_lgb[val_idx]]))


    # 打印每个任务的评估指标
    print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
    print("F1 score: {}".format(f1_score(y, [1 if i >= threshold_ else 0 for i in oof_lgb])))
    print("Precision score: {}".format(precision_score(y, [1 if i >= threshold_ else 0 for i in oof_lgb])))
    print("Recall score: {}".format(recall_score(y, [1 if i >= threshold_ else 0 for i in oof_lgb])))
    print("F1 mean: {}".format(np.mean(score_lists)))

    # # 假设oof_lgb是你的模型输出的概率，y是真实标签
    # thresholds = np.linspace(0, threshold_+0.1, 100)  # 生成一系列可能的阈值
    # best_threshold = 0
    # best_f1 = 0

    # for threshold in thresholds:
    #     y_pred = [1 if i >= threshold else 0 for i in oof_lgb]
    #     current_f1 = f1_score(y, y_pred)
    #     if current_f1 > best_f1:
    #         best_f1 = current_f1
    #         best_threshold = threshold

    # print("Best F1 score: {}".format(best_f1))
    # print("Best threshold: {}".format(best_threshold))

    # # 使用最佳阈值计算其他指标
    # y_pred_best = [1 if i >= best_threshold else 0 for i in oof_lgb]
    # print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
    # print("F1 score with best threshold: {}".format(f1_score(y, y_pred_best)))
    # print("Precision score with best threshold: {}".format(precision_score(y, y_pred_best)))
    # print("Recall score with best threshold: {}".format(recall_score(y, y_pred_best)))

    if save_model:
        booster = lgb.train(params, trn_data, valid_sets=[trn_data, val_data], verbose_eval=100)
        booster.save_model(model_path)  # 保存模型到文件
    
    # 返回特征重要性、每个任务的oof预测结果和测试集的预测结果
    return feat_imp_df, oof_lgb, predictions_lgb

由于最终评价指标为F1，计算精确率和召回率时，**阈值的选择**尤为重要。我先预设阈值都为0.5，对模型进行训练推理，计算出推理到的测试集中预测值的均值作为阈值，即`[0.05,0.25,0.05]`，但是这样计算仍有改进的空间。在后续训练时，我在(0, threshold_+0.1)中均匀采样100次，分别计算F1指标，选择F1最高的作为最终阈值(见代码注释的地方)，结合线上预测结果，最终选定阈值为`[0.06, 0.2575757575757576, 0.03333333333333334]`。

## 4. 重要特征训练

模型全特征训练完，保存下了特征重要性，然后我们可以根据特征重要性筛选特征。我筛选了重要性大于`0.05`的特征，然后再次训练模型，这次进行10折交叉验证，充分保证模型的质量，训练结束后把3个模型分别保存下来。

In [None]:
length = [0 for _ in range(3)]
# 筛选大于0.05的特征
for i in range(3):
    feat_imp_df_i = feat_imp_df[i]
    features_ = feat_imp_df_i[feat_imp_df_i['imp'] > 0.05]['feat'].to_list()
    length[i] = len(features_)
pred = [[] for _ in range(3)]
oof = [[] for _ in range(3)]
for i in range(3):
    print('[Channel {}]'.format(i))
    feat_imp_df_i = feat_imp_df[i]
    pred[i], oof[i] = mean_fusion(train, test, feat_imp_df_i, y.iloc[i], threshold[i], length[i], params[i], model_path='model_{}.txt'.format(i))


## 5. 测试

训练完毕后，编写测试代码，按照以上进行数据预处理和特征工程，读取保存好的重要性特征，选择训练时用到的特征进行预测。

In [None]:
# 加载特征重要性
with open('feat_imp_a.pkl', 'rb') as f:
    feat_imp_df = pickle.load(f)

pred = [[] for _ in range(3)]
for i in range(3):
    # 加载保存的模型
    booster = lgb.Booster(model_file='/work/model_{}.txt'.format(i))
    feat_imp_df_i = feat_imp_df[i]
    features = feat_imp_df_i.sort_values(['imp'])[-length[i]:]['feat'].to_list()
    pred[i] = booster.predict(test[features], num_iteration=booster.best_iteration)

最后将结果写进`'/work/output.csv'`文件中。

In [None]:
pred_np = np.array(pred)
# # 分别计算一下三个通道的预测值的均值
# A_mean = np.mean(pred_np[0])
# B_mean = np.mean(pred_np[1])
# C_mean = np.mean(pred_np[2])
# print(A_mean, B_mean, C_mean)

test['CHANNEL_A'] = np.where(pred_np[0] >= threshold[0], 1, 0)
test['CHANNEL_B'] = np.where(pred_np[1] >= threshold[1], 1, 0)
test['CHANNEL_C'] = np.where(pred_np[2] >= threshold[2], 1, 0)

import csv
with open('/work/output.csv', newline='', mode='w') as outputFile:
    fieldnames = ['ID', 'CHANNEL_A', 'CHANNEL_B', 'CHANNEL_C']
    writer = csv.DictWriter(outputFile, fieldnames=fieldnames)
    writer.writerow({'ID': 'ID', 'CHANNEL_A': 'CHANNEL_A', 'CHANNEL_B': 'CHANNEL_B', 'CHANNEL_C': 'CHANNEL_C'})
    for index, row in test[['ID', 'CHANNEL_A', 'CHANNEL_B', 'CHANNEL_C']].iterrows():
        writer.writerow({'ID': row['ID'], 'CHANNEL_A': row['CHANNEL_A'], 'CHANNEL_B': row['CHANNEL_B'], 'CHANNEL_C': row['CHANNEL_C']})
print(test[['ID', 'CHANNEL_A']].head())

## 6. LGB模型进行调参

> 上述需要训练3个模型，三个模型的参数可能是不一样的，而且参数的选择对结果也有影响，故尝试了网格搜索进行调参，但是效果并不理想，这里先把调参过程列下。

先把学习率先定一个较高的值，这里取 `learning_rate = 0.1`，其次确定估计器`boosting_type`的类型，默认选`gbd`。

为了确定估计器的数目，也就是boosting迭代的次数，参数名为`num_boost_round`，先将该参数设成一个较大的数5000，然后在设置了早停`early_stopping_round=200`，避免过拟合。

接下来进行其他参数调优，引入`sklearn`中的`GridSearchCV()`函数进行网格搜索。

1. 首先调整`num_leaves`与`max_depth`

In [None]:
from sklearn.model_selection import GridSearchCV
model_lgb = lgb.LGBMClassifier(objective='binary',num_leaves=2**6,
                      learning_rate=0.1, n_estimators=5000,
                      metric='auc')
params_test={
    'max_depth': range(3,8,2),
    'num_leaves':range(50, 170, 30)
}
gsearch = GridSearchCV(estimator=model_lgb, param_grid=params_test, scoring='f1_micro', cv=5, verbose=1, n_jobs=4)
gsearch.fit(X_train.iloc[trn_idx][features], y.iloc[trn_idx])
print(gsearch.best_params_, gsearch.best_score_)

打印了以下运行结果，这里运行了12个参数组合，得到的最优解是在`max_depth`为3/5，`num_leaves`为50的情况下，三行对应三个模型

In [None]:
{'max_depth': 5, 'num_leaves': 50} 0.96
{'max_depth': 5, 'num_leaves': 50} 0.672
{'max_depth': 3, 'num_leaves': 50} 0.984

2. 调整`min_data_in_leaf`和`min_sum_hessian_in_leaf`

In [None]:
model_lgb = lgb.LGBMClassifier(objective='binary', max_depth=3, num_leaves=50,
                      learning_rate=0.1, n_estimators=5000,
                      metric='auc', )
params_test={
    'min_child_samples': [18, 19, 20, 21, 22],
    'min_child_weight':[0.001, 0.002]
}
gsearch = GridSearchCV(estimator=model_lgb, param_grid=params_test, scoring='f1_micro', cv=5, verbose=1, n_jobs=4)
gsearch.fit(X_train.iloc[trn_idx][features], y.iloc[trn_idx])
print(gsearch.best_params_, gsearch.best_score_)

In [None]:
Fitting 5 folds for each of 10 candidates, totalling 50 fits

可以看到，`min_data_in_leaf`的最优值为18，而`min_sum_hessian_in_leaf`为0.01。也就是这两个参数`min_data_in_leaf`和 `min_sum_hessian_in_leaf`。

In [None]:
{'min_child_samples': 18, 'min_child_weight': 0.001} 0.952
{'min_child_samples': 19, 'min_child_weight': 0.001} 0.676
{'min_child_samples': 18, 'min_child_weight': 0.001} 0.984

3. `feature_fraction`参数来进行特征的子抽样，`bagging_fraction`和`bagging_freq`相当于subsample样本采样

In [None]:
model_lgb = lgb.LGBMClassifier(objective='binary', max_depth=3, num_leaves=50,
                      learning_rate=0.1, n_estimators=5000,
                      min_child_samples=18, min_child_weight=0.001,
                      metric='auc', )
params_test={
        'feature_fraction': [0.5, 0.6, 0.7, 0.8, 0.9],
        'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0]
    }
gsearch = GridSearchCV(estimator=model_lgb, param_grid=params_test, scoring='f1_micro', cv=5, verbose=1, n_jobs=4)
gsearch.fit(X_train.iloc[trn_idx][features], y.iloc[trn_idx])
print(gsearch.best_params_, gsearch.best_score_)

In [None]:
Fitting 5 folds for each of 25 candidates, totalling 125 fits

从这里可以看出来，`bagging_feaction``和feature_fraction`的理想值分别是0.6和0.5/0.8。

In [None]:
{'bagging_fraction': 0.6, 'feature_fraction': 0.5} 0.952
{'bagging_fraction': 0.6, 'feature_fraction': 0.8} 0.6679999999999999
{'bagging_fraction': 0.6, 'feature_fraction': 0.5} 0.984

4. 降低`learning_rate`

In [None]:
model_lgb = lgb.LGBMClassifier(objective='binary', max_depth=3, num_leaves=50,
                     n_estimators=5000,
                      min_child_samples=18, min_child_weight=0.001,
                      feature_fraction=0.6, bagging_fraction=0.5,
                      metric='auc', )
params_test={
        'learning_rate=': [0.5, 0.1, 0.05, 0.01],
    }
gsearch = GridSearchCV(estimator=model_lgb, param_grid=params_test, scoring='f1_micro', cv=5, verbose=1, n_jobs=4)
gsearch.fit(X_train.iloc[trn_idx][features], y.iloc[trn_idx])
print(gsearch.best_params_, gsearch.best_score_)

这个学习率竟然是0.5最好。

## 7. 模型再训练

在测试时，我尝试把保存的重要性特征输出进行查看（如下代码展示），发现这些特征的列名都是原本数据的列，特征工程似乎不起作用。所以我想尝试只选择这些重要性列，再次从头进行训练，数据换为只包含这些列的数据。

In [4]:
import pickle

with open('/work/num_f.txt', 'r') as f:
    lines = f.readlines()  # 读取所有行到一个列表中
    num_f = eval(lines[0])
    ff = eval(lines[1])
    cat_f = eval(lines[2])
    length = eval(lines[3])
        
with open('/work/feat_imp_a.pkl', 'rb') as f:
    feat_imp_df = pickle.load(f)
for i in range(3):
    feat_imp_df_i = feat_imp_df[i]
    features = feat_imp_df_i.sort_values(['imp'])[-length[i]:]['feat'].to_list()
    # 打印这些特征的列,发现全是原本的列名,说明特征工程无效,故设计了第二次模型训练,即run_B.py
    print(features)

['COL32', 'COL54', 'COL12', 'COL27', 'COL46', 'COL28', 'COL37', 'COL14', 'COL13', 'COL39', 'COL43', 'COL3', 'COL31', 'COL21', 'COL25', 'COL47', 'COL45', 'COL6', 'COL29', 'COL36', 'COL38', 'COL35', 'COL52', 'COL42', 'COL24', 'COL18', 'COL15', 'COL51', 'COL17', 'COL2', 'COL16', 'COL9', 'COL10', 'COL1']
['COL28', 'COL12', 'COL54', 'COL27', 'COL32', 'COL13', 'COL3', 'COL39', 'COL29', 'COL6', 'COL14', 'COL46', 'COL21', 'COL47', 'COL43', 'COL38', 'COL42', 'COL35', 'COL16', 'COL17', 'COL45', 'COL25', 'COL52', 'COL10', 'COL31', 'COL37', 'COL2', 'COL9', 'COL24', 'COL36', 'COL51', 'COL15', 'COL18', 'COL1']
['COL28', 'COL54', 'COL12', 'COL27', 'COL32', 'COL39', 'COL3', 'COL46', 'COL6', 'COL47', 'COL16', 'COL13', 'COL25', 'COL14', 'COL21', 'COL10', 'COL17', 'COL38', 'COL45', 'COL31', 'COL24', 'COL52', 'COL29', 'COL43', 'COL42', 'COL2', 'COL9', 'COL37', 'COL1', 'COL15', 'COL35', 'COL18', 'COL51', 'COL36']


重新训练过后，选出来的重要性特征列名如下。经实验尝试，发现好像再训练也没什么作用。训练代码和测试代码见`run_B.py`和`test_B.py`。

In [6]:
import pickle

with open('/work/num_f_b.txt', 'r') as f:
    lines = f.readlines()  # 读取所有行到一个列表中
    num_f = eval(lines[0])
    ff = eval(lines[1])
    cat_f = eval(lines[2])
    length = eval(lines[3])
        
with open('/work/feat_imp_b.pkl', 'rb') as f:
    feat_imp_df = pickle.load(f)
for i in range(3):
    feat_imp_df_i = feat_imp_df[i]
    features = feat_imp_df_i.sort_values(['imp'])[-length[i]:]['feat'].to_list()
    # 打印这些特征的列,发现全是原本的列名,说明特征工程无效,故设计了第二次模型训练,即run_B.py
    print(features)

['COL17', 'COL51-max_gb_COL3', 'COL15-max_gb_COL3', 'COL18-mean_gb_COL3', 'COL39-min_gb_COL3', 'COL2/sum_gb_COL3', 'COL43-min_gb_COL3', 'COL36-max_gb_COL3', 'COL31-max_gb_COL3', 'COL42', 'COL37-max_gb_COL3', 'COL17-min_gb_COL3', 'COL24/sum_gb_COL3', 'COL16-min_gb_COL3', 'COL18-max_gb_COL3', 'COL10/sum_gb_COL3', 'COL52', 'COL51-mean_gb_COL3', 'COL51-min_gb_COL3', 'COL38', 'COL51/sum_gb_COL3', 'COL2-max_gb_COL3', 'COL9-min_gb_COL3', 'COL1-min_gb_COL3', 'COL1-mean_gb_COL3', 'COL1/sum_gb_COL3', 'COL10-mean_gb_COL3', 'COL16-max_gb_COL3', 'COL24', 'COL10-min_gb_COL3', 'COL1-max_gb_COL3', 'COL16', 'COL9', 'COL16-mean_gb_COL3', 'COL2-mean_gb_COL3', 'COL9-max_gb_COL3', 'COL1', 'COL10', 'COL9-mean_gb_COL3', 'COL17-mean_gb_COL3', 'COL2-min_gb_COL3', 'COL2']
['COL13', 'COL25-mean_gb_COL3', 'COL29/sum_gb_COL3', 'COL31-max_gb_COL3', 'COL42-max_gb_COL3', 'COL38-mean_gb_COL3', 'COL10-mean_gb_COL3', 'COL38/sum_gb_COL3', 'COL47', 'COL43', 'COL16-mean_gb_COL3', 'COL24/sum_gb_COL3', 'COL38', 'COL1-min_gb_