In [1]:
import warnings
warnings.simplefilter('ignore')

import re
import gc
import pickle
import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
pd.set_option('max_rows', 100)

from tqdm.notebook import tqdm

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier
from sklearn.decomposition import PCA

import jieba
import jieba.posseg as pseg 
import lightgbm as lgb
# 导入编码转换模块
import codecs
# 从textrank4zh模块中导入提取关键词和生成摘要的类
from textrank4zh import TextRank4Keyword, TextRank4Sentence, util

from hyperopt import fmin, tpe, hp
from hyperopt import Trials

### 1、数据预处理

In [2]:
def getMaxSame(a, b):
    # 获得a,b字符串最大相同子序列
    if len(a)<=len(b):
        s1, s2 = a, b
    else:
        s1, s2 = b, a
    maxlen = 0
    maxstr = ""
    for i in range(len(a)):
        for j in range(len(a), i-1, -1):
            if s1[i:j] in s2 and len(s1[i:j]) > maxlen:
                maxlen = len(s1[i:j])
                maxstr = s1[i:j]
    return maxstr

def get_len(x):
    x_split = re.split("[,，;；.。]", x)
    x_split = [i for i in x_split if len(i)>2]
    return len(x_split)

def getIndex(a, b):
    # 输出a在b中的索引，若a不在b中，则返回-1
    if a in b:
        return b.index(a)
    else:
        return -1

def get_word_weight(text, nums=100, window=2, speech_tags=['a', 'd', 't', 'v', 'l']):
    # 创建分词类的实例
    tr4w = TextRank4Keyword(allow_speech_tags=speech_tags)
    # 对文本进行分析，设定窗口大小为2，并将英文单词小写
    tr4w.analyze(text=text, lower=True, window=window)

    # 从关键词列表中获取前20个关键词
    word_lst = []
    weight_lst = []
    for item in tr4w.get_keywords(num=nums, word_min_len=1):
        # 打印每个关键词的内容及关键词的权重
        word_lst.append(item.word)
        weight_lst.append(item.weight)
    word_weight = pd.DataFrame({'word': word_lst, 'weight': weight_lst})
    return word_weight
    
def get_level_content_features(df):
    """level和content的相关类特征"""
    # level_4和content只保留中文文本
    df["content"] = df["content"].astype(str)
    regex = re.compile(u"[\u4e00-\u9fa5]+")
    df["level4_nosign"] = df["level_4"].apply(lambda x: ''.join(regex.findall(x)))
    df["content_nosign"] = df["content"].apply(lambda x: ''.join(regex.findall(x)))
    
    #------------------------- 最长公共子序列 -------------------------#
    # content是否包含于level_4
    df["content_in_level4"] = df.apply(lambda x: int(x.content_nosign in x.level4_nosign), axis=1)
    # content与level4最长相同子字符串
    df["content_level4_substr"] = df.apply(lambda x: getMaxSame(x.content_nosign, x.level4_nosign), axis=1)
    # content与level4最长相同子字符串长度
    df["content_level4_sublen"] = df["content_level4_substr"].str.len()
    df["level4_strlen"] = df.level_4.apply(len)
    df["content_strlen"] = df.content.apply(len)
#     df['level4_content_sub'] = df['level4_strlen'] - df['content_strlen']
#     df['level4_content_div'] = df['level4_strlen'] / df['content_strlen']
#     df["substr_level4_div"] = df["content_level4_sublen"] / df["level4_strlen"]
#     df["substr_content_div"] = df["content_level4_sublen"] / df["content_strlen"]
    
    #------------------------- 子句 -------------------------#
    df['level4_sent_num'] = df.level_4.apply(lambda x: get_len(x))
    df['content_sent_num'] = df.content.apply(lambda x: get_len(x))
    df['level4_content_sent_sub'] = df['level4_sent_num'] - df['content_sent_num']
    
    #------------------------- 分词 -------------------------#
    # 基于jieba分词，计算三个特征：相同词的个数，不同词的个数，Jaccard相似度
    df['level4_jieba'] = df['level_4'].apply(lambda x: set(jieba.cut(x)))
    df['content_jieba'] = df['content'].apply(lambda x: set(jieba.cut(x)))
    df['same_word'] = df.apply(lambda x: len(x.level4_jieba&x.content_jieba), axis=1)
    df['different_word'] = df.apply(lambda x: len(x.content_jieba-x.level4_jieba), axis=1)
    df['same_word_level4'] = df.apply(lambda x: x.same_word/len(x.level4_jieba), axis=1)
    df['same_word_content'] = df.apply(lambda x: x.same_word/len(x.content_jieba) 
                                       if len(x.content_jieba)>0 else 0, axis=1)
    df['jaccard'] = df.apply(lambda x: x.same_word/len(x.level4_jieba|x.content_jieba), axis=1)
    
    return df

def get_content_features(df):
    """"""
    #------------------------- 关键词特征 -------------------------#
    # 对content分词，统计分词和标签
    word_lst, label_lst = [], []
    punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~“”？，！【】（）、。：；’‘……￥·1234567890 """
    df['jieba'] = df.apply(lambda x: [i for i,j in list(pseg.cut(x.content)) if i not in punctuation and 
                                       i not in x.content_level4_substr and j in ["v", "d", "a"]], axis=1)
    for i, j in df[df.label.notna()][['jieba', 'label']].values:
        word_lst.extend(i)
        label_lst.extend(len(i)*[j])
    df_word = pd.DataFrame({"word": word_lst, "label": label_lst})

    word_label1_30 = list(df_word.loc[df_word.label==1, "word"].value_counts().sort_values(ascending=False)[:30].index)
    word_label0_100 = set(df_word[df_word.label.notna()].word.value_counts().sort_values(ascending=False)[:130].index)-set(word_label1_30)
    word_label1_30 = pickle.load(open("./data/ci_label1_30", "rb"))
    word_label0_100 = pickle.load(open("./data/ci_label0_100", "rb"))
    
    for i, word in enumerate(word_label1_30):
        df['wd_'+str(i)] = df.content.apply(lambda x: getIndex(word, x))  # 前30多的词，出现在当前content的索引
    df["word_label0_100"] = df.jieba.apply(lambda x: len(set(x)&word_label0_100))  # 前100正样本特有的词，出现在当前content几次
    
    #------------------------- textrank特征 -------------------------#
    text0 = ' '.join(df[df.label==0].content.values)
    text1 = ' '.join(df[df.label==1].content.values)
    df_content_0 = get_word_weight(text0, nums=100, window=6)
    df_content_1 = get_word_weight(text1, nums=100, window=6)
    df_content_0.columns = ['word', 'weight_0']
    df_content_1.columns = ['word', 'weight_1']
    df_content_0_wd = pd.merge(df_content_0, df_content_1, on='word', how='left')
    content_0_wd = df_content_0_wd[df_content_0_wd.weight_1.isna()]['word'].values
    df_content_1_wd = pd.merge(df_content_1, df_content_0, on='word', how='left')
    content_1_wd = df_content_1_wd[df_content_1_wd.weight_0.isna()]['word'].values

    def get_main_wd_num(content, words):
        word_sum = 0
        for word in words:
            if word in content:
                word_sum += 1
        return word_sum

    df['content_0_cnt'] = df.content.apply(lambda x: get_main_wd_num(x, content_0_wd))
    df['content_1_cnt'] = df.content.apply(lambda x: get_main_wd_num(x, content_1_wd))
    
    df_wd_1 = pd.DataFrame()
    for i, word in enumerate(content_1_wd):
        df_wd_1['prwd1_' + str(i)] = df.content.apply(lambda x: int(word in x))

    df_wd_0 = pd.DataFrame()
    for i, word in enumerate(content_0_wd):
        df_wd_0['prwd0_' + str(i)] = df.content.apply(lambda x: int(word in x))

    ## 特征筛选
    # 方法一：编码后降维
    pca = PCA(n_components=0.8)
    pca.fit(df_wd_1)
    wd1_pca = pca.transform(df_wd_1)

    for i in range(wd1_pca.shape[1]):
        df[f'content_pr1_{i}'] = wd1_pca[:, i]

    pca = PCA(n_components=0.8)
    pca.fit(df_wd_0)
    wd0_pca = pca.transform(df_wd_0)

    for i in range(wd0_pca.shape[1]):
        df[f'content_pr0_{i}'] = wd0_pca[:, i]

    # 方法二：选择IV大于0.02的分词，降维后进入模型
    # df_wd_1['label'] = df.label
    # df_wd_0['label'] = df.label
    # df_wd1_iv = toad.quality(df_wd_1, target='label')
    # df_wd1_iv_sel = df_wd1_iv[df_wd1_iv.iv > 0.02].index.values
    # df_wd0_iv = toad.quality(df_wd_0, target='label')
    # df_wd0_iv_sel = df_wd0_iv[df_wd0_iv.iv > 0.02].index.values

    # pca = PCA(n_components=0.95)
    # pca.fit(df_wd_1[df_wd1_iv_sel])
    # wd1_pca = pca.transform(df_wd_1[df_wd1_iv_sel])

    # for i in range(wd1_pca.shape[1]):
    #     df[f'content_pr1_{i}'] = wd1_pca[:, i]

    # pca = PCA(n_components=0.8)
    # pca.fit(df_wd_0[df_wd0_iv_sel])
    # wd0_pca = pca.transform(df_wd_0[df_wd0_iv_sel])

    # for i in range(wd0_pca.shape[1]):
    #     df[f'content_pr0_{i}'] = wd0_pca[:, i]
    
    #------------------------- TF-IDF特征 -------------------------#
    df['content_seg'] = df['content'].apply(lambda x: " ".join(jieba.cut(x)))
    df['content_word_cnt'] = df['content_seg'].apply(lambda x: len(x.split(" ")))

    n_components = 16
    X = list(df['content_seg'].values)
    tfv = TfidfVectorizer(ngram_range=(1,1), 
                          token_pattern=r"(?u)\b[^ ]+\b",
                          max_features=10000)
    tfv.fit(X)
    X_tfidf = tfv.transform(X)
    svd = TruncatedSVD(n_components=n_components)
    svd.fit(X_tfidf)
    X_svd = svd.transform(X_tfidf)

    for i in range(n_components):
        df[f'content_tfidf_{i}'] = X_svd[:, i]
        
    return df

def get_level_features(df):
    #------------------------- badrate编码 -------------------------#
    for col in ['level_1', 'level_2', 'level_3']:
        risk_ratio = dict(df[df.label.notna()].groupby(col)['label'].mean())
        df[f'{col}_risk_score'] = df[col].map(risk_ratio)
    
    #------------------------- 类别编码 -------------------------#
    for col in ['level_1', 'level_2', 'level_3', 'level_4']:
        df[f'{col}_strlen'] = df[col].astype(str).apply(len)
        lbl = LabelEncoder()
        lbl.fit(df[col])
        df[col] = lbl.transform(df[col])
    
    return df

def get_bert_features(train, test):
    # bert
    train_bert_pred = pd.read_csv('./data/roberta_pred_oof2.csv')
    test_bert_pred = pd.read_csv('./data/roberta_pred_test2.csv')

    train = pd.merge(train, train_bert_pred, on='id')
    test = pd.merge(test, test_bert_pred, on='id')
    
    # sentence pair bert
    train_sbert_pred = pd.read_csv('./data/roberta2_pred_oof.csv')
    test_sbert_pred = pd.read_csv('./data/roberta2_pred_test.csv')

    train = pd.merge(train, train_sbert_pred, on='id')
    test = pd.merge(test, test_sbert_pred, on='id')
    
    # 直接用预训练模型encode，计算文本相似度
    df_tr_sim = pd.read_csv('./data/train_sim.csv')
    df_ts_sim = pd.read_csv('./data/test_sim.csv')

    train = pd.concat([train, df_tr_sim], axis=1)
    test = pd.concat([test, df_ts_sim], axis=1)
    
    return train, test

def mainProcess(train_path, test_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    train['content'].fillna('', inplace=True)
    test['content'].fillna('', inplace=True)
    train['type'] = 'train'
    test['type'] = 'test'
    df = pd.concat([train, test], ignore_index=True)
    df = get_level_content_features(df)
    df = get_content_features(df)
    df = get_level_features(df)
    drop_cols = ['content', 'content_nosign', 'level4_nosign', 'content_level4_substr', 'level4_jieba','content_jieba', 
                 'jieba', 'content_seg', 'type']
    df.drop(drop_cols, axis=1, inplace=True)
    train = df[df['label'].notna()]
    test = df[df['label'].isna()]
    return train, test

### 2、自动调参

In [11]:
def hp_param(train_df):
    """基于贝叶斯的自动调参"""
    data = train_df
    X = data.drop(['id', 'label'], axis=1)
    y = data['label']

    #split
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=1)

    train_data = lgb.Dataset(data=X_train,label=y_train)
    test_data = lgb.Dataset(data=X_test,label=y_test)
    
    # 搜索空间
    space = {"max_depth": hp.randint("max_depth", 15),
             "num_trees": hp.randint("num_trees", 300),
             "learning_rate": hp.uniform("learning_rate", 1e-3, 5e-1),
             "bagging_fraction": hp.randint("bagging_fraction", 5),
             "num_leaves": hp.randint("num_leaves", 6),   
            }
    
    # 参数变换
    def argsDict_tranform(argsDict, isPrint=False):
        argsDict["max_depth"] = argsDict["max_depth"] + 5
        argsDict['num_trees'] = argsDict['num_trees'] + 150
        argsDict["learning_rate"] = argsDict["learning_rate"] * 0.02 + 0.05
        argsDict["bagging_fraction"] = argsDict["bagging_fraction"] * 0.1 + 0.5
        argsDict["num_leaves"] = argsDict["num_leaves"] * 3 + 10
        if isPrint: 
            print(argsDict)
        else:
            pass 
        return argsDict
    
    def lgb_f1_score(y_hat, data):
        y_true = data.get_label()
        y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
        return 'f1', f1_score(y_true, y_hat), True
    
    # 模型生成器
    def lightgbm_factory(argsDict):
        argsDict = argsDict_tranform(argsDict)
        
        params={'nthread': -1,  # 进程数
                'max_depth': argsDict['max_depth'],  # 最大深度
                'num_trees': argsDict['num_trees'],  # 树的数量
                'eta': argsDict['learning_rate'],    # 学习率
                'bagging_fraction': argsDict['bagging_fraction'],  # 样本采样
                'num_leaves': argsDict['num_leaves'],  # 终点节点最小样本占比的和
                'objective': 'binary',
                'feature_fraction': 0.8,  # 特征采样
                'lambda_11': 2,  # L1正则化
                'lambda_12': 3,  # L2正则化
                'baggingseed': 100,  # 随机种子，默认为100
               }
        params['metric'] = ['auc']


        model_lgb = lgb.train(params, train_data, valid_sets=[test_data], feval=lgb_f1_score, early_stopping_rounds=10)
        return get_tranformer_score(model_lgb)
    
    # 获取损失函数
    def get_tranformer_score(tranformer):
        model = tranformer
        prediction = model.predict(X_test, num_iteration=model.best_iteration)
        return -sklearn.metrics.roc_auc_score(y_test, prediction)
    
    # 开始调参
    best = fmin(lightgbm_factory, space, algo=tpe.suggest, max_evals=6)
    print('best:')
    print(best)
    
    # 得到最佳参数
    print('best param')
    params = argsDict_tranform(best, isPrint=True)
    return params
    
def model_lgb_hp(train_df, params):
    data = train_df
    X = data.drop(['id', 'label'], axis=1)
    y = data['label']

    #split
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=1)

    train_data = lgb.Dataset(data=X_train,label=y_train)
    test_data = lgb.Dataset(data=X_test,label=y_test)

    #train
    gbm_model = LGBMClassifier(boosting_type='gbdt', **params)
    gbm_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric='AUC', early_stopping_rounds=10)
    
    pickle.dump(gbm_model, open("model", "wb"))
    print("lgb已保存成文件model")

In [12]:
params = hp_param(train)

[1]	valid_0's auc: 0.98658	valid_0's f1: 0                                                                             
Training until validation scores don't improve for 10 rounds                                                           
[2]	valid_0's auc: 0.987391	valid_0's f1: 0                                                                            
[3]	valid_0's auc: 0.987538	valid_0's f1: 0                                                                            
[4]	valid_0's auc: 0.987576	valid_0's f1: 0                                                                            
[5]	valid_0's auc: 0.98756	valid_0's f1: 0                                                                             
[6]	valid_0's auc: 0.987569	valid_0's f1: 0                                                                            
[7]	valid_0's auc: 0.987544	valid_0's f1: 0                                                                            
[8]	valid_0's auc: 0.987586	valid_0's f1

[7]	valid_0's auc: 0.986798	valid_0's f1: 0                                                                            
[8]	valid_0's auc: 0.986875	valid_0's f1: 0                                                                            
[9]	valid_0's auc: 0.986856	valid_0's f1: 0                                                                            
[10]	valid_0's auc: 0.986791	valid_0's f1: 0.751807                                                                    
[11]	valid_0's auc: 0.986754	valid_0's f1: 0.860262                                                                    
[12]	valid_0's auc: 0.986855	valid_0's f1: 0.879828                                                                    
[13]	valid_0's auc: 0.9869	valid_0's f1: 0.886994                                                                      
[14]	valid_0's auc: 0.986915	valid_0's f1: 0.894068                                                                    
Early stopping, best iteration is:      

In [13]:
model_lgb_hp(train, params)

[1]	training's auc: 0.983624	training's binary_logloss: 0.297589	valid_1's auc: 0.982898	valid_1's binary_logloss: 0.296837
Training until validation scores don't improve for 10 rounds
[2]	training's auc: 0.983811	training's binary_logloss: 0.268301	valid_1's auc: 0.98312	valid_1's binary_logloss: 0.268148
[3]	training's auc: 0.984603	training's binary_logloss: 0.245756	valid_1's auc: 0.983442	valid_1's binary_logloss: 0.246082
[4]	training's auc: 0.986076	training's binary_logloss: 0.227237	valid_1's auc: 0.985464	valid_1's binary_logloss: 0.228082
[5]	training's auc: 0.986119	training's binary_logloss: 0.211596	valid_1's auc: 0.98552	valid_1's binary_logloss: 0.212737
[6]	training's auc: 0.98616	training's binary_logloss: 0.198005	valid_1's auc: 0.985513	valid_1's binary_logloss: 0.1996
[7]	training's auc: 0.98618	training's binary_logloss: 0.186025	valid_1's auc: 0.985569	valid_1's binary_logloss: 0.187731
[8]	training's auc: 0.986204	training's binary_logloss: 0.175366	valid_1's au

### 3、训练模型

In [16]:
def train_model(train, test):
    ycol = 'label'
    feature_names = list(filter(lambda x: x not in [ycol, 'id'], train.columns))

    # 贝叶斯调参 A榜f1: 0.95887806
    model = lgb.LGBMClassifier(objective='binary',
                               boosting_type='gbdt',
                               learning_rate=0.05,
                               n_estimators=1000,
                               max_depth=15,
                               num_leaves=13,
                               subsample=0.6,
                               feature_fraction=0.8,
                               reg_alpha=2,     
                               reg_lambda=3,    
                               random_state=2021,
                               is_unbalance=True,
                               metric='auc')

    oof = []
    prediction = test[['id']]
    prediction[ycol] = 0
    df_importance_list = []

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)
    for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train[feature_names], train[ycol])):
        X_train = train.iloc[trn_idx][feature_names]
        Y_train = train.iloc[trn_idx][ycol]

        X_val = train.iloc[val_idx][feature_names]
        Y_val = train.iloc[val_idx][ycol]

        print('\nFold_{} Training ================================\n'.format(fold_id+1))

        lgb_model = model.fit(X_train,
                              Y_train,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train, Y_train), (X_val, Y_val)],
                              verbose=100,
                              eval_metric='auc',
                              early_stopping_rounds=50)

        pred_val = lgb_model.predict_proba(X_val, num_iteration=lgb_model.best_iteration_)
        df_oof = train.iloc[val_idx][['id', ycol]].copy()
        df_oof['pred'] = pred_val[:,1]
        oof.append(df_oof)

        pred_test = lgb_model.predict_proba(test[feature_names], num_iteration=lgb_model.best_iteration_)
        prediction[ycol] += pred_test[:,1] / kfold.n_splits

        df_importance = pd.DataFrame({
            'column': feature_names,
            'importance': lgb_model.feature_importances_,
        })
        df_importance_list.append(df_importance)

        del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
        gc.collect()
    
    df_oof = pd.concat(oof)
    df_importance = pd.concat(df_importance_list)
    df_importance = df_importance.groupby(['column'])['importance'].agg(
        'mean').sort_values(ascending=False).reset_index()
    print(df_importance)
    
    return df_oof, prediction, df_importance

### 4、寻找最优切分点

In [17]:
def search_best_split(df_oof, prediction):
    i_bst = 0
    bst = 0
    for i in np.arange(0.1, 1, 0.05):
        df_oof['pred_label'] = df_oof['pred'].apply(lambda x: 1 if x >= i else 0)
        score = f1_score(df_oof['label'], df_oof['pred_label'])
        print(i, 'f1_score:', score)
        if score> bst:
            i_bst = i
            bst = score
    print('best split point: {}, best f1-score: {}'.format(i_bst, bst))
    
    prediction['label'] = prediction['label'].apply(lambda x: 1 if x >= i_bst else 0)
    print(prediction['label'].value_counts())
    return prediction[['id', 'label']]

In [18]:
if __name__ == "__main__":
    train_path = "./data/train.csv"
    test_path = "./data/test.csv"
    
    print("数据预处理...")
    train, test = mainProcess(train_path, test_path)
    train, test = get_bert_features(train, test)
    print("训练模型...")
    df_oof, prediction, df_importance = train_model(train, test)
    result = search_best_split(df_oof, prediction)
    result.to_csv(f'./data/submission.csv', index=False)

数据预处理...
训练模型...


Training until validation scores don't improve for 50 rounds
[100]	train's auc: 0.998856	valid's auc: 0.992418
Early stopping, best iteration is:
[135]	train's auc: 0.999204	valid's auc: 0.992803


Training until validation scores don't improve for 50 rounds
[100]	train's auc: 0.99881	valid's auc: 0.994931
Early stopping, best iteration is:
[97]	train's auc: 0.998795	valid's auc: 0.995108


Training until validation scores don't improve for 50 rounds
[100]	train's auc: 0.998976	valid's auc: 0.985797
Early stopping, best iteration is:
[51]	train's auc: 0.998004	valid's auc: 0.986633


Training until validation scores don't improve for 50 rounds
[100]	train's auc: 0.999213	valid's auc: 0.988753
Early stopping, best iteration is:
[82]	train's auc: 0.999027	valid's auc: 0.989578


Training until validation scores don't improve for 50 rounds
[100]	train's auc: 0.998795	valid's auc: 0.994695
Early stopping, best iteration is:
[76]	train's auc: 0.998359	valid's auc: 0.99504

In [22]:
df_importance.head(20)

Unnamed: 0,column,importance
0,bert_pred_y,127.0
1,bert_pred_x,122.8
2,content_tfidf_11,38.2
3,content_tfidf_0,37.8
4,content_tfidf_10,34.6
5,level_3_risk_score,30.6
6,l1_sim,29.4
7,content_tfidf_3,26.0
8,content_tfidf_7,22.2
9,content_strlen,21.6


In [19]:
result.head()

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,1
3,3,0
4,4,0
