In [1]:
import warnings
warnings.simplefilter('ignore')

import gc

import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
pd.set_option('max_rows', 100)

from tqdm.notebook import tqdm

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score

import jieba

import lightgbm as lgb

In [2]:
train = pd.read_csv('raw_data/train.csv')

print(train.shape)
train.head(10)

(12000, 7)


Unnamed: 0,id,level_1,level_2,level_3,level_4,content,label
0,0,工业/危化品类（现场）—2016版,（二）电气安全,6、移动用电产品、电动工具及照明,1、移动使用的用电产品和I类电动工具的绝缘线，必须采用三芯(单相)或四芯(三相)多股铜芯橡套软线。,"使用移动手动电动工具,外接线绝缘皮破损,应停止使用.",0
1,1,工业/危化品类（现场）—2016版,（一）消防检查,1、防火巡查,3、消防设施、器材和消防安全标志是否在位、完整；,一般,1
2,2,工业/危化品类（现场）—2016版,（一）消防检查,2、防火检查,6、重点工种人员以及其他员工消防知识的掌握情况；,消防知识要加强,0
3,3,工业/危化品类（现场）—2016版,（一）消防检查,1、防火巡查,3、消防设施、器材和消防安全标志是否在位、完整；,消防通道有货物摆放 清理不及时,0
4,4,工业/危化品类（现场）—2016版,（一）消防检查,1、防火巡查,4、常闭式防火门是否处于关闭状态，防火卷帘下是否堆放物品影响使用；,防火门打开状态,0
5,5,工业/危化品类（现场）—2016版,（一）消防检查,2、防火检查,8、易燃易爆危险物品和场所防火防爆措施的落实情况以及其他重要物资的防火安全情况；,防爆柜里面稀释剂，机油费混装,0
6,6,工业/危化品类（现场）—2016版,（一）消防检查,1、防火巡查,2、安全出口、疏散通道是否畅通，安全疏散指示标志、应急照明是否完好；,已经整改,1
7,7,工业/危化品类（现场）—2016版,（一）消防检查,1、防火巡查,2、安全出口、疏散通道是否畅通，安全疏散指示标志、应急照明是否完好；,逃生通道有货物阻挡。,0
8,8,工业/危化品类（现场）—2016版,（一）消防检查,2、防火检查,2、安全疏散通道、疏散指示标志、应急照明和安全出口情况；,已整改,1
9,9,工业/危化品类（现场）—2016版,（四）作业环境,1、作业通道,"1、作业通道应保持畅通，禁止临时堆放货物；通道以黄色或者白色线标明。凡有地坑、壕、池的地方,...",通道黄色线标脱落，已及时重新标好线标。,0


In [3]:
test = pd.read_csv('raw_data/test.csv')

print(test.shape)
test.head(10)

(18000, 6)


Unnamed: 0,id,level_1,level_2,level_3,level_4,content
0,0,交通运输类（现场）—2016版,（一）消防安全,2、防火检查,2、安全疏散通道、疏散指示标志、应急照明和安全出口情况。,RB1洗地机占用堵塞安全通道
1,1,工业/危化品类（选项）—2016版,（二）仓库,1、一般要求,1、库房内储存物品应分类、分堆、限额存放。,未分类堆放
2,2,工业/危化品类（现场）—2016版,（一）消防检查,1、防火巡查,3、消防设施、器材和消防安全标志是否在位、完整；,消防设施、器材和消防安全标志是否在位、完整
3,3,商贸服务教文卫类（现场）—2016版,（二）电气安全,3、电气线路及电源插头插座,3、电源插座、电源插头应按规定正确接线。,插座随意放在电器旁边
4,4,商贸服务教文卫类（现场）—2016版,（一）消防检查,1、防火巡查,6、其他消防安全情况。,检查中发现一瓶灭火器过期
5,5,工业/危化品类（现场）—2016版,（一）消防检查,2、防火检查,4、灭火器材配置及有效情况；,灭火器过期更换
6,6,工业/危化品类（现场）—2016版,（一）消防检查,2、防火检查,11、消防安全标志的设置情况和完好、有效情况；,仓库的墙面上 未贴严禁烟火标志， 已进行整改
7,7,工业/危化品类（现场）—2016版,（一）消防检查,2、防火检查,11、消防安全标志的设置情况和完好、有效情况；,部分消防标志褪色，已更换！
8,8,工业/危化品类（现场）—2016版,（一）消防检查,1、防火巡查,3、消防设施、器材和消防安全标志是否在位、完整；,手推车放在灭火器前，阻挡灭火器
9,9,工业/危化品类（现场）—2016版,（一）消防检查,2、防火检查,12、其他需要检查的内容。 防火检查需填写检查记录。检查人员和被检查部门负责人在检查记录上签名。,消防栓未定时检查


In [4]:
# bert prob

train_bert_pred = pd.read_csv('roberta_pred_oof.csv')
test_bert_pred = pd.read_csv('roberta_pred_test.csv')

train = pd.merge(train, train_bert_pred, on='id')
test = pd.merge(test, test_bert_pred, on='id')

In [6]:
df = pd.concat([train, test])
print(df.shape)

df.head(10)

(30000, 8)


Unnamed: 0,id,level_1,level_2,level_3,level_4,content,label,bert_pred
0,0,工业/危化品类（现场）—2016版,（二）电气安全,6、移动用电产品、电动工具及照明,1、移动使用的用电产品和I类电动工具的绝缘线，必须采用三芯(单相)或四芯(三相)多股铜芯橡套软线。,"使用移动手动电动工具,外接线绝缘皮破损,应停止使用.",0.0,-3.096279
1,1,工业/危化品类（现场）—2016版,（一）消防检查,1、防火巡查,3、消防设施、器材和消防安全标志是否在位、完整；,一般,1.0,2.496831
2,2,工业/危化品类（现场）—2016版,（一）消防检查,2、防火检查,6、重点工种人员以及其他员工消防知识的掌握情况；,消防知识要加强,0.0,1.056331
3,3,工业/危化品类（现场）—2016版,（一）消防检查,1、防火巡查,3、消防设施、器材和消防安全标志是否在位、完整；,消防通道有货物摆放 清理不及时,0.0,-3.817928
4,4,工业/危化品类（现场）—2016版,（一）消防检查,1、防火巡查,4、常闭式防火门是否处于关闭状态，防火卷帘下是否堆放物品影响使用；,防火门打开状态,0.0,-2.853796
5,5,工业/危化品类（现场）—2016版,（一）消防检查,2、防火检查,8、易燃易爆危险物品和场所防火防爆措施的落实情况以及其他重要物资的防火安全情况；,防爆柜里面稀释剂，机油费混装,0.0,-3.080718
6,6,工业/危化品类（现场）—2016版,（一）消防检查,1、防火巡查,2、安全出口、疏散通道是否畅通，安全疏散指示标志、应急照明是否完好；,已经整改,1.0,3.261704
7,7,工业/危化品类（现场）—2016版,（一）消防检查,1、防火巡查,2、安全出口、疏散通道是否畅通，安全疏散指示标志、应急照明是否完好；,逃生通道有货物阻挡。,0.0,-3.527178
8,8,工业/危化品类（现场）—2016版,（一）消防检查,2、防火检查,2、安全疏散通道、疏散指示标志、应急照明和安全出口情况；,已整改,1.0,3.132907
9,9,工业/危化品类（现场）—2016版,（四）作业环境,1、作业通道,"1、作业通道应保持畅通，禁止临时堆放货物；通道以黄色或者白色线标明。凡有地坑、壕、池的地方,...",通道黄色线标脱落，已及时重新标好线标。,0.0,-3.748759


In [7]:
for col in [f for f in df.columns if f != 'id']:
    print(col, df[col].nunique())

level_1 19
level_2 78
level_3 185
level_4 379
content 23572
label 2
bert_pred 25229


In [8]:
for col in tqdm(['level_1', 'level_2', 'level_3', 'level_4']):
    df[f'{col}_strlen'] = df[col].astype(str).apply(len)
    lbl = LabelEncoder()
    lbl.fit(df[col])
    df[col] = lbl.transform(df[col])

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [9]:
df['content_strlen'] = df['content'].astype(str).apply(len)

In [10]:
df.head()

Unnamed: 0,id,level_1,level_2,level_3,level_4,content,label,bert_pred,level_1_strlen,level_2_strlen,level_3_strlen,level_4_strlen,content_strlen
0,0,12,32,170,136,"使用移动手动电动工具,外接线绝缘皮破损,应停止使用.",0.0,-3.096279,17,7,16,49,26
1,1,12,8,56,257,一般,1.0,2.496831,17,7,6,24,2
2,2,12,8,101,353,消防知识要加强,0.0,1.056331,17,7,6,24,7
3,3,12,8,56,257,消防通道有货物摆放 清理不及时,0.0,-3.817928,17,7,6,24,15
4,4,12,8,56,287,防火门打开状态,0.0,-2.853796,17,7,6,33,7


In [11]:
df['content'].fillna('', inplace=True)
df['content_seg'] = df['content'].apply(lambda x: " ".join(jieba.cut(x)))

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.105 seconds.
Prefix dict has been built successfully.


In [12]:
df['content_word_cnt'] = df['content_seg'].apply(lambda x: len(x.split(" ")))

In [13]:
n_components = 16

X = list(df['content_seg'].values)
tfv = TfidfVectorizer(ngram_range=(1,1), 
                      token_pattern=r"(?u)\b[^ ]+\b",
                      max_features=10000)
tfv.fit(X)
X_tfidf = tfv.transform(X)
svd = TruncatedSVD(n_components=n_components)
svd.fit(X_tfidf)
X_svd = svd.transform(X_tfidf)

for i in range(n_components):
    df[f'content_tfidf_{i}'] = X_svd[:, i]

In [14]:
df.head(10)

Unnamed: 0,id,level_1,level_2,level_3,level_4,content,label,bert_pred,level_1_strlen,level_2_strlen,level_3_strlen,level_4_strlen,content_strlen,content_seg,content_word_cnt,content_tfidf_0,content_tfidf_1,content_tfidf_2,content_tfidf_3,content_tfidf_4,content_tfidf_5,content_tfidf_6,content_tfidf_7,content_tfidf_8,content_tfidf_9,content_tfidf_10,content_tfidf_11,content_tfidf_12,content_tfidf_13,content_tfidf_14,content_tfidf_15
0,0,12,32,170,136,"使用移动手动电动工具,外接线绝缘皮破损,应停止使用.",0.0,-3.096279,17,7,16,49,26,"使用 移动 手动 电动工具 , 外 接线 绝缘 皮 破损 , 应 停止使用 .",14,0.006962,0.004836,0.009002,0.005316,0.001656,0.002731,-0.004912,0.011597,0.011524,-0.001966,0.007598,-0.011693,-0.00863,-0.008527,0.010131,0.003516
1,1,12,8,56,257,一般,1.0,2.496831,17,7,6,24,2,一般,1,0.000851,0.00112,-0.000152,0.000703,0.00136,0.001832,-0.000563,-0.002091,0.001959,-0.003571,0.00039,-0.015447,-0.002157,0.012254,-0.006067,0.004736
2,2,12,8,101,353,消防知识要加强,0.0,1.056331,17,7,6,24,7,消防 知识 要 加强,4,0.011752,0.004001,0.015744,0.005031,-0.002309,0.005014,-0.002317,0.014483,0.023243,-0.00166,0.010149,-0.011126,0.000899,-0.008132,0.006392,0.00907
3,3,12,8,56,257,消防通道有货物摆放 清理不及时,0.0,-3.817928,17,7,6,24,15,消防通道 有 货物 摆放 清理 不 及时,9,0.240184,-0.047535,-0.055964,0.169164,0.134602,0.17835,-0.103931,0.044703,0.084637,0.275877,0.350776,-0.365059,-0.030264,0.063569,-0.077093,-0.043314
4,4,12,8,56,287,防火门打开状态,0.0,-2.853796,17,7,6,33,7,防火门 打开 状态,3,0.010451,0.008915,0.010056,0.005501,0.001714,0.012143,0.003139,0.055971,0.036108,-0.037276,0.051451,-0.002573,-0.09711,-0.042557,0.053756,-0.011044
5,5,12,8,101,364,防爆柜里面稀释剂，机油费混装,0.0,-3.080718,17,7,6,40,14,防爆 柜 里面 稀释剂 ， 机油 费 混装,8,0.00281,0.001136,0.005998,-0.001037,0.00148,0.007708,-0.010797,0.006677,-0.000775,-0.005653,-0.00618,-0.006884,0.002432,-0.006244,-0.004764,-0.014633
6,6,12,8,56,178,已经整改,1.0,3.261704,17,7,6,34,4,已经 整改,2,0.186023,0.381582,-0.098095,-0.068713,-0.018996,-0.014279,0.001265,0.009713,-0.016689,-0.012468,0.004466,-0.024171,-0.014421,0.003721,-0.051968,-0.002361
7,7,12,8,56,178,逃生通道有货物阻挡。,0.0,-3.527178,17,7,6,34,10,逃生 通道 有 货物 阻挡 。,6,0.254065,-0.111572,-0.077983,0.104665,0.214997,-0.063296,-0.012981,0.074042,0.053388,0.05514,-0.098229,-0.248198,-0.097865,0.132346,-0.079019,0.035055
8,8,12,8,101,180,已整改,1.0,3.132907,17,7,6,28,3,已 整改,2,0.397351,0.85661,-0.240545,-0.164496,-0.048351,-0.055773,0.010368,-0.020039,-0.062835,-0.025351,0.00241,0.019105,0.007217,0.023712,-0.03149,0.009754
9,9,12,70,7,38,通道黄色线标脱落，已及时重新标好线标。,0.0,-3.748759,17,7,6,55,19,通道 黄色 线标 脱落 ， 已 及时 重新 标好 线标 。,11,0.101173,0.045701,-0.014822,-0.002153,0.009648,-0.048944,-0.029479,0.017528,0.021423,-0.026241,-0.024783,-0.011687,-0.009744,-0.025672,0.02459,-0.01482


In [15]:
df.drop(['content', 'content_seg'], axis=1, inplace=True)
train = df[df['label'].notna()]
test = df[df['label'].isna()]

print(train.shape, test.shape)

(12000, 29) (18000, 29)


In [16]:
ycol = 'label'
feature_names = list(
    filter(lambda x: x not in [ycol, 'id'], train.columns))

model = lgb.LGBMClassifier(objective='binary',
                           boosting_type='gbdt',
                           tree_learner='serial',
                           num_leaves=32,
                           max_depth=4,
                           learning_rate=0.1,
                           n_estimators=1000,
                           subsample=0.8,
                           feature_fraction=0.8,
                           reg_alpha=2,
                           reg_lambda=3,
                           random_state=2021,
                           is_unbalance=True,
                           metric='auc')


oof = []
prediction = test[['id']]
prediction[ycol] = 0
df_importance_list = []

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train[feature_names], train[ycol])):
    X_train = train.iloc[trn_idx][feature_names]
    Y_train = train.iloc[trn_idx][ycol]

    X_val = train.iloc[val_idx][feature_names]
    Y_val = train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=100,
                          eval_metric='auc',
                          early_stopping_rounds=50)

    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)
    df_oof = train.iloc[val_idx][['id', ycol]].copy()
    df_oof['pred'] = pred_val[:,1]
    oof.append(df_oof)

    pred_test = lgb_model.predict_proba(
        test[feature_names], num_iteration=lgb_model.best_iteration_)
    prediction[ycol] += pred_test[:,1] / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()
    
    
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance



Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[12]	train's auc: 0.994687	valid's auc: 0.99433


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[16]	train's auc: 0.994427	valid's auc: 0.996004


Training until validation scores don't improve for 50 rounds
[100]	train's auc: 0.998878	valid's auc: 0.990012
Early stopping, best iteration is:
[79]	train's auc: 0.998387	valid's auc: 0.990388


Training until validation scores don't improve for 50 rounds
[100]	train's auc: 0.998891	valid's auc: 0.987676
Early stopping, best iteration is:
[106]	train's auc: 0.998994	valid's auc: 0.987761


Training until validation scores don't improve for 50 rounds
[100]	train's auc: 0.998777	valid's auc: 0.991013
Early stopping, best iteration is:
[84]	train's auc: 0.998464	valid's auc: 0.992256


Unnamed: 0,column,importance
0,bert_pred,135.2
1,content_tfidf_13,36.2
2,content_tfidf_10,35.8
3,content_tfidf_0,33.6
4,content_strlen,33.0
5,content_tfidf_3,32.8
6,content_tfidf_11,29.4
7,content_tfidf_6,27.4
8,content_tfidf_7,27.0
9,content_tfidf_15,26.2


In [17]:
i_bst = 0
bst = 0
df_oof = pd.concat(oof)
for i in np.arange(0.1, 1.0, 0.05):
    df_oof['pred_label'] = df_oof['pred'].apply(lambda x: 1 if x >= i else 0)
    score = f1_score(df_oof['label'], df_oof['pred_label'])
    print(i, 'f1_score:', score)
    if score> bst:
        i_bst = i
        bst = score

0.1 f1_score: 0.7721557804354493
0.15000000000000002 f1_score: 0.81060116354234
0.20000000000000004 f1_score: 0.8255544521681561
0.25000000000000006 f1_score: 0.8336686787391012
0.30000000000000004 f1_score: 0.8449744463373083
0.3500000000000001 f1_score: 0.8494845360824743
0.40000000000000013 f1_score: 0.8579367836054185
0.45000000000000007 f1_score: 0.8634764250527798
0.5000000000000001 f1_score: 0.873481057898499
0.5500000000000002 f1_score: 0.8797675263349073
0.6000000000000002 f1_score: 0.8918617614269789
0.6500000000000001 f1_score: 0.89913109180204
0.7000000000000002 f1_score: 0.9004594180704442
0.7500000000000002 f1_score: 0.8566735112936346
0.8000000000000002 f1_score: 0.8020969855832242
0.8500000000000002 f1_score: 0.6722276741903829
0.9000000000000002 f1_score: 0.6676602086438153
0.9500000000000003 f1_score: 0.6596173212487412


In [18]:
prediction['label'] = prediction['label'].apply(lambda x: 1 if x >= i_bst else 0)
prediction['label'].value_counts()

0    16074
1     1926
Name: label, dtype: int64

In [19]:
prediction[['id', 'label']].to_csv(f'submission_{bst}.csv', index=False)