In [1]:
import pandas as pd
import gc
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score, classification_report
from tqdm import tqdm
import lightgbm as lgb
import os
import numpy as np

warnings.simplefilter('ignore')
pd.set_option('max_columns', None)
pd.set_option('max_rows', 500)
pd.options.display.max_colwidth = 10000

In [2]:
seed = 2021

In [3]:
df_train = pd.read_csv('raw_data/trainset/recruit_folder.csv')
df_test = pd.read_csv('raw_data/testset/recruit_folder.csv')
df_test['LABEL'] = np.nan

In [4]:
df_feature = df_train.append(df_test, sort=False)

In [5]:
df_feature['LABEL'].mean()

0.15927573602334874

# 求职者基本信息

In [6]:
df_person = pd.read_csv('raw_data/trainset/person.csv')

In [7]:
df_person = df_person.drop(columns=['LANGUAGE_REMARK', 'SPECILTY'], axis=1)

In [8]:
df_person.rename(columns={'MAJOR': 'PERSON_MAJOR'}, inplace=True)

In [9]:
df_person.head()

Unnamed: 0,PERSON_ID,GENDER,WORK_YEARS,HIGHEST_EDU,PERSON_MAJOR,AGE,LAST_POSITION,LAST_INDUSTRY,CURR_LOC
0,33291,男,15,大专,计算机应用技术,37,网络管理/信息安全管理,,深圳市
1,2985277,男,12,大专,计算机应用技术,35,*公关/营销/业务类,文化体育行业,深圳市
2,2982066,女,10,大专,金融学（含保险学）,32,出纳,医药销售行业,南山区
3,3010866,男,10,中专,物理电子学,34,营销代表/销售顾问,珠宝玉石行业,深圳市
4,316816964,女,15,中专,学前教育学,34,小学教育/幼儿教育/保育,行业组织,福田区


In [10]:
edu_map = {
    '其它': 0,
    '中专': 1,
    '高中（职高、技校）': 2,
    '大专': 3,
    '大学本科': 4,
    '硕士研究生': 5,
    '博士后': 6
}

df_person['HIGHEST_EDU'] = df_person['HIGHEST_EDU'].map(edu_map)

In [11]:
def major_clean(x):
    if type(x) == float:
        return x

    x = x.replace('【', '').replace('】', '')
    return x


df_person['PERSON_MAJOR'] = df_person['PERSON_MAJOR'].apply(major_clean)

In [12]:
df_feature = df_feature.merge(df_person, how='left', on='PERSON_ID')

# 求职意向

In [13]:
df_person_cv = pd.read_csv('raw_data/trainset/person_cv.csv')

In [14]:
df_person_cv = df_person_cv.drop(columns=['REMARK', 'SELF_COMMENT'], axis=1)

In [15]:
df_person_cv.rename(columns={'LOCATION': 'CV_LOCATION'}, inplace=True)

In [16]:
df_person_cv.head()

Unnamed: 0,PERSON_ID,POSITION,CV_LOCATION,INDUSTRY,AVAILABLE_IN_DAYS
0,2985277,导游,深圳市,餐饮旅游娱乐行业,
1,4736088,*机械类,深圳市,,14.0
2,3016588,*财务类/审计类,宝安区,,7.0
3,2981299,*电子/通讯类*,深圳市,能源/光电/电器行业,7.0
4,2983790,结构技术,深圳市,建筑房地产行业,7.0


In [17]:
df_feature = df_feature.merge(df_person_cv, how='left', on='PERSON_ID')

# 工作经历

In [18]:
df_person_job_hist = pd.read_csv('raw_data/trainset/person_job_hist.csv')

In [19]:
df_person_job_hist.head()

Unnamed: 0,PERSON_ID,POSITION,LOCATION,INDUSTRY,ACHIEVEMENT
0,1281276,行政管理,罗湖区,其它,1.协助总经理处理好日常事务及和外部公共关系；2.负责协助起草总经理各类工作往来文件，并负责有关文件的呈报、催办、归档等管理事宜； 3.协助****公司企业文化、企业战略发展的规划； 4.协助****公司来宾的接待工作；****公司各个项目以及相关日常事务的执行情况，定期跟踪、汇报； 6.兼管行政人事、财务等事务。
1,980158,售前/售后服务,,,"本人主要负责万佳,天虹,岁宝,民润等重要客户的品牌分类管理,收集竞争对手信息与反馈,店内执行评估,货架,助销,价格等,建立和维护重点终端客户及kA市场实践经验．"
2,3016108,培训管理,福田区,信息行业（IT/通讯/互联网）,从事学生管理工作.并负责分校区的学生心理辅导和职业指导工作.
3,3016108,培训管理,福田区,信息行业（IT/通讯/互联网）,从事心理学的教学工作，并担任学校的心理辅导老师，负责了学校心理咨询中心的组建和日常咨询工作的开展，接受咨询需求****人次以上，获得了良好的社会效益。
4,2980989,产品开发,宝安区,电子行业,"1.工程师对产品进行设计及开发 2.处理3D图和2D图,同时制作相关的资料(如BOM\技术文件等)3.修改及更新旧产品结构及性能 4.制作产品的加工工艺及流程5.处理产品的结构及工艺问题"


In [20]:
df_tmp = df_person_job_hist.groupby(['PERSON_ID']).size().reset_index()
df_tmp.columns = ['PERSON_ID', 'job_hist_cnt']
df_feature = df_feature.merge(df_tmp, how='left', on='PERSON_ID')

# 招聘岗位信息

In [21]:
df_recruit = pd.read_csv('raw_data/trainset/recruit.csv')

In [22]:
df_recruit = df_recruit.drop(columns=['DETAIL'], axis=1)

In [23]:
df_recruit.rename(columns={
    'LOCATION': 'RECRUIT_LOCATION',
    'MAJOR': 'RECRUIT_MAJOR'
},
                  inplace=True)

In [24]:
df_recruit.head()

Unnamed: 0,RECRUIT_ID,PERSON_TYPE_CODE,PERSON_TYPE,JOB_TITLE,RECRUIT_MAJOR,LOWER_EDU,RECRUIT_LOCATION,WORK_YEARS_RANGE
0,135144,,,业务员,,高中（职高、技校）,深圳市,应届毕业生
1,137045,,,电子技术支持工程师,电子信息工程学,中专,龙岗区,0至1年
2,146798,,,仓管,【工商管理】,中专,龙岗区,0至1年
3,436321,2.0,社会无职,销售代表,,中专,深圳市,应届毕业生
4,440725,99.0,不限,造价员,工民建,中专,深圳市,3至5年


In [25]:
def major_clean(x):
    if type(x) == float:
        return x

    x = x.replace('【', '').replace('】', '')
    return x


df_recruit['RECRUIT_MAJOR'] = df_recruit['RECRUIT_MAJOR'].apply(major_clean)

In [26]:
df_recruit['LOWER_EDU'] = df_recruit['LOWER_EDU'].map(edu_map)

In [27]:
work_year_range_map = {
    '应届毕业生': 0,
    '0至1年': 1,
    '1至2年': 2,
    '3至5年': 3,
    '5年以上': 4,
    '不限': 5
}
df_recruit['WORK_YEARS_RANGE'] = df_recruit['WORK_YEARS_RANGE'].map(
    work_year_range_map)

In [28]:
df_feature = df_feature.merge(df_recruit, how='left', on='RECRUIT_ID')

# embedding 特征

In [29]:
job_title_embeddings = pd.read_pickle('data/embedding/job_title.pkl')
df_feature = df_feature.merge(job_title_embeddings, how='left', on='JOB_TITLE')
del df_feature['JOB_TITLE']

# 交叉特征

In [30]:
# 简单统计
def stat(df, df_merge, group_by, agg):
    group = df.groupby(group_by).agg(agg)

    columns = []
    for on, methods in agg.items():
        for method in methods:
            columns.append('{}_{}_{}'.format('_'.join(group_by), on, method))
    group.columns = columns
    group.reset_index(inplace=True)
    df_merge = df_merge.merge(group, on=group_by, how='left')

    del (group)
    gc.collect()

    return df_merge


def statis_feat(df_know, df_unknow):
    for f in tqdm([['CV_LOCATION'], ['RECRUIT_ID']]):
        df_unknow = stat(df_know, df_unknow, f, {'LABEL': ['mean']})

    return df_unknow


# 5折交叉
df_train = df_feature[~df_feature['LABEL'].isnull()]
df_train = df_train.reset_index(drop=True)
df_test = df_feature[df_feature['LABEL'].isnull()]

df_stas_feat = None
kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
for train_index, val_index in kfold.split(df_train, df_train['LABEL']):
    df_fold_train = df_train.iloc[train_index]
    df_fold_val = df_train.iloc[val_index]

    df_fold_val = statis_feat(df_fold_train, df_fold_val)
    df_stas_feat = pd.concat([df_stas_feat, df_fold_val], axis=0)

    del (df_fold_train)
    del (df_fold_val)
    gc.collect()

df_test = statis_feat(df_train, df_test)
df_feature = pd.concat([df_stas_feat, df_test], axis=0)

del (df_stas_feat)
del (df_train)
del (df_test)
gc.collect()

100%|██████████| 2/2 [00:00<00:00, 24.45it/s]
100%|██████████| 2/2 [00:00<00:00, 26.90it/s]
100%|██████████| 2/2 [00:00<00:00, 26.73it/s]
100%|██████████| 2/2 [00:00<00:00, 26.60it/s]
100%|██████████| 2/2 [00:00<00:00, 26.57it/s]
100%|██████████| 2/2 [00:00<00:00, 13.45it/s]


29

In [31]:
df_score = pd.read_pickle('data/score.pkl')
df_feature = df_feature.merge(df_score, how='left')

In [32]:
# count
for f in [['PERSON_ID'], ['POSITION']]:
    df_feature['{}_cnt'.format(
        '_'.join(f))] = df_feature.groupby(f)['PERSON_ID'].transform('count')

In [33]:
# nunique
for f1, f2 in [['RECRUIT_ID', 'POSITION'], ['RECRUIT_ID', 'PERSON_MAJOR']]:
    df_feature[f'{f1}_{f2}_nunique'] = df_feature.groupby(
        [f1])[f2].transform('nunique')

In [34]:
# 连续变量统计
for f1, f2 in [['RECRUIT_ID', 'WORK_YEARS']]:
    df_temp = df_feature.groupby(f1)[f2].agg({
        f'{f1}_{f2}_mean'.format(f): 'mean',
        f'{f1}_{f2}_max'.format(f): 'max',
        f'{f1}_{f2}_min'.format(f): 'min',
        f'{f1}_{f2}_std'.format(f): 'std',
    }).reset_index()
    df_feature = df_feature.merge(df_temp, how='left')

In [35]:
df_feature['CV_RECRUIT_LOCATION_equal'] = df_feature[[
    'RECRUIT_LOCATION', 'CV_LOCATION'
]].apply(lambda x: x['RECRUIT_LOCATION'] == x['CV_LOCATION'], axis=1)

df_feature['LOWER_EDU_HIGHEST_EDU_higher'] = df_feature[[
    'LOWER_EDU', 'HIGHEST_EDU'
]].apply(lambda x: x['LOWER_EDU'] > x['HIGHEST_EDU'], axis=1)

In [36]:
df_feature.head()

Unnamed: 0,RECRUIT_ID,PERSON_ID,LABEL,GENDER,WORK_YEARS,HIGHEST_EDU,PERSON_MAJOR,AGE,LAST_POSITION,LAST_INDUSTRY,CURR_LOC,POSITION,CV_LOCATION,INDUSTRY,AVAILABLE_IN_DAYS,job_hist_cnt,PERSON_TYPE_CODE,PERSON_TYPE,RECRUIT_MAJOR,LOWER_EDU,RECRUIT_LOCATION,WORK_YEARS_RANGE,JOB_TITLE_ernie_emb_0,JOB_TITLE_ernie_emb_1,JOB_TITLE_ernie_emb_2,JOB_TITLE_ernie_emb_3,JOB_TITLE_ernie_emb_4,JOB_TITLE_ernie_emb_5,JOB_TITLE_ernie_emb_6,JOB_TITLE_ernie_emb_7,JOB_TITLE_ernie_emb_8,JOB_TITLE_ernie_emb_9,JOB_TITLE_ernie_emb_10,JOB_TITLE_ernie_emb_11,JOB_TITLE_ernie_emb_12,JOB_TITLE_ernie_emb_13,JOB_TITLE_ernie_emb_14,JOB_TITLE_ernie_emb_15,JOB_TITLE_ernie_emb_16,JOB_TITLE_ernie_emb_17,JOB_TITLE_ernie_emb_18,JOB_TITLE_ernie_emb_19,JOB_TITLE_ernie_emb_20,JOB_TITLE_ernie_emb_21,JOB_TITLE_ernie_emb_22,JOB_TITLE_ernie_emb_23,JOB_TITLE_ernie_emb_24,JOB_TITLE_ernie_emb_25,JOB_TITLE_ernie_emb_26,JOB_TITLE_ernie_emb_27,JOB_TITLE_ernie_emb_28,JOB_TITLE_ernie_emb_29,CV_LOCATION_LABEL_mean,RECRUIT_ID_LABEL_mean,recruit_person_MAJOR_score,PERSON_ID_cnt,POSITION_cnt,RECRUIT_ID_POSITION_nunique,RECRUIT_ID_PERSON_MAJOR_nunique,RECRUIT_ID_WORK_YEARS_mean,RECRUIT_ID_WORK_YEARS_max,RECRUIT_ID_WORK_YEARS_min,RECRUIT_ID_WORK_YEARS_std,CV_RECRUIT_LOCATION_equal,LOWER_EDU_HIGHEST_EDU_higher
0,772899,5413605,0.0,女,0,3.0,文秘,29,人力资源管理,通讯行业,广东省,行政管理,深圳市,,30.0,2.0,1.0,社会在职,旅游管理,1.0,福田区,2,-0.051337,-0.243886,0.028309,-0.200545,-0.109874,-0.082832,0.341493,-0.113816,-0.216219,-0.110851,-0.13338,-0.183411,-0.079125,-0.434388,0.019965,-0.085903,-0.015218,-0.248044,0.247061,-0.285383,0.084865,0.000443,0.066399,-0.007481,0.205345,0.141893,0.189973,-0.235695,-0.182161,0.17924,0.147827,0.02439,0.056579,2,2240.0,121,124,6.5321,28,0,5.060721,False,False
1,813938,1391289,1.0,男,12,3.0,计算机科学与技术,34,网络管理/信息安全管理,,深圳市,项目实施/顾问,深圳市,信息行业（IT/通讯/互联网）,14.0,4.0,,,软件工程,3.0,深圳市,0,0.234713,-0.123183,-0.120129,0.181092,0.104147,0.089959,0.020083,-0.006046,0.109227,-0.333411,-0.011447,0.298391,0.104235,-0.130547,0.074952,-0.129922,-0.189609,0.311192,-0.101285,-0.053433,-0.051312,0.271554,-0.015772,0.075361,-0.368809,-0.015872,-0.207454,-0.277825,0.273316,0.202631,0.147827,0.695652,0.182027,1,322.0,23,11,7.45977,24,0,5.909796,True,False
2,795526,6196384,0.0,女,6,3.0,文秘,29,客户服务,,罗湖区,国际贸易/涉外业务,深圳市,商业零售行业,14.0,1.0,,,,1.0,福田区,1,-0.315937,-0.185878,-0.069693,-0.238269,-0.061867,-0.047715,0.443191,0.27141,-0.099232,0.248548,-0.201732,-0.266306,0.231007,0.085372,0.187326,0.144522,-0.300614,0.01654,0.118367,-0.107304,0.102943,0.069849,0.012187,-0.070502,0.050517,-0.063172,-0.141544,0.068182,-0.214175,0.118705,0.147827,0.005435,-1.0,5,6018.0,88,97,6.677188,22,0,4.694759,False,False
3,781773,1340058,0.0,男,12,3.0,机械制造及其自动化,35,电子/数码产品开发,,宝安区,结构技术,深圳市,,30.0,3.0,1.0,社会在职,,3.0,龙岗区,3,0.033851,-0.038578,-0.117304,0.086912,-0.594854,-0.009151,-0.142161,-0.198133,0.272518,0.107589,-0.217581,-0.304624,0.245591,-0.07293,0.260233,-0.204289,0.01046,-0.092746,-0.029742,0.149046,-0.185682,0.045448,0.212247,-0.001426,0.112673,-0.117515,-0.078132,0.077578,-0.093877,0.027175,0.147827,0.0,-1.0,1,207.0,51,38,9.671512,35,0,5.932045,False,False
4,820496,5869866,1.0,女,9,4.0,电子商务,31,行政管理,互联网行业,宝安区,行政管理,宝安区,,7.0,4.0,1.0,社会在职,,2.0,福田区,1,-0.076386,-0.224421,0.067912,-0.017263,0.226272,-0.289024,-0.402977,-0.14686,0.118392,-0.006297,0.259484,0.083771,0.030424,-0.125321,-0.283175,-0.134899,0.037033,-0.040271,-0.069357,0.123851,0.091143,-0.050242,0.068554,-0.433342,0.399648,-0.135726,0.071339,-0.050783,-0.054708,-0.038917,0.296959,0.6,-1.0,2,2240.0,20,14,9.641026,20,0,4.498463,False,False


# 建模

In [37]:
for f in df_feature.select_dtypes('object'):
    le = LabelEncoder()
    print(f)
    df_feature[f] = le.fit_transform(df_feature[f].astype('str'))

GENDER
PERSON_MAJOR
LAST_POSITION
LAST_INDUSTRY
CURR_LOC
POSITION
CV_LOCATION
INDUSTRY
PERSON_TYPE
RECRUIT_MAJOR
RECRUIT_LOCATION


In [38]:
df_train = df_feature[df_feature['LABEL'].notnull()]
df_test = df_feature[df_feature['LABEL'].isnull()]

In [39]:
df_train.shape, df_test.shape

((35291, 65), (70774, 65))

In [40]:
ycol = 'LABEL'
feature_names = list(filter(lambda x: x not in [ycol], df_train.columns))

oof = []
prediction = df_test[['RECRUIT_ID', 'PERSON_ID']]
prediction['pred'] = 0
df_importance_list = []

model = lgb.LGBMClassifier(num_leaves=64,
                           max_depth=10,
                           learning_rate=0.1,
                           n_estimators=1000000,
                           subsample=0.8,
                           feature_fraction=0.8,
                           reg_alpha=0.5,
                           reg_lambda=0.5,
                           random_state=seed,
                           metric='auc')

kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
for fold_id, (trn_idx, val_idx) in enumerate(
        kfold.split(df_train[feature_names], df_train[ycol])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(
        fold_id + 1))

    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['valid'],
                          eval_set=[(X_val, Y_val)],
                          verbose=100,
                          eval_metric='auc',
                          early_stopping_rounds=100)

    pred_val = lgb_model.predict_proba(X_val)
    df_oof = df_train.iloc[val_idx][['RECRUIT_ID', 'PERSON_ID', ycol]].copy()
    df_oof['pred'] = pred_val[:, 1]
    oof.append(df_oof)

    pred_test = lgb_model.predict_proba(df_test[feature_names])
    prediction['pred'] += pred_test[:, 1] / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': model.feature_importances_,
    })
    df_importance_list.append(df_importance)



Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.984237
Early stopping, best iteration is:
[93]	valid's auc: 0.984317


Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.980044
[200]	valid's auc: 0.979965
Early stopping, best iteration is:
[135]	valid's auc: 0.980294


Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.979141
[200]	valid's auc: 0.979322
Early stopping, best iteration is:
[140]	valid's auc: 0.97951


Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.982674
[200]	valid's auc: 0.982694
Early stopping, best iteration is:
[116]	valid's auc: 0.982874


Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.982222
[200]	valid's auc: 0.981732
Early stopping, best iteration is:
[142]	valid's auc: 0.982444


In [41]:
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby([
    'column'
])['importance'].agg('mean').sort_values(ascending=False).reset_index()
df_importance

Unnamed: 0,column,importance
0,PERSON_ID,627.8
1,RECRUIT_ID_LABEL_mean,478.0
2,PERSON_MAJOR,311.6
3,POSITION,294.2
4,POSITION_cnt,293.2
5,RECRUIT_ID,280.2
6,PERSON_ID_cnt,268.2
7,WORK_YEARS,253.0
8,AGE,235.0
9,LAST_POSITION,216.2


In [42]:
df_oof = pd.concat(oof)
df_oof.sort_values(['pred'], inplace=True, ascending=False)
df_oof.reset_index(drop=True, inplace=True)
df_oof['pred_label'] = 0
df_oof.loc[:int(0.16 * len(df_oof)), 'pred_label'] = 1
f1 = f1_score(df_oof[ycol], df_oof['pred_label'])
auc = roc_auc_score(df_oof[ycol], df_oof['pred'])
f1, auc

(0.8652822151224707, 0.981616311118922)

In [43]:
print(classification_report(df_oof[ycol], df_oof['pred_label']))

             precision    recall  f1-score   support

        0.0       0.97      0.97      0.97     29670
        1.0       0.86      0.87      0.87      5621

avg / total       0.96      0.96      0.96     35291



In [44]:
prediction.sort_values(['pred'], inplace=True, ascending=False)
prediction.reset_index(drop=True, inplace=True)
prediction['LABEL'] = 0
prediction.loc[:int(0.16 * len(prediction)), 'LABEL'] = 1
prediction['LABEL'].value_counts()

0    59450
1    11324
Name: LABEL, dtype: int64

In [45]:
df_oof.head(20)

Unnamed: 0,RECRUIT_ID,PERSON_ID,LABEL,pred,pred_label
0,42559662,320003954,1.0,0.999951,1
1,42559662,320338907,1.0,0.999949,1
2,42939022,317516901,1.0,0.999948,1
3,43236688,317282943,1.0,0.999946,1
4,43922680,319489936,1.0,0.999943,1
5,43668705,320540944,1.0,0.999943,1
6,43534698,317244952,1.0,0.999942,1
7,44254655,320042903,1.0,0.999942,1
8,43897745,319677903,1.0,0.999941,1
9,42989690,317729913,1.0,0.999941,1


In [46]:
prediction.head()

Unnamed: 0,RECRUIT_ID,PERSON_ID,pred,LABEL
0,42559662,320223900,0.999882,1
1,43267862,318537916,0.99988,1
2,43267862,319313935,0.99988,1
3,43809664,320148902,0.99988,1
4,44254655,320721906,0.99988,1


In [47]:
os.makedirs('sub', exist_ok=True)
prediction[['RECRUIT_ID', 'PERSON_ID', 'LABEL']].to_csv(f'sub/{f1}.csv',
                                                        index=False)
prediction[['RECRUIT_ID', 'PERSON_ID', 'LABEL']].to_csv('sub/submission.csv',
                                                        index=False)