In [1]:
from transformers import AutoTokenizer
from transformers import AutoModel
import torch
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
import pickle
from pandarallel import pandarallel
import warnings

# Show every column and every row when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Initialize pandarallel.
pandarallel.initialize()
# NOTE(review): this filter hides every warning for the rest of the session.
warnings.filterwarnings('ignore')

INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Make sure the cache and output directories exist before anything is written.
for output_dir in ('data/tmp', 'data/embedding'):
    os.makedirs(output_dir, exist_ok=True)

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
class Encoder():
    """Key -> embedding lookup table for one text field.

    Args:
        sentences: Sequence of texts; ``sentences[i]`` is paired with
            ``embeddings[i]``.
        embeddings: One vector per sentence (a 2-D array or a sequence of
            1-D vectors).
        key_name: Name of the key column in the frame built by
            ``get_embeddings``.
        prefix: Prefix of the embedding column names (``{prefix}_emb_{i}``).
        keys_sentences_map: Optional mapping from lookup key to sentence.
            When omitted, every sentence is its own key.

    Raises:
        KeyError: If ``keys_sentences_map`` refers to a sentence that is not
            in ``sentences``.
    """

    def __init__(self,
                 sentences,
                 embeddings,
                 key_name,
                 prefix,
                 keys_sentences_map=None):
        self.sentences = sentences
        self.embeddings = embeddings
        self.key_name = key_name
        self.prefix = prefix
        # Filled in by transform_and_normalize(); None means "not computed
        # yet", in which case normalized lookups fall back to raw vectors.
        self.keys_normalize_embeddings_map = None

        if keys_sentences_map is not None:
            self.keys_sentences_map = keys_sentences_map
        else:
            self.keys_sentences_map = dict(zip(sentences, sentences))

        self.keys_embeddings_map = self._map_keys_to_vectors(embeddings)

    def _map_keys_to_vectors(self, vectors):
        """Return ``{key: vector}`` given one vector per entry of ``self.sentences``."""
        sentence_to_vector = dict(zip(self.sentences, vectors))
        return {
            key: sentence_to_vector[sentence]
            for key, sentence in self.keys_sentences_map.items()
        }

    def _select_map(self, normalize):
        """Pick the normalized map when requested and available, else the raw one."""
        # getattr keeps instances unpickled from an older class definition
        # (which never set this attribute) working.
        normalized = getattr(self, 'keys_normalize_embeddings_map', None)
        if normalize and normalized is not None:
            return normalized
        return self.keys_embeddings_map

    def get_embeddings(self, normalize=False):
        """Return all embeddings as a DataFrame.

        Args:
            normalize: If True, use the vectors produced by
                ``transform_and_normalize``; when that has not been called
                yet the raw vectors are used instead.

        Returns:
            A DataFrame with one row per key: the ``key_name`` column
            followed by ``{prefix}_emb_0 .. {prefix}_emb_{n-1}``. An encoder
            without keys yields an empty frame holding only the key column.
        """
        keys_embeddings_map = self._select_map(normalize)

        # Width is taken from the first vector; 0 when there are no keys.
        emb_size = len(next(iter(keys_embeddings_map.values()), ()))
        columns = [self.key_name] + [
            '{}_emb_{}'.format(self.prefix, i) for i in range(emb_size)
        ]
        data_list = [[key] + list(embedding)
                     for key, embedding in keys_embeddings_map.items()]

        return pd.DataFrame(data_list, columns=columns)

    def get_embedding(self, key, normalize=False):
        """Return the vector stored for ``key``, or None if there is none.

        Args:
            key: Lookup key. Unknown keys (including NaN) and unhashable
                keys give None.
            normalize: Same meaning as in ``get_embeddings``.
        """
        try:
            return self._select_map(normalize)[key]
        except (KeyError, TypeError):
            # KeyError: unknown key; TypeError: unhashable key.
            return None

    def transform_and_normalize(self, kernel, bias, n_components=None):
        """Apply the transform, then L2-normalize every vector.

        Computes ``(embeddings + bias).dot(kernel)`` when both ``kernel`` and
        ``bias`` are given; otherwise the raw embeddings are normalized as
        they are. The result is stored for later ``normalize=True`` lookups.

        Args:
            kernel: Transformation matrix, or None to skip the transform.
            bias: Offset added before the matrix product, or None to skip
                the transform.
            n_components: If given, only the first ``n_components`` columns
                of ``kernel`` are used.
        """
        vecs = np.asarray(self.embeddings)

        if kernel is not None and bias is not None:
            # Slice only after the None check: slicing None would raise.
            if n_components is not None:
                kernel = kernel[:, :n_components]
            vecs = (vecs + bias).dot(kernel)

        vecs = vecs / (vecs**2).sum(axis=1, keepdims=True)**0.5

        self.keys_normalize_embeddings_map = self._map_keys_to_vectors(vecs)

In [5]:
def build_model(path):
    """Load the pretrained tokenizer and model stored at ``path``.

    The model is moved to ``DEVICE`` before it is returned.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    pretrained_tokenizer = AutoTokenizer.from_pretrained(path)
    pretrained_model = AutoModel.from_pretrained(path).to(DEVICE)
    return pretrained_tokenizer, pretrained_model

In [6]:
def sent_to_vec(sent, tokenizer, model, pooling, max_length):
    """Encode ``sent`` into a single vector.

    The text is tokenized (truncated to ``max_length``), run through
    ``model`` without gradient tracking, and the hidden states are pooled.

    Args:
        sent: Text passed to ``tokenizer``.
        tokenizer: Callable returning a mapping of PyTorch tensors.
        model: Model whose output exposes ``hidden_states``.
        pooling: One of
            ``'first_last_avg'`` - mean over dim 1 of
            ``hidden_states[-1] + hidden_states[1]``;
            ``'last_avg'`` - mean over dim 1 of ``hidden_states[-1]``;
            ``'last2avg'`` - mean over dim 1 of
            ``hidden_states[-1] + hidden_states[-2]``;
            ``'cls'`` - position 0 of ``hidden_states[-1]``.
        max_length: Truncation length handed to the tokenizer.

    Returns:
        The first row of the pooled output as a NumPy array.

    Raises:
        ValueError: If ``pooling`` is not one of the supported names.
    """
    # Validate up front so a typo fails before the (expensive) forward pass.
    if pooling not in ('first_last_avg', 'last_avg', 'last2avg', 'cls'):
        raise ValueError("unknown pooling {}".format(pooling))

    with torch.no_grad():
        inputs = tokenizer(sent,
                           return_tensors="pt",
                           padding=True,
                           truncation=True,
                           max_length=max_length)
        # Move whatever tensors the tokenizer produced; not every tokenizer
        # emits token_type_ids, so the keys must not be hard-coded.
        inputs = {name: tensor.to(DEVICE) for name, tensor in inputs.items()}

        hidden_states = model(**inputs,
                              return_dict=True,
                              output_hidden_states=True).hidden_states

        if pooling == 'first_last_avg':
            output_hidden_state = (hidden_states[-1] +
                                   hidden_states[1]).mean(dim=1)
        elif pooling == 'last_avg':
            output_hidden_state = (hidden_states[-1]).mean(dim=1)
        elif pooling == 'last2avg':
            output_hidden_state = (hidden_states[-1] +
                                   hidden_states[-2]).mean(dim=1)
        else:  # 'cls'
            output_hidden_state = (hidden_states[-1])[:, 0, :]

        vec = output_hidden_state.cpu().numpy()[0]
    return vec

In [7]:
def sents_to_vecs(sents, tokenizer, model, pooling, max_length, verbose=True):
    """Encode every sentence in ``sents`` and stack the results.

    Args:
        sents: Sized iterable of texts.
        tokenizer, model, pooling, max_length: Forwarded to ``sent_to_vec``.
        verbose: Wrap the iteration in a tqdm progress bar when True.

    Returns:
        ``np.array`` built from the per-sentence vectors, in input order.
    """
    iterator = tqdm(sents) if verbose else sents
    encoded = [
        sent_to_vec(text, tokenizer, model, pooling, max_length)
        for text in iterator
    ]
    assert len(iterator) == len(encoded)
    return np.array(encoded)

In [8]:
def compute_kernel_bias(vecs):
    """Compute the whitening kernel and bias.

    Final transform: ``y = (x + bias).dot(kernel)``.

    Args:
        vecs: Sequence of 2-D arrays with the same number of columns; they
            are concatenated along axis 0 before the statistics are taken.

    Returns:
        ``(kernel, bias)`` where ``bias`` is the negated column mean (shape
        ``(1, dim)``) and ``kernel`` is ``u / sqrt(s)`` from the SVD of the
        covariance matrix, columns ordered by decreasing singular value.
        If the covariance is rank-deficient, the columns belonging to zero
        singular values are non-finite; the leading columns stay valid, so
        truncating to the leading components still works.
    """
    vecs = np.concatenate(vecs, axis=0)
    mu = vecs.mean(axis=0, keepdims=True)
    cov = np.atleast_2d(np.cov(vecs.T))
    u, s, _ = np.linalg.svd(cov)
    # Closed form of inv((u @ diag(sqrt(s))).T) for orthogonal u. Scaling the
    # columns directly avoids an explicit inversion, which fails (or wrecks
    # every column) as soon as one singular value is zero.
    with np.errstate(divide='ignore', invalid='ignore'):
        W = u / np.sqrt(s)
    return W, -mu

In [9]:
# Load the pretrained tokenizer and model from the local checkpoint directory
path = 'data/pretrain_models/ernie'
tokenizer, model = build_model(path)

# 生成embedding

In [10]:
# Embedding matrices of every encoded field are collected here; the list is
# later passed to compute_kernel_bias.
# NOTE(review): the cells below append to this list, so re-running one of
# them without re-running this cell adds its matrix a second time.
vecs_list = []
# Pooling strategy and truncation length handed to sents_to_vecs.
pooling = 'cls'
max_length = 512

In [11]:
# Job titles of the recruit postings
def get_job_title_encoder():
    """Return ``(vecs, encoder)`` for the distinct JOB_TITLE values.

    Both objects are read from the cache files under ``data/tmp`` when that
    succeeds; on any failure they are recomputed from the recruit CSV and
    the cache files are (re)written.
    """
    vecs_cache = 'data/tmp/job_title_vecs.npy'
    encoder_cache = 'data/tmp/job_title_encoder.txt'

    try:
        vecs = np.load(vecs_cache)
        with open(encoder_cache, 'rb') as cache_file:
            job_title_encoder = pickle.load(cache_file)

    except Exception:
        recruit = pd.read_csv('raw_data/trainset/recruit.csv')
        # De-duplicate so every distinct title is encoded only once.
        sentences = list(set(recruit['JOB_TITLE'].values.tolist()))
        vecs = sents_to_vecs(sentences, tokenizer, model, pooling, max_length)
        job_title_encoder = Encoder(sentences, vecs, 'JOB_TITLE',
                                    'JOB_TITLE_ernie')

        np.save(vecs_cache, vecs)
        with open(encoder_cache, 'wb') as cache_file:
            pickle.dump(job_title_encoder, cache_file)

    return vecs, job_title_encoder


vecs, job_title_encoder = get_job_title_encoder()
vecs_list.append(vecs)

In [12]:
def major_clean(x):
    """Strip the full-width brackets ``【`` and ``】`` from a major name.

    Args:
        x: The major text, or a non-string placeholder such as NaN/None for
            a missing value.

    Returns:
        The text without the brackets. Anything that is not a ``str`` is
        returned unchanged.
    """
    # isinstance covers NaN (float), None and NumPy scalars alike; an exact
    # type(x) == float check lets the latter two fall through to .replace.
    if not isinstance(x, str):
        return x

    return x.replace('【', '').replace('】', '')

In [13]:
# Major required of applicants in the recruit postings
def get_recruit_major_encoder():
    """Return ``(vecs, encoder)`` for the distinct cleaned recruit MAJOR values.

    Both objects are read from the cache files under ``data/tmp`` when that
    succeeds; on any failure they are recomputed from the recruit CSV and
    the cache files are (re)written. Missing majors are encoded as the
    empty string and brackets are removed with ``major_clean``.
    """
    vecs_cache = 'data/tmp/recruit_major_vecs.npy'
    encoder_cache = 'data/tmp/recruit_major_encoder.txt'

    try:
        vecs = np.load(vecs_cache)
        # The pickle is a cache written by this function; do not point this
        # at files from an untrusted source.
        with open(encoder_cache, 'rb') as f:
            recruit_major_encoder = pickle.load(f)

    except Exception:
        df_recruit = pd.read_csv('raw_data/trainset/recruit.csv')
        # Build a new Series instead of calling fillna(inplace=True) on a
        # column selection: the chained in-place form does not update the
        # frame once pandas Copy-on-Write is active.
        majors = df_recruit['MAJOR'].fillna('').apply(major_clean)
        sentences = list(set(majors.values.tolist()))
        vecs = sents_to_vecs(sentences, tokenizer, model, pooling, max_length)
        recruit_major_encoder = Encoder(sentences, vecs, 'MAJOR',
                                        'recruit_MAJOR_ernie')

        np.save(vecs_cache, vecs)
        with open(encoder_cache, 'wb') as f:
            pickle.dump(recruit_major_encoder, f)

    return vecs, recruit_major_encoder


vecs, recruit_major_encoder = get_recruit_major_encoder()
vecs_list.append(vecs)

In [14]:
# Major of the applicants (person profile)
def get_person_major_encoder():
    """Return ``(vecs, encoder)`` for the distinct cleaned person MAJOR values.

    Both objects are read from the cache files under ``data/tmp`` when that
    succeeds; on any failure they are recomputed from the person CSV and
    the cache files are (re)written. Missing majors are encoded as the
    empty string and brackets are removed with ``major_clean``.
    """
    vecs_cache = 'data/tmp/person_major_vecs.npy'
    encoder_cache = 'data/tmp/person_major_encoder.txt'

    try:
        vecs = np.load(vecs_cache)
        # The pickle is a cache written by this function; do not point this
        # at files from an untrusted source.
        with open(encoder_cache, 'rb') as f:
            person_major_encoder = pickle.load(f)

    except Exception:
        df_person = pd.read_csv('raw_data/trainset/person.csv')
        # Build a new Series instead of calling fillna(inplace=True) on a
        # column selection: the chained in-place form does not update the
        # frame once pandas Copy-on-Write is active.
        majors = df_person['MAJOR'].fillna('').apply(major_clean)
        sentences = list(set(majors.values.tolist()))
        vecs = sents_to_vecs(sentences, tokenizer, model, pooling, max_length)
        person_major_encoder = Encoder(sentences, vecs, 'MAJOR',
                                       'person_MAJOR_ernie')

        np.save(vecs_cache, vecs)
        with open(encoder_cache, 'wb') as f:
            pickle.dump(person_major_encoder, f)

    return vecs, person_major_encoder


vecs, person_major_encoder = get_person_major_encoder()
vecs_list.append(vecs)

# BERT-whitening
https://kexue.fm/archives/8321

In [15]:
# Fit a single kernel/bias pair on all embedding matrices collected in vecs_list.
kernel, bias = compute_kernel_bias(vecs_list)

# 保存embedding

In [16]:
# Apply the transform keeping the first 30 kernel columns, then export the
# normalized job-title vectors as a DataFrame pickle.
job_title_encoder.transform_and_normalize(kernel, bias, 30)
job_title_embeddings = job_title_encoder.get_embeddings(True)
job_title_embeddings.to_pickle('data/embedding/job_title.pkl')

In [17]:
# Apply the transform keeping the first 30 kernel columns, then export the
# normalized recruit-major vectors as a DataFrame pickle.
recruit_major_encoder.transform_and_normalize(kernel, bias, 30)
recruit_major_embeddings = recruit_major_encoder.get_embeddings(True)
recruit_major_embeddings.to_pickle('data/embedding/recruit_major.pkl')

In [18]:
# Apply the transform keeping the first 30 kernel columns, then export the
# normalized person-major vectors as a DataFrame pickle.
person_major_encoder.transform_and_normalize(kernel, bias, 30)
person_major_embeddings = person_major_encoder.get_embeddings(True)
person_major_embeddings.to_pickle('data/embedding/person_major.pkl')

In [19]:
# Preview the first rows of the exported person-major embedding table.
person_major_embeddings.head()

Unnamed: 0,MAJOR,person_MAJOR_ernie_emb_0,person_MAJOR_ernie_emb_1,person_MAJOR_ernie_emb_2,person_MAJOR_ernie_emb_3,person_MAJOR_ernie_emb_4,person_MAJOR_ernie_emb_5,person_MAJOR_ernie_emb_6,person_MAJOR_ernie_emb_7,person_MAJOR_ernie_emb_8,person_MAJOR_ernie_emb_9,person_MAJOR_ernie_emb_10,person_MAJOR_ernie_emb_11,person_MAJOR_ernie_emb_12,person_MAJOR_ernie_emb_13,person_MAJOR_ernie_emb_14,person_MAJOR_ernie_emb_15,person_MAJOR_ernie_emb_16,person_MAJOR_ernie_emb_17,person_MAJOR_ernie_emb_18,person_MAJOR_ernie_emb_19,person_MAJOR_ernie_emb_20,person_MAJOR_ernie_emb_21,person_MAJOR_ernie_emb_22,person_MAJOR_ernie_emb_23,person_MAJOR_ernie_emb_24,person_MAJOR_ernie_emb_25,person_MAJOR_ernie_emb_26,person_MAJOR_ernie_emb_27,person_MAJOR_ernie_emb_28,person_MAJOR_ernie_emb_29
0,,0.033922,0.016015,-0.079779,0.032491,0.194276,0.181433,-0.021364,0.01938,-0.204926,-0.107664,-0.020242,0.035183,0.269443,-0.102305,-0.01367,-0.260084,0.467201,0.224754,0.101588,-0.26879,-0.169664,-0.233692,0.126631,-0.14362,-0.339964,-0.121349,0.125929,-0.2896,-0.086212,0.068332
1,无机化学,0.161805,0.143702,-0.093986,-0.055504,-0.106657,-0.039275,0.158578,0.218503,-0.066466,-0.029445,0.023388,0.11009,-0.407006,0.180535,-0.170367,-0.233926,0.015842,-0.230791,-0.089471,0.214549,-0.161172,-0.064123,0.236262,-0.294073,0.05169,0.166714,-0.372305,0.003644,0.314602,0.086595
2,核科学与技术,0.163946,0.148324,-0.277979,-0.226509,0.118653,0.126283,-0.246546,0.040637,-0.125602,0.254303,-0.018426,-0.144588,-0.292179,-0.126347,0.343572,-0.240943,0.122548,-0.225315,0.040632,-0.180738,-0.045741,-0.189579,-0.364384,0.041531,-0.088535,0.146077,0.130512,-0.178523,-0.010251,0.011866
3,教育技术学,0.16801,0.316615,-0.187714,-0.108187,0.020457,-0.318753,-0.148261,-0.392003,-0.113287,-0.27727,0.05758,0.172383,-0.116611,-0.27895,-0.207907,0.029294,-0.228712,0.10474,0.031003,-0.156405,-0.080898,0.193279,0.060483,0.290677,0.006571,-0.157274,-0.077795,-0.005406,0.055618,-0.184305
4,公共管理,0.137709,0.136864,0.216789,-0.002276,0.054183,-0.362888,0.016013,-0.018113,0.042418,0.247334,-0.146403,0.135248,0.101729,-0.044444,0.097301,0.196056,-0.305892,-0.231472,0.064632,0.152949,-0.280432,0.244647,-0.125882,0.061134,0.076664,-0.002361,0.007039,0.012406,-0.219187,0.479831


# 计算匹配度

In [20]:
# Stack the train and test (recruit, person) pairs; test rows get a NaN LABEL.
df_train = pd.read_csv('raw_data/trainset/recruit_folder.csv')
df_test = pd.read_csv('raw_data/testset/recruit_folder.csv')
df_test['LABEL'] = np.nan
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame.
df_feature = pd.concat([df_train, df_test], sort=False)

# Attach the major required by each posting.
df_recruit = pd.read_csv('raw_data/trainset/recruit.csv')
df_feature = df_feature.merge(df_recruit[['RECRUIT_ID', 'MAJOR']],
                              how='left',
                              on='RECRUIT_ID')
df_feature = df_feature.rename({'MAJOR': 'recruit_MAJOR'}, axis=1)

# Attach the major of each applicant.
df_person = pd.read_csv('raw_data/trainset/person.csv')
df_feature = df_feature.merge(df_person[['PERSON_ID', 'MAJOR']],
                              how='left',
                              on='PERSON_ID')
df_feature = df_feature.rename({'MAJOR': 'person_MAJOR'}, axis=1)

# The encoders are keyed by major_clean()-ed text, so the lookup columns must
# be cleaned the same way; otherwise bracketed majors never match a key.
# Missing majors stay NaN (major_clean passes them through).
for major_column in ('recruit_MAJOR', 'person_MAJOR'):
    df_feature[major_column] = df_feature[major_column].apply(major_clean)

In [21]:
# Preview the merged pair table with both MAJOR columns.
df_feature.head()

Unnamed: 0,RECRUIT_ID,PERSON_ID,LABEL,recruit_MAJOR,person_MAJOR
0,825081,6256839,0.0,工业自动化,
1,772899,5413605,0.0,旅游管理,文秘
2,795668,5219796,0.0,,财政学（含税收学）
3,769754,5700693,0.0,,计算机应用技术
4,773645,6208645,0.0,汽车工程,计算机应用技术


In [22]:
def consine(vector1, vector2):
    """Cosine similarity of two vectors.

    Returns -1 unless both arguments are exactly of type ``np.ndarray``
    (for example when one of them is None); otherwise returns the dot
    product divided by the product of the two Euclidean norms.
    """
    both_arrays = (type(vector1) == np.ndarray
                   and type(vector2) == np.ndarray)
    if not both_arrays:
        return -1

    norm_product = np.linalg.norm(vector1) * (np.linalg.norm(vector2))
    return np.dot(vector1, vector2) / norm_product

In [23]:
def _major_similarity(row):
    """Cosine similarity between the normalized embeddings of a row's two majors."""
    recruit_vec = recruit_major_encoder.get_embedding(row['recruit_MAJOR'],
                                                      True)
    person_vec = person_major_encoder.get_embedding(row['person_MAJOR'], True)
    return consine(recruit_vec, person_vec)


major_pairs = df_feature[['recruit_MAJOR', 'person_MAJOR']]
df_feature['recruit_person_MAJOR_score'] = major_pairs.apply(
    _major_similarity, axis=1)

In [24]:
# Persist the pair identifiers together with the major similarity score.
df_feature[['RECRUIT_ID', 'PERSON_ID',
            'recruit_person_MAJOR_score']].to_pickle('data/score.pkl')