Автор задачи компания HeadHunter.

Выбор нужных специализаций для вакансии не всегда тривиален, но при этом набор специализаций сильно влияет на то, будет ли показана вакансия нужным соискателям. Необходимо помочь работодателю быстрее сделать правильный выбор. По контенту вакансий нужно предсказать набор специализаций.

Каждая вакансия на hh.ru протеггирована набором специализаций: от 1 до 6-ти штук. Специализации – категориальный признак, принимающий одно из 620-ти значений, подробнее можно посмотреть, например, в API HeadHunter. 

Датасет (набор данных) состоит из ~2,9 млн вакансий, которые представлены в формате json. Задача состоит в том, чтобы по контенту вакансий предсказать набор специализаций. Оценка качества будет проводиться по среднему значению f-score предсказанных наборов по каждой вакансии.

Набор данных будет разделен на 2 части — обучающая выборка (train) и тестовая (test), для проверки и подсчета метрик качества решения. Данные будут разбиты случайным образом.

По вакансиям будут даны следующие данные: 

название
текстовое описание (может содержать HTML разметку)
ключевые навыки
регион размещения
Id работодателя в виде хэша
вилка заработной платы

In [1]:
import json
import gzip

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

import torch
from transformers import BertModel, BertTokenizer, BertConfig, AdamW # https://huggingface.co/transformers/
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from IPython.display import clear_output
import matplotlib.pyplot as plt
from pylab import rcParams

from functions import BertForMultiLabelSequenceClassification, read_vacancies_part
from functions import mean_f1score, f1score, decode_labels, decode_preds, sigmoid, cleanhtml

import pickle

Using TensorFlow backend.


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device == torch.device('cpu'):
    print('Using cpu')
else:
    print('Using {} GPUs'.format(torch.cuda.get_device_name(0)))

Using GeForce RTX 2080 Ti GPUs


In [3]:
train_specializations = pd.read_csv('train_labels.csv.gz', compression='gzip')
train_specializations = {
    vacancy_id: list(map(int, specs[1:-1].split(',')))
    for vacancy_id, specs in train_specializations.set_index('vacancy_id')['specializations'].iteritems()
}
labels = [spec for value in train_specializations.values() for spec in value]
    
ohe = {cls:i for i, cls in enumerate(set(labels))}
ohe_count = len(set(ohe.values()))

# для декодинга
revert_ohe = {value:key for key, value in ohe.items()}

In [4]:
# Список классов, которые имеют меньше 100 вхождений
exclude = [760, 761, 763, 754, 738, 741, 745, 743, 762, 570, 739, 740, 759, 735, 753, 746, 747, 554, 749,
550, 737, 756, 757, 742, 733, 736, 748, 744, 755, 750, 758, 200, 752, 159, 734, 394, 421, 582, 751]

# Модель

In [5]:
import datetime

class MetaClassifier(torch.nn.Module):
    def __init__(self, total_class: int, predict_class: int, bert_title: str, bert_text: str, input_features, device = None):
        super(MetaClassifier, self).__init__()
        t = torch.nn
        self.bert_title = BertModel.from_pretrained(bert_title)
        self.bert_title_path = bert_title
        self.bert_text = BertModel.from_pretrained(bert_text)
        self.bert_text_path = bert_text
        self.features_fc = t.Sequential(t.Linear(input_features, total_class), t.ReLU(), t.Dropout(p=0.3))
        self.berts_fc = t.Sequential(t.Linear(768*2, total_class), t.ReLU(), t.Dropout(p=0.1))
        self.stack_fc = t.Sequential(t.Linear(total_class*2, total_class), t.ReLU())
        self.dp = t.Dropout(p=0.1)
        self.output = t.Linear(total_class, predict_class)
        if device is not None:
            self.device = device
            self.to(self.device)
        
    def forward(self, x, title, text):
        _, out_title = self.bert_title(title['x'], None, title['attention'])
        _, out_text = self.bert_text(text['x'], None, text['attention'])

        out_berts = self.berts_fc(torch.cat([out_title, out_text], dim=1))        
        del out_text, out_title       
        out_features = self.features_fc(x)
        
        out = self.stack_fc(torch.cat([out_features, out_berts], dim=1))
        del out_features, out_berts
        out = self.dp(out)
        out = self.output(out) 
        return out                           
    
    def save_berts(self):
        self.bert_title.save_pretrained(self.bert_title_path)
        self.bert_text.save_pretrained(self.bert_text_path)
        return True
    
    def change_save_paths(self, bert_title = '', bert_text = ''):
        if bert_title != '':
            self.bert_title_path = bert_title
        if bert_text != '':
            self.bert_text_path = bert_text
            
    def get_outputs(self, loader, train=True, validate=False):
        """
        Return output of the model for loader
        """
        if train:
            pass
        else:
            valid_preds = []
            if validate:
                valid_labels = []
            else:
                # Если не обучаем и не валидация - выдаем вместо лейблов номера вакансий
                vacancy = []
                
        for step, batch in enumerate(loader):
            if step % 1000 == 0:
                now = datetime.datetime.now()
                print('{} Осталось {} из {}'.format(now.strftime("%H:%M:%S"), step, len(loader)))
            bert_title, bert_text = {}, {}
            
            # Передавать метку или нет
            features, bert_title['x'], bert_title['attention'], bert_text_x, bert_text_at, labels = batch
            if train:
                labels = labels.to(self.device)
            
            features = features.type(torch.FloatTensor)       
            features = features.to(self.device)
            bert_title['x'] = bert_title['x'].type(torch.LongTensor)
            bert_title['x'] = bert_title['x'].to(self.device)
            bert_title['attention'] = bert_title['attention'].type(torch.LongTensor)
            bert_title['attention'] = bert_title['attention'].to(self.device)
            
            if train:
                optimizer.zero_grad()

            # Длина текстов часто превышает 512 токенов, поэтому я делаю отдельный расчет по частям и потом усредняю выход
            for i in range(bert_text_x.shape[2]):
                tokens = bert_text_x[:, :, i]
                if step % 1000 == 0:
                    print(tokens[0])
                
                if tokens.sum() == 0:
                    pass
                tokens = tokens.type(torch.LongTensor)
                tokens = tokens.to(device)
                attention = bert_text_at[:, :, i]
                attention = attention.type(torch.LongTensor).to(device)
                
                # вклад текущего текста (более короткие тексты меньше приносят вклад)
                token_count = 1 #len(tokens)/512
                #for token in tokens:
                #    if token > 0:
                #        token_count += 1
                #token_count = len(tokens)/token_count 
                
                if i == 0:
                    if train:
                        output = model(features, bert_title, {'x': tokens, 'x_id': None, 'attention': attention})
                    else:
                        with torch.no_grad():
                            output = model(features, bert_title, {'x': tokens, 'x_id': None, 'attention': attention})
                    predict = output
                    
                else:
                    # работает очень медленно, если обучать каждый раз
                    with torch.no_grad():
                        output = model(features, bert_title, {'x': tokens, 'x_id': None, 'attention': attention})
                    predict = torch.add(predict, output, alpha=token_count)
            output = torch.div(predict, bert_text_x.shape[2])
            
            if train:
                # Backward pass
                loss = floss(output, labels)
                loss.backward()

                # Обновляем параметры и делаем шаг используя посчитанные градиенты
                scheduler.step(loss)
                optimizer.step()

                # Обновляем loss
                train_loss_set.append(float(loss))  
                train_loss += float(loss)
                
                # Рисуем график
                if step % 1000 == 0:
                    clear_output(True)
                    plt.plot(train_loss_set)
                    plt.title("Training loss part={} EPOCH={} learning rage = {}".format(part_num, e, optimizer.param_groups[0]['lr']))
                    plt.xlabel("Batch")
                    plt.ylabel("Loss")
                    plt.ylim(0, 0.03)
                    plt.show()
            else:
                output = output.detach().cpu().numpy()
                valid_preds.extend(output)
                labels = labels.to('cpu').numpy()
                if validate:
                    valid_labels.extend(labels)
                else:
                    vacancy.extend(labels)
        if train:
            return train_loss, train_loss_set
        elif validate:
            return valid_preds, valid_labels
        else:
            return valid_preds, vacancy

# Чтение данных

## Кодировщики

In [6]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler

def calculate_money(cfrom, cto):
    median_value = 27.5 #Подсчитано в explore - нули не считаются, верхний лимит для одной зп 10 млн
    cap = 10000
    mean_money = []
    for key in range(len(cfrom)):
        if cfrom[key] is None and cto[key] is not None:
            mean_money.append(cto[key])
        elif cfrom[key] is not None and cto[key] is None:
            mean_money.append(cfrom[key])
        elif cfrom[key] is None and cto[key] is None:
            mean_money.append(median_value)
        else:
            mean_money.append((cfrom[key] + cto[key]) / 2000)
        
        if mean_money[-1] > cap:
            mean_money[-1] = cap
    return mean_money

def currency_none(currency):
    for i in range(len(currency)):
        if currency[i] is None:
            currency[i] = 'XXX'
    return currency

def skill_tokenize(skill):
    global tokenizer
    result = []
    for s in skill:
        ts = tokenizer.tokenize(s)
        ts.insert(0, '[CLS] ')
        ts.append(" [SEP]")
        result.extend(ts)
    return result

tokenizer = BertTokenizer.from_pretrained('pretrained/', do_lower_case=True)

In [152]:
employment, schedule, experience, currency, employer, cto, cfrom = [], [], [], [], [], [], []

for part_num in tqdm(range(1,11)):
    part = read_vacancies_part(part_num)
    employment.extend(vacancy['employment'] for vacancy in part.values())
    schedule.extend(vacancy['work_schedule'] for vacancy in part.values())
    currency.extend(vacancy['currency'] for vacancy in part.values())
    experience.extend(vacancy['work_experience'] for vacancy in part.values())
    employer.extend(vacancy['employer'] for vacancy in part.values())
    cto.extend(vacancy['compensation_from'] for vacancy in part.values())
    cfrom.extend(vacancy['compensation_to'] for vacancy in part.values())
del part

money = calculate_money(cfrom, cto)

encoders = {
    'employment': OneHotEncoder(sparse=False).fit(np.array(employment).reshape(-1, 1)),
    'schedule': OneHotEncoder(sparse=False).fit(np.array(schedule).reshape(-1, 1)),
    'currency': OneHotEncoder(sparse=False).fit(np.array(currency_none(currency)).reshape(-1, 1)),
    'experience': OneHotEncoder(sparse=False).fit(np.array(experience).reshape(-1, 1)),
    'employer': LabelEncoder().fit(employer),
    'y': ohe,
    'y_decode': revert_ohe,
    'money': MinMaxScaler().fit(np.array(money).reshape(-1, 1))
}
del employment, schedule, currency, experience, employer, money

f = open('encoders.pkl', 'wb')
pickle.dump(encoders, f)
f.close()

del encoders

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))





In [None]:
skills, regions, employment, schedule, experience, currency, employer, cfrom, cto, date = {}, {}, {}, {}, {}, {}, {}, {}, {}, {}
text, names = {}, {}

for part_num in range(1,11):
    print(f"Начинаем {part_num}")
    part = read_vacancies_part(part_num)
    skills.update({int(vacancy_id): vacancy['key_skills'] for vacancy_id, vacancy in part.items()})
    text.update({int(vacancy_id): vacancy['description'] for vacancy_id, vacancy in part.items()})
    names.update({int(vacancy_id): vacancy['name'] for vacancy_id, vacancy in part.items()})
    regions.update({int(vacancy_id): vacancy['area_id'] for vacancy_id, vacancy in part.items()})
    employment.update({int(vacancy_id): vacancy['employment'] for vacancy_id, vacancy in part.items()})
    schedule.update({int(vacancy_id): vacancy['work_schedule'] for vacancy_id, vacancy in part.items()})
    currency.update({int(vacancy_id): vacancy['currency'] for vacancy_id, vacancy in part.items()})
    experience.update({int(vacancy_id): vacancy['work_experience'] for vacancy_id, vacancy in part.items()})
    employer.update({int(vacancy_id): vacancy['employer'] for vacancy_id, vacancy in part.items()})
    date.update({int(vacancy_id): vacancy['creation_date'] for vacancy_id, vacancy in part.items()})
    cfrom.update({int(vacancy_id): vacancy['compensation_from'] for vacancy_id, vacancy in part.items()})
    cto.update({int(vacancy_id): vacancy['compensation_to'] for vacancy_id, vacancy in part.items()})
    
    # Готовим данные
    X_names, X_skills, X_text, X_regions, X_employment, X_schedule, X_currency, X_experience  = [], [], [], [], [], [], [], []
    X_employer, X_date, X_cfrom, X_cto, y = [], [], [], [], []
    Xs_names, Xs_skills, Xs_text, Xs_regions, Xs_employment, Xs_schedule, Xs_currency, Xs_experience  = [], [], [], [], [], [], [], []
    Xs_employer, Xs_date, Xs_cfrom, Xs_cto = [], [], [], []
    for vacancy in names.keys():
        # размечанные данные
        if vacancy in train_specializations:
            X_names.append(names[vacancy])
            X_skills.append(skills[vacancy])
            X_text.append(text[vacancy])
            X_regions.append(regions[vacancy])
            X_employment.append(employment[vacancy])
            X_schedule.append(schedule[vacancy])
            X_currency.append(currency[vacancy])
            X_experience.append(experience[vacancy])
            X_employer.append(employer[vacancy])
            X_date.append(date[vacancy])
            X_cfrom.append(cfrom[vacancy])
            X_cto.append(cto[vacancy])
            y.append(train_specializations[vacancy])
        # неразмечанные данные
        else:
            pass # закоментировать на первом прогоне
            Xs_names.append(names[vacancy])
            Xs_skills.append(skills[vacancy])
            Xs_text.append(text[vacancy])
            Xs_regions.append(regions[vacancy])
            Xs_employment.append(employment[vacancy])
            Xs_schedule.append(schedule[vacancy])
            Xs_currency.append(currency[vacancy])
            Xs_experience.append(experience[vacancy])
            Xs_employer.append(employer[vacancy])
            Xs_date.append(date[vacancy])
            Xs_cfrom.append(cfrom[vacancy])
            Xs_cto.append(cto[vacancy])
                
    skills, regions, employment, schedule, experience, currency, employer, cfrom, cto, date = {}, {}, {}, {}, {}, {}, {}, {}, {}, {}
    text, names = {}, {}
        
    print('БЕРТ по заголовку')
    X_names = ["[CLS] " + sentence + " [SEP]" for sentence in X_names]
    X_names = [tokenizer.tokenize(sent) for sent in X_names]
    print('Заголовки токенизированы')
    print('БЕРТ по скиллам')
    X_skills = [skill_tokenize(sent) for sent in X_skills]
    print('Скиллы токенизированы')
    
    for i in range(len(X_names)):
        X_names[i].extend(X_skills[i])
    del X_skills
    
    print('БЕРТ по тексту')
    X_text = ["[CLS] " + cleanhtml(x) + " [SEP]" for x in X_text]
    X_text = [tokenizer.tokenize(sent) for sent in tqdm(X_text)]
    
    f = open(f'train{part_num}.pkl', 'wb')
    pickle.dump(dict(
        title = X_names,
        text = X_text,
        regions = X_regions,
        employment = X_employment,
        schedule = X_schedule,
        currency = X_currency,
        employer = X_employer,
        date = X_date,
        cfrom = X_cfrom,
        cto = X_cto,
        y = y,
        ohe = ohe,
        revert_ohe = revert_ohe,
    ), f)
    f.close()
    
del X_names, X_regions, X_employment, X_schedule, X_currency, X_experience, X_employer, X_date, X_cfrom, X_cto, y
del Xs_names, Xs_skills, Xs_text, Xs_regions, Xs_employment, Xs_schedule, Xs_currency, Xs_experience, Xs_employer, Xs_date, Xs_cfrom, Xs_cto

## Подготовка и кодировка данных для самбита

In [17]:
skills, regions, employment, schedule, experience, currency, employer, cfrom, cto, date = {}, {}, {}, {}, {}, {}, {}, {}, {}, {}
text, names = {}, {}

for part_num in range(1,11):
    print(f"Начинаем {part_num}")
    part = read_vacancies_part(part_num)
    skills.update({int(vacancy_id): vacancy['key_skills'] for vacancy_id, vacancy in part.items()})
    text.update({int(vacancy_id): vacancy['description'] for vacancy_id, vacancy in part.items()})
    names.update({int(vacancy_id): vacancy['name'] for vacancy_id, vacancy in part.items()})
    regions.update({int(vacancy_id): vacancy['area_id'] for vacancy_id, vacancy in part.items()})
    employment.update({int(vacancy_id): vacancy['employment'] for vacancy_id, vacancy in part.items()})
    schedule.update({int(vacancy_id): vacancy['work_schedule'] for vacancy_id, vacancy in part.items()})
    currency.update({int(vacancy_id): vacancy['currency'] for vacancy_id, vacancy in part.items()})
    experience.update({int(vacancy_id): vacancy['work_experience'] for vacancy_id, vacancy in part.items()})
    employer.update({int(vacancy_id): vacancy['employer'] for vacancy_id, vacancy in part.items()})
    date.update({int(vacancy_id): vacancy['creation_date'] for vacancy_id, vacancy in part.items()})
    cfrom.update({int(vacancy_id): vacancy['compensation_from'] for vacancy_id, vacancy in part.items()})
    cto.update({int(vacancy_id): vacancy['compensation_to'] for vacancy_id, vacancy in part.items()})
    
    X_names, X_skills, X_text, X_regions, X_employment, X_schedule, X_currency, X_experience  = [], [], [], [], [], [], [], []
    X_employer, X_date, X_cfrom, X_cto, y = [], [], [], [], []
    Xs_names, Xs_skills, Xs_text, Xs_regions, Xs_employment, Xs_schedule, Xs_currency, Xs_experience  = [], [], [], [], [], [], [], []
    Xs_employer, Xs_date, Xs_cfrom, Xs_cto, Xs_vacancy = [], [], [], [], []
    for vacancy in names.keys():
        # неразмечанные данные
        if vacancy not in train_specializations:
            Xs_vacancy.append(vacancy)
            Xs_names.append(names[vacancy])
            Xs_skills.append(skills[vacancy])
            Xs_text.append(text[vacancy])
            Xs_regions.append(regions[vacancy])
            Xs_employment.append(employment[vacancy])
            Xs_schedule.append(schedule[vacancy])
            Xs_currency.append(currency[vacancy])
            Xs_experience.append(experience[vacancy])
            Xs_employer.append(employer[vacancy])
            Xs_date.append(date[vacancy])
            Xs_cfrom.append(cfrom[vacancy])
            Xs_cto.append(cto[vacancy])
    
    skills, regions, employment, schedule, experience, currency, employer, cfrom, cto, date = {}, {}, {}, {}, {}, {}, {}, {}, {}, {}
    text, names = {}, {}
    
    f = open(f'submit{part_num}.pkl', 'wb')
    pickle.dump(dict(
        vacancy = Xs_vacancy,
        names = Xs_names,
        skills = Xs_skills,
        text = Xs_text,
        regions = Xs_regions,
        employment = Xs_employment,
        schedule = Xs_schedule,
        currency = Xs_currency,
        employer = Xs_employer,
        date = Xs_date,
        cfrom = Xs_cfrom,
        cto = Xs_cto
    ), f)
    f.close()

Начинаем 1
Начинаем 2
Начинаем 3
Начинаем 4
Начинаем 5
Начинаем 6
Начинаем 7
Начинаем 8
Начинаем 9
Начинаем 10


# Подготовка фичей

## Загрузка токенизированных текстов и других данных

In [7]:
def read_param_data(part_num, train=True):
    if train:
        path = f'train{part_num}.pkl'
    else:
        path = f'submit{part_num}.pkl'
    f = open(path, 'rb')
    params = pickle.load(f)
    f.close()
    return params

In [8]:
# One hot encoding для y
def encode_y(y, ohe):
    for i in range(len(y)):
        y_ohe = np.zeros(ohe_count)
        for spec in y[i]:
            y_ohe[ohe[spec]] = 1.0
        y[i] = y_ohe
    return y

In [9]:
def split(dataset, size):
    #dataset = dataset.shuffle()
    size = int(len(dataset) * size)
    test_dataset, train_dataset = torch.utils.data.random_split(dataset, [size, len(dataset) - size])
    del dataset
    
    train_loader = DataLoader(
        train_dataset,
        sampler= RandomSampler(train_dataset),
        batch_size = batch_size
    )
    del train_dataset

    test_loader = DataLoader(
        test_dataset,
        sampler = SequentialSampler(test_dataset),
        batch_size = batch_size
    )
    del test_dataset
    
    return train_loader, test_loader

In [10]:
def prepare_bert_data(X, maxlen=512):
    global tokenizer
    input_ids = []
    for x in X:
        if len(x) > 0:
            input_ids.append(tokenizer.convert_tokens_to_ids(x))
        else:
            input_ids.append(np.zeros((maxlen,)))

    del X
    input_ids = pad_sequences(
        input_ids,
        maxlen=maxlen,
        dtype="long",
        truncating="post",
        padding="post"
    )
    attention_masks = [[float(i>0) for i in seq] for seq in input_ids]
    return {'x': torch.tensor(input_ids), 'x_id': None, 'attention': torch.tensor(attention_masks)} 

In [11]:
def load_encoders():
    f = open('encoders.pkl', 'rb')
    ohe = pickle.load(f)
    f.close()
    return ohe
    
    
def prepare_non_text_features(params):   
    regions = np.array(params['regions']).reshape(-1, 1)
    del params['regions']

    encoders = load_encoders()
    employment = encoders['employment'].transform(np.array(params['employment']).reshape(-1, 1))
    del params['employment']

    schedule = encoders['schedule'].transform(np.array(params['schedule']).reshape(-1, 1))
    del params['schedule']
    
    currency = currency_none(params['currency'])
    del params['currency']
    currency = encoders['currency'].transform(np.array(currency).reshape(-1, 1))
    # удалить последнюю валюту (которая была неизвестна)
    currency = currency[:, :-1]

    employer = encoders['employer'].transform(params['employer'])
    del params['employer']
    employer = list(employer)
    for i in range(len(employer)):
        employer[i] = [employer[i]]

    date = []
    for d in params['date']:
        date.append([int(d[5:7])])
    del params['date']

    money = calculate_money(params['cfrom'], params['cto'])
    del params['cfrom'], params['cto']
    money = encoders['money'].transform(np.array(money).reshape(-1, 1))

    return np.hstack((regions, employment, schedule, currency, employer, date, money))


def prepare_text_feature(text):
    # текст может не помещаться (макс количество токенов - 2982)
    # мы берем среднее по всем проходам нейронки с разными частями  
    stack_dim_x, stack_dim_xid, stack_dim_attention = [], [], []
    run = True
    # медленное решение
    #while run:
    run = False
    bert_text_x = []
    for i in range(len(text)):
        # флаг остались у нас еще длинные тексты
        if len(text[i]) > 0:
            run = True
            bert_text_x.append(text[i][:512]) #стак по номеру эксперимента
            text[i] = text[i][512:]
        else:
            bert_text_x.append([])
    if run:
        bert_text_x = prepare_bert_data(bert_text_x)
        # stack_dim_xid.append(bert_text_x['x_id'])
        stack_dim_attention.append(bert_text_x['attention'])
        stack_dim_x.append(bert_text_x['x'])
    # стакаем несколько входов для текстов
    stack_dim_x = torch.stack(tuple(stack_dim_x), dim=2)
    stack_dim_attention = torch.stack(tuple(stack_dim_attention), dim=2)
    return stack_dim_x, None,stack_dim_attention

In [12]:
PATH = 'metamodel.bin'
PATH_BERT_TITLE = 'pretrained/'
PATH_BERT_TEXT = 'pretrained/'

model = MetaClassifier(620, 620, PATH_BERT_TITLE, PATH_BERT_TEXT, 24, device=device)
# model.load_state_dict(torch.load(PATH))
model.to(device)
floss = torch.nn.BCEWithLogitsLoss()

# Pipeline

Перед началом обучения была использована публично доступная предобученная модель RuBert: http://docs.deeppavlov.ai/en/master/features/pretrained_vectors.html#bert

In [17]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [15]:
batch_size = 1
learning_rate = 0.001
EPOCHS = 1
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=10000, factor=0.9)

In [None]:
# pipeline
rcParams['figure.figsize'] = 15, 10

model.change_save_paths(PATH_BERT_TITLE, PATH_BERT_TEXT)
model.train()
train_loss_set = []

for e in range(EPOCHS):
    for part_num in tqdm(range(2,11)):
        print("Loading data...")
        params = read_param_data(part_num)
        params['title'] = prepare_bert_data(params['title'])
        x_text, xid_text, at_text = prepare_text_feature(params['text'])
        del params['text']

        features = prepare_non_text_features(params)
        features = torch.tensor(features)
        print('Non-text features shape: ', features.shape)
        
        y = encode_y(params['y'], ohe)
        y = torch.tensor(y)
        
        # Надо стакнуть тензоры так чтобы получилось новое пространство
        dataset = TensorDataset(features, 
                                params['title']['x'], params['title']['attention'], 
                                x_text, at_text, y)
        del y, params
        train_loader, test_loader = split(dataset, 0.1)
        
        # Запуск обучения
        for step, batch in enumerate(train_loader):
            bert_title, bert_text = {}, {}
            features, bert_title['x'], bert_title['attention'], bert_text_x, bert_text_at, labels = batch
            features = features.type(torch.FloatTensor)       
            features = features.to(device)
            labels = labels.to(device)

            bert_title['x'] = bert_title['x'].type(torch.LongTensor)
            bert_title['x'] = bert_title['x'].to(device)
            bert_title['attention'] = bert_title['attention'].type(torch.LongTensor)
            bert_title['attention'] = bert_title['attention'].to(device)

            optimizer.zero_grad()

            # Длина текстов часто превышает 512 токенов, поэтому я делаю отдельный расчет по частям и потом усредняю выход
            for i in range(1): #bert_text_x.shape[2]
                tokens = bert_text_x[:, :, i]
                if tokens.sum() == 0:
                    pass
                tokens = tokens.type(torch.LongTensor)
                tokens = tokens.to(device)
                attention = bert_text_at[:, :, i]
                attention = attention.type(torch.LongTensor).to(device)
                # вклад текущего текста (более короткие тексты меньше приносят вклад)
                token_count = 1 #len(tokens)/512
                #for token in tokens:
                #    if token > 0:
                #        token_count += 1
                #token_count = len(tokens)/token_count 
                if i == 0:
                    output = model(features, bert_title, {'x': tokens, 'x_id': None, 'attention': attention})
                    predict = output
                else:
                    # работает очень медленно, если обучать каждый раз
                    with torch.no_grad():
                        output = model(features, bert_title, {'x': tokens, 'x_id': None, 'attention': attention})
                    predict = torch.add(predict, output, alpha=token_count)
            #output = torch.div(predict, bert_text_x.shape[2])
            output = predict
            
            # Backward pass
            loss = floss(output, labels)
            loss.backward()

            # Обновляем параметры и делаем шаг используя посчитанные градиенты
            scheduler.step(loss)
            
            # Обновляем loss
            train_loss_set.append(float(loss))  

            # Рисуем график
            if step % 1000 == 0:
                clear_output(True)
                plt.plot(train_loss_set)
                plt.title("Training loss part={} EPOCH={} learning rage = {}".format(part_num, e, optimizer.param_groups[0]['lr']))
                print("Training loss {} part={} EPOCH={} learning rage = {}".format(
                    train_loss_set[-1], part_num, e, optimizer.param_groups[0]['lr']))
                plt.xlabel("Batch")
                plt.ylabel("Loss")
                plt.ylim(0, 0.03)
                plt.show()

        
        torch.save(model.state_dict(), PATH)
        model.save_berts() 

In [13]:
for i in range(1):
    print(i)

0


In [19]:
torch.save(model.state_dict(), PATH)
model.save_berts() 

True

# Валидация

In [None]:
model.load_state_dict(torch.load(PATH))

In [128]:
model.eval()
valid_preds, valid_labels = [], []
for batch in tqdm(test_loader):
    with torch.no_grad():
        x, labels = batch
        x = x.type(torch.FloatTensor)
        x = x.to(device)
        output = model(x)
        
        output = output.detach().cpu().numpy()
        labels = labels.numpy()

        valid_preds.extend(output)
        valid_labels.extend(labels)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [115]:
def decode_preds(y, revert_ohe, threshold=0.5, max_class=6):
    result = []
    for class_ in y:
        class_ = sigmoid(class_)
        current_class = []
        for i in range(max_class):
            top = np.argmax(class_)
            # Порог либо первый элемент (хотя бы одна специализация есть)
            if class_[top] >= threshold or i==0:
                current_class.append(revert_ohe[top])
            class_ = np.delete(class_, [top])
        result.append(current_class)
    return result

In [2]:
_valid_preds = decode_preds(valid_preds, revert_ohe)
_valid_labels = decode_labels(valid_labels, revert_ohe)

print("Score: {0:.2f}".format(mean_f1score(_valid_labels, _valid_preds)))

Score: 0.51


# Submit
## Токензация текстов
выполняется один раз

In [None]:
for part_num in range(2,11):
    print(f'Часть {part_num}')
    params = read_param_data(part_num, False)
    
    print('БЕРТ по заголовку')
    params['title'] = ["[CLS] " + sentence + " [SEP]" for sentence in params['names']]
    del params['names']
    params['title'] = [tokenizer.tokenize(sent) for sent in params['title']]
    
    print('БЕРТ по скиллам')
    params['skills'] = [skill_tokenize(sent) for sent in params['skills']]
    # объединяем входы
    for i in range(len(params['title'])):
        params['title'][i].extend(params['skills'][i])
    del params['skills']

    print('БЕРТ по тексту')
    params['text'] = ["[CLS] " + cleanhtml(x) + " [SEP]" for x in params['text']]
    params['text'] = [tokenizer.tokenize(sent) for sent in tqdm(params['text'])]

    f = open(f'submit{part_num}.pkl', 'wb')
    pickle.dump(params, f)
    f.close()

## Запуск расчета

In [None]:
answers = {}
model.eval()
for part_num in range(1,2):
    params = read_param_data(part_num, False)
    
    params['title'] = prepare_bert_data(params['title'])
    
    x_text, xid_text, at_text = prepare_text_feature(params['text'])
    del params['text']
    
    features = prepare_non_text_features(params)
    features = torch.tensor(features)
    print('Non-text features shape: ', features.shape)  

    dataset = TensorDataset(features, 
                            params['title']['x'], params['title']['attention'], 
                            x_text, at_text, torch.tensor(params['vacancy']))
    del params
    
    train_loader, test_loader = split(dataset, 0.99)
    del test_loader
    
    preds, vacancy = model.get_outputs(train_loader, train=False, validate=False)    
    for i, v in enumerate(vacancy):
        answers[int(v)] = decode_preds([preds[i]], revert_ohe)

In [None]:
for i, v in enumerate(vacancy):
        answers[int(v)] = decode_preds(preds[i],revert_ohe)
        
preds = []
for value in answers.values():
    preds.append(decode_preds([value], revert_ohe)[0])

In [None]:
%%time
sample_submission = pd.DataFrame([
    (vacancy_id, top3_specs_by_employer.get(vacancy_employers[vacancy_id], [top_spec]))
    for vacancy_id in test_ids
], columns=['vacancy_id', 'specializations'])
sample_submission.to_csv('sample_submission.csv.gz', index=False, compression='gzip')