In [None]:
!pip install -q transformers

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

import joblib

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer

from collections import Counter

import re
import string

# import nltk
# nltk.download('punkt')
# from nltk.tokenize import word_tokenize

import gc
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer

In [3]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [4]:
def cleanup():
    """Run Python garbage collection, then empty PyTorch's CUDA cache."""
    # Same two calls, same order as before: collect first, then empty the cache.
    for release in (gc.collect, torch.cuda.empty_cache):
        release()


# Start the notebook from a clean slate.
cleanup()

In [5]:
# Pretrained checkpoint to fine-tune. The trailing number is kept from the
# original notebook (presumably a score obtained with it — TODO confirm).
base_model = 'DeepPavlov/rubert-base-cased-sentence' # 0.780006575919

tokenizer = AutoTokenizer.from_pretrained(base_model)

# Nine output labels, configured as a multi-label classification problem.
model = BertForSequenceClassification.from_pretrained(
    base_model,
    num_labels=9,
    problem_type='multi_label_classification',
)
model = model.to(device)

# Snapshot the untouched weights so later experiments can restart from them.
torch.save(model.state_dict(), 'model_bert_pavlov_initial.pt')

In [6]:
# Labelled reviews; every missing cell is replaced by a placeholder string.
Xy_train_val = pd.read_csv('../input/headhunter/data/data/train.csv', index_col='review_id')
Xy_train_val = Xy_train_val.fillna('Нет информации.')

# The last column is the target, everything before it is a feature.
X_train_val = Xy_train_val.iloc[:, :-1]
y_train_val = Xy_train_val.iloc[:, -1]

# Multi-hot encode the targets over the nine class ids '0'..'8'.
# NOTE(review): each target value is iterated element-wise by the binarizer; if the
# raw targets are strings such as '0,8' this iterates characters — verify the format.
mb = MultiLabelBinarizer(classes=list(map(str, range(9))))
y_train_val = mb.fit_transform(y_train_val)

X_test = pd.read_csv('../input/headhunter/data/data/test.csv', index_col='review_id')
X_test = X_test.fillna('Нет информации.')


def _join_row(row):
    """Glue the values of one row into a single '. '-separated string."""
    return '. '.join(row.values.astype(str))


# Only the columns at positions 2 and 3 are used as model input text.
X_train_val = X_train_val.iloc[:, 2:4].apply(_join_row, axis=1).values
X_test = X_test.iloc[:, 2:4].apply(_join_row, axis=1).values

# Hold out 1% of the labelled rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.01, random_state=42
)

X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape

In [7]:
class MyDatasetForClassification(Dataset):
    """
    Pre-tokenized text dataset that yields dict items for classification models.

    All sentences are tokenized once in the constructor; ``__getitem__`` only
    indexes into the stored tensors.

    Args:
    - X: iterable of n sentences (strings)
    - y: optional n x num_labels label matrix; when omitted (e.g. for inference)
      an all-zero float matrix of shape (n, num_labels) is used instead
    - tokenizer: callable tokenizer returning 'input_ids' and 'attention_mask'
      tensors (required, despite the ``None`` default kept for compatibility)
    - num_labels: width of the placeholder label matrix built when ``y`` is None
    - max_length: sentences are truncated to at most this many tokens

    Raises:
    - ValueError: if ``tokenizer`` is None or ``y`` does not have one row per sentence

    """
    def __init__(self, X, y=None, tokenizer=None, num_labels=9, max_length=128):
        if tokenizer is None:
            # Fail early with a clear message instead of an AttributeError on None.
            raise ValueError('tokenizer is required to build MyDatasetForClassification')

        self.sentences = list(X)

        if y is None:
            self.labels = torch.zeros((len(self.sentences), num_labels))
        else:
            self.labels = torch.tensor(np.asarray(y), dtype=torch.float32)
            if len(self.labels) != len(self.sentences):
                raise ValueError(
                    f'got {len(self.sentences)} sentences but {len(self.labels)} label rows'
                )

        # Calling the tokenizer directly replaces the deprecated batch_encode_plus.
        # padding=True pads every sentence to the longest one in this dataset.
        self.tokenizer_outputs = tokenizer(self.sentences, return_tensors="pt",
                                           max_length=max_length, padding=True, truncation=True)

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        return {'attention_mask': self.tokenizer_outputs['attention_mask'][index],
                'input_ids': self.tokenizer_outputs['input_ids'][index],
                'labels': self.labels[index]}

In [8]:
# Tokenize each split once up front; the test split has no labels.
dataset_train = MyDatasetForClassification(X=X_train, y=y_train, tokenizer=tokenizer)
dataset_val = MyDatasetForClassification(X=X_val, y=y_val, tokenizer=tokenizer)
dataset_test = MyDatasetForClassification(X=X_test, y=None, tokenizer=tokenizer)

In [9]:
def compute_metrics(eval_preds):
    """
    Sample-averaged F1 score for the Trainer's evaluation loop.

    Args:
    - eval_preds: pair ``(predictions, labels)``; ``predictions`` holds raw logits,
      or a tuple whose first element holds them

    Returns:
    - dict with the single key 'f1_score'

    """
    cleanup()

    logits, y_true = eval_preds

    if isinstance(logits, tuple):
        logits = logits[0]

    # sigmoid(x) >= 0.5 is the same decision as x >= 0. Thresholding the logits
    # directly avoids np.exp overflow warnings on large-magnitude logits.
    y_pred = (np.asarray(logits) >= 0).astype(int)
    y_true = np.asarray(y_true).astype(int)

    return {'f1_score': f1_score(y_true, y_pred, average='samples')}

In [11]:
cleanup()

# Hyper-parameter sweep: every combination restarts from the saved initial
# weights and stores its resulting weights under a name encoding the settings.
learning_rates = [5e-6, 1e-5]
warmup_batch_pairs = [(1000, 16)]  # alternative grid kept from the original: [(500, 32), (1000, 16)]
weight_decays = [0, 1e-5]

# Same visiting order as three nested loops: lr outermost, weight decay innermost.
sweep = [
    (lr, ws, bs, wd)
    for lr in learning_rates
    for ws, bs in warmup_batch_pairs
    for wd in weight_decays
]

for lr, ws, bs, wd in sweep:
    print(lr, ws, bs, wd)

    # Reset the model to the untouched pretrained weights before each run.
    model.load_state_dict(torch.load('model_bert_pavlov_initial.pt', map_location=device))

    training_args = TrainingArguments(
        output_dir="test_trainer",
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=32,
        num_train_epochs=2,
        warmup_steps=ws,
        learning_rate=lr,
        weight_decay=wd,
        evaluation_strategy="epoch",
        eval_accumulation_steps=10,
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='f1_score',
        seed=42,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset_train,
        eval_dataset=dataset_val,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    torch.save(model.state_dict(), f'model_bert_pavlov_{lr}_{ws}_{bs}_{wd}.pt')

    cleanup()
    print()

In [12]:
cleanup()

# Final run: start again from the untouched pretrained weights.
model.load_state_dict(torch.load('model_bert_pavlov_initial.pt', map_location=device))

# Batch size, epochs, warmup and learning rate were the values marked in the
# original cell (presumably the ones picked from the sweep — TODO confirm).
# Best-checkpoint reloading is disabled here, so the weights saved below are
# whatever the model holds when training stops.
final_run_config = dict(
    output_dir="test_trainer",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    warmup_steps=1000,
    learning_rate=1e-5,
    evaluation_strategy="epoch",
    eval_accumulation_steps=10,
    save_strategy='epoch',
    load_best_model_at_end=False,
    metric_for_best_model='f1_score',
    seed=42,
)
training_args = TrainingArguments(**final_run_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
)
trainer.train()

torch.save(model.state_dict(), 'model_bert_pavlov_final.pt')

In [13]:
# model.load_state_dict(torch.load('model_bert_pavlov_5e-06_1000_16.pt', map_location=device))

In [13]:
def predict_multilabel_bert(model, X, thres=0.5):
    """
    Predict comma-separated label strings for one batch with a fine-tuned model.

    Args:
    - model: sequence-classification model whose output exposes ``.logits``
    - X: dict of batch tensors passed to the model as keyword arguments
    - thres: a label is predicted when its sigmoid probability is >= thres

    Returns:
    - 1-D numpy array of strings, one per row. Rows where no label reaches the
      threshold fall back to the single most probable label.

    """
    model.to(device)
    # Make sure dropout is off: predictions must not depend on the mode the
    # model was left in by earlier training code.
    model.eval()

    inputs = {k: v.to(device) for k, v in X.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    output = torch.sigmoid(logits.detach().cpu())

    # Hand scikit-learn a plain numpy indicator matrix rather than a torch tensor.
    indicator = (output >= thres).long().numpy()
    y_pred = [','.join(labels) for labels in mb.inverse_transform(indicator)]

    # Map the top-1 column index to its class label explicitly, as strings, so
    # np.where below never has to mix string and integer arrays.
    top1 = output.argmax(dim=1).numpy()
    y_pred_top1 = np.asarray(mb.classes_)[top1].astype(str)

    has_label = np.array([len(x) > 0 for x in y_pred], dtype=bool)
    return np.where(has_label, np.asarray(y_pred, dtype=str), y_pred_top1)

In [14]:
# Keep the original row order (shuffle=False) so predictions line up with the test ids.
dataloader_test = DataLoader(dataset_test, batch_size=64, shuffle=False)

# Predict batch by batch without tracking gradients.
with torch.no_grad():
    pred_labels = [
        predict_multilabel_bert(model, batch, thres=0.5)
        for batch in dataloader_test
    ]

# Stitch the per-batch arrays into a single 1-D array of label strings.
pred_labels = np.concatenate(pred_labels, axis=0)
pred_labels

In [15]:
# Re-read the test file only to recover the review ids in file order.
test_review_ids = pd.read_csv('../input/headhunter/data/data/test.csv', index_col='review_id').index

submission = pd.DataFrame({'review_id': test_review_ids, 'target': pred_labels})
submission.to_csv('answers.csv', index=False)

In [17]:
dataloader_test = DataLoader(dataset_test, batch_size=64, shuffle=False)

# Switch dropout off explicitly so the extracted features are deterministic and
# do not depend on the mode the model was left in by earlier cells.
model.eval()

embeddings_test = []
with torch.no_grad():
    for batch in dataloader_test:
        batch = {k: v.to(device) for k, v in batch.items()}
        hidden_states = model(**batch, output_hidden_states=True)['hidden_states']
        # NOTE(review): per the Transformers docs hidden_states[0] is the
        # embedding-layer output and hidden_states[-1] the last encoder layer;
        # index 0 is kept from the original — confirm this is intended.
        # The mean is taken over every position, padding included.
        embeddings_test.append(hidden_states[0].mean(dim=1).cpu().numpy())

embeddings_test = np.concatenate(embeddings_test, axis=0)
embeddings_test.shape

In [18]:
# shuffle=False keeps the embeddings in the same row order as dataset_train.
dataloader_train = DataLoader(dataset_train, batch_size=64, shuffle=False)

# Switch dropout off explicitly so the extracted features are deterministic and
# do not depend on the mode the model was left in by earlier cells.
model.eval()

embeddings_train = []
with torch.no_grad():
    for batch in dataloader_train:
        batch = {k: v.to(device) for k, v in batch.items()}
        hidden_states = model(**batch, output_hidden_states=True)['hidden_states']
        # NOTE(review): per the Transformers docs hidden_states[0] is the
        # embedding-layer output and hidden_states[-1] the last encoder layer;
        # index 0 is kept from the original — confirm this is intended.
        # The mean is taken over every position, padding included.
        embeddings_train.append(hidden_states[0].mean(dim=1).cpu().numpy())

embeddings_train = np.concatenate(embeddings_train, axis=0)
embeddings_train.shape

In [36]:
# Reload the raw tables: an earlier cell replaced X_train_val / X_test with
# arrays of joined text, so the tabular columns have to be read again.
Xy_train_val = pd.read_csv('../input/headhunter/data/data/train.csv', index_col='review_id').fillna('Нет информации.')
# .copy() so the feature columns added below go to an independent frame and not
# to a slice of Xy_train_val (avoids SettingWithCopyWarning).
X_train_val, y_train_val = Xy_train_val.iloc[:, :-1].copy(), Xy_train_val.iloc[:, -1]

mb = MultiLabelBinarizer(classes=[str(i) for i in range(9)])
y_train_val = mb.fit_transform(y_train_val)

X_test = pd.read_csv('../input/headhunter/data/data/test.csv', index_col='review_id').fillna('Нет информации.')

# (source text column, suffix used in the derived feature names)
text_cols = (('positive', 'pos'), ('negative', 'neg'))
rare_bucket_cols = ['city', 'position']
cols = ['salary_rating', 'team_rating', 'managment_rating',
        'career_rating', 'workplace_rating', 'rest_recovery_rating']

# Categories seen at least 10 times in the TRAINING data. The same set is applied
# to train and test, so a value is bucketed identically in both; counting on
# each frame separately would map one category differently at train and test time.
frequent_categories = {}
for col in rare_bucket_cols:
    counts = X_train_val[col].value_counts()
    frequent_categories[col] = counts[counts >= 10].index

# NOTE: the new columns must keep this exact creation order, because the
# pipeline below selects columns by position.
for data in [X_train_val, X_test]:

    # class 0: special symbol (non-breaking space), checked on the raw text
    for col, tag in text_cols:
        data[f'xa_symbol_{tag}'] = (data[col].str.find('\xa0') != -1).astype(int)

    # small preprocessing: add a space after commas and periods, squeeze repeated spaces.
    # regex=False keeps '.' a literal period regardless of the pandas version.
    for col, _ in text_cols:
        data[col] = (
            data[col]
            .str.replace(',', ', ', regex=False)
            .str.replace('.', '. ', regex=False)
            .apply(lambda text: re.sub(' +', ' ', text))
        )

    # class 8: text length rounded to the nearest ten, capped at 1000
    for col, tag in text_cols:
        data[f'length_{tag}'] = data[col].apply(lambda text: round(len(text), -1)).clip(upper=1000)

    # length of the longest space-separated token, capped at 25
    for col, tag in text_cols:
        data[f'max_{tag}'] = data[col].apply(
            lambda text: max(len(w) for w in text.split(' '))
        ).clip(upper=25)

    # number of occurrences of the most frequent token, capped at 25
    for col, tag in text_cols:
        data[f'most_common_{tag}'] = data[col].apply(
            lambda text: Counter(text.split(' ')).most_common(1)[0][1]
        ).clip(upper=25)

    # everything that is not frequent in the training data goes to one bucket
    for col in rare_bucket_cols:
        data.loc[~data[col].isin(frequent_categories[col]), col] = 'Прочее'

    # how many of the six ratings equal 1, 2, ..., 5
    for i in range(1, 5+1):
        data[f'count_{i}'] = (data.loc[:, cols] == i).sum(axis=1)

    data['rating_mean'] = data.loc[:, cols].mean(axis=1)
    data['rating_std'] = data.loc[:, cols].std(axis=1)

# Same test_size / random_state as the split used for the BERT datasets.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val,
                                                  test_size=0.01, random_state=42)

X_train.shape, y_train.shape, X_test.shape

In [37]:
class DummyTransformer(TransformerMixin, BaseEstimator):
    """
    Identity transformer: returns the input features without transformation.

    Inheriting from BaseEstimator supplies get_params / set_params derived from
    the __init__ signature, so the estimator can be cloned and reconfigured by
    scikit-learn like any other one; get_params() still returns {'value': value}.

    Args:
    - value: unused placeholder parameter, stored as given

    """
    def __init__(self, value=None):
        self.value = value

    def fit(self, X=None, y=None, **fit_params):
        """Nothing to learn; returns self so calls can be chained."""
        return self

    def transform(self, X):
        """Return X unchanged."""
        return X

In [38]:
# Row counts must match: concat(axis=1) would otherwise pad the shorter side with NaN.
assert len(X_train) == len(embeddings_train), 'train rows and train embeddings are misaligned'
assert len(X_test) == len(embeddings_test), 'test rows and test embeddings are misaligned'

# The embedding columns get string names ('emb_0', 'emb_1', ...) so the frame
# does not mix integer and string column labels, which recent scikit-learn
# versions reject. Column positions are unchanged.
# NOTE: this cell overwrites X_train / X_test, so re-running it appends the
# embeddings a second time; re-run the feature cell first.
X_train = pd.concat([X_train.reset_index(drop=True),
                     pd.DataFrame(embeddings_train).add_prefix('emb_')], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True),
                    pd.DataFrame(embeddings_test).add_prefix('emb_')], axis=1)
X_train.shape, X_test.shape

In [41]:
# Columns are addressed by position. The tail of the frame holds the two rating
# statistics followed by 768 embedding columns (presumably the model's hidden
# size — TODO confirm); those are passed through as they are.
n_columns = X_train.shape[1]
dense_start = n_columns - 2 - 768

# Positions 2 and 3 are skipped; everything else before the tail is one-hot encoded.
one_hot_cols = [0, 1] + list(range(4, dense_start))
passthrough_cols = list(range(dense_start, n_columns))

column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), one_hot_cols),
    ('features', DummyTransformer(), passthrough_cols),
])
classifier = OneVsRestClassifier(
    LogisticRegression(C=0.1, max_iter=500, n_jobs=-1, random_state=42)
)

pipeline = Pipeline([
    ('transforms', column_transformer),
    ('lr', classifier),
])
pipeline.fit(X_train, y_train)

In [42]:
def predict_multilabel(model, X):
    """
    Predict comma-separated label strings with a fitted multi-label classifier.

    Args:
    - model: fitted estimator exposing ``predict`` (indicator matrix) and
      ``predict_proba`` (per-class scores)
    - X: features accepted by ``model``

    Returns:
    - 1-D numpy array of strings, one per row. Rows for which ``predict`` selects
      no label fall back to the single most probable label.

    """
    y_pred = [','.join(labels) for labels in mb.inverse_transform(model.predict(X))]

    # Map the top-1 column index to its class label explicitly, as strings, so
    # np.where below never has to mix string and integer arrays.
    top1 = np.asarray(model.predict_proba(X)).argmax(axis=1)
    y_pred_top1 = np.asarray(mb.classes_)[top1].astype(str)

    has_label = np.array([len(x) > 0 for x in y_pred], dtype=bool)
    return np.where(has_label, np.asarray(y_pred, dtype=str), y_pred_top1)

In [45]:
# Re-read the test file only to recover the review ids in file order.
test_review_ids = pd.read_csv('../input/headhunter/data/data/test.csv', index_col='review_id').index

# Predictions of the stacked pipeline (alternative kept from the original: model.predict(X_test).flatten()).
stacked_targets = predict_multilabel(pipeline, X_test)

submission_stacked = pd.DataFrame({'review_id': test_review_ids, 'target': stacked_targets})
submission_stacked.to_csv('answers1.csv', index=False)