In [1]:
# !pip install -q transformers

In [23]:
import os
os.environ["WANDB_DISABLED"] = "true"

import joblib

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.base import TransformerMixin
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import f1_score

from collections import Counter

import re
import string

# import nltk
# nltk.download('punkt')
# from nltk.tokenize import word_tokenize

import gc
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, FeatureExtractionPipeline

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [4]:
def cleanup():
    """Reclaim memory: collect unreachable Python objects, then release PyTorch's cached CUDA memory."""
    gc.collect()                # first, so tensors referenced only by garbage become freeable
    torch.cuda.empty_cache()    # then release the cached blocks that are no longer occupied


cleanup()

In [5]:
# Pretrained checkpoint used as the backbone for fine-tuning.
base_model = 'DeepPavlov/rubert-base-cased-sentence' # 0.780006575919

tokenizer = AutoTokenizer.from_pretrained(base_model)
# 9 output logits; problem_type='multi_label_classification' makes the model
# use a per-label binary cross-entropy loss when labels are supplied.
model = BertForSequenceClassification.from_pretrained(
    base_model, num_labels=9, problem_type='multi_label_classification'
).to(device)
# Snapshot the untrained weights (including the randomly initialised classifier
# head). The hyper-parameter search reloads this file before every run, so it
# must be written here for the notebook to work from a fresh kernel.
torch.save(model.state_dict(), 'model_bert_pavlov_initial.pt')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=24.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=642.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1649718.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=711456784.0, style=ProgressStyle(descri…




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-sentence and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Load the labelled table (indexed by review_id); every missing cell is replaced
# with a placeholder sentence.
Xy_train_val = pd.read_csv('data/train.csv', index_col='review_id').fillna('Нет информации.')
# The last column is the target; all columns before it are features.
X_train_val, y_train_val = Xy_train_val.iloc[:, :-1], Xy_train_val.iloc[:, -1]

# Encode the targets as a 9-column multi-hot matrix over the class labels '0'..'8'.
# NOTE(review): each target value is handed to MultiLabelBinarizer as-is, so it is
# iterated element by element — presumably targets are strings made of
# single-character labels; confirm against the data.
mb = MultiLabelBinarizer(classes=[str(i) for i in range(9)])
y_train_val = mb.fit_transform(y_train_val)

# The unlabelled test table gets the same placeholder treatment.
X_test = pd.read_csv('data/test.csv', index_col='review_id').fillna('Нет информации.')

# Disabled experiment: prefix short texts with a marker sentence.
# for data in [X_train_val, X_test]:
#     data.positive = data.positive.apply(lambda x: "Короткое предложение. " if len(x) < 50 else "") + data.positive
#     data.negative = data.negative.apply(lambda x: "Короткое предложение. " if len(x) < 50 else "") + data.negative

# Build one text per review by joining the columns at positions 2 and 3 with '. '.
# NOTE(review): assumes train and test share the same column order — confirm.
X_train_val = X_train_val.iloc[:, 2:4].apply(lambda row: '. '.join(row.values.astype(str)), axis=1).values
X_test = X_test.iloc[:, 2:4].apply(lambda row: '. '.join(row.values.astype(str)), axis=1).values

# Hold out 1% of the labelled rows for validation; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.01, random_state=42)

X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape



((50367,), (509,), (50367, 9), (509, 9), (50651,))

In [7]:
class MyDatasetForClassification(Dataset):
    """
    Dataset that tokenizes every sentence once, up front, and serves
    model-ready items for multi-label classification.

    Args:
    - X: iterable of n sentences (strings)
    - y: optional n x num_labels multi-hot label matrix; when omitted,
      all-zero placeholder labels are used (e.g. for an unlabelled test set)
    - tokenizer: callable Hugging Face tokenizer that returns 'input_ids'
      and 'attention_mask'; required despite the keyword default
    - max_length: sequences longer than this many tokens are truncated
    - num_labels: width of the placeholder label matrix used when y is None

    Raises:
    - ValueError: if tokenizer is None, or y and X differ in number of rows

    """
    def __init__(self, X, y=None, tokenizer=None, max_length=128, num_labels=9):
        # Fail early with a clear message instead of an AttributeError on None.
        if tokenizer is None:
            raise ValueError("tokenizer is required to build MyDatasetForClassification")

        self.sentences = list(X)

        if y is None:
            self.labels = torch.zeros((len(self.sentences), num_labels))
        else:
            # Float labels are what the multi-label (BCE) loss expects.
            self.labels = torch.tensor(y, dtype=torch.float32)
            if len(self.labels) != len(self.sentences):
                raise ValueError(
                    f"X and y must have the same number of rows, "
                    f"got {len(self.sentences)} and {len(self.labels)}"
                )

        # Calling the tokenizer directly replaces the deprecated batch_encode_plus.
        # padding=True pads to the longest sequence so all rows share one width.
        self.tokenizer_outputs = tokenizer(self.sentences, return_tensors="pt",
                                           max_length=max_length, padding=True, truncation=True)

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        # `index` may be an int or a slice; a slice yields a ready-made mini-batch.
        input_ids = self.tokenizer_outputs['input_ids'][index]
        attention_mask = self.tokenizer_outputs['attention_mask'][index]

        label = self.labels[index]

        return {'attention_mask': attention_mask,
                'input_ids': input_ids,
                'labels': label}

In [8]:
# Tokenize each split once up front. The test split is passed without labels,
# so the dataset fills in all-zero placeholder labels for it.
dataset_train = MyDatasetForClassification(X_train, y_train, tokenizer)
dataset_val = MyDatasetForClassification(X_val, y_val, tokenizer)
dataset_test = MyDatasetForClassification(X_test, tokenizer=tokenizer)

In [9]:
def compute_metrics(eval_preds):
    """
    Compute the sample-averaged F1 score for multi-label predictions.

    Args:
    - eval_preds: pair (predictions, labels); predictions are raw logits,
      or a tuple whose first element is the logits

    Returns:
    - dict with the single key 'f1_score'
    """
    # Free memory before scoring; this runs at every evaluation.
    cleanup()

    y_pred, y_true = eval_preds

    # Predictions may arrive as a tuple of outputs; the logits come first.
    if isinstance(y_pred, tuple):
        y_pred = y_pred[0]

    # sigmoid(z) >= 0.5 holds exactly when z >= 0, so threshold the logits
    # directly. This avoids overflow warnings from np.exp on large |z|.
    y_pred = (np.asarray(y_pred) >= 0).astype(int)
    y_true = np.asarray(y_true).astype(int)

    return {'f1_score': f1_score(y_true, y_pred, average='samples')}

In [44]:
# Mean-pool hidden_states[0] over the token axis for the first two test items.
# Inputs are moved to the model's device (required when it lives on the GPU)
# and gradients are disabled, since this is inspection only.
with torch.no_grad():
    sample_batch = {k: v.to(device) for k, v in dataset_test[:2].items()}
    output_hidden_states = model(**sample_batch, output_hidden_states=True)['hidden_states'][0].mean(dim=1)

In [53]:
# `output_hidden_states` is already the mean-pooled 2-D tensor, so read its
# shape directly. Indexing [0] and pooling over dim=1 again would fail on the
# resulting 1-D row.
output_hidden_states.size()

torch.Size([2, 768])

In [24]:
# Wrap the model and tokenizer in a feature-extraction pipeline so raw strings
# can be passed in directly for ad-hoc checks.
pipeline = FeatureExtractionPipeline(model=model, tokenizer=tokenizer)

In [47]:
# Probe the pipeline with a short placeholder phrase; it returns one list of
# 9 values for the single input (presumably the classifier logits — confirm).
pipeline('Нет информации')

[[0.2061765193939209,
  0.2131747305393219,
  -0.005690824240446091,
  -0.03178182244300842,
  0.07467031478881836,
  -0.0338858962059021,
  -0.06764063239097595,
  -0.013200432062149048,
  -0.07935985177755356]]

In [18]:
# Display the model's module tree.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [None]:
# Grid search over learning rate (lr), warmup steps (ws) and train batch size (bs).
# Each configuration is fine-tuned for 2 epochs and its weights are saved to a
# file named after the configuration.
cleanup()

for lr in [5e-6, 1e-5, 5e-5]:
    for ws in [0, 500, 1000]:
        for bs in [16, 32]:
            print(lr, ws, bs)
            
            # Reset to the same starting weights before every run so that runs are
            # comparable. Requires 'model_bert_pavlov_initial.pt' (a state_dict
            # snapshot of the untrained model) to exist on disk.
            model.load_state_dict(torch.load('model_bert_pavlov_initial.pt', map_location=device))
            
            training_args = TrainingArguments(
                output_dir="test_trainer",  # shared by all runs
                per_device_train_batch_size=bs,
                per_device_eval_batch_size=32,
                num_train_epochs=2,
                warmup_steps=ws,
                learning_rate=lr,
                # Evaluate and checkpoint once per epoch, then restore the
                # checkpoint with the best validation 'f1_score'.
                evaluation_strategy="epoch",
                eval_accumulation_steps=10,
                save_strategy='epoch',
                load_best_model_at_end=True,
                metric_for_best_model='f1_score',
                seed=42,
            )
            
            trainer = Trainer(model=model, 
                              args=training_args, 
                              train_dataset=dataset_train, 
                              eval_dataset=dataset_val, 
                              compute_metrics=compute_metrics)
            
            trainer.train()
            
            # Persist this configuration's weights under a name that encodes it.
            torch.save(model.state_dict(), f'model_bert_pavlov_{lr}_{ws}_{bs}.pt')
            
            cleanup()
            print()

In [11]:
# length = 128, epochs=3, warm=500, batch 32, 2-4 ---------> best
# Epoch	Training Loss	Validation Loss	F1 Score
# 1	0.147500	0.114881	0.799581
# 2	0.107300	0.108582	0.814714
# 3	0.095200	0.108814	0.818717

# length = 128 (64, 100), epochs=3 (5), warm=500 (1000), batch 32 (64), 2-4 + length feature adding () -> 0.765064772802
# Epoch	Training Loss	Validation Loss	F1 Score
# 1	0.148700	0.116537	0.802542
# 2	0.107600	0.108154	0.815998
# 3	0.095300	0.108450	0.822058

# new length = 128, epochs=3, warm=500 + new 'unknown' filling + Pavlov
# Epoch	Training Loss	Validation Loss	F1 Score
# 1	0.116400	0.086875	0.847000
# 2	0.092300	0.083631	0.863333
# 3	0.073400	0.089647	0.850667

# new length = 128, epochs=3, warm=1000 + new 'unknown' filling + Pavlov + more data
# Epoch	Training Loss	Validation Loss	F1 Score
# 1	0.111700	0.088201	0.840864
# 2	0.092700	0.084430	0.845776
# 3	0.072400	0.089046	0.847741

# new length = 128, epochs=2, warm=1000 + new 'unknown' filling + length feature adding () + Pavlov + more data
# Epoch	Training Loss	Validation Loss	F1 Score
# 1	0.109400	0.086877	0.849378
# 2	0.090400	0.081575	0.864113

# base pavlov
# Epoch	Training Loss	Validation Loss	F1 Score
# 1	0.104400	0.081640	0.861166
# 2	0.085200	0.079099	0.865095
# 3	0.060600	0.084978	0.861100

# no distill pavlov base-cased-sentence
# Epoch	Training Loss	Validation Loss	F1 Score
# 1	0.109600	0.084530	0.869352
# 2	0.087800	0.078737	0.865422
# 3	0.064400	0.079970	0.862083

# no distill pavlov base-cased
# Epoch	Training Loss	Validation Loss	F1 Score
# 1	0.109400	0.085003	0.859201
# 2	0.089000	0.082281	0.865750
# 3	0.066100	0.086777	0.859528

# no distill pavlov base-cased-sentence 1e-5
# Epoch	Training Loss	Validation Loss	F1 Score
# 1	0.116600	0.093780	0.852980
# 2	0.099900	0.084599	0.868697
# 3	0.088200	0.085746	0.856909

In [16]:
# Run training again with the current `trainer`, then persist the resulting weights.
# NOTE(review): `trainer` is whichever instance was created most recently (the only
# visible construction is inside the hyper-parameter search loop) — confirm that
# this is the intended configuration and starting point.
trainer.train()
torch.save(model.state_dict(), 'model_bert_pavlov_full.pt')

In [13]:
def predict_multilabel_bert(model, X, thres=0.5):
    """
    Predict label strings for one batch with a multi-label BERT classifier.

    Uses the notebook-level `device` (where inference runs) and `mb` (the
    fitted MultiLabelBinarizer that maps indicator columns to class labels).

    Args:
    - model: sequence-classification model whose output exposes `.logits`
    - X: dict of batched tensors (e.g. 'input_ids', 'attention_mask');
      a 'labels' entry, if present, is ignored
    - thres: probability threshold above which a class is predicted

    Returns:
    - numpy array of strings, one per row: the predicted class labels joined
      by ',', or the single most probable class when none reaches `thres`

    Side effects:
    - moves `model` to `device` and switches it to evaluation mode
    """
    model.to(device)
    # Dropout must be off at inference, otherwise predictions are noisy.
    model.eval()

    # Labels are dropped so the model does not compute a loss on dummy targets.
    inputs = {k: v.to(device) for k, v in X.items() if k != 'labels'}
    with torch.no_grad():
        output = torch.sigmoid(model(**inputs).logits.detach().cpu())

    # Hand scikit-learn a plain numpy indicator matrix rather than a torch tensor.
    indicator = (output >= thres).long().numpy()
    y_pred = [','.join(labels) for labels in mb.inverse_transform(indicator)]

    # Fallback for rows with no class above the threshold: take the most probable
    # class, mapped through mb.classes_ so both branches yield label strings.
    y_pred_top1 = np.asarray(mb.classes_)[output.argmax(dim=1).numpy()].astype(str)

    has_prediction = np.array([len(x) > 0 for x in y_pred], dtype=bool)
    return np.where(has_prediction, np.asarray(y_pred, dtype=str), y_pred_top1)

In [14]:
# Batched inference over the test set; shuffle=False keeps predictions aligned
# with the order of the test rows.
dataloader_test = DataLoader(dataset_test, batch_size=64, shuffle=False)

# Make sure dropout is disabled before predicting, whatever mode training left.
model.eval()

with torch.no_grad():
    batch_predictions = [predict_multilabel_bert(model, batch, thres=0.5) for batch in dataloader_test]

pred_labels = np.concatenate(batch_predictions, axis=0)
pred_labels

In [15]:
# Write the submission file: one row per test review, in test-file order.
# The ids come from the same 'data/test.csv' that the test texts were built from,
# so the two cannot drift apart. Only the id column is read.
test_review_ids = pd.read_csv('data/test.csv', usecols=['review_id'])['review_id'].values
assert len(test_review_ids) == len(pred_labels), "predictions do not line up with test rows"

pd.DataFrame({
    'review_id': test_review_ids,
    'target': pred_labels
}).to_csv('answers2.csv', index=False)