In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import fasttext
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
import torch.functional as F
from torch import nn
import torchmetrics
import pytorch_lightning as pl

from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
# Pretrained Navec word embeddings. `navec.get(word)` is used later both to decide
# which words enter the vocabulary and to fill the embedding matrix.
from navec import Navec
path = "data/navec_hudlit_v1_12B_500K_300d_100q.tar"
navec = Navec.load(path)

# Optional fastText model for synonym lookup (currently disabled).
# fasttext_model = fasttext.load_model("data/cc.ru.300.bin")

# Russian stopword list; these words are skipped when the vocabulary is built.
# NOTE(review): requires the NLTK "stopwords" corpus to be available locally — confirm it is downloaded.
stopwords = nltk.corpus.stopwords.words('russian')

In [3]:
# %%time
# synonyms = model.get_nearest_neighbors("********", k=10)
# synonyms = [i[1] for i in synonyms]

## Load data

In [4]:
# Raw competition tables: the training set (carries the `target` column used below),
# the test set and the submission template whose `target` column is filled at the end.
# NOTE(review): the test frame has one column fewer than train — presumably `target`; confirm.
train = pd.read_csv("data/HeadHunter_train.csv")
test = pd.read_csv("data/HeadHunter_test.csv")
sample_submission = pd.read_csv("data/HeadHunter_sample_submit.csv")

# Quick sanity check of the loaded frame sizes.
print(f"Train shape: {train.shape} | Test shape: {test.shape}")

Train shape: (50876, 12) | Test shape: (50651, 11)


## Utils

In [5]:
# NOTE(review): SENT_SIZE is not referenced in the cells shown here (datasets are built
# with explicit sent_size values); "q_95" presumably means the 95th percentile of the
# token count per text — confirm before relying on it.
SENT_SIZE = 124 # q_95
# NOTE(review): not referenced in the cells shown here; the value equals the number of rating columns.
META_SIZE = 6
# Input width of the metadata branch of the model: 6 rating columns + 4 features
# derived from the token sequences in CustomDataset.__getitem__.
METADATA_SIZE = 10
# Dimensionality of the word vectors (columns of the embedding matrix, LSTM input size).
VEC_SIZE = 300
# Batch size of the default train / val / full-train dataloaders.
BATCH_SIZE = 64

## Dataloaders

In [6]:
# Preprocessing
# Missing text / categorical fields are replaced by the "<unk>" token, then both
# review texts are lower-cased. The same treatment is applied to train and test.
nan_fill_values = {"city":"<unk>", "position":"<unk>", "positive":"<unk>", "negative":"<unk>"}
for frame in (train, test):
    frame.fillna(value=nan_fill_values, inplace=True)
    for review_column in ("positive", "negative"):
        frame[review_column] = frame[review_column].str.lower()

# # One Hot
# concat_temp = pd.concat((train, test))
# metadata_columns = ["salary_rating", "team_rating", "managment_rating",
#                     "career_rating", "workplace_rating", "rest_recovery_rating"]
# concat_temp = pd.get_dummies(concat_temp, columns=metadata_columns)
# dummies_columns = [i for i in concat_temp.columns if len([j for j in metadata_columns if j in i]) != 0]
# train = concat_temp.loc[concat_temp["target"].notna()]
# test = concat_temp.loc[concat_temp["target"].isna()]

# Standardise the rating columns: statistics are estimated on train only and
# then applied unchanged to both train and test.
scaler_columns = ["salary_rating", "team_rating", "managment_rating",
                  "career_rating", "workplace_rating", "rest_recovery_rating"]
scaler = StandardScaler()
scaler.fit(train[scaler_columns])
train[scaler_columns] = scaler.transform(train[scaler_columns])
test[scaler_columns] = scaler.transform(test[scaler_columns])


def _target_to_multi_hot(raw_target):
    """Convert a comma-separated label string (e.g. "0,8") into a 9-element 0/1 list."""
    return [1 if str(class_id) in raw_target.split(",") else 0 for class_id in range(9)]


# Multi-hot encoding of the comma-separated target labels.
train["preprocessed_target"] = train["target"].apply(_target_to_multi_hot)

# Positional index on both frames.
for frame in (train, test):
    frame.reset_index(drop=True, inplace=True)

In [7]:
%%time
# vocab
# Tokens are runs of lower-case Cyrillic or Latin letters, plus the literal "<unk>" marker.
tokenizer = nltk.RegexpTokenizer(r"[а-я]+|<unk>|[a-z]+")
word2idx = {"<pad>":0, "<unk>":1}
# Set copy of the stopword list: membership is tested once per token.
stopword_set = set(stopwords)

# create vocab
# A word enters the vocabulary when it is not a stopword and Navec has a vector for it.
# The next free index is always len(word2idx), so indices stay unique and contiguous
# (the previous version reused `idx` as the enumerate() loop variable, which reset the
# counter on every sentence and handed the same index to different words).
for text_column in ["positive", "negative"]:
    for sent in train[text_column].values:
        for word in tokenizer.tokenize(sent):
            if word in word2idx or word in stopword_set:
                continue
            if navec.get(word) is not None:
                word2idx[word] = len(word2idx)

# idx2word: inverse mapping, index -> word
idx2word = {index: word for word, index in word2idx.items()}

CPU times: user 6 s, sys: 24.2 ms, total: 6.02 s
Wall time: 6.02 s


In [8]:
# split
# Hold out 10% of the labelled rows for validation. The seed is fixed so that the
# partition (and every metric / hyperparameter tuned on it) is reproducible across runs.
# NOTE: this cell rebinds `train`; re-running it without re-running the cells above
# would split the already reduced training frame again.
train, val = train_test_split(train, test_size=0.1, shuffle=True, random_state=42)
train.reset_index(drop=True, inplace=True)
val.reset_index(drop=True, inplace=True)

print(f"Train Shape: {train.shape}, Val Shape: {val.shape}")

Train Shape: (45788, 13), Val Shape: (5088, 13)


In [9]:
# Dataset
class CustomDataset(Dataset):
    """Dataset turning review rows into padded token ids plus numeric features.

    Each item is ``(tokens_positive, tokens_negative, metadata[, target])``:

    * ``tokens_*``: ``LongTensor`` of length ``sent_size``. The text is tokenized,
      mapped through the vocabulary (unknown words become ``<unk>``), truncated to
      the first ``sent_size`` tokens and left-padded with ``<pad>``.
    * ``metadata``: ``FloatTensor`` holding the rating columns of the row followed
      by four features derived from the token sequences (see ``__getitem__``).
    * ``target``: ``FloatTensor`` built from ``preprocessed_target``; returned only
      when ``train_mode`` is true.

    Args:
        df: frame with ``positive``, ``negative``, ``city``, ``position``, the rating
            columns and, in train mode, ``preprocessed_target``.
        sent_size: fixed length of every encoded text.
        train_mode: whether targets are read and returned.
        vocab: optional ``word -> index`` mapping containing ``"<pad>"`` and
            ``"<unk>"``; defaults to the notebook-level ``word2idx``.
        tokenizer: optional object with a ``tokenize(str) -> list[str]`` method;
            defaults to a regexp tokenizer using ``TOKEN_PATTERN``.
    """

    # Same pattern as the tokenizer the vocabulary was built with, so texts are split
    # into exactly the tokens that can appear in `word2idx`.
    TOKEN_PATTERN = r"[а-я]+|<unk>|[a-z]+"

    def __init__(self, df, sent_size, train_mode, vocab=None, tokenizer=None):
        # utils
        rating_columns = ["salary_rating", "team_rating", "managment_rating",
                          "career_rating", "workplace_rating", "rest_recovery_rating"]
        # Substring match so that one-hot expanded columns (e.g. "team_rating_3") are kept too.
        metadata_columns = [col for col in df.columns if any(name in col for name in rating_columns)]

        # The dataset owns its tokenizer and vocabulary instead of silently reading
        # notebook globals inside __getitem__.
        self.tokenizer = tokenizer if tokenizer is not None else nltk.RegexpTokenizer(self.TOKEN_PATTERN)
        self.vocab = vocab if vocab is not None else word2idx

        # utils
        self.train_mode = train_mode
        self.sent_size = sent_size

        # init features
        self.positive = df["positive"].values
        self.negative = df["negative"].values
        self.cities = df["city"].values
        self.position = df["position"].values
        self.metadata = df[metadata_columns].values
        if self.train_mode:
            self.target = df["preprocessed_target"].values

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.positive)

    def _encode(self, text):
        """Tokenize `text` and return a left-padded LongTensor of length `sent_size`."""
        unk_idx = self.vocab["<unk>"]
        token_ids = [self.vocab.get(word, unk_idx) for word in self.tokenizer.tokenize(text)]
        token_ids = token_ids[:self.sent_size]
        # Explicit int64 buffer: a text without any token still yields an integer tensor.
        encoded = np.full(self.sent_size, self.vocab["<pad>"], dtype=np.int64)
        if token_ids:
            encoded[-len(token_ids):] = token_ids
        return torch.from_numpy(encoded)

    def __getitem__(self, idx):
        """Return the encoded texts, the metadata vector and (in train mode) the target."""
        tokens_positive = self._encode(self.positive[idx])
        tokens_negative = self._encode(self.negative[idx])

        # Metadata = rating columns of the row, followed by:
        # 1) fraction of positions in the positive text that are not <pad> (normalised length)
        # 2) the same for the negative text
        # 3) fraction of positions in the positive text that are not <unk>
        #    (padding positions count as "not <unk>")
        # 4) the same for the negative text
        pad_idx, unk_idx = self.vocab["<pad>"], self.vocab["<unk>"]
        metadata = self.metadata[idx].tolist()
        for special_idx in (pad_idx, unk_idx):
            for tokens in (tokens_positive, tokens_negative):
                metadata.append((tokens != special_idx).sum().item() / self.sent_size)

        if self.train_mode:
            target = self.target[idx]
            return tokens_positive, tokens_negative, torch.FloatTensor(metadata), torch.FloatTensor(target)
        return tokens_positive, tokens_negative, torch.FloatTensor(metadata)

In [10]:
# Datasets: every split is encoded with the same fixed sequence length.
default_sent_size = 112
dataset_train, dataset_val, dataset_fulltrain = (
    CustomDataset(frame, sent_size=default_sent_size, train_mode=True)
    for frame in (train, val, pd.concat((train, val)))
)
dataset_test = CustomDataset(test, sent_size=default_sent_size, train_mode=False)

# Dataloaders: labelled splits are shuffled and batched with BATCH_SIZE;
# the test loader keeps row order and yields one row per batch.
dataloader_train, dataloader_val, dataloader_fulltrain = (
    DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    for dataset in (dataset_train, dataset_val, dataset_fulltrain)
)
dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False)

In [11]:
# Pull a single batch from the training loader to inspect the tensors by hand.
tokens_positive, tokens_negative, metadata, target = next(iter(dataloader_train))

## Model

In [12]:
# emb weights
vocab = list(word2idx.keys())
# One row per vocabulary index. The matrix is sized from the largest index and every
# vector is written to the row of the word's own index, so the embedding lookup of
# token id `i` always returns the vector of the word mapped to `i`.
emb_weights = np.zeros((max(word2idx.values()) + 1, VEC_SIZE))
for word, word_idx in word2idx.items():
    word_emb = navec.get(word)
    # Check before assigning: a missing vector must fail loudly, not be written into the matrix.
    assert word_emb is not None, f"no Navec embedding for vocabulary word {word!r}"
    emb_weights[word_idx] = word_emb

def create_emb_layer(emb_weights, train_embed=False):
    """
    Wrap a pretrained weight matrix in an ``nn.Embedding`` layer.

    Args:
        emb_weights: 2-D numpy array, one row per vocabulary index.
        train_embed: if true the embedding weights receive gradients,
            otherwise they are frozen.

    Returns:
        ``(emb_layer, num_embeddings, embedding_dim)`` where the two sizes are
        the row and column counts of ``emb_weights``.
    """
    vocab_size, vector_dim = emb_weights.shape
    pretrained = torch.from_numpy(emb_weights)
    # `freeze` is the inverse of "trainable".
    emb_layer = nn.Embedding.from_pretrained(pretrained, freeze=not train_embed)
    return emb_layer, vocab_size, vector_dim

In [13]:
class WordAttention(nn.Module):
    """Attention pooling that collapses dim 1 of a 3-D input into a single vector.

    A tanh-activated linear layer scores every position, a bias-free linear layer
    reduces each score vector to one scalar, and a softmax over dim 1 turns the
    scalars into weights used for a weighted sum of the inputs.
    """

    def __init__(self, hidden_dim):
        super().__init__()

        self.attention = nn.Linear(hidden_dim, hidden_dim)
        self.context = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, x):
        """Return ``(weights, pooled)``.

        ``weights`` are the attention weights with the last two axes swapped and
        ``pooled`` is the weighted sum of ``x`` over dim 1.
        # assumes x is (batch, seq, hidden_dim) — TODO confirm against the caller
        """
        scores = self.context(torch.tanh(self.attention(x)))
        weights = torch.softmax(scores, dim=1)
        pooled = torch.sum(weights * x, dim=1)

        return weights.permute(0, 2, 1), pooled

In [14]:
class LSTMModel(pl.LightningModule):
    """Two-branch LSTM + attention multi-label classifier for review texts.

    The positive and the negative text each go through their own embedding layer,
    dropout, LSTM and ``WordAttention`` pooling. The two pooled vectors and the
    metadata vector are each projected to ``linear_size``, concatenated, passed
    through ReLU and a final linear layer with 9 sigmoid outputs (one probability
    per class).

    Args:
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight decay.
        hidden_size: LSTM hidden size per direction.
        bidirectional: whether both LSTMs are bidirectional.
        linear_size: width of each of the three projection layers.
        dropout_rate: dropout probability applied to the embeddings.
    """

    def __init__(self, learning_rate=1e-3, weight_decay=1e-8, hidden_size=4, bidirectional=True, linear_size=512,
                 dropout_rate=0.2):
        super().__init__()
        # save hyperparameters
        self.save_hyperparameters()
        # utils
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.linear_size = linear_size
        self.dropout_rate = dropout_rate
        # metrics
        self.metric_accuracy = torchmetrics.Accuracy()
        self.metric_f1 = torchmetrics.F1(num_classes=9, average="samples")
        # per-epoch logs (mean over the batches of the epoch)
        self.train_accuracy_log, self.train_f1_log, self.train_loss_log = [], [], []
        self.val_accuracy_log, self.val_f1_log, self.val_loss_log = [], [], []

        # model: separate, trainable embeddings for the two texts
        self.emb_layer_positive, _, self.embedding_dim = create_emb_layer(emb_weights, train_embed=True)
        self.emb_layer_negative, _, self.embedding_dim = create_emb_layer(emb_weights, train_embed=True)

        # Single-layer LSTMs. The LSTM `dropout` argument only acts between stacked
        # layers, so it had no effect here and was dropped; regularisation comes from
        # `self.dropout` on the embeddings.
        self.lstm_layer_positive = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_size,
                                           bidirectional=self.bidirectional, batch_first=True)
        self.lstm_layer_negative = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_size,
                                           bidirectional=self.bidirectional, batch_first=True)

        # LSTM output width: hidden_size for one direction, 2 * hidden_size for two.
        lstm_out_size = self.hidden_size * (self.bidirectional + 1)
        self.attention_positive = WordAttention(lstm_out_size)
        self.attention_negative = WordAttention(lstm_out_size)

        self.linear1_positive = nn.Linear(lstm_out_size, self.linear_size)
        self.linear1_negative = nn.Linear(lstm_out_size, self.linear_size)
        self.linear1_metadata = nn.Linear(METADATA_SIZE, self.linear_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(3*self.linear_size, 9)

        # extra utils
        self.dropout = nn.Dropout(self.dropout_rate)

    def forward(self, tokens_positive, tokens_negative, metadata):
        """Return per-class probabilities (sigmoid outputs) with 9 columns per row."""
        # embeddings + dropout
        emb_positive = self.dropout(self.emb_layer_positive(tokens_positive))
        emb_negative = self.dropout(self.emb_layer_negative(tokens_negative))

        # lstm; `.float()` because the embedding weights come from a float64 numpy array
        lstm_out_positive, _ = self.lstm_layer_positive(emb_positive.float())
        lstm_out_negative, _ = self.lstm_layer_negative(emb_negative.float())

        # attention pooling over the sequence
        _, out_positive = self.attention_positive(lstm_out_positive)
        _, out_negative = self.attention_negative(lstm_out_negative)

        # fc
        x_positive = self.linear1_positive(out_positive)
        x_negative = self.linear1_negative(out_negative)
        x_metadata = self.linear1_metadata(metadata)
        x = torch.cat((x_positive, x_negative, x_metadata), dim=1)
        x = self.relu(x)
        x = self.linear2(x)

        return torch.sigmoid(x)

    def configure_optimizers(self):
        """AdamW plus a plateau scheduler driven by the validation F1."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate,
                                     weight_decay=self.weight_decay)
        # `val_f1` is a score to maximise. With the scheduler's default mode="min" the
        # learning rate would be reduced whenever F1 keeps improving, so mode must be "max".
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", patience=2, verbose=False)
        return {"optimizer":optimizer,
                "lr_scheduler" : {"scheduler" : scheduler, "monitor": "val_f1"}
               }

    def _shared_step(self, batch, stage):
        """Run one labelled batch, log `<stage>_loss/accuracy/f1` and return the metrics dict."""
        tokens_positive, tokens_negative, metadata, y = batch
        out = self(tokens_positive, tokens_negative, metadata)
        loss = torch.nn.BCELoss()(out, y)
        accuracy = self.metric_accuracy(out, y.int())
        f1 = self.metric_f1(out, y.int())

        # save logs
        self.log(f"{stage}_loss", loss, prog_bar=True)
        self.log(f"{stage}_accuracy", accuracy, prog_bar=True)
        self.log(f"{stage}_f1", f1, prog_bar=True)

        return {"loss": loss, "accuracy": accuracy, "F1":f1}

    def training_step(self, batch, batch_idx):
        """Training batch: returns the loss together with accuracy and F1."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        """Validation batch: same metrics as training, plus `hp_metric` (validation F1)."""
        metrics = self._shared_step(batch, "val")
        self.log("hp_metric", metrics["F1"])
        return metrics

    @staticmethod
    def _epoch_mean(outs, key):
        """Mean of a scalar metric over the per-batch outputs of one epoch."""
        return np.mean([batch_out[key].item() for batch_out in outs])

    def training_epoch_end(self, outs):
        # log epoch metric
        self.train_loss_log.append(self._epoch_mean(outs, "loss"))
        self.train_accuracy_log.append(self._epoch_mean(outs, "accuracy"))
        self.train_f1_log.append(self._epoch_mean(outs, "F1"))

    def validation_epoch_end(self, outs):
        # log epoch metric
        self.val_loss_log.append(self._epoch_mean(outs, "loss"))
        self.val_accuracy_log.append(self._epoch_mean(outs, "accuracy"))
        self.val_f1_log.append(self._epoch_mean(outs, "F1"))

    def predict_step(self, batch, batch_idx):
        """Unlabelled batch: return the predicted class probabilities."""
        tokens_positive, tokens_negative, metadata = batch
        out = self(tokens_positive, tokens_negative, metadata)

        return out

### Hyperopt

In [15]:
import optuna

In [16]:
def objective(trial: optuna.trial.Trial):
    """Optuna objective: train one `LSTMModel` configuration and return its validation F1.

    Samples the optimiser, model and data hyperparameters from `trial`, trains for at
    most 15 epochs with early stopping on `val_f1`, and returns the `val_f1` value
    recorded by the trainer at the end of training (to be maximised).
    """
    # params
    # Learning rate and weight decay span several orders of magnitude, so they are
    # sampled on a log scale; uniform sampling would almost never try the small values.
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-12, 1e-2, log=True)
    hidden_size = trial.suggest_int("hidden_size", 1, 64)
    bidirectional = trial.suggest_categorical("bidirectional", [True, False])
    linear_size = trial.suggest_int("linear_size", 112, 1024)
    dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.6)
    sent_size = trial.suggest_int("sent_size", 50, 150)
    batch_size = trial.suggest_int("batch_size", 4, 256)

    model_params = {"learning_rate":learning_rate, "weight_decay":weight_decay,
                    "hidden_size":hidden_size, "bidirectional":bidirectional,
                    "linear_size":linear_size, "dropout_rate":dropout_rate}
    # Everything that was sampled is logged, including the data-side parameters.
    hyperparameters = {**model_params, "sent_size":sent_size, "batch_size":batch_size}

    # data
    dataset_train = CustomDataset(train, sent_size=sent_size, train_mode=True)
    dataset_val = CustomDataset(val, sent_size=sent_size, train_mode=True)
    dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
    # Validation order does not need to be random; a fixed order keeps per-batch metrics comparable.
    dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

    # model
    lstm_model = LSTMModel(**model_params)
    lr_monitoring = pl.callbacks.LearningRateMonitor(logging_interval="epoch")
    early_stop_callback = pl.callbacks.early_stopping.EarlyStopping(monitor="val_f1", min_delta=0.00, patience=4, verbose=False, mode="max")
    logger = pl.loggers.TensorBoardLogger(save_dir="logs", name="lstm_attention")

    # train
    # NOTE(review): `auto_lr_find=True` presumably only takes effect through `trainer.tune()`,
    # which is not called here — confirm against the installed Lightning version.
    trainer = pl.Trainer(gpus=1, max_epochs=15, logger=logger, callbacks=[lr_monitoring, early_stop_callback],
                         default_root_dir="data/", auto_lr_find=True, weights_summary=None)
    trainer.logger.log_hyperparams(hyperparameters)
    trainer.fit(lstm_model, dataloader_train, dataloader_val)

    return trainer.callback_metrics["val_f1"].item()

In [42]:
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=100, timeout=2*60*60)

In [19]:
# Best hyperparameters found by the Optuna search. `study` only exists when the
# (currently commented-out) search cell was executed in this kernel, so show nothing
# instead of raising NameError on a fresh Restart & Run All.
study.best_params if "study" in globals() else None

{'learning_rate': 0.0002513194765987834,
 'weight_decay': 0.00023611620695341944,
 'hidden_size': 55,
 'bidirectional': True,
 'linear_size': 1012,
 'dropout_rate': 0.5950975603765589,
 'sent_size': 56,
 'batch_size': 38}

### Train model

In [21]:
%%time
# params
# Best configuration found by the hyperparameter search (copied from `study.best_params`).
best_params = {'learning_rate': 0.0002513194765987834, 'weight_decay': 0.00023611620695341944,
               'hidden_size': 55, 'bidirectional': True, 'linear_size': 1012,
               'dropout_rate': 0.5950975603765589}

sent_size = 56
batch_size = 38

# data
dataset_train = CustomDataset(train, sent_size=sent_size, train_mode=True)
dataset_val = CustomDataset(val, sent_size=sent_size, train_mode=True)
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
# Validation data is evaluated in a fixed order; shuffling it serves no purpose.
dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

# create model
lstm_model = LSTMModel(**best_params) # learning_rate, weight_decay, hidden_size, bidirectional, linear_size, dropout_rate)
checkpoint = pl.callbacks.ModelCheckpoint(monitor="val_loss", mode = "min", dirpath="data/", filename="bilstm")
lr_monitoring = pl.callbacks.LearningRateMonitor(logging_interval="epoch")
early_stop_callback = pl.callbacks.early_stopping.EarlyStopping(monitor="val_f1", min_delta=0.00, patience=3, verbose=False, mode="max")
logger = pl.loggers.TensorBoardLogger(save_dir="logs", name="lstm_attention")

# train
# The checkpoint callback must be registered with the trainer, otherwise the weights
# with the best validation loss are never written to disk.
# NOTE(review): `auto_lr_find=True` presumably only takes effect through `trainer.tune()`,
# which is not called here — confirm against the installed Lightning version.
trainer = pl.Trainer(gpus=1, max_epochs=15, logger=logger,
                     callbacks=[checkpoint, lr_monitoring, early_stop_callback],
                     default_root_dir="data/", auto_lr_find=True)
trainer.fit(lstm_model, dataloader_train, dataloader_val)

# save model (weights as of the last trained epoch)
trainer.save_checkpoint("data/models/BILstm_attention.ckpt")

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                | Type          | Params
-------------------------------------------------------
0  | metric_accuracy     | Accuracy      | 0     
1  | metric_f1           | F1            | 0     
2  | emb_layer_positive  | Embedding     | 15.6 M
3  | emb_layer_negative  | Embedding     | 15.6 M
4  | lstm_layer_positive | LSTM          | 157 K 
5  | lstm_layer_negative | LSTM          | 157 K 
6  | attention_positive  | WordAttention | 12.3 K
7  | attention_negative  | WordAttention | 12.3 K
8  | linear1_positive    | Linear        | 112 K 
9  | linear1_negative    | Linear        | 112 K 
10 | linear1_metadata    | Linear        | 11.1 K
11 | relu                | ReLU          | 0     
12 | linear2             | Linear        | 27.3 K
13 | dropout             | Dropout       | 0     
------------------------------------------

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

CPU times: user 7min 16s, sys: 8.73 s, total: 7min 25s
Wall time: 7min 15s


## Preds

In [22]:
%%time
# The model was trained on texts encoded with `sent_size`, while the `dataloader_test`
# built earlier uses sent_size=112; that changes both the truncation and the
# length-normalised metadata features. Re-encode the test set with the training
# length before predicting.
# batch_size stays 1 because the submission cell reads exactly one row per prediction.
dataset_test = CustomDataset(test, sent_size=sent_size, train_mode=False)
dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False)
preds = trainer.predict(lstm_model, dataloader_test)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 1205it [00:00, ?it/s]

CPU times: user 2min 44s, sys: 314 ms, total: 2min 44s
Wall time: 2min 44s


In [28]:
%%time
# save test preds
def labels_from_probs(probs, thresholds, default="0"):
    """Turn one row of class probabilities into a comma-separated label string.

    The thresholds are tried in order and the first one that selects at least one
    class wins. If none selects anything, the row is printed for inspection and
    `default` is returned.
    """
    for thresh in thresholds:
        selected = torch.where(probs > thresh)[0].tolist()
        if selected:
            return ",".join(str(class_id) for class_id in selected)
    print(probs)
    return default


# Fallback cascade of thresholds (currently all equal, so only the first one matters).
thresh1, thresh2, thresh3 = 0.2, 0.2, 0.2

submit = []
for pred in tqdm(preds):
    # `pred` is one batch of probabilities (one row per sample). Iterating over the rows
    # keeps every sample even if the test loader uses a batch size larger than 1.
    for row in pred.detach():
        submit.append(labels_from_probs(row, (thresh1, thresh2, thresh3)))

assert len(submit) == len(sample_submission), "number of predictions does not match the submission template"
sample_submission["target"] = submit
sample_submission.to_csv("data/submissions/submission_lstm_with_attention.csv", index=False)

  0%|          | 0/50651 [00:00<?, ?it/s]

tensor([[1.4453e-04, 4.4036e-02, 6.7410e-10, 1.3379e-01, 6.6564e-05, 3.8361e-04,
         4.5827e-04, 6.5299e-05, 1.3423e-01]])
tensor([[3.7928e-04, 3.5203e-03, 9.6458e-10, 1.9976e-01, 3.5124e-05, 1.5491e-04,
         2.6519e-02, 3.0001e-05, 1.3648e-01]])
tensor([[1.4436e-03, 1.3448e-03, 1.7903e-08, 1.8679e-01, 3.6873e-05, 1.7914e-04,
         2.2573e-04, 6.0680e-05, 9.0712e-02]])
tensor([[1.8369e-01, 1.4236e-01, 1.3058e-05, 5.3578e-03, 2.3162e-03, 2.5184e-02,
         1.0690e-02, 2.3393e-03, 1.3418e-01]])
tensor([[2.4125e-04, 8.3625e-03, 2.5383e-09, 1.8364e-01, 5.6786e-05, 3.9037e-04,
         3.7014e-04, 1.3091e-04, 1.2621e-01]])
CPU times: user 2.2 s, sys: 19.9 ms, total: 2.22 s
Wall time: 579 ms
