In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torchmetrics
import pytorch_lightning as pl

from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
# !wget https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar -P data/

In [3]:
from navec import Navec
path = "data/navec_hudlit_v1_12B_500K_300d_100q.tar"
navec = Navec.load(path)

## Load data

In [4]:
train = pd.read_csv("data/HeadHunter_train.csv")
test = pd.read_csv("data/HeadHunter_test.csv")
sample_submission = pd.read_csv("data/HeadHunter_sample_submit.csv")

print(f"Train shape: {train.shape} | Test shape: {test.shape}")

Train shape: (50876, 12) | Test shape: (50651, 11)


## Utils

In [5]:
SENT_SIZE = 102
META_SIZE = 4
METADATA_SIZE = 31
VEC_SIZE = 300

## Dataloader

In [6]:
# NaNs preprocessing
train.fillna(value={"city":"<unk>", "position":"<unk>", "positive":"<unk>", "negative":"<unk>"}, inplace=True)
test.fillna(value={"city":"<unk>", "position":"<unk>", "positive":"<unk>", "negative":"<unk>"}, inplace=True)                                    

In [7]:
# Features preprocessing
# lower city and position
train[["city", "position", "positive", "negative"]] = train[["city", "position", "positive", "negative"]].apply(lambda x: x.str.lower())
test[["city", "position", "positive", "negative"]] = test[["city", "position", "positive", "negative"]].apply(lambda x: x.str.lower())

# standard scaler
# scaler = StandardScaler()
# scaler_columns = ["salary_rating", "team_rating", "managment_rating",
#                   "career_rating", "workplace_rating", "rest_recovery_rating"]
# train[scaler_columns] = scaler.fit_transform(train[scaler_columns])
# test[scaler_columns] = scaler.transform(test[scaler_columns])

# One Hot
concat_temp = pd.concat((train, test))
metadata_columns = ["salary_rating", "team_rating", "managment_rating",
                    "career_rating", "workplace_rating", "rest_recovery_rating"]
concat_temp = pd.get_dummies(concat_temp, columns=metadata_columns)
dummies_columns = [i for i in concat_temp.columns if len([j for j in metadata_columns if j in i]) != 0]
train = concat_temp.loc[concat_temp["target"].notna()]
test = concat_temp.loc[concat_temp["target"].isna()]

# target to single label
train["preprocessed_target"] = train["target"].str.split(",").apply(lambda x: x[0]).astype(int)

# reset index
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [8]:
%%time
# vocab
tokenizer = nltk.RegexpTokenizer(r"[а-я]+|<unk>|[a-z]+")
word2idx = {"<pad>":0, "<unk>":1}
idx = 2

# create vocab
for text_column in ["city", "position", "positive", "negative"]:
    text = train[text_column].values
    tokens = [tokenizer.tokenize(sent) for sent in text]
    text_column_sentences = []    
    for idx, sent in enumerate(tokens):        
        new_sent = []
        for word in sent:
            word_emb = navec.get(word)
            if word_emb is None:
                new_sent.append(word2idx["<unk>"])
            elif word not in word2idx:
                word2idx[word] = idx
                idx += 1
                new_sent.append(word2idx[word])
            else:
                new_sent.append(word2idx[word])
        # update text_column list
        if new_sent == []:
            new_sent = [word2idx["<unk>"]]
        text_column_sentences.append(new_sent)
    # update dataframe
    train[text_column] = text_column_sentences
    
# test preprocessing
for text_column in ["city", "position", "positive", "negative"]:
    text = test[text_column].values
    tokens = [tokenizer.tokenize(sent) for sent in text]
    text_column_sentences = []    
    for idx, sent in enumerate(tokens):        
        new_sent = []
        for word in sent:
            if word in word2idx:
                word_emb = word2idx[word]
            else:
                word_emb = word2idx["<unk>"]
            new_sent.append(word_emb)
        # update text_column list
        if new_sent == []:
            new_sent = [word2idx["<unk>"]]
        text_column_sentences.append(new_sent)
    # update dataframe
    test[text_column] = text_column_sentences
 
# emb weights
vocab = list(word2idx.keys())
emb_weights = np.zeros((len(vocab), VEC_SIZE))
                     
for idx, (word, word_idx) in enumerate(word2idx.items()):
    emb_weights[idx] = navec.get(word)

CPU times: user 9.64 s, sys: 15.8 ms, total: 9.66 s
Wall time: 9.66 s


In [9]:
# split
train, val = train_test_split(train, test_size=0.1, stratify=train["preprocessed_target"], shuffle=True)
train.reset_index(drop=True, inplace=True)
val.reset_index(drop=True, inplace=True)

print(f"Train Shape: {train.shape}, Val Shape: {val.shape}")

Train Shape: (45788, 38), Val Shape: (5088, 38)


In [10]:
# Dataset
class CustomDataset(Dataset):
    def __init__(self, df, train_mode):
        # utils
        metadata_columns = ["salary_rating", "team_rating", "managment_rating",
                            "career_rating", "workplace_rating", "rest_recovery_rating"]
        # for one hot
        metadata_columns = [i for i in df.columns if len([j for j in metadata_columns if j in i]) != 0]
        self.tokenizer = nltk.RegexpTokenizer(r"[а-я]+|<unk>|<pad>")
        self.train_mode = train_mode
        
        # init features
        self.positive = df["positive"].values
        self.negative = df["negative"].values
        self.cities = df["city"].values
        self.position = df["position"].values
        self.metadata = df[metadata_columns].values
        if self.train_mode:
            self.target = df["preprocessed_target"].values
    
    def __len__(self):
        return len(self.positive)
    
    def __getitem__(self, idx):
        # get sent
        tokens_positive, tokens_negative = self.positive[idx], self.negative[idx]
        tokens_city, tokens_position = self.cities[idx], self.position[idx]
        metadata = self.metadata[idx]
        """
        For text:

        """
        # padding
        tokens_positive = np.pad(tokens_positive[:SENT_SIZE],
                                 pad_width=(max(0, SENT_SIZE - len(tokens_positive)), 0), 
                                 constant_values=(word2idx["<pad>"], word2idx["<pad>"]))
        tokens_negative = np.pad(tokens_negative[:SENT_SIZE],
                                 pad_width=(max(0, SENT_SIZE - len(tokens_negative)), 0), 
                                 constant_values=(word2idx["<pad>"], word2idx["<pad>"]))   
        tokens_city = np.pad(tokens_city[:META_SIZE],
                             pad_width=(max(0, META_SIZE - len(tokens_city)), 0), 
                             constant_values=(word2idx["<pad>"], word2idx["<pad>"]))
        tokens_position = np.pad(tokens_position[:META_SIZE],
                                   pad_width=(max(0, META_SIZE - len(tokens_position)), 0), 
                                   constant_values=(word2idx["<pad>"], word2idx["<pad>"]))
        # stack tokens
        tokens_positive = np.stack(tokens_positive)
        tokens_negative = np.stack(tokens_negative)
        tokens_city = np.stack(tokens_city)
        tokens_position = np.stack(tokens_position)
        # cnvert tokens 2 Long
        tokens_positive = torch.LongTensor(tokens_positive)
        tokens_negative = torch.LongTensor(tokens_negative)
        tokens_meta = torch.cat((torch.LongTensor(tokens_city),
                                 torch.LongTensor(tokens_position)), dim=0)
        
        """
        For target
        """        
        if self.train_mode:
            target = self.target[idx]
            return tokens_positive, tokens_negative, tokens_meta, torch.FloatTensor(metadata), target
        else:
            return tokens_positive, tokens_negative, tokens_meta, torch.FloatTensor(metadata)

In [11]:
# create datasets
dataset_train = CustomDataset(train, train_mode=True)
dataset_val = CustomDataset(val, train_mode=True)
dataset_test = CustomDataset(test, train_mode=False)
dataset_fulltrain = CustomDataset(pd.concat((train, val)), train_mode=True)

# create dataloaders
dataloader_train = DataLoader(dataset_train, batch_size=256, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=256, shuffle=True)
dataloader_fulltrain = DataLoader(dataset_fulltrain, batch_size=256, shuffle=True)
dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False)

In [12]:
for tokens_positive, tokens_negative, tokens_meta, metadata, target in dataloader_train:
    break

## Model

In [17]:
def create_emb_layer(emb_weights, train_embed=False):
    """
    Create embeddings
    """
    num_embeddings, embedding_dim = emb_weights.shape
    emb_layer = nn.Embedding.from_pretrained(torch.from_numpy(emb_weights))
    emb_layer.weight.requires_grad = train_embed

    return emb_layer, num_embeddings, embedding_dim

In [18]:
class LSTMModel(pl.LightningModule,):
    def __init__(self, hidden_size=4, bidirectional=True):
        super().__init__()
        # utils
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.metric_accuracy = torchmetrics.Accuracy()
        self.metric_f1 = torchmetrics.F1(num_classes=9, average="macro")
        
        # logs
        self.train_accuracy_log, self.train_f1_log, self.train_loss_log = [], [], []
        self.val_accuracy_log, self.val_f1_log, self.val_loss_log = [], [], []
        
        # model
        self.emb_layer_positive, _, self.embedding_dim = create_emb_layer(emb_weights, train_embed=True)
        self.emb_layer_negative, _, self.embedding_dim = create_emb_layer(emb_weights, train_embed=True)
        self.emb_layer_meta, _, self.embedding_dim = create_emb_layer(emb_weights, train_embed=True)
        
        self.lstm_layer_positive = nn.LSTM(input_size=VEC_SIZE, hidden_size=self.hidden_size,
                                           bidirectional=self.bidirectional, dropout=0.2)
        self.lstm_layer_negative = nn.LSTM(input_size=VEC_SIZE, hidden_size=self.hidden_size,
                                           bidirectional=self.bidirectional, dropout=0.2)
        self.lstm_layer_meta = nn.LSTM(input_size=VEC_SIZE, hidden_size=self.hidden_size,
                                       bidirectional=self.bidirectional, dropout=0.2)
        
        self.linear1_positive = nn.Linear(SENT_SIZE*self.hidden_size*(self.bidirectional+1), 256)
        self.linear1_negative = nn.Linear(SENT_SIZE*self.hidden_size*(self.bidirectional+1), 256)
        self.linear1_meta = nn.Linear(2*META_SIZE*self.hidden_size*(self.bidirectional+1), 256)
        self.linear1_metadata = nn.Linear(METADATA_SIZE, 256)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(4*256, 9) 
        
        # extra utils
        self.dropout = nn.Dropout(0.3)
        self.batchnorm_positive = nn.BatchNorm1d(SENT_SIZE)
        self.batchnorm_negative = nn.BatchNorm1d(SENT_SIZE)
        self.batchnorm_meta = nn.BatchNorm1d(2*META_SIZE)
        
    def forward(self, tokens_positive, tokens_negative, tokens_meta, metadata):        
        # embeddings
        emb_positive = self.emb_layer_positive(tokens_positive)
        emb_negative = self.emb_layer_negative(tokens_negative)
        emb_meta = self.emb_layer_meta(tokens_meta)        
        
        # dropout + batchnorm
        emb_positive = self.dropout(emb_positive)
        emb_negative = self.dropout(emb_negative)
        emb_meta = self.dropout(emb_meta)
        
        emb_positive = self.batchnorm_positive(emb_positive.float())
        emb_negative = self.batchnorm_negative(emb_negative.float())
        emb_meta = self.batchnorm_meta(emb_meta.float())
           
        # lstm
        lstm_out_positive, (h_n, c_n) = self.lstm_layer_positive(emb_positive.float())
        lstm_out_negative, (h_n, c_n) = self.lstm_layer_negative(emb_negative.float())
        lstm_out_meta, (h_n, c_n) = self.lstm_layer_meta(emb_meta.float())
                
        # reshape
        fc_input_positive = torch.reshape(lstm_out_positive,
                                          (lstm_out_positive.shape[0],
                                           SENT_SIZE*self.hidden_size*(self.bidirectional+1)))
        fc_input_negative = torch.reshape(lstm_out_negative,
                                          (lstm_out_negative.shape[0],
                                           SENT_SIZE*self.hidden_size*(self.bidirectional+1)))
        fc_input_meta = torch.reshape(lstm_out_meta, 
                                      (lstm_out_meta.shape[0],
                                       lstm_out_meta.shape[1]*lstm_out_meta.shape[2]))
        
        # fc
        x_positive = self.linear1_positive(fc_input_positive)
        x_negative = self.linear1_negative(fc_input_negative)
        x_meta = self.linear1_meta(fc_input_meta)
        x_metadata = self.linear1_metadata(metadata)
        x = torch.cat((x_positive, x_negative, x_meta, x_metadata), dim=1)
        x = self.relu(x)
        x = self.linear2(x)
        
        return x

    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3, weight_decay=1e-4)
        sch = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
        #learning rate scheduler
        return {"optimizer":optimizer,
                "lr_scheduler" : {"scheduler" : sch}
               }
    
    def training_step(self, batch, batch_idx):
        tokens_positive, tokens_negative, tokens_meta, metadata, y = batch
        out = self(tokens_positive, tokens_negative, tokens_meta, metadata)
        loss = torch.nn.CrossEntropyLoss()(out, y)
        accuracy = self.metric_accuracy(out, y)
        f1 = self.metric_f1(out, y)
        
        # save logs
        self.log("train_loss", loss, prog_bar=True)
        self.log("train_accuracy", accuracy, prog_bar=True)
        self.log("train_f1", f1, prog_bar=True)

        return {"loss": loss, "accuracy": accuracy, "F1":f1}
    
    def validation_step(self, batch, batch_idx):
        tokens_positive, tokens_negative, tokens_meta, metadata, y = batch
        out = self(tokens_positive, tokens_negative, tokens_meta, metadata)
        loss = torch.nn.CrossEntropyLoss()(out, y)
        accuracy = self.metric_accuracy(out, y)
        f1 = self.metric_f1(out, y)
        
        # save logs
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_accuracy", accuracy, prog_bar=True)
        self.log("val_f1", f1, prog_bar=True)

        return {"loss": loss, "accuracy": accuracy, "F1":f1}
        
    def training_epoch_end(self, outs):
        # log epoch metric
        self.train_loss_log.append(np.mean([i["loss"].item() for i in outs]))
        self.train_accuracy_log.append(np.mean([i["accuracy"].cpu() for i in outs]))
        self.train_f1_log.append(np.mean([i["F1"].cpu() for i in outs]))


    def validation_epoch_end(self, outs):
        # log epoch metric
        self.val_loss_log.append(np.mean([i["loss"].item() for i in outs]))
        self.val_accuracy_log.append(np.mean([i["accuracy"].cpu() for i in outs]))
        self.val_f1_log.append(np.mean([i["F1"].cpu() for i in outs]))
        

### Train with split

In [19]:
%%time
lstm_model = LSTMModel()
checkpoint = pl.callbacks.ModelCheckpoint(monitor="val_loss", mode = "min", dirpath="data/", filename="bilstm")
logger = pl.loggers.TensorBoardLogger(save_dir="logs", name="lstm", version=1)

trainer = pl.Trainer(gpus=1, max_epochs=30, logger=logger,
                     default_root_dir="data/")
trainer.fit(lstm_model, dataloader_train, dataloader_val)

# save model
trainer.save_checkpoint("data/models/BILstm.ckpt")

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                | Type        | Params
-----------------------------------------------------
0  | metric_accuracy     | Accuracy    | 0     
1  | metric_f1           | F1          | 0     
2  | emb_layer_positive  | Embedding   | 16.2 M
3  | emb_layer_negative  | Embedding   | 16.2 M
4  | emb_layer_meta      | Embedding   | 16.2 M
5  | lstm_layer_positive | LSTM        | 9.8 K 
6  | lstm_layer_negative | LSTM        | 9.8 K 
7  | lstm_layer_meta     | LSTM        | 9.8 K 
8  | linear1_positive    | Linear      | 209 K 
9  | linear1_negative    | Linear      | 209 K 
10 | linear1_meta        | Linear      | 16.6 K
11 | linear1_metadata    | Linear      | 8.2 K 
12 | relu                | ReLU        | 0     
13 | linear2             | Linear      | 9.2 K 
14 | dropout             | Dropout     | 0     
15 | batchnorm_positive  |

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

CPU times: user 19min 2s, sys: 45.1 s, total: 19min 47s
Wall time: 19min 14s


In [22]:
# save model
trainer.save_checkpoint("data/models/BILstm.ckpt")

In [21]:
# final model
preds = []

for tokens_positive, tokens_negative, tokens_meta, metadata in tqdm(dataloader_test):
    pred = lstm_model(tokens_positive, tokens_negative, tokens_meta, metadata)
    pred_target = torch.argmax(pred).item()
    preds.append(pred_target)
    
sample_submission["target"] = preds
sample_submission.to_csv("data/submissions/submission_bilstm.csv", index=False)

  0%|          | 0/50651 [00:00<?, ?it/s]

In [26]:
sample_submission["target"].value_counts(normalize=True)

8    0.452745
0    0.406428
1    0.093443
3    0.022428
6    0.016525
7    0.003968
5    0.003672
4    0.000730
2    0.000059
Name: target, dtype: float64