In [1]:
import numpy as np
from tqdm.notebook import tqdm
import nltk
import pandas as pd
from glob import glob

from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel

import torch
from torch.utils.data import Dataset, DataLoader
import torch.functional as F
from torch import nn
import torchmetrics
import pytorch_lightning as pl

from warnings import filterwarnings
filterwarnings("ignore")

## LSTM

In [2]:
from model import LSTMModel, Model, CustomDataset

In [3]:
# Sequence length handed to CustomDataset via its `sent_size` argument.
sent_size = 112

# Load tokenizer
# Both tokenizers are passed to CustomDataset below, regardless of model_type.
tokenizer_bert = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
# Regex tokenizer: runs of lowercase Cyrillic letters in the range а-я, or the
# literal special tokens <unk> / <pad>.
# NOTE(review): the range [а-я] does not include "ё" - confirm this is intended.
tokenizer_lstm = nltk.RegexpTokenizer(r"[а-я]+|<unk>|<pad>")

# load test and sort
# Test files are named "<prefix>_<index>.npy". Split on the LAST underscore only,
# so a prefix (or directory) that itself contains "_" is still parsed correctly;
# a plain split("_") would keep just the first two pieces of such a path.
test_data = {}
for path in glob("data/augmentations/test/*.npy"):
    prefix, suffix = path.rsplit("_", 1)
    test_data[suffix] = prefix

# Rebuild the paths in numeric order 0..N-1 (glob order is arbitrary). A missing
# index raises KeyError here, which surfaces gaps in the test set early.
# Presumably this order must match the row order of the sample submission - verify.
sorted_test_data = [
    test_data[str(i) + ".npy"] + "_" + str(i) + ".npy"
    for i in range(len(test_data))
]

# data
# shuffle=False keeps the predictions in the sorted file order.
dataset_test = CustomDataset(
    sorted_test_data,
    tokenizer_bert,
    tokenizer_lstm,
    sent_size=sent_size,
    train_mode=False,
    model_type="lstm",
)
dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False)

In [4]:
# load model
# Checkpoint restored for this run. The other checkpoint previously tried in
# this cell was "data/models/Final_Model_lstm.ckpt".
lstm_checkpoint = "data/models/Final_Model_lstm_pseudo.ckpt"
model = LSTMModel.load_from_checkpoint(lstm_checkpoint)

# Single-GPU trainer, used only for inference in the next cell.
trainer = pl.Trainer(gpus=1)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [5]:
# preds
# Run inference over the whole test loader; the submission cell below iterates
# over `preds` one entry per batch.
preds = trainer.predict(model=model, dataloaders=dataloader_test)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [8]:
# Predicts
# Turn the per-batch model outputs into the submission format: one string per
# test row holding the comma-separated indices of the predicted classes.
sample_submission = pd.read_csv("data/HeadHunter_sample_submit.csv")

# One shared decision threshold, broadcast over every class column.
const = 0.2
thresholds = [const]
y_pred = []        # binarised predictions, one list of 0/1 flags per test row
submit_preds = []  # formatted target strings, one per test row
count_zero = 0     # rows for which no class cleared the threshold

for pred in tqdm(preds):
    # detach()/cpu() are no-ops for plain CPU tensors, but keep .numpy() from
    # failing if the outputs still live on the GPU or carry gradients.
    pred = (pred.detach().cpu().numpy() > thresholds).astype(int).tolist()
    y_pred.extend(pred)

    # Handle every row of the batch (not only pred[0]) so the cell stays correct
    # if the DataLoader batch size is raised above 1. enumerate() takes the class
    # count from the model output instead of a hard-coded 9.
    for row in pred:
        labels = [str(idx) for idx, flag in enumerate(row) if flag == 1]
        if labels:
            submit_preds.append(",".join(labels))
        else:
            # Nothing cleared the threshold: fall back to class "0".
            count_zero += 1
            submit_preds.append("0")

print(f"Zero forecasts: {count_zero}")
# pandas raises ValueError here if the number of predictions does not match the
# number of rows in the sample submission.
sample_submission["target"] = submit_preds
sample_submission.to_csv("data/submissions/final_lstm.csv", index=False)

  0%|          | 0/50651 [00:00<?, ?it/s]

Zero forecasts: 0


## BERT Not Retrained

In [2]:
from model import LSTMModel, Model, CustomDataset

In [3]:
# Sequence length handed to CustomDataset via its `sent_size` argument.
sent_size = 112

# Load tokenizer
# Both tokenizers are passed to CustomDataset below, regardless of model_type.
tokenizer_bert = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
# Regex tokenizer: runs of lowercase Cyrillic letters in the range а-я, or the
# literal special tokens <unk> / <pad>.
# NOTE(review): the range [а-я] does not include "ё" - confirm this is intended.
tokenizer_lstm = nltk.RegexpTokenizer(r"[а-я]+|<unk>|<pad>")

# load test and sort
# Test files are named "<prefix>_<index>.npy". Split on the LAST underscore only,
# so a prefix (or directory) that itself contains "_" is still parsed correctly;
# a plain split("_") would keep just the first two pieces of such a path.
test_data = {}
for path in glob("data/augmentations/test/*.npy"):
    prefix, suffix = path.rsplit("_", 1)
    test_data[suffix] = prefix

# Rebuild the paths in numeric order 0..N-1 (glob order is arbitrary). A missing
# index raises KeyError here, which surfaces gaps in the test set early.
# Presumably this order must match the row order of the sample submission - verify.
sorted_test_data = [
    test_data[str(i) + ".npy"] + "_" + str(i) + ".npy"
    for i in range(len(test_data))
]

# data
# shuffle=False keeps the predictions in the sorted file order.
dataset_test = CustomDataset(
    sorted_test_data,
    tokenizer_bert,
    tokenizer_lstm,
    sent_size=sent_size,
    train_mode=False,
    model_type="bert",
)
dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False)

In [4]:
# load model
# Checkpoint restored for this section of the notebook.
bert_checkpoint = "data/models/Final_Model_notretrained.ckpt"
model = Model.load_from_checkpoint(bert_checkpoint)

# Single-GPU trainer, used only for inference in the next cell.
trainer = pl.Trainer(gpus=1)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [None]:
# preds
# Run inference over the whole test loader; the submission cell below iterates
# over `preds` one entry per batch.
preds = trainer.predict(model=model, dataloaders=dataloader_test)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [None]:
# Predicts
# Turn the per-batch model outputs into the submission format: one string per
# test row holding the comma-separated indices of the predicted classes.
sample_submission = pd.read_csv("data/HeadHunter_sample_submit.csv")

# One shared decision threshold, broadcast over every class column.
const = 0.2
thresholds = [const]
y_pred = []        # binarised predictions, one list of 0/1 flags per test row
submit_preds = []  # formatted target strings, one per test row
count_zero = 0     # rows for which no class cleared the threshold

for pred in tqdm(preds):
    # detach()/cpu() are no-ops for plain CPU tensors, but keep .numpy() from
    # failing if the outputs still live on the GPU or carry gradients.
    pred = (pred.detach().cpu().numpy() > thresholds).astype(int).tolist()
    y_pred.extend(pred)

    # Handle every row of the batch (not only pred[0]) so the cell stays correct
    # if the DataLoader batch size is raised above 1. enumerate() takes the class
    # count from the model output instead of a hard-coded 9.
    for row in pred:
        labels = [str(idx) for idx, flag in enumerate(row) if flag == 1]
        if labels:
            submit_preds.append(",".join(labels))
        else:
            # Nothing cleared the threshold: fall back to class "0".
            count_zero += 1
            submit_preds.append("0")

print(f"Zero forecasts: {count_zero}")
# pandas raises ValueError here if the number of predictions does not match the
# number of rows in the sample submission.
sample_submission["target"] = submit_preds
sample_submission.to_csv("data/submissions/final_bert_not_retrained.csv", index=False)

## BERT Retrained Pseudo

In [2]:
from model import LSTMModel, Model, CustomDataset

In [3]:
# Sequence length handed to CustomDataset via its `sent_size` argument.
sent_size = 112

# Load tokenizer
# Both tokenizers are passed to CustomDataset below, regardless of model_type.
# NOTE(review): the recorded load log for this section mentions
# "Skoltech/russian-sensitive-topics"; confirm that the DeepPavlov tokenizer
# vocabulary matches the checkpoint actually used here.
tokenizer_bert = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
# Regex tokenizer: runs of lowercase Cyrillic letters in the range а-я, or the
# literal special tokens <unk> / <pad>.
# NOTE(review): the range [а-я] does not include "ё" - confirm this is intended.
tokenizer_lstm = nltk.RegexpTokenizer(r"[а-я]+|<unk>|<pad>")

# load test and sort
# Test files are named "<prefix>_<index>.npy". Split on the LAST underscore only,
# so a prefix (or directory) that itself contains "_" is still parsed correctly;
# a plain split("_") would keep just the first two pieces of such a path.
test_data = {}
for path in glob("data/augmentations/test/*.npy"):
    prefix, suffix = path.rsplit("_", 1)
    test_data[suffix] = prefix

# Rebuild the paths in numeric order 0..N-1 (glob order is arbitrary). A missing
# index raises KeyError here, which surfaces gaps in the test set early.
# Presumably this order must match the row order of the sample submission - verify.
sorted_test_data = [
    test_data[str(i) + ".npy"] + "_" + str(i) + ".npy"
    for i in range(len(test_data))
]

# data
# shuffle=False keeps the predictions in the sorted file order.
dataset_test = CustomDataset(
    sorted_test_data,
    tokenizer_bert,
    tokenizer_lstm,
    sent_size=sent_size,
    train_mode=False,
    model_type="bert",
)
dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False)

In [5]:
# load model
# NOTE(review): this is the same checkpoint path as in the "Bert Not Retrained"
# section, yet the recorded log for this cell mentions
# "Skoltech/russian-sensitive-topics" and the results are saved as
# final_bert_retrained_pseudo.csv. The path looks like a copy-paste leftover -
# confirm which checkpoint file belongs here before re-running.
model = Model.load_from_checkpoint("data/models/Final_Model_notretrained.ckpt")
# Single-GPU trainer, used only for inference below.
trainer = pl.Trainer(gpus=1)

# preds
# Run inference over the whole test loader; the submission cell below iterates
# over `preds` one entry per batch.
preds = trainer.predict(model, dataloader_test)

Some weights of the model checkpoint at Skoltech/russian-sensitive-topics were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [6]:
# Predicts
# Turn the per-batch model outputs into the submission format: one string per
# test row holding the comma-separated indices of the predicted classes.
sample_submission = pd.read_csv("data/HeadHunter_sample_submit.csv")

# One shared decision threshold, broadcast over every class column.
const = 0.2
thresholds = [const]
y_pred = []        # binarised predictions, one list of 0/1 flags per test row
submit_preds = []  # formatted target strings, one per test row
count_zero = 0     # rows for which no class cleared the threshold

for pred in tqdm(preds):
    # detach()/cpu() are no-ops for plain CPU tensors, but keep .numpy() from
    # failing if the outputs still live on the GPU or carry gradients.
    pred = (pred.detach().cpu().numpy() > thresholds).astype(int).tolist()
    y_pred.extend(pred)

    # Handle every row of the batch (not only pred[0]) so the cell stays correct
    # if the DataLoader batch size is raised above 1. enumerate() takes the class
    # count from the model output instead of a hard-coded 9.
    for row in pred:
        labels = [str(idx) for idx, flag in enumerate(row) if flag == 1]
        if labels:
            submit_preds.append(",".join(labels))
        else:
            # Nothing cleared the threshold: fall back to class "0".
            count_zero += 1
            submit_preds.append("0")

print(f"Zero forecasts: {count_zero}")
# pandas raises ValueError here if the number of predictions does not match the
# number of rows in the sample submission.
sample_submission["target"] = submit_preds
sample_submission.to_csv("data/submissions/final_bert_retrained_pseudo.csv", index=False)

  0%|          | 0/50651 [00:00<?, ?it/s]

Zero forecasts: 3
