In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from glob import glob
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel
import nltk
import pytorch_lightning as pl
from tqdm.notebook import tqdm

## Load data

In [None]:
# Read the four CSV inputs from the local data/ directory, in one pass.
# NOTE(review): `new_test` is loaded from "HeadHunter_new_train.csv" — confirm
# that this file name is the intended source for the new test set.
train, test, new_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/HeadHunter_train.csv",
        "data/HeadHunter_test.csv",
        "data/HeadHunter_new_train.csv",
        "data/HeadHunter_sample_submit.csv",
    )
)

# Quick sanity check on the sizes of the two main tables.
print(f"Train shape: {train.shape} | Test shape: {test.shape}")

## Augmentations

In [None]:
from data import preprocessing, get_vocab, augmentations

In [None]:
# Preprocessing
# The project helper receives copies of both frames; its two return values
# replace `train` and `test`, so this cell is not idempotent: re-running it
# feeds already-preprocessed frames back into `preprocessing`.
train, test = preprocessing(
    train.copy(),
    test.copy(),
)

In [None]:
# Split
# Hold out 10% of the preprocessed training frame for validation.
# The seed is fixed so that the partition (and every augmentation file derived
# from it further down) is identical across "Restart & Run All" executions;
# without it each run produced a different train/val membership.
# NOTE: the cell rebinds `train`, so run it once per kernel session —
# re-running it would split the already reduced frame again.
SPLIT_SEED = 42
train, val = train_test_split(train, test_size=0.1, random_state=SPLIT_SEED)

print(f"Train Size: {train.shape}, Val Size: {val.shape}")

In [None]:
# Train Augmentations
# Run the project's `augmentations` helper on the training split; it returns
# four parallel collections (positive sentences, negative sentences, meta, labels).
(
    train_positive_sentences,
    train_negative_sentences,
    train_meta,
    train_labels,
) = augmentations(train)

# Persist each collection as its own array file (np.save appends ".npy").
for target_path, component in (
    ("data/augmentations/train_positive_sentences", train_positive_sentences),
    ("data/augmentations/train_negative_sentences", train_negative_sentences),
    ("data/augmentations/train_meta", train_meta),
    ("data/augmentations/train_labels", train_labels),
):
    np.save(target_path, np.asarray(component))

In [None]:
# Val Augmentations
# Same helper as for the training split, but called with is_test=True.
(
    val_positive_sentences,
    val_negative_sentences,
    val_meta,
    val_labels,
) = augmentations(val, is_test=True)

# Persist each collection as its own array file (np.save appends ".npy").
for target_path, component in (
    ("data/augmentations/val_positive_sentences", val_positive_sentences),
    ("data/augmentations/val_negative_sentences", val_negative_sentences),
    ("data/augmentations/val_meta", val_meta),
    ("data/augmentations/val_labels", val_labels),
):
    np.save(target_path, np.asarray(component))

In [None]:
# Test augmentations
# The fourth return value (labels) is discarded for the test split.
(
    test_positive_sentences,
    test_negative_sentences,
    test_meta,
    _,
) = augmentations(test, is_test=True)

# Persist the three remaining collections (np.save appends ".npy").
for target_path, component in (
    ("data/augmentations/test_positive_sentences", test_positive_sentences),
    ("data/augmentations/test_negative_sentences", test_negative_sentences),
    ("data/augmentations/test_meta", test_meta),
):
    np.save(target_path, np.asarray(component))

In [None]:
# Train Augmentations (as test)
# Re-run the helper on the training split with is_test=True. The results
# overwrite the `train_*` variables of the earlier training-augmentation cell
# and are stored under the separate "train2_" file prefix.
(
    train_positive_sentences,
    train_negative_sentences,
    train_meta,
    train_labels,
) = augmentations(train, is_test=True)

# Persist each collection as its own array file (np.save appends ".npy").
for target_path, component in (
    ("data/augmentations/train2_positive_sentences", train_positive_sentences),
    ("data/augmentations/train2_negative_sentences", train_negative_sentences),
    ("data/augmentations/train2_meta", train_meta),
    ("data/augmentations/train2_labels", train_labels),
):
    np.save(target_path, np.asarray(component))

In [None]:
# New test
# Preprocess the new test table alongside the training split. The results are
# bound to fresh names on purpose: the previous version rebound `train`, `test`
# and the `test_*` arrays, so re-running this cell (or any earlier augmentation
# cell after it) silently worked on the wrong frames.
# NOTE(review): `train` has already been through `preprocessing` once at this
# point — confirm that passing it through a second time is intended.
_, new_test_prepared = preprocessing(train.copy(), new_test.copy())

# Test augmentations (labels, the fourth return value, are discarded).
(
    new_test_positive_sentences,
    new_test_negative_sentences,
    new_test_meta,
    _,
) = augmentations(new_test_prepared, is_test=True)

# Same output files as before (np.save appends ".npy").
np.save("data/augmentations/new_test_positive_sentences", np.asarray(new_test_positive_sentences))
np.save("data/augmentations/new_test_negative_sentences", np.asarray(new_test_negative_sentences))
np.save("data/augmentations/new_test_meta", np.asarray(new_test_meta))

## Save to batches

In [None]:
import numpy as np
from tqdm.notebook import tqdm

In [None]:
# load train
(
    train_positive_sentences,
    train_negative_sentences,
    train_meta,
    train_labels,
) = (
    np.load(npy_path)
    for npy_path in (
        "data/augmentations/train_positive_sentences.npy",
        "data/augmentations/train_negative_sentences.npy",
        "data/augmentations/train_meta.npy",
        "data/augmentations/train_labels.npy",
    )
)

# load val
(
    val_positive_sentences,
    val_negative_sentences,
    val_meta,
    val_labels,
) = (
    np.load(npy_path)
    for npy_path in (
        "data/augmentations/val_positive_sentences.npy",
        "data/augmentations/val_negative_sentences.npy",
        "data/augmentations/val_meta.npy",
        "data/augmentations/val_labels.npy",
    )
)

# load test (no labels were stored for this split)
(
    test_positive_sentences,
    test_negative_sentences,
    test_meta,
) = (
    np.load(npy_path)
    for npy_path in (
        "data/augmentations/test_positive_sentences.npy",
        "data/augmentations/test_negative_sentences.npy",
        "data/augmentations/test_meta.npy",
    )
)

# load new test (no labels were stored for this split)
(
    new_test_positive_sentences,
    new_test_negative_sentences,
    new_test_meta,
) = (
    np.load(npy_path)
    for npy_path in (
        "data/augmentations/new_test_positive_sentences.npy",
        "data/augmentations/new_test_negative_sentences.npy",
        "data/augmentations/new_test_meta.npy",
    )
)

In [None]:
def save_sample_files(out_dir, prefix, positive_sentences, negative_sentences, *list_columns):
    """Write one object-array ``.npy`` file per sample.

    File ``<out_dir>/<prefix>_<i>.npy`` holds, in order, the i-th positive
    sentence entry, the i-th negative sentence entry and then ``list(column[i])``
    for every array passed in ``list_columns`` (meta, and labels when present).

    Args:
        out_dir: Existing directory that receives the files.
        prefix: File name prefix placed before ``_<i>``.
        positive_sentences: Indexable collection; its length sets the number of files.
        negative_sentences: Indexable collection aligned with ``positive_sentences``.
        *list_columns: Further aligned collections whose i-th entry is stored as a list.
    """
    for i in tqdm(range(len(positive_sentences))):
        batch = np.array(
            [
                positive_sentences[i],
                negative_sentences[i],
                *(list(column[i]) for column in list_columns),
            ],
            dtype="object",
        )
        np.save(f"{out_dir}/{prefix}_{i}", batch)


# save train
save_sample_files(
    "data/augmentations/train", "train",
    train_positive_sentences, train_negative_sentences, train_meta, train_labels,
)

# save val
save_sample_files(
    "data/augmentations/val", "val",
    val_positive_sentences, val_negative_sentences, val_meta, val_labels,
)

# save test (no labels)
save_sample_files(
    "data/augmentations/test", "test",
    test_positive_sentences, test_negative_sentences, test_meta,
)

# save new test (no labels; files keep the "test_" prefix inside new_test/)
save_sample_files(
    "data/augmentations/new_test", "test",
    new_test_positive_sentences, new_test_negative_sentences, new_test_meta,
)

In [None]:
# load train (the "train2" arrays, i.e. the training split augmented with is_test=True)
(
    train_positive_sentences,
    train_negative_sentences,
    train_meta,
    train_labels,
) = (
    np.load(npy_path)
    for npy_path in (
        "data/augmentations/train2_positive_sentences.npy",
        "data/augmentations/train2_negative_sentences.npy",
        "data/augmentations/train2_meta.npy",
        "data/augmentations/train2_labels.npy",
    )
)

# save train: one object-array file per sample under train2/
for i in tqdm(range(len(train_positive_sentences))):
    sample = np.array(
        [
            train_positive_sentences[i],
            train_negative_sentences[i],
            list(train_meta[i]),
            list(train_labels[i]),
        ],
        dtype="object",
    )
    np.save(f"data/augmentations/train2/train_{i}", sample)

## Pseudo Labeling

In [None]:
from model import LSTMModel, Model, CustomDataset

In [None]:
sent_size = 112

# Load tokenizer
tokenizer_bert = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
# Regex tokenizer: runs of lowercase Cyrillic letters plus the two special tokens.
tokenizer_lstm = nltk.RegexpTokenizer(r"[а-я]+|<unk>|<pad>")

# load test and sort
# glob() gives no ordering guarantee, so order the files by the numeric suffix
# of "test_<i>.npy". This keeps the prediction order deterministic and makes
# position k in `test_data` correspond to sample k of the test split.
test_data = sorted(
    glob("data/augmentations/test/*.npy"),
    key=lambda npy_path: int(npy_path.rsplit("_", 1)[-1].split(".")[0]),
)

# data
dataset_test = CustomDataset(test_data, tokenizer_bert, tokenizer_lstm, sent_size=sent_size,
                              train_mode=False, model_type="lstm")
# shuffle=False and batch_size=1: predictions come back one per file, in file order.
dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False)

In [None]:
# Restore the trained LSTM model from its checkpoint file.
checkpoint_path = "data/models/Final_Model_lstm.ckpt"
model = LSTMModel.load_from_checkpoint(checkpoint_path)

# Lightning trainer configured with a single GPU.
trainer = pl.Trainer(gpus=1)

# Run prediction over the test dataloader; `preds` is consumed by the
# thresholding cell that follows.
preds = trainer.predict(model, dataloader_test)

In [None]:
# Preds2Classes
# A score strictly above the threshold becomes class 1, everything else 0.
const = 0.2
thresholds = [const]
count_zero = 0

# Flatten the per-batch outputs into one list with an entry per row of each batch.
y_pred = [
    sample_classes
    for batch_scores in preds
    for sample_classes in (batch_scores.numpy() > thresholds).astype(int).tolist()
]

In [None]:
# Save pseudo labels
# Labels are matched to files purely by position, so a length mismatch means
# every label after the first divergence would be attached to the wrong sample.
# Fail loudly instead of writing misaligned pseudo labels.
if len(y_pred) != len(test_data):
    raise ValueError(
        f"Got {len(y_pred)} predictions for {len(test_data)} test files; "
        "refusing to write misaligned pseudo labels."
    )

for idx, (data_path, pseudo_label) in enumerate(zip(test_data, y_pred)):
    # allow_pickle is needed because the per-sample files are object arrays;
    # only load files produced by this notebook this way.
    pseudo_test = np.load(data_path, allow_pickle=True).tolist()
    # Append the predicted label(s) as the last element of the sample.
    pseudo_test.append(pseudo_label)
    pseudo_test = np.array(pseudo_test, dtype="object")
    np.save(f"data/augmentations/test_pseudo/test_{idx}", pseudo_test)

## Change target

In [None]:
# load train
(
    train_positive_sentences,
    train_negative_sentences,
    train_meta,
    train_labels,
) = (
    np.load(npy_path)
    for npy_path in (
        "data/augmentations/train_positive_sentences.npy",
        "data/augmentations/train_negative_sentences.npy",
        "data/augmentations/train_meta.npy",
        "data/augmentations/train_labels.npy",
    )
)

# load val
(
    val_positive_sentences,
    val_negative_sentences,
    val_meta,
    val_labels,
) = (
    np.load(npy_path)
    for npy_path in (
        "data/augmentations/val_positive_sentences.npy",
        "data/augmentations/val_negative_sentences.npy",
        "data/augmentations/val_meta.npy",
        "data/augmentations/val_labels.npy",
    )
)

# load test
(
    test_positive_sentences,
    test_negative_sentences,
    test_meta,
) = (
    np.load(npy_path)
    for npy_path in (
        "data/augmentations/test_positive_sentences.npy",
        "data/augmentations/test_negative_sentences.npy",
        "data/augmentations/test_meta.npy",
    )
)

# load new test
(
    new_test_positive_sentences,
    new_test_negative_sentences,
    new_test_meta,
) = (
    np.load(npy_path)
    for npy_path in (
        "data/augmentations/new_test_positive_sentences.npy",
        "data/augmentations/new_test_negative_sentences.npy",
        "data/augmentations/new_test_meta.npy",
    )
)

In [None]:
# save train
# Same per-sample layout as in "Save to batches", except that the last element
# (the label list) is replaced by itself without its first entry before saving.
for i in tqdm(range(len(train_positive_sentences))):
    record = np.array(
        [
            train_positive_sentences[i],
            train_negative_sentences[i],
            list(train_meta[i]),
            list(train_labels[i]),
        ],
        dtype="object",
    )
    record[-1] = record[-1][1:]
    np.save(f"data/augmentations/train/train_{i}", record)

# save val (first label entry dropped as well)
for i in tqdm(range(len(val_positive_sentences))):
    record = np.array(
        [
            val_positive_sentences[i],
            val_negative_sentences[i],
            list(val_meta[i]),
            list(val_labels[i]),
        ],
        dtype="object",
    )
    record[-1] = record[-1][1:]
    np.save(f"data/augmentations/val/val_{i}", record)

# save test (no labels, so nothing to trim)
for i in tqdm(range(len(test_positive_sentences))):
    record = np.array(
        [
            test_positive_sentences[i],
            test_negative_sentences[i],
            list(test_meta[i]),
        ],
        dtype="object",
    )
    np.save(f"data/augmentations/test/test_{i}", record)

# save new_test (no labels, so nothing to trim)
for i in tqdm(range(len(new_test_positive_sentences))):
    record = np.array(
        [
            new_test_positive_sentences[i],
            new_test_negative_sentences[i],
            list(new_test_meta[i]),
        ],
        dtype="object",
    )
    np.save(f"data/augmentations/new_test/test_{i}", record)