In [None]:
import os
from glob import glob

import numpy as np
from tqdm.notebook import tqdm
import nltk
import pandas as pd
import fasttext
from navec import Navec

from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel

import torch
from torch.utils.data import Dataset, DataLoader
import torch.functional as F
from torch import nn
import torchmetrics
import pytorch_lightning as pl

from warnings import filterwarnings
filterwarnings("ignore")

## Data

In [None]:
from model import CustomDataset, FinalModel

In [None]:
# Pre-trained embeddings handed to CustomDataset further down.
navec_model = Navec.load("data/navec_hudlit_v1_12B_500K_300d_100q.tar")
fasttext_model = fasttext.load_model("data/cc.ru.300.bin")

# NOTE: the older test split (data/augmentations/test/, files named
# "<prefix>_<i>.npy") used to be loaded here in the same index-sorted way;
# that variant is disabled and only the new_test split is loaded.

# Collect the new_test arrays. Files are expected to be named "test_<i>.npy"
# with i = 0..N-1; glob returns them in arbitrary order.
new_test_files = glob("data/augmentations/new_test/*.npy")
if not new_test_files:
    # Fail early instead of silently building an empty dataset.
    raise FileNotFoundError("no .npy files found in data/augmentations/new_test")

# Map "<i>.npy" -> containing directory. os.path is used instead of splitting
# on "/" so the directory is still correct when glob returns OS-native
# separators (e.g. backslashes on Windows).
new_test_data = {
    os.path.basename(path).split("_")[-1]: os.path.dirname(path)
    for path in new_test_files
}

# Rebuild the paths in numeric index order (0, 1, 2, ...). A missing index
# raises KeyError, which signals a gap in the file numbering.
sorted_test_data = [
    os.path.join(new_test_data[f"{i}.npy"], f"test_{i}.npy")
    for i in range(len(new_test_data))
]

In [None]:
# data
# Sizes used to build the test dataset and its loader.
# presumably sent_size is the maximum sentence length in tokens — confirm in model.py
sent_size = 112
batch_size = 128

# NOTE(review): the meaning of the positional `False` argument is defined by
# CustomDataset in model.py and is not visible from this notebook — confirm.
dataset_test = CustomDataset(sorted_test_data, sent_size, False, navec_model, fasttext_model)
# shuffle=False keeps batches in the same order as sorted_test_data, so the
# predictions can later be matched to rows by position.
dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

## Predict

In [None]:
# Run each of the four saved checkpoints over the test loader and keep the raw
# per-batch outputs, one entry per checkpoint.
# The loop must run to completion: the averaging cell that follows indexes
# preds_list[0] .. preds_list[3], so stopping after the first checkpoint
# (the previous stray `break`) made it fail with IndexError.
preds_list = []

for idx in range(4):

    model = FinalModel.load_from_checkpoint(f"data/models/final_model_{idx}.ckpt")
    # A fresh Trainer per checkpoint, as in the original cell.
    trainer = pl.Trainer(gpus=1)

    preds = trainer.predict(model, dataloader_test)
    preds_list.append(preds)

In [None]:
def _batches_to_array(batches):
    """Flatten a list of per-batch outputs into one array, one row per sample.

    Each item inside a batch must expose ``.numpy()`` (presumably a torch
    tensor of per-class scores — confirm against FinalModel's predict step).
    """
    return np.asarray([row.numpy().tolist() for batch in batches for row in batch])


# NOTE(review): checkpoint index 3 is used twice (preds4 and preds5), so it
# carries double weight in the average — confirm this is intentional.
preds1, preds2, preds3, preds4, preds5 = (
    _batches_to_array(preds_list[fold]) for fold in (0, 1, 2, 3, 3)
)

# Plain mean over the five arrays above.
preds = (preds1 + preds2 + preds3 + preds4 + preds5) / 5

In [None]:
# Turn the averaged scores into a submission file: every class whose score
# exceeds the threshold is listed, comma separated; rows with no class above
# the threshold fall back to "0".
sample_submission = pd.read_csv("data/HeadHunter_sample_submit.csv")

const = 0.4
thresholds = [const]  # single value, broadcast against each row of scores
y_pred = []
submit_preds = []
count_zero = 0

for scores in tqdm(preds):
    pred = (scores > thresholds).astype(int).tolist()
    y_pred.extend(pred)

    if any(pred):
        # indices of the classes flagged with 1, among the 9 classes
        submit_preds.append(",".join(str(label) for label in range(9) if pred[label] == 1))
    else:
        count_zero += 1
        submit_preds.append("0")

print(f"Zero forecasts: {count_zero}")
sample_submission["target"] = submit_preds
sample_submission.to_csv("data/submissions/final_model.csv", index=False)

## Predict new test

In [None]:
# Score the test loader with a single checkpoint (index 3) instead of the
# ensemble.
checkpoint_path = "data/models/final_model_3.ckpt"
model = FinalModel.load_from_checkpoint(checkpoint_path)
trainer = pl.Trainer(gpus=1)

# Raw per-batch outputs; they are flattened in a later cell.
preds = trainer.predict(model, dataloader_test)

In [None]:
# Flatten the per-batch outputs into a single array with one row per sample.
# The isinstance guard makes this cell safe to re-run: once `preds` has been
# converted, its elements are plain numbers without `.numpy()`, so repeating
# the conversion would raise.
if not isinstance(preds, np.ndarray):
    preds = np.asarray([row.numpy().tolist() for batch in preds for row in batch])

In [None]:
# Threshold the single-model scores and store them as a comma-separated label
# string in the "preds" column of the frame read below. Rows with no class
# above the threshold get "0".
sample_submission = pd.read_csv("data/HeadHunter_new_train.csv")

const = 0.5
thresholds = [const]  # single value, broadcast against each row of scores
y_pred = []
submit_preds = []
count_zero = 0

for scores in tqdm(preds):
    pred = (scores > thresholds).astype(int).tolist()
    y_pred.extend(pred)

    if any(pred):
        # indices of the classes flagged with 1, among the 9 classes
        temp = ",".join(str(label) for label in range(9) if pred[label] == 1)
        submit_preds.append(temp)
    else:
        count_zero += 1
        submit_preds.append("0")

print(f"Zero forecasts: {count_zero}")
sample_submission["preds"] = submit_preds

In [None]:
# score
from sklearn.metrics import f1_score


def _to_multi_hot(labels, n_classes=9):
    """Encode a comma-separated label string as a 0/1 list of length n_classes.

    Values that are already list-like are returned unchanged (as a list), so
    running this cell more than once does not fail on the columns it has
    already converted. Plain integers (what pandas yields when every row of a
    column holds a single label) are treated as a one-label string.
    """
    if isinstance(labels, (list, tuple, np.ndarray)):
        return list(labels)
    if isinstance(labels, (int, np.integer)):
        labels = str(labels)
    # strip() tolerates "0, 8"-style spacing around the separators
    chosen = {part.strip() for part in labels.split(",")}
    return [1 if str(i) in chosen else 0 for i in range(n_classes)]


# The columns are overwritten in place because the filtered-score cell that
# follows reads the encoded lists from these same columns.
sample_submission["target"] = sample_submission["target"].apply(_to_multi_hot)
sample_submission["preds"] = sample_submission["preds"].apply(_to_multi_hot)

y_true = sample_submission["target"].values
y_pred = sample_submission["preds"].values

# Sample-averaged F1 over the multi-label indicator matrices.
f1_score(np.array(list(y_true)), np.array(list(y_pred)), average="samples")

In [None]:
# Same score, but leaving out every row whose prediction has both the first
# class (index 0) and the last class (index -1) switched on.
y_true_res = []
y_pred_res = []

for target_vec, pred_vec in zip(sample_submission["target"], sample_submission["preds"]):
    first_and_last_on = pred_vec[0] == 1 and pred_vec[-1] == 1
    if not first_and_last_on:
        y_true_res.append(target_vec)
        y_pred_res.append(pred_vec)

f1_score(np.array(y_true_res), np.array(y_pred_res), average="samples")