In [None]:
import os
from warnings import filterwarnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torchmetrics
import pytorch_lightning as pl

filterwarnings("ignore")

In [None]:
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel

## Load data

In [None]:
# Read the three competition CSVs in one pass: labelled train set, unlabelled
# test set and the submission template that is filled in at the end.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/HeadHunter_train.csv",
        "data/HeadHunter_test.csv",
        "data/HeadHunter_sample_submit.csv",
    )
)

print(f"Train shape: {train.shape} | Test shape: {test.shape}")

## Utils

In [None]:
# Maximum token length for the positive / negative review texts
# ("q_95" per the original note — presumably the 95th percentile of token counts; confirm).
SENT_SIZE = 102
# Maximum token length for the short city / position fields.
META_SIZE = 6
# Width of the one-hot rating vector consumed by the model's metadata layer.
METADATA_SIZE = 31
VEC_SIZE = 300  # not referenced in the visible part of this notebook
# Number of target classes (length of the multi-hot label vector).
FULL_CLASSES = 9

# The tokenizer must come from the same checkpoint as the encoder: token ids
# index the encoder's embedding table, so a tokenizer built for a different
# checkpoint can feed the model ids that point at the wrong embeddings.
# Keep this value in sync with the MODEL_PATH passed to RuBertModel.
TOKENIZER_PATH = "DeepPavlov/rubert-base-cased"
TOKENIZER = BertTokenizer.from_pretrained(TOKENIZER_PATH)

## Dataloaders

In [None]:
# Mini-batch size shared by the train, validation and full-train DataLoaders
# (the test DataLoader uses its own batch size).
BATCH_SIZE = 8

In [None]:
# Preprocessing
# NaNs preprocessing: missing text-like fields are replaced by the "[UNK]" placeholder
unk_fill = {"city": "[UNK]", "position": "[UNK]", "positive": "[UNK]", "negative": "[UNK]"}
train = train.fillna(value=unk_fill)
test = test.fillna(value=unk_fill)

# lowercase
text_columns = ["positive", "negative"]
train[text_columns] = train[text_columns].apply(lambda col: col.str.lower())
test[text_columns] = test[text_columns].apply(lambda col: col.str.lower())

# One Hot: encode train and test together so both end up with the same dummy columns
concat_temp = pd.concat((train, test))
metadata_columns = ["salary_rating", "team_rating", "managment_rating",
                    "career_rating", "workplace_rating", "rest_recovery_rating"]
concat_temp = pd.get_dummies(concat_temp, columns=metadata_columns)
dummies_columns = [col for col in concat_temp.columns
                   if any(rating in col for rating in metadata_columns)]
# The model's metadata layer is built for exactly METADATA_SIZE inputs; fail here
# with a readable message instead of a shape error in the middle of training.
assert len(dummies_columns) == METADATA_SIZE, (
    f"expected {METADATA_SIZE} one-hot rating columns, got {len(dummies_columns)}"
)

# Rows with a target are the labelled train rows, rows without one are test rows.
# Take explicit copies so the column assignment below writes to an independent
# frame rather than to a slice of `concat_temp`.
is_labelled = concat_temp["target"].notna()
train = concat_temp.loc[is_labelled].copy().reset_index(drop=True)
test = concat_temp.loc[~is_labelled].copy().reset_index(drop=True)

# target to multi-hot vector: "0,3" -> [1, 0, 0, 1, 0, ...] of length FULL_CLASSES
train["preprocessed_target"] = train["target"].apply(
    lambda labels: [1 if str(cls) in labels.split(",") else 0 for cls in range(FULL_CLASSES)]
)

In [None]:
# split
# A fixed seed keeps the train/validation partition identical across kernel
# restarts, so validation metrics from different runs are comparable.
SPLIT_SEED = 42
train, val = train_test_split(train, test_size=0.1, shuffle=True, random_state=SPLIT_SEED)
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

print(f"Train Shape: {train.shape}, Val Shape: {val.shape}")

In [None]:
# Dataset
class CustomDataset(Dataset):
    """Torch dataset over a preprocessed review DataFrame.

    Every item is a tuple of the tokenized positive text, negative text, city
    and position (tokenizer outputs requested with ``return_tensors="pt"``),
    followed by the one-hot rating vector as a float tensor and, when
    ``train_mode`` is true, the target vector as a float tensor.

    Args:
        df: frame with the columns ``positive``, ``negative``, ``city``,
            ``position``, the one-hot rating columns and, in train mode,
            ``preprocessed_target``.
        train_mode: whether targets are stored and returned.
    """

    # Base names of the rating features; the one-hot columns contain these names.
    _RATING_NAMES = ("salary_rating", "team_rating", "managment_rating",
                     "career_rating", "workplace_rating", "rest_recovery_rating")

    def __init__(self, df, train_mode):
        self.train_mode = train_mode

        # select every column whose name contains one of the rating names
        onehot_columns = [
            column for column in df.columns
            if any(rating in column for rating in self._RATING_NAMES)
        ]

        # raw feature arrays, indexed by row position
        self.positive = df["positive"].values
        self.negative = df["negative"].values
        self.cities = df["city"].values
        self.position = df["position"].values
        self.metadata = df[onehot_columns].values
        if self.train_mode:
            self.target = df["preprocessed_target"].values

    def __len__(self):
        return len(self.positive)

    @staticmethod
    def _tokenize(text, max_length):
        """Tokenize one string, padded/truncated to exactly ``max_length`` tokens."""
        return TOKENIZER(text, padding="max_length", truncation=True,
                         max_length=max_length, return_tensors="pt")

    def __getitem__(self, idx):
        # raw values for this row
        positive, negative = self.positive[idx], self.negative[idx]
        city, position = self.cities[idx], self.position[idx]
        metadata = self.metadata[idx]

        # long texts use SENT_SIZE, short categorical texts use META_SIZE
        tokens_positive = self._tokenize(positive, SENT_SIZE)
        tokens_negative = self._tokenize(negative, SENT_SIZE)
        tokens_city = self._tokenize(city, META_SIZE)
        tokens_position = self._tokenize(position, META_SIZE)

        features = (tokens_positive, tokens_negative, tokens_city, tokens_position,
                    torch.FloatTensor(metadata))
        if not self.train_mode:
            return features
        # train mode: the target is appended as the last element
        return features + (torch.FloatTensor(self.target[idx]),)

In [None]:
# create datasets
dataset_train = CustomDataset(train, train_mode=True)
dataset_val = CustomDataset(val, train_mode=True)
dataset_test = CustomDataset(test, train_mode=False)
# train + val together, used for the final fit on all labelled rows
dataset_fulltrain = CustomDataset(pd.concat((train, val)), train_mode=True)

# create dataloaders
dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation does not benefit from a random order, so the validation loader is
# not shuffled; a fixed order keeps per-batch validation logs comparable between epochs.
dataloader_val = DataLoader(dataset_val, batch_size=BATCH_SIZE, shuffle=False)
dataloader_fulltrain = DataLoader(dataset_fulltrain, batch_size=BATCH_SIZE, shuffle=True)
# Not shuffled, so predictions stay in the row order of `test`.
dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False)

In [None]:
# Pull a single batch from the training loader: the loop exits after its first
# iteration, leaving the six batch components bound as notebook-level variables
# that can be inspected interactively.
for tokens_positive, tokens_negative, tokens_city, tokens_position, metadata, target in dataloader_train:
    break

## Model

In [None]:
def init_RUBert(is_train: bool = False, model_path: str = "DeepPavlov/rubert-base-cased"):
    """Load a pretrained BERT encoder and set ``requires_grad`` on all its weights.

    Checkpoint names noted by the original author:
        rubert-base-cased
        rubert-base-cased-sentence
        distilrubert-base-cased-conversational

    Args:
        is_train: value assigned to ``requires_grad`` of every encoder
            parameter (``False`` freezes the encoder).
        model_path: identifier passed to ``BertModel.from_pretrained``.

    Returns:
        The loaded ``BertModel``.
    """
    encoder = BertModel.from_pretrained(model_path)
    for parameter in encoder.parameters():
        parameter.requires_grad = is_train

    return encoder

In [None]:
# RUBert_positive = BertModel.from_pretrained(MODEL_PATH)
# a = RUBert_positive(input_ids=tokens_positive["input_ids"].squeeze())

In [None]:
class RuBertModel(pl.LightningModule):
    """Multi-label classifier built from two BERT encoders plus rating metadata.

    The positive and the negative review text are encoded by separate BERT
    encoders, each followed by a two-layer MLP head. The one-hot rating vector
    goes through a single linear layer. The three 256-wide representations are
    concatenated and mapped to ``FULL_CLASSES`` sigmoid outputs.

    Args:
        is_train: whether the BERT encoder weights receive gradients.
        LR: learning rate for Adam.
        WEIGHT_DECAY: weight decay for Adam.
        MODEL_PATH: checkpoint identifier handed to ``init_RUBert``.
    """

    def __init__(self, is_train, LR, WEIGHT_DECAY, MODEL_PATH):
        super().__init__()

        # save hyperparams: the constructor arguments become available as
        # self.hparams.<name> and are stored inside checkpoints
        self.save_hyperparameters()

        # utils
        self.metric_accuracy = torchmetrics.Accuracy()
        self.metric_f1 = torchmetrics.F1(num_classes=FULL_CLASSES, average="samples")

        # logs: one entry is appended per finished epoch
        self.train_accuracy_log, self.train_f1_log, self.train_loss_log = [], [], []
        self.val_accuracy_log, self.val_f1_log, self.val_loss_log = [], [], []

        # RuBert: independent encoders for the positive and the negative text
        self.RUBert_positive = init_RUBert(is_train, MODEL_PATH)
        self.RUBert_negative = init_RUBert(is_train, MODEL_PATH)

        # Linears
        self.linear_metadata = nn.Linear(METADATA_SIZE, 256)
        self.linear1_positive = nn.Linear(768, 512)
        self.linear1_negative = nn.Linear(768, 512)
        self.linear2_positive = nn.Linear(512, 256)
        self.linear2_negative = nn.Linear(512, 256)
        self.linear_out = nn.Linear(3*256, FULL_CLASSES)

        # utils
        self.relu = nn.ReLU(inplace=True)
        self.dropout1 = nn.Dropout(0.3)
        self.dropout2 = nn.Dropout(0.2)

    def _encode_text(self, encoder, tokens):
        """Run one encoder on a batch of tokenizer outputs and apply dropout.

        ``squeeze(1)`` removes the singleton dimension at index 1 that the
        batched per-sample tokenizer output carries. Element ``[1]`` of the
        encoder output is used as the sentence representation (for
        ``BertModel`` this is presumably the pooled output — confirm against
        the installed transformers version).
        """
        encoded = encoder(input_ids=tokens["input_ids"].squeeze(1),
                          attention_mask=tokens["attention_mask"].squeeze(1))
        pooled = encoded[1]
        pooled = torch.reshape(pooled, (pooled.shape[0], -1))
        return self.dropout1(pooled)

    def forward(self, tokens_positive, tokens_negative, tokens_city, tokens_position, metadata):
        """Return per-class probabilities of shape (batch, FULL_CLASSES).

        ``tokens_city`` and ``tokens_position`` are accepted so the signature
        matches the dataset tuple, but they are not used by the network.
        """
        # RuBert layer
        positive_out = self._encode_text(self.RUBert_positive, tokens_positive)
        negative_out = self._encode_text(self.RUBert_negative, tokens_negative)

        # Linear layers
        positive_linear = self.relu(self.linear1_positive(positive_out))
        negative_linear = self.relu(self.linear1_negative(negative_out))
        positive_linear = self.dropout2(positive_linear)
        negative_linear = self.dropout2(negative_linear)
        positive_linear = self.relu(self.linear2_positive(positive_linear))
        negative_linear = self.relu(self.linear2_negative(negative_linear))
        metadata = self.linear_metadata(metadata)

        x = torch.cat((positive_linear, negative_linear, metadata), dim=-1)

        # Output
        out = self.linear_out(x)
        return torch.sigmoid(out)

    def configure_optimizers(self):
        """Adam plus a StepLR schedule (learning rate x0.1 every 5 scheduler steps)."""
        # Read the values given to __init__ instead of notebook-level globals, so
        # the model behaves the same regardless of what LR / WEIGHT_DECAY happen
        # to be defined in the kernel (or when restored from a checkpoint).
        optimizer = torch.optim.Adam(self.parameters(),
                                     lr=self.hparams.LR,
                                     weight_decay=self.hparams.WEIGHT_DECAY)
        sch = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
        # learning rate scheduler
        return {"optimizer": optimizer,
                "lr_scheduler": {"scheduler": sch}
               }

    def _compute_step(self, batch, stage):
        """Shared train/val step: forward pass, BCE loss, metrics and logging.

        ``stage`` is the log-key prefix (``"train"`` or ``"val"``).
        """
        tokens_positive, tokens_negative, tokens_city, tokens_position, metadata, target = batch
        out = self(tokens_positive, tokens_negative, tokens_city, tokens_position, metadata)
        loss = torch.nn.BCELoss()(out, target)
        accuracy = self.metric_accuracy(out, target.int())
        f1 = self.metric_f1(out, target.int())

        # save logs
        self.log(f"{stage}_loss", loss, prog_bar=True)
        self.log(f"{stage}_accuracy", accuracy, prog_bar=True)
        self.log(f"{stage}_f1", f1, prog_bar=True)

        return {"loss": loss, "accuracy": accuracy, "F1": f1}

    def training_step(self, batch, batch_idx):
        return self._compute_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        return self._compute_step(batch, "val")

    def predict_step(self, batch, batch_idx):
        """Return class probabilities for an unlabelled batch (five-element tuple)."""
        tokens_positive, tokens_negative, tokens_city, tokens_position, metadata = batch
        out = self(tokens_positive, tokens_negative, tokens_city, tokens_position, metadata)

        return out

    @staticmethod
    def _epoch_mean(outs, key):
        """Average one entry of the per-step output dicts as plain Python floats."""
        return np.mean([step[key].item() for step in outs])

    def training_epoch_end(self, outs):
        # log epoch metric
        self.train_loss_log.append(self._epoch_mean(outs, "loss"))
        self.train_accuracy_log.append(self._epoch_mean(outs, "accuracy"))
        self.train_f1_log.append(self._epoch_mean(outs, "F1"))

    def validation_epoch_end(self, outs):
        # log epoch metric
        self.val_loss_log.append(self._epoch_mean(outs, "loss"))
        self.val_accuracy_log.append(self._epoch_mean(outs, "accuracy"))
        self.val_f1_log.append(self._epoch_mean(outs, "F1"))
        
    
        
    

In [None]:
# %%time
# bert_model = RuBertModel(is_train=False)
# checkpoint = pl.callbacks.ModelCheckpoint(monitor="val_loss", mode = "min", dirpath="data/", filename="RuBert")
# logger = pl.loggers.TensorBoardLogger(save_dir="logs", name="RuBert", version=1)

# trainer = pl.Trainer(gpus=1, max_epochs=11, logger=logger,
#                      default_root_dir="data/")
# trainer.fit(bert_model, dataloader_train, dataloader_val)

# # save model
# trainer.save_checkpoint("data/models/RUBert.ckpt")

In [None]:
# %%time
# # save test preds
# preds = trainer.predict(bert_model, dataloader_test)
# submit = []
# thresh = 0.3

# for pred in tqdm(preds):
#     pred_batch = torch.where((pred > thresh)[0])[0].detach().tolist()
#     pred_batch = ",".join([str(i) for i in pred_batch])
#     if pred_batch == '':
#         print(pred)
#     submit.append(pred_batch)
    
# sample_submission["target"] = submit
# sample_submission.to_csv("data/submissions/submission_rubert.csv", index=False)

In [None]:
# %%time
# bert_model = RuBertModel(is_train=True)
# # checkpoint = pl.callbacks.ModelCheckpoint(monitor="val_loss", mode = "min", dirpath="data/", filename="RuBert")
# logger = pl.loggers.TensorBoardLogger(save_dir="logs", name="RuBert_retrained", version=1)

# trainer = pl.Trainer(gpus=1, max_epochs=7, logger=logger,
#                      default_root_dir="data/")
# trainer.fit(bert_model, dataloader_train, dataloader_val)

# # save model
# trainer.save_checkpoint("data/models/RUBert_retrained.ckpt")

In [None]:
# %%time
# preds = trainer.predict(bert_model, dataloader_test)

In [None]:
# %%time
# # save test preds
# submit = []
# thresh = 0.3

# for pred in tqdm(preds):
#     pred_batch = torch.where((pred > thresh)[0])[0].detach().tolist()
#     pred_batch = ",".join([str(i) for i in pred_batch])
#     if pred_batch == '':
#         print(pred)
#     submit.append(pred_batch)
    
# sample_submission["target"] = submit
# sample_submission.to_csv("data/submissions/submission_rubert_retrained.csv", index=False)

#### Full data

In [None]:
%%time
# params
LR = 2e-5
WEIGHT_DECAY = 1e-3
MODEL_PATH = "DeepPavlov/rubert-base-cased"


bert_model = RuBertModel(is_train=True, LR=LR, WEIGHT_DECAY=WEIGHT_DECAY, MODEL_PATH=MODEL_PATH)
# checkpoint = pl.callbacks.ModelCheckpoint(monitor="val_loss", mode = "min", dirpath="data/", filename="RuBert")
logger = pl.loggers.TensorBoardLogger(save_dir="logs", name="RuBert_retrained", version=2)

trainer = pl.Trainer(gpus=1, max_epochs=7, logger=logger,
                     default_root_dir="data/")
trainer.fit(bert_model, dataloader_fulltrain)

# save model
trainer.save_checkpoint("data/models/RUBert_retrained_fulldata.ckpt")

In [None]:
%%time
# Run the trained model over the test loader. The result is iterated batch by
# batch when the submission is built, so its order follows `dataloader_test`.
preds = trainer.predict(bert_model, dataloader_test)

In [None]:
%%time
# save test preds
submit = []
thresh1, thresh2, thresh3 = 0.5, 0.3, 0.2

for pred in tqdm(preds):
    pred_batch = torch.where((pred > thresh1)[0])[0].detach().tolist()
    pred_batch = ",".join([str(i) for i in pred_batch])
    if pred_batch == '':
        pred_batch = torch.where((pred > thresh2)[0])[0].detach().tolist()
        pred_batch = ",".join([str(i) for i in pred_batch])
        if pred_batch == '':
            pred_batch = torch.where((pred > thresh3)[0])[0].detach().tolist()
            pred_batch = ",".join([str(i) for i in pred_batch])
            if pred_batch == '':
                print(pred)
    submit.append(pred_batch)
    
sample_submission["target"] = submit
sample_submission.to_csv("data/submissions/submission_rubert_retrained_fulldata.csv", index=False)