In [1]:
!pip install -q transformers

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

import joblib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import f1_score

from collections import Counter

import re
import string

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("russian")
stopwords = stopwords.words('russian')

import gc
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer

In [3]:
def compute_metrics(eval_preds):
    """Compute the sample-averaged F1 score for one multi-label evaluation pass.

    Args:
        eval_preds: a pair ``(predictions, labels)``. ``predictions`` is either an
            array of raw (pre-sigmoid) scores or a tuple whose first element is
            that array; ``labels`` is the matching 0/1 indicator array.

    Returns:
        dict: ``{'f1_score': <float>}`` computed with ``average='samples'``.
    """
    # Release cached memory before scoring.
    cleanup()

    y_pred, y_true = eval_preds

    # Some models return a tuple of outputs; the scores are its first element.
    if isinstance(y_pred, tuple):
        y_pred = y_pred[0]

    # sigmoid(z) >= 0.5 is equivalent to z >= 0. Thresholding the raw scores
    # directly gives the same decisions without np.exp overflow warnings on
    # large-magnitude negative scores.
    y_pred = (np.asarray(y_pred) >= 0).astype(int)
    y_true = np.asarray(y_true).astype(int)

    return {'f1_score': f1_score(y_true, y_pred, average='samples')}

In [4]:
class MyDatasetForClassification(Dataset):
    """Pre-tokenized text dataset for multi-label sequence classification.

    All texts are tokenized once in the constructor; ``__getitem__`` only
    slices the cached tensors.

    Args:
        X: iterable of input strings.
        y: optional label matrix (one row per text). When omitted, an all-zero
            ``(len(X), num_labels)`` placeholder is used so that items keep the
            same keys for unlabeled data.
        tokenizer: callable tokenizer returning a mapping with ``'input_ids'``
            and ``'attention_mask'`` tensors. Required despite the ``None``
            default, which is kept only for signature compatibility.
        num_labels: width of the placeholder label matrix (used only if ``y`` is None).
        max_length: truncation length passed to the tokenizer.

    Raises:
        ValueError: if ``tokenizer`` is None.
    """

    def __init__(self, X, y=None, tokenizer=None, num_labels=9, max_length=128):
        if tokenizer is None:
            # Fail early with a clear message instead of an AttributeError on None.
            raise ValueError('tokenizer must be provided to encode the input texts')

        self.sentences = list(X)

        if y is not None:
            # Float labels; torch.tensor always copies, so the caller's array is not aliased.
            self.labels = torch.tensor(y, dtype=torch.float32)
        else:
            self.labels = torch.zeros((len(self.sentences), num_labels))

        # Calling the tokenizer directly is the current API for batch encoding.
        # padding=True pads every sequence to the longest one in this dataset.
        self.tokenizer_outputs = tokenizer(
            self.sentences,
            return_tensors="pt",
            max_length=max_length,
            padding=True,
            truncation=True,
        )

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.sentences)

    def __getitem__(self, index):
        """Return the encoded text and its label row as a dict of tensors."""
        return {
            'attention_mask': self.tokenizer_outputs['attention_mask'][index],
            'input_ids': self.tokenizer_outputs['input_ids'][index],
            'labels': self.labels[index],
        }

In [5]:
# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression: displays the selected device as the cell output.
device

In [6]:
def cleanup():
    """Run Python garbage collection, then release PyTorch's cached CUDA memory."""
    # Order matters: collect unreachable objects first so their cached blocks can be freed.
    for release in (gc.collect, torch.cuda.empty_cache):
        release()

cleanup()

# Neural net

In [7]:
def _join_text_columns(frame):
    """Concatenate columns 2 and 3 of every row into a single '. '-separated string array."""
    return frame.iloc[:, 2:4].apply(lambda row: '. '.join(row.values.astype(str)), axis=1).values


# Labelled and unlabelled reviews; empty cells are replaced by a placeholder sentence.
Xy_train_val = pd.read_csv(
    '../input/headhunter/data/data/train.csv', index_col='review_id'
).fillna('Нет информации.')
X_test = pd.read_csv(
    '../input/headhunter/data/data/test.csv', index_col='review_id'
).fillna('Нет информации.')

# The last column holds the target; it is expanded into a 9-column indicator matrix.
mb = MultiLabelBinarizer(classes=list(map(str, range(9))))
y_train_val = mb.fit_transform(Xy_train_val.iloc[:, -1])

# Model input: the two text columns merged into one string per review.
X_train_val = _join_text_columns(Xy_train_val.iloc[:, :-1])
X_test = _join_text_columns(X_test)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42
)

X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape

In [8]:
# Pretrained checkpoint to fine-tune.
# NOTE(review): the trailing number looks like a recorded score for this checkpoint — confirm metric and split.
base_model = 'DeepPavlov/rubert-base-cased-sentence' # 0.780006575919

# Matching tokenizer, and a BERT classifier with 9 outputs configured for
# multi-label classification, moved to the selected device.
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = BertForSequenceClassification.from_pretrained(
    base_model, num_labels=9, problem_type='multi_label_classification'
).to(device)

In [9]:
# Only the first 500 validation rows are tokenized here, which keeps per-epoch evaluation short.
val_head = slice(None, 500)

dataset_train = MyDatasetForClassification(X=X_train, y=y_train, tokenizer=tokenizer)
dataset_val = MyDatasetForClassification(X=X_val[val_head], y=y_val[val_head], tokenizer=tokenizer)
# No labels for the test split: the dataset falls back to its placeholder labels.
dataset_test = MyDatasetForClassification(X=X_test, tokenizer=tokenizer)

In [10]:
# Free cached memory before allocating training buffers.
cleanup()

# Fine-tuning configuration: evaluate and checkpoint once per epoch.
# NOTE(review): metric_for_best_model is set while load_best_model_at_end is False,
# so the best checkpoint is presumably not restored — the weights saved below are
# whatever the model holds when training stops. Confirm this is intended.
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=16, #
    per_device_eval_batch_size=32,
    num_train_epochs=2, # 
    warmup_steps=1000, #
    learning_rate=1e-5, #
    evaluation_strategy="epoch",
    eval_accumulation_steps=10,
    save_strategy='epoch',
    load_best_model_at_end=False,
    metric_for_best_model='f1_score',
    seed=42,
)

# Trainer wires the model, both datasets and the samples-F1 metric callback together.
trainer = Trainer(model=model, 
                  args=training_args, 
                  train_dataset=dataset_train, 
                  eval_dataset=dataset_val, 
                  compute_metrics=compute_metrics)

trainer.train()

# Persist only the weights (state dict) of the fine-tuned model.
torch.save(model.state_dict(), 'model_bert_pavlov_final.pt')

In [11]:
# Rebuild the validation dataset on the full validation split
# (the earlier dataset_val covered only its first 500 rows).
dataset_val = MyDatasetForClassification(X_val, y_val, tokenizer)

In [12]:
def _predict_logits(net, loader):
    """Run `net` over every batch of `loader` and return all logits as one NumPy array.

    Args:
        net: model whose forward output exposes a ``.logits`` tensor.
        loader: DataLoader yielding dicts of tensors; a ``'labels'`` entry, if
            present, is ignored.

    Returns:
        np.ndarray: per-batch logits concatenated along axis 0, in loader order.
    """
    # Explicitly switch to inference mode so stochastic layers such as dropout are
    # disabled and the extracted features do not depend on leftover training state.
    net.eval()
    chunks = []
    with torch.no_grad():
        for batch in loader:
            # Labels are not needed to obtain logits; dropping them skips the loss computation.
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            chunks.append(net(**inputs).logits.cpu().numpy())
    return np.concatenate(chunks, axis=0)


# shuffle=False keeps the output rows aligned with the original sample order.
dataloader_train = DataLoader(dataset_train, batch_size=64, shuffle=False)
dataloader_val = DataLoader(dataset_val, batch_size=64, shuffle=False)
dataloader_test = DataLoader(dataset_test, batch_size=64, shuffle=False)

# Logits of the fine-tuned network become the first group of stacking features.
X_train_new_1 = _predict_logits(model, dataloader_train)
X_val_new_1 = _predict_logits(model, dataloader_val)
X_test_new_1 = _predict_logits(model, dataloader_test)

X_train_new_1.shape, X_val_new_1.shape, X_test_new_1.shape

# Logistic regression

In [13]:
class DummyTransformer(TransformerMixin, BaseEstimator):
    """Pass-through transformer that returns its input features unchanged.

    Inheriting from ``BaseEstimator`` provides ``get_params`` / ``set_params``
    derived from the constructor signature, replacing the previous hand-written
    ``get_params`` (which had no ``set_params`` counterpart).

    Args:
        value: unused placeholder hyper-parameter, kept for backward compatibility.
    """

    def __init__(self, value=None):
        self.value = value

    def fit(self, *_):
        """No-op fit; accepts and ignores any positional arguments (e.g. X, y)."""
        return self

    def transform(self, X):
        """Return ``X`` as-is."""
        return X

In [15]:
def _stem_text(text):
    """Tokenize `text`, blank out tokens found in string.punctuation, stem the rest, re-join with single spaces."""
    stems = [stemmer.stem(word) if word not in string.punctuation else '' for word in word_tokenize(text)]
    return re.sub(' +', ' ', ' '.join(stems))


Xy_train_val = pd.read_csv('../input/headhunter/data/data/train.csv', index_col='review_id').fillna('Unknown')
# .copy(): the feature columns below are added to an independent frame rather than to a
# slice of Xy_train_val (avoids SettingWithCopy ambiguity).
X_train_val, y_train_val = Xy_train_val.iloc[:, :-1].copy(), Xy_train_val.iloc[:, -1]

mb = MultiLabelBinarizer(classes=[str(i) for i in range(9)])
y_train_val = mb.fit_transform(y_train_val)

X_test = pd.read_csv('../input/headhunter/data/data/test.csv', index_col='review_id').fillna('Unknown')

# (text column, suffix used in derived feature names)
text_cols = [('positive', 'pos'), ('negative', 'neg')]

cols = ['salary_rating', 'team_rating', 'managment_rating', 
        'career_rating', 'workplace_rating', 'rest_recovery_rating']

# (feature prefix, per-text function, upper cap)
capped_text_features = [
    # text length rounded to the nearest ten
    ('length', lambda x: round(len(x), -1), 1000),
    # length of the longest space-separated token
    ('max', lambda x: max(len(w) for w in x.split(' ')), 25),
    # number of occurrences of the most frequent token
    ('most_common', lambda x: Counter(x.split(' ')).most_common(1)[0][1], 25),
]

# Categories seen at least 5 times in the TRAINING data. The same set is applied to both
# frames so that train and test are collapsed identically; previously the counts were
# recomputed on the test frame, giving a different mapping there.
frequent_values = {}
for col in ['city', 'position']:
    counts = X_train_val[col].value_counts()
    frequent_values[col] = counts[counts >= 5].index

# NOTE: the pipeline built later addresses columns by position, so the order in which
# columns are added here must not change.
for data in [X_train_val, X_test]:

    for text_col, tag in text_cols:
        # flag texts containing a non-breaking space
        data[f'xa_symbol_{tag}'] = (data[text_col].str.find('\xa0') != -1).astype(int)

        # small preprocessing: put a space after commas and periods, squeeze repeated spaces.
        # regex=False makes the literal replacement explicit ('.' must not act as a wildcard).
        data[text_col] = (
            data[text_col]
            .str.replace(',', ', ', regex=False)
            .str.replace('.', '. ', regex=False)
            .apply(lambda x: re.sub(' +', ' ', x))
        )

    # capped per-text statistics, added as <prefix>_pos then <prefix>_neg for each prefix
    for prefix, func, cap in capped_text_features:
        for text_col, tag in text_cols:
            data[f'{prefix}_{tag}'] = data[text_col].apply(func).clip(upper=cap)

    # collapse rare / unseen categories into a single bucket
    for col, keep in frequent_values.items():
        data.loc[~data[col].isin(keep), col] = 'Прочее'

    # rating profile: how many ratings equal each grade 1..5, plus mean and std
    ratings = data.loc[:, cols]
    for i in range(1, 5+1):
        data[f'count_{i}'] = (ratings == i).sum(axis=1)

    data['rating_mean'] = ratings.mean(axis=1)
    data['rating_std'] = ratings.std(axis=1)

    # stemmed versions of both texts (must remain the last two columns)
    for text_col, _tag in text_cols:
        data[f'{text_col}_stem'] = data[text_col].apply(_stem_text)

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape

In [16]:
# (target value, min_df) pairs: one n-gram vocabulary is collected per listed target
# from the stemmed positive texts of rows whose target equals that value exactly.
vocab_specs = [('2', 3), ('4', 3), ('5', 5), ('7', 10)]

set1, set2, set3, set4 = (
    TfidfVectorizer(ngram_range=(1, 3), max_df=1.0, min_df=min_df, stop_words=stopwords, analyzer='word')
    .fit(X_train_val.loc[Xy_train_val.target == label, 'positive_stem'])
    .vocabulary_.keys()
    for label, min_df in vocab_specs
)

# Union of the four vocabularies.
set_words_pos = set().union(set1, set2, set3, set4)

len(set1), len(set2), len(set3), len(set4), len(set_words_pos)

In [17]:
# (target value, min_df) pairs: one n-gram vocabulary is collected per listed target
# from the stemmed negative texts of rows whose target equals that value exactly.
vocab_specs = [('2', 3), ('4', 3), ('5', 5), ('7', 10)]

set1, set2, set3, set4 = (
    TfidfVectorizer(ngram_range=(1, 3), max_df=1.0, min_df=min_df, stop_words=stopwords, analyzer='word')
    .fit(X_train_val.loc[Xy_train_val.target == label, 'negative_stem'])
    .vocabulary_.keys()
    for label, min_df in vocab_specs
)

# Union of the four vocabularies.
set_words_neg = set().union(set1, set2, set3, set4)

len(set1), len(set2), len(set3), len(set4), len(set_words_neg)

In [18]:
# Columns are addressed by position: 2 and 3 are the raw texts, the last two are the
# stemmed texts, the two before those are passed through unchanged, and everything
# else is one-hot encoded.
n_cols = X_val.shape[1]

char_ngram_args = dict(ngram_range=(1, 4), max_df=0.999, min_df=0.001, analyzer='char_wb')
word_ngram_args = dict(ngram_range=(1, 3), analyzer='word', binary=True, stop_words=stopwords)

column_steps = [
    ('ohe', OneHotEncoder(handle_unknown='ignore'), [0, 1, *range(4, n_cols - 4)]),
    ('two_features', DummyTransformer(), [n_cols - 4, n_cols - 3]),
    ('tfidf1', TfidfVectorizer(**char_ngram_args), 2),
    ('tfidf2', TfidfVectorizer(**char_ngram_args), 3),
    ('count1', CountVectorizer(binary=True, **char_ngram_args), 2),
    ('count2', CountVectorizer(binary=True, **char_ngram_args), 3),
    ('count3', CountVectorizer(vocabulary=set_words_pos, **word_ngram_args), n_cols - 2),
    ('count4', CountVectorizer(vocabulary=set_words_neg, **word_ngram_args), n_cols - 1),
]

# One regularized logistic regression per label on top of the combined features.
logreg = LogisticRegression(C=0.01, max_iter=500, n_jobs=-1, random_state=42)

pipeline = Pipeline([
    ('transforms', ColumnTransformer(column_steps)),
    ('lr', OneVsRestClassifier(logreg)),
])
pipeline.fit(X_train, y_train)

joblib.dump(pipeline, 'model_logreg_final')

val_f1 = f1_score(y_val, pipeline.predict(X_val), average='samples')
print(round(val_f1, 3))

In [19]:
# Per-label probabilities of the logistic-regression pipeline: second group of stacking features.
X_train_new_2, X_val_new_2, X_test_new_2 = (
    pipeline.predict_proba(part) for part in (X_train, X_val, X_test)
)

X_train_new_2.shape, X_val_new_2.shape, X_test_new_2.shape

# RF

In [20]:
# Place the network logits and the logistic-regression probabilities side by side.
X_train_new = np.hstack([X_train_new_1, X_train_new_2])
X_val_new = np.hstack([X_val_new_1, X_val_new_2])
X_test_new = np.hstack([X_test_new_1, X_test_new_2])

X_train_new.shape, X_val_new.shape, X_test_new.shape

In [22]:
# Sweep the random-forest depth and report samples-F1 on the train and validation features.
for depth in range(3, 8):
    print(depth)
    model = OneVsRestClassifier(
        RandomForestClassifier(max_depth=depth, n_jobs=-1, random_state=42)
    )
    model.fit(X_train_new, y_train)
    for tag, features, target in (('Train: ', X_train_new, y_train), ('Valid: ', X_val_new, y_val)):
        print(tag, round(f1_score(target, model.predict(features), average='samples'), 3))
    print()

In [23]:
# Same depth sweep with extra-trees as the per-label base estimator.
for depth in range(3, 8):
    print(depth)
    model = OneVsRestClassifier(
        ExtraTreesClassifier(max_depth=depth, n_jobs=-1, random_state=42)
    )
    model.fit(X_train_new, y_train)
    for tag, features, target in (('Train: ', X_train_new, y_train), ('Valid: ', X_val_new, y_val)):
        print(tag, round(f1_score(target, model.predict(features), average='samples'), 3))
    print()

In [25]:
# Deeper extra-trees sweep, this time with bootstrap sampling enabled.
for depth in range(6, 11):
    print(depth)
    model = OneVsRestClassifier(
        ExtraTreesClassifier(max_depth=depth, bootstrap=True, n_jobs=-1, random_state=42)
    )
    model.fit(X_train_new, y_train)
    for tag, features, target in (('Train: ', X_train_new, y_train), ('Valid: ', X_val_new, y_val)):
        print(tag, round(f1_score(target, model.predict(features), average='samples'), 3))
    print()

In [29]:
# Grid over forest size and depth for the bootstrapped extra-trees model.
for n_trees in [100, 150, 500]:
    for depth in [7, 10]:
        print(n_trees, depth)
        model = OneVsRestClassifier(
            ExtraTreesClassifier(n_estimators=n_trees, max_depth=depth, bootstrap=True,
                                 n_jobs=-1, random_state=42)
        )
        model.fit(X_train_new, y_train)
        for tag, features, target in (('Train: ', X_train_new, y_train), ('Valid: ', X_val_new, y_val)):
            print(tag, round(f1_score(target, model.predict(features), average='samples'), 3))
        print()

In [32]:
# Stack the train and validation rows (features and labels) for the final fit.
X_train_val_new = np.vstack([X_train_new, X_val_new])
y_train_val_new = np.vstack([y_train, y_val])

X_train_val_new.shape, y_train_val_new.shape

In [33]:
# Final stacking model: one bootstrapped extra-trees forest per label, fitted on train + validation.
final_forest = ExtraTreesClassifier(
    n_estimators=500, max_depth=7, bootstrap=True, n_jobs=-1, random_state=42
)
model = OneVsRestClassifier(final_forest)
model.fit(X_train_val_new, y_train_val_new)

joblib.dump(model, 'model_forest_final')

In [34]:
def predict_multilabel(model, X, binarizer=None):
    """Predict a comma-separated label string for every row, never returning an empty one.

    Args:
        model: fitted multi-label classifier exposing ``predict`` (indicator
            matrix) and ``predict_proba`` (one probability column per label).
        X: feature matrix accepted by ``model``.
        binarizer: fitted label binarizer providing ``inverse_transform`` and
            ``classes_``. Defaults to the notebook-level ``mb``.

    Returns:
        np.ndarray of str: for each row, the predicted labels joined by ``','``;
        when no label is predicted, the single most probable label instead.
    """
    if binarizer is None:
        binarizer = mb

    label_sets = binarizer.inverse_transform(model.predict(X))
    joined = [','.join(labels) for labels in label_sets]

    # Map the arg-max column back through classes_ rather than using the column index
    # itself as the label; the two only coincide when the classes are '0'..'n-1'.
    top1_idx = model.predict_proba(X).argmax(axis=1)
    top1 = np.asarray(binarizer.classes_)[top1_idx].astype(str)

    # Build the result from strings only (no implicit int -> str array promotion).
    return np.array(
        [text if text else fallback for text, fallback in zip(joined, top1)],
        dtype=str,
    )

In [35]:
# Write the submission file: one predicted label string per test review.
submission = pd.DataFrame({
    'review_id': X_test.index,
    'target': predict_multilabel(model, X_test_new),
})
submission.to_csv('answers.csv', index=False)