For baseline solution I used simple tf-idf model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from tqdm.notebook import tqdm

from warnings import filterwarnings
filterwarnings("ignore")

## Load data

In [2]:
train = pd.read_csv("data/HeadHunter_train.csv")
test = pd.read_csv("data/HeadHunter_test.csv")
sample_submission = pd.read_csv("data/HeadHunter_sample_submit.csv")

positive_sentences = np.load("data/positive_sentences.npy")
negative_sentences = np.load("data/negative_sentences.npy")

print(f"Train shape: {train.shape} | Test shape: {test.shape}")

Train shape: (50876, 12) | Test shape: (50651, 11)


## Model

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from shared import calculate_metrics

import optuna

In [4]:
# data
text = [positive_sentences[i] + " " + negative_sentences[i] for i in range(len(positive_sentences))]
text = np.array(text)

# target
train.dropna(subset=["positive", "negative"], inplace=True, how="all")
train["preprocessed_target"] = train["target"].str.split(",").apply(lambda x: x[0]).astype(int)
target = train["preprocessed_target"].values

print(f"Input shape: {text.shape}, Target shape : {target.shape}")

Input shape: (50837,), Target shape : (50837,)


In [5]:
# split
text_train, text_test, target_train, target_test = train_test_split(text, target)
print(f"Train shape: {text_train.shape}, Test shape: {text_test.shape}")

Train shape: (38127,), Test shape: (12710,)


In [7]:
def objective(trial):

    # params
    C = trial.suggest_float("C", 1e-10, 3)
    max_df = trial.suggest_int("max_df", 1, 5)
    ngram_range = trial.suggest_int("ngram_range", 1, 5)
    
    # train
    pipeline = Pipeline([("tfidf", TfidfVectorizer(ngram_range=(1,ngram_range), max_df=max_df)),
                         ("clf", OneVsRestClassifier(LogisticRegression(max_iter=1000, C=C)))])
    pipeline.fit(text_train, target_train)
    
    # metrics
    preds = pipeline.predict(text_test)
    acc, pr, rc = calculate_metrics(y_true=target_test, y_pred=preds)
    
    return acc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

[32m[I 2022-01-24 20:32:24,463][0m A new study created in memory with name: no-name-9731fba2-bbbd-48f3-9b24-b8122661f2db[0m
[32m[I 2022-01-24 20:33:43,140][0m Trial 0 finished with value: 0.611 and parameters: {'C': 2.2209704003741435, 'max_df': 2, 'ngram_range': 5}. Best is trial 0 with value: 0.611.[0m


Accuracy: 0.611, Precision: 0.314, Recall: 0.15


[32m[I 2022-01-24 20:34:23,246][0m Trial 1 finished with value: 0.479 and parameters: {'C': 0.26351563639139375, 'max_df': 3, 'ngram_range': 4}. Best is trial 0 with value: 0.611.[0m


Accuracy: 0.479, Precision: 0.157, Recall: 0.112


[32m[I 2022-01-24 20:35:13,209][0m Trial 2 finished with value: 0.596 and parameters: {'C': 1.3473065154012838, 'max_df': 2, 'ngram_range': 4}. Best is trial 0 with value: 0.611.[0m


Accuracy: 0.596, Precision: 0.148, Recall: 0.145


[32m[I 2022-01-24 20:36:14,851][0m Trial 3 finished with value: 0.563 and parameters: {'C': 0.6892056295381991, 'max_df': 4, 'ngram_range': 5}. Best is trial 0 with value: 0.611.[0m


Accuracy: 0.563, Precision: 0.155, Recall: 0.135


[32m[I 2022-01-24 20:36:23,689][0m Trial 4 finished with value: 0.604 and parameters: {'C': 2.347587118220292, 'max_df': 1, 'ngram_range': 2}. Best is trial 0 with value: 0.611.[0m


Accuracy: 0.604, Precision: 0.143, Recall: 0.148


[32m[I 2022-01-24 20:37:35,822][0m Trial 5 finished with value: 0.651 and parameters: {'C': 1.27223342618966, 'max_df': 5, 'ngram_range': 5}. Best is trial 5 with value: 0.651.[0m


Accuracy: 0.651, Precision: 0.267, Recall: 0.162


[32m[I 2022-01-24 20:37:37,398][0m Trial 6 finished with value: 0.537 and parameters: {'C': 2.985810789269549, 'max_df': 2, 'ngram_range': 1}. Best is trial 5 with value: 0.651.[0m


Accuracy: 0.537, Precision: 0.167, Recall: 0.13


[32m[I 2022-01-24 20:38:32,956][0m Trial 7 finished with value: 0.509 and parameters: {'C': 1.1829731338963059, 'max_df': 1, 'ngram_range': 5}. Best is trial 5 with value: 0.651.[0m


Accuracy: 0.509, Precision: 0.143, Recall: 0.121


[32m[I 2022-01-24 20:40:02,604][0m Trial 8 finished with value: 0.656 and parameters: {'C': 2.9370071627539214, 'max_df': 4, 'ngram_range': 5}. Best is trial 8 with value: 0.656.[0m


Accuracy: 0.656, Precision: 0.358, Recall: 0.165


[32m[I 2022-01-24 20:41:02,941][0m Trial 9 finished with value: 0.646 and parameters: {'C': 2.751929841828929, 'max_df': 3, 'ngram_range': 4}. Best is trial 8 with value: 0.656.[0m


Accuracy: 0.646, Precision: 0.351, Recall: 0.162


[32m[I 2022-01-24 20:41:38,151][0m Trial 10 finished with value: 0.663 and parameters: {'C': 1.9962993759573195, 'max_df': 5, 'ngram_range': 3}. Best is trial 10 with value: 0.663.[0m


Accuracy: 0.663, Precision: 0.358, Recall: 0.167


[32m[I 2022-01-24 20:42:11,706][0m Trial 11 finished with value: 0.664 and parameters: {'C': 1.9524151524531084, 'max_df': 5, 'ngram_range': 3}. Best is trial 11 with value: 0.664.[0m


Accuracy: 0.664, Precision: 0.358, Recall: 0.168


[32m[I 2022-01-24 20:42:46,087][0m Trial 12 finished with value: 0.664 and parameters: {'C': 1.9219501000269013, 'max_df': 5, 'ngram_range': 3}. Best is trial 11 with value: 0.664.[0m


Accuracy: 0.664, Precision: 0.358, Recall: 0.168


[32m[I 2022-01-24 20:43:19,949][0m Trial 13 finished with value: 0.665 and parameters: {'C': 1.810072362948492, 'max_df': 5, 'ngram_range': 3}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.665, Precision: 0.373, Recall: 0.167


[32m[I 2022-01-24 20:43:32,556][0m Trial 14 finished with value: 0.647 and parameters: {'C': 1.7616087988007167, 'max_df': 4, 'ngram_range': 2}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.647, Precision: 0.295, Recall: 0.162


[32m[I 2022-01-24 20:43:43,755][0m Trial 15 finished with value: 0.648 and parameters: {'C': 0.8844562737606815, 'max_df': 5, 'ngram_range': 2}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.648, Precision: 0.152, Recall: 0.16


[32m[I 2022-01-24 20:44:17,277][0m Trial 16 finished with value: 0.654 and parameters: {'C': 1.6157413831048346, 'max_df': 4, 'ngram_range': 3}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.654, Precision: 0.26, Recall: 0.164


[32m[I 2022-01-24 20:44:19,070][0m Trial 17 finished with value: 0.573 and parameters: {'C': 2.4774872657830977, 'max_df': 5, 'ngram_range': 1}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.573, Precision: 0.408, Recall: 0.148


[32m[I 2022-01-24 20:44:31,474][0m Trial 18 finished with value: 0.633 and parameters: {'C': 1.6907106556580616, 'max_df': 3, 'ngram_range': 2}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.633, Precision: 0.257, Recall: 0.158


[32m[I 2022-01-24 20:45:19,870][0m Trial 19 finished with value: 0.633 and parameters: {'C': 0.9849502876573251, 'max_df': 4, 'ngram_range': 4}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.633, Precision: 0.266, Recall: 0.156


[32m[I 2022-01-24 20:45:57,549][0m Trial 20 finished with value: 0.663 and parameters: {'C': 2.5940421308999033, 'max_df': 5, 'ngram_range': 3}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.663, Precision: 0.357, Recall: 0.168


[32m[I 2022-01-24 20:46:32,684][0m Trial 21 finished with value: 0.663 and parameters: {'C': 2.0467658629162, 'max_df': 5, 'ngram_range': 3}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.663, Precision: 0.358, Recall: 0.168


[32m[I 2022-01-24 20:47:05,821][0m Trial 22 finished with value: 0.664 and parameters: {'C': 1.95037911815183, 'max_df': 5, 'ngram_range': 3}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.664, Precision: 0.358, Recall: 0.168


[32m[I 2022-01-24 20:47:37,090][0m Trial 23 finished with value: 0.653 and parameters: {'C': 1.4891536244601442, 'max_df': 4, 'ngram_range': 3}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.653, Precision: 0.261, Recall: 0.163


[32m[I 2022-01-24 20:47:50,795][0m Trial 24 finished with value: 0.656 and parameters: {'C': 2.1780300947325495, 'max_df': 5, 'ngram_range': 2}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.656, Precision: 0.29, Recall: 0.166


[32m[I 2022-01-24 20:48:47,141][0m Trial 25 finished with value: 0.654 and parameters: {'C': 1.7659726401983575, 'max_df': 4, 'ngram_range': 4}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.654, Precision: 0.373, Recall: 0.164


[32m[I 2022-01-24 20:49:20,575][0m Trial 26 finished with value: 0.664 and parameters: {'C': 1.9092752280911105, 'max_df': 5, 'ngram_range': 3}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.664, Precision: 0.358, Recall: 0.168


[32m[I 2022-01-24 20:49:51,387][0m Trial 27 finished with value: 0.641 and parameters: {'C': 1.4872587230285013, 'max_df': 3, 'ngram_range': 3}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.641, Precision: 0.26, Recall: 0.159


[32m[I 2022-01-24 20:50:05,584][0m Trial 28 finished with value: 0.656 and parameters: {'C': 2.3403352359884884, 'max_df': 5, 'ngram_range': 2}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.656, Precision: 0.29, Recall: 0.166


[32m[I 2022-01-24 20:51:07,010][0m Trial 29 finished with value: 0.656 and parameters: {'C': 2.202621156239747, 'max_df': 4, 'ngram_range': 4}. Best is trial 13 with value: 0.665.[0m


Accuracy: 0.656, Precision: 0.358, Recall: 0.165


In [8]:
study.best_params

{'C': 1.810072362948492, 'max_df': 5, 'ngram_range': 3}

In [12]:
%%time
pipeline = Pipeline([("tfidf", TfidfVectorizer(ngram_range=(1,study.best_params["ngram_range"]),
                                               max_df=study.best_params["max_df"])),
                     ("clf", OneVsRestClassifier(LogisticRegression(max_iter=1000,
                                                                    C=study.best_params["C"])))])
pipeline.fit(text_train, target_train)
preds = pipeline.predict(text_test)
acc, pr, rc = calculate_metrics(y_true=target_test, y_pred=preds)

Accuracy: 0.665, Precision: 0.373, Recall: 0.167
CPU times: user 2min 27s, sys: 948 ms, total: 2min 28s
Wall time: 33.3 s


In [13]:
# check distribution
display(np.unique(preds, return_counts=True))
display(np.unique(target_test, return_counts=True))

(array([0, 1, 3, 8]), array([4752,    1,    6, 7951]))

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([5282,  765,    1,  263,   23,   61,  194,   95, 6026]))