For the baseline solution I used a simple TF-IDF model.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from tqdm.notebook import tqdm

from warnings import filterwarnings
filterwarnings("ignore")

## Load data

In [None]:
# Read the train, test and sample-submission tables (in that order).
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/HeadHunter_train.csv",
        "data/HeadHunter_test.csv",
        "data/HeadHunter_sample_submit.csv",
    )
)

# Sentence arrays stored as .npy files; presumably produced by an earlier
# preprocessing step and aligned with the train rows — TODO confirm.
positive_sentences, negative_sentences = (
    np.load(npy_path)
    for npy_path in (
        "data/positive_sentences.npy",
        "data/negative_sentences.npy",
    )
)

print(f"Train shape: {train.shape} | Test shape: {test.shape}")

## Model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from shared import calculate_metrics

import optuna

In [None]:
# data
# Build one document per sample by joining its positive and negative text with
# a single space. The arrays are paired positionally, so they must have the
# same length; fail loudly instead of silently ignoring surplus entries.
if len(positive_sentences) != len(negative_sentences):
    raise ValueError(
        "positive_sentences and negative_sentences differ in length: "
        f"{len(positive_sentences)} != {len(negative_sentences)}"
    )
text = np.array(
    [pos + " " + neg for pos, neg in zip(positive_sentences, negative_sentences)]
)

# target
# Drop rows where both "positive" and "negative" are missing (how="all").
train.dropna(subset=["positive", "negative"], inplace=True, how="all")
# "target" is split on commas and only the first piece is kept, cast to int.
train["preprocessed_target"] = train["target"].str.split(",").str[0].astype(int)
target = train["preprocessed_target"].values

# NOTE(review): `text` comes from the .npy arrays while `target` comes from the
# filtered `train` frame; presumably the arrays were generated after the same
# filtering so rows line up — confirm (the shapes printed below should match).
print(f"Input shape: {text.shape}, Target shape : {target.shape}")

In [None]:
# split
# Fix the seed so the hold-out set — and every metric computed on it — is the
# same on each run; without it the shuffle differs between executions.
# The test fraction is left at train_test_split's default.
text_train, text_test, target_train, target_test = train_test_split(
    text, target, random_state=42
)
print(f"Train shape: {text_train.shape}, Test shape: {text_test.shape}")

In [None]:
def objective(trial):
    """Optuna objective: fit a TF-IDF + one-vs-rest logistic regression and score it.

    Samples ``C``, ``max_df`` and the upper n-gram size from ``trial``, fits
    the pipeline on the module-level ``text_train`` / ``target_train`` and
    evaluates its predictions on ``text_test`` / ``target_test``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        The first value returned by ``calculate_metrics`` (presumably
        accuracy — confirm in ``shared``); the study maximizes it.
    """
    # params
    C = trial.suggest_float("C", 1e-10, 3)
    # NOTE(review): an integer max_df is an absolute document count, so 1..5
    # keeps only terms that occur in at most that many documents — confirm
    # that min_df was not the intended knob.
    max_df = trial.suggest_int("max_df", 1, 5)
    max_ngram = trial.suggest_int("ngram_range", 1, 5)

    # train
    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1, max_ngram), max_df=max_df)),
        ("clf", OneVsRestClassifier(LogisticRegression(max_iter=1000, C=C))),
    ])
    pipeline.fit(text_train, target_train)

    # metrics
    # NOTE(review): hyper-parameters are tuned on the same split that is used
    # for the final score, so that score is likely optimistic.
    preds = pipeline.predict(text_test)
    # Only the first metric drives the search; the other two are discarded.
    acc, _, _ = calculate_metrics(y_true=target_test, y_pred=preds)

    return acc


# Seed the sampler so the sequence of suggested hyper-parameters is repeatable
# across runs (TPESampler is Optuna's default single-objective sampler).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

In [None]:
# Best hyper-parameters found by the study (shown as the cell's output).
study.best_params

In [None]:
%%time
# Rebuild the pipeline with the tuned hyper-parameters, refit it on the
# training split and score its predictions on the hold-out split.
tuned_params = study.best_params
tfidf_step = TfidfVectorizer(
    ngram_range=(1, tuned_params["ngram_range"]),
    max_df=tuned_params["max_df"],
)
clf_step = OneVsRestClassifier(
    LogisticRegression(max_iter=1000, C=tuned_params["C"])
)
pipeline = Pipeline([("tfidf", tfidf_step), ("clf", clf_step)])
pipeline.fit(text_train, target_train)

preds = pipeline.predict(text_test)
acc, pr, rc = calculate_metrics(y_true=target_test, y_pred=preds)

In [None]:
# check distribution
# Show (unique labels, counts) for the predictions first, then for the true
# hold-out labels, so the two class distributions can be compared.
for labels in (preds, target_test):
    display(np.unique(labels, return_counts=True))