In [None]:
import os
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from tqdm.notebook import tqdm

from natasha import Doc, MorphVocab, Segmenter, NewsEmbedding, NewsMorphTagger

In [None]:
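# One-time setup: uncomment and run these lines if the NLTK resources are not
# installed yet (the "stopwords" corpus is needed for stop-word removal below).
# nltk.download('punkt')
# nltk.download('stopwords')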

## Load data

In [None]:
# Read the three competition CSV files; paths are relative to the notebook's
# working directory.
csv_paths = (
    "../data/HeadHunter_train.csv",
    "../data/HeadHunter_test.csv",
    "../data/HeadHunter_sample_submit.csv",
)
train, test, sample_submission = (pd.read_csv(path) for path in csv_paths)

# Quick sanity check of the row/column counts.
print(f"Train shape: {train.shape} | Test shape: {test.shape}")

## EDA

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Share of the five most frequent target values.
# value_counts(normalize=True) yields fractions in [0, 1]; they are scaled by
# 100 so that the bar heights really are percentages, as the axis label says.
top_targets = train["target"].value_counts(normalize=True).head(5) * 100

plt.bar(top_targets.index.tolist(), top_targets.values)
plt.ylabel("Percent of target")
plt.xlabel("Target Name")
plt.title("Target distribution")
plt.show()

In [None]:
# metadata distribution
# `metadata_columns` is a notebook-level name that later cells reuse.
metadata_columns = ["salary_rating", "team_rating", "managment_rating", "career_rating",
                    "workplace_rating", "rest_recovery_rating"]
for feature in metadata_columns:
    # Compute the distribution once per column and scale the fractions to
    # percent so the bar heights agree with the "Percent of ..." axis label.
    shares = train[feature].value_counts(normalize=True) * 100
    plt.bar(shares.index.tolist(), shares.values)
    plt.ylabel(f"Percent of {feature}")
    plt.xlabel(f"{feature}")
    plt.title(f"{feature} distribution")
    plt.show()

In [None]:
# correlation plot
# The target string is split on "," and only the first piece is kept, cast to
# int, so that it can be correlated with the numeric rating columns.
first_label = train["target"].str.split(",").str[0]
train["preprocessed_target"] = first_label.astype(int)

corr_columns = metadata_columns + ["preprocessed_target"]
corr = train[corr_columns].corr()

fig, ax = plt.subplots(figsize=(13, 7))
sns.heatmap(corr, vmin=-1, vmax=1, annot=True, ax=ax)
ax.set_title("Metadata and Target correlation")
plt.show()

In [None]:
# metadata by target
# Row-wise sum of all rating columns, then summary statistics of that sum per
# target value; the seven targets with the highest mean are shown.
train["metadata_sum"] = train[metadata_columns].sum(axis=1)

summary_stats = ["mean", "median", "max", "min", "count"]
(
    train.groupby(["target"])["metadata_sum"]
    .agg(summary_stats)
    .sort_values("mean", ascending=False)
    .head(7)
)

In [None]:
def _plot_top_categories(frame, column, xlabel, title, top_n=5):
    """Bar-plot the `top_n` most frequent values of `frame[column]`.

    Bar heights are percentages of rows: the normalised value counts are
    scaled by 100 so that they match the "Percent" axis label.
    """
    shares = frame[column].value_counts(normalize=True).head(top_n) * 100
    plt.bar(shares.index.tolist(), shares.values)
    plt.ylabel("Percent")
    plt.xlabel(xlabel)
    plt.xticks(rotation=-45)  # category names are long; tilt them to avoid overlap
    plt.title(title)
    plt.show()


# city
_plot_top_categories(train, "city", "City", "City distribution")

# position
_plot_top_categories(train, "position", "Position", "Position distribution")

In [None]:
# NA
# Boolean masks marking rows whose review text is missing.
positive_missing = train["positive"].isna()
negative_missing = train["negative"].isna()
separator = "-"*20

print("NaNs sum")
display(train.isna().sum())

# Target distribution among rows with a missing "positive" text, a missing
# "negative" text, and with both texts missing.
nan_reports = (
    ("NaNs Postive", positive_missing),
    ("NaNs Negative", negative_missing),
    ("NaNs Both", positive_missing & negative_missing),
)
for heading, mask in nan_reports:
    print(separator)
    print(heading)
    display(train[mask]["target"].value_counts())

# Test distribution
print("NaNs sum")
display(test.isna().sum())

In [None]:
# drop nans
# Rows where BOTH review texts are missing carry no text signal and are removed.
text_columns = ["positive", "negative"]
train.dropna(subset=text_columns, inplace=True, how="all")

# A row may still miss ONE of the two texts. Replace the remaining NaNs in
# both columns with an empty string: a float NaN left in "positive" would
# otherwise reach the text preprocessing step and fail on `.lower()`.
for column in text_columns:
    train.loc[train[column].isna(), column] = ""

In [None]:
# Runs of Cyrillic letters. "ё"/"Ё" lie outside the contiguous "а-я"/"А-Я"
# ranges, so they are listed explicitly; upper-case letters are included so
# that words stay intact when lowercasing is disabled.
_CYRILLIC_WORD_RE = re.compile(r"[а-яёА-ЯЁ]+")

# Lazily created resources shared by every preprocessing() call.
_RUSSIAN_STOPWORDS = None
_MORPH_TAGGER = None


def _russian_stopwords():
    """Return the NLTK Russian stop-word list as a frozenset, loading it once."""
    global _RUSSIAN_STOPWORDS
    if _RUSSIAN_STOPWORDS is None:
        _RUSSIAN_STOPWORDS = frozenset(nltk.corpus.stopwords.words("russian"))
    return _RUSSIAN_STOPWORDS


def _morph_tagger():
    """Return a NewsMorphTagger built from the notebook-level `emb`, creating it once."""
    global _MORPH_TAGGER
    if _MORPH_TAGGER is None:
        _MORPH_TAGGER = NewsMorphTagger(emb)
    return _MORPH_TAGGER


def preprocessing(sent: str, lowercase: bool = True,
                  remove_punctuation: bool = False, remove_stopwords: bool = False,
                  lemmatize: bool = False) -> str:
    """Normalise one review text.

    The enabled steps run in this fixed order: lowercase -> keep Cyrillic
    words only -> drop stop words -> lemmatize.

    Args:
        sent: Raw text. ``None`` or a float NaN (a missing pandas value) is
            treated as an empty text.
        lowercase: Lower-case the text.
        remove_punctuation: Keep only runs of Cyrillic letters joined by single
            spaces; punctuation, digits and non-Cyrillic characters are dropped.
        remove_stopwords: Drop whitespace-separated tokens found in the NLTK
            Russian stop-word list (the "stopwords" corpus must be downloaded).
        lemmatize: Replace every token by its lemma using natasha. Relies on
            the notebook-level objects `segmenter`, `emb` and `morph_vocab`
            existing at call time.

    Returns:
        The processed text as a single string.
    """
    # Missing values arrive from pandas as None / NaN, not as strings.
    if sent is None or (isinstance(sent, float) and sent != sent):
        return ""

    # lowercase
    if lowercase:
        sent = sent.lower()

    # remove_punctuation
    if remove_punctuation:
        sent = " ".join(_CYRILLIC_WORD_RE.findall(sent))

    # remove_stopwords
    if remove_stopwords:
        stopwords = _russian_stopwords()  # set membership instead of a list scan
        sent = " ".join([w for w in sent.split() if w not in stopwords])

    # lemmatize
    if lemmatize:
        doc = Doc(sent)
        # Segmentation
        doc.segment(segmenter)

        # Morphology: reuse one tagger instead of constructing it per sentence
        doc.tag_morph(_morph_tagger())

        # Lemmatization
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
        sent = " ".join([w.lemma for w in doc.tokens])

    return sent

In [None]:
# grab sentences
positive_sentences_raw = train["positive"].tolist()
negative_sentences_raw = train["negative"].tolist()

# natasha utils (read as globals by preprocessing() when lemmatize=True)
segmenter = Segmenter()
emb = NewsEmbedding()
morph_vocab = MorphVocab()

# Every preprocessing step is enabled for both text columns.
preprocessing_options = dict(lowercase=True, remove_punctuation=True,
                             remove_stopwords=True, lemmatize=True)

# preprocess
positive_sentences = [
    preprocessing(raw_text, **preprocessing_options)
    for raw_text in tqdm(positive_sentences_raw)
]

negative_sentences = [
    preprocessing(raw_text, **preprocessing_options)
    for raw_text in tqdm(negative_sentences_raw)
]

In [None]:
# save to npy
positive_sentences = np.array(positive_sentences)
negative_sentences = np.array(negative_sentences)

# np.save does not create missing directories, so make sure the target exists.
# NOTE(review): the raw CSVs are read from "../data" while the arrays go to
# "data" (relative to the working directory) - confirm this is intended.
os.makedirs("data", exist_ok=True)

# np.save appends the ".npy" extension to these names.
np.save("data/positive_sentences", positive_sentences)
np.save("data/negative_sentences", negative_sentences)

In [None]:
def _plot_word_frequency(sentences, title, top_n=30):
    """Bar-plot the `top_n` most frequent whitespace-separated words.

    Returns the full word -> count dict, ordered by descending count.
    """
    counts = nltk.FreqDist([word for text in sentences for word in text.split()])
    ranked = dict(sorted(dict(counts).items(), key=lambda pair: pair[1], reverse=True))

    top_words = list(ranked.keys())[:top_n]
    top_counts = list(ranked.values())[:top_n]

    plt.figure(figsize=(16, 6))
    plt.bar(top_words, top_counts)
    plt.title(title)
    plt.xticks(rotation=45)
    plt.xlabel("Words")
    plt.ylabel("Freq")
    plt.show()
    return ranked


# Words Frequency Positive
wordfreq = _plot_word_frequency(positive_sentences, "Words frequency Positive")

# Words Frequency Negative
wordfreq = _plot_word_frequency(negative_sentences, "Words frequency Negative")

1) We don't need rows where both "positive" and "negative" are missing\
2) More than 80% of the targets are 0 or 8, so it would be great to train a binary classification model\
3) The targets are multilabel, but I'm not sure that we can submit multilabel predictions\
4) We need to somehow combine the "positive", "negative" and "metadata" columns