In [1]:
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import fasttext.util
import numpy as np
import ast
from collections import Counter

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from catboost import CatBoostRegressor

from StemLemPipe import phrases2lower, phrases_without_excess_symbols, phrases_transform, text2sentences, split_by_words, sentence_split, create_stemmer_lemmer, words_to_ngrams_list, sum_phrases, wordlist2set, stopwords, StemLemPipeline

import nltk
import spacy
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *

In [2]:
def get_emb_dataframe(custom_position, dim_size, model_path='cc.ru.300.bin'):
    """Embed each phrase as the mean of its fastText word vectors.

    Parameters
    ----------
    custom_position : iterable of str
        Phrases to embed; each phrase is split on whitespace into words.
    dim_size : int
        Width of the returned embedding. The loaded model is reduced to this
        size when it is wider than ``dim_size``.
    model_path : str, optional
        Path of the fastText binary model to load.

    Returns
    -------
    pandas.DataFrame
        One row per phrase, in input order, with columns
        ``feat_prof_1`` .. ``feat_prof_<dim_size>``.
    """
    ft = fasttext.load_model(model_path)

    # Reducing is only meaningful when the model is wider than requested.
    if ft.get_dimension() > dim_size:
        fasttext.util.reduce_model(ft, dim_size)

    vector_professions = []
    for phrase in custom_position:
        # split() without a separator drops the empty tokens that repeated,
        # leading or trailing spaces would produce; those would otherwise be
        # embedded and averaged in as if they were real words.
        tokens = phrase.split()
        if tokens:
            vector = np.mean([ft.get_word_vector(token) for token in tokens], axis=0).tolist()
        else:
            # A phrase with no words gets an all-zero row so the frame keeps
            # one row per input phrase.
            vector = [0.0] * dim_size
        vector_professions.append(vector)

    column_names = [f"feat_prof_{c}" for c in range(1, dim_size+1)]
    df_vector_professions = pd.DataFrame(vector_professions, columns=column_names)

    return df_vector_professions

def cv_fit_predict(data, cols, text_col_name):
    """Cross-validate a CatBoost regressor for ``salary_from`` and print the MAE.

    Features are the selected tabular columns plus two embedding blocks: the
    fastText embedding of ``text_col_name`` (via ``get_emb_dataframe``) and
    the vectors parsed from the ``vector_skills`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame. It is not modified.
    cols : list of str
        Columns to keep. Must include ``salary_from``, ``vector_skills`` and
        ``vector_professions``, which are read and/or dropped below.
    text_col_name : str
        Column holding the text to embed; it is added to the selection when
        ``cols`` does not already list it.

    Returns
    -------
    None
        Prints the per-fold MAE (rounded to 1 decimal) and the mean MAE
        (rounded to 2 decimals).
    """
    # NOTE(review): the lists stored in `vector_skills` must contain exactly
    # this many values, otherwise building the skill frame fails.
    dim = 100

    df_vector_professions = get_emb_dataframe(data[text_col_name].values.tolist(), dim)

    # Select the columns named by the caller rather than a notebook global.
    selected_cols = list(cols)
    if text_col_name not in selected_cols:
        selected_cols.append(text_col_name)

    # Both frames get a fresh positional index so the axis=1 concat aligns
    # them row by row.
    data = data[selected_cols].reset_index(drop=True)
    df_vector_professions = df_vector_professions.reset_index(drop=True)

    data = pd.concat([data, df_vector_professions], axis=1)

    # `vector_skills` holds list literals stored as text; parse them into rows.
    skill_rows = data['vector_skills'].apply(ast.literal_eval).tolist()
    vector_skills = pd.DataFrame(skill_rows, columns=[f"feat_skill_{c}" for c in range(1, dim+1)])

    data = pd.concat([data, vector_skills], axis=1)

    y = data.salary_from
    X = data.drop(['salary_from', 'vector_skills', 'vector_professions', text_col_name], axis=1)

    num_folds = 5
    random_state = 42
    scoring = 'neg_mean_absolute_error'

    # 20% of the rows are held out and not used here; cross-validation runs
    # on the remaining 80% only.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=random_state)

    model = CatBoostRegressor(iterations=1000, verbose=0)

    kfold = KFold(n_splits=num_folds, random_state=random_state, shuffle=True)

    # The scorer returns negated MAE; flip the sign to report plain MAE.
    cv_results = -1 * cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)

    print([round(i,1) for i in cv_results])
    print(round(cv_results.mean(),2))
    
def str_replace(value):
    """Return ``value`` with every '/ка', '/ца' and '(-ца)' fragment turned into a space.

    The fragments are processed one after another, in that order.
    """
    cleaned = value
    for fragment in ('/ка', '/ца', '(-ca)'.replace('ca', 'ца')):
        # Splitting on the fragment and re-joining with a space replaces
        # every occurrence of it.
        cleaned = ' '.join(cleaned.split(fragment))
    return cleaned

def spacy_process(text):
    """Return ``text`` with each token replaced by its spaCy lemma.

    The text is run through the notebook-level ``nlp`` pipeline and the
    resulting lemmas are joined with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

def nltk_process(text):
    """Tokenize ``text``, stem every token, lemmatize the stems, and re-join.

    Tokens come from ``word_tokenize``, stems from the notebook-level
    ``s_stemmer``, and the final pass uses a ``WordNetLemmatizer``. The
    result is a single space-joined string.

    NOTE(review): the lemmatizer runs on already-stemmed tokens, and WordNet
    is presumably English-only, so this last pass may leave Russian stems
    unchanged. Confirm this is intended.
    """
    lemmatizer = WordNetLemmatizer()

    # Stem first, exactly one stem per token.
    stems = [s_stemmer.stem(token) for token in word_tokenize(text)]

    # Then lemmatize each stem and glue the pieces back together.
    lemmas = [lemmatizer.lemmatize(stem) for stem in stems]
    return ' '.join(lemmas)

In [3]:
# Columns handed to cv_fit_predict: tabular features, the two stored vector
# columns, and the regression target.
fit_cols = [
    'count_skills',
    # schedule_* columns
    'schedule_полный рабочий день',
    'schedule_свободный график',
    'schedule_сменный график',
    'schedule_удаленная работа',
    'schedule_частичная занятость',
    # education_name_* columns
    'education_name_высшее (бакалавр)',
    'education_name_любое',
    'education_name_неполное высшее',
    'education_name_среднее',
    'education_name_среднее профессиональное',
    # required_experience_* columns
    'required_experience_Нет опыта',
    'required_experience_От 1 года до 3 лет',
    'required_experience_От 3 до 6 лет',
    # stored vector columns (dropped again before fitting)
    'vector_skills',
    'vector_professions',
    # regression target
    'salary_from',
]

In [4]:
# Load the dataset from my_data.csv, using the file's first column as the index.
data = pd.read_csv('my_data.csv', index_col=0)

In [5]:
# Size of the loaded frame as (rows, columns).
data.shape

(69997, 30)

In [6]:
%%time
# Baseline run: embeddings are built from `custom_position` as loaded
# (the text-cleaning cells come later in the notebook).
text_col_name = 'custom_position'

cv_fit_predict(data, cols=fit_cols+[text_col_name], text_col_name=text_col_name)

[13713.7, 13417.2, 13639.5, 13573.7, 13553.8]
13579.62
CPU times: total: 14min 38s
Wall time: 1min 45s


In [7]:
# Baseline (uncleaned custom_position): CatBoostRegressor mean CV MAE = 13579.62

In [8]:
# Replace punctuation (anything that is neither a word character nor
# whitespace) with a space. Raw-string patterns avoid the invalid '\s' escape,
# and the vectorized .str accessor passes missing values through instead of
# raising on them.
data['custom_position'] = data['custom_position'].str.replace(r'[^\w\s]', ' ', regex=True)

# Collapse runs of whitespace into one space and trim both ends.
data['custom_position'] = data['custom_position'].str.replace(r'\s+', ' ', regex=True).str.strip()

In [9]:
# Flatten every position title into one list of space-separated words.
words = [
    word
    for title in data.custom_position.to_list()
    for word in title.split(' ')
]

# One row per distinct word ('index') with its occurrence count (column 0);
# show the 20 most frequent.
cnt_df = pd.DataFrame.from_dict(Counter(words), orient='index').reset_index()
cnt_df.sort_values(by=0, ascending=False).head(20)

Unnamed: 0,index,0
26,по,10084
20,продавец,7561
32,менеджер,7390
40,на,5361
66,кассир,4773
11,водитель,4756
9,в,4626
44,оператор,4446
17,с,3715
36,продажам,3026


In [10]:
# Replace the '/ка', '/ца' and '(-ца)' fragments with a space (see str_replace).
# NOTE(review): the punctuation-cleanup cell above has already turned every
# character that is not a word character or whitespace into a space, so '/'
# and '(' can no longer occur in custom_position and this call presumably
# changes nothing. Confirm whether it was meant to run before that cleanup.
data.custom_position = data.custom_position.apply(str_replace)

In [11]:
# Remove stop words.
# The corpus is reached through the `nltk` package rather than the bare
# `stopwords` name: this cell rebinds `stopwords` to a plain list, so looking
# the corpus up by that name would fail with AttributeError on a second run.
stopwords = nltk.corpus.stopwords.words("russian")

# A set keeps the per-word membership check constant-time.
stopword_set = set(stopwords)

data.custom_position = data.custom_position.apply(lambda value: ' '.join(word for word in value.split(' ') if word not in stopword_set))

In [12]:
# Russian Snowball stemmer; read as a global by nltk_process.
s_stemmer = SnowballStemmer("russian")

# spaCy pipeline 'ru_core_news_sm'; read as a global by spacy_process.
nlp = spacy.load('ru_core_news_sm')

  _C._set_default_tensor_type(t)


In [13]:
%%time

# NLTK variant of the cleaned titles: tokenize, stem and lemmatize each one.
data['nltk_custom_position'] = data['custom_position'].apply(nltk_process)

CPU times: total: 10.5 s
Wall time: 10.6 s


In [14]:
%%time

# spaCy variant of the cleaned titles: every token replaced by its lemma.
data['spacy_custom_position'] = data['custom_position'].apply(spacy_process)

CPU times: total: 3min 14s
Wall time: 3min 15s


In [15]:
# Build the StemLemPipe transform with the 'pymorphy' lemmatizer backend and
# the 'snowball' stemmer backend; it is passed as `func` to phrases_transform
# in the next cell.
stem_lem = create_stemmer_lemmer(lemmatizer_backend='pymorphy', stemmer_backend='snowball')

In [16]:
# StemLemPipe variant of the cleaned titles, produced by running stem_lem
# over them with phrases_transform.
# NOTE(review): assumes phrases_transform returns one result per input
# phrase, in input order — confirm against StemLemPipe.
data['stem_custom_position'] = phrases_transform(data.custom_position.to_list(), func = stem_lem)

In [17]:
%%time
# Same cross-validation, with embeddings built from the NLTK-processed titles.
text_col_name = 'nltk_custom_position'

cv_fit_predict(data, cols=fit_cols+[text_col_name], text_col_name=text_col_name)

[13548.0, 13275.4, 13526.4, 13387.2, 13452.0]
13437.79
CPU times: total: 14min 45s
Wall time: 1min 46s


In [18]:
%%time
# Same cross-validation, with embeddings built from the spaCy-lemmatized titles.
text_col_name = 'spacy_custom_position'

cv_fit_predict(data, cols=fit_cols+[text_col_name], text_col_name=text_col_name)

[13552.9, 13240.8, 13450.7, 13351.4, 13489.1]
13416.98
CPU times: total: 14min 44s
Wall time: 1min 46s


In [19]:
%%time
# Same cross-validation, with embeddings built from the StemLemPipe-processed titles.
text_col_name = 'stem_custom_position'

cv_fit_predict(data, cols=fit_cols+[text_col_name], text_col_name=text_col_name)

[13549.0, 13282.6, 13549.2, 13373.5, 13421.4]
13435.14
CPU times: total: 14min 49s
Wall time: 1min 46s


In [20]:
# Flatten every NLTK-processed title into one list of space-separated words.
words = [
    word
    for title in data.nltk_custom_position.to_list()
    for word in title.split(' ')
]

# One row per distinct word ('index') with its occurrence count (column 0);
# show the 20 most frequent.
cnt_df = pd.DataFrame.from_dict(Counter(words), orient='index').reset_index()
cnt_df.sort_values(by=0, ascending=False).head(20)

Unnamed: 0,index,0
29,менеджер,7569
18,продавец,7561
10,водител,4794
62,кассир,4776
40,оператор,4487
33,продаж,3923
58,консультант,2953
83,работ,2675
159,центр,2646
23,специалист,2610
