In [1]:
! pip install python-dotenv
! pip install pymorphy2 nltk scikit-learn
! pip install catboost
! pip install xgboost



In [2]:
import pandas as pd
import numpy as np

import gdown
import os
import json

from dotenv import load_dotenv

import pymorphy2
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

nltk.download('punkt')
nltk.download('stopwords')

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /home/kitsu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kitsu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
ROOT_DIR: str = os.path.dirname(os.path.abspath(os.curdir))
DATASET_PATH: str = f"{ROOT_DIR}/source_data/superset_hr.xlsx"
SKILLS_JSON_PATH: str = f"{ROOT_DIR}/source_data/skills_dictionary.json"
GOOGLE_COLAB_PATH: str = '/content/superset_hr.xlsx'

load_dotenv()
SKILL_DICT_URL = os.environ.get('SKILL_DICT_URL')
DATASET_URL = os.environ.get('DATASET_URL')

if DATASET_URL is None:
    print('>>> .env was not found!', end='\n\n')
    DATASET_URL = input('>>> Provide DATASET url for downloading: ')

    if SKILL_DICT_URL is None:
        SKILL_DICT_URL = input('>>> Provide JSON url for downloading: ')

In [4]:
def get_skill_dict() -> dict[str: list[int]]:
    try:
        with open(file=SKILLS_JSON_PATH, mode='r') as input_json_file:
            data = json.load(input_json_file)
    except FileNotFoundError:
        print(f"Can't open file from path: {SKILLS_JSON_PATH}", end='\n\n')
        file_name = gdown.download(SKILL_DICT_URL, fuzzy=True)
        os.makedirs(os.path.dirname(SKILLS_JSON_PATH), exist_ok=True)
        os.rename(os.path.abspath(file_name), SKILLS_JSON_PATH)
        print(f'Moved to: {SKILLS_JSON_PATH}', end='\n\n')
        
        with open(file=SKILLS_JSON_PATH, mode='r') as input_json_file:
            data = json.load(input_json_file)
    return data

def get_original_dataframe() -> pd.DataFrame:
    try:
        df = pd.read_excel(DATASET_PATH)
    except FileNotFoundError:
        try:
            print(f"Can't open file from path: {DATASET_PATH}", end='\n\n')
            df = pd.read_excel(GOOGLE_COLAB_PATH)
        except FileNotFoundError:
            print(f"Can't open file from path: {GOOGLE_COLAB_PATH}", end='\n\n')
            file_name = gdown.download(DATASET_URL, fuzzy=True)
            os.makedirs(os.path.dirname(DATASET_PATH), exist_ok=True)
            os.rename(os.path.abspath(file_name), DATASET_PATH)
            print(f'Moved to: {DATASET_PATH}', end='\n\n')
            df = pd.read_excel(DATASET_PATH)
    finally:
        print('Success!')
        return df

In [5]:
skills_dict = get_skill_dict()

In [6]:
df_original = get_original_dataframe()

Success!


In [7]:
print(df_original.shape)
df_original.sample(3)

(175455, 24)


Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,income_name,name,published_at,created_at,collected_at,url,...,address__city,schedule__name,grade,employment__name,key_skills__names,languages__names,exchange_rate,salary_from_gross,salary_to_gross,salary_average
66409,67520,67520,67519,94578303,Менеджер по работе с клиентами,Менеджер по продажам,2024-03-15T09:17:25+0300,2024-03-15T09:17:25+0300,2024-03-15 15:11:26.746,https://api.hh.ru/vacancies/94578303?host=hh.ru,...,Москва,Сменный график,Middle (3-6),Полная занятость,"['Навыки продаж', 'Навыки переговоров', 'Подго...",[],1.0,146900.0,339000.0,242950.0
71419,72585,72585,72584,94189514,QA инженер,Инженер по качеству (мастер контрольный),2024-03-04T11:07:25+0300,2024-03-04T11:07:25+0300,2024-03-15 20:37:11.419,https://api.hh.ru/vacancies/94189514?host=hh.ru,...,Нижнекамск,Полный день,Junior (1-3),Полная занятость,"['Пользователь ПК', 'Техническая документация'...",[],1.0,50000.0,,
94329,96026,96026,96025,92777685,Специалист технической поддержки,Специалист технической поддержки,2024-03-04T10:22:43+0300,2024-03-04T10:22:43+0300,2024-03-24 19:26:33.262,https://api.hh.ru/vacancies/92777685?host=hh.ru,...,Санкт-Петербург,Полный день,Intern (0-1),Полная занятость,"['Настройка ПО', 'Офисная техника', 'Настройка...",[],,,,


In [8]:
def print_df_info(df: pd.DataFrame) -> None:
    print(f"Shape: {df.shape}")
    print('-' * 50)
    display(df.sample(3))
    print('-' * 50)
    display(df.info())
    print('-' * 50)
    display(df.isna().sum())
    print('-' * 50)


def get_clear_df_version(df: pd.DataFrame) -> pd.DataFrame:
    df_new = df[['income_name', 'area__name', 'schedule__name', 'grade',
                 'key_skills__names', 'salary__currency',
                 'exchange_rate', 'salary_from_gross', 'salary_to_gross', 'salary_average']].copy()

    df_new = df_new.dropna(subset=['salary_from_gross', 'salary_to_gross', 'salary_average'], how='all')
    df_new = df_new.dropna(subset=['income_name'])

    selected_cities = {'Москва', 'Санкт-Петербург', 'Новосибирск', 'Екатеринбург', 'Казань'}

    def replace_city(city):
        if city in selected_cities:
            return city
        else:
            return 'Другой'

    df_new['area__name'] = df_new['area__name'].apply(replace_city)
    df_new['not_rur'] = df_new['exchange_rate'] != 1
    df_new['not_rur'] = df_new['not_rur'].astype(int)
    df_new = df_new.drop(['salary__currency', 'exchange_rate'], axis=1)

    return df_new



def get_difference_percentiles(df: pd.DataFrame) -> tuple[float, float, float]:
    filtered_df = df.dropna(subset=['salary_from_gross', 'salary_to_gross']).copy()
    filtered_df.loc[:, 'difference'] = filtered_df['salary_to_gross'] - filtered_df['salary_from_gross']
    condition = filtered_df['difference'] > 0.8 * filtered_df['salary_to_gross']
    filtered_df.loc[condition, ['salary_from_gross', 'difference']] = np.nan

    filtered_df = filtered_df.dropna(subset=['salary_from_gross', 'salary_to_gross'])

    filtered_df = filtered_df[filtered_df['salary_from_gross'] >= 1000]
    filtered_df = filtered_df[filtered_df['salary_to_gross'] >= 10000]

    filtered_df.loc[:, 'difference_ratio'] = np.where(filtered_df['salary_to_gross'] != 0, 
                                                      filtered_df['difference'] / filtered_df['salary_to_gross'], 
                                                      np.nan)

    perc25 = filtered_df['difference_ratio'].quantile(0.25)
    perc50 = filtered_df['difference_ratio'].quantile(0.50)
    perc75 = filtered_df['difference_ratio'].quantile(0.75)

    return perc25, perc50, perc75


def fill_na_salary(df: pd.DataFrame, coef) -> pd.DataFrame:
    result_df = df.copy()
    result_df['salary_to_gross'] = result_df['salary_to_gross'].fillna(result_df['salary_from_gross'] / (1-coef))
    result_df['salary_from_gross'] = result_df['salary_from_gross'].fillna(result_df['salary_to_gross'] * (1-coef))
    result_df['salary_average'] = result_df['salary_average'].fillna((result_df['salary_to_gross'] + result_df['salary_from_gross']) / 2)
    
    result_df[['salary_to_gross', 'salary_from_gross', 'salary_average']] = result_df[['salary_to_gross', 'salary_from_gross', 'salary_average']].astype(float)
    return result_df


def exctract_features(df: pd.DataFrame) -> pd.DataFrame:
    transformers = [
        ('schedule_name', OneHotEncoder(sparse_output=False, drop='first'), ['schedule__name']),
        ('grade', OneHotEncoder(sparse_output=False, drop='first'), ['grade']),
        ('income_name', OneHotEncoder(sparse_output=False, drop='first'), ['income_name']),
        ('area_name', OneHotEncoder(sparse_output=False, drop='first'), ['area__name'])
    ]

    pipeline = Pipeline(steps=[
        ('column_transformer', ColumnTransformer(transformers=transformers, remainder='passthrough'))
    ])

    display(pipeline)

    df_transformed = pipeline.fit_transform(df)
    column_names = pipeline.named_steps['column_transformer'].get_feature_names_out()
    transofrmed_cols = [x for x in column_names if 'remainder' not in x]
    column_names = [x.replace('remainder__', '') if x not in transofrmed_cols else x for x in column_names]
    
    df_new = pd.DataFrame(df_transformed, columns=column_names)
    df_new[transofrmed_cols] = df_new[transofrmed_cols].astype(float)
    
    return df_new



def convert_skills(skill_string: str, skills_dict: dict) -> tuple[list[str], int, list[str]]:
    try:
        skill_list = eval(skill_string)
        if not isinstance(skill_list, list):
            return np.nan, 0, []
    except:
        return np.nan, 0, []

    converted_skills = []
    unconverted_skills = []

    for skill in skill_list:
        found = False
        for main_skill, variations in skills_dict.items():
            if skill in variations:
                converted_skills.append(main_skill)
                found = True
                break
        if not found:
            unconverted_skills.append(skill)

    return converted_skills, unconverted_skills


def preprocess_key_skills(df: pd.DataFrame, s_dict: dict[str: list[str]]) -> pd.DataFrame:
    df_new = df.copy()
    df_new[['skills', 'unconverted_skills']] = df_new['key_skills__names'].apply(
    lambda x: pd.Series(convert_skills(x, s_dict)))
    df_new = df_new.drop(['key_skills__names'], axis=1)

    return df_new


morph = pymorphy2.MorphAnalyzer()
nltk_stop_words  = set(stopwords.words('russian'))
custom_words = {',', ':', 'работа'}
stop_words = nltk_stop_words.union(custom_words)


def tokenize_and_lemmatize(skill_list: list[str]) -> list[str]:
    tokens = []
    for skill in skill_list:
        words = word_tokenize(skill)
        lemmas = [morph.parse(word.lower())[0].normal_form for word in words if word.lower() not in stop_words]
        tokens.extend(lemmas)
    return tokens


def process_skills(df: pd.DataFrame, freq_cutoff: int = 100) -> pd.DataFrame:
    df_processed = df.copy()

    df_processed['unskills_processed'] = df_processed['unconverted_skills'].apply(tokenize_and_lemmatize)
    unskills_counts = df_processed['unskills_processed'].explode().value_counts()
    frequent_skills = unskills_counts[unskills_counts >= freq_cutoff].index

    df_processed['unskills_filtered'] = df_processed['unskills_processed'].apply(lambda skills: [skill for skill in skills if skill in frequent_skills])
    df_processed['skills_plus'] = df_processed['skills'] + df_processed['unskills_filtered']
    df_processed = df_processed[df_processed['skills_plus'].apply(lambda x: len(x) > 0)]

    df_processed = df_processed.drop(['unconverted_skills', 
                                      'skills', 
                                      'unskills_processed', 
                                      'unskills_filtered'], axis = 1)
    return df_processed

def vectorize_skills(df: pd.DataFrame) -> pd.DataFrame:
    df_vect = df.copy()
    df_vect['skills_plus'] = df_vect['skills_plus'].apply(lambda x: ' '.join(x))

    vectorizer = TfidfVectorizer()
    display(vectorizer)
    skills_tfidf = vectorizer.fit_transform(df_vect['skills_plus'])
    skills_df = pd.DataFrame(skills_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

    df_vect = df_vect.reset_index()
    df_merged = pd.concat([df_vect, skills_df], axis=1)

    df_merged = df_merged.drop(['skills_plus'], axis=1)
    df_merged[['not_rur',
               'salary_from_gross', 
               'salary_to_gross', 
               'salary_average']] = df_merged[['not_rur',
                                               'salary_from_gross', 
                                               'salary_to_gross', 
                                               'salary_average']].astype(float)

    return df_merged

In [9]:
FREQ_CUTOFF = 100
PERC_NUMBER = 0  # 0 - 25% | 1 - 50% | 2 - 75%

pipeline_preprocess = Pipeline(steps=[
    ('clear_df', FunctionTransformer(get_clear_df_version)),

    ('fill_na_salary', FunctionTransformer(
        func=lambda df: fill_na_salary(df, get_difference_percentiles(df)[PERC_NUMBER]),
        validate=False,
    )),

    ('extract_features', FunctionTransformer(exctract_features, validate=False)),

    ('preprocess_key_skills', FunctionTransformer(
        func=lambda df: preprocess_key_skills(df, skills_dict),
        validate=False,
    )),
    
    ('process_skills', FunctionTransformer( 
        func=lambda df: process_skills(df, FREQ_CUTOFF),
        validate=False,
    )),
    
    ('vectorize_skills', FunctionTransformer(vectorize_skills, validate=False))
    ], 
    verbose=True).set_output(transform="pandas")

display(pipeline_preprocess)

In [10]:
df_after_preprocess: pd.DataFrame = pipeline_preprocess.fit_transform(df_original)

[Pipeline] .......... (step 1 of 6) Processing clear_df, total=   0.1s
[Pipeline] .... (step 2 of 6) Processing fill_na_salary, total=   0.0s


[Pipeline] .. (step 3 of 6) Processing extract_features, total=   2.3s
[Pipeline]  (step 4 of 6) Processing preprocess_key_skills, total=  20.5s
[Pipeline] .... (step 5 of 6) Processing process_skills, total=  29.7s


[Pipeline] .. (step 6 of 6) Processing vectorize_skills, total=   2.0s


In [11]:
print_df_info(df_after_preprocess)

Shape: (77828, 873)
--------------------------------------------------


Unnamed: 0,index,schedule_name__schedule__name_Гибкий график,schedule_name__schedule__name_Полный день,schedule_name__schedule__name_Сменный график,schedule_name__schedule__name_Удаленная работа,grade__grade_Junior (1-3),grade__grade_Middle (3-6),grade__grade_Senior (>6),income_name__income_name_1C оператор,income_name__income_name_1С администратор,...,эксплуатационные,электронный,энергичность,этика,этикет,эффективностью,юридический,юридическими,язык,яндекс
61944,86699,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39342,54702,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23558,32967,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77828 entries, 0 to 77827
Columns: 873 entries, index to яндекс
dtypes: float64(872), int64(1)
memory usage: 518.4 MB


None

--------------------------------------------------


index                                             0
schedule_name__schedule__name_Гибкий график       0
schedule_name__schedule__name_Полный день         0
schedule_name__schedule__name_Сменный график      0
schedule_name__schedule__name_Удаленная работа    0
                                                 ..
эффективностью                                    0
юридический                                       0
юридическими                                      0
язык                                              0
яндекс                                            0
Length: 873, dtype: int64

--------------------------------------------------


In [23]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import make_scorer, root_mean_squared_error, mean_squared_error
from tqdm import tqdm
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge, Lasso

param_grid = [
        {
            'model': [LinearRegression()],
            'model__fit_intercept': [True, False]
        },
        {
            'model': [RandomForestRegressor()],
            'model__n_estimators': [10],
            'model__max_depth': [None, 10, 20]
        },
        {
            'model': [CatBoostRegressor(verbose=False)]
        },
        {
            'model': [XGBRegressor()]
        },
        {
            'model': [SVR()],
            'model__kernel': ['linear', 'rbf'],
            'model__C': [0.1, 1, 10],
            'model__gamma': ['scale', 'auto']
        },
        {
            'model': [DecisionTreeRegressor()],
            'max_depth': [5, 10, 15, 20],
            'min_samples_split': [2, 10, 20],
            'min_samples_leaf': [1, 5, 10]
        },
        {
            'model': [Ridge()],
            'alpha': [0.01, 0.1, 1, 10]
        },
        {
            'model': [Lasso()],
            'alpha': [0.01, 0.1, 1, 10]
        }
]

rmse = make_scorer(root_mean_squared_error)
mse = make_scorer(mean_squared_error)

def pipeline_train(data):
    X = data.drop(columns=['salary_from_gross', 'salary_to_gross', 'salary_average'])
    y = data['salary_average']

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

    best_models = {}

    for model_params in tqdm(param_grid):
        pipeline = Pipeline(steps=[
            ('model', model_params['model'])
        ])

        grid_search = GridSearchCV(pipeline, param_grid=model_params, cv=3, scoring=rmse)
        display(grid_search)
        grid_search.fit(X_train, y_train)

        y_pred = grid_search.predict(X_test)

        best_models[type(model_params['model']).__name__] = grid_search.best_estimator_
        print(f"Best parameters for {type(model_params['model']).__name__}: {grid_search.best_params_}")
        print(f"Best score for {type(model_params['model']).__name__}: {grid_search.best_score_}")

    return best_models

rersults = pipeline_train(df_after_preprocess)

  0%|          | 0/8 [00:00<?, ?it/s]

 12%|█▎        | 1/8 [00:07<00:51,  7.32s/it]

Best parameters for list: {'model': LinearRegression(), 'model__fit_intercept': True}
Best score for list: 366120428.55647975


 25%|██▌       | 2/8 [03:23<11:51, 118.55s/it]

Best parameters for list: {'model': RandomForestRegressor(), 'model__max_depth': 10, 'model__n_estimators': 10}
Best score for list: 47014.69561471138


 38%|███▊      | 3/8 [04:38<08:12, 98.57s/it] 

Best parameters for list: {'model': <catboost.core.CatBoostRegressor object at 0x7f0145dfb2b0>}
Best score for list: 42755.12372149535


 50%|█████     | 4/8 [05:27<05:15, 78.82s/it]

Best parameters for list: {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)}
Best score for list: 42968.138176638626


In [13]:
def train(data, estimator):
    X = data.drop(columns=['salary_from_gross', 'salary_to_gross', 'salary_average'])
    y = data['salary_average']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    print(root_mean_squared_error(y_test, y_pred))

In [14]:
from catboost import CatBoostRegressor
train(df_after_preprocess, CatBoostRegressor())

Learning rate set to 0.078644
0:	learn: 64727.0921095	total: 104ms	remaining: 1m 44s
1:	learn: 63059.8022352	total: 128ms	remaining: 1m 3s
2:	learn: 61547.4473180	total: 149ms	remaining: 49.6s
3:	learn: 60216.7792729	total: 173ms	remaining: 43.1s
4:	learn: 59055.5057364	total: 197ms	remaining: 39.2s
5:	learn: 58012.9984083	total: 223ms	remaining: 37s
6:	learn: 57111.2117479	total: 248ms	remaining: 35.1s
7:	learn: 56336.0109253	total: 271ms	remaining: 33.6s
8:	learn: 55609.3059948	total: 291ms	remaining: 32.1s
9:	learn: 54958.3994065	total: 312ms	remaining: 30.9s
10:	learn: 54398.8816549	total: 336ms	remaining: 30.2s
11:	learn: 53895.0988679	total: 360ms	remaining: 29.6s
12:	learn: 53431.0166977	total: 382ms	remaining: 29s
13:	learn: 53011.7224972	total: 405ms	remaining: 28.5s
14:	learn: 52629.6233704	total: 428ms	remaining: 28.1s
15:	learn: 52242.6691097	total: 455ms	remaining: 28s
16:	learn: 51920.8094438	total: 476ms	remaining: 27.5s
17:	learn: 51643.7436076	total: 496ms	remaining: 2

In [15]:
from xgboost import XGBRegressor
train(df_after_preprocess, XGBRegressor())

42285.09350057263
