# Initialization cell

In [23]:
# Install the third-party packages used by this notebook (shell escapes, run once per environment).
# NOTE(review): scikit-learn is installed twice below (plain install, then `-U` upgrade), and
# gdown / torch / tqdm are imported in the next cell but not installed here — presumably they are
# preinstalled in the target environment; confirm.
! pip install --upgrade pip
! pip install python-dotenv
! pip install pymorphy2 nltk scikit-learn
! pip install catboost
! pip install xgboost
! pip install -U scikit-learn
! pip install joblib
! pip install --upgrade termcolor
! pip install cloudpickle
! pip install transformers



In [29]:
import pandas as pd
import numpy as np

import gdown
import os
import json

import matplotlib.pyplot as plt

from dataclasses import dataclass, fields, asdict

from typing import List, Optional, Tuple, Dict
import shutil
from dotenv import load_dotenv

import joblib
import cloudpickle
from termcolor import colored

import pymorphy2
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

nltk.download('punkt')
nltk.download('stopwords')

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import make_scorer, mean_squared_error, root_mean_squared_error, mean_absolute_percentage_error

from tqdm import tqdm
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
import torch.optim as optim
from torch.nn.utils.rnn import pack_sequence, pad_sequence

from transformers import BertModel, BertTokenizer

# Use the CUDA device when torch reports one as available, otherwise fall back to the CPU,
# and print the choice in green.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device type:', colored(device, 'green'))

Device type: [32mcuda[0m


[nltk_data] Downloading package punkt to /home/kitsu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kitsu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Preparation cell

In [10]:
# Local file locations. ROOT_DIR is the parent of the current working directory; the data
# files are expected under <ROOT_DIR>/source_data.
ROOT_DIR: str = os.path.dirname(os.path.abspath(os.curdir))
DATASET_PATH: str = f"{ROOT_DIR}/source_data/superset_hr.xlsx"
SKILLS_JSON_PATH: str = f"{ROOT_DIR}/source_data/skills_dictionary.json"
GOOGLE_COLAB_PATH: str = '/content/superset_hr.xlsx'
SKILLS_PRIO_JSON_PATH: str = f"{ROOT_DIR}/source_data/skills_priority.json"

# Download URLs are read from the environment (load_dotenv() merges a .env file into it, if any).
load_dotenv()
SKILL_DICT_URL = os.environ.get('SKILL_DICT_URL')
DATASET_URL = os.environ.get('DATASET_V2_URL')
SKILL_PRIO_URL = os.environ.get('SKILL_PRIO_URL')

# Every URL is checked on its own. Previously the two JSON prompts were nested inside the
# DATASET_URL check, so a missing SKILL_DICT_URL / SKILL_PRIO_URL was never asked for when
# DATASET_URL was set, and the later download was attempted with None.
if DATASET_URL is None:
    print('>>> .env was not found!', end='\n\n')
    DATASET_URL = input('>>> Provide DATASET url for downloading: ')

if SKILL_DICT_URL is None:
    SKILL_DICT_URL = input('>>> Provide JSON url for downloading: ')

if SKILL_PRIO_URL is None:
    SKILL_PRIO_URL = input('>>> Provide priorities JSON url for downloading: ')

In [11]:
def get_skill_dict() -> dict[str, list[str]]:
    """Load the skills dictionary JSON, downloading it first when it is not on disk.

    The file is read from ``SKILLS_JSON_PATH``. If it does not exist it is fetched with
    ``gdown`` from ``SKILL_DICT_URL``, moved to ``SKILLS_JSON_PATH`` and then read.

    Returns:
        The parsed JSON document. Presumably a mapping from a canonical skill name to the
        list of its spelling variations (that is how the skills transformer consumes it) —
        the schema itself is not validated here.

    Raises:
        RuntimeError: if the download did not produce a file.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
        with open(file=SKILLS_JSON_PATH, mode='r', encoding='utf-8') as input_json_file:
            return json.load(input_json_file)
    except FileNotFoundError:
        print(f"Can't open file from path: {SKILLS_JSON_PATH}", end='\n\n')

    file_name = gdown.download(SKILL_DICT_URL, fuzzy=True)
    if file_name is None:
        # Fail with a readable message instead of a TypeError from os.path.abspath(None).
        raise RuntimeError(f"Failed to download the skills dictionary from: {SKILL_DICT_URL}")

    os.makedirs(os.path.dirname(SKILLS_JSON_PATH), exist_ok=True)
    # shutil.move also works when the download directory and the target are on different filesystems.
    shutil.move(os.path.abspath(file_name), SKILLS_JSON_PATH)
    print(f'Moved to: {SKILLS_JSON_PATH}', end='\n\n')

    with open(file=SKILLS_JSON_PATH, mode='r', encoding='utf-8') as input_json_file:
        return json.load(input_json_file)

def get_skill_prio() -> dict[str, dict[str, int]]:
    """Load the skills-priority JSON, downloading it first when it is not on disk.

    The file is read from ``SKILLS_PRIO_JSON_PATH``. If it does not exist it is fetched with
    ``gdown`` from ``SKILL_PRIO_URL``, moved to ``SKILLS_PRIO_JSON_PATH`` and then read.

    Returns:
        The parsed JSON document. Presumably a mapping ``profession -> {skill: weight}``
        (that is how the skills-quality score consumes it) — the schema is not validated here.

    Raises:
        RuntimeError: if the download did not produce a file.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
        with open(file=SKILLS_PRIO_JSON_PATH, mode='r', encoding='utf-8') as input_json_file:
            return json.load(input_json_file)
    except FileNotFoundError:
        print(f"Can't open file from path: {SKILLS_PRIO_JSON_PATH}", end='\n\n')

    file_name = gdown.download(SKILL_PRIO_URL, fuzzy=True)
    if file_name is None:
        # Fail with a readable message instead of a TypeError from os.path.abspath(None).
        raise RuntimeError(f"Failed to download the skills priorities from: {SKILL_PRIO_URL}")

    os.makedirs(os.path.dirname(SKILLS_PRIO_JSON_PATH), exist_ok=True)
    # shutil.move also works when the download directory and the target are on different filesystems.
    shutil.move(os.path.abspath(file_name), SKILLS_PRIO_JSON_PATH)
    print(f'Moved to: {SKILLS_PRIO_JSON_PATH}', end='\n\n')

    with open(file=SKILLS_PRIO_JSON_PATH, mode='r', encoding='utf-8') as input_json_file:
        return json.load(input_json_file)


def get_original_dataframe() -> pd.DataFrame:
    """Read the source Excel dataset, downloading it when no local copy exists.

    Lookup order: ``DATASET_PATH``, then ``GOOGLE_COLAB_PATH``; if neither file exists the
    dataset is downloaded with ``gdown`` from ``DATASET_URL``, moved to ``DATASET_PATH`` and read.

    Returns:
        The dataset as a DataFrame.

    Raises:
        RuntimeError: if the download did not produce a file.
        Any error raised by ``pd.read_excel`` other than ``FileNotFoundError`` propagates.
        (The previous ``return`` inside ``finally`` swallowed such errors, printed "Success!"
        anyway and could end in an ``UnboundLocalError``.)
    """
    for candidate_path in (DATASET_PATH, GOOGLE_COLAB_PATH):
        try:
            df = pd.read_excel(candidate_path)
        except FileNotFoundError:
            print(f"Can't open file from path: {candidate_path}", end='\n\n')
        else:
            print('Success!')
            return df

    file_name = gdown.download(DATASET_URL, fuzzy=True)
    if file_name is None:
        raise RuntimeError(f"Failed to download the dataset from: {DATASET_URL}")

    os.makedirs(os.path.dirname(DATASET_PATH), exist_ok=True)
    # shutil.move also works when the download directory and the target are on different filesystems.
    shutil.move(os.path.abspath(file_name), DATASET_PATH)
    print(f'Moved to: {DATASET_PATH}', end='\n\n')

    df = pd.read_excel(DATASET_PATH)
    # Report success only once the frame has actually been loaded.
    print('Success!')
    return df

In [12]:
# Load the skills dictionary (read from disk, downloaded on the first run).
skills_dict = get_skill_dict()

In [13]:
# Load the per-profession skill priorities, then lower-case the inner (skill) keys so that
# lookups made with `skill.lower()` match regardless of the casing used in the JSON file.
skill_prio_dict = get_skill_prio()
skill_prio_dict = {k: {key.lower(): value for key, value in v.items()} for k, v in skill_prio_dict.items()}

In [14]:
# Raw dataset as read from the Excel file; later cells derive new frames from it.
df_original = get_original_dataframe()

Success!


In [15]:
# Quick sanity check: dimensions plus three random rows (the last expression is displayed).
print(df_original.shape)
df_original.sample(3)

(188833, 21)


Unnamed: 0,id,income_name,name,published_at,created_at,collected_at,url,area__name,salary__currency,salary__gross,...,address__city,schedule__name,grade,employment__name,key_skills__names,languages__names,exchange_rate,salary_from_gross,salary_to_gross,salary_average
176824,99933267,Специалист технической поддержки,Ведущий специалист отдела технической поддержк...,2024-06-20T12:24:00+0300,2024-06-20T12:24:00+0300,2024-06-27 03:07:49.680,https://api.hh.ru/vacancies/99933267?host=hh.ru,Санкт-Петербург,,,...,,Полный день,Middle (3-6),Полная занятость,[],[],,,,
99080,95813514,Руководитель проектов,Руководитель строительного проекта,2024-03-29T17:27:52+0300,2024-03-29T17:27:52+0300,2024-03-31 18:19:48.694,https://api.hh.ru/vacancies/95813514?host=hh.ru,Екатеринбург,RUR,0.0,...,,Полный день,Junior (1-3),Полная занятость,"['Управление строительством', 'Строительно-отд...",[],1.0,135600.0,,
36993,94230024,Менеджер по продажам,Менеджер по продажам нефтепродуктов и услуг по...,2024-03-04T19:18:33+0300,2024-03-04T19:18:33+0300,2024-03-06 19:17:01.413,https://api.hh.ru/vacancies/94230024?host=hh.ru,Тюмень,RUR,0.0,...,,Полный день,Junior (1-3),Полная занятость,"['Активные продажи', 'Холодные продажи', 'Поис...",[],1.0,90400.0,,


# Function & utils cell

In [16]:
def print_df_info(df: pd.DataFrame) -> None:
    """Print a short overview of a DataFrame: shape, a random sample and ``df.info()``.

    Args:
        df: Frame to summarise. Frames with fewer than three rows are sampled in full.
    """
    print(f"Shape: {df.shape}")
    print('-' * 50)
    # Cap the sample size so that small (or empty) frames do not raise in sample().
    display(df.sample(min(3, len(df))))
    print('-' * 50)
    # DataFrame.info() prints its report itself and returns None; wrapping it in display()
    # only added a stray "None" to the cell output.
    df.info()

In [17]:
class ClearDataFrameTransformer(BaseEstimator, TransformerMixin):
    """Select the working columns and drop rows that cannot be used for salary modelling.

    Args:
        save_cols: Columns to keep from the input frame. ``None`` (the default) keeps every
            column; previously the default crashed in ``transform`` on ``X[None]``.
    """

    def __init__(self, save_cols: Optional[List[str]] = None) -> None:
        self.save_cols = save_cols

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'ClearDataFrameTransformer':
        """Stateless: nothing is learned from ``X``."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a cleaned copy of ``X``; the input frame is not modified.

        Steps: keep ``save_cols``, rename ``income_name`` to ``income__name``, drop rows where
        all three salary columns are missing, drop rows without a profession, and finally
        remove the ``salary__currency`` and ``exchange_rate`` columns.

        Raises:
            KeyError: if a required column is absent from ``X``.
        """
        selected = X if self.save_cols is None else X[self.save_cols]
        # rename() returns a new frame, so no in-place mutation is needed.
        df_new = selected.rename(columns={'income_name': 'income__name'})
        df_new = df_new.dropna(subset=['salary_from_gross', 'salary_to_gross', 'salary_average'], how='all')
        df_new = df_new.dropna(subset=['income__name'])
        df_new = df_new.drop(['salary__currency', 'exchange_rate'], axis=1)
        return df_new


class RenameDataTransformer(BaseEstimator, TransformerMixin):
    """Stateless step that renames the ``income_name`` column to ``income__name``."""

    def __init__(self) -> None:
        """No hyper-parameters."""

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'RenameDataTransformer':
        """Nothing to learn; returns ``self`` so the step can be used in a Pipeline."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a renamed copy of ``X``; the input frame is left untouched."""
        renamed = X.copy().rename(columns={'income_name': 'income__name'})
        return renamed


class CreateNewCitiesTransformer(BaseEstimator, TransformerMixin):
    """Collapse infrequent ``area__name`` values into the single label 'Малый город'.

    Cities are taken in descending order of their share of rows and kept while the running
    total of shares (in percent) stays below ``threshold``; every other value is replaced.

    Args:
        threshold: Cumulative share of rows, in percent, below which cities are kept.

    NOTE(review): the set of kept cities is recomputed from whatever frame is passed to
    ``transform`` (``fit`` learns nothing), so two different frames can end up with different
    city sets — confirm this is intended before using the step on unseen data.
    """

    def __init__(self, threshold: int = 75):
        self.threshold = threshold

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'CreateNewCitiesTransformer':
        """Stateless: nothing is learned from ``X``."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with rare cities (and missing cities) relabelled."""
        df_new = X.copy()
        areas = df_new['area__name']
        city_counts = areas.value_counts()
        n_rows = df_new.shape[0]
        # value_counts() ignores NaN while unique() reports it, so NaN has to be excluded
        # from the keys; otherwise city_counts[nan] raises KeyError. Shares are still
        # computed over all rows, missing areas included.
        ratios = {city: city_counts[city] / n_rows for city in areas.dropna().unique()}

        kept_cities = []
        cumulative_pct = 0
        for city, ratio in sorted(ratios.items(), key=lambda item: item[1], reverse=True):
            cumulative_pct += ratio * 100
            if cumulative_pct >= self.threshold:
                # The running total never decreases, so no later city can qualify.
                break
            kept_cities.append(city)

        # Rows with a missing area are not in kept_cities and are relabelled as well.
        df_new.loc[~areas.isin(kept_cities), 'area__name'] = 'Малый город'
        return df_new
    
    
class SelectProfessionsTransformer(BaseEstimator, TransformerMixin):
    """Filter rows by profession (``income__name``).

    Args:
        prof_list: Professions to match against.
        whitelist: When truthy keep only the listed professions, otherwise keep all the others.
    """

    def __init__(self, prof_list: Optional[List[str]] = None, whitelist: bool = True) -> None:
        self.prof_list = prof_list
        self.whitelist = whitelist

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'SelectProfessionsTransformer':
        """Stateless: nothing is learned from ``X``."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return the rows of ``X`` selected by the whitelist/blacklist rule."""
        df_new = X.copy()
        is_listed = df_new['income__name'].isin(self.prof_list)
        keep_rows = is_listed if self.whitelist else ~is_listed
        return df_new[keep_rows]


class FillNaSalaryTransformer(BaseEstimator, TransformerMixin):
    """Fill missing salary bounds using a typical relative width of the salary fork.

    Args:
        coef_index: Which quartile of the relative fork width to use as the coefficient:
            0 - 25th percentile, 1 - median, 2 - 75th percentile.
    """

    def __init__(self, coef_index: int) -> None:
        self.coef_index = coef_index

    @staticmethod
    def get_difference_percentiles(df: pd.DataFrame) -> Tuple[float, float, float]:
        """Return the 25th/50th/75th percentiles of ``(to - from) / to``.

        Only rows with both bounds present are considered. Rows whose fork is wider than
        80% of the upper bound, whose lower bound is under 1000 or whose upper bound is
        under 10000 are excluded.
        """
        bounds = df.dropna(subset=['salary_from_gross', 'salary_to_gross'])
        lower = bounds['salary_from_gross']
        upper = bounds['salary_to_gross']
        spread = upper - lower

        # Same filters as before, expressed as one boolean mask instead of helper columns.
        too_wide = spread > 0.8 * upper
        usable = ~too_wide & (lower >= 1000) & (upper >= 10000)

        usable_upper = upper[usable]
        ratio = pd.Series(
            np.where(usable_upper != 0, spread[usable] / usable_upper, np.nan),
            index=usable_upper.index,
        )

        return ratio.quantile(0.25), ratio.quantile(0.50), ratio.quantile(0.75)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'FillNaSalaryTransformer':
        """Stateless: the coefficient is recomputed from the frame passed to ``transform``."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with the three salary columns filled and cast to float.

        The upper bound is filled as ``from / (1 - coef)``, the lower bound as
        ``to * (1 - coef)``, and the average as the midpoint of the two bounds.
        """
        df_new = X.copy()
        percentiles = self.get_difference_percentiles(df_new)
        keep_share = 1 - percentiles[self.coef_index]

        df_new['salary_to_gross'] = df_new['salary_to_gross'].fillna(df_new['salary_from_gross'] / keep_share)
        df_new['salary_from_gross'] = df_new['salary_from_gross'].fillna(df_new['salary_to_gross'] * keep_share)

        midpoint = (df_new['salary_to_gross'] + df_new['salary_from_gross']) / 2
        df_new['salary_average'] = df_new['salary_average'].fillna(midpoint)

        salary_cols = ['salary_to_gross', 'salary_from_gross', 'salary_average']
        df_new[salary_cols] = df_new[salary_cols].astype(float)
        return df_new
    

class PreprocessKeySkillsTransformer(BaseEstimator, TransformerMixin):
    """Normalise the ``key_skills__names`` column and optionally score skill quality.

    Args:
        s_dict: Mapping ``canonical skill -> list of spelling variations``.
        p_dict: Mapping ``profession -> {lower-cased skill: importance weight}``.
        calc_skills: Keep the computed ``skills_quality`` column.
        with_skills: Keep the ``skills`` / ``unconverted_skills`` list columns.
        with_grade: Keep columns whose name contains ``grade``.
        with_area: Keep the area column(s).
    """

    def __init__(self, s_dict: Dict[str, List[str]],
                 p_dict,
                 calc_skills: bool = True,
                 with_skills: bool = True,
                 with_grade: bool = True,
                 with_area: bool = True) -> None:

        self.s_dict = s_dict
        self.p_dict = p_dict
        self.calc_skills = calc_skills
        self.with_skills = with_skills
        self.with_grade = with_grade
        self.with_area = with_area

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'PreprocessKeySkillsTransformer':
        """Stateless: nothing is learned from ``X``."""
        return self

    @staticmethod
    def convert_skills(skill_string: str, skills_dict: Dict[str, List[str]]) -> Tuple[List[str], List[str]]:
        """Split raw skills into canonical names and skills with no known canonical form.

        Args:
            skill_string: Text of a Python list literal, e.g. ``"['SQL', 'Git']"``. An
                already parsed list/tuple is accepted as is; any other value (such as NaN)
                is treated as "no skills".
            skills_dict: Mapping ``canonical skill -> variations``.

        Returns:
            ``(converted_skills, unconverted_skills)``.
        """
        import ast  # local import: the notebook's import cell does not provide it

        if isinstance(skill_string, str):
            # SECURITY: this text comes from the spreadsheet, so it is parsed as a literal
            # only; the previous eval() would have executed any expression stored in a cell.
            skill_list = ast.literal_eval(skill_string)
        elif isinstance(skill_string, (list, tuple)):
            skill_list = skill_string
        else:
            skill_list = []

        converted_skills = []
        unconverted_skills = []
        for skill in skill_list:
            # The first canonical skill (in dictionary order) listing this variation wins.
            main_skill = next(
                (name for name, variations in skills_dict.items() if skill in variations),
                None,
            )
            if main_skill is None:
                unconverted_skills.append(skill)
            else:
                converted_skills.append(main_skill)

        return converted_skills, unconverted_skills


    @staticmethod
    def calculate_skills_quality(skills, profession, p_dict):
        """Return the share (0.0 - 1.0) of a profession's skill importance covered by ``skills``.

        Returns 0.0 when there are no skills, when the profession has no importance table,
        or when the weights in that table sum to zero.
        """
        if not skills:
            return 0.0

        skills_importance = p_dict.get(profession, {})

        if not skills_importance:
            return 0.0

        total_importance = sum(skills_importance.values())
        if total_importance == 0:
            # Guard against ZeroDivisionError for an all-zero weight table.
            return 0.0

        skill_score = sum(skills_importance.get(skill.lower(), 0) for skill in skills)
        quality = skill_score / total_importance
        return min(quality, 1.0)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with skill columns derived and optional columns removed.

        NOTE(review): later pipeline steps that address ``grade`` / ``area__name`` by name
        need those columns, so ``with_grade=False`` / ``with_area=False`` may require the
        downstream encoder to tolerate their absence.
        """
        df_new = X.copy()

        if self.calc_skills or self.with_skills:
            df_new[['skills', 'unconverted_skills']] = df_new['key_skills__names'].apply(
                lambda x: pd.Series(self.convert_skills(x, self.s_dict))
            )
            df_new['skills_quality'] = df_new.apply(
                lambda row: self.calculate_skills_quality(row['skills'], row['income__name'], self.p_dict), axis=1
            )
            if not self.with_skills:
                df_new = df_new.drop(columns=['skills', 'unconverted_skills'])
            if not self.calc_skills:
                df_new = df_new.drop(columns=['skills_quality'])

        df_new = df_new.drop(columns=['key_skills__names'])

        if not self.with_grade:
            df_new = df_new.drop(columns=df_new.filter(like='grade').columns)

        if not self.with_area:
            # The raw column is 'area__name' (double underscore). The old filter looked only
            # for 'area_name', which never matched it, so with_area=False had no effect.
            # Both spellings are matched to stay compatible with one-hot encoded names.
            area_columns = [column for column in df_new.columns
                            if 'area__name' in str(column) or 'area_name' in str(column)]
            df_new = df_new.drop(columns=area_columns)

        return df_new
    

class ProcessSkillsTransformer(BaseEstimator, TransformerMixin):
    """Build the ``skills_plus`` token list column and drop rows that end up without skills.

    Args:
        freq_cutoff: Minimum number of occurrences a lemma needs in order to be kept
            (only applied when ``add_unprocessed`` is true).
        add_unprocessed: When true, both ``skills`` and ``unconverted_skills`` are tokenised,
            lemmatised, frequency-filtered and concatenated; otherwise ``skills`` is used as is.
    """

    def __init__(self, freq_cutoff: int = 100,
                       add_unprocessed: bool = False) -> None:
        self.freq_cutoff = freq_cutoff
        self.add_unprocessed = add_unprocessed

        # Lemmatiser and stop-word list (NLTK Russian stop words plus project-specific ones).
        self.morph = pymorphy2.MorphAnalyzer()
        self.nltk_stop_words = set(stopwords.words('russian'))
        self.custom_words = {',', ':', 'работа', 'активный', 'язык', 'навык',
                             'деловой', 'ведение', 'проведение', 'презентация', 'грамотный'}
        self.stop_words = self.nltk_stop_words.union(self.custom_words)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'ProcessSkillsTransformer':
        """Stateless: nothing is learned from ``X``."""
        return self

    def tokenize_and_lemmatize(self, skill_list: List[str]) -> List[str]:
        """Tokenise every skill, skip stop words and return the normal forms of the rest."""
        lemmas = []
        for skill in skill_list:
            for word in word_tokenize(skill):
                lowered = word.lower()
                if lowered in self.stop_words:
                    continue
                lemmas.append(self.morph.parse(lowered)[0].normal_form)
        return lemmas

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with ``skills_plus`` added and the source skill columns removed."""

        def lemmatize_and_prune(frame: pd.DataFrame, column: str) -> pd.DataFrame:
            # Lemmatise one list column, then keep only lemmas seen at least freq_cutoff times.
            result = frame.copy()
            result[column] = result[column].apply(self.tokenize_and_lemmatize)
            lemma_counts = result[column].explode().value_counts()
            frequent_lemmas = lemma_counts[lemma_counts >= self.freq_cutoff].index
            result[column] = result[column].apply(
                lambda lemmas: [lemma for lemma in lemmas if lemma in frequent_lemmas]
            )
            return result

        df_processed = X.copy()

        if not self.add_unprocessed:
            df_processed['skills_plus'] = df_processed['skills']
        else:
            for skill_column in ('skills', 'unconverted_skills'):
                df_processed = lemmatize_and_prune(df_processed, skill_column)
            df_processed['skills_plus'] = df_processed['skills'] + df_processed['unconverted_skills']

        has_skills = df_processed['skills_plus'].apply(lambda x: len(x) > 0)
        df_processed = df_processed[has_skills]
        return df_processed.drop(['unconverted_skills', 'skills'], axis=1)


class VectorizePCASkillsTransformer(BaseEstimator, TransformerMixin):
    """TF-IDF vectorise the ``skills_plus`` column and optionally reduce it with PCA.

    Args:
        n_components: Number of PCA components.
        PCA_enable: When true the TF-IDF matrix is replaced by its PCA projection
            (columns ``component_1`` ... ``component_n``); otherwise the TF-IDF columns are used.
        add_unprocessed: When false every list element is one token (identity analyzer);
            when true the token lists are joined into text and the default TF-IDF analyzer
            is applied.
    """

    def __init__(self,
                 n_components: int = 100,
                 PCA_enable: bool = True,
                 add_unprocessed: bool = False) -> None:

        self.n_components = n_components
        self.PCA_enable = PCA_enable
        self.add_unprocessed = add_unprocessed

        if not self.add_unprocessed:
            self.vectorizer = TfidfVectorizer(analyzer=lambda x: x)
        else:
            self.vectorizer = TfidfVectorizer()

        self.pca = PCA(n_components=self.n_components)

    def _as_documents(self, skills: pd.Series) -> pd.Series:
        """Return ``skills`` in the form the vectorizer expects (joined text when ``add_unprocessed``)."""
        if not self.add_unprocessed:
            return skills
        # Values that are already strings are passed through so pre-joined input still works.
        return skills.apply(lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens))

    def _tfidf_frame(self, documents: pd.Series) -> pd.DataFrame:
        """Return the dense TF-IDF matrix of ``documents`` as a frame with a fresh RangeIndex."""
        skills_tfidf = self.vectorizer.transform(documents)
        return pd.DataFrame(skills_tfidf.toarray(),
                            columns=self.vectorizer.get_feature_names_out())

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'VectorizePCASkillsTransformer':
        """Fit the TF-IDF vocabulary and, when enabled, the PCA projection on ``X['skills_plus']``."""
        documents = self._as_documents(X['skills_plus'])

        self.vectorizer.fit(documents)
        if self.PCA_enable:
            self.pca.fit(self._tfidf_frame(documents))
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return ``X`` without ``skills_plus`` and with the skill feature columns appended.

        The result has a fresh ``RangeIndex``.
        """
        # The same join as in fit() is applied here. Previously transform() passed raw token
        # lists to a vectorizer fitted on joined text, which failed when add_unprocessed=True.
        documents = self._as_documents(X['skills_plus'])
        skills_df = self._tfidf_frame(documents)

        # drop=True removes the old index directly; inserting it as a column and dropping
        # 'index' afterwards broke whenever the index had a name.
        df_vect = X.drop(columns=['skills_plus']).reset_index(drop=True)

        if self.PCA_enable:
            skills_pca = self.pca.transform(skills_df)
            skill_features = pd.DataFrame(
                skills_pca,
                columns=[f'component_{i+1}' for i in range(skills_pca.shape[1])],
            )
        else:
            skill_features = skills_df

        return pd.concat([df_vect, skill_features], axis=1)


class ExtractFeaturesTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode the categorical columns and pass every other column through.

    Encoded columns: ``schedule__name``, ``grade``, ``income__name``, ``area__name``.

    Args:
        drop_param: Forwarded to ``OneHotEncoder(drop=...)``.
    """

    def __init__(self, drop_param: Optional[str] = None) -> None:
        self.drop_param = drop_param
        self.transformers = [
            ('schedule_name', OneHotEncoder(sparse_output=False, drop=self.drop_param), ['schedule__name']),
            ('grade', OneHotEncoder(sparse_output=False, drop=self.drop_param), ['grade']),
            ('income_name', OneHotEncoder(sparse_output=False, drop=self.drop_param), ['income__name']),
            ('area_name', OneHotEncoder(sparse_output=False, drop=self.drop_param), ['area__name'])
        ]
        self.pipeline = self._build_pipeline(self.transformers)

    @staticmethod
    def _build_pipeline(transformers: list) -> Pipeline:
        """Wrap the given encoder specs into a single-step pipeline with passthrough remainder."""
        return Pipeline(steps=[
            ('column_transformer', ColumnTransformer(transformers=transformers, remainder='passthrough'))
        ])

    def fit(self, X: pd.DataFrame, y=None) -> 'ExtractFeaturesTransformer':
        """Fit the encoders for those categorical columns that are present in ``X``.

        A column removed by an earlier step (for example ``grade``) is skipped; previously a
        missing column made the whole fit fail.
        """
        available = [spec for spec in self.transformers
                     if all(column in X.columns for column in spec[2])]
        self.pipeline = self._build_pipeline(available)
        self.pipeline.fit(X)
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return the encoded data as a DataFrame with cleaned-up column names.

        The result has a fresh ``RangeIndex``.
        """
        encoded = self.pipeline.transform(X.copy())

        column_names = self.pipeline.named_steps['column_transformer'].get_feature_names_out()
        # Strip the prefixes added by ColumnTransformer / OneHotEncoder from the feature names.
        for fragment in ('remainder__', 'schedule__name_', 'income__name_', 'area__name_'):
            column_names = [name.replace(fragment, '') for name in column_names]
        column_names = [name.replace('grade__grade_', 'grade_') for name in column_names]

        return pd.DataFrame(encoded, columns=column_names)


class ConvertTypeTransformer(BaseEstimator, TransformerMixin):
    """Cast the whole frame to a single dtype.

    Args:
        new_type: Target type handed to ``DataFrame.astype``.
    """

    def __init__(self, new_type: type) -> None:
        self.new_type = new_type

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> 'ConvertTypeTransformer':
        """Stateless: nothing is learned from ``X``."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a converted copy of ``X``; the input frame is left untouched."""
        converted: pd.DataFrame = X.copy().astype(self.new_type)
        return converted

# Preprocessing Pipeline

In [18]:
# Seed constant; not referenced in the cells shown here — presumably used by later modelling cells.
RANDOM_STATE = 42

# Profession (income__name) groups; each group is preprocessed separately further below.
income_names_IT = ['Frontend', 'Backend', 'DevOps', 'Веб дизайнер', 'QA инженер', 'Mobile',
                   'Project manager', 'Product manager', 'Технический писатель', 'Data Analyst',
                   'Data Engineer', 'Data Scientist', 'Аналитик', 'Бизнес аналитик',
                   'Системный аналитик', 'Руководитель проектов', 'IT Project manager', 'IT Product manager']

income_names_1C = ['1C оператор', '1С эксперт', '1С архитектор', 'Руководитель проектов 1С',
                   '1С методист', '1С администратор', '1C консультант', '1С программист', '1С аналитик']

income_names_other = ['Менеджер по продажам', 'Менеджер по работе с клиентами',
                      'Специалист технической поддержки']

# Columns of the raw dataset kept by ClearDataFrameTransformer (note: 'income_name' is the raw
# column name; it is renamed to 'income__name' during cleaning).
SAVE_COLS = ['income_name', 'area__name', 'schedule__name', 'grade',
            'key_skills__names', 'salary__currency',
            'exchange_rate', 'salary_from_gross', 'salary_to_gross', 'salary_average']


@dataclass
class PreprocessParams:
    """Settings consumed by ``get_preprocess_pipeline``."""

    BASIC_TRANSFORM: bool = False  # build only the basic cleaning pipeline
    FREQ_CUTOFF: int = 1000  # minimum lemma frequency kept by the skills processing step
    THRESHOLD_CITIES: int = 50  # cumulative share of rows (percent) of cities kept by name
    PERC_NUMBER: int = 0  # 0 - 25% | 1 - 50% | 2 - 75%
    IS_PCA_ENABLE: bool = True  # reduce the TF-IDF skill matrix with PCA
    N_COMPONENTS: int = 100  # number of PCA components
    ADD_UNPROCESSED: bool = False  # also use skills that have no canonical form
    WITH_SKILLS: bool = True  # keep and vectorise the skills columns
    WITH_GRADE: bool = True  # keep grade columns
    WITH_AREA: bool = True  # keep area columns
    WHITELIST: bool = True  # True: keep PROF_LIST professions, False: exclude them
    PROF_LIST: Optional[List[str]] = None  # professions to keep / exclude
    VERBOSE: bool = False  # forwarded to the sklearn Pipeline objects
    CALC_SKILLS: bool = False  # keep the skills_quality score column
    # `drop=` value for the one-hot encoders. It used to lack a type annotation, which made it
    # a plain class attribute: it could not be set through the constructor and was missing
    # from the repr. It is declared last so existing positional arguments keep their meaning.
    DROP_PARAM_ONE_HOT: Optional[str] = 'first'

    def __repr__(self):
        """Return a bulleted, human-readable listing of every field and its value."""
        field_strings = [f"* {field.name}: {getattr(self, field.name)}" for field in fields(self)]
        result_str = "\n".join(field_strings)
        sep = colored('\n' + '-' * 30 + '\n', 'cyan')
        return f'\nPreprocess params:{sep}{result_str}'


def get_preprocess_pipeline(params: PreprocessParams) -> Pipeline:
    """Assemble the (unfitted) preprocessing pipeline described by ``params``.

    With ``params.BASIC_TRANSFORM`` only the cleaning pipeline is displayed and returned.
    Otherwise the full pipeline is built: cleaning, one-hot encoding, skill vectorisation
    (only when ``params.WITH_SKILLS``) and a final cast to float; the parameters are printed
    and the pipeline is displayed before it is returned.
    """
    verbose = params.VERBOSE

    basic_pipe = Pipeline(steps=[
        ('clear_df', ClearDataFrameTransformer(save_cols=SAVE_COLS)),
        ('create_new_cities', CreateNewCitiesTransformer(threshold=params.THRESHOLD_CITIES)),
        ('select_professions', SelectProfessionsTransformer(prof_list=params.PROF_LIST,
                                                            whitelist=params.WHITELIST)),
        ('fill_na_salary', FillNaSalaryTransformer(coef_index=params.PERC_NUMBER)),
        ('preprocess_key_skills', PreprocessKeySkillsTransformer(s_dict=skills_dict,
                                                                 p_dict=skill_prio_dict,
                                                                 calc_skills=params.CALC_SKILLS,
                                                                 with_skills=params.WITH_SKILLS,
                                                                 with_grade=params.WITH_GRADE,
                                                                 with_area=params.WITH_AREA)),
    ], verbose=verbose)

    if params.BASIC_TRANSFORM:
        display(basic_pipe)
        return basic_pipe

    # Built unconditionally, exactly as before, even though it is only used with WITH_SKILLS.
    pipe_skill_vectorize = Pipeline(steps=[
        ('process_skills', ProcessSkillsTransformer(freq_cutoff=params.FREQ_CUTOFF,
                                                    add_unprocessed=params.ADD_UNPROCESSED)),
        ('vectorize_PCA_skills', VectorizePCASkillsTransformer(n_components=params.N_COMPONENTS,
                                                               PCA_enable=params.IS_PCA_ENABLE,
                                                               add_unprocessed=params.ADD_UNPROCESSED)),
    ], verbose=verbose)

    # Both variants share every step; skill vectorisation is the only optional one.
    steps = [
        ('basic_pipe', basic_pipe),
        ('extract_features', ExtractFeaturesTransformer(drop_param=params.DROP_PARAM_ONE_HOT)),
    ]
    if params.WITH_SKILLS:
        steps.append(('skill_vectorize', pipe_skill_vectorize))
    steps.append(('convert_type', ConvertTypeTransformer(new_type=float)))

    pipe = Pipeline(steps=steps, verbose=verbose)

    print(params, end='\n\n')
    display(pipe)
    return pipe

In [19]:
# Basic cleaning only, for all professions: an empty PROF_LIST combined with WHITELIST=False
# excludes nothing. IS_PCA_ENABLE has no effect here because BASIC_TRANSFORM returns before
# the vectorisation steps are added.
df_all = get_preprocess_pipeline(PreprocessParams(PROF_LIST=[],
                                                  BASIC_TRANSFORM=True,
                                                  WHITELIST=False,
                                                  IS_PCA_ENABLE=False)).fit_transform(df_original)
print_df_info(df_all)

Shape: (100453, 9)
--------------------------------------------------


Unnamed: 0,income__name,area__name,schedule__name,grade,salary_from_gross,salary_to_gross,salary_average,skills,unconverted_skills
52221,Менеджер по работе с клиентами,Малый город,Полный день,Intern (0-1),48280.0,56800.0,52540.0,[],[]
30884,Менеджер по продажам,Краснодар,Удаленная работа,Junior (1-3),146900.0,172823.529412,159861.764706,"[активные продажи, проведение презентаций, нав...",[B2C продажи]
137751,1С программист,Казань,Полный день,Junior (1-3),90400.0,135600.0,113000.0,[],[]


--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 100453 entries, 2 to 188832
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   income__name        100453 non-null  object 
 1   area__name          100453 non-null  object 
 2   schedule__name      100453 non-null  object 
 3   grade               100453 non-null  object 
 4   salary_from_gross   100453 non-null  float64
 5   salary_to_gross     100453 non-null  float64
 6   salary_average      100453 non-null  float64
 7   skills              100453 non-null  object 
 8   unconverted_skills  100453 non-null  object 
dtypes: float64(3), object(6)
memory usage: 7.7+ MB


None

# Model RNN & BERT vectorization

In [13]:
# words_and_vec = {k: None for k in list(skills_dict.keys())}
# tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# model = BertModel.from_pretrained('bert-base-multilingual-cased')

# def get_word_vector(word):
#     inputs = tokenizer(word, return_tensors='pt')
#     outputs = model(**inputs)
#     vector = outputs.last_hidden_state[:, 0, :].detach().numpy()
#     return vector.flatten()

# words_and_vec = {k: get_word_vector(k) for k in list(skills_dict.keys())}

In [14]:
# def get_transforms_for_dl(df, word_dict):

#     def skills_to_vectors(skills_list):
#         return [word_dict[skill] for skill in skills_list]

#     df_new = df.copy()
#     df_new = df_new.drop(['salary_from_gross', 'salary_to_gross', 'unconverted_skills'], axis=1)
#     df_new = df_new[df_new['skills'].apply(lambda x: len(x) > 0)]
#     df_new['skills'] = df_new['skills'].apply(skills_to_vectors)

#     # One-hot encoding for cols: 'area_name', 'schedule_name', 'grade'
#     pipe_for_dl = Pipeline(steps=[
#         ('one-hot', ExtractFeaturesTransformer(drop_param='first'))
#     ])

#     df_new = pipe_for_dl.fit_transform(df_new)

#     return df_new

In [15]:
# def pad_skills(skills_list, max_len=None):
#     if not max_len:
#         max_len = max(len(skills) for skills in skills_list)
#     padded_skills = [np.pad(skills, ((0, max_len - len(skills)), (0, 0)), 'constant') for skills in skills_list]
#     return padded_skills

# class SalaryDataset(Dataset):
#     def __init__(self, X, y, max_skill_len=None):
#         self.tabular_data = torch.tensor(np.array(X.drop(columns=['skills']).values.tolist(), dtype=np.float32))
#         self.skills_data = pad_skills(X['skills'], max_skill_len)
#         self.skills_data = torch.tensor(self.skills_data, dtype=torch.float32)
#         self.labels = torch.tensor(y.values.astype(np.float32)).view(-1, 1)
    
#     def __len__(self):
#         return len(self.tabular_data)
    
#     def __getitem__(self, idx):
#         return self.tabular_data[idx], self.skills_data[idx], self.labels[idx]

In [16]:
# df_all_dl = get_transforms_for_dl(df_all, words_and_vec)

# X = df_all_dl.drop(['salary_average'], axis=1)
# y = df_all_dl['salary_average']

# X_train, X_temp, y_train, y_temp = train_test_split(X, y, random_state=42, test_size=0.2)
# X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, random_state=42, test_size=0.5)

# train_dataset = SalaryDataset(X_train, y_train)
# validation_dataset = SalaryDataset(X_val, y_val)
# test_dataset = SalaryDataset(X_test, y_test)

# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# validation_loader = DataLoader(validation_dataset, batch_size=32, shuffle=False)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

  self.skills_data = torch.tensor(self.skills_data, dtype=torch.float32)


In [19]:
# class SalaryPredictionModel(nn.Module):
#     def __init__(self, input_dim, skills_dim, hidden_dim):
#         super(SalaryPredictionModel, self).__init__()
#         self.fc1 = nn.Linear(input_dim, 128)
#         self.fc2 = nn.Linear(128, 64)
        
#         self.lstm = nn.LSTM(skills_dim, hidden_dim, batch_first=True)
#         self.fc3 = nn.Linear(hidden_dim, 64)
        
#         self.fc4 = nn.Linear(64 + 64, 1)
    
#     def forward(self, tabular_data, skills_data):
#         x1 = torch.relu(self.fc1(tabular_data))
#         x1 = torch.relu(self.fc2(x1))
        
#         packed_skills = pack_sequence(skills_data, enforce_sorted=False)
#         lstm_out, _ = self.lstm(packed_skills)
#         lstm_out, _ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
        
#         last_lstm_out = lstm_out[:, -1, :]
        
#         x2 = torch.relu(self.fc3(last_lstm_out))
        
#         x = torch.cat((x1, x2), dim=1)
#         x = self.fc4(x)
#         return x

# input_dim = X_train.drop(columns=['skills']).shape[1]
# skills_dim = len(X_train['skills'].iloc[0][0])
# hidden_dim = 64

# model = SalaryPredictionModel(input_dim, skills_dim, hidden_dim)

# criterion = nn.L1Loss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# num_epochs = 20

# for epoch in range(num_epochs):
#     model.train()
#     running_loss = 0.0
#     for tabular_data, skills_data, labels in train_loader:
#         optimizer.zero_grad()
#         outputs = model(tabular_data, skills_data)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
    
#     print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}')

# model.eval()
# with torch.no_grad():
#     y_pred = []
#     y_true = []
#     for tabular_data, skills_data, labels in test_loader:
#         outputs = model(tabular_data, skills_data)
#         y_pred.extend(outputs.numpy())
#         y_true.extend(labels.numpy())

#     from sklearn.metrics import mean_squared_error
#     mse = mean_squared_error(y_true, y_pred)
#     print(f'Mean Squared Error: {mse:.4f}')

# print("Примеры предсказаний:")
# for i in range(3):
#     print(f"Предсказанная зарплата: {y_pred[i][0]:.2f}, Истинная зарплата: {y_true[i][0]:.2f}")

# Preprocess IT/1C/other

In [20]:
# One parameter set per profession group. IT keeps the raw TF-IDF skill columns (no PCA),
# 1C uses the default PCA projection, and "other" is processed without skill features.
preprocess_params = {
    "IT": PreprocessParams(
        WITH_SKILLS=True,
        WITH_GRADE=True,
        WITH_AREA=True,
        IS_PCA_ENABLE=False,
        PROF_LIST=income_names_IT
    ),
    "1C": PreprocessParams(
        WITH_SKILLS=True,
        WITH_GRADE=True,
        WITH_AREA=True,
        PROF_LIST=income_names_1C
    ),
    "other": PreprocessParams(
        WITH_SKILLS=False,
        WITH_GRADE=True,
        WITH_AREA=True,
        PROF_LIST=income_names_other
    )
}

# Per-group results: processed data, the fitted pipeline, its parameters, and placeholders
# for a model and its RMSE (initialised to None here).
results = {}    

# Width of the centred banner printed before each group.
width = 40

for key, params in preprocess_params.items():
    print(colored(f"{key:-^{width}}", 'green'))
    # Fit and transform on the same raw frame; the fitted pipeline is stored alongside the data.
    pipeline = get_preprocess_pipeline(params).fit(df_original)
    df_processed = pipeline.transform(df_original)
    results[key] = {
        "data": df_processed,
        "pipe": pipeline,
        "Params": params,
        "pretrained_model": None,
        "rmse_score": None
    }

# Convenience aliases for the processed frames of each group.
df_after_preprocess_IT = results["IT"]["data"]
df_after_preprocess_1C = results["1C"]["data"]
df_after_preprocess_other = results["other"]["data"]

[32m-------------------IT-------------------[0m

Preprocess params:[36m
------------------------------
[0m* BASIC_TRANSFORM: False
* FREQ_CUTOFF: 1000
* THRESHOLD_CITIES: 50
* PERC_NUMBER: 0
* IS_PCA_ENABLE: False
* N_COMPONENTS: 100
* ADD_UNPROCESSED: False
* WITH_SKILLS: True
* WITH_GRADE: True
* WITH_AREA: True
* WHITELIST: True
* PROF_LIST: ['Frontend', 'Backend', 'DevOps', 'Веб дизайнер', 'QA инженер', 'Mobile', 'Project manager', 'Product manager', 'Технический писатель', 'Data Analyst', 'Data Engineer', 'Data Scientist', 'Аналитик', 'Бизнес аналитик', 'Системный аналитик', 'Руководитель проектов', 'IT Project manager', 'IT Product manager']
* VERBOSE: False
* CALC_SKILLS: False



[32m-------------------1C-------------------[0m

Preprocess params:[36m
------------------------------
[0m* BASIC_TRANSFORM: False
* FREQ_CUTOFF: 1000
* THRESHOLD_CITIES: 50
* PERC_NUMBER: 0
* IS_PCA_ENABLE: True
* N_COMPONENTS: 100
* ADD_UNPROCESSED: False
* WITH_SKILLS: True
* WITH_GRADE: True
* WITH_AREA: True
* WHITELIST: True
* PROF_LIST: ['1C оператор', '1С эксперт', '1С архитектор', 'Руководитель проектов 1С', '1С методист', '1С администратор', '1C консультант', '1С программист', '1С аналитик']
* VERBOSE: False
* CALC_SKILLS: False



[32m-----------------other------------------[0m

Preprocess params:[36m
------------------------------
[0m* BASIC_TRANSFORM: False
* FREQ_CUTOFF: 1000
* THRESHOLD_CITIES: 50
* PERC_NUMBER: 0
* IS_PCA_ENABLE: True
* N_COMPONENTS: 100
* ADD_UNPROCESSED: False
* WITH_SKILLS: False
* WITH_GRADE: True
* WITH_AREA: True
* WHITELIST: True
* PROF_LIST: ['Менеджер по продажам', 'Менеджер по работе с клиентами', 'Специалист технической поддержки']
* VERBOSE: False
* CALC_SKILLS: False



In [21]:
# Show the summary of every processed frame, in the order IT, 1C, other.
for frame in (df_after_preprocess_IT, df_after_preprocess_1C, df_after_preprocess_other):
    print_df_info(frame)

Shape: (14708, 460)
--------------------------------------------------


Unnamed: 0,schedule_name__Гибкий график,schedule_name__Полный день,schedule_name__Сменный график,schedule_name__Удаленная работа,grade_Junior (1-3),grade_Middle (3-6),grade_Senior (>6),income_name__Data Analyst,income_name__Data Engineer,income_name__Data Scientist,...,финансовый контроль,функциональное тестирование,художественный вкус,ценообразование,чувство вкуса,чувство стиля,эконометрика,экономика,экономический анализ,эксплуатационные документы
10940,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1668,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5655,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14708 entries, 0 to 14707
Columns: 460 entries, schedule_name__Гибкий график to эксплуатационные документы
dtypes: float64(460)
memory usage: 51.6 MB


None

Shape: (7510, 125)
--------------------------------------------------


Unnamed: 0,schedule_name__Гибкий график,schedule_name__Полный день,schedule_name__Сменный график,schedule_name__Удаленная работа,grade_Junior (1-3),grade_Middle (3-6),grade_Senior (>6),income_name__1C оператор,income_name__1С администратор,income_name__1С аналитик,...,component_91,component_92,component_93,component_94,component_95,component_96,component_97,component_98,component_99,component_100
2266,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.005221,-0.013146,-0.007121,0.003097,-0.002682,-0.001346,-0.009628,-0.003209,0.00257,-0.005433
5331,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.059631,0.065732,-0.065973,-0.035098,-0.002619,-0.008205,0.006635,-0.001817,0.007207,-0.006344
4386,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.01497,-0.002411,0.00072,-0.227124,0.026607,-0.14883,0.022957,0.068311,0.024638,0.027507


--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7510 entries, 0 to 7509
Columns: 125 entries, schedule_name__Гибкий график to component_100
dtypes: float64(125)
memory usage: 7.2 MB


None

Shape: (57533, 19)
--------------------------------------------------


Unnamed: 0,schedule_name__Гибкий график,schedule_name__Полный день,schedule_name__Сменный график,schedule_name__Удаленная работа,grade_Junior (1-3),grade_Middle (3-6),grade_Senior (>6),income_name__Менеджер по работе с клиентами,income_name__Специалист технической поддержки,area_name__Казань,area_name__Краснодар,area_name__Малый город,area_name__Москва,area_name__Нижний Новгород,area_name__Новосибирск,area_name__Санкт-Петербург,salary_from_gross,salary_to_gross,salary_average
50759,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,93112.0,116390.0,104751.0
26128,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,60000.0,150000.0,105000.0
11762,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,60000.0,75000.0,67500.0


--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57533 entries, 0 to 57532
Data columns (total 19 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   schedule_name__Гибкий график                   57533 non-null  float64
 1   schedule_name__Полный день                     57533 non-null  float64
 2   schedule_name__Сменный график                  57533 non-null  float64
 3   schedule_name__Удаленная работа                57533 non-null  float64
 4   grade_Junior (1-3)                             57533 non-null  float64
 5   grade_Middle (3-6)                             57533 non-null  float64
 6   grade_Senior (>6)                              57533 non-null  float64
 7   income_name__Менеджер по работе с клиентами    57533 non-null  float64
 8   income_name__Специалист технической поддержки  57533 non-null  float64
 9  

None

# Deep Learning Solution

In [34]:
class SparseRegressionModel(nn.Module):
    """Fully connected regression network with a single output value.

    Layer widths are input_dim -> 1024 -> 512 -> 256 -> 128 -> 64 -> 1.
    LeakyReLU (negative slope 0.1) follows every linear layer except the last,
    and dropout (p=0.5) is applied after the first and the third activation.
    """

    def __init__(self, input_dim):
        super().__init__()
        # The attribute names fc1..fc6 become the state_dict keys, so they are
        # generated with exactly these names and in this order.
        widths = (input_dim, 1024, 512, 256, 128, 64, 1)
        for position, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{position}", nn.Linear(n_in, n_out))
        self.dropout = nn.Dropout(0.5)
        self.leaky_relu = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Return the network output for ``x``.

        ``x.to_dense()` is called first, so a sparse input tensor is
        densified before it reaches the first linear layer.
        """
        activate = self.leaky_relu
        hidden = self.dropout(activate(self.fc1(x.to_dense())))
        hidden = activate(self.fc2(hidden))
        hidden = self.dropout(activate(self.fc3(hidden)))
        hidden = activate(self.fc4(hidden))
        hidden = activate(self.fc5(hidden))
        return self.fc6(hidden)

In [35]:
# data = df_all.copy()

# X = data.drop(columns=['salary_from_gross', 'salary_to_gross', 'salary_average']).values 
# y = data['salary_average'].values

# X_train, X_temp, y_train, y_temp = train_test_split(X, y, random_state=42, test_size=0.2)
# X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, random_state=42, test_size=0.5)

# X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
# y_train = torch.tensor(y_train, dtype=torch.float32).view(-1, 1).to(device)
# X_val = torch.tensor(X_val, dtype=torch.float32).to(device)
# y_val = torch.tensor(y_val, dtype=torch.float32).view(-1, 1).to(device)
# X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
# y_test = torch.tensor(y_test, dtype=torch.float32).view(-1, 1).to(device)

# train_dataset = TensorDataset(X_train, y_train)
# val_dataset = TensorDataset(X_val, y_val)
# test_dataset = TensorDataset(X_test, y_test)

# batch_size = 16
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# model = SparseRegressionModel(X.shape[1]).to(device)

# model = SparseRegressionModel(X.shape[1]).to(device)
# l1_loss = nn.L1Loss()
# mse_loss = nn.MSELoss()

# criterion = l1_loss
# optimizer = optim.Adam(model.parameters(), lr=0.0001)


# num_epochs = 200
# patience = 5
# best_val_loss = float('inf')
# early_stop_counter = 0
# train_history = []
# val_history = []

# for epoch in range(num_epochs):
#     model.train()
#     running_loss = 0.0
#     for inputs, labels in train_loader:
#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item() * inputs.size(0)
#     epoch_loss = running_loss / len(train_loader.dataset)
#     train_history.append(epoch_loss)
    
#     model.eval()
#     val_running_loss = 0.0
#     with torch.no_grad():
#         for inputs, labels in val_loader:
#             outputs = model(inputs)
#             loss = criterion(outputs, labels)
#             val_running_loss += loss.item() * inputs.size(0)
#     val_epoch_loss = val_running_loss / len(val_loader.dataset)
#     val_history.append(val_epoch_loss)
    
#     print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Val Loss: {val_epoch_loss:.4f}')
    
#     if val_epoch_loss < best_val_loss:
#         best_val_loss = val_epoch_loss
#         early_stop_counter = 0
#         torch.save(model.state_dict(), 'best_model.pth')
#     else:
#         early_stop_counter += 1
#         if early_stop_counter >= patience:
#             print("Early stopping")
#             break


# model.load_state_dict(torch.load('best_model.pth'))

# model.eval()
# with torch.no_grad():
#     y_pred = model(X_test)
#     l1_lose = l1_loss(y_pred, y_test).item()
#     mse = mse_loss(y_pred, y_test).item()
#     rmse = np.sqrt(mse)
#     mape = torch.mean(torch.abs((y_test - y_pred) / y_test)).item() * 100

# print(f'MSE: {mse:.4f}')
# print(f'RMSE: {rmse:.4f}')
# print(f'MAPE: {mape:.2f}%')

# plt.plot(train_history, label='Training Loss')
# plt.plot(val_history, label='Validation Loss')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()
# plt.show()

cuda
(100453, 6)


# Fitting Model

In [30]:
# One single-candidate grid per regressor: each entry swaps the 'model' step
# for a different estimator, all seeded with RANDOM_STATE.
param_grid = [
    {'model': [candidate]}
    for candidate in (
        Ridge(random_state=RANDOM_STATE),
        CatBoostRegressor(verbose=False, random_state=RANDOM_STATE),
        XGBRegressor(random_state=RANDOM_STATE),
        RandomForestRegressor(verbose=False, n_jobs=-1, random_state=RANDOM_STATE),
        DecisionTreeRegressor(random_state=RANDOM_STATE, criterion='friedman_mse'),
    )
]

# Error metrics wrapped as scikit-learn scorers. All three are losses (lower
# is better), so they are registered with greater_is_better=False: model
# selection utilities maximise the scorer value, and without the flag they
# would prefer the configuration with the LARGEST error. As a consequence the
# scores reported by these scorers are the negated metric values.
mape = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
rmse = make_scorer(root_mean_squared_error, greater_is_better=False)
mse = make_scorer(mean_squared_error, greater_is_better=False)

In [31]:
def train(data, estimator, n_top_features=10):
    """Fit ``estimator`` on an 80/20 split of ``data`` and plot its top feature importances.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns 'salary_from_gross', 'salary_to_gross'
        and 'salary_average'. All three are removed from the features;
        'salary_average' is the regression target.
    estimator : object
        Regressor exposing ``fit`` and ``predict``. The importance plot is
        drawn only if it also exposes ``feature_importances_`` after fitting.
    n_top_features : int, default 10
        Maximum number of features shown in the bar chart. Values larger than
        the number of features are clamped.

    Returns
    -------
    tuple
        ``(rmse_score, estimator)``: the RMSE on the held-out 20% and the
        fitted estimator.
    """
    target_columns = ['salary_from_gross', 'salary_to_gross', 'salary_average']
    X = data.drop(columns=target_columns)
    y = data['salary_average']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE, test_size=0.2)
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)

    rmse_score = root_mean_squared_error(y_test, y_pred)
    print(rmse_score)

    # Not every regressor exposes feature_importances_ (linear models do not);
    # in that case skip the plot instead of failing after a successful fit.
    importances = getattr(estimator, 'feature_importances_', None)
    if importances is None:
        print(f'{type(estimator).__name__} has no feature_importances_; skipping the importance plot')
        return rmse_score, estimator

    importances = np.asarray(importances)
    # Never ask for more bars than there are features, otherwise the bar
    # positions and the bar heights would have different lengths.
    n_shown = max(0, min(n_top_features, importances.shape[0]))
    top_indices = np.argsort(importances)[::-1][:n_shown]
    top_importances = importances[top_indices]
    top_features = X.columns[top_indices]

    plt.figure(figsize=(10, 6))
    plt.title('Feature Importances')
    plt.bar(range(n_shown), top_importances, align='center')
    plt.xticks(range(n_shown), top_features, rotation=90)
    plt.xlim([-1, n_shown])
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.tight_layout()
    plt.show()

    return rmse_score, estimator

# DEMO preprocess & fitting

In [38]:
def get_prediction_pipeline(params: PreprocessParams) -> Pipeline:
    """Build, display and return the (unfitted) feature pipeline for prediction.

    The pipeline always starts with the nested 'basic_pipe' (column renaming
    followed by feature extraction) and always ends with a conversion to
    float. When ``params.WITH_SKILLS`` is truthy, the skills processing and
    skills vectorisation steps are inserted between the two.
    """
    basic_pipe = Pipeline(
        steps=[
            ('rename_cols', RenameDataTransformer()),
            ('one-hot', ExtractFeaturesTransformer(drop_param=params.DROP_PARAM_ONE_HOT)),
        ],
        verbose=params.VERBOSE,
    )

    stages = [('basic_pipe', basic_pipe)]
    if params.WITH_SKILLS:
        stages += [
            ('process_scills', ProcessSkillsTransformer(freq_cutoff=params.FREQ_CUTOFF)),
            ('vectorize_skills', VectorizePCASkillsTransformer(n_components=params.N_COMPONENTS,
                                                               PCA_enable=params.IS_PCA_ENABLE)),
        ]
    stages.append(('convert_type', ConvertTypeTransformer(float)))

    prediction_pipe = Pipeline(steps=stages, verbose=params.VERBOSE)
    display(prediction_pipe)
    return prediction_pipe

def train_model(X, y, estimator):
    """Fit ``estimator`` on 80% of ``(X, y)``, print the hold-out RMSE and return it fitted.

    The split uses ``RANDOM_STATE`` and keeps 20% of the rows for evaluation.
    Only the fitted estimator is returned; the score is printed, not returned.
    """
    split = train_test_split(X, y, random_state=RANDOM_STATE, test_size=0.2)
    features_fit, features_holdout, target_fit, target_holdout = split

    estimator.fit(features_fit, target_fit)
    holdout_predictions = estimator.predict(features_holdout)

    print(root_mean_squared_error(target_holdout, holdout_predictions))
    return estimator


def export_results_for_prediction(result_dict: Dict, folder_name: str = 'output') -> None:
    """Write models, params and pipelines of every group to disk and zip the result.

    For each ``key`` of ``result_dict`` a directory
    ``{ROOT_DIR}/Notebooks/{folder_name}/{key}`` is created and filled with:

    * ``data_before.xlsx`` / ``data_after.xlsx`` (skipped for ``'all_data'``);
    * the model: a joblib pickle ``model_{key}.pkl``, or for ``'all_data'``
      two PyTorch state dicts (``best_model_cuda.pth`` as stored in the model,
      ``best_model_cpu.pth`` with every tensor moved to the CPU);
    * ``params.json`` with the preprocessing parameters;
    * ``pipe_{key}.pkl`` with the cloudpickled pipeline.

    Finally the whole export directory is archived to
    ``{ROOT_DIR}/Notebooks/{folder_name}.zip``.

    Parameters
    ----------
    result_dict : Dict
        Maps a group name to a dict with the keys 'pretrained_model',
        'Params' and 'pipe', plus 'data_before' and 'data_after' for every
        group except 'all_data'.
    folder_name : str, default 'output'
        Name of the export directory (and of the archive) below
        ``{ROOT_DIR}/Notebooks``.
    """
    export_root = f"{ROOT_DIR}/Notebooks/{folder_name}"

    for key, value in result_dict.items():
        output_dir_path = f"{export_root}/{key}"
        # Create the directory that is actually written to below, so the
        # export does not depend on the current working directory.
        os.makedirs(output_dir_path, exist_ok=True)
        print(f'\n>>> Folder was created: {colored(output_dir_path, "green")}')

        if key != 'all_data':
            value['data_before'].to_excel(f"{output_dir_path}/data_before.xlsx")
            value['data_after'].to_excel(f"{output_dir_path}/data_after.xlsx")
            print('>>> Data frames were exported')

            joblib.dump(value['pretrained_model'], f'{output_dir_path}/model_{key}.pkl')
        else:
            print('>>> All data is processing. Skip saving DataFrames')

            # Saved as-is: the tensors stay on whatever device the model lives on.
            state_dict = value['pretrained_model'].state_dict()
            torch.save(state_dict, f'{output_dir_path}/best_model_cuda.pth')

            # The CPU variant must really hold CPU tensors so it can be loaded
            # on a machine without a GPU. Moving the tensors directly also
            # avoids rebuilding the network with a hard-coded input size.
            cpu_state_dict = {name: tensor.cpu() for name, tensor in state_dict.items()}
            torch.save(cpu_state_dict, f'{output_dir_path}/best_model_cpu.pth')

        print('>>> Model was exported')

        with open(f'{output_dir_path}/params.json', 'w', encoding='utf-8') as f:
            json.dump(asdict(value['Params']), f, ensure_ascii=False)

        print('>>> Preprocessing params were exported')

        with open(f'{output_dir_path}/pipe_{key}.pkl', 'wb') as f:
            cloudpickle.dump(value['pipe'], f)
            print('>>> Pipeline was exported')

    # Name the archive after the exported folder and place it next to it, so
    # the path that is checked below is the path that was actually written.
    archived = shutil.make_archive(export_root, 'zip', export_root)

    if os.path.exists(archived):
        print(f"\n>>> Arhive was created: {colored(archived, 'green')}")
    else:
        print('\n>>>', colored("ZIP file not created", 'red'))

In [36]:
# Preprocessing configuration for the exported prediction models. The three
# profession groups are trained without skills; 'all_data' keeps the skills
# and is served by the pretrained neural network.
predict_preprocess_params = {
    "IT": PreprocessParams(
        WITH_SKILLS=False,
        WITH_GRADE=True,
        WITH_AREA=True,
        PROF_LIST=income_names_IT,
    ),
    "1C": PreprocessParams(
        WITH_SKILLS=False,
        WITH_GRADE=True,
        WITH_AREA=True,
        PROF_LIST=income_names_1C,
    ),
    "other": PreprocessParams(
        WITH_SKILLS=False,
        WITH_GRADE=True,
        WITH_AREA=True,
        PROF_LIST=income_names_other
    ),
    # NOTE(review): WHITELIST=False together with PROF_LIST=['Frontend']
    # presumably turns the list into an exclusion list - confirm against
    # PreprocessParams / get_preprocess_pipeline.
    "all_data": PreprocessParams(
        WITH_SKILLS=True,
        WITH_GRADE=True,
        WITH_AREA=True,
        IS_PCA_ENABLE=False,
        PROF_LIST=['Frontend'],
        WHITELIST=False,
        CALC_SKILLS=False,
        VERBOSE=False,
    )
}

# Per-group artefacts consumed by export_results_for_prediction.
results_predict = {}

width = 40

for key, params in predict_preprocess_params.items():
    print(colored(f"{key:-^{width}}", 'green'))

    # Stage 1: basic transform only, to obtain the frame the prediction
    # pipeline will receive and the matching target column.
    params.BASIC_TRANSFORM = True
    df_basic: pd.DataFrame = get_preprocess_pipeline(params).fit_transform(df_original)

    target_basic = df_basic['salary_average']
    df_basic = df_basic.drop(['salary_from_gross', 'salary_to_gross', 'salary_average'], axis=1)
    display(df_basic)

    # Stage 2: fit the prediction pipeline on the target-free frame. The flag
    # is reset first, so the params stored and exported below carry
    # BASIC_TRANSFORM=False.
    params.BASIC_TRANSFORM = False
    pipeline = get_prediction_pipeline(params).fit(df_basic)
    df_processed = pipeline.transform(df_basic)

    display(df_processed)

    if key != 'all_data':
        model = RandomForestRegressor(n_jobs=-1, random_state=RANDOM_STATE)
        model = train_model(df_processed, target_basic, model)
    else:
        model = SparseRegressionModel(df_processed.values.shape[1]).to(device)
        # map_location remaps the stored tensors onto the active device, so a
        # checkpoint saved from a GPU session also loads on a CPU-only machine.
        model.load_state_dict(torch.load('model_skills.pth', map_location=device))

    results_predict[key] = {
        "data_before": df_basic,
        "data_after": df_processed,
        "pipe": pipeline,
        "Params": params,
        "pretrained_model": model
    }

[32m-------------------IT-------------------[0m


Unnamed: 0,income__name,area__name,schedule__name,grade
48,Backend,Санкт-Петербург,Полный день,Middle (3-6)
54,Backend,Москва,Удаленная работа,Middle (3-6)
55,Backend,Санкт-Петербург,Удаленная работа,Middle (3-6)
57,Backend,Санкт-Петербург,Удаленная работа,Middle (3-6)
61,Backend,Москва,Полный день,Junior (1-3)
...,...,...,...,...
188787,Frontend,Санкт-Петербург,Полный день,Junior (1-3)
188792,Frontend,Москва,Удаленная работа,Middle (3-6)
188806,Frontend,Москва,Удаленная работа,Senior (>6)
188819,Frontend,Екатеринбург,Удаленная работа,Middle (3-6)


Unnamed: 0,schedule_name__Гибкий график,schedule_name__Полный день,schedule_name__Сменный график,schedule_name__Удаленная работа,grade_Junior (1-3),grade_Middle (3-6),grade_Senior (>6),income_name__Data Analyst,income_name__Data Engineer,income_name__Data Scientist,...,income_name__Руководитель проектов,income_name__Системный аналитик,income_name__Технический писатель,area_name__Казань,area_name__Краснодар,area_name__Малый город,area_name__Москва,area_name__Нижний Новгород,area_name__Новосибирск,area_name__Санкт-Петербург
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28011,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
28012,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
28013,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
28014,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


44107.15708629126
[32m-------------------1C-------------------[0m


Unnamed: 0,income__name,area__name,schedule__name,grade
2,1С программист,Санкт-Петербург,Полный день,Middle (3-6)
5,1С программист,Нижний Новгород,Удаленная работа,Middle (3-6)
9,1С программист,Санкт-Петербург,Полный день,Middle (3-6)
11,1С программист,Москва,Полный день,Middle (3-6)
12,1С программист,Санкт-Петербург,Полный день,Junior (1-3)
...,...,...,...,...
187587,1С архитектор,Москва,Полный день,Middle (3-6)
187588,1С архитектор,Москва,Удаленная работа,Middle (3-6)
187606,1С архитектор,Москва,Удаленная работа,Middle (3-6)
187607,1С архитектор,Москва,Удаленная работа,Senior (>6)


Unnamed: 0,schedule_name__Гибкий график,schedule_name__Полный день,schedule_name__Сменный график,schedule_name__Удаленная работа,grade_Junior (1-3),grade_Middle (3-6),grade_Senior (>6),income_name__1C оператор,income_name__1С администратор,income_name__1С аналитик,...,income_name__1С программист,income_name__1С эксперт,income_name__Руководитель проектов 1С,area_name__Казань,area_name__Краснодар,area_name__Малый город,area_name__Москва,area_name__Нижний Новгород,area_name__Новосибирск,area_name__Санкт-Петербург
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14899,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
14900,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
14901,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
14902,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


28739.553738600167
[32m-----------------other------------------[0m


Unnamed: 0,income__name,area__name,schedule__name,grade
20195,Менеджер по продажам,Малый город,Полный день,Intern (0-1)
20197,Менеджер по продажам,Екатеринбург,Полный день,Intern (0-1)
20198,Менеджер по продажам,Малый город,Полный день,Intern (0-1)
20199,Менеджер по продажам,Нижний Новгород,Полный день,Intern (0-1)
20200,Менеджер по продажам,Санкт-Петербург,Полный день,Middle (3-6)
...,...,...,...,...
177305,Специалист технической поддержки,Москва,Полный день,Junior (1-3)
177307,Специалист технической поддержки,Москва,Полный день,Junior (1-3)
177308,Специалист технической поддержки,Москва,Удаленная работа,Intern (0-1)
177309,Специалист технической поддержки,Москва,Полный день,Intern (0-1)


Unnamed: 0,schedule_name__Гибкий график,schedule_name__Полный день,schedule_name__Сменный график,schedule_name__Удаленная работа,grade_Junior (1-3),grade_Middle (3-6),grade_Senior (>6),income_name__Менеджер по работе с клиентами,income_name__Специалист технической поддержки,area_name__Казань,area_name__Краснодар,area_name__Малый город,area_name__Москва,area_name__Нижний Новгород,area_name__Новосибирск,area_name__Санкт-Петербург
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57528,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
57529,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
57530,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
57531,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


25787.62278948415
[32m----------------all_data----------------[0m


Unnamed: 0,income__name,area__name,schedule__name,grade,skills,unconverted_skills
2,1С программист,Санкт-Петербург,Полный день,Middle (3-6),"[1с программирование, 1с: предприятие]",[]
5,1С программист,Нижний Новгород,Удаленная работа,Middle (3-6),[],[]
9,1С программист,Санкт-Петербург,Полный день,Middle (3-6),"[1с: предприятие, аналитическое мышление, 1с п...",[]
11,1С программист,Москва,Полный день,Middle (3-6),"[1с программирование, 1с: предприятие, 1с: пре...",[]
12,1С программист,Санкт-Петербург,Полный день,Junior (1-3),"[1с: комплексная автоматизация, 1с программиро...",[СКД]
...,...,...,...,...,...,...
187587,1С архитектор,Москва,Полный день,Middle (3-6),[],[]
187588,1С архитектор,Москва,Удаленная работа,Middle (3-6),[],[]
187606,1С архитектор,Москва,Удаленная работа,Middle (3-6),[],[]
187607,1С архитектор,Москва,Удаленная работа,Senior (>6),[],[]


Unnamed: 0,schedule_name__Гибкий график,schedule_name__Полный день,schedule_name__Сменный график,schedule_name__Удаленная работа,grade_Junior (1-3),grade_Middle (3-6),grade_Senior (>6),income_name__1C оператор,income_name__1С администратор,income_name__1С аналитик,...,финансовый контроль,функциональное тестирование,художественный вкус,ценообразование,чувство вкуса,чувство стиля,эконометрика,экономика,экономический анализ,эксплуатационные документы
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49173,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49174,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49175,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49176,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Write every entry of results_predict to disk and zip the export folder
# (default folder name 'output').
export_results_for_prediction(results_predict)


>>> Folder was created: [32m/home/kitsu/ML_HR/Notebooks/output/IT[0m
>>> Data frames were exported
>>> Model was exported
>>> Preprocessing params were exported
>>> Pipeline was exported

>>> Folder was created: [32m/home/kitsu/ML_HR/Notebooks/output/1C[0m
>>> Data frames were exported
>>> Model was exported
>>> Preprocessing params were exported
>>> Pipeline was exported

>>> Folder was created: [32m/home/kitsu/ML_HR/Notebooks/output/other[0m
>>> Data frames were exported
>>> Model was exported
>>> Preprocessing params were exported
>>> Pipeline was exported

>>> Folder was created: [32m/home/kitsu/ML_HR/Notebooks/output/all_data[0m
>>> All data is processing. Skip saving DataFrames
>>> Model was exported
>>> Preprocessing params were exported
>>> Pipeline was exported

>>> Arhive was created: [32m/home/kitsu/ML_HR/Notebooks/output.zip[0m
