# Установка библиотек

In [20]:
!pip install gradio
!pip install pymorphy2 nltk scikit-learn
!pip install -U scikit-learn
!pip install -U cloudpickle



In [21]:
import gradio as gr
import pandas as pd
import numpy as np
import os
import json
import gdown
import cloudpickle
import pickle

from typing import List, Optional, Tuple, Dict
from dataclasses import dataclass, fields

import pymorphy2
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

nltk.download('punkt')
nltk.download('stopwords')

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin

import joblib

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Переменные окружение и считывание датасета

In [22]:
ROOT_DIR: str = os.path.dirname(os.path.abspath(os.curdir))
DATASET_PATH: str = f"{ROOT_DIR}/source_data/cleared_hh.xlsx"
SKILLS_JSON_PATH: str = f"{ROOT_DIR}/source_data/skills_dictionary.json"
GOOGLE_COLAB_PATH: str = '/content/cleared_hh.xlsx'


DATASET_URL = 'https://docs.google.com/spreadsheets/d/130NPywKIhzOxcV-5uaaaX6FZOSJEuKFD/edit?usp=sharing&ouid=107174765330788202514&rtpof=true&sd=true'
SKILL_DICT_URL = 'https://drive.google.com/file/d/16AByN3AxPvlg3JZKV6T3njo7XJITnr8s/view?usp=sharing'


def get_original_dataframe() -> pd.DataFrame:
    try:
        df = pd.read_excel(DATASET_PATH)
    except FileNotFoundError:
        try:
            print(f"Can't open file from path: {DATASET_PATH}", end='\n\n')
            df = pd.read_excel(GOOGLE_COLAB_PATH)
        except FileNotFoundError:
            print(f"Can't open file from path: {GOOGLE_COLAB_PATH}", end='\n\n')
            file_name = gdown.download(DATASET_URL, fuzzy=True)
            os.makedirs(os.path.dirname(DATASET_PATH), exist_ok=True)
            os.rename(os.path.abspath(file_name), DATASET_PATH)
            print(f'Moved to: {DATASET_PATH}', end='\n\n')
            df = pd.read_excel(DATASET_PATH)
    finally:
        print('Success!')
        return df

def get_skill_dict() -> dict[str: list[int]]:
    try:
        with open(file=SKILLS_JSON_PATH, mode='r') as input_json_file:
            data = json.load(input_json_file)
    except FileNotFoundError:
        print(f"Can't open file from path: {SKILLS_JSON_PATH}", end='\n\n')
        file_name = gdown.download(SKILL_DICT_URL, fuzzy=True)
        os.makedirs(os.path.dirname(SKILLS_JSON_PATH), exist_ok=True)
        os.rename(os.path.abspath(file_name), SKILLS_JSON_PATH)
        print(f'Moved to: {SKILLS_JSON_PATH}', end='\n\n')

        with open(file=SKILLS_JSON_PATH, mode='r') as input_json_file:
            data = json.load(input_json_file)
    return data

In [38]:
skill_dict = get_skill_dict()

In [24]:
df_original = get_original_dataframe()
df_original

Success!


Unnamed: 0.1,Unnamed: 0,income_name,area__name,schedule__name,grade,salary_from_gross,salary_to_gross,salary_average,skills,unconverted_skills
0,2,1С программист,Санкт-Петербург,Полный день,Middle (3-6),203000.0,238823.529412,220911.764706,"['1с программирование', '1с: предприятие']",[]
1,5,1С программист,Нижний Новгород,Удаленная работа,Middle (3-6),169500.0,282500.000000,226000.000000,[],[]
2,9,1С программист,Санкт-Петербург,Полный день,Middle (3-6),169500.0,199411.764706,184455.882353,"['1с: предприятие', 'аналитическое мышление', ...",[]
3,11,1С программист,Москва,Полный день,Middle (3-6),169500.0,199411.764706,184455.882353,"['1с программирование', '1с: предприятие', '1с...",[]
4,12,1С программист,Санкт-Петербург,Полный день,Junior (1-3),135600.0,159529.411765,147564.705882,"['1с: комплексная автоматизация', '1с программ...",['СКД']
...,...,...,...,...,...,...,...,...,...,...
100448,188787,Frontend,Санкт-Петербург,Полный день,Junior (1-3),56500.0,113000.000000,84750.000000,[],[]
100449,188792,Frontend,Москва,Удаленная работа,Middle (3-6),226000.0,265882.352941,245941.176471,[],[]
100450,188806,Frontend,Москва,Удаленная работа,Senior (>6),297500.0,350000.000000,323750.000000,[],[]
100451,188819,Frontend,Екатеринбург,Удаленная работа,Middle (3-6),192100.0,226000.000000,209050.000000,[],[]


# Формирование списков

In [25]:
vacancy_list = df_original['income_name'].unique().tolist()
grade_list = df_original['grade'].unique().tolist()
area_list = df_original['area__name'].unique().tolist()
schedule_list = df_original['schedule__name'].unique().tolist()
skill_list = skill_dict.keys()

# Загрузка модели и пайплайна

In [26]:
MODEL_IT_URL = 'https://drive.google.com/file/d/1k2k062r2HoLNcpGPlVD16TMyX02wug-2/view?usp=sharing'
MODEL_1C_URL = 'https://drive.google.com/file/d/1cRUmD9c9H1Ne6Sk7iSnaa4IakeHT7O7G/view?usp=sharing'
MODEL_OTHER_URL = 'https://drive.google.com/file/d/18a4P6lFLgU7L01c49GOmQwB6Uiv6PtTO/view?usp=sharing'
MODEL_DL_GPU_WEIGHTS_URL = 'https://drive.google.com/file/d/1IDkBZcxEnLl1CCgFSHfx1wPvNpCG7C8l/view?usp=sharing'
MODEL_DL_CPU_WEIGHTS_URL = 'https://drive.google.com/file/d/1EBjl7BNygziZd7ufxG1HBRGbJSgTLTH6/view?usp=sharing'

PIPELINE_IT_URL = 'https://drive.google.com/file/d/1ZDWr5IcUgp5b4DjCJdo592BqaHeL81iL/view?usp=sharing'
PIPELINE_1C_URL = 'https://drive.google.com/file/d/1eMa8X4fgSHgT8AjYwq3Z2NkSEJW1MFPO/view?usp=sharing'
PIPELINE_OTHER_URL = 'https://drive.google.com/file/d/1mH8UKlnG3707UJdSx4xPTOtPsmk0ZK0Z/view?usp=sharing'
PIPELINE_DL_URL = 'https://drive.google.com/file/d/19dh3Wt_-HdNjXb991ODN4X1GhLHLDceM/view?usp=sharing'

In [50]:
gdown.download(MODEL_IT_URL, fuzzy=True)
gdown.download(MODEL_1C_URL, fuzzy=True)
gdown.download(MODEL_OTHER_URL, fuzzy=True)
gdown.download(MODEL_DL_GPU_WEIGHTS_URL, fuzzy=True)
gdown.download(MODEL_DL_CPU_WEIGHTS_URL, fuzzy=True)

gdown.download(PIPELINE_IT_URL, fuzzy=True)
gdown.download(PIPELINE_1C_URL, fuzzy=True)
gdown.download(PIPELINE_OTHER_URL, fuzzy=True)
gdown.download(PIPELINE_DL_URL, fuzzy=True)

Downloading...
From: https://drive.google.com/uc?id=1k2k062r2HoLNcpGPlVD16TMyX02wug-2
To: /content/model_it.pkl
100%|██████████| 81.9M/81.9M [00:01<00:00, 72.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1cRUmD9c9H1Ne6Sk7iSnaa4IakeHT7O7G
To: /content/model_1c.pkl
100%|██████████| 36.8M/36.8M [00:00<00:00, 59.9MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=18a4P6lFLgU7L01c49GOmQwB6Uiv6PtTO
From (redirected): https://drive.google.com/uc?id=18a4P6lFLgU7L01c49GOmQwB6Uiv6PtTO&confirm=t&uuid=8451e355-1f6b-4b76-b662-dfcbd6a7c39e
To: /content/model_other.pkl
100%|██████████| 142M/142M [00:00<00:00, 167MB/s]
Downloading...
From: https://drive.google.com/uc?id=1IDkBZcxEnLl1CCgFSHfx1wPvNpCG7C8l
To: /content/model_dl_gpu_weights.pth
100%|██████████| 4.72M/4.72M [00:00<00:00, 141MB/s]
Downloading...
From: https://drive.google.com/uc?id=1EBjl7BNygziZd7ufxG1HBRGbJSgTLTH6
To: /content/model_dl_cpu_weights.pth
100%|██████████| 4.72M/4.72M [00:00<00:00, 156MB/s]
Downlo

'pipeline_dl.pkl'

# Sparse Regression model

In [51]:
import torch
from torch import nn

class SparseRegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(SparseRegressionModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 256)
        self.fc4 = nn.Linear(256, 128)
        self.fc5 = nn.Linear(128, 64)
        self.fc6 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.5)
        self.leaky_relu = nn.LeakyReLU(0.1)

    def forward(self, x):
        x = self.leaky_relu(self.fc1(x.to_dense()))
        x = self.dropout(x)
        x = self.leaky_relu(self.fc2(x))
        x = self.leaky_relu(self.fc3(x))
        x = self.dropout(x)
        x = self.leaky_relu(self.fc4(x))
        x = self.leaky_relu(self.fc5(x))
        x = self.fc6(x)
        return x

In [52]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_weights_path = 'model_dl_gpu_weights.pth' if torch.cuda.is_available() else 'model_dl_cpu_weights.pth'
model_dl = SparseRegressionModel(470).to(device)
model_dl.load_state_dict(torch.load(model_weights_path))

<All keys matched successfully>

In [53]:
model_it = joblib.load('model_it.pkl')
model_1c = joblib.load('model_1c.pkl')
model_other = joblib.load('model_other.pkl')

with open('pipeline_it.pkl', 'rb') as f:
    pipeline_it = pickle.load(f)
with open('pipeline_1c.pkl', 'rb') as f:
    pipeline_1c = pickle.load(f)
with open('pipeline_other.pkl', 'rb') as f:
    pipeline_other = pickle.load(f)
with open('pipeline_dl.pkl', 'rb') as f:
    pipeline_dl = pickle.load(f)

# Взаимодействие с приложением Gradio

In [48]:
income_names_IT = ['Frontend', 'Backend', 'DevOps', 'Веб дизайнер', 'QA инженер', 'Mobile',
                   'Project manager', 'Product manager', 'Технический писатель', 'Data Analyst',
                   'Data Engineer', 'Data Scientist', 'Аналитик', 'Бизнес аналитик',
                   'Системный аналитик', 'Руководитель проектов', 'IT Project manager', 'IT Product manager']

income_names_1C = ['1C оператор', '1С эксперт', '1С архитектор', 'Руководитель проектов 1С',
                   '1С методист', '1С администратор', '1C консультант', '1С программист', '1С аналитик']

income_names_other = ['Менеджер по продажам', 'Менеджер по работе с клиентами',
                      'Специалист технической поддержки']

model_name = ['RandomForest_IT', 'RandomForest_1C', "RandomForest_Other", "SparseRegression"]

In [69]:
def calculate_wages(vacancy: str, grade: str, area: str, schedule: str, using_dl: bool, skills):
    if not vacancy or not grade or not area or not schedule:
        return np.nan, np.nan, np.nan, ""

    if skills is None or skills == []:
        skills = [""]

    data = pd.DataFrame({
        "income_name": [vacancy],
        "area__name": [area],
        "schedule__name": [schedule],
        "grade": [grade],
        "salary_from_gross": [0],
        "salary_to_gross": [0],
        "salary_average": [0],
        "skills": [skills],
        "unconverted_skills": [[""]],
    })

    using_model = ""
    predicted_salary = np.nan
    if using_dl:
        using_model = model_name[3]
        data = pipeline_dl.transform(data).drop(['salary_from_gross', 'salary_to_gross', 'salary_average'], axis=1)

        X_test = torch.tensor(data.values, dtype=torch.float32).to(device)
        model_dl.eval()
        with torch.no_grad():
            predicted_salary = model_dl(X_test).cpu().item()
    else:
        if vacancy in income_names_IT:
            using_model = model_name[0]
            data = pipeline_it.transform(data).drop(['salary_from_gross', 'salary_to_gross', 'salary_average'], axis=1)
            predicted_salary = model_it.predict(data)[0]
        elif vacancy in income_names_1C:
            using_model = model_name[1]
            data = pipeline_1c.transform(data).drop(['salary_from_gross', 'salary_to_gross', 'salary_average'], axis=1)
            predicted_salary = model_1c.predict(data)[0]
        elif vacancy in income_names_other:
            using_model = model_name[2]
            data = pipeline_other.transform(data).drop(['salary_from_gross', 'salary_to_gross', 'salary_average'], axis=1)
            predicted_salary = model_other.predict(data)[0]

    predicted_salary = int(predicted_salary)
    predicted_low = int(np.round(predicted_salary * 8 / 9, -2))
    predicted_high = int(np.round(predicted_salary * 10 / 9, -2))

    return predicted_low, predicted_high, predicted_salary, using_model


calculate_wages('Менеджер по продажам', 'Middle (3-6)', 'Нижний Новгород', 'Полный день', False, [])

(202600, 253300, 227930, 'RandomForest_Other')

In [70]:
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            vacancy_dropdown = gr.Dropdown(vacancy_list, label='Вакансия')
            grade_dropdown = gr.Dropdown(grade_list, label='Стаж')
            city_dropdown = gr.Dropdown(area_list, label='Город')
            schedule_dropdown = gr.Dropdown(schedule_list, label='Тип занятости')
            skills_dropdown = gr.Dropdown(skill_list,
                                          label='Список навыков',
                                          multiselect=True)
            use_dl_model = gr.Checkbox(label='Использовать Deep Learning')

            inputs = [
                vacancy_dropdown,
                grade_dropdown,
                city_dropdown,
                schedule_dropdown,
                use_dl_model,
                skills_dropdown,
            ]

        with gr.Column():
            with gr.Row():
                with gr.Column():
                    pred_from = gr.Number(label='Зарплата от')
                    pred_to = gr.Number(label='Зарплата до')
                    pred_aver = gr.Number(label='Средняя зарплата')
                    pred_outputs = [pred_from, pred_to, pred_aver]

            with gr.Row():
                model_textbox = gr.Textbox(label='Используемая модель')

        @gr.on(inputs=inputs, outputs=[*pred_outputs, model_textbox])
        def update(vacancy, grade, city, schedule, using_dl, skills):
            return calculate_wages(vacancy, grade, city, schedule, using_dl, skills)

demo.launch(share=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://d64d1b5fd39ecb724f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7867 <> https://d64d1b5fd39ecb724f.gradio.live


