# Установка библиотек

In [None]:
!pip install gradio
! pip install pymorphy2 nltk scikit-learn
! pip install -U scikit-learn



In [None]:
import gradio as gr
import pandas as pd
import numpy as np
import os
import json
import gdown

from typing import List, Optional, Tuple, Dict
from dataclasses import dataclass, fields

import pymorphy2
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

nltk.download('punkt')
nltk.download('stopwords')

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin

import joblib
# from transformers import RenameDataTransformer, ProcessSkillsTransformer, VectorizePCASkillsTransformer, ExtractFeaturesTransformer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Переменные окружения и считывание датасета

In [None]:
# Parent directory of the current working directory; the local data files
# below are resolved relative to it.
ROOT_DIR: str = os.path.dirname(os.path.abspath(os.curdir))
# Expected local location of the vacancies dataset (Excel workbook).
DATASET_PATH: str = f"{ROOT_DIR}/source_data/cleared_hh.xlsx"
# Expected local location of the skills dictionary (JSON).
SKILLS_JSON_PATH: str = f"{ROOT_DIR}/source_data/skills_dictionary.json"
# Fallback location of the dataset inside a Google Colab runtime.
GOOGLE_COLAB_PATH: str = '/content/cleared_hh.xlsx'


# Remote sources passed to gdown when the files are not available on disk.
DATASET_URL: str = 'https://docs.google.com/spreadsheets/d/130NPywKIhzOxcV-5uaaaX6FZOSJEuKFD/edit?usp=sharing&ouid=107174765330788202514&rtpof=true&sd=true'
SKILL_DICT_URL: str = 'https://drive.google.com/file/d/16AByN3AxPvlg3JZKV6T3njo7XJITnr8s/view?usp=sharing'


def get_original_dataframe() -> pd.DataFrame:
    """Load the vacancies dataset, downloading it when no local copy exists.

    Lookup order:
      1. ``DATASET_PATH`` (local project layout);
      2. ``GOOGLE_COLAB_PATH`` (file placed in the Colab runtime);
      3. download from ``DATASET_URL`` with gdown, move the file to
         ``DATASET_PATH`` and read it from there.

    Returns:
        The dataset as read by ``pd.read_excel``.

    Raises:
        RuntimeError: if gdown reports a failed download by returning ``None``.
        Exception: any error other than a missing file (for example a corrupt
            workbook) propagates to the caller unchanged.
    """
    # Try the known local locations first, in priority order.
    for candidate_path in (DATASET_PATH, GOOGLE_COLAB_PATH):
        try:
            df = pd.read_excel(candidate_path)
        except FileNotFoundError:
            print(f"Can't open file from path: {candidate_path}", end='\n\n')
        else:
            print('Success!')
            return df

    # No local copy: fetch the file and move it to the canonical location.
    file_name = gdown.download(DATASET_URL, fuzzy=True)
    if file_name is None:
        raise RuntimeError(f"Failed to download dataset from: {DATASET_URL}")
    os.makedirs(os.path.dirname(DATASET_PATH), exist_ok=True)
    os.rename(os.path.abspath(file_name), DATASET_PATH)
    print(f'Moved to: {DATASET_PATH}', end='\n\n')

    # 'Success!' is printed only after the data has actually been read.
    # The earlier `finally: return df` reported success unconditionally,
    # suppressed any in-flight exception and failed with UnboundLocalError
    # when nothing had been loaded.
    df = pd.read_excel(DATASET_PATH)
    print('Success!')
    return df

def get_skill_dict() -> dict[str, list[int]]:
    """Load the skills dictionary from JSON, downloading it when missing.

    The file is read from ``SKILLS_JSON_PATH``. If it does not exist there it
    is downloaded from ``SKILL_DICT_URL`` with gdown, moved to
    ``SKILLS_JSON_PATH`` and then read.

    Returns:
        The parsed JSON content. It is annotated as a mapping from string keys
        to lists of integers; the content itself is not validated here.

    Raises:
        RuntimeError: if gdown reports a failed download by returning ``None``.
        json.JSONDecodeError: if the file is not valid JSON.
    """

    def _read_skills_file():
        # Explicit UTF-8: JSON is UTF-8 by specification, while the platform
        # default encoding may differ (for example on Windows).
        with open(file=SKILLS_JSON_PATH, mode='r', encoding='utf-8') as input_json_file:
            return json.load(input_json_file)

    try:
        return _read_skills_file()
    except FileNotFoundError:
        print(f"Can't open file from path: {SKILLS_JSON_PATH}", end='\n\n')

    # No local copy: fetch the file and move it to the canonical location.
    file_name = gdown.download(SKILL_DICT_URL, fuzzy=True)
    if file_name is None:
        raise RuntimeError(f"Failed to download skills dictionary from: {SKILL_DICT_URL}")
    os.makedirs(os.path.dirname(SKILLS_JSON_PATH), exist_ok=True)
    os.rename(os.path.abspath(file_name), SKILLS_JSON_PATH)
    print(f'Moved to: {SKILLS_JSON_PATH}', end='\n\n')

    return _read_skills_file()

In [None]:
# Load the skills dictionary (fetched with gdown if it is not on disk yet).
skill_dict = get_skill_dict()

In [None]:
# Load the source dataset; the dropdown choices below are derived from it.
df_original = get_original_dataframe()

Success!


# Формирование списков

In [None]:
# Distinct values of each categorical column, in order of first appearance;
# they are used as the choices of the UI dropdowns.
vacancy_list, grade_list, area_list, schedule_list = (
    df_original[column_name].unique().tolist()
    for column_name in ('income_name', 'grade', 'area__name', 'schedule__name')
)

# Экспорт пайплайна (подлежит удалению)

In [None]:
# predict_cols = ['income_name', 'area__name', 'schedule__name', 'grade']

# def get_prediction_pipeline(apply_skills: bool = False,
#                             drop_param: Optional[str] = None,
#                             freq_cutoff: int = 100,
#                             n_components: int = 100,
#                             PCA_enable: bool = True) -> Pipeline:
#     pipe: Pipeline = Pipeline(steps=[
#         ('rename_cols', RenameDataTransformer()),
#         ('one-hot', ExtractFeaturesTransformer(drop_param=drop_param))
#     ], verbose=True)
#     if apply_skills:
#         pipe: Pipeline = Pipeline(steps=[
#             ('basic_pipe', pipe),
#             ('process_scills', ProcessSkillsTransformer(freq_cutoff=freq_cutoff)),
#             ('vectorize_skills', VectorizePCASkillsTransformer(n_components=n_components,
#                                                                PCA_enable=PCA_enable))
#         ], verbose=True)

#     display(pipe)

#     return pipe

In [None]:
# preprocess_prediction_pipeline = get_prediction_pipeline(drop_param='first')

# df_for_fit = df_original[predict_cols]
# preprocess_prediction_pipeline = preprocess_prediction_pipeline.fit(df_for_fit)


# df_test = preprocess_prediction_pipeline.transform(df_for_fit)

[Pipeline] ....... (step 1 of 2) Processing rename_cols, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing one-hot, total=   0.1s


In [None]:
# joblib_file = 'pipeline_preprocess.pkl'
# joblib.dump(preprocess_prediction_pipeline, joblib_file)

['pipeline_preprocess.pkl']

# Загрузка модели и пайплайна

In [None]:
MODEL_URL = 'https://drive.google.com/file/d/1k2k062r2HoLNcpGPlVD16TMyX02wug-2/view?usp=sharing'
PIPELINE_URL = 'https://drive.google.com/file/d/1ZDWr5IcUgp5b4DjCJdo592BqaHeL81iL/view?usp=sharing'

# Local file names are pinned explicitly so that loading does not depend on
# how the files happen to be named on Google Drive.
MODEL_PATH = 'model.pkl'
PIPELINE_PATH = 'pipeline_preprocess.pkl'

for artifact_url, artifact_path in ((MODEL_URL, MODEL_PATH), (PIPELINE_URL, PIPELINE_PATH)):
    # gdown returns None when the download did not succeed; fail here with a
    # clear message instead of a confusing error from joblib.load below.
    if gdown.download(artifact_url, output=artifact_path, fuzzy=True) is None:
        raise RuntimeError(f"Failed to download artifact from: {artifact_url}")

# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code; only load artifacts from a source you trust.
# NOTE(review): the pipeline presumably contains custom transformer classes
# (see the commented-out import in the imports cell); unpickling requires
# those classes to be importable - confirm.
model = joblib.load(MODEL_PATH)
loaded_pipeline = joblib.load(PIPELINE_PATH)

Downloading...
From: https://drive.google.com/uc?id=1k2k062r2HoLNcpGPlVD16TMyX02wug-2
To: /content/model.pkl
100%|██████████| 20.4M/20.4M [00:00<00:00, 147MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ZDWr5IcUgp5b4DjCJdo592BqaHeL81iL
To: /content/pipeline_preprocess.pkl
100%|██████████| 7.53k/7.53k [00:00<00:00, 13.5MB/s]


# Взаимодействие с приложением Gradio

In [None]:
def calculate_wages(vacancy: str, grade: str, area: str, schedule: str) -> Tuple[int, int, int]:
    """Predict a salary range for a vacancy profile.

    Args:
        vacancy: Vacancy name (fed to the pipeline as ``income_name``).
        grade: Grade / experience level (``grade``).
        area: City or region name (``area__name``).
        schedule: Work schedule (``schedule__name``).

    Returns:
        A ``(low, high, predicted)`` tuple of integers:
        ``low`` is 8/9 and ``high`` is 10/9 of the model prediction, both
        rounded to the nearest hundred; ``predicted`` is the prediction
        truncated to an integer. ``(0, 0, 0)`` is returned when any argument
        is empty or ``None``.
    """
    # With `live=True` the UI calls this function while some dropdowns are
    # still empty. The placeholder must have three values to match the three
    # output components (it used to return only two).
    if not vacancy or not grade or not area or not schedule:
        return 0, 0, 0

    # The range is the point estimate scaled by these numerators over 9.
    low_numerator, high_numerator, denominator = 8, 10, 9

    data = pd.DataFrame({
        "income_name": [vacancy],
        "area__name": [area],
        "schedule__name": [schedule],
        "grade": [grade]
    })

    data = loaded_pipeline.transform(data)

    predicted_salary = model.predict(data)[0]
    # decimals=-2 rounds to the nearest hundred.
    predicted_low = int(np.round(predicted_salary * low_numerator / denominator, -2))
    predicted_high = int(np.round(predicted_salary * high_numerator / denominator, -2))

    return predicted_low, predicted_high, int(predicted_salary)


# Smoke test: run a single prediction directly, without going through the UI.
calculate_wages('Системный аналитик', 'Middle (3-6)', 'Нижний Новгород', 'Полный день')

(198400, 248000, 223167)

In [None]:
# Build the web UI: four dropdowns feed `calculate_wages`, whose three return
# values are shown in the three numeric output fields (same order).
demo = gr.Interface(
    fn=calculate_wages,
    inputs=[
        gr.Dropdown(vacancy_list, label='Вакансия'),
        gr.Dropdown(grade_list, label='Стаж'),
        gr.Dropdown(area_list, label='Город'),
        gr.Dropdown(schedule_list, label='Тип занятости'),
    ],
    outputs=[
        gr.Number(label='Зарплата от'),
        gr.Number(label='Зарплата до'),
        gr.Number(label='Средняя зарплата'),
    ],
    # A theme must be an instance of a theme class; `gr.themes.glass` is the
    # submodule that defines `Glass`, not a theme object.
    theme=gr.themes.Glass(),
    # Recompute the prediction on every input change (no submit button).
    live=True,
)
# share=True additionally exposes the app through a temporary public URL.
demo.launch(share=True)



Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://46b73ec27a53ca2bd4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


