In [1]:
import pandas as pd
import numpy as np
import os
import requests
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
URL_BASE = 'https://media.githubusercontent.com/media/Job-market-team-AI2024/job_market_project/refs/heads/main/data/'
def read_database(file_name):
    return pd.read_csv(os.path.join(URL_BASE, file_name))

In [3]:
df = read_database('IT_vacancies.csv')

### Добавление региона и страны вакансии

In [4]:
def area_transform(entry):
    areas_dict = {}
    areas_dict[entry['id']] = {'name': entry['name'], 'parent_id': entry['parent_id']}
    for area in entry['areas']:
        areas_dict.update(area_transform(area))
    return areas_dict

def area_region(area_id, areas_dict):
    if areas_dict[area_id]['parent_id'] is None or areas_dict[areas_dict[area_id]['parent_id']]['parent_id'] is None:
        return areas_dict[area_id]['name']
    else:
        return areas_dict[areas_dict[area_id]['parent_id']]['name']

def area_country(area_id, areas_dict):
    while areas_dict[area_id]['parent_id'] is not None:
        area_id = areas_dict[area_id]['parent_id']
    return areas_dict[area_id]['name']

In [5]:
areas = requests.get('https://api.hh.ru/areas').json()

areas_dict = {}

for area in areas:
    areas_dict.update(area_transform(area))

In [6]:
df['region_name'] = df['area_id'].apply(lambda x: area_region(str(x), areas_dict))
df['country_name'] = df['area_id'].apply(lambda x: area_country(str(x), areas_dict))

### Добавление категорей, ролей и грейдов

In [7]:
categories = {
    'field': [
        ("product", ['product', 'продуктовый', 'продакт', 'продукта']),
        ("project", ['project', 'проектов', 'проектный', 'проекта']),
        ("data", ['data', 'дата', 'данных']),
        ("bi", ['bi', 'би', 'визуализация']),
        ("business", ['business', 'бизнес']),
        ("system", ['system', 'системный']),
        ("technical", ['qa', 'по', 'программного обеспечения', '1C', '1С', 'технический', 'technical', 'информационной безопасности']),
        ("support", ['поддержк', 'support']),
        ("design", ['graphic', 'web', 'графический', 'веб'])
    ],
    'role': [
        ("developer", ['developer', 'разработчик', 'программист', 'архитектор', 'architect', 'devops', 'mlops', 'разработка', 'разработку', 'программирование']),
        ("scientist", ['scientist', 'science', 'саенс']),
        ("analyst", ['analyst', 'analysis', 'analytics', 'аналитик']),
        ("consultant", ['consultant', 'консультант', 'технолог']),
        ("manager", ['manager', 'lead', 'owner', 'менеджер', 'лид', 'руководитель', 'руководителя', 'оунэр', 'оунер', 'coordinator', 'координатор', 'директор', 'director', 'владелец', 'начальник', 'chief']),
        ("tester", ['тестировщик', 'qa', 'автоматизатор тестирования', 'tester']),
        ("engineer", ['engineer', 'инженер']),
        ("specialist", ['specialist', 'operator', 'support', 'специалист', 'оператор', 'писатель', 'мастер', 'эксперт', 'поддержки', 'поддержка']),
        ("designer", ['design', 'designer', 'дизайн', 'дизайнер', 'artist', 'художник']),
        ("admin", ['администратор'])
    ],
    'grade': [
        ("intern", ['intern', 'стажер']),
        ("junior", ['junior', 'младший']),
        ("middle", ['middle', 'ведущий']),
        ("senior", ['senior', 'старший']),
        ("lead", ['lead', 'руководитель', 'начальник'])
    ]
}

# Функция для поиска категорий
def find_categories(name, categories):
    return [category for category, elements in categories if any(el.lower() in name.lower() for el in elements)]

# Применение функции к DataFrame
for category_type, category_list in categories.items():
    df[f'{category_type}s'] = df['name'].apply(lambda x: find_categories(x, category_list))
    df[category_type] = df[f'{category_type}s'].apply(lambda x: ' '.join(x))

### Зарплата

In [8]:
df['salary'] = df[['salary_from', 'salary_to']].mean(axis=1)

### Ограничения на страну + удаление выбросов в зарплатах

In [9]:
df = df[df['country_name'] == 'Россия']

df = df[~df['salary'].isnull()]
df = df[df['salary_currency'] == 'RUR']

df = df[df['salary'] > np.quantile(df['salary'],0.005)]
df = df[df['salary'] < np.quantile(df['salary'],0.995)]

### Разделим данные на целевую переменную и признаки

In [10]:
X = df.drop(columns=['salary_from', 'salary_to', 'salary'])
y = df['salary']

### Удаление пропусков

In [11]:
X['key_skills'] = X['key_skills'].fillna('')
X['accredited_it_employer'] = X['accredited_it_employer'].fillna(False)

  X['accredited_it_employer'] = X['accredited_it_employer'].fillna(False)


### Разделение на трейн и тест

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Выделение групп признаков

In [14]:
cat_features = ['area_name', 'region_name', 'experience', 'employer_name', 'schedule', 'employment', 'field', 'role', 'grade',
                'professional_roles_name', 'accredited_it_employer', 'has_test', 'billing_type', 'allow_messages',
                'accept_temporary', 'response_letter_required', 'accept_incomplete_resumes'
               ]

num_features = ['count_key_skills']

### Метрики качества

In [15]:
def MAPE(y_true, y_pred):
  """
  Compute Mean Absolute Percentage Error (MAPE)

  Input:
  y_true (array-like): Actual target values
  y_pred (array-like): Predicted target values

  Returns:
  float: MAPE as a percentage
  """
  y_true = np.array(y_true)
  y_pred = np.array(y_pred)

  ### избегаем нули, чтобы корректно отработало
  non_zero_mask = y_true != 0
  percentage_errors = np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])
  return np.mean(percentage_errors) * 100

In [16]:
def share_within_indifference_interval(y_true, y_pred, percentage):
  """
  Share of predicted targets within percentage interval of true targets

  Input:
  y_true (array-like): Actual target values
  y_pred (array-like): Predicted target values

  Returns:
  float: Share as percentage
  """
  x = (y_pred - y_true) / y_true
  return ((x <= percentage / 100) & (x >= -percentage / 100)).mean() * 100

### Преобразование признаков
- Категориальные: Без изменений (Catboost), OneHotEncoder для остальных моделей
- Числовые (количество указанных навыков в вакансии): Стандартизация
- Текстовые (название вакансии и ключевые навыки): CountVectorizer, топ-20 самых популярных слов

In [24]:
column_transform_ctb = ColumnTransformer(
    transformers=[
        ('cat', 'passthrough', cat_features),
        ('num', StandardScaler(), num_features),
        ('name', CountVectorizer(max_features=20), 'name'),
        ('key_skills', CountVectorizer(max_features=20), 'key_skills')
    ],
    remainder='drop'
)

column_transform = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_features),
        ('num', StandardScaler(), num_features),
        ('name', CountVectorizer(max_features=20), 'name'),
        ('key_skills', CountVectorizer(max_features=20), 'key_skills')
    ], remainder='drop'
)

### CatBoost

In [None]:
X_train_ctb = column_transform_ctb.fit_transform(X_train)
X_test_ctb = column_transform_ctb.transform(X_test)

X_train_ctb = pd.DataFrame(X_train, columns=cat_features + list(range(X_train.shape[1] - len(cat_features))))
X_test_ctb = pd.DataFrame(X_test, columns=cat_features + list(range(X_test.shape[1] - len(cat_features))))

In [None]:
ctb = CatBoostRegressor(cat_features=cat_features, loss_function='MAPE')
ctb.fit(X_train_ctb, np.log(y_train))

0:	learn: 0.0450643	total: 114ms	remaining: 1m 54s
1:	learn: 0.0442450	total: 170ms	remaining: 1m 24s
2:	learn: 0.0434430	total: 219ms	remaining: 1m 12s
3:	learn: 0.0426760	total: 268ms	remaining: 1m 6s
4:	learn: 0.0419747	total: 296ms	remaining: 58.9s
5:	learn: 0.0412917	total: 336ms	remaining: 55.7s
6:	learn: 0.0406353	total: 373ms	remaining: 53s
7:	learn: 0.0400061	total: 423ms	remaining: 52.4s
8:	learn: 0.0393940	total: 469ms	remaining: 51.6s
9:	learn: 0.0388194	total: 505ms	remaining: 50s
10:	learn: 0.0382511	total: 562ms	remaining: 50.5s
11:	learn: 0.0377191	total: 606ms	remaining: 49.9s
12:	learn: 0.0372195	total: 641ms	remaining: 48.7s
13:	learn: 0.0367303	total: 683ms	remaining: 48.1s
14:	learn: 0.0362465	total: 731ms	remaining: 48s
15:	learn: 0.0357867	total: 759ms	remaining: 46.6s
16:	learn: 0.0353424	total: 802ms	remaining: 46.4s
17:	learn: 0.0349117	total: 838ms	remaining: 45.7s
18:	learn: 0.0345019	total: 883ms	remaining: 45.6s
19:	learn: 0.0341167	total: 930ms	remaining:

<catboost.core.CatBoostRegressor at 0x7e2cd1242910>

In [None]:
y_pred = ctb.predict(X_test_ctb)
mape = MAPE(y_test, np.exp(y_pred))
share_10 = share_within_indifference_interval(y_test, np.exp(y_pred), 10)

print('CatBoost:')
print(f'MAPE: {mape}')
print(f'Share 10%: {share_10}')

CatBoost:
MAPE: 24.685501186183977
Share 10%: 31.620724491268582


### XGBoost

In [None]:
pipeline_xgb = Pipeline(steps=[
    ('preprocessor', column_transform),
    ('xgb', XGBRegressor())
])

In [None]:
pipeline_xgb.fit(X_train, np.log(y_train))

In [None]:
y_pred = pipeline_xgb.predict(X_test)
mape = MAPE(y_test, np.exp(y_pred))
share_10 = share_within_indifference_interval(y_test, np.exp(y_pred), 10)

print('XGBoost:')
print(f'MAPE: {mape}')
print(f'Share 10%: {share_10}')

XGBoost:
MAPE: 26.26382450013334
Share 10%: 28.258045894068406




### Random Forest

In [20]:
from sklearn.ensemble import RandomForestRegressor

pipeline_rfr = Pipeline(steps=[
    ('preprocessor', column_transform),
    ('rfr', RandomForestRegressor())
])

In [21]:
pipeline_rfr.fit(X_train, np.log(y_train))

In [22]:
y_pred = pipeline_rfr.predict(X_test)
mape = MAPE(y_test, np.exp(y_pred))
share_10 = share_within_indifference_interval(y_test, np.exp(y_pred), 10)

print('Random Forest:')
print(f'MAPE: {mape}')
print(f'Share 10%: {share_10}')



Random Forest:
MAPE: 24.45430669427721
Share 10%: 34.564872275941696


### SVM

In [25]:
from sklearn.svm import LinearSVR

pipeline_svm = Pipeline(steps=[
    ('preprocessor', column_transform),
    ('rfr', LinearSVR())
])

In [26]:
pipeline_svm.fit(X_train, np.log(y_train))



In [27]:
y_pred = pipeline_svm.predict(X_test)
mape = MAPE(y_test, np.exp(y_pred))
share_10 = share_within_indifference_interval(y_test, np.exp(y_pred), 10)

print('SVM:')
print(f'MAPE: {mape}')
print(f'Share 10%: {share_10}')



SVM:
MAPE: 25.50792752227331
Share 10%: 32.08255159474672


### KNN

In [29]:
from sklearn.neighbors import KNeighborsRegressor

pipeline_knn = Pipeline(steps=[
    ('preprocessor', column_transform),
    ('rfr', KNeighborsRegressor())
])

In [30]:
pipeline_knn.fit(X_train, np.log(y_train))

In [31]:
y_pred = pipeline_knn.predict(X_test)
mape = MAPE(y_test, np.exp(y_pred))
share_10 = share_within_indifference_interval(y_test, np.exp(y_pred), 10)

print('KNN:')
print(f'MAPE: {mape}')
print(f'Share 10%: {share_10}')



KNN:
MAPE: 31.95118912238638
Share 10%: 28.488959445807478


### Выводы
- В ходе экспериментов были протестированы модели градиентного бустинга (Catboost и XGBoost), случайного леса, SVM и KNN с дефолтными гиперпараметрами
- Лучший результат по обеим метрикам на тестовой выборке показала модель случайного леса (MAPE - 24.45 %, Доля вакансий с ошибкой в пределах 10 % - 34.56 %)