# DL-модели
### Команда 4. Вакансии с портала HeadHunter.

In [3]:
import pandas as pd
import numpy as np
import os
import requests
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns

### Загрузка данных

In [4]:
URL_BASE = 'https://media.githubusercontent.com/media/Job-market-team-AI2024/job_market_project/refs/heads/main/data/'


def read_database(file_name):
    """Read a CSV file from the project's remote data folder.

    Args:
        file_name: Name of the CSV file, relative to ``URL_BASE``.

    Returns:
        pandas.DataFrame: The parsed CSV contents.
    """
    # URLs always use forward slashes; os.path.join would insert the OS path
    # separator (a backslash on Windows) and produce an invalid URL.
    return pd.read_csv(f"{URL_BASE.rstrip('/')}/{file_name}")

In [5]:
# Load the IT vacancies table from the remote data folder (see URL_BASE).
df = read_database('IT_vacancies.csv')

### Добавление региона и страны вакансии

In [7]:
def area_transform(entry):
    """Flatten a nested area tree into a dict keyed by area id.

    Args:
        entry: A dict with the keys 'id', 'name', 'parent_id' and 'areas',
            where 'areas' is an iterable of child entries of the same shape.

    Returns:
        dict: Maps every area id in the subtree (the entry itself first, then
        its descendants depth-first) to ``{'name': ..., 'parent_id': ...}``.
    """
    flattened = {
        entry['id']: {'name': entry['name'], 'parent_id': entry['parent_id']},
    }
    for child in entry['areas']:
        # Recurse into each child and merge its flattened subtree.
        for child_id, child_info in area_transform(child).items():
            flattened[child_id] = child_info
    return flattened

def area_region(area_id, areas_dict):
    """Return the region-level name for an area.

    Args:
        area_id: Key of the area in ``areas_dict``.
        areas_dict: Mapping of area id to ``{'name': ..., 'parent_id': ...}``.

    Returns:
        The area's own name when it is a root (no parent) or a direct child
        of a root; otherwise the name of its immediate parent.
    """
    own = areas_dict[area_id]
    parent_id = own['parent_id']
    if parent_id is None:
        return own['name']

    parent = areas_dict[parent_id]
    if parent['parent_id'] is None:
        return own['name']

    return parent['name']

def area_country(area_id, areas_dict):
    """Return the name of the root ancestor of an area.

    Args:
        area_id: Key of the area in ``areas_dict``.
        areas_dict: Mapping of area id to ``{'name': ..., 'parent_id': ...}``.

    Returns:
        The name of the entry reached by following 'parent_id' links until
        an entry whose 'parent_id' is None.
    """
    current = areas_dict[area_id]
    while current['parent_id'] is not None:
        current = areas_dict[current['parent_id']]
    return current['name']

In [8]:
# Download the area tree from the HeadHunter API. The timeout prevents the
# cell from hanging forever on a stalled connection, and raise_for_status()
# surfaces HTTP errors instead of trying to parse an error body as the tree.
response = requests.get('https://api.hh.ru/areas', timeout=30)
response.raise_for_status()
areas = response.json()

# Flatten every top-level tree into one lookup: area id -> name / parent_id.
areas_dict = {}

for area in areas:
    areas_dict.update(area_transform(area))

In [9]:
# areas_dict is looked up with str(area_id), so convert the ids once
# and derive both the region and the country column from the same keys.
area_keys = df['area_id'].apply(str)
df['region_name'] = area_keys.apply(area_region, args=(areas_dict,))
df['country_name'] = area_keys.apply(area_country, args=(areas_dict,))

### Добавление категорий, ролей и грейдов

In [10]:
# Keyword dictionaries used to derive the 'field', 'role' and 'grade' features
# from the vacancy name. Each entry is (label, [keywords]); find_categories
# assigns a label when any of its keywords occurs in the lower-cased name as a
# substring, so short keywords (e.g. 'bi', 'по') also match inside longer words.
categories = {
    # Subject area of the vacancy.
    'field': [
        ("product", ['product', 'продуктовый', 'продакт', 'продукта']),
        ("project", ['project', 'проектов', 'проектный', 'проекта']),
        ("data", ['data', 'дата', 'данных']),
        ("bi", ['bi', 'би', 'визуализация']),
        ("business", ['business', 'бизнес']),
        ("system", ['system', 'системный']),
        ("technical", ['qa', 'по', 'программного обеспечения', '1C', '1С', 'технический', 'technical', 'информационной безопасности']),
        ("support", ['поддержк', 'support']),
        ("design", ['graphic', 'web', 'графический', 'веб'])
    ],
    # Job function.
    'role': [
        ("developer", ['developer', 'разработчик', 'программист', 'архитектор', 'architect', 'devops', 'mlops', 'разработка', 'разработку', 'программирование']),
        ("scientist", ['scientist', 'science', 'саенс']),
        ("analyst", ['analyst', 'analysis', 'analytics', 'аналитик']),
        ("consultant", ['consultant', 'консультант', 'технолог']),
        ("manager", ['manager', 'lead', 'owner', 'менеджер', 'лид', 'руководитель', 'руководителя', 'оунэр', 'оунер', 'coordinator', 'координатор', 'директор', 'director', 'владелец', 'начальник', 'chief']),
        ("tester", ['тестировщик', 'qa', 'автоматизатор тестирования', 'tester']),
        ("engineer", ['engineer', 'инженер']),
        ("specialist", ['specialist', 'operator', 'support', 'специалист', 'оператор', 'писатель', 'мастер', 'эксперт', 'поддержки', 'поддержка']),
        ("designer", ['design', 'designer', 'дизайн', 'дизайнер', 'artist', 'художник']),
        ("admin", ['администратор'])
    ],
    # Seniority level.
    'grade': [
        ("intern", ['intern', 'стажер']),
        ("junior", ['junior', 'младший']),
        ("middle", ['middle', 'ведущий']),
        ("senior", ['senior', 'старший']),
        ("lead", ['lead', 'руководитель', 'начальник'])
    ]
}

# Category lookup helper
def find_categories(name, categories):
    """Return the labels whose keywords occur in ``name``.

    Matching is a case-insensitive substring test, so a keyword also matches
    when it appears inside a longer word.

    Args:
        name: Vacancy title. Non-string values (e.g. None or NaN from a
            missing title) yield an empty list instead of raising.
        categories: Iterable of ``(label, keywords)`` pairs.

    Returns:
        list: Matching labels, in the order they appear in ``categories``.
    """
    if not isinstance(name, str):
        return []
    # Lower-case the title once instead of once per keyword.
    lowered = name.lower()
    return [
        category
        for category, elements in categories
        if any(el.lower() in lowered for el in elements)
    ]

# Apply the lookup to the DataFrame: for every category type store the list of
# matched labels in '<type>s' and the same labels joined by spaces in '<type>'.
for category_type, category_list in categories.items():
    plural_column = f'{category_type}s'
    df[plural_column] = df['name'].apply(find_categories, args=(category_list,))
    df[category_type] = df[plural_column].apply(' '.join)

### Зарплата

In [11]:
# Target: row-wise mean of the salary bounds. DataFrame.mean skips NaN by
# default, so a vacancy with only one bound gets that bound as its salary and
# a vacancy with neither bound gets NaN.
df['salary'] = df[['salary_from', 'salary_to']].mean(axis=1)

### Ограничения на страну + удаление выбросов в зарплатах

In [12]:
# Keep only Russian vacancies with a known salary quoted in roubles.
is_russia = df['country_name'] == 'Россия'
has_salary = df['salary'].notnull()
in_roubles = df['salary_currency'] == 'RUR'
df = df[is_russia & has_salary & in_roubles]

# Trim salary outliers. The upper cut-off is computed after the lower tail has
# already been removed, so the two steps must stay sequential.
lower_cutoff = np.quantile(df['salary'], 0.005)
df = df[df['salary'] > lower_cutoff]
upper_cutoff = np.quantile(df['salary'], 0.995)
df = df[df['salary'] < upper_cutoff]

### Разделим данные на целевую переменную и признаки

In [13]:
# Features: everything except the salary columns (the raw bounds would leak
# the target, which is their mean). Target: the averaged salary.
X = df.drop(columns=['salary_from', 'salary_to', 'salary'])
y = df['salary']

### Удаление пропусков

In [14]:
# Missing key skills become an empty string; a missing accreditation flag is
# treated as "not accredited".
X['key_skills'] = X['key_skills'].fillna('')
X['accredited_it_employer'] = X['accredited_it_employer'].fillna(False)

  X['accredited_it_employer'] = X['accredited_it_employer'].fillna(False)


### Разделение на трейн и тест

In [15]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Выделение групп признаков

In [17]:
# Columns treated as categorical features (in the DL cell below they are joined
# into a single text document per vacancy).
cat_features = ['area_name', 'region_name', 'experience', 'employer_name', 'schedule', 'employment', 'field', 'role', 'grade',
                'professional_roles_name', 'accredited_it_employer', 'has_test', 'billing_type', 'allow_messages',
                'accept_temporary', 'response_letter_required', 'accept_incomplete_resumes'
               ]

# Columns treated as numeric features.
num_features = ['count_key_skills']

### Метрики качества

In [16]:
def MAPE(y_true, y_pred):
    """
    Mean Absolute Percentage Error (MAPE), ignoring zero targets.

    Input:
    y_true (array-like): Actual target values
    y_pred (array-like): Predicted target values

    Returns:
    float: MAPE as a percentage, computed over the entries where y_true != 0
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)

    # Drop zero targets up front so the division below is always defined.
    keep = actual != 0
    actual = actual[keep]
    predicted = predicted[keep]

    relative_errors = np.abs((actual - predicted) / actual)
    return np.mean(relative_errors) * 100

In [18]:
def share_within_indifference_interval(y_true, y_pred, percentage):
    """
    Share of predicted targets within percentage interval of true targets

    Input:
    y_true (array-like): Actual target values
    y_pred (array-like): Predicted target values
    percentage (float): Half-width of the tolerated relative error, in percent
        (e.g. 10 means predictions within +/-10% of the true value count)

    Returns:
    float: Share as percentage
    """
    # Convert to arrays like MAPE does: plain lists then work, and pandas
    # objects are compared by position instead of being aligned on index.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # A zero target gives an inf/NaN relative error, which never satisfies the
    # comparison below and therefore counts as "outside the interval".
    with np.errstate(divide='ignore', invalid='ignore'):
        relative_error = (y_pred - y_true) / y_true
    return (np.abs(relative_error) <= percentage / 100).mean() * 100

### DL

In [61]:
import tensorflow as tf

def mape_tf(y_true, y_pred):
    """Mean absolute percentage error as a Keras-compatible metric.

    Args:
        y_true: Tensor of actual target values.
        y_pred: Tensor of predicted values with the same number of elements.

    Returns:
        Scalar tensor: mean of |y_true - y_pred| / max(|y_true|, epsilon) * 100.
    """
    # Flatten both tensors to the same rank. If y_true arrives as (batch,) and
    # y_pred as (batch, 1) - which may happen when the targets are a 1-D array
    # and the model ends in Dense(1) - the subtraction would otherwise
    # broadcast to a (batch, batch) matrix and the metric would be meaningless.
    y_pred = tf.reshape(tf.cast(y_pred, tf.float32), [-1])
    y_true = tf.reshape(tf.cast(y_true, tf.float32), [-1])

    # Clamp only the denominator; the numerator must use the original y_true.
    denominator = tf.maximum(tf.abs(y_true), tf.keras.backend.epsilon())
    return tf.reduce_mean(tf.abs(y_true - y_pred) / denominator) * 100


In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM, concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Inputs: one text document per vacancy (all object columns joined with spaces)
# and a numeric matrix (float/int64 columns without the id and salary columns).
text_data = df.select_dtypes(['object']).fillna("").astype(str).agg(" ".join, axis=1)
num_data = df.select_dtypes(['float', 'int64']).drop(['id', 'salary', 'salary_to', 'salary_from'], axis = 1)
target = df["salary"]

# Split row positions first so that every preprocessing step below is fitted on
# the training rows only (no test-set leakage). train_test_split picks rows by
# position, so with the same test_size / random_state this selects the same
# rows as splitting the prepared arrays directly.
train_idx, test_idx = train_test_split(
    np.arange(len(target)), test_size=0.2, random_state=42
)

# Impute missing numeric values with the training-set column means.
num_data = num_data.fillna(num_data.iloc[train_idx].mean())

# Vocabulary is learned from the training texts; all texts are then encoded
# and padded/truncated to 200 tokens.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(text_data.iloc[train_idx])
sequences = tokenizer.texts_to_sequences(text_data)
X_text = pad_sequences(sequences, maxlen=200)

# Scaling statistics come from the training rows and are applied to all rows.
scaler = StandardScaler()
scaler.fit(num_data.iloc[train_idx])
X_num = scaler.transform(num_data)
y = target.to_numpy().astype(np.float32)

X_text_train, X_text_test = X_text[train_idx], X_text[test_idx]
X_num_train, X_num_test = X_num[train_idx], X_num[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Text branch: token embedding followed by an LSTM encoder.
text_input = Input(shape=(200,), name="text_input")
x_text = Embedding(input_dim=10000, output_dim=32)(text_input)
x_text = LSTM(64)(x_text)

# Numeric branch: a single dense layer.
num_input = Input(shape=(X_num.shape[1],), name="num_input")
x_num = Dense(32, activation="relu")(num_input)

# Merge both branches and regress a single value (the salary).
x = concatenate([x_text, x_num])
x = Dense(64, activation="relu")(x)
x = Dropout(0.3)(x)
output = Dense(1)(x)

model = Model(inputs=[text_input, num_input], outputs=output)
# NOTE(review): the loss is MSE on the raw, unscaled salary - consider scaling
# or log-transforming the target if training converges slowly.
model.compile(optimizer="adam", loss="mse", metrics=[mape_tf])  # custom MAPE metric

model.summary()

# Training; the held-out split is used as validation data.
model.fit(
    {"text_input": X_text_train, "num_input": X_num_train},
    y_train,
    validation_data=(
        {"text_input": X_text_test, "num_input": X_num_test},
        y_test
    ),
    epochs=10,
    batch_size=32
)


Epoch 1/10
[1m693/693[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 38ms/step - loss: 15143336960.0000 - mape_tf: 99.2742 - val_loss: 13493368832.0000 - val_mape_tf: 91.7672
Epoch 2/10
[1m693/693[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 39ms/step - loss: 12901306368.0000 - mape_tf: 85.4362 - val_loss: 8420563968.0000 - val_mape_tf: 64.4089
Epoch 3/10
[1m693/693[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 40ms/step - loss: 7156320768.0000 - mape_tf: 62.6655 - val_loss: 3591724800.0000 - val_mape_tf: 65.2763
Epoch 4/10
[1m693/693[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 40ms/step - loss: 3179756544.0000 - mape_tf: 69.9276 - val_loss: 1516584576.0000 - val_mape_tf: 78.5956
Epoch 5/10
[1m693/693[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 39ms/step - loss: 1457669632.0000 - mape_tf: 83.0021 - val_loss: 812050368.0000 - val_mape_tf: 85.9486
Epoch 6/10
[1m693/693[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 39ms/step 

<keras.src.callbacks.history.History at 0x3b926cbe0>

In [73]:
# Inputs: one text document per vacancy (the categorical features joined with
# spaces) and the numeric feature matrix.
text_data = df[cat_features].fillna("").astype(str).agg(" ".join, axis=1)
num_data = df[num_features]
target = df["salary"]

# Split row positions first so that every preprocessing step below is fitted on
# the training rows only (no test-set leakage). train_test_split picks rows by
# position, so with the same test_size / random_state this selects the same
# rows as splitting the prepared arrays directly.
train_idx, test_idx = train_test_split(
    np.arange(len(target)), test_size=0.2, random_state=42
)

# Impute missing numeric values with the training-set column means.
num_data = num_data.fillna(num_data.iloc[train_idx].mean())

# Vocabulary is learned from the training texts; all texts are then encoded
# and padded/truncated to 200 tokens.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(text_data.iloc[train_idx])
sequences = tokenizer.texts_to_sequences(text_data)
X_text = pad_sequences(sequences, maxlen=200)

# Scaling statistics come from the training rows and are applied to all rows.
scaler = StandardScaler()
scaler.fit(num_data.iloc[train_idx])
X_num = scaler.transform(num_data)
y = target.to_numpy().astype(np.float32)

X_text_train, X_text_test = X_text[train_idx], X_text[test_idx]
X_num_train, X_num_test = X_num[train_idx], X_num[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Text branch: token embedding followed by an LSTM encoder.
text_input = Input(shape=(200,), name="text_input")
x_text = Embedding(input_dim=10000, output_dim=32)(text_input)
x_text = LSTM(64)(x_text)

# Numeric branch: a single dense layer.
num_input = Input(shape=(X_num.shape[1],), name="num_input")
x_num = Dense(32, activation="relu")(num_input)

# Merge both branches and regress a single value (the salary).
x = concatenate([x_text, x_num])
x = Dense(64, activation="relu")(x)
x = Dropout(0.3)(x)
output = Dense(1)(x)

model = Model(inputs=[text_input, num_input], outputs=output)
# NOTE(review): the loss is MSE on the raw, unscaled salary - consider scaling
# or log-transforming the target if training converges slowly.
model.compile(optimizer="adam", loss="mse", metrics=[mape_tf])  # custom MAPE metric

model.summary()

# Training; the held-out split is used as validation data.
model.fit(
    {"text_input": X_text_train, "num_input": X_num_train},
    y_train,
    validation_data=(
        {"text_input": X_text_test, "num_input": X_num_test},
        y_test
    ),
    epochs=10,
    batch_size=32
)


Epoch 1/10
[1m693/693[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 37ms/step - loss: 14859529216.0000 - mape_tf: 99.3617 - val_loss: 13917022208.0000 - val_mape_tf: 93.3180
Epoch 2/10
[1m693/693[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 37ms/step - loss: 13642586112.0000 - mape_tf: 88.1719 - val_loss: 10714537984.0000 - val_mape_tf: 66.1724
Epoch 3/10
[1m693/693[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 37ms/step - loss: 10240205824.0000 - mape_tf: 58.5005 - val_loss: 6976589312.0000 - val_mape_tf: 44.0716
Epoch 4/10
[1m693/693[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 38ms/step - loss: 6870162432.0000 - mape_tf: 47.2094 - val_loss: 5183003648.0000 - val_mape_tf: 57.1020
Epoch 5/10
[1m693/693[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 38ms/step - loss: 5373954048.0000 - mape_tf: 61.0945 - val_loss: 3917415168.0000 - val_mape_tf: 56.0424
Epoch 6/10
[1m693/693[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 38ms/st

<keras.src.callbacks.history.History at 0x3a2ddf0a0>

### Выводы
- DL-подходы не смогли по качеству превзойти нелинейные ML-модели