In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error, mean_absolute_error
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
import re
import logging
import os
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_log_error

In [3]:
# Load the prepared dataset; low_memory=False makes pandas read the file in one
# pass so column dtypes are inferred from all rows rather than chunk by chunk.
df = pd.read_csv('final_data.csv', low_memory=False)
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the attribute accessor: the chained in-place form operates on an intermediate
# object and stops updating `df` in pandas 3.0.
df['salary_gross'] = df['salary_gross'].fillna(False)
df.shape

  df = pd.read_csv('final_data.csv')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.salary_gross.fillna(False, inplace=True)
  df.salary_gross.fillna(False, inplace=True)


(709524, 43)

In [4]:
def culc_metrics(y_test, y_pred):
    """Print regression quality metrics for a set of predictions.

    Prints RMSE, R², MAE, SMAPE (in percent) and the median absolute error.

    Args:
        y_test: array-like of true target values.
        y_pred: array-like of predicted values, aligned with ``y_test``.

    Returns:
        None. The metrics are only printed.
    """
    test_mse = mean_squared_error(y_test, y_pred)
    rmse = test_mse**0.5
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    def symmetric_mean_absolute_percentage_error(y_true, y_pred):
        """Return SMAPE in percent; a 0/0 term counts as zero error."""
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        numerator = 2 * np.abs(y_true - y_pred)
        denominator = np.abs(y_true) + np.abs(y_pred)
        # When both the target and the prediction are exactly zero the term is
        # 0/0; treat it as a perfect prediction instead of letting NaN poison
        # the mean.
        terms = np.divide(
            numerator,
            denominator,
            out=np.zeros_like(denominator),
            where=denominator != 0,
        )
        return 100 * np.mean(terms)

    smape = symmetric_mean_absolute_percentage_error(y_test, y_pred)

    medae = median_absolute_error(y_test, y_pred)

    print(f'Корень из среднеквадратичной ошибки (RMSE): {rmse}')
    print(f"R² Score: {r2}")
    print(f"Средняя абсолютная ошибка (MAE): {mae}")
    print(f"Средняя абсолютная процентная ошибка (SMAPE): {smape:.2f}%")
    print(f"Медианная абсолютная ошибка (MedAE): {medae}")

In [5]:
# Categorical feature columns (low-cardinality ones are one-hot encoded later,
# high-cardinality ones are indexed and embedded).
cat_columns = [
    'premium',
    'has_test',
    'response_letter_required',
    'area_name',
    'salary_currency',
    'salary_gross',
    'type_name',
    'address_city',
    'address_metro_station_name',
    'address_metro_line_name',
    'address_metro_stations_0_line_name',
    'archived',
    'employer_name',
    'employer_accredited_it_employer',
    'employer_trusted',
    'schedule_name',
    'accept_temporary',
    'professional_roles_0_name',
    'accept_incomplete_resumes',
    'experience_name',
    'employment_name',
    'address_metro_stations_3_station_name',
    'address_metro_stations_3_line_name',
    'working_time_intervals_0_name',
    'working_time_modes_0_name',
    'working_days_0_name',
    'branding_type',
    'branding_tariff',
    'department_name',
    'insider_interview_id',
    'brand_snippet_logo',
    'brand_snippet_picture',
    'brand_snippet_background_color',
    'brand_snippet_background_gradient_angle',
    'brand_snippet_background_gradient_color_list_0_position',
    'brand_snippet_background_gradient_color_list_1_position',
    'category',
]

# Free-text columns.
text_columns = [
    'name',
    'snippet_requirement',
    'snippet_responsibility',
]

# Numeric columns.
num_columns = [
    'name_length',
    'length',
]

In [6]:
# Standardise the numeric columns and keep the result as a DataFrame with the
# original column names.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split further down — confirm this is acceptable.
scaler = StandardScaler()
num_df = pd.DataFrame(
    data=scaler.fit_transform(df[num_columns]),
    columns=num_columns,
)

In [7]:
# Split the categorical columns by cardinality: columns with more than 10
# distinct values are indexed and embedded, the rest are one-hot encoded.
label_columns = []
ohe_columns = []

for column in cat_columns:
    if df[column].nunique() > 10:
        label_columns.append(column)
    else:
        ohe_columns.append(column)

# Turn boolean flags into 0/1 integers so the encoder sees numeric categories.
to_bool = list(df[cat_columns].select_dtypes(include=['bool']).columns)
df[['salary_gross', 'employer_accredited_it_employer']] = df[['salary_gross', 'employer_accredited_it_employer']].astype(bool).astype(int)
df[to_bool] = df[to_bool].astype(int)

# One-hot encode the low-cardinality columns; drop='first' removes one level
# per column.
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe_encoded = ohe.fit_transform(df[ohe_columns])
ohe_feature_names = ohe.get_feature_names_out(ohe_columns).tolist()
encoded_ohe_data = pd.DataFrame(ohe_encoded, columns=ohe_feature_names)

embedding_dim = 5
embeddings = {}
embedded_frames = []

# The embedding layers are used right after creation, with no training step in
# this cell, so their values come straight from the random initialiser. Seed
# torch so the feature matrix is the same on every run.
torch.manual_seed(12345)

for col in label_columns:
    unique_values = df[col].unique()
    value_to_idx = {v: i for i, v in enumerate(unique_values)}

    num_embeddings = len(unique_values)
    embedding_layer = nn.Embedding(num_embeddings, embedding_dim)

    embeddings[col] = {
        'value_to_idx': value_to_idx,
        'embedding': embedding_layer,
        'num_embeddings': num_embeddings
    }

    # Keep the integer codes local instead of adding (and later dropping)
    # temporary `<col>_idx` columns on `df`.
    indices = torch.tensor(df[col].map(value_to_idx).values, dtype=torch.long)
    embedded = embedding_layer(indices).detach().numpy()
    embedded_cols = [f"{col}_embed_{i}" for i in range(embedding_dim)]
    embedded_frames.append(pd.DataFrame(embedded, columns=embedded_cols))

embedded_data = pd.concat(embedded_frames, axis=1)
# NOTE(review): only the one-hot and embedding columns are combined here; the
# numeric columns are not part of final_data — confirm this is intentional.
final_data = pd.concat([encoded_ohe_data, embedded_data], axis=1)

In [10]:
# Sanity check: (rows, columns) of the combined feature matrix.
final_data.shape

(709524, 112)

In [11]:
# Preview the combined feature matrix: one-hot columns first, then the
# `<column>_embed_<i>` embedding columns.
final_data

Unnamed: 0,premium_1,has_test_1,response_letter_required_1,salary_currency_BYR,salary_currency_EUR,salary_currency_GEL,salary_currency_KGS,salary_currency_KZT,salary_currency_RUR,salary_currency_USD,salary_currency_UZS,salary_gross_1,type_name_Закрытая,type_name_Открытая,type_name_Рекламная,archived_1,employer_accredited_it_employer_1,employer_trusted_1,schedule_name_Гибкий график,schedule_name_Полный день,schedule_name_Сменный график,schedule_name_Удаленная работа,accept_temporary_1,accept_incomplete_resumes_1,experience_name_Нет опыта,experience_name_От 1 года до 3 лет,experience_name_От 3 до 6 лет,employment_name_Полная занятость,employment_name_Проектная работа,employment_name_Стажировка,employment_name_Частичная занятость,working_time_intervals_0_name_Можно сменами по 4-6 часов в день,working_time_modes_0_name_С началом дня после 16:00,working_days_0_name_По субботам и воскресеньям,branding_type_MAKEUP,branding_type_Unknown,branding_tariff_Unknown,insider_interview_id_1,brand_snippet_logo_Unknown,brand_snippet_picture_Unknown,brand_snippet_background_color_#EF3124,brand_snippet_background_color_#FF5B29,brand_snippet_background_color_Unknown,brand_snippet_background_gradient_angle_134.0,brand_snippet_background_gradient_angle_200.0,brand_snippet_background_gradient_angle_206.43,brand_snippet_background_gradient_angle_67.0,brand_snippet_background_gradient_angle_Unknown,brand_snippet_background_gradient_color_list_0_position_0.0,brand_snippet_background_gradient_color_list_0_position_0.52,brand_snippet_background_gradient_color_list_0_position_6.96,brand_snippet_background_gradient_color_list_0_position_Unknown,brand_snippet_background_gradient_color_list_1_position_40.0,brand_snippet_background_gradient_color_list_1_position_88.86,brand_snippet_background_gradient_color_list_1_position_90.95,brand_snippet_background_gradient_color_list_1_position_94.48,brand_snippet_background_gradient_color_list_1_position_Unknown,area_name_embed_0,area_name_embed_1,area_name_e
mbed_2,area_name_embed_3,area_name_embed_4,address_city_embed_0,address_city_embed_1,address_city_embed_2,address_city_embed_3,address_city_embed_4,address_metro_station_name_embed_0,address_metro_station_name_embed_1,address_metro_station_name_embed_2,address_metro_station_name_embed_3,address_metro_station_name_embed_4,address_metro_line_name_embed_0,address_metro_line_name_embed_1,address_metro_line_name_embed_2,address_metro_line_name_embed_3,address_metro_line_name_embed_4,address_metro_stations_0_line_name_embed_0,address_metro_stations_0_line_name_embed_1,address_metro_stations_0_line_name_embed_2,address_metro_stations_0_line_name_embed_3,address_metro_stations_0_line_name_embed_4,employer_name_embed_0,employer_name_embed_1,employer_name_embed_2,employer_name_embed_3,employer_name_embed_4,professional_roles_0_name_embed_0,professional_roles_0_name_embed_1,professional_roles_0_name_embed_2,professional_roles_0_name_embed_3,professional_roles_0_name_embed_4,address_metro_stations_3_station_name_embed_0,address_metro_stations_3_station_name_embed_1,address_metro_stations_3_station_name_embed_2,address_metro_stations_3_station_name_embed_3,address_metro_stations_3_station_name_embed_4,address_metro_stations_3_line_name_embed_0,address_metro_stations_3_line_name_embed_1,address_metro_stations_3_line_name_embed_2,address_metro_stations_3_line_name_embed_3,address_metro_stations_3_line_name_embed_4,department_name_embed_0,department_name_embed_1,department_name_embed_2,department_name_embed_3,department_name_embed_4,category_embed_0,category_embed_1,category_embed_2,category_embed_3,category_embed_4
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.652505,-0.368607,1.095452,-1.322183,-1.459315,-0.455402,-1.962246,-1.000633,1.080902,0.427841,1.804894,-2.582664,-1.463829,0.025981,-1.529305,1.062846,-0.680691,-0.242012,1.167792,0.279337,0.023169,-0.576443,-0.705927,-0.777326,-1.218284,-0.627765,-0.242025,0.749218,1.044417,-0.348710,-0.803582,-1.063112,-0.798858,-0.116605,0.658222,-0.120403,-1.143518,0.914951,0.665503,0.852125,-0.567592,0.509747,-0.812765,2.378783,1.516769,1.776673,-1.810006,-0.408357,0.228647,0.188721,-0.023008,0.645585,0.979055,-1.981147,-0.028583
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.652505,-0.368607,1.095452,-1.322183,-1.459315,-0.455402,-1.962246,-1.000633,1.080902,0.427841,1.239274,0.583559,1.484236,-0.542939,0.445673,0.467794,1.052778,-1.463598,1.941157,-0.442756,-0.172507,-0.802929,0.776700,0.345943,0.350638,-0.921635,0.802392,0.233477,-0.790880,-0.774614,0.744079,-0.617447,1.228476,-0.184319,-1.067731,-0.120403,-1.143518,0.914951,0.665503,0.852125,-0.567592,0.509747,-0.812765,2.378783,1.516769,1.776673,-1.810006,-0.408357,0.228647,0.188721,-0.288124,0.652618,-0.411295,-0.438743,-0.074688
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.652505,-0.368607,1.095452,-1.322183,-1.459315,-0.455402,-1.962246,-1.000633,1.080902,0.427841,0.117154,-0.525887,0.623840,-0.214389,0.579531,0.924701,-1.833789,-0.219824,0.402514,-0.396792,-0.118738,1.172824,-1.195580,-0.064300,0.499637,0.643843,0.629076,-0.993739,-0.354939,1.255363,0.744079,-0.617447,1.228476,-0.184319,-1.067731,-0.120403,-1.143518,0.914951,0.665503,0.852125,-0.567592,0.509747,-0.812765,2.378783,1.516769,1.776673,-1.810006,-0.408357,0.228647,0.188721,-0.202256,0.094759,0.230094,-2.534478,-1.072785
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.010789,-1.139127,0.769904,-0.575076,0.168872,-0.455402,-1.962246,-1.000633,1.080902,0.427841,0.969745,0.976655,0.629043,0.066213,1.845097,-0.801098,0.508767,0.051399,-0.790030,-0.203619,1.193290,0.324793,-2.154282,0.550901,0.424184,-0.888735,0.211729,-0.304674,2.325128,-0.087781,0.744079,-0.617447,1.228476,-0.184319,-1.067731,0.529986,1.485120,1.330569,1.095721,-0.212884,-1.554940,1.016977,-0.379282,-0.017666,-1.605943,1.776673,-1.810006,-0.408357,0.228647,0.188721,-0.023008,0.645585,0.979055,-1.981147,-0.028583
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.523559,-0.914635,0.626950,0.058712,-0.950223,-0.455402,-1.962246,-1.000633,1.080902,0.427841,-0.148729,-0.540210,0.540157,1.937805,2.341961,0.171712,-1.198354,0.398563,0.606520,-1.183418,-1.262271,0.077538,-1.080446,2.066349,1.758393,0.910799,0.799326,1.495492,-1.188784,-0.304168,-1.500939,-0.198826,0.251913,0.289374,0.872468,-0.120403,-1.143518,0.914951,0.665503,0.852125,-0.567592,0.509747,-0.812765,2.378783,1.516769,1.776673,-1.810006,-0.408357,0.228647,0.188721,-2.455722,-0.448813,-0.385866,-1.057859,-0.523528
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
709519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.267896,-0.163118,-1.401817,-0.345453,0.448348,-0.442233,-0.590128,0.162582,-1.031254,-0.095562,-3.622132,-0.724769,-0.473057,0.569101,-0.233145,3.293729,-0.156111,-0.893418,0.801990,-0.424846,-0.758177,-0.537882,-0.334652,0.619887,0.472138,0.028059,-0.460189,-0.683544,0.595136,-0.412429,1.121057,0.315554,-2.122544,1.535711,0.706550,-0.120403,-1.143518,0.914951,0.665503,0.852125,-0.567592,0.509747,-0.812765,2.378783,1.516769,-0.268293,-0.571598,0.361660,0.153256,-0.739171,-0.943469,0.351706,1.840374,-0.329713,0.022562
709520,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.267896,-0.163118,-1.401817,-0.345453,0.448348,-0.442233,-0.590128,0.162582,-1.031254,-0.095562,-0.103530,0.254994,-0.275249,0.250681,0.898506,-1.151577,-0.967518,-1.137642,-0.578554,0.085253,-0.063634,1.313112,0.957178,-1.643299,0.237376,-0.696379,0.297795,1.654361,0.696716,-1.026124,0.779543,0.208039,1.340530,1.103640,-1.122889,-0.120403,-1.143518,0.914951,0.665503,0.852125,-0.567592,0.509747,-0.812765,2.378783,1.516769,1.776673,-1.810006,-0.408357,0.228647,0.188721,-0.334753,0.276960,1.889106,-0.272238,-0.335370
709521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.652505,-0.368607,1.095452,-1.322183,-1.459315,-0.455402,-1.962246,-1.000633,1.080902,0.427841,-3.622132,-0.724769,-0.473057,0.569101,-0.233145,3.293729,-0.156111,-0.893418,0.801990,-0.424846,-0.758177,-0.537882,-0.334652,0.619887,0.472138,1.933325,-1.429628,-0.516816,0.710582,0.362434,1.121057,0.315554,-2.122544,1.535711,0.706550,-0.120403,-1.143518,0.914951,0.665503,0.852125,-0.567592,0.509747,-0.812765,2.378783,1.516769,1.776673,-1.810006,-0.408357,0.228647,0.188721,-0.943469,0.351706,1.840374,-0.329713,0.022562
709522,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.860474,0.401313,-0.871812,-0.296979,1.504986,-0.369002,-0.197675,0.688523,-1.360851,0.635603,1.386172,-1.922272,1.158087,2.146952,-0.180407,-2.197189,-0.797371,0.833678,0.481077,-1.489404,0.234462,0.947455,-0.834632,-0.937419,0.563906,0.040184,0.866980,-0.619904,0.736496,1.398171,-0.803582,-1.063112,-0.798858,-0.116605,0.658222,-0.120403,-1.143518,0.914951,0.665503,0.852125,-0.567592,0.509747,-0.812765,2.378783,1.516769,1.776673,-1.810006,-0.408357,0.228647,0.188721,-0.023008,0.645585,0.979055,-1.981147,-0.028583


In [12]:
# Hold out 40% of the rows, then halve that hold-out into a test part and a
# validation part (60/20/20 overall). The target is the `salary` column.
X_train, X_test_val, y_train, y_test_val = train_test_split(final_data, df['salary'], test_size=0.4, random_state=12345)
X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=12345)

# Report each split under the label that matches its variable: the validation
# size comes from X_val and the test size from X_test.
print(f'Размеры выборок: Обучающая {X_train.shape}, Валидационная {X_val.shape}, Тестовая {X_test.shape}')

Размеры выборок: Обучающая (425714, 112), Валидационная (141905, 112), Тестовая (141905, 112)


In [13]:
# Decision tree trained on log(salary); predictions are mapped back with exp.
model_dtr = DecisionTreeRegressor(random_state=12345)
regressor = TransformedTargetRegressor(regressor=model_dtr, func=np.log, inverse_func=np.exp)

# The `regressor__` prefix routes each parameter to the wrapped tree.
param_grid = {
    'regressor__max_depth': list(range(10, 17)),
    'regressor__min_samples_split': [2, 5],
    'regressor__min_samples_leaf': [1, 2],
}

# 3-fold cross-validated search over the grid, scored by negative MSE.
grid_search = GridSearchCV(
    regressor,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

best_model, best_params = grid_search.best_estimator_, grid_search.best_params_
print(f'Лучшие параметры: {best_params}')

# Evaluate the refitted best estimator on the X_test / y_test split.
y_pred = best_model.predict(X_test)
culc_metrics(y_test, y_pred)

Fitting 3 folds for each of 28 candidates, totalling 84 fits
Лучшие параметры: {'regressor__max_depth': 14, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 5}
Корень из среднеквадратичной ошибки (RMSE): 51507.25754661
R² Score: 0.49290859063905046
Средняя абсолютная ошибка (MAE): 25809.137176590288
Средняя абсолютная процентная ошибка (SMAPE): 28.91%
Медианная абсолютная ошибка (MedAE): 15563.7648293065


Случайному лесу создание эмбеддингов не принесло никакой информации и никак не улучшило обобщающую способность. Продолжаем использовать изначальный DF.