In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error, mean_absolute_error
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import logging
import os
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_log_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, InputLayer
from tensorflow.keras.callbacks import EarlyStopping
import time
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('final_data.csv', low_memory=False)
df.salary_gross.fillna(False, inplace=True)
df.shape

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.salary_gross.fillna(False, inplace=True)
  df.salary_gross.fillna(False, inplace=True)


(709524, 43)

In [4]:
def culc_metrics(y_test, y_pred):
    test_mse = mean_squared_error(y_test, y_pred)
    rmse = test_mse**0.5
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    def symmetric_mean_absolute_percentage_error(y_true, y_pred):
        y_true = np.array(y_true)
        y_pred = np.array(y_pred)
        smape = 100 * np.mean(2 * np.abs(y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred)))
        return smape

    smape = symmetric_mean_absolute_percentage_error(y_test, y_pred)

    medae = median_absolute_error(y_test, y_pred)

    print(f'Корень из среднеквадратичной ошибки (RMSE): {rmse}')
    print(f"R² Score: {r2}")
    print(f"Средняя абсолютная ошибка (MAE): {mae}")
    print(f"Средняя абсолютная процентная ошибка (SMAPE): {smape:.2f}%")
    print(f"Медианная абсолютная ошибка (MedAE): {medae}")

In [5]:
cat_columns = ['premium', 'has_test', 'response_letter_required', 'area_name', 'salary_currency', 'salary_gross', 'type_name', 'address_city', 'address_metro_station_name', 'address_metro_line_name', 'address_metro_stations_0_line_name', 'archived', 'employer_name', 'employer_accredited_it_employer', 'employer_trusted', 'schedule_name', 'accept_temporary', 'professional_roles_0_name', 'accept_incomplete_resumes', 'experience_name', 'employment_name', 'address_metro_stations_3_station_name', 'address_metro_stations_3_line_name', 'working_time_intervals_0_name', 'working_time_modes_0_name', 'working_days_0_name', 'branding_type', 'branding_tariff', 'department_name', 'insider_interview_id', 'brand_snippet_logo', 'brand_snippet_picture', 'brand_snippet_background_color', 'brand_snippet_background_gradient_angle', 'brand_snippet_background_gradient_color_list_0_position', 'brand_snippet_background_gradient_color_list_1_position', 'category']
text_columns = ['name', 'snippet_requirement', 'snippet_responsibility']
num_columns = ['name_length', 'length']

In [5]:
scaler = StandardScaler()
num_df = pd.DataFrame(scaler.fit_transform(df[num_columns]), columns=num_columns)

In [6]:
label_columns = []
ohe_columns = []

for column in cat_columns:
    if df[column].nunique() > 10:
        label_columns.append(column)
    else:
        ohe_columns.append(column)

to_bool = list(df[cat_columns].select_dtypes(include=['bool']).columns)
df[['salary_gross', 'employer_accredited_it_employer']] = df[['salary_gross', 'employer_accredited_it_employer']].astype(bool).astype(int)
df[to_bool] = df[to_bool].astype(int)

ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe_encoded = ohe.fit_transform(df[ohe_columns])
ohe_feature_names = ohe.get_feature_names_out(ohe_columns).tolist()
encoded_ohe_data = pd.DataFrame(ohe_encoded, columns=ohe_feature_names)

embedding_dim = 5
embeddings = {}

for col in label_columns:
    unique_values = df[col].unique()
    value_to_idx = {v: i for i, v in enumerate(unique_values)}
    df[col+'_idx'] = df[col].map(value_to_idx)

    num_embeddings = len(unique_values)
    embedding_layer = nn.Embedding(num_embeddings, embedding_dim)

    embeddings[col] = {
        'value_to_idx': value_to_idx,
        'embedding': embedding_layer,
        'num_embeddings': num_embeddings
    }

embedded_data = []
for col in label_columns:
    indices = torch.tensor(df[col+'_idx'].values, dtype=torch.long)
    embedded = embeddings[col]['embedding'](indices).detach().numpy()
    embedded_cols = [f"{col}_embed_{i}" for i in range(embedding_dim)]
    embedded_df = pd.DataFrame(embedded, columns=embedded_cols)
    embedded_data.append(embedded_df)

embedded_data = pd.concat(embedded_data, axis=1)
final_data = pd.concat([encoded_ohe_data, embedded_data], axis=1)

for col in label_columns:
    df.drop(col+'_idx', axis=1, inplace=True)

In [7]:
final_data.shape

(709524, 111)

In [8]:
final_data

Unnamed: 0,premium_1,has_test_1,response_letter_required_1,salary_currency_BYR,salary_currency_EUR,salary_currency_GEL,salary_currency_KGS,salary_currency_KZT,salary_currency_RUR,salary_currency_USD,salary_currency_UZS,salary_gross_1,type_name_Закрытая,type_name_Открытая,type_name_Рекламная,archived_1,employer_trusted_1,schedule_name_Гибкий график,schedule_name_Полный день,schedule_name_Сменный график,schedule_name_Удаленная работа,accept_temporary_1,accept_incomplete_resumes_1,experience_name_Нет опыта,experience_name_От 1 года до 3 лет,experience_name_От 3 до 6 лет,employment_name_Полная занятость,employment_name_Проектная работа,employment_name_Стажировка,employment_name_Частичная занятость,working_time_intervals_0_name_Можно сменами по 4-6 часов в день,working_time_modes_0_name_С началом дня после 16:00,working_days_0_name_По субботам и воскресеньям,branding_type_MAKEUP,branding_type_Unknown,branding_tariff_Unknown,insider_interview_id_1,brand_snippet_logo_Unknown,brand_snippet_picture_Unknown,brand_snippet_background_color_#EF3124,brand_snippet_background_color_#FF5B29,brand_snippet_background_color_Unknown,brand_snippet_background_gradient_angle_134.0,brand_snippet_background_gradient_angle_200.0,brand_snippet_background_gradient_angle_206.43,brand_snippet_background_gradient_angle_67.0,brand_snippet_background_gradient_angle_Unknown,brand_snippet_background_gradient_color_list_0_position_0.0,brand_snippet_background_gradient_color_list_0_position_0.52,brand_snippet_background_gradient_color_list_0_position_6.96,brand_snippet_background_gradient_color_list_0_position_Unknown,brand_snippet_background_gradient_color_list_1_position_40.0,brand_snippet_background_gradient_color_list_1_position_88.86,brand_snippet_background_gradient_color_list_1_position_90.95,brand_snippet_background_gradient_color_list_1_position_94.48,brand_snippet_background_gradient_color_list_1_position_Unknown,area_name_embed_0,area_name_embed_1,area_name_embed_2,area_name_embed_3,area_name_embed_4,address_city_embed_0,address_city_embed_1,address_city_embed_2,address_city_embed_3,address_city_embed_4,address_metro_station_name_embed_0,address_metro_station_name_embed_1,address_metro_station_name_embed_2,address_metro_station_name_embed_3,address_metro_station_name_embed_4,address_metro_line_name_embed_0,address_metro_line_name_embed_1,address_metro_line_name_embed_2,address_metro_line_name_embed_3,address_metro_line_name_embed_4,address_metro_stations_0_line_name_embed_0,address_metro_stations_0_line_name_embed_1,address_metro_stations_0_line_name_embed_2,address_metro_stations_0_line_name_embed_3,address_metro_stations_0_line_name_embed_4,employer_name_embed_0,employer_name_embed_1,employer_name_embed_2,employer_name_embed_3,employer_name_embed_4,professional_roles_0_name_embed_0,professional_roles_0_name_embed_1,professional_roles_0_name_embed_2,professional_roles_0_name_embed_3,professional_roles_0_name_embed_4,address_metro_stations_3_station_name_embed_0,address_metro_stations_3_station_name_embed_1,address_metro_stations_3_station_name_embed_2,address_metro_stations_3_station_name_embed_3,address_metro_stations_3_station_name_embed_4,address_metro_stations_3_line_name_embed_0,address_metro_stations_3_line_name_embed_1,address_metro_stations_3_line_name_embed_2,address_metro_stations_3_line_name_embed_3,address_metro_stations_3_line_name_embed_4,department_name_embed_0,department_name_embed_1,department_name_embed_2,department_name_embed_3,department_name_embed_4,category_embed_0,category_embed_1,category_embed_2,category_embed_3,category_embed_4
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.701464,-1.615262,0.754498,-0.262368,1.099497,0.024146,0.160305,2.728099,0.458977,1.072765,-0.003873,1.226812,1.276646,-0.885671,-0.579613,0.270035,-0.253400,1.645754,-0.602068,-0.322790,-2.181500,-2.041956,-1.006408,0.428850,0.235146,0.515863,0.554688,-0.014223,-1.592910,0.125352,-0.170704,-0.403730,-1.501531,1.334968,0.148811,1.407867,-0.279962,-1.049748,2.225323,0.155167,0.958755,1.145142,-1.377043,1.319855,0.933729,-1.899433,0.196458,0.440680,-0.778424,-1.976537,-2.070985,-0.664560,-2.560243,0.391297,-1.388778
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.701464,-1.615262,0.754498,-0.262368,1.099497,0.024146,0.160305,2.728099,0.458977,1.072765,-0.095361,0.417154,0.724835,-1.752340,-0.174674,-1.323844,-1.011770,-0.055409,0.236577,-1.272605,-1.446106,-2.001752,1.122317,-0.299088,0.400086,-0.385185,-1.039672,0.000320,-0.903292,0.492474,0.917950,0.171739,-0.501276,0.760431,-1.615547,1.407867,-0.279962,-1.049748,2.225323,0.155167,0.958755,1.145142,-1.377043,1.319855,0.933729,-1.899433,0.196458,0.440680,-0.778424,-1.976537,0.485914,0.002965,0.807796,0.412373,2.250132
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.701464,-1.615262,0.754498,-0.262368,1.099497,0.024146,0.160305,2.728099,0.458977,1.072765,0.374836,-1.077513,-0.044093,1.052550,-0.811428,-0.304150,-0.084076,1.027458,0.144151,0.956945,0.554514,-0.272324,-0.745118,-0.030253,-0.010703,-2.329050,2.246178,-0.299642,-0.384970,-0.346754,0.917950,0.171739,-0.501276,0.760431,-1.615547,1.407867,-0.279962,-1.049748,2.225323,0.155167,0.958755,1.145142,-1.377043,1.319855,0.933729,-1.899433,0.196458,0.440680,-0.778424,-1.976537,-0.355552,-1.068382,-0.914832,-0.541872,0.195133
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.741489,-0.835589,-0.281605,-1.349698,0.339746,0.024146,0.160305,2.728099,0.458977,1.072765,-0.924344,0.160939,-2.302918,-0.574158,-0.195191,2.009651,0.174501,-0.361396,1.407593,0.077497,0.497405,1.409170,-0.159476,-0.406555,-0.430901,-1.166600,1.326362,-0.615913,0.714830,0.630274,0.917950,0.171739,-0.501276,0.760431,-1.615547,0.952073,-1.061820,0.767185,0.336021,-0.509177,0.403001,-0.703142,1.652349,-0.778522,-0.873893,-1.899433,0.196458,0.440680,-0.778424,-1.976537,-2.070985,-0.664560,-2.560243,0.391297,-1.388778
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-1.079846,-0.005777,0.231124,2.564376,-1.216872,0.024146,0.160305,2.728099,0.458977,1.072765,0.366170,-0.536962,-1.514246,0.925023,1.140789,-1.489921,-0.047117,-0.491158,-0.827150,-0.263136,1.003888,-0.784910,-0.612241,-1.169882,0.632228,-0.121413,1.143173,0.981146,0.359502,-0.259121,1.625334,-1.728398,1.265498,2.049627,-0.374945,1.407867,-0.279962,-1.049748,2.225323,0.155167,0.958755,1.145142,-1.377043,1.319855,0.933729,-1.899433,0.196458,0.440680,-0.778424,-1.976537,-0.167827,0.344729,1.506905,-1.005204,1.072493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
709519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.053717,-1.455626,-0.924142,0.449567,-0.151341,0.985892,-0.030760,0.392873,-0.296175,-0.163778,0.896685,-0.518521,1.249421,0.684893,-0.249691,-0.190319,-1.373128,0.587280,-0.088329,-2.993003,-0.194329,1.464508,-0.538831,-0.061508,-1.281322,1.835995,-0.803063,-1.235332,-0.536295,-0.481237,1.258572,0.215742,1.440394,-1.012784,-0.059625,1.407867,-0.279962,-1.049748,2.225323,0.155167,0.958755,1.145142,-1.377043,1.319855,0.933729,0.689761,-2.332984,1.344425,1.004258,-0.137752,-0.935943,-1.746649,1.262352,0.047579,-0.496787
709520,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.053717,-1.455626,-0.924142,0.449567,-0.151341,0.985892,-0.030760,0.392873,-0.296175,-0.163778,0.278923,-0.804514,-0.565252,0.094857,0.696359,0.794509,-0.938563,0.175733,-1.938936,-0.637332,-0.376132,0.633706,-0.086955,0.928307,1.265653,-0.368144,-0.733234,0.454064,1.885526,-2.215897,-0.122691,-0.956562,0.354071,0.963784,1.092644,1.407867,-0.279962,-1.049748,2.225323,0.155167,0.958755,1.145142,-1.377043,1.319855,0.933729,-1.899433,0.196458,0.440680,-0.778424,-1.976537,0.409757,0.955992,0.915112,-1.026026,0.229121
709521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.701464,-1.615262,0.754498,-0.262368,1.099497,0.024146,0.160305,2.728099,0.458977,1.072765,0.896685,-0.518521,1.249421,0.684893,-0.249691,-0.190319,-1.373128,0.587280,-0.088329,-2.993003,-0.194329,1.464508,-0.538831,-0.061508,-1.281322,-0.813380,-0.438054,0.013673,0.956159,-0.811564,1.258572,0.215742,1.440394,-1.012784,-0.059625,1.407867,-0.279962,-1.049748,2.225323,0.155167,0.958755,1.145142,-1.377043,1.319855,0.933729,-1.899433,0.196458,0.440680,-0.778424,-1.976537,-0.935943,-1.746649,1.262352,0.047579,-0.496787
709522,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.285341,0.893154,1.284940,-1.815842,-1.006426,-2.384228,-0.122931,1.836658,-0.157787,-0.375916,-1.083411,-0.781880,0.853995,0.045773,-0.253415,0.486324,-0.564573,1.701497,-1.622198,-0.467179,-0.788015,1.381281,0.943555,-0.721234,-1.364549,1.577154,0.607830,0.178441,0.653111,2.056549,-0.170704,-0.403730,-1.501531,1.334968,0.148811,1.407867,-0.279962,-1.049748,2.225323,0.155167,0.958755,1.145142,-1.377043,1.319855,0.933729,-1.899433,0.196458,0.440680,-0.778424,-1.976537,-2.070985,-0.664560,-2.560243,0.391297,-1.388778


In [9]:
X_train, X_test_val, y_train, y_test_val, = train_test_split(final_data, df['salary'], test_size=0.4, random_state=12345)
X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=12345)

print(f'Размеры выборок: Обучающая {X_train.shape}, Валидационная {X_test.shape}, Тестовая {X_val.shape}')

Размеры выборок: Обучающая (425714, 111), Валидационная (141905, 111), Тестовая (141905, 111)


In [10]:
model_dtr = DecisionTreeRegressor(random_state=12345)

regressor = TransformedTargetRegressor(
    regressor=model_dtr,
    func=np.log,
    inverse_func=np.exp
)


param_grid = {
    'regressor__max_depth': [10, 11, 12, 13, 14, 15, 16],
    'regressor__min_samples_split': [2, 5],
    'regressor__min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print(f'Лучшие параметры: {best_params}')


y_pred = best_model.predict(X_test)
culc_metrics(y_test, y_pred)

Fitting 3 folds for each of 28 candidates, totalling 84 fits
Лучшие параметры: {'regressor__max_depth': 15, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 2}
Корень из среднеквадратичной ошибки (RMSE): 48970.264067754295
R² Score: 0.5416319993914924
Средняя абсолютная ошибка (MAE): 25268.431367611895
Средняя абсолютная процентная ошибка (SMAPE): 28.27%
Медианная абсолютная ошибка (MedAE): 14970.763403016885


Случайному лесу создание эмбедингов не принесло никакой информации и никак не улучшило обобщающую способность. Продолжем использщовать изначальный DF.

#Полносвязная нейронная сеть

Создадим свою нейронную сеть основаную на **Sequentia**
И протестируем на разных вариантах архитектур

In [6]:
def build_and_train_model(architecture, X_train, y_train, X_test, y_test, epochs=100, batch_size=32):
    """
    Строит и обучает модель с заданной архитектурой

    Параметры:
    architecture - список, определяющий архитектуру сети (количество нейронов в каждом слое)
    X_train, y_train - обучающие данные
    X_test, y_test - тестовые данные
    epochs - количество эпох обучения
    batch_size - размер батча

    Возвращает:
    model - обученная модель
    history - история обучения
    metrics - словарь с метриками на тестовых данных
    train_time - время обучения
    """

    input_shape = X_train.shape[1]

    model = Sequential()

    model.add(InputLayer(shape=(input_shape,)))
    model.add(Dense(architecture[0], activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))

    for neurons in architecture[1:]:
        model.add(Dense(neurons, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(0.2))

    model.add(Dense(1))

    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    start_time = time.time()
    history = model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stopping],
        verbose=0
    )
    train_time = time.time() - start_time

    y_pred = model.predict(X_test).flatten()

    metrics = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R2': r2_score(y_test, y_pred)
    }

    return model, history, y_pred, train_time, metrics

In [24]:
architectures = {
    'small': [64, 32],
    'medium': [128, 64, 32],
    'large': [256, 128, 64, 32],
    'wide': [512, 256],
    'deep': [64, 64, 64, 64, 64]
}

results = {}

for name, arch in architectures.items():
    print(f"\nTraining {name} architecture: {arch}")
    model, history, y_pred, train_time, metrics = build_and_train_model(
        arch, X_train, y_train, X_test, y_test
    )

    results[name] = {
        'architecture': arch,
        'train_time': train_time,
        'metrics': metrics,
        'epochs_trained': len(history.history['loss'])
    }

    print(f"Training time: {train_time:.2f}s")
    culc_metrics(y_test, y_pred)


Training small architecture: [64, 32]
[1m4435/4435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step
Training time: 1012.70s
Корень из среднеквадратичной ошибки (RMSE): 50941.297970951884
R² Score: 0.5039911649265851
Средняя абсолютная ошибка (MAE): 29103.112701003913
Средняя абсолютная процентная ошибка (SMAPE): 32.33%
Медианная абсолютная ошибка (MedAE): 20343.0625

Training medium architecture: [128, 64, 32]
[1m4435/4435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step
Training time: 1239.07s
Корень из среднеквадратичной ошибки (RMSE): 63997.72743782658
R² Score: 0.2171500930664486
Средняя абсолютная ошибка (MAE): 29133.000331506486
Средняя абсолютная процентная ошибка (SMAPE): 31.22%
Медианная абсолютная ошибка (MedAE): 18446.55078125

Training large architecture: [256, 128, 64, 32]
[1m4435/4435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step
Training time: 903.88s
Корень из среднеквадратичной ошибки (RMSE): 99553.35289036723
R² Score: 

In [8]:
scaler = StandardScaler()
num_df = pd.DataFrame(scaler.fit_transform(df[num_columns]), columns=num_columns)

label_columns = []
ohe_columns = []

for column in cat_columns:
    if df[column].nunique() > 10:
        label_columns.append(column)
    else:
        ohe_columns.append(column)

to_bool = list(df[cat_columns].select_dtypes(include=['bool']).columns)
df[['salary_gross', 'employer_accredited_it_employer']] = df[['salary_gross', 'employer_accredited_it_employer']].astype(bool).astype(int)
df[to_bool] = df[to_bool].astype(int)

ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe_encoded = ohe.fit_transform(df[ohe_columns])
ohe_feature_names = ohe.get_feature_names_out(ohe_columns).tolist()
encoded_ohe_data = pd.DataFrame(ohe_encoded, columns=ohe_feature_names)

label_encoder = LabelEncoder()
for col in label_columns:
    df[col] = label_encoder.fit_transform(df[col])

X = pd.concat([df[label_columns], encoded_ohe_data, num_df], axis=1)
y = df['salary']

X_2_train, X_2_test_val, y_2_train, y_2_test_val, = train_test_split(X, y, test_size=0.4, random_state=12345)
X_2_test, X_2_val, y_2_test, y_2_val = train_test_split(X_2_test_val, y_2_test_val, test_size=0.5, random_state=12345)

print(f'Размеры выборок: Обучающая {X_2_train.shape}, Валидационная {X_2_test.shape}, Тестовая {X_2_val.shape}')

Размеры выборок: Обучающая (425714, 69), Валидационная (141905, 69), Тестовая (141905, 69)


In [None]:
results_2 = {}

architectures = {
    'small': [64, 32],
    'medium': [128, 64, 32],
    'large': [256, 128, 64, 32],
    'wide': [512, 256],
    'deep': [64, 64, 64, 64, 64]
}

for name, arch in architectures.items():
    print(f"\nTraining {name} architecture: {arch}")
    model, history, y_pred, train_time, metrics = build_and_train_model(
        arch, X_2_train, y_2_train, X_2_test, y_2_test
    )

    results_2[name] = {
        'architecture': arch,
        'train_time': train_time,
        'metrics': metrics,
        'epochs_trained': len(history.history['loss'])
    }

    print(f"Training time: {train_time:.2f}s")
    culc_metrics(y_2_test, y_pred)


Training small architecture: [64, 32]
[1m4435/4435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step
Training time: 702.78s
Корень из среднеквадратичной ошибки (RMSE): 71249.22691526295
R² Score: 0.029691776178032314
Средняя абсолютная ошибка (MAE): 41372.06422014223
Средняя абсолютная процентная ошибка (SMAPE): 45.38%
Медианная абсолютная ошибка (MedAE): 33547.609375

Training medium architecture: [128, 64, 32]
[1m4435/4435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step
Training time: 1053.81s
Корень из среднеквадратичной ошибки (RMSE): 70796.97617631152
R² Score: 0.041970643708062916
Средняя абсолютная ошибка (MAE): 38817.90827763207
Средняя абсолютная процентная ошибка (SMAPE): 42.65%
Медианная абсолютная ошибка (MedAE): 28399.53125

Training large architecture: [256, 128, 64, 32]
[1m4435/4435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step
Training time: 1138.94s
Корень из среднеквадратичной ошибки (RMSE): 75041.47628239205
R² Score: 

## Выводы