In [None]:
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

In [None]:
# Load the prepared dataset from the working directory.
df = pd.read_csv('final_data.csv')
# Treat a missing `salary_gross` as False. The result is assigned back to the column
# instead of calling fillna(..., inplace=True) on `df.salary_gross`: that form mutates an
# intermediate Series, which pandas warns about and which no longer updates `df` once
# Copy-on-Write is enabled.
df['salary_gross'] = df['salary_gross'].fillna(False)
df.shape

(709524, 47)

In [None]:
def _smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is 2 * |y_true - y_pred| / (|y_true| + |y_pred|). A pair where both
    values are exactly zero has a zero denominator; it is a perfect prediction, so
    its term is counted as 0 instead of producing NaN for the whole metric.

    Args:
        y_true: array-like of observed values.
        y_pred: array-like of predicted values, same length as ``y_true``.

    Returns:
        The mean of the per-sample terms multiplied by 100.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = 2 * np.abs(y_true - y_pred)
    denominator = np.abs(y_true) + np.abs(y_pred)
    # Only divide where the denominator is non-zero; the remaining slots keep the 0 from `out`.
    terms = np.divide(numerator, denominator, out=np.zeros_like(denominator), where=denominator != 0)
    return 100 * np.mean(terms)


def culc_metrics(y_test, y_pred):
    """Print regression quality metrics for a set of predictions.

    Reports RMSE, R², MAE, SMAPE (in percent) and the median absolute error.
    The function only prints; it returns None.

    Args:
        y_test: array-like of true target values.
        y_pred: array-like of predicted target values, same length as ``y_test``.
    """
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    smape = _smape(y_test, y_pred)
    medae = median_absolute_error(y_test, y_pred)

    print(f'Корень из среднеквадратичной ошибки (RMSE): {rmse}')
    print(f"R² Score: {r2}")
    print(f"Средняя абсолютная ошибка (MAE): {mae}")
    print(f"Средняя абсолютная процентная ошибка (SMAPE): {smape:.2f}%")
    print(f"Медианная абсолютная ошибка (MedAE): {medae}")

### Mean Baseline model

In [None]:
# Constant baseline: predict the overall mean salary for every row.
# NOTE(review): the mean is computed on, and scored against, the full dataset, while the
# models below are scored on a held-out split, so these numbers are only a rough reference.
true_values = df['salary'].to_numpy()
mean_salary = df['salary'].mean()
predictions = np.full(len(df), mean_salary)

culc_metrics(true_values, predictions)

Корень из среднеквадратичной ошибки (RMSE): 99590.3098849821
R² Score: 0.0
Средняя абсолютная ошибка (MAE): 42519.63234325419
Средняя абсолютная процентная ошибка (SMAPE): 46.64%
Медианная абсолютная ошибка (MedAE): 34089.93807447741


### Подготовка данных к обучению

In [None]:
# Categorical (including boolean-like) columns; they are encoded further below.
cat_columns = [
    'premium',
    'has_test',
    'response_letter_required',
    'area_name',
    'salary_currency',
    'salary_gross',
    'type_name',
    'address_city',
    'address_metro_station_name',
    'address_metro_line_name',
    'address_metro_stations_0_line_name',
    'archived',
    'employer_name',
    'employer_accredited_it_employer',
    'employer_trusted',
    'schedule_name',
    'accept_temporary',
    'professional_roles_0_name',
    'accept_incomplete_resumes',
    'experience_name',
    'employment_name',
    'address_metro_stations_3_station_name',
    'address_metro_stations_3_line_name',
    'working_time_intervals_0_name',
    'working_time_modes_0_name',
    'working_days_0_name',
    'branding_type',
    'branding_tariff',
    'department_name',
    'insider_interview_id',
    'brand_snippet_logo',
    'brand_snippet_picture',
    'brand_snippet_background_color',
    'brand_snippet_background_gradient_angle',
    'brand_snippet_background_gradient_color_list_0_position',
    'brand_snippet_background_gradient_color_list_1_position',
    'category',
]

# Free-text columns (passed to CatBoost as text features in later cells).
text_columns = [
    'name',
    'snippet_requirement',
    'snippet_responsibility',
]

# Numeric columns (standardized in the next cell).
num_columns = [
    'name_length',
    'length',
]

In [None]:
# Standardize the numeric columns with StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the train/test split
# done later in the notebook, so held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
num_df = pd.DataFrame(
    scaler.fit_transform(df[num_columns]),
    columns=num_columns,
    # Reuse df's row labels: the later pd.concat(..., axis=1) aligns on the index, so a
    # fresh 0..n-1 index would only line up while df itself has a default index.
    index=df.index,
)

In [None]:
# Route each categorical column by cardinality: more than 10 distinct values -> integer
# label codes, otherwise -> one-hot encoding. nunique() ignores missing values.
cardinality = df[cat_columns].nunique()
label_columns = [column for column in cat_columns if cardinality[column] > 10]
ohe_columns = [column for column in cat_columns if cardinality[column] <= 10]

# Remember which columns are bool-typed before the conversions below change dtypes.
to_bool = list(df[cat_columns].select_dtypes(include=['bool']).columns)
# NOTE(review): astype(bool) turns NaN (and any non-empty string) into True, so missing
# values in `employer_accredited_it_employer` would become 1 — confirm that is intended.
df[['salary_gross', 'employer_accredited_it_employer']] = df[['salary_gross', 'employer_accredited_it_employer']].astype(bool).astype(int)
df[to_bool] = df[to_bool].astype(int)

# One-hot encode the low-cardinality columns, dropping the first level of each.
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe_encoded = ohe.fit_transform(df[ohe_columns])
ohe_feature_names = ohe.get_feature_names_out(ohe_columns).tolist()
# Keep df's row labels so the later axis=1 concat with df[label_columns] aligns row-for-row.
encoded_ohe_data = pd.DataFrame(ohe_encoded, columns=ohe_feature_names, index=df.index)

# Replace each high-cardinality column in df with integer codes. The same encoder object
# is re-fitted per column, so after the loop it only holds the mapping of the last column.
label_encoder = LabelEncoder()
for col in label_columns:
    df[col] = label_encoder.fit_transform(df[col])
df[label_columns]

Unnamed: 0,area_name,address_city,address_metro_station_name,address_metro_line_name,address_metro_stations_0_line_name,employer_name,professional_roles_0_name,address_metro_stations_3_station_name,address_metro_stations_3_line_name,department_name,category
0,3284,4395,167,14,14,61563,84,0,0,139,7
1,3284,4395,34,19,19,95441,46,0,0,139,10
2,3284,4395,283,5,5,91973,46,0,0,139,12
3,1987,4395,99,2,2,46854,46,66,34,139,7
4,3466,4395,1,7,7,128549,55,0,0,139,3
...,...,...,...,...,...,...,...,...,...,...,...
709519,4618,6209,392,13,13,113906,115,0,0,438,9
709520,4618,6209,337,10,10,76081,28,0,0,139,2
709521,3284,4395,392,13,13,17924,115,0,0,139,9
709522,2514,3390,249,25,25,62999,84,0,0,139,7


In [None]:
# Target and feature matrix: label-encoded columns, one-hot columns and scaled numerics
# placed side by side (text columns are not included here).
y = df['salary']
X = pd.concat(
    [df[label_columns], encoded_ohe_data, num_df],
    axis='columns',
)

In [None]:
# Two-stage split: 60% train, then the remaining 40% is halved into test and validation.
X_train, X_test_val, y_train, y_test_val = train_test_split(X, y, test_size=0.4, random_state=12345)
X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=12345)

# Each label is paired with the variable of the same name (validation -> X_val, test -> X_test).
print(f'Размеры выборок: Обучающая {X_train.shape}, Валидационная {X_val.shape}, Тестовая {X_test.shape}')

Размеры выборок: Обучающая (425714, 70), Валидационная (141905, 70), Тестовая (141905, 70)


### DecisionTreeRegressor

In [None]:
# Decision tree trained on log(salary); TransformedTargetRegressor applies np.log to the
# target before fitting and np.exp to the predictions.
model_dtr = DecisionTreeRegressor(random_state=12345)
regressor = TransformedTargetRegressor(regressor=model_dtr, func=np.log, inverse_func=np.exp)

# The `regressor__` prefix routes each parameter to the wrapped DecisionTreeRegressor.
param_grid = {
    'regressor__max_depth': list(range(10, 17)),
    'regressor__min_samples_split': [2, 5],
    'regressor__min_samples_leaf': [1, 2],
}

# 3-fold grid search over the grid above, run on all available cores.
grid_search = GridSearchCV(
    regressor,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print(f'Лучшие параметры: {best_params}')

# Score the selected estimator on the held-out X_test.
y_pred = best_model.predict(X_test)
culc_metrics(y_test, y_pred)

Fitting 3 folds for each of 28 candidates, totalling 84 fits
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__min_samples_split=2; total time=   2.2s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__min_samples_split=2; total time=   2.4s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__min_samples_split=5; total time=   2.5s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__min_samples_split=2; total time=   2.7s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=2, regressor__min_samples_split=2; total time=   2.6s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=2, regressor__min_samples_split=5; total time=   2.6s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=2, regressor__min_samples_split=2; total time=   2.7s
[CV] END regressor__max_depth=10, regressor__min_samples_leaf=2, regressor__min_samples_split=5; total time=   2.8s
[CV] END re

### Ridge (линейная регрессия с L2-регуляризацией)

In [None]:
# Ridge regression: tune the regularization strength with shuffled 5-fold cross-validation.
param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100]
}

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=Ridge(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=kfold
)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
# best_score_ is the mean *negated MSE* across folds, so flipping the sign yields an MSE.
best_score = -grid_search.best_score_
# The line below is labelled RMSE, so report the square root of the mean CV MSE
# rather than the MSE itself.
best_rmse = best_score ** 0.5

print(f"Лучшие параметры: {best_params}")
print(f"Лучший RMSE на VAL кросс-валидации: {best_rmse:.4f}")

Лучшие параметры: {'alpha': 10}
Лучший RMSE на VAL кросс-валидации: 11602114729.5144


In [None]:
# Take the refitted winner of the preceding grid search and score it on the held-out X_test.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
culc_metrics(y_test, y_pred)

Корень из среднеквадратичной ошибки (RMSE): 64151.253113509396
R² Score: 0.2133895932442882
Средняя абсолютная ошибка (MAE): 32800.139580397095
Средняя абсолютная процентная ошибка (SMAPE): 36.36%
Медианная абсолютная ошибка (MedAE): 23599.38339362858


### RandomForestRegressor

In [None]:
# Random forest with 100 trees and a fixed seed, trained on the untransformed salary.
model = RandomForestRegressor(random_state=123, n_estimators=100)

# fit() returns the fitted estimator, so training and prediction can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

culc_metrics(y_test, y_pred)

Корень из среднеквадратичной ошибки (RMSE): 44484.30448880008
R² Score: 0.6217638720661233
Средняя абсолютная ошибка (MAE): 21484.423670028984
Средняя абсолютная процентная ошибка (SMAPE): 24.32%
Медианная абсолютная ошибка (MedAE): 12514.5


### CatBoost

In [None]:
# Feature matrix for CatBoost: the tabular features plus the raw text columns.
y = df['salary']
X = pd.concat(
    [df[label_columns], encoded_ohe_data, num_df, df[text_columns]],
    axis=1,
)

# NOTE(review): this is a single 60/40 split, so X_test is now the whole 40% hold-out,
# whereas the earlier sections scored models on a 20% X_test — metrics are therefore
# not computed on the same rows across sections.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12345, test_size=0.4)

# Positional indices of the text columns inside X, as passed to CatBoost's `text_features`.
text_features_indices = list(map(X.columns.get_loc, text_columns))

In [None]:
# CatBoost with text features and a very small learning rate (1e-4) over 1500 iterations.
model = CatBoostRegressor(
    depth=6,
    iterations=1500,
    learning_rate=0.0001,
    verbose=0,
)
model.fit(X_train, y_train, text_features=text_features_indices)

y_pred = model.predict(X_test)
culc_metrics(y_test, y_pred)

Корень из среднеквадратичной ошибки (RMSE): 72738.67567411887
R² Score: 0.069107422050953
Средняя абсолютная ошибка (MAE): 41696.838558920696
Средняя абсолютная процентная ошибка (SMAPE): 45.99%
Медианная абсолютная ошибка (MedAE): 34411.85295931041


Модель с текстовыми колонками работает очень долго.
Стоит попробовать без них.

In [None]:
# Back to the tabular-only feature matrix (no text columns), with the same 60/40 split.
y = df['salary']
X = pd.concat(
    [df[label_columns], encoded_ohe_data, num_df],
    axis=1,
)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12345, test_size=0.4)

In [None]:
# Same CatBoost configuration as the text-feature run, trained on tabular features only.
model = CatBoostRegressor(
    depth=6,
    iterations=1500,
    learning_rate=0.0001,
    verbose=0,
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
culc_metrics(y_test, y_pred)

Корень из среднеквадратичной ошибки (RMSE): 73759.42644008527
R² Score: 0.04279744307284694
Средняя абсолютная ошибка (MAE): 41187.002324358386
Средняя абсолютная процентная ошибка (SMAPE): 45.45%
Медианная абсолютная ошибка (MedAE): 33597.586754171665


Модель отработала за 16 секунд; RMSE остался примерно на том же уровне, но R² сильно упал (с 0.069 до 0.043).
Стоит попробовать кросс-валидацию и оптимизацию гиперпараметров.

In [None]:
# Base estimator; depth, learning_rate and iterations are all overridden by the grid below.
model = CatBoostRegressor(iterations=1500, learning_rate=0.0001, depth=6, verbose=0)

# 4 x 6 x 5 = 120 candidates, each evaluated with 3-fold cross-validation.
param_grid = dict(
    depth=[5, 6, 7, 8],
    learning_rate=[0.001, 0.005, 0.01, 0.02, 0.05, 0.1],
    iterations=[1000, 1250, 1500, 1750, 2000],
)

grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Score the refitted best candidate on the held-out X_test.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

culc_metrics(y_test, y_pred)
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 120 candidates, totalling 360 fits
Корень из среднеквадратичной ошибки (RMSE): 53996.9639177637
R² Score: 0.48701168692199803
Средняя абсолютная ошибка (MAE): 29299.93847814121
Средняя абсолютная процентная ошибка (SMAPE): 32.66%
Медианная абсолютная ошибка (MedAE): 20747.6798167968
Best parameters found:  {'depth': 6, 'iterations': 1250, 'learning_rate': 0.01}


In [None]:
# Rebuild the feature matrix with the text columns appended, using the same 60/40 split.
y = df['salary']
X = pd.concat(
    [df[label_columns], encoded_ohe_data, num_df, df[text_columns]],
    axis=1,
)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12345, test_size=0.4)

# Positional indices of the text columns inside X, as passed to CatBoost's `text_features`.
text_features_indices = list(map(X.columns.get_loc, text_columns))

In [None]:
# CatBoost with text features again, this time with learning_rate=0.01 and depth=6.
model = CatBoostRegressor(
    depth=6,
    iterations=1500,
    learning_rate=0.01,
    verbose=0,
)
model.fit(X_train, y_train, text_features=text_features_indices)

y_pred = model.predict(X_test)
culc_metrics(y_test, y_pred)

Корень из среднеквадратичной ошибки (RMSE): 54271.6901434194
R² Score: 0.48177843495127703
Средняя абсолютная ошибка (MAE): 29581.659199920625
Средняя абсолютная процентная ошибка (SMAPE): 33.25%
Медианная абсолютная ошибка (MedAE): 21173.341701797523


### XGBoost

In [None]:
# Tabular-only feature matrix and the 60/40 split used by the boosting sections.
y = df['salary']
X = pd.concat(
    [df[label_columns], encoded_ohe_data, num_df],
    axis=1,
)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12345, test_size=0.4)

# Base estimator; n_estimators, learning_rate and max_depth are overridden by the grid.
model = XGBRegressor(objective='reg:squarederror', n_estimators=1250, learning_rate=0.01, max_depth=6, verbosity=0)

# 4 x 3 x 3 = 36 candidates, each evaluated with 3-fold cross-validation.
param_grid = dict(
    max_depth=[5, 6, 7, 8],
    learning_rate=[0.001, 0.01, 0.1],
    n_estimators=[1000, 1500, 2000],
)
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Score the refitted best candidate on the held-out X_test.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

culc_metrics(y_test, y_pred)
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Корень из среднеквадратичной ошибки (RMSE): 55360.58799637801
R² Score: 0.4607748045033424
Средняя абсолютная ошибка (MAE): 30357.43626278008
Средняя абсолютная процентная ошибка (SMAPE): 34.26%
Медианная абсолютная ошибка (MedAE): 21872.0234375
Best parameters found:  {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 2000}


### LightGBM

In [None]:
# Tabular-only feature matrix for LightGBM.
X = pd.concat([df[label_columns], encoded_ohe_data, num_df], axis=1)
y = df['salary']

# Sanitize the feature names: every run of non-word characters (spaces, '#', '.', '-',
# ':' ...) becomes a single underscore; letters (including Cyrillic), digits and '_' are
# kept. Presumably this is needed because LightGBM rejects special characters in feature
# names — TODO confirm. Deriving the names from X.columns, rather than hard-coding the
# list, keeps the renaming correct if the set or order of one-hot columns changes.
new_columns = X.columns.str.replace(r'\W+', '_', regex=True).tolist()
# Two different raw names must not collapse into the same sanitized name.
assert len(set(new_columns)) == len(new_columns), 'sanitized feature names must be unique'

X.columns = new_columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=12345)

In [None]:
# Base estimator; n_estimators, learning_rate and max_depth are overridden by the grid.
model = lgb.LGBMRegressor(objective='regression', n_estimators=1250, learning_rate=0.01, max_depth=6, verbosity=-1)

# 4 x 3 x 3 = 36 candidates, each evaluated with 3-fold cross-validation.
param_grid = dict(
    max_depth=[5, 6, 7, 8],
    learning_rate=[0.001, 0.01, 0.1],
    n_estimators=[1000, 1500, 2000],
)

grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Score the refitted best candidate on the held-out X_test.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

culc_metrics(y_test, y_pred)
print("Best parameters found: ", grid_search.best_params_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Корень из среднеквадратичной ошибки (RMSE): 53473.19215970539
R² Score: 0.496915415856513
Средняя абсолютная ошибка (MAE): 26745.592593777295
Средняя абсолютная процентная ошибка (SMAPE): 29.52%
Медианная абсолютная ошибка (MedAE): 17557.40014850231
Best parameters found:  {'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 1500}
