In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import TransformedTargetRegressor

In [2]:
df = pd.read_csv('final_data.csv')
df.salary_gross.fillna(False, inplace=True)
df.shape

  df = pd.read_csv('final_data.csv')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.salary_gross.fillna(False, inplace=True)
  df.salary_gross.fillna(False, inplace=True)


(709524, 43)

### Mean Baseline model

In [5]:
mean_salary = df['salary'].mean()

mse = mean_squared_error(df['salary'].values, np.full(df.shape[0], mean_salary))
r2 = r2_score(df['salary'].values, np.full(df.shape[0], mean_salary))

print(f'Корень из среднеквадратичной ошибки (RMSE): {mse**0.5}')

Корень из среднеквадратичной ошибки (RMSE): 99590.3098849821


### Подготовка данных к обучению

In [6]:
cat_columns = ['premium', 'has_test', 'response_letter_required', 'area_name', 'salary_currency', 'salary_gross', 'type_name', 'address_city', 'address_metro_station_name', 'address_metro_line_name', 'address_metro_stations_0_line_name', 'archived', 'employer_name', 'employer_accredited_it_employer', 'employer_trusted', 'schedule_name', 'accept_temporary', 'professional_roles_0_name', 'accept_incomplete_resumes', 'experience_name', 'employment_name', 'address_metro_stations_3_station_name', 'address_metro_stations_3_line_name', 'working_time_intervals_0_name', 'working_time_modes_0_name', 'working_days_0_name', 'branding_type', 'branding_tariff', 'department_name', 'insider_interview_id', 'brand_snippet_logo', 'brand_snippet_picture', 'brand_snippet_background_color', 'brand_snippet_background_gradient_angle', 'brand_snippet_background_gradient_color_list_0_position', 'brand_snippet_background_gradient_color_list_1_position', 'category']
text_columns = ['name', 'snippet_requirement', 'snippet_responsibility']
num_columns = ['name_length', 'length']

In [7]:
scaler = StandardScaler()
num_df = pd.DataFrame(scaler.fit_transform(df[num_columns]), columns=num_columns)

In [8]:
label_columns = []
ohe_columns = []

for column in cat_columns:
    if df[column].nunique() > 10:
        label_columns.append(column)
    else:
        ohe_columns.append(column)

to_bool = list(df[cat_columns].select_dtypes(include=['bool']).columns)
df[['salary_gross', 'employer_accredited_it_employer']] = df[['salary_gross', 'employer_accredited_it_employer']].astype(bool).astype(int)
df[to_bool] = df[to_bool].astype(int)

ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe_encoded = ohe.fit_transform(df[ohe_columns])
ohe_feature_names = ohe.get_feature_names_out(ohe_columns).tolist()
encoded_ohe_data = pd.DataFrame(ohe_encoded, columns=ohe_feature_names)

label_encoder = LabelEncoder()
for col in label_columns:
    df[col] = label_encoder.fit_transform(df[col])
df[label_columns]

Unnamed: 0,area_name,address_city,address_metro_station_name,address_metro_line_name,address_metro_stations_0_line_name,employer_name,professional_roles_0_name,address_metro_stations_3_station_name,address_metro_stations_3_line_name,department_name,category
0,3284,4395,167,14,14,61563,84,0,0,139,7
1,3284,4395,34,19,19,95441,46,0,0,139,10
2,3284,4395,283,5,5,91973,46,0,0,139,12
3,1987,4395,99,2,2,46854,46,66,34,139,7
4,3466,4395,1,7,7,128549,55,0,0,139,3
...,...,...,...,...,...,...,...,...,...,...,...
709519,4618,6209,392,13,13,113906,115,0,0,438,9
709520,4618,6209,337,10,10,76081,28,0,0,139,2
709521,3284,4395,392,13,13,17924,115,0,0,139,9
709522,2514,3390,249,25,25,62999,84,0,0,139,7


In [9]:
X = pd.concat([df[label_columns], encoded_ohe_data, num_df], axis=1)
y = df['salary']

In [14]:
X_train, X_test_val, y_train, y_test_val, = train_test_split(X, y, test_size=0.4, random_state=12345)
X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=12345)

print(f'Размеры выборок: Обучающая {X_train.shape}, Валидационная {X_test.shape}, Тестовая {X_val.shape}')

162180    220000.0
478536     40000.0
234686     40000.0
366018     42400.0
654961    115000.0
            ...   
73421      67500.0
410992    223000.0
698251     80000.0
297282     70000.0
301119     84197.6
Name: salary, Length: 283810, dtype: float64
Размеры выборок: Обучающая (425714, 70), Валидационная (141905, 70), Тестовая (141905, 70)


In [11]:
model_dtr = DecisionTreeRegressor(random_state=12345)

regressor = TransformedTargetRegressor(
    regressor=model_dtr,
    func=np.log,
    inverse_func=np.exp
)


param_grid = {
    'regressor__max_depth': [10, 11, 12, 13, 14, 15, 16],
    'regressor__min_samples_split': [2, 5], 
    'regressor__min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print(f'Лучшие параметры: {best_params}')


y_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(f'Корень из среднеквадратичной ошибки (RMSE): {test_mse**0.5}')

Fitting 3 folds for each of 28 candidates, totalling 84 fits
Лучшие параметры: {'regressor__max_depth': 15, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 5}
Корень из среднеквадратичной ошибки (RMSE): 49027.495687209725


### Кросс-валидации модели

#### Обучающая выборка

In [25]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

model = LinearRegression()

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')

mean_mse = -np.mean(cv_scores)
std_mse = np.std(cv_scores)

print(f"Средний MSE по кросс-валидации на обучающей выборке: {mean_mse:.4f}")
print(f"Стандартное отклонение MSE на обучающей выборке: {std_mse:.4f}")
print(f"Средний RMSE по кросс-валидации на обучающей выборке: {mean_mse**0.5:.4f}")
print(f"Стандартное отклонение RMSE на обучающей выборке: {std_mse**0.5:.4f}")

Средний MSE по кросс-валидации на обучающей выборке: 11602588176.1473
Стандартное отклонение MSE на обучающей выборке: 11184916946.7310
Средний RMSE по кросс-валидации на обучающей выборке: 107715.3108
Стандартное отклонение RMSE на обучающей выборке: 105758.7677


#### Тестовая выборка

In [27]:
cv_scores_test = cross_val_score(model, X_test, y_test, cv=kfold, scoring='neg_mean_squared_error')

mean_mse_test = -np.mean(cv_scores_test)
std_mse_test = np.std(cv_scores_test)

print(f"Средний MSE по кросс-валидации: {mean_mse_test:.4f}")
print(f"Стандартное отклонение MSE: {std_mse_test:.4f}")
print(f"Средний RMSE по кросс-валидации: {mean_mse_test**0.5:.4f}")
print(f"Стандартное отклонение RMSE: {std_mse_test**0.5:.4f}")

Средний MSE по кросс-валидации: 4105919474.3884
Стандартное отклонение MSE: 660828429.6468
Средний RMSE по кросс-валидации: 64077.4490
Стандартное отклонение RMSE: 25706.5834
