In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

pd.set_option('display.max_columns', None)

In [None]:
# Load the prepared dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('final_data.csv')
# A missing `salary_gross` flag is treated as False.
# The result is assigned back to the frame instead of calling
# `df.salary_gross.fillna(..., inplace=True)`: an in-place method on a column
# accessed through the frame is chained assignment, which emits a warning on
# pandas 2.x and does not modify `df` at all once Copy-on-Write is active.
df['salary_gross'] = df['salary_gross'].fillna(False)
df.shape

  df = pd.read_csv('final_data.csv')


(709524, 43)

### Mean Baseline model

In [None]:
# Constant baseline: predict the mean salary for every row and score it
# against the same rows the mean was computed from.
salary_true = df['salary'].values
mean_salary = df['salary'].mean()
baseline_pred = np.full(df.shape[0], mean_salary)

mse = mean_squared_error(salary_true, baseline_pred)
r2 = r2_score(salary_true, baseline_pred)

print(f'Корень из среднеквадратичной ошибки (RMSE): {mse**0.5}')

Корень из среднеквадратичной ошибки (RMSE): 99590.3098849821


### Подготовка данных к обучению

In [None]:
# Columns treated as categorical features. The order is significant: it
# determines the column order of the encoded feature matrix built later.
cat_columns = [
    'premium',
    'has_test',
    'response_letter_required',
    'area_name',
    'salary_currency',
    'salary_gross',
    'type_name',
    'address_city',
    'address_metro_station_name',
    'address_metro_line_name',
    'address_metro_stations_0_line_name',
    'archived',
    'employer_name',
    'employer_accredited_it_employer',
    'employer_trusted',
    'schedule_name',
    'accept_temporary',
    'professional_roles_0_name',
    'accept_incomplete_resumes',
    'experience_name',
    'employment_name',
    'address_metro_stations_3_station_name',
    'address_metro_stations_3_line_name',
    'working_time_intervals_0_name',
    'working_time_modes_0_name',
    'working_days_0_name',
    'branding_type',
    'branding_tariff',
    'department_name',
    'insider_interview_id',
    'brand_snippet_logo',
    'brand_snippet_picture',
    'brand_snippet_background_color',
    'brand_snippet_background_gradient_angle',
    'brand_snippet_background_gradient_color_list_0_position',
    'brand_snippet_background_gradient_color_list_1_position',
    'category',
]

# Free-text columns.
text_columns = [
    'name',
    'snippet_requirement',
    'snippet_responsibility',
]

# Numeric columns.
num_columns = [
    'name_length',
    'length',
]

In [None]:
# Standardize the numeric columns with StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/validation/test split, so hold-out statistics leak into the features.
# Consider fitting it on the training rows only (e.g. inside a Pipeline).
scaler = StandardScaler()
num_df = pd.DataFrame(
    scaler.fit_transform(df[num_columns]),
    columns=num_columns,
    # Reuse df's row labels: pd.concat(axis=1) aligns on the index, so a fresh
    # RangeIndex would only line up if df itself still has a default index.
    index=df.index,
)

In [None]:
# Split the categorical columns by cardinality: more than 10 distinct values
# -> integer label codes, otherwise -> one-hot encoding.
# nunique() ignores NaN by default.
cardinality = df[cat_columns].nunique()
label_columns = [column for column in cat_columns if cardinality[column] > 10]
ohe_columns = [column for column in cat_columns if cardinality[column] <= 10]

# Columns already stored with bool dtype (collected before any conversion below).
to_bool = list(df[cat_columns].select_dtypes(include=['bool']).columns)

# These two flags are coerced to 0/1 explicitly. A plain `.astype(bool)` maps
# NaN to True, so missing values are masked out first and end up as 0.
flag_columns = ['salary_gross', 'employer_accredited_it_employer']
flags = df[flag_columns]
df[flag_columns] = (flags.notna() & flags.astype(bool)).astype(int)
df[to_bool] = df[to_bool].astype(int)

# One-hot encode the low-cardinality columns, dropping the first level of each.
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe_encoded = ohe.fit_transform(df[ohe_columns])
ohe_feature_names = ohe.get_feature_names_out(ohe_columns).tolist()
encoded_ohe_data = pd.DataFrame(
    ohe_encoded,
    columns=ohe_feature_names,
    index=df.index,  # keep df's row labels so a later column-wise concat aligns
)

# Integer-code the high-cardinality columns in place. One fitted encoder is
# kept per column so the mapping can be inverted or reused later; refitting a
# single shared instance would retain only the last column's classes.
# After the loop `label_encoder` is the encoder of the last column, as before.
# NOTE(review): the codes follow sorted category order and carry no real
# ordinal meaning; encoders are fitted on the full dataset, before the split.
label_encoders = {}
for col in label_columns:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder
df[label_columns]

Unnamed: 0,area_name,address_city,address_metro_station_name,address_metro_line_name,address_metro_stations_0_line_name,employer_name,professional_roles_0_name,address_metro_stations_3_station_name,address_metro_stations_3_line_name,department_name,category
0,3284,4395,167,14,14,61563,84,0,0,139,7
1,3284,4395,34,19,19,95441,46,0,0,139,10
2,3284,4395,283,5,5,91973,46,0,0,139,12
3,1987,4395,99,2,2,46854,46,66,34,139,7
4,3466,4395,1,7,7,128549,55,0,0,139,3
...,...,...,...,...,...,...,...,...,...,...,...
709519,4618,6209,392,13,13,113906,115,0,0,438,9
709520,4618,6209,337,10,10,76081,28,0,0,139,2
709521,3284,4395,392,13,13,17924,115,0,0,139,9
709522,2514,3390,249,25,25,62999,84,0,0,139,7


In [None]:
# Feature matrix, assembled column-wise in this order: label-coded
# categoricals, one-hot columns, scaled numeric columns.
feature_parts = [df[label_columns], encoded_ohe_data, num_df]
X = pd.concat(feature_parts, axis=1)

# Regression target.
y = df['salary']

In [None]:
# Two-stage split: 60% train, then the remaining 40% is halved into
# test and validation (20% each). Fixed seeds keep the split reproducible.
X_train, X_test_val, y_train, y_test_val = train_test_split(X, y, test_size=0.4, random_state=12345)
X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=12345)

# Report each split under its own label (validation -> X_val, test -> X_test);
# the two were previously printed under each other's names.
print(f'Размеры выборок: Обучающая {X_train.shape}, Валидационная {X_val.shape}, Тестовая {X_test.shape}')

Размеры выборок: Обучающая (425714, 70), Валидационная (141905, 70), Тестовая (141905, 70)


### DecisionTreeRegressor

In [None]:
# Decision tree trained on log(salary). TransformedTargetRegressor applies
# np.log to the target when fitting and np.exp to predictions, so the grid
# search scores and the final RMSE are on the original salary scale.
# NOTE: np.log requires strictly positive targets.
model_dtr = DecisionTreeRegressor(random_state=12345)
regressor = TransformedTargetRegressor(regressor=model_dtr, func=np.log, inverse_func=np.exp)

# 7 depths x 2 split sizes x 2 leaf sizes = 28 candidates.
param_grid = {
    'regressor__max_depth': list(range(10, 17)),
    'regressor__min_samples_split': [2, 5],
    'regressor__min_samples_leaf': [1, 2],
}

grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

best_model, best_params = grid_search.best_estimator_, grid_search.best_params_
print(f'Лучшие параметры: {best_params}')

# Score the refitted best estimator on the X_test / y_test split.
y_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Корень из среднеквадратичной ошибки (RMSE): {test_mse**0.5}')

Fitting 3 folds for each of 28 candidates, totalling 84 fits
Лучшие параметры: {'regressor__max_depth': 15, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 5}
Корень из среднеквадратичной ошибки (RMSE): 49008.688268060476


### Ridge (linear regression with L2 regularization)

In [None]:
# Ridge regression: tune the regularization strength `alpha` with shuffled
# 5-fold cross-validation on the training split.
param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100]
}

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=Ridge(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=kfold
)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
# `best_score_` is the mean negated MSE over the folds, so after flipping the
# sign this is an MSE, not an RMSE.
best_score = -grid_search.best_score_
# Take the square root so the value printed under the "RMSE" label really is
# on the RMSE scale (root of the mean cross-validated MSE).
best_rmse = best_score ** 0.5

print(f"Лучшие параметры: {best_params}")
print(f"Лучший RMSE на VAL кросс-валидации: {best_rmse:.4f}")

Лучшие параметры: {'alpha': 10}
Лучший RMSE на VAL кросс-валидации: 11602114729.5144


In [None]:
# Evaluate the best estimator from the preceding grid search on the
# X_test / y_test split and report the RMSE.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Корень из среднеквадратичной ошибки (RMSE): {test_mse**0.5}')

Корень из среднеквадратичной ошибки (RMSE): 64151.25311350936


### RandomForestRegressor

In [None]:
# Random forest trained on log(salary); predictions are mapped back with
# np.exp by TransformedTargetRegressor, so all metrics below are on the
# original salary scale. RandomForestRegressor is imported in the notebook's
# top import cell together with the other dependencies.
# NOTE: np.log requires strictly positive targets.
model = RandomForestRegressor(random_state=123)

regressor = TransformedTargetRegressor(
    regressor=model,
    func=np.log,
    inverse_func=np.exp
)

# 1 x 2 x 2 x 2 = 8 candidates; only shallow trees (depth 2 or 4) are searched.
param_grid = {
    'regressor__n_estimators': [50],
    'regressor__max_depth': [2, 4],
    'regressor__min_samples_split': [2, 5],
    'regressor__min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

print("Лучшие параметры: ", grid_search.best_params_)

# Score the refitted best estimator on the X_test / y_test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Корень из среднеквадратичной ошибки (RMSE): {test_mse**0.5}')
print(f"R^2: {r2}")


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Лучшие параметры:  {'regressor__max_depth': 4, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 50}
Корень из среднеквадратичной ошибки (RMSE): 68896.36280691826
R^2: 0.09271863741736486
