In [22]:
import pandas as pd
pd.set_option('display.max_columns', None)
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import warnings
warnings.filterwarnings("ignore")

In [23]:
# Load the prepared dataset from the working directory.
df = pd.read_csv('final_data.csv')
# Fill missing gross-salary flags with False and assign the result back.
# Calling fillna(inplace=True) on `df.salary_gross` is chained assignment:
# under pandas Copy-on-Write it updates a temporary object and leaves `df`
# untouched, and the warning about it is hidden by the global warning filter.
df['salary_gross'] = df['salary_gross'].fillna(False)
df.shape

(709524, 43)

In [9]:
# Categorical features; encoded further down (one-hot or label encoding,
# depending on the number of distinct values).
cat_columns = [
    'premium',
    'has_test',
    'response_letter_required',
    'area_name',
    'salary_currency',
    'salary_gross',
    'type_name',
    'address_city',
    'address_metro_station_name',
    'address_metro_line_name',
    'address_metro_stations_0_line_name',
    'archived',
    'employer_name',
    'employer_accredited_it_employer',
    'employer_trusted',
    'schedule_name',
    'accept_temporary',
    'professional_roles_0_name',
    'accept_incomplete_resumes',
    'experience_name',
    'employment_name',
    'address_metro_stations_3_station_name',
    'address_metro_stations_3_line_name',
    'working_time_intervals_0_name',
    'working_time_modes_0_name',
    'working_days_0_name',
    'branding_type',
    'branding_tariff',
    'department_name',
    'insider_interview_id',
    'brand_snippet_logo',
    'brand_snippet_picture',
    'brand_snippet_background_color',
    'brand_snippet_background_gradient_angle',
    'brand_snippet_background_gradient_color_list_0_position',
    'brand_snippet_background_gradient_color_list_1_position',
    'category',
]
# Free-text features; handed to CatBoost as text features in the first experiment.
text_columns = [
    'name',
    'snippet_requirement',
    'snippet_responsibility',
]
# Numeric features; standardized before modelling.
num_columns = [
    'name_length',
    'length',
]

In [10]:
# Standardize the numeric features.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook.
scaler = StandardScaler()
num_df = pd.DataFrame(
    scaler.fit_transform(df[num_columns]),
    columns=num_columns,
    # Keep the original row labels: this frame is later concatenated with
    # slices of `df` along axis=1, and pd.concat aligns rows on the index.
    index=df.index,
)

In [11]:
# Split the categorical columns by cardinality: more than 10 distinct values
# -> label encoding, otherwise -> one-hot encoding. The cardinality is
# computed once for all columns (missing values are not counted).
cardinality = df[cat_columns].nunique()
label_columns = [column for column in cat_columns if cardinality[column] > 10]
ohe_columns = [column for column in cat_columns if cardinality[column] <= 10]

# Columns that already have a proper bool dtype.
to_bool = list(df[cat_columns].select_dtypes(include=['bool']).columns)

# These two flags are converted explicitly. Missing values are filled with
# False first, because NaN is truthy and astype(bool) would turn it into 1.
# NOTE(review): assumes a missing flag should be treated as False - confirm.
flag_columns = ['salary_gross', 'employer_accredited_it_employer']
df[flag_columns] = df[flag_columns].fillna(False).astype(bool).astype(int)
df[to_bool] = df[to_bool].astype(int)

# One-hot encode the low-cardinality columns, dropping the first level of each.
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe_encoded = ohe.fit_transform(df[ohe_columns])
ohe_feature_names = ohe.get_feature_names_out(ohe_columns).tolist()
# Reuse df's index so the later axis=1 concat aligns row by row.
encoded_ohe_data = pd.DataFrame(ohe_encoded, columns=ohe_feature_names, index=df.index)

# Label-encode the high-cardinality columns in place. Each column gets its own
# fitted encoder, stored in `label_encoders`, so the value -> code mapping of
# every column stays available (a single shared encoder is refitted on each
# call and only remembers the last column).
label_encoders = {}
label_encoder = LabelEncoder()  # stays defined even when label_columns is empty
for col in label_columns:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

Подготовка данных с текстовыми колонками

In [None]:
# Feature matrix: label-encoded columns, one-hot columns, scaled numerics
# and the raw text columns, joined side by side.
X = pd.concat(
    [
        df[label_columns],
        encoded_ohe_data,
        num_df,
        df[text_columns],
    ],
    axis=1,
)
y = df['salary']

# 60/40 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=12345,
)

# Positions of the text columns inside X; passed to CatBoost as text features.
text_features_indices = []
for col in text_columns:
    text_features_indices.append(X.columns.get_loc(col))

### CatBoost

In [None]:
# Baseline CatBoost regressor trained on all features, text columns included.
model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.0001,
    depth=6,
    verbose=0,
)
model.fit(X_train, y_train, text_features=text_features_indices)
y_pred = model.predict(X_test)

# Hold-out metrics: MSE (reported as its square root) and R².
r2 = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
rmse = test_mse ** 0.5

print(f'Корень из среднеквадратичной ошибки (RMSE): {rmse}')
print(f"R² Score: {r2}")

Корень из среднеквадратичной ошибки (RMSE): 72738.67567411887
R² Score: 0.069107422050953


Модель с текстовыми колонками обучается очень долго — 17 минут.
Стоит попробовать обучить её без них.

In [16]:
# Feature matrix without the text columns: label-encoded, one-hot and
# scaled numeric features only.
X = pd.concat(
    [
        df[label_columns],
        encoded_ohe_data,
        num_df,
    ],
    axis=1,
)
y = df['salary']

# Same 60/40 split and seed as in the experiment with text columns.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=12345,
)

In [19]:
# Same baseline CatBoost configuration, now fitted without text features.
model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.0001,
    depth=6,
    verbose=0,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out metrics: MSE (reported as its square root) and R².
r2 = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
rmse = test_mse ** 0.5

print(f'Корень из среднеквадратичной ошибки (RMSE): {rmse}')
print(f"R² Score: {r2}")

Корень из среднеквадратичной ошибки (RMSE): 73759.42644008527
R² Score: 0.04279744307284694


Модель отработала за 16 секунд: RMSE остался примерно на том же уровне (73759 против 72739), но R² заметно упал (0.043 против 0.069).
Стоит попробовать кросс-валидацию и оптимизацию гиперпараметров.

In [20]:
# Hyperparameter grid: 3 x 3 x 3 = 27 combinations, each evaluated with
# 3-fold cross-validation on the training split.
param_grid = {
    'depth': [4, 6, 10],
    'learning_rate': [0.0001, 0.01, 0.1],
    'iterations': [500, 1500, 2500],
}

# Base estimator; the grid values replace its depth / learning_rate / iterations.
model = CatBoostRegressor(iterations=1500, learning_rate=0.0001, depth=6, verbose=0)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Score the best configuration found by the search on the hold-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
rmse = test_mse ** 0.5

print(f'Корень из среднеквадратичной ошибки (RMSE): {rmse}')
print(f"R² Score: {r2}")

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Корень из среднеквадратичной ошибки (RMSE): 53388.5185270501
R² Score: 0.498507401337542


In [21]:
# Report the hyperparameter combination selected by the grid search.
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

Best parameters found:  {'depth': 6, 'iterations': 1500, 'learning_rate': 0.01}
