In [1]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
import joblib
import numpy as np

In [2]:
# Load the dataset and replace every NaN with an empty string in a single chained step.
# NOTE(review): fillna("") is applied to all columns, so a numeric column that contains
# gaps presumably ends up with object dtype afterwards — confirm this is intended.
data = pd.read_csv("clear_data_fin.csv", low_memory=False).fillna("")

In [3]:
# Keep only the columns that are used further down (features plus 'salary').
selected_columns = [
    'academic_degree',
    'accommodation_capability',
    'additional_requirements',
    'busy_type',
    'career_perspective',
    'education',
    'education_speciality',
    'is_mobility_program',
    'need_medcard',
    'other_vacancy_benefit',
    'position_requirements',
    'position_responsibilities',
    'regionName',
    'regionNameTerm',
    'company_business_size',
    'required_certificates',
    'required_drive_license',
    'required_experience',
    'salary',
    'schedule_type',
    'professionalSphereName',
    'languageKnowledge',
    'hardSkills',
    'softSkills',
]
data = data[selected_columns]

In [4]:
# Target variable: parse 'salary' as a number; values that cannot be parsed become NaN.
y = pd.to_numeric(data['salary'], errors='coerce')

# A row without a numeric target carries no training signal, so drop such rows from
# both the target and the features; filtering with one shared mask keeps them aligned.
has_target = y.notna()
y = y[has_target]

# Remove the columns that must not be used as features (the target itself).
useless_columns = ['salary']
X = data.loc[has_target].drop(columns=useless_columns, errors='ignore')

In [6]:
# Categorical features are the columns that pandas stores with the generic object dtype.
categorical_features = [col for col in X.columns if X[col].dtype == 'object']

# Label missing values BEFORE casting to str: astype(str) turns NaN into the literal
# text 'nan', after which fillna() would have nothing left to replace.
for feature in categorical_features:
    X[feature] = X[feature].fillna('missing').astype(str)

# Coerce every remaining column to a numeric dtype; entries that are missing or cannot
# be parsed as numbers are replaced with the sentinel value -1.
for col in X.columns:
    if col not in categorical_features:
        X[col] = pd.to_numeric(X[col], errors='coerce').fillna(-1)

In [7]:
# Train the model on 100% of the data (no rows are held out in this cell).
model_params = dict(
    iterations=800,
    learning_rate=0.05,
    depth=10,
    task_type='GPU',
    verbose=100,  # verbose=100 matches the every-100-iterations progress log shown below
)
model = CatBoostRegressor(**model_params)

# Bundle features, target and the list of categorical columns into a CatBoost Pool.
train_pool = Pool(data=X, label=y, cat_features=categorical_features)
model.fit(train_pool)

0:	learn: 24018.0606754	total: 112ms	remaining: 1m 29s
100:	learn: 17843.2550698	total: 7.96s	remaining: 55.1s
200:	learn: 17313.6297857	total: 15.9s	remaining: 47.3s
300:	learn: 17017.9313238	total: 23.9s	remaining: 39.7s
400:	learn: 16809.5044282	total: 32s	remaining: 31.9s
500:	learn: 16627.2291707	total: 40.1s	remaining: 24s
600:	learn: 16468.9071301	total: 48.6s	remaining: 16.1s
700:	learn: 16354.1240704	total: 56.9s	remaining: 8.04s
799:	learn: 16238.6508214	total: 1m 5s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1e05e6bc050>

In [8]:
# Persist the trained model to a file in the current working directory.
joblib.dump(value=model, filename='catboost_model.joblib')

['catboost_model.joblib']