In [8]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# paths used further down in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import xgboost as xgb


# Ordered sector rules: (sector, substring keywords, whole-word keywords).
# The first rule with any match wins, so earlier sectors take priority.
_SECTOR_RULES = (
    ("tech",
     ('программист', 'разработчик', 'dev', 'developer', 'инженер', 'data', 'аналитик', 'айти', 'ml'),
     # 'it' must be a standalone word: as a substring it also fires on
     # unrelated titles such as "security" or "digital".
     ('it',)),
    ("healthcare",
     ('врач', 'медсестра', 'фельдшер', 'медицин', 'санитар', 'лаборант', 'фармацевт'),
     ()),
    ("education",
     ('учитель', 'преподаватель', 'педагог', 'воспитатель', 'репетитор', 'дефектолог', 'логопед'),
     ()),
    ("management",
     ('директор', 'генеральный', 'управляющий', 'начальник', 'руководитель', 'заведующий',
      'шеф', 'chief', 'head', 'lead', 'ceo', 'cfo', 'заместитель'),
     ()),
    ("logistics",
     ('водитель', 'курьер', 'доставк', 'такси', 'экспедитор', 'логист', 'грузчик', 'кладовщик'),
     ()),
    ("retail_admin",
     ('продавец', 'кассир', 'администратор', 'офис', 'секретарь', 'оператор', 'консультант',
      'менеджер по продажам'),
     ()),
    ("construction",
     ('строитель', 'монтаж', 'электрик', 'сварщик', 'рабочий', 'мастер', 'бригадир', 'механик', 'слесарь'),
     ()),
    ("finance",
     # NOTE: 'аналитик' is also a tech keyword and tech is checked first,
     # so it can never select finance here.
     ('финанс', 'бухгалтер', 'экономист', 'аудитор', 'кредит', 'аналитик', 'юрист', 'адвокат', 'нотариус'),
     ()),
)

# Ordered seniority rules: (level, substring keywords); first match wins.
_LEVEL_RULES = (
    ("senior", ('старший', 'senior', 'lead', 'главный', 'ведущий', 'директор', 'управляющий')),
    ("junior", ('младший', 'junior', 'помощник', 'ассистент', 'стажер', 'практикант', 'студент', 'интерн')),
)


def classify_job(title):
    """Map a free-text job title to a coarse ``(sector, level)`` pair.

    The title is lower-cased, ``ё`` is folded to ``е`` (the keyword lists are
    spelled with ``е``), and every run of characters other than Cyrillic/Latin
    letters (digits, punctuation, hyphens, whitespace) becomes a single space.
    Keywords are then searched in priority order.

    Parameters
    ----------
    title : str or missing value
        Raw job title. NaN/None and blank strings are treated as unknown.

    Returns
    -------
    tuple of (str, str)
        ``sector`` is one of "tech", "healthcare", "education", "management",
        "logistics", "retail_admin", "construction", "finance", "other";
        ``level`` is one of "senior", "junior", "middle".
        ``("unknown", "unknown")`` is returned for missing/blank titles.
    """
    if pd.isna(title) or str(title).strip() == "":
        return ("unknown", "unknown")

    # Fold 'ё' before filtering: it lies outside the 'а-я' range and would
    # otherwise be replaced by a space, splitting words like "стажёр".
    t = str(title).lower().replace('ё', 'е')
    # Collapse all non-letter runs (including hyphens) in one pass so that
    # multi-word keywords see exactly one space between words.
    t = re.sub(r'[^а-яa-z]+', ' ', t).strip()
    tokens = set(t.split())

    sector = "other"
    for name, substrings, whole_words in _SECTOR_RULES:
        if any(word in tokens for word in whole_words) or any(word in t for word in substrings):
            sector = name
            break

    # Seniority level
    level = "middle"
    for name, substrings in _LEVEL_RULES:
        if any(word in t for word in substrings):
            level = name
            break

    return (sector, level)


class BankPreprocessor(BaseEstimator, TransformerMixin):
    """Feature engineering and categorical encoding for the bank income data.

    ``fit`` learns (a) which normalised job titles occur at least
    ``min_job_freq`` times and (b) the set of values seen in every
    object/category column after feature creation. ``transform`` rebuilds the
    same features, replaces values not seen during ``fit`` (including missing
    values) with ``"__unknown__"`` and casts every fitted categorical column
    to a categorical dtype whose category list is derived from the fitted
    values, so the integer codes are identical for every frame transformed by
    the same fitted instance.

    Parameters
    ----------
    min_job_freq : int, default=10
        Minimum number of occurrences for a normalised job title to keep its
        own category; rarer titles are mapped to ``"other"``.

    Attributes
    ----------
    frequent_jobs_ : set of str
        Normalised job titles kept as their own category.
    train_categories_ : dict of {str: set}
        Non-null values observed per categorical column during ``fit``.
    """

    JOB_COLUMN = "dp_ewb_last_employment_position"
    UNKNOWN_CATEGORY = "__unknown__"

    def __init__(self, min_job_freq=10):
        self.min_job_freq = min_job_freq
        # Kept in __init__ so that transform() before fit() keeps working
        # the way it always has (no known categories, only "unknown" job).
        self.frequent_jobs_ = {"unknown"}
        self.train_categories_ = {}

    @staticmethod
    def _normalize_job(title):
        """Return the title as sorted, space-joined lower-case words ("unknown" if missing)."""
        if pd.isna(title):
            return "unknown"
        # Fold 'ё' to 'е': it is outside the 'а-я' range and would split words.
        words = re.findall(r'[а-яa-z]+', str(title).lower().replace('ё', 'е'))
        return " ".join(sorted(words))

    @classmethod
    def _category_levels(cls, known):
        """Return a deterministic, ordered category list for a fitted value set."""
        # Sort on (text, type name) so mixed-type columns still sort and the
        # order does not depend on set iteration order.
        return sorted(set(known) | {cls.UNKNOWN_CATEGORY},
                      key=lambda v: (str(v), type(v).__name__))

    def fit(self, X, y=None):
        """Learn frequent job titles and per-column category sets from ``X``.

        State from a previous ``fit`` call is discarded. Returns ``self``.
        """
        self.frequent_jobs_ = {"unknown"}
        if self.JOB_COLUMN in X.columns:
            counts = X[self.JOB_COLUMN].apply(self._normalize_job).value_counts()
            self.frequent_jobs_ = set(counts[counts >= self.min_job_freq].index)

        features = self._create_features(X)

        # Rebuild from scratch so columns from an earlier fit do not linger.
        self.train_categories_ = {
            col: set(features[col].dropna().unique())
            for col in features.select_dtypes(include=["object", "category"]).columns
        }
        return self

    def transform(self, X):
        """Create features and encode categoricals consistently with ``fit``.

        Returns a new DataFrame; ``X`` itself is not modified.
        """
        X = self._create_features(X)

        # Iterate over the *fitted* columns rather than over the current
        # dtypes: a column that is all-NaN in this frame would be float here
        # and would otherwise be skipped.
        for col, known in self.train_categories_.items():
            if col not in X.columns:
                continue
            # Work on object values so an existing categorical dtype cannot
            # reject the "__unknown__" placeholder.
            values = X[col].astype(object)
            # Unseen values and NaN both become the placeholder.
            values = values.where(values.isin(known), self.UNKNOWN_CATEGORY)
            # Fixed category list => identical integer codes across frames.
            X[col] = values.astype(pd.CategoricalDtype(categories=self._category_levels(known)))

        # Object columns that were not categorical at fit time.
        for col in X.select_dtypes(include=["object"]).columns:
            X[col] = X[col].astype("category")

        return X

    def _create_features(self, X):
        """Return a copy of ``X`` with derived job, region and activity features."""
        X = X.copy()

        # -------- Job title --------
        if self.JOB_COLUMN in X.columns:
            normalized = X[self.JOB_COLUMN].apply(self._normalize_job)
            X["job_simplified"] = normalized.where(normalized.isin(self.frequent_jobs_), "other")

            job_info = X[self.JOB_COLUMN].apply(classify_job)
            X["job_sector"] = [info[0] for info in job_info]
            X["job_level"] = [info[1] for info in job_info]
        else:
            X["job_simplified"] = "unknown"
            X["job_sector"] = "unknown"
            X["job_level"] = "unknown"

        # -------- Region --------
        # Prefer "adminarea", fall back to "addrref"; blank strings count as
        # missing whichever of the two columns is available.
        region_sources = [c for c in ("adminarea", "addrref") if c in X.columns]
        if region_sources:
            region = X[region_sources[0]].replace("", np.nan)
            for col in region_sources[1:]:
                region = region.combine_first(X[col].replace("", np.nan))
            X["region"] = region
            X = X.drop(columns=region_sources)
        else:
            X["region"] = "unknown"

        # -------- Columns removed from the feature set --------
        drop_cols = [
            'city_smart_name', 'period_last_act_ad',
            'dp_address_unique_regions', 'dt',
            'dp_ewb_last_employment_position', 'dp_ewb_last_organization'
        ]
        X = X.drop(columns=[c for c in drop_cols if c in X.columns], errors='ignore')

        # -------- Numeric ratios and activity flags --------
        if 'turn_cur_db_sum_v2' in X.columns and 'turn_cur_cr_sum_v2' in X.columns:
            # The small epsilon avoids division by exactly zero.
            X['debit_credit_ratio'] = X['turn_cur_db_sum_v2'] / (X['turn_cur_cr_sum_v2'] + 1e-9)
        if 'days_to_last_transaction' in X.columns:
            # Coerce like the sibling flags so a non-numeric column yields 0 instead of raising.
            X['recent_txn_flag'] = (
                pd.to_numeric(X['days_to_last_transaction'], errors='coerce') <= 30
            ).astype(int)
        if 'mob_cnt_days' in X.columns:
            X['mobile_user_flag'] = (pd.to_numeric(X['mob_cnt_days'], errors='coerce') > 0).astype(int)
        if 'acard' in X.columns:
            X['card_user_flag'] = (pd.to_numeric(X['acard'], errors='coerce') > 0).astype(int)

        return X




# --- Load the raw tables (";"-separated, "," used as the decimal mark) ---
df_train = pd.read_csv(
    '/content/drive/MyDrive/DATASETS/hackathon_income_train.csv',
    sep=";",
    engine="python",
    decimal=",",
)
df_test = pd.read_csv(
    '/content/drive/MyDrive/DATASETS/hackathon_income_test.csv',
    sep=";",
    engine="python",
    decimal=",",
)

# --- Split the training table into target, sample weights and raw features ---
y = pd.to_numeric(df_train['target'], errors='coerce')
w = pd.to_numeric(df_train['w'], errors='coerce')
X_raw = df_train.drop(columns=['id', 'target', 'w'])

# Keep only rows whose target parsed, is non-negative and below 1e10 in
# magnitude, and whose weight parsed and is strictly positive.
mask = y.notna() & w.notna() & (y >= 0) & (y.abs() < 1e10) & (w > 0)
X_raw = X_raw.loc[mask]
y = y.loc[mask]
w = w.loc[mask]

# --- Fit the preprocessor on the filtered rows and build the training matrix ---
prep = BankPreprocessor(min_job_freq=10)
X_train_proc = prep.fit_transform(X_raw)

# --- Weighted gradient-boosted regressor with an absolute-error objective ---
model = xgb.XGBRegressor(
    objective='reg:absoluteerror',
    tree_method='hist',
    enable_categorical=True,
    n_estimators=582,
    learning_rate=0.0734,
    max_depth=9,
    min_child_weight=10,
    subsample=0.7694,
    colsample_bytree=0.9058,
    reg_alpha=0.41883,
    reg_lambda=0.4814,
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train_proc, y, sample_weight=w, verbose=False)



In [7]:
# Persist the two inference artifacts to the current working directory.
# NOTE(review): despite its name, 'pipeline.pkl' receives only the `model`
# object; the preprocessor is written to its own file, so both files have to
# be loaded together to score new data.
joblib.dump(prep, 'bank_preprocessor.pkl')
joblib.dump(model, 'pipeline.pkl')

['pipeline.pkl']

In [11]:
import pandas as pd
import numpy as np
import joblib

# --- Read the test table and separate the identifier from the features ---
df_test = pd.read_csv(
    '/content/drive/MyDrive/DATASETS/hackathon_income_test.csv',
    sep=";",
    engine="python",
    decimal=",",
)
X_test_raw = df_test.drop(columns=['id'])

# --- Restore the saved preprocessor and model from disk ---
prep = joblib.load('bank_preprocessor.pkl')
model = joblib.load('pipeline.pkl')

# --- Build test features and predict; negative predictions are floored at 0 ---
X_test_proc = prep.transform(X_test_raw)
test_pred = np.clip(model.predict(X_test_proc), a_min=0, a_max=None)

# --- Attach predictions and write the two-column submission file ---
df_test = df_test.assign(target=test_pred)
df_test.loc[:, ['id', 'target']].to_csv('submission.csv', sep=',', decimal='.', index=False)

print("Прогнозирование завершено! Результат сохранен в submission.csv.")

Прогнозирование завершено! Результат сохранен в submission.csv.
