In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import QuantileTransformer
import joblib

class FeaturePreprocessor(BaseEstimator, TransformerMixin):
    AVAILABLE_TS_FEATURES = {
        'borrow_timestamp':{'day', 'dayofyear', 'week'},
        'first_tx_timestamp':{'day','dayofweek','dayofyear','hour','hour_cos','hour_sin','minute','second','week'},
        'last_tx_timestamp':{'week', 'dayofyear'},
        'risky_first_tx_timestamp':{'day','dayofweek','dayofyear','hour','hour_cos','hour_sin','minute','second','week'},
        'risky_last_tx_timestamp':{'dayofyear'},
    }

    def __init__(self,
                 timestamp_columns=None,
                 timestamp_features=None,
                 numeric_columns=None,
                 quantile_output_distribution='uniform',
                 quantile_n_quantiles=1000,
                 random_state=42):
        self.timestamp_columns = timestamp_columns or ['borrow_timestamp',
                                                       'first_tx_timestamp',
                                                       'last_tx_timestamp',
                                                       'risky_first_tx_timestamp',
                                                       'risky_last_tx_timestamp'
                                                      ]
        self.timestamp_features = timestamp_features if timestamp_features else self.AVAILABLE_TS_FEATURES
        self.numeric_columns = numeric_columns or ['repay_amount_sum_eth']
        self.quantile_output_distribution = quantile_output_distribution
        self.quantile_n_quantiles = quantile_n_quantiles
        self.random_state = random_state

        self.quantile_transformers = {}

    def fit(self, X: pd.DataFrame, y=None):
        X = X.copy()

        # Определим числовые колонки, исключая timestamp
        if self.numeric_columns is None:
            self.numeric_columns_ = X.select_dtypes(include=[np.number]).columns.difference(self.timestamp_columns).tolist()
        else:
            self.numeric_columns_ = [col for col in self.numeric_columns if col not in self.timestamp_columns]

        # Обучим QuantileTransformer по числовым колонкам
        for col in self.numeric_columns_:
            qt = QuantileTransformer(
                output_distribution=self.quantile_output_distribution,
                n_quantiles=min(self.quantile_n_quantiles, X.shape[0]),
                random_state=self.random_state
            )
            qt.fit(X[[col]])
            self.quantile_transformers[col] = qt

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        X = X.copy()
        result = pd.DataFrame(index=X.index)

        # Преобразуем числовые признаки
        for col in self.numeric_columns_:
            qt = self.quantile_transformers.get(col)
            if qt:
                result[f'{col}_qt'] = qt.transform(X[[col]]).flatten()

        # Временные признаки
        for col in self.timestamp_columns:
            ts = pd.to_datetime(X[col], unit='s')
            prefix = col
            temp = {}

            # Сначала извлечём все timestamp признаки
            if 'day' in self.timestamp_features[col]:
                temp[f'{prefix}_day'] = ts.dt.day
            if 'dayofweek' in self.timestamp_features[col]:
                temp[f'{prefix}_dayofweek'] = ts.dt.dayofweek
            if 'dayofyear' in self.timestamp_features[col]:
                temp[f'{prefix}_dayofyear'] = ts.dt.dayofyear
            if 'week' in self.timestamp_features[col]:
                temp[f'{prefix}_week'] = ts.dt.isocalendar().week
            if 'hour' in self.timestamp_features[col]:
                temp[f'{prefix}_hour'] = ts.dt.hour
            if 'minute' in self.timestamp_features[col]:
                temp[f'{prefix}_minute'] = ts.dt.minute
            if 'second' in self.timestamp_features[col]:
                temp[f'{prefix}_second'] = ts.dt.second
            if 'hour_sin' in self.timestamp_features[col]:
                temp[f'{prefix}_hour_sin'] = np.sin(2 * np.pi * ts.dt.hour / 24)
            if 'hour_cos' in self.timestamp_features[col]:
                temp[f'{prefix}_hour_cos'] = np.cos(2 * np.pi * ts.dt.hour / 24)
            temp_df = pd.DataFrame(temp)
            result = pd.concat([result, temp_df], axis=1)

        return result

    def save(self, path: str):
        joblib.dump(self, path)

    @staticmethod
    def load(path: str):
        return joblib.load(path)

# timestamp_columns = ['borrow_timestamp',
#                      'first_tx_timestamp',
#                      'last_tx_timestamp',
#                      'risky_first_tx_timestamp',
#                      'risky_last_tx_timestamp'
#                     ]
# numeric_columns = ['repay_amount_sum_eth']
feature_preproc = FeaturePreprocessor()

In [2]:
raw_data = pd.read_parquet("..\\Data\\raw\\dataset.parquet")

In [3]:
X = raw_data.drop(columns=['target'])
y = raw_data['target']

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
feature_preproc.fit(X_train)

In [6]:
feature_preproc.save("../models/preprocessor.pkl")

Обновим для проверки

In [7]:
del feature_preproc
feature_preproc = FeaturePreprocessor.load("../models/preprocessor.pkl")

In [8]:
class DataPreprocessor():
    COLUMNS = [
        'repay_amount_sum_eth',
        'risk_factor',
        'max_risk_factor',
        'avg_risk_factor',
        'total_available_borrows_avg_eth',
        'time_since_first_deposit',
        'borrow_block_number',
        'risk_factor_above_threshold_daily_count',
        'market_atr',
        'borrow_count',
        'wallet_age',
        'borrow_amount_avg_eth',
        'repay_count',
        'min_eth_ever',
        'deposit_amount_sum_eth',
        'total_available_borrows_eth',
        'incoming_tx_avg_eth',
        'avg_weighted_risk_factor',
        'total_collateral_avg_eth',
        'withdraw_amount_sum_eth',
        'market_natr',
        'market_adxr',
        'risky_first_tx_timestamp',
        'risky_tx_count',
        'outgoing_tx_count',
        'incoming_tx_count',
        'risky_sum_outgoing_amount_eth',
        'market_aroonosc',
        'risky_unique_contract_count',
        'market_macdsignal_macdfix',
        'max_eth_ever',
        'deposit_count',
        'total_balance_eth',
        'time_since_last_liquidated',
        'market_plus_dm',
        'repay_amount_avg_eth',
        'first_tx_timestamp',
        'total_collateral_eth',
        'total_gas_paid_eth',
        'risky_first_last_tx_timestamp_diff',
        'borrow_repay_diff_eth',
        'liquidation_amount_sum_eth',
        'outgoing_tx_sum_eth',
        'outgoing_tx_avg_eth',
        'liquidation_count',
        'incoming_tx_sum_eth',
        'market_macd_macdfix',
        'borrow_amount_sum_eth',
        'market_apo',
        'market_linearreg_slope',
        'withdraw_deposit_diff_if_positive_eth',
        'market_cmo',
        'unique_lending_protocol_count',
        'market_macdsignal_macdext',
        'unique_borrow_protocol_count',
        'market_adx',
        'market_cci',
        'market_fastk'
    ]
    def data_preprocessing(self, df : pd.DataFrame, feature_preprocessor : FeaturePreprocessor):
        return pd.concat([df , feature_preprocessor.transform(df)], axis=1)[self.COLUMNS]
data_preprocessor = DataPreprocessor()

In [9]:
X_train = data_preprocessor.data_preprocessing(X_train, feature_preproc)
X_test = data_preprocessor.data_preprocessing(X_test, feature_preproc)

In [10]:
pd.concat([X_train , y_train], axis=1).to_csv('../Data/processed/train.csv', index=False)
pd.concat([X_test , y_test], axis=1).to_csv('../Data/processed/test.csv', index=False)

In [11]:
from catboost import CatBoostClassifier, Pool

In [12]:
%%time

cb_model = CatBoostClassifier(
    iterations=1000,
    eval_metric='F1',
    loss_function='Logloss',
    random_seed=42,
    early_stopping_rounds=30,
    verbose=False,
    allow_writing_files=False,
    task_type='GPU',
    learning_rate=0.151,
    depth=14,
    l2_leaf_reg= 7.8457,
    bagging_temperature=0.0443,
    border_count=32,
    random_strength=1.4266
)

cb_model.fit(Pool(X_train, y_train), eval_set=Pool(X_test, y_test), use_best_model=True, plot=False)

CPU times: total: 4min 26s
Wall time: 2min 17s


<catboost.core.CatBoostClassifier at 0x1ae8acf29a0>

In [50]:
from sklearn.metrics import classification_report, roc_auc_score
y_pred = cb_model.predict(Pool(X_test, y_test))
y_proba = cb_model.predict_proba(Pool(X_test, y_test))[:, 1]

In [51]:
roc_auc_score(y_test, y_proba)

0.9612124794364876

In [52]:
print(classification_report(y_test, y_pred, output_dict=False))

              precision    recall  f1-score   support

           0       0.89      0.96      0.93     55353
           1       0.92      0.81      0.86     33240

    accuracy                           0.90     88593
   macro avg       0.91      0.89      0.89     88593
weighted avg       0.90      0.90      0.90     88593



In [53]:
cb_model.save_model("../models/catboost_model.cbm")