In [None]:
import pandas as pd

# Load the preprocessed train and test splits from disk
train_df = pd.read_csv('data/processed/train_data.csv')
test_df = pd.read_csv('data/processed/test_data.csv')

# Sanity check: report the shape of both frames, then preview the training data
print(
    f"Train shape: {train_df.shape}",
    f"Test shape: {test_df.shape}",
    sep="\n",
)
print("\nПервые 3 строки train_df:", train_df.head(3), sep="\n")


In [None]:
def prepare_data(train_df, test_df, rare_classes=('NetBIOS', 'LDAP')):
    """Prepare train/test frames with a shared label encoding.

    For both frames this drops the ``is_attack`` column when it is present
    and merges every label listed in ``rare_classes`` into the single label
    ``'RARE'``. A ``LabelEncoder`` is then fitted on the training labels only
    and used to add a ``Label_encoded`` column to both frames.

    The frames passed in are left untouched; new frames are returned.

    Args:
        train_df: Training frame; must contain a ``Label`` column.
        test_df: Test frame; must contain a ``Label`` column.
        rare_classes: Label values to merge into ``'RARE'``.

    Returns:
        A tuple ``(train_df, test_df, le)`` with the prepared frames and the
        fitted encoder.

    Note:
        The encoder only knows the training labels, so the test labels are
        expected to be a subset of them; ``le.transform`` is left to reject
        anything else.
    """
    rare_map = {label: 'RARE' for label in rare_classes}

    prepared = []
    for df in (train_df, test_df):
        # drop() without inplace returns a new frame, and errors='ignore'
        # already covers the "column is absent" case, so no membership
        # check is needed and the caller's frame is never modified.
        df = df.drop(columns='is_attack', errors='ignore')
        prepared.append(df.assign(Label=df['Label'].replace(rare_map)))
    train_df, test_df = prepared

    # Fit on the training labels only so the encoding is fixed by train data
    le = LabelEncoder()
    le.fit(train_df['Label'])

    train_df = train_df.assign(Label_encoded=le.transform(train_df['Label']))
    test_df = test_df.assign(Label_encoded=le.transform(test_df['Label']))

    return train_df, test_df, le

def balance_data(train_df, le, minority_threshold=30000, max_samples=50000,
                 k_neighbors=3, random_state=42):
    """Oversample minority classes of the training frame with SMOTE.

    Every class with fewer than ``minority_threshold`` rows is grown to twice
    its size, capped at ``max_samples``. Classes at or above the threshold
    are left as they are. When no class qualifies, the features and labels
    are returned without resampling.

    Args:
        train_df: Frame containing ``Label`` and ``Label_encoded`` columns;
            all remaining columns are used as features.
        le: Fitted label encoder. Not used here; kept so existing callers
            keep working.
        minority_threshold: Classes with fewer rows than this are oversampled.
        max_samples: Upper bound on the size of an oversampled class.
        k_neighbors: Requested number of SMOTE neighbours. It is reduced
            when the smallest oversampled class has too few rows for it.
        random_state: Seed passed to SMOTE so resampling is reproducible.

    Returns:
        A tuple ``(X, y)`` of features and encoded labels.
    """
    X = train_df.drop(columns=['Label', 'Label_encoded'])
    y = train_df['Label_encoded']

    class_counts = Counter(y)
    strategy = {}
    for cls, count in class_counts.items():
        target = min(count * 2, max_samples)
        # A single-row class has no neighbour to interpolate with, and a
        # target that does not exceed the current size is not oversampling.
        if 1 < count < minority_threshold and target > count:
            strategy[cls] = target

    if not strategy:
        return X, y

    # SMOTE needs more rows in a class than the number of neighbours it uses
    smallest = min(class_counts[cls] for cls in strategy)
    smote = SMOTE(
        sampling_strategy=strategy,
        k_neighbors=min(k_neighbors, smallest - 1),
        random_state=random_state,
    )
    X, y = smote.fit_resample(X, y)
    return X, y

def train_boosting(X_train, y_train, model_type='xgb'):
    """Fit a boosting classifier and return it with the training feature names.

    Args:
        X_train: Training features; must expose ``.columns``.
        y_train: Training labels.
        model_type: ``'xgb'`` selects XGBoost and ``'catboost'`` selects
            CatBoost. Any other value selects LightGBM.

    Returns:
        A tuple ``(model, feature_names)`` where ``feature_names`` is the
        list of ``X_train`` column names.
    """
    # Settings spelled identically by all three libraries
    shared = {'learning_rate': 0.1, 'random_state': 42}

    if model_type == 'xgb':
        model = XGBClassifier(
            n_estimators=500,
            max_depth=6,
            objective='multi:softmax',
            **shared,
        )
    elif model_type == 'catboost':
        model = CatBoostClassifier(
            iterations=500,
            depth=6,
            loss_function='MultiClass',
            verbose=0,
            **shared,
        )
    else:
        # Every unrecognised model_type ends up here
        model = LGBMClassifier(
            n_estimators=500,
            max_depth=6,
            objective='multiclass',
            **shared,
        )

    model.fit(X_train, y_train)

    # Hand back the column names so the caller can select the same test columns
    feature_names = X_train.columns.tolist()
    return model, feature_names

def boosting_pipeline(train_df, test_df, model_type='xgb'):
    """Run preparation, balancing, training and evaluation end to end.

    Prints a classification report for the test frame.

    Args:
        train_df: Raw training frame, passed to ``prepare_data``.
        test_df: Raw test frame, passed to ``prepare_data``.
        model_type: Forwarded to ``train_boosting`` to pick the model.

    Returns:
        A tuple ``(model, le)`` with the fitted model and label encoder.
    """
    # 1. Preparation
    train_df, test_df, le = prepare_data(train_df, test_df)

    # 2. Balancing
    X_train, y_train = balance_data(train_df, le)

    # 3. Training
    model, feature_names = train_boosting(X_train, y_train, model_type)

    # 4. Prediction on exactly the columns the model was trained on
    X_test = test_df[feature_names]
    y_test = test_df['Label_encoded']
    y_pred = model.predict(X_test)

    # Pass the encoded labels explicitly. Without them the report infers the
    # label set from y_test/y_pred and fails when that set is smaller than
    # target_names, i.e. when a class is missing from the test data.
    encoded_labels = list(range(len(le.classes_)))
    print(classification_report(
        y_test,
        y_pred,
        labels=encoded_labels,
        target_names=le.classes_,
        digits=4,
    ))
    return model, le

# Run the pipeline on copies so the loaded frames stay unchanged
model, label_encoder = boosting_pipeline(
    train_df.copy(),
    test_df.copy(),
    model_type='xgb',
)