In [None]:
import pandas as pd

# Load the preprocessed train/test splits from disk.
train_df = pd.read_csv('data/processed/train_data.csv')
test_df = pd.read_csv('data/processed/test_data.csv')

# Sanity check: report both shapes, then preview the first rows of train.
print(
    f"Train shape: {train_df.shape}",
    f"Test shape: {test_df.shape}",
    sep="\n",
)
print("\nПервые 3 строки train_df:", train_df.head(3), sep="\n")


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from autogluon.tabular import TabularPredictor
from collections import Counter

### 1. Data preparation ###
def prepare_data(train_df, test_df, rare_classes=('NetBIOS', 'LDAP')):
    """Prepare train/test frames so they share one integer-encoded target.

    Steps:
      1. Drop the ``is_attack`` column when present.
      2. Merge every label listed in ``rare_classes`` into a single
         ``'RARE'`` class.
      3. Fit a ``LabelEncoder`` on the train labels only and add a
         ``Label_encoded`` column to both frames.

    The input frames are NOT modified; new frames are returned.

    Args:
        train_df: Training frame containing a ``Label`` column.
        test_df: Test frame containing a ``Label`` column.
        rare_classes: Label values to merge into ``'RARE'``.

    Returns:
        Tuple ``(train_df, test_df, le)`` with the prepared frames and the
        fitted encoder.

    Raises:
        ValueError: If the test frame contains labels that never occur in
            the train frame (the encoder cannot map them).
    """
    # A non-inplace drop returns new frames, so the assignments below never
    # touch the objects owned by the caller. errors='ignore' already covers
    # the "column is absent" case, no membership check is needed.
    train_df = train_df.drop(columns=['is_attack'], errors='ignore')
    test_df = test_df.drop(columns=['is_attack'], errors='ignore')

    # Merge rare classes into one bucket
    rare_map = {name: 'RARE' for name in rare_classes}
    train_df['Label'] = train_df['Label'].replace(rare_map)
    test_df['Label'] = test_df['Label'].replace(rare_map)

    # Fit the encoder on train only so the class -> id mapping is fixed
    le = LabelEncoder()
    le.fit(train_df['Label'])

    # Fail early with an actionable message instead of the encoder's
    # generic "previously unseen labels" error.
    unseen = set(test_df['Label'].unique()) - set(le.classes_)
    if unseen:
        raise ValueError(
            "test_df contains labels that are absent from train_df: "
            f"{sorted(map(str, unseen))}"
        )

    # Encode the labels
    train_df['Label_encoded'] = le.transform(train_df['Label'])
    test_df['Label_encoded'] = le.transform(test_df['Label'])

    return train_df, test_df, le

### 2. Class balancing ###
def balance_data(train_df, le, k_neighbors=3, random_state=None):
    """Oversample minority classes with SMOTE, keeping the feature set fixed.

    Every class with fewer than 30000 rows is grown to twice its size,
    capped at 50000 rows. Classes with a single row are left untouched
    because SMOTE needs at least two samples of a class to interpolate.

    Args:
        train_df: Frame with feature columns plus ``Label`` and
            ``Label_encoded``.
        le: Fitted label encoder. Currently unused; kept so existing
            callers do not break.
        k_neighbors: Requested number of SMOTE neighbours. It is lowered
            automatically when the smallest oversampled class has too few
            rows; the lowered value applies to all classes.
        random_state: Seed forwarded to SMOTE for reproducible resampling.

    Returns:
        A DataFrame with the feature columns and ``Label_encoded`` (the
        textual ``Label`` column is not included).
    """
    X = train_df.drop(columns=['Label', 'Label_encoded'])
    y = train_df['Label_encoded']

    class_counts = Counter(y)
    strategy = {
        cls: min(count * 2, 50000)
        for cls, count in class_counts.items()
        if 1 < count < 30000
    }

    if strategy:
        # SMOTE looks up k neighbours inside each class, so k must stay
        # below the size of the smallest class being oversampled.
        smallest = min(class_counts[cls] for cls in strategy)
        effective_k = max(1, min(k_neighbors, smallest - 1))
        smote = SMOTE(
            sampling_strategy=strategy,
            k_neighbors=effective_k,
            random_state=random_state,
        )
        X, y = smote.fit_resample(X, y)

    balanced_data = pd.DataFrame(X)
    balanced_data['Label_encoded'] = y
    return balanced_data

### 3. AutoGluon training ###
def train_autogluon(train_data, time_limit=3600):
    """Configure a TabularPredictor and fit it on ``train_data``.

    The predictor targets the ``Label_encoded`` column as a multiclass
    problem scored with weighted F1. Fitting uses the ``best_quality``
    preset, is bounded by ``time_limit``, and is restricted to the model
    keys listed in the hyperparameter dictionary below.

    Returns whatever ``TabularPredictor.fit`` returns.
    """
    # Two GBM configurations (one with extra_trees), default CAT and XGB,
    # and a single gini-criterion RF.
    model_configs = {
        'GBM': [
            {'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}},
            {},
        ],
        'CAT': {},
        'XGB': {},
        'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini'}}],
    }

    unfitted = TabularPredictor(
        label='Label_encoded',
        problem_type='multiclass',
        eval_metric='f1_weighted',
    )
    fitted = unfitted.fit(
        train_data,
        presets='best_quality',
        time_limit=time_limit,
        hyperparameters=model_configs,
    )
    return fitted

### 4. Main pipeline ###
def autogluon_pipeline(train_df, test_df, time_limit=3600):
    """Run the full flow: prepare, balance, train, predict and report.

    Prints the predictor leaderboard and the evaluation metrics computed on
    the test frame, then returns ``(predictor, label_encoder)``.
    """
    # Prepare both frames and obtain the fitted label encoder
    prepared_train, prepared_test, le = prepare_data(train_df, test_df)

    # Oversample the training data, then fit the models on it
    resampled_train = balance_data(prepared_train, le)
    predictor = train_autogluon(resampled_train, time_limit)

    # Predict on the test features only (both target columns removed)
    test_features = prepared_test.drop(
        ['Label', 'Label_encoded'], axis=1, errors='ignore'
    )
    test_target = prepared_test['Label_encoded']
    predictions = predictor.predict(test_features)

    # Report
    print("Лучшие модели:")
    leaderboard = predictor.leaderboard()
    print(leaderboard)

    print("\nClassification Report:")
    metrics = predictor.evaluate_predictions(
        y_true=test_target,
        y_pred=predictions,
        auxiliary_metrics=True,
    )
    print(metrics)

    return predictor, le

### Run ###
# Pass copies so the frames loaded above stay as they were read from disk.
predictor, label_encoder = autogluon_pipeline(
    train_df=train_df.copy(),
    test_df=test_df.copy(),
    time_limit=3600,
)