In [None]:
!pip install --upgrade --force-reinstall numpy pandas scipy
!pip install catboost lightautoml autogluon --upgrade --force-reinstall

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.figure as fgr
from matplotlib.pyplot import figure
import seaborn as sns
import time
import os

import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler,StandardScaler, RobustScaler, MaxAbsScaler, LabelEncoder, OneHotEncoder, PowerTransformer
from sklearn.metrics import mean_squared_error,accuracy_score,recall_score,precision_score,f1_score,roc_auc_score
from sklearn.model_selection import train_test_split, RepeatedKFold, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import make_scorer, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, roc_auc_score
import pylab
from scipy.stats import skew

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
import os
import pandas as pd

# Directory scanned for the "*-training.parquet" / "*-testing.parquet" files.
DATA_DIR = '/content'

# File lists, filled by suffix below.
train_files = []
test_files = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the concatenated frames (and everything derived from it) reproducible.
for filename in sorted(os.listdir(DATA_DIR)):
    if filename.endswith('-training.parquet'):
        train_files.append(os.path.join(DATA_DIR, filename))
    elif filename.endswith('-testing.parquet'):
        test_files.append(os.path.join(DATA_DIR, filename))

# Fail early with a readable message; pd.concat([]) would otherwise raise a
# generic "No objects to concatenate" ValueError.
if not train_files or not test_files:
    raise FileNotFoundError(
        f"Expected '*-training.parquet' and '*-testing.parquet' files in {DATA_DIR!r}; "
        f"found {len(train_files)} training and {len(test_files)} testing file(s)."
    )

# Load and stack the files of each group. ignore_index=True gives the combined
# frame a fresh unique index; without it each file keeps its own index, so
# labels can repeat across files.
train_df = pd.concat([pd.read_parquet(file) for file in train_files], ignore_index=True)
test_df = pd.concat([pd.read_parquet(file) for file in test_files], ignore_index=True)

# Size check
print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
train_df.columns

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical, numerical and high-cardinality groups.

    A column is treated as "categorical-like" when its dtype is object, a
    pandas categorical, or a string dtype. (Checking only ``dtype == "O"``
    would send categorical/string-dtype columns with many levels to the
    numerical group.)

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        Non-categorical-like columns with fewer than ``cat_th`` distinct values
        are reported as categorical.
    car_th : int, default 20
        Categorical-like columns with more than ``car_th`` distinct values are
        reported as high-cardinality and excluded from ``cat_cols``.

    Returns
    -------
    tuple of (list, list, list)
        ``(cat_cols, num_cols, cat_but_cat)``: categorical columns, numerical
        columns, and high-cardinality categorical-like columns.

    Also prints the number of rows, number of columns and the size of each group.
    """
    def _is_categorical_like(series):
        # Checked in this order so object columns short-circuit before the
        # string check.
        return (
            pd.api.types.is_object_dtype(series)
            or isinstance(series.dtype, pd.CategoricalDtype)
            or pd.api.types.is_string_dtype(series)
        )

    columns = list(dataframe.columns)
    # Distinct-value counts are computed once per column and reused below.
    n_unique = {col: dataframe[col].nunique() for col in columns}
    cat_like = {col for col in columns if _is_categorical_like(dataframe[col])}

    num_but_cat = [col for col in columns if col not in cat_like and n_unique[col] < cat_th]
    cat_but_cat = [col for col in columns if col in cat_like and n_unique[col] > car_th]

    cat_cols = [col for col in columns if col in cat_like] + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_cat]

    num_cols = [col for col in columns if col not in cat_like and col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_cat: {len(cat_but_cat)}')

    return cat_cols, num_cols, cat_but_cat


# Classify the training columns and echo the three groups as the cell output.
cat_cols, num_cols, cat_but_cat = grab_col_names(train_df)
(cat_cols, num_cols, cat_but_cat)

In [None]:
# Print the distinct values of every column classified as categorical.
for i in cat_cols:
    print(i, train_df[i].unique())

In [None]:
# Print the number of missing values in each column of the training frame.
for i in train_df.columns:
    print(i, train_df[i].isnull().sum())

In [None]:
def cat_summary(dataframe, col_name, plot=False):
    """Print the value counts and percentage share of ``col_name``; optionally plot them.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains ``col_name``.
    col_name : str
        Column to summarise.
    plot : bool, default False
        When true, a 1x2 figure is drawn (count plot on the left, pie chart with
        a legend on the right) and shown with ``plt.show(block=True)``.
    """
    frequency = dataframe[col_name].value_counts()
    summary = pd.DataFrame({col_name: frequency,
                            "Ratio": 100 * frequency / len(dataframe)})
    print(summary)

    if not plot:
        return

    fig, axs = plt.subplots(1, 2, figsize=(8, 6))

    # Left panel: bar chart of the category frequencies.
    plt.subplot(1, 2, 1)
    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.title("Frequency of " + col_name)
    plt.xticks(rotation=90)

    # Right panel: pie chart; each wedge shows its percentage and absolute count.
    plt.subplot(1, 2, 2)
    values = dataframe[col_name].value_counts()
    total = sum(values)

    def wedge_text(p):
        return '{:.2f}% ({:.0f})'.format(p, p/100 * total)

    legend_entries = []
    for index, value in values.items():
        legend_entries.append('{} - {:.2f}%'.format(index, value/total*100))

    plt.pie(x=values, labels=values.index, autopct=wedge_text)
    plt.title("Frequency of " + col_name)
    plt.legend(labels=legend_entries,
               loc='upper center', bbox_to_anchor=(0.5, -0.2), fancybox=True, shadow=True, ncol=1)
    plt.show(block=True)

# Summarise and plot every categorical column of the training frame.
for col in cat_cols:
    cat_summary(train_df, col, True)

In [None]:
def my_histplot(df, col, ax):
    """Draw a histogram with a KDE overlay of ``df[col]`` on ``ax`` and set the title."""
    column_values = df[col]
    sns.histplot(column_values, kde=True, ax=ax)
    ax.set_title(f'Histogram Plot of {col}')
def my_distplot(df, col, ax):
    """Draw a density-scaled histogram with a KDE curve of ``df[col]`` on ``ax``.

    ``sns.distplot`` is deprecated and scheduled for removal from seaborn; the
    call below is the replacement documented by seaborn for the default
    ``distplot`` output (density histogram plus KDE extended past the data range).
    """
    sns.histplot(df[col], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
    ax.set_title(f'Distribution Plot of {col}')
def my_kdeplot(df, col, ax):
    """Draw a filled kernel-density estimate of ``df[col]`` on ``ax`` and set the title."""
    column_values = df[col]
    sns.kdeplot(column_values, fill=True, ax=ax)
    ax.set_title(f'KDE Plot of {col}')

def my_scatterplot(df, col, ax):
    """Pass ``df[col]`` to ``sns.scatterplot`` on ``ax`` and set the title."""
    column_values = df[col]
    sns.scatterplot(column_values, ax=ax)
    ax.set_title(f'Scatter Plot of {col}')
def my_lineplot(df, col, ax):
    """Pass ``df[col]`` to ``sns.lineplot`` on ``ax`` and set the title."""
    column_values = df[col]
    sns.lineplot(column_values, ax=ax)
    ax.set_title(f'Line Plot of {col}')

def my_pie_chart(df, col, ax):
    """Draw a pie chart of the value counts of ``df[col]`` on ``ax``.

    Wedges are labelled with the distinct values and annotated with their
    percentage to one decimal place.
    """
    counts = df[col].value_counts()
    wedge_names = counts.index
    ax.pie(counts, labels=wedge_names, autopct='%1.1f%%')
    ax.set_title(f'Pie Chart of {col}')
def my_countplot(df, col, ax):
    """Draw a count plot of ``df[col]`` on ``ax`` with a title and vertical x tick labels."""
    column_values = df[col]
    sns.countplot(x=column_values, ax=ax)
    ax.set_title(f'Count Plot of {col}')
    # Re-apply the current tick labels rotated by 90 degrees.
    tick_labels = ax.get_xticklabels()
    ax.set_xticklabels(tick_labels, rotation=90)
def my_boxplot(df, col, ax):
    """Draw a vertical box plot of ``df[col]`` on ``ax`` (no title is set)."""
    column_values = df[col]
    sns.boxplot(y=column_values, ax=ax)
def my_violinplot(df, col, ax):
    """Draw a vertical violin plot of ``df[col]`` on ``ax`` (no title is set)."""
    column_values = df[col]
    sns.violinplot(y=column_values, ax=ax)

def my_heatmap(df, size):
    """Show an annotated heatmap of the pairwise correlations of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``corr()`` matrix is plotted.
    size : tuple or falsy
        Figure size passed to ``plt.figure``; when falsy no new figure is created.
    """
    if size:
        plt.figure(figsize=size)

    correlations = df.corr()
    sns.heatmap(correlations, annot=True, fmt=".1f", cmap='Blues', annot_kws={"size": 12})
    plt.title('Correlation Heatmap')
    plt.show()

def my_vsplot(df, normal_col, label_col):
    """Overlay two bar charts of the value counts of ``df[normal_col]``.

    Red bars count all rows; blue bars count only rows where
    ``df[label_col] == 1``. The figure is created but not shown here.
    """
    plt.figure(figsize=(10, 6), dpi=80)

    all_counts = df[normal_col].value_counts()
    flagged_counts = df.loc[df[label_col] == 1, normal_col].value_counts()

    plt.bar(list(all_counts.index), all_counts.tolist(), color='r')
    plt.bar(list(flagged_counts.index), flagged_counts.tolist(), color='b')

    plt.xlabel(normal_col)
    plt.ylabel('Count')
    plt.legend(['All', label_col])

def plot_charts_grid_single_feature(df, plot_func, size=(12, 4), n_col=1):
    """Draw one chart per column of ``df`` on a grid of subplots and show the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Each column gets its own subplot; nothing happens for a frame without columns.
    plot_func : callable
        Called as ``plot_func(df, column_label, ax)`` for every column.
    size : tuple, default (12, 4)
        Width and height of a single grid cell.
    n_col : int, default 1
        Number of grid columns; the number of rows is derived from it.
    """
    n_plots = len(df.columns)
    if n_plots == 0:
        return

    # Ceiling division: enough rows to hold every column of the frame.
    n_rows = (n_plots + n_col-1) // n_col
    figure_size = (size[0]*n_col, size[1]*n_rows)
    # squeeze=False always yields a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(n_rows, n_col, figsize=figure_size, squeeze=False)
    axes = axes.flatten()

    for ax, label in zip(axes, df.columns):
        plot_func(df, label, ax)
        ax.set_xlabel(label)

    # Hide the grid cells left over after the last column.
    for spare_ax in axes[n_plots:]:
        spare_ax.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
plot_charts_grid_single_feature(train_df[num_cols], my_distplot)

In [None]:
plot_charts_grid_single_feature(train_df[num_cols], my_boxplot, size=(2, 4), n_col=6)

In [None]:
# Flow duration per traffic label.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=train_df, x='Label', y='Flow Duration', ax=ax)
ax.set_title('Flow Duration Distribution for DDoS vs Normal Traffic')
plt.show()

# Mean packet length per protocol, split by traffic label.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=train_df, x='Protocol', y='Packet Length Mean', hue='Label', ax=ax)
ax.set_title('Packet Length Mean by Protocol and Attack Label')
plt.show()

In [None]:
# TCP flag counters whose value distribution is compared across traffic labels.
flag_columns = [
    'SYN Flag Count',
    'ACK Flag Count',
    'FIN Flag Count',
    'RST Flag Count',
]

# One count plot per flag column, bars split by label.
for flag in flag_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=train_df, x=flag, hue='Label', ax=ax)
    ax.set_title(f'{flag} Distribution by Attack Label')
    plt.show()

In [None]:
# Correlation heatmap of all numeric columns. The figure side is derived from
# the number of numeric columns (two thirds of it, floored to a multiple of 2, plus one).
numeric_view = train_df.select_dtypes(include=[np.number])
n_numeric_cols = len(numeric_view.columns) // 3 * 2
heatmap_side = n_numeric_cols + 1
my_heatmap(numeric_view, size=(heatmap_side, heatmap_side))

In [None]:
# First batch of columns removed from the training frame.
remove_cols = [
    'Bwd PSH Flags',
    'Fwd URG Flags',
    'Bwd URG Flags',
    'FIN Flag Count',
    'Fwd Avg Bytes/Bulk',
    'Fwd Avg Packets/Bulk',
    'Fwd Avg Bulk Rate',
    'Bwd Avg Bytes/Bulk',
    'Bwd Avg Packets/Bulk',
    'Bwd Avg Bulk Rate',
    'ECE Flag Count',
    'PSH Flag Count',
]

In [None]:
# Drop the listed columns without mutating in place. errors="ignore" keeps the
# cell re-runnable: a second execution no longer raises KeyError for columns
# that are already gone.
train_df = train_df.drop(columns=remove_cols, errors="ignore")

In [None]:
# Absolute pairwise correlations between the numeric columns.
numerical_df = train_df.select_dtypes(include=[np.number])
corr_matrix = numerical_df.corr().abs()

# Keep only the strict upper triangle so every column pair is inspected once.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(mask)

# A column is flagged when its absolute correlation with any column that
# precedes it exceeds 0.8.
exceeds_threshold = (upper_tri > 0.8).any(axis=0)
to_drop = exceeds_threshold[exceeds_threshold].index.tolist()

numerical_df = numerical_df.drop(columns=to_drop)

In [None]:
to_drop

In [None]:
# Second, hard-coded batch of columns to remove from the training frame.
# NOTE(review): presumably a pasted snapshot of the `to_drop` list computed by
# the correlation filter -- confirm; a literal copy can go stale when the data
# or the 0.8 threshold changes.
remove_col1 = ['Bwd Packets Length Total',
 'Fwd Packet Length Mean',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Std',
 'Flow IAT Std',
 'Flow IAT Max',
 'Fwd IAT Total',
 'Fwd IAT Mean',
 'Fwd IAT Std',
 'Fwd IAT Max',
 'Fwd IAT Min',
 'Bwd IAT Std',
 'Bwd IAT Max',
 'Fwd Packets/s',
 'Packet Length Min',
 'Packet Length Max',
 'Packet Length Mean',
 'Packet Length Std',
 'Packet Length Variance',
 'RST Flag Count',
 'Avg Packet Size',
 'Avg Fwd Segment Size',
 'Avg Bwd Segment Size',
 'Subflow Fwd Packets',
 'Subflow Fwd Bytes',
 'Subflow Bwd Packets',
 'Subflow Bwd Bytes',
 'Fwd Act Data Packets',
 'Fwd Seg Size Min',
 'Active Max',
 'Active Min',
 'Idle Mean',
 'Idle Max',
 'Idle Min']

In [None]:
# Drop the second batch of columns without mutating in place. errors="ignore"
# keeps the cell re-runnable: a second execution no longer raises KeyError for
# columns that are already gone.
train_df = train_df.drop(columns=remove_col1, errors="ignore")

In [None]:
# Correlation heatmap of the numeric columns that remain in the training frame.
# The figure side is derived from the number of numeric columns.
numeric_view = train_df.select_dtypes(include=[np.number])
n_numeric_cols = len(numeric_view.columns) // 3 * 2
heatmap_side = n_numeric_cols + 1
my_heatmap(numeric_view, size=(heatmap_side, heatmap_side))

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
# Проверка размеров
print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

In [None]:
# Columns present in both frames, in the order they appear in train_df.
common_columns = train_df.columns.intersection(test_df.columns)

# The alignment below only trims test_df. Report columns that exist only in
# train_df instead of leaving the schema mismatch silent: such columns would be
# filled with NaN for every test row if the two frames are concatenated.
train_only_columns = train_df.columns.difference(test_df.columns)
if len(train_only_columns) > 0:
    print(f"Warning: columns present in train_df but missing from test_df: {list(train_only_columns)}")

# .copy() makes test_df an independent frame rather than a selection of the
# original, so later column assignments act on its own data.
test_df = test_df[common_columns].copy()

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
# Словарь для группировки меток test_df
# Groups the raw test_df labels into the class names used by train_df.
label_mapping_test = {
    'DrDoS_UDP': 'UDP',
    'DrDoS_LDAP': 'LDAP',
    'DrDoS_MSSQL': 'MSSQL',
    'DrDoS_NetBIOS': 'NetBIOS',
    'DrDoS_SNMP': 'UDP',  # SNMP uses UDP
    'DrDoS_DNS': 'UDP',   # DNS usually runs over UDP
    'DrDoS_NTP': 'UDP',   # NTP attacks go over UDP
    'UDP-lag': 'UDPLag',
    'WebDDoS': 'Syn',     # WebDDoS often uses a SYN flood
    'TFTP': 'UDP',        # TFTP runs over UDP
    'Benign': 'Benign',
    'Syn': 'Syn'
}

# Apply the mapping to test_df. Labels that are not keys of the mapping are
# kept unchanged: a plain .map(dict) would turn them into NaN, and would also
# wipe already-mapped values ('UDP', 'LDAP', ...) if this cell is run twice.
test_df['Label'] = test_df['Label'].map(lambda label: label_mapping_test.get(label, label))

In [None]:
# Every label that occurs in the test set must also occur in the training set.
unique_train_labels = train_df['Label'].unique()
unique_test_labels = test_df['Label'].unique()

unknown_labels = set(unique_test_labels).difference(unique_train_labels)
assert not unknown_labels, f"Тестовая выборка содержит неизвестные классы: {unknown_labels}"

In [None]:
# Keep only the training rows whose label is not "Portmap".
train_df = train_df.loc[train_df["Label"].ne("Portmap")]

In [None]:
train_df['Label'].unique()

In [None]:
test_df['Label'].unique()

In [None]:
import pandas as pd

# Class counts in train_df, most frequent first
train_class_distribution = train_df['Label'].value_counts().sort_values(ascending=False)
print("Распределение классов в train_df:\n", train_class_distribution)

# Class counts in test_df, most frequent first
test_class_distribution = test_df['Label'].value_counts().sort_values(ascending=False)
print("\nРаспределение классов в test_df:\n", test_class_distribution)

In [None]:
import matplotlib.pyplot as plt

# Pie chart of the label shares in train_df.
fig, ax = plt.subplots(figsize=(8, 6))
train_df['Label'].value_counts().plot.pie(autopct='%1.1f%%', ax=ax)
ax.set_title("Распределение классов в train_df")
ax.set_ylabel("")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Pie chart of the label shares in test_df.
fig, ax = plt.subplots(figsize=(8, 6))
test_df['Label'].value_counts().plot.pie(autopct='%1.1f%%', ax=ax)
ax.set_title("Распределение классов в test_df")
ax.set_ylabel("")
plt.show()

In [None]:
# Side-by-side bar charts of the class counts: train on the left, test on the right.
fig, (train_ax, test_ax) = plt.subplots(1, 2, figsize=(10, 5))

train_class_distribution.plot(kind='bar', ax=train_ax)
train_ax.set_title("Train dataset")

test_class_distribution.plot(kind='bar', ax=test_ax)
test_ax.set_title("Test dataset")

plt.tight_layout()
plt.show()

In [None]:
# Ratio between the largest and the smallest class of the training set.
# Classes with zero rows are ignored: value_counts() on a categorical column
# still lists unused categories with a count of 0, which would turn the
# division below into inf.
observed_counts = train_class_distribution[train_class_distribution > 0]
max_class_count = observed_counts.max()
min_class_count = observed_counts.min()
imbalance_ratio = max_class_count / min_class_count

print(f"\nКоэффициент дисбаланса: {imbalance_ratio:.2f}")

In [None]:
# 1. Merge the original train and test frames into a single pool.
full_data = pd.concat([train_df, test_df], ignore_index=True)

# 2. Stratified 80/20 split on the original (string) labels.
train_df_new, test_df_new = train_test_split(
    full_data,
    test_size=0.2,
    random_state=42,
    stratify=full_data['Label'],
)

# 3. OVERWRITE the original frames with independent copies of the new split.
train_df, test_df = train_df_new.copy(), test_df_new.copy()

# 4. Encode the labels as integers; the encoder is fitted on the new training
#    split and then applied to both splits.
le = LabelEncoder().fit(train_df['Label'])
train_df['Label_encoded'] = le.transform(train_df['Label'])
test_df['Label_encoded'] = le.transform(test_df['Label'])

# Check the class distribution of both splits.
print("Train распределение:\n", train_df['Label'].value_counts())
print("\nTest распределение:\n", test_df['Label'].value_counts())

# Lookup from label name to its integer code.
label_mapping = {label: code for code, label in enumerate(le.classes_)}
print("Соответствие меток и чисел:\n", label_mapping)