In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report

In [None]:
import pandas as pd

# Kaggle dataset directory
input_dir = '/kaggle/input/open123123121212'

# Load the CSV files
train = pd.read_csv(f'{input_dir}/train.csv')
test = pd.read_csv(f'{input_dir}/test.csv')
submission = pd.read_csv(f'{input_dir}/sample_submission.csv')

# Check the shapes
print("train:", train.shape)
print("test:", test.shape)
print("submission:", submission.shape)

# Peek at the first five training rows.
# (This used to reference `train_df`, a name that is never defined, which
# raised NameError; the frame loaded above is called `train`.)
display(train.head())


In [None]:
# 2) Define X, y and test
X = train.drop(['ID','attack_type'], axis=1)
y = train['attack_type']
# `test` is overwritten by its own result, so on a second run of this cell the
# 'ID' column is already gone; errors='ignore' keeps the cell re-runnable
# instead of raising KeyError.
test = test.drop(['ID'], axis=1, errors='ignore')

In [None]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score

from lightgbm import LGBMClassifier

In [None]:
# 3) 헬퍼 함수 정의
def port_group(port):
    """Bucket a port number into a named range.

    Args:
        port: Port number, or a missing value (None / NaN).

    Returns:
        'well_known' for ports <= 1023, 'registered' for ports <= 49151,
        'dynamic' for larger ports, and 'unknown' when the port is missing.
    """
    # NaN is the only value that is not equal to itself. Without this guard a
    # NaN port fails both comparisons below and is silently labelled
    # 'dynamic', and None raises TypeError.
    if port is None or port != port:
        return 'unknown'
    if port <= 1023:
        return 'well_known'
    if port <= 49151:
        return 'registered'
    return 'dynamic'

def make_features(df):
    """Add the engineered feature columns to ``df`` in place and return it.

    Columns added:
      * ``port_src_grp`` / ``port_dst_grp``: ``port_group`` applied to
        ``port_src`` / ``port_dst``.
      * ``subnet_src`` / ``subnet_dst``: the first two dot-separated parts of
        ``ip_src`` / ``ip_dst`` (a missing address is treated as '0.0.0.0').
      * ``pkt_count_total``, ``avg_pkt_size``, ``byte_ratio_fwd``,
        ``iat_pkt_rate_ratio``, ``tcp_ctrl_ratio``, ``throughput`` and
        ``tcp_win_ratio``: sums and ratios of the raw columns.

    Args:
        df: DataFrame containing the raw columns read below. It is modified
            in place.

    Returns:
        The same ``df`` object. Callers that only rely on the in-place
        mutation can ignore the return value.
    """
    eps = 1e-6  # added to every denominator so a zero total does not divide by zero

    # Port range buckets
    df['port_src_grp'] = df['port_src'].apply(port_group)
    df['port_dst_grp'] = df['port_dst'].apply(port_group)
    # Subnet (first two octets)
    df['subnet_src'] = (
        df['ip_src']
          .fillna('0.0.0.0').astype(str)
          .str.split('.', n=3).str[:2].str.join('.')
    )
    df['subnet_dst'] = (
        df['ip_dst']
          .fillna('0.0.0.0').astype(str)
          .str.split('.', n=3).str[:2].str.join('.')
    )
    # Derived numeric features
    df['pkt_count_total']    = df['pkt_count_fwd'] + df['pkt_count_bwd']
    df['avg_pkt_size']       = (df['rate_fwd_bytes'] + df['rate_bwd_bytes']) / (df['pkt_count_total'] + eps)
    df['byte_ratio_fwd']     = df['rate_fwd_bytes'] / ((df['rate_fwd_bytes'] + df['rate_bwd_bytes']) + eps)
    df['iat_pkt_rate_ratio'] = df['iat_avg_packets'] / ((df['rate_fwd_pkts'] + df['rate_bwd_pkts']) + eps)
    df['tcp_ctrl_ratio']     = (df['tcp_syn_count'] + df['tcp_psh_count'] + df['tcp_rst_count']) / (df['pkt_count_total'] + eps)
    df['throughput']         = (df['rate_fwd_bytes'] + df['rate_bwd_bytes']) / (df['duration'] + eps)
    df['tcp_win_ratio']      = df['tcp_win_fwd_init'] / (df['tcp_win_bwd_init'] + eps)
    return df

# 4) Apply the derived features (make_features adds the columns to each frame in place)
for df_ in (X, test):
    make_features(df_)

# 5) Feature lists
numeric_cols = [
    'port_src','port_dst','duration','pkt_count_fwd','pkt_count_bwd',
    'rate_fwd_pkts','rate_bwd_pkts','rate_fwd_bytes','rate_bwd_bytes',
    'payload_fwd_mean','payload_bwd_mean','tcp_win_fwd_init','tcp_win_bwd_init',
    'tcp_syn_count','tcp_psh_count','tcp_rst_count','iat_avg_packets',
    'pkt_count_total','avg_pkt_size','byte_ratio_fwd','iat_pkt_rate_ratio',
    'tcp_ctrl_ratio','throughput','tcp_win_ratio'
]
categorical_features = ['protocol','port_src_grp','port_dst_grp','subnet_src','subnet_dst']

# 6) Build the pipeline
numeric_transformer = Pipeline([
    ('imputer', IterativeImputer(random_state=42, max_iter=10, sample_posterior=True)),
    ('scaler',  StandardScaler())
])
categorical_transformer = Pipeline([
    # `sparse` was deprecated in scikit-learn 1.2 and removed in 1.4;
    # `sparse_output` is the replacement keyword.
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer,   numeric_cols),
    ('cat', categorical_transformer, categorical_features)
])
pipeline = Pipeline([
    ('preproc', preprocessor),
    ('clf',     LGBMClassifier(random_state=42, n_estimators=200))
])


In [None]:
# 7) Cross-validation (stratified, shuffled 5-fold, macro F1)
cv = StratifiedKFold(shuffle=True, random_state=42, n_splits=5)
scores = cross_val_score(pipeline, X, y, scoring='f1_macro', cv=cv)
mean_f1 = scores.mean()
print(f'5-fold Macro F1 Score: {mean_f1:.4f}')

# 8) Final fit on all training rows, then predict the test set
pred = pipeline.fit(X, y).predict(test)

# 9) Write the submission file
submission['attack_type'] = pred
submission.to_csv('final_submission.csv', index=False)

In [None]:
# Number of samples per attack_type, followed by the same distribution as fractions
class_counts = y.value_counts()
print(class_counts)
y.value_counts(normalize=True)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the class counts, drawn through the Axes returned by pandas.
ax = y.value_counts().plot.bar()
ax.set_title("Class Distribution")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Example: use cross_validate to compare the train and validation macro F1
from sklearn.model_selection import cross_validate
scores = cross_validate(
    pipeline, X, y,
    scoring='f1_macro',
    cv=5,
    return_train_score=True,
)
train_f1 = scores['train_score'].mean()
val_f1 = scores['test_score'].mean()
print("Train F1:", train_f1)
print("Val   F1:", val_f1)

In [None]:
# Upgrade imbalanced-learn in the notebook environment.
# NOTE(review): none of the cells visible here import imblearn -- confirm this install is still needed.
!pip install --upgrade imbalanced-learn

오버샘플링(SMOTE 대신 수동 랜덤 오버샘플링) 적용 및 XGBoost 모델로 변경

In [None]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.utils import resample
from xgboost import XGBClassifier

# -----------------------------------------------------------------------------
# 1) Load the data inside the Kaggle notebook environment
# -----------------------------------------------------------------------------
input_dir = '/kaggle/input/open123123121212'
train, test, submission = (
    pd.read_csv(f'{input_dir}/{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

# Report each frame's shape, then preview the training rows.
for label, frame in (("train:", train), ("test: ", test), ("submission:", submission)):
    print(label, frame.shape)
display(train.head())

# -----------------------------------------------------------------------------
# 2) Prepare X, y and test, and encode the labels
# -----------------------------------------------------------------------------
X_raw    = train.drop(columns=['ID', 'attack_type'])
y_raw    = train['attack_type']
test_raw = test.drop(columns=['ID'])

# String labels -> integer codes 0, 1, 2, ...
le = LabelEncoder().fit(y_raw)
y  = le.transform(y_raw)

# -----------------------------------------------------------------------------
# 3) 헬퍼 함수 & 파생 변수 생성
# -----------------------------------------------------------------------------
def port_group(port):
    """Bucket a port number into a named range.

    Args:
        port: Port number, or a missing value (None / NaN).

    Returns:
        'well_known' for ports <= 1023, 'registered' for ports <= 49151,
        'dynamic' for larger ports, and 'unknown' when the port is missing.
    """
    # NaN is the only value that is not equal to itself. Without this guard a
    # NaN port fails both comparisons below and is silently labelled
    # 'dynamic', and None raises TypeError.
    if port is None or port != port:
        return 'unknown'
    if port <= 1023:
        return 'well_known'
    elif port <= 49151:
        return 'registered'
    else:
        return 'dynamic'

def make_features(df):
    """Add the engineered feature columns to ``df`` in place and return it.

    Columns added:
      * ``port_src_grp`` / ``port_dst_grp``: ``port_group`` applied to
        ``port_src`` / ``port_dst``.
      * ``subnet_src`` / ``subnet_dst``: the first two dot-separated parts of
        ``ip_src`` / ``ip_dst`` (a missing address is treated as '0.0.0.0').
      * ``pkt_count_total``, ``avg_pkt_size``, ``byte_ratio_fwd``,
        ``iat_pkt_rate_ratio``, ``tcp_ctrl_ratio``, ``throughput`` and
        ``tcp_win_ratio``: sums and ratios of the raw columns.

    Args:
        df: DataFrame containing the raw columns read below. It is modified
            in place.

    Returns:
        The same ``df`` object. Callers that only rely on the in-place
        mutation can ignore the return value.
    """
    eps = 1e-6  # added to every denominator so a zero total does not divide by zero

    # Port range buckets
    df['port_src_grp'] = df['port_src'].apply(port_group)
    df['port_dst_grp'] = df['port_dst'].apply(port_group)
    # Subnet (first two octets)
    df['subnet_src'] = (
        df['ip_src'].fillna('0.0.0.0').astype(str)
          .str.split('.', n=3).str[:2].str.join('.')
    )
    df['subnet_dst'] = (
        df['ip_dst'].fillna('0.0.0.0').astype(str)
          .str.split('.', n=3).str[:2].str.join('.')
    )
    # Derived numeric features
    df['pkt_count_total']    = df['pkt_count_fwd'] + df['pkt_count_bwd']
    df['avg_pkt_size']       = (df['rate_fwd_bytes'] + df['rate_bwd_bytes']) / (df['pkt_count_total'] + eps)
    df['byte_ratio_fwd']     = df['rate_fwd_bytes'] / (df['rate_fwd_bytes'] + df['rate_bwd_bytes'] + eps)
    df['iat_pkt_rate_ratio'] = df['iat_avg_packets'] / (df['rate_fwd_pkts'] + df['rate_bwd_pkts'] + eps)
    df['tcp_ctrl_ratio']     = (df['tcp_syn_count'] + df['tcp_psh_count'] + df['tcp_rst_count']) / (df['pkt_count_total'] + eps)
    df['throughput']         = (df['rate_fwd_bytes'] + df['rate_bwd_bytes']) / (df['duration'] + eps)
    df['tcp_win_ratio']      = df['tcp_win_fwd_init'] / (df['tcp_win_bwd_init'] + eps)
    return df

# Add the engineered columns to both frames (make_features works in place).
for df_ in (X_raw, test_raw):
    make_features(df_)

# -----------------------------------------------------------------------------
# 4) Preprocessing pipeline (output is kept as a sparse matrix)
# -----------------------------------------------------------------------------
numeric_cols = [
    'port_src','port_dst','duration','pkt_count_fwd','pkt_count_bwd',
    'rate_fwd_pkts','rate_bwd_pkts','rate_fwd_bytes','rate_bwd_bytes',
    'payload_fwd_mean','payload_bwd_mean','tcp_win_fwd_init','tcp_win_bwd_init',
    'tcp_syn_count','tcp_psh_count','tcp_rst_count','iat_avg_packets',
    'pkt_count_total','avg_pkt_size','byte_ratio_fwd','iat_pkt_rate_ratio',
    'tcp_ctrl_ratio','throughput','tcp_win_ratio'
]
categorical_features = ['protocol','port_src_grp','port_dst_grp','subnet_src','subnet_dst']

numeric_transformer = Pipeline([
    ('imputer', IterativeImputer(random_state=42, max_iter=10, sample_posterior=True)),
    ('scaler',  StandardScaler(with_mean=False))  # no centering, for sparse-matrix compatibility
])
categorical_transformer = Pipeline([
    # `sparse` was deprecated in scikit-learn 1.2 and removed in 1.4;
    # `sparse_output` is the replacement keyword.
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer,   numeric_cols),
    ('cat', categorical_transformer, categorical_features)
], sparse_threshold=1.0)

# -----------------------------------------------------------------------------
# 5) Run the preprocessing
# -----------------------------------------------------------------------------
X_proc    = preprocessor.fit_transform(X_raw)
test_proc = preprocessor.transform(test_raw)

# -----------------------------------------------------------------------------
# 6) 수동 오버샘플링 함수
# -----------------------------------------------------------------------------
def oversample(X_arr, y_arr):
    """Randomly oversample every class up to the size of the largest class.

    Every original row is kept, and each smaller class is topped up with rows
    drawn with replacement from that same class. (The previous version
    bootstrapped all classes, including the largest one, so original rows
    could be dropped at random and replaced by duplicates.)

    Args:
        X_arr: 2-D feature matrix that supports integer-array row indexing
            (NumPy array or SciPy sparse matrix).
        y_arr: 1-D array-like of class labels, one per row of ``X_arr``.

    Returns:
        Tuple ``(X_res, y_res)``: a dense NumPy feature array and the matching
        label array, grouped by class in sorted label order, with every class
        having as many rows as the largest class.
    """
    y_arr = np.asarray(y_arr)
    classes, counts = np.unique(y_arr, return_counts=True)
    max_count = counts.max()
    rng = np.random.RandomState(42)  # fixed seed keeps the resampling reproducible

    picked = []
    for cls, count in zip(classes, counts):
        idx = np.where(y_arr == cls)[0]
        # Only the shortfall is sampled; size is 0 for the largest class.
        extra = rng.choice(idx, size=max_count - count, replace=True)
        picked.append(np.concatenate([idx, extra]))
    order = np.concatenate(picked)

    X_res = X_arr[order]
    # Densify sparse input so the return type is always a NumPy array.
    if hasattr(X_res, "toarray"):
        X_res = X_res.toarray()
    return X_res, y_arr[order]

# -----------------------------------------------------------------------------
# 7) 모델 정의
# -----------------------------------------------------------------------------
# Hyperparameters for the gradient-boosted tree classifier.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
clf = XGBClassifier(**xgb_params)

# -----------------------------------------------------------------------------
# 8) 교차검증 (Manual oversampling + 평가)
# -----------------------------------------------------------------------------
from sklearn.base import clone

cv        = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []
# Split the raw frame (it already carries the engineered columns) so that the
# preprocessing can be fitted inside each fold.
for tr_idx, va_idx in cv.split(X_raw, y):
    # Fit a fresh copy of the preprocessor on the training fold only, so the
    # imputer / scaler / encoder never see the validation rows. Slicing the
    # globally fitted X_proc would leak validation data into the fold scores.
    fold_pre = clone(preprocessor)
    X_tr = fold_pre.fit_transform(X_raw.iloc[tr_idx])
    X_va = fold_pre.transform(X_raw.iloc[va_idx])
    y_tr, y_va = y[tr_idx], y[va_idx]

    X_tr_res, y_tr_res = oversample(X_tr, y_tr)
    # oversample() returns a dense array, so predict on dense data as well:
    # XGBoost treats the implicit zeros of a sparse matrix as missing values,
    # which would not match how the model was trained.
    X_va_dense = X_va.toarray() if hasattr(X_va, "toarray") else X_va

    clf.fit(X_tr_res, y_tr_res)
    y_pred = clf.predict(X_va_dense)
    f1_scores.append(f1_score(y_va, y_pred, average='macro'))

print("5-fold Macro F1 scores:", np.round(f1_scores, 4))
print("Mean Macro F1 score:", np.round(np.mean(f1_scores), 4))

# -----------------------------------------------------------------------------
# 9) 최종 학습 및 예측
# -----------------------------------------------------------------------------
X_res_full, y_res_full = oversample(X_proc, y)
clf.fit(X_res_full, y_res_full)
# The model was fitted on the dense array returned by oversample(). Predict on
# dense data too: XGBoost treats the implicit zeros of a sparse matrix as
# missing values, which would not match the training representation.
test_dense  = test_proc.toarray() if hasattr(test_proc, "toarray") else test_proc
pred_int    = clf.predict(test_dense)
pred_labels = le.inverse_transform(pred_int)  # integer codes -> original string labels

# -----------------------------------------------------------------------------
# 10) Write the submission file
# -----------------------------------------------------------------------------
submission['attack_type'] = pred_labels
submission.to_csv('final_submission.csv', index=False)
print("완료: final_submission.csv 생성되었습니다.")


In [None]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 1) Load data
INPUT = '/kaggle/input/open123123121212'
train, test, submission = (
    pd.read_csv(f'{INPUT}/{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

# 2) Prepare X, y and encode labels (string labels -> integer codes)
X     = train.drop(columns=['ID', 'attack_type'])
y_raw = train['attack_type']
le    = LabelEncoder().fit(y_raw)
y     = le.transform(y_raw)

# 3) Feature engineering
def port_group(p):
    """Bucket a port number into a named range.

    Args:
        p: Port number, or a missing value (None / NaN).

    Returns:
        'well_known' for ports <= 1023, 'registered' for ports <= 49151,
        'dynamic' for larger ports, and 'unknown' when the port is missing.
    """
    # NaN is the only value that is not equal to itself. Without this guard a
    # NaN port fails both comparisons below and is silently labelled
    # 'dynamic', and None raises TypeError.
    if p is None or p != p:
        return 'unknown'
    if p <= 1023:
        return 'well_known'
    elif p <= 49151:
        return 'registered'
    else:
        return 'dynamic'

def make_features(df):
    """Append the engineered columns to ``df`` (in place) and return it.

    Adds port-range buckets, two-octet subnet strings, a set of sum/ratio
    features, and ``<col>_log1p`` transforms of selected raw columns (missing
    values are replaced by 0 before the log).

    Args:
        df: DataFrame holding the raw columns read below.

    Returns:
        The same ``df`` object with the new columns added.
    """
    eps = 1e-6  # keeps every denominator away from zero

    def _first_two_octets(column):
        # A missing address becomes '0.0.0.0' before the string is split on dots.
        parts = df[column].fillna('0.0.0.0').astype(str).str.split(pat='.', n=3)
        return parts.str[:2].str.join('.')

    for side in ('src', 'dst'):
        df[f'port_{side}_grp'] = df[f'port_{side}'].apply(port_group)
    for side in ('src', 'dst'):
        df[f'subnet_{side}'] = _first_two_octets(f'ip_{side}')

    df['pkt_count_total'] = df['pkt_count_fwd'] + df['pkt_count_bwd']
    pkt_total = df['pkt_count_total']
    fwd_bytes = df['rate_fwd_bytes']
    bytes_total = fwd_bytes + df['rate_bwd_bytes']
    pkt_rate_total = df['rate_fwd_pkts'] + df['rate_bwd_pkts']
    tcp_flags = df['tcp_syn_count'] + df['tcp_psh_count'] + df['tcp_rst_count']

    df['avg_pkt_size']       = bytes_total / (pkt_total + eps)
    df['byte_ratio_fwd']     = fwd_bytes / (bytes_total + eps)
    df['iat_pkt_rate_ratio'] = df['iat_avg_packets'] / (pkt_rate_total + eps)
    df['tcp_ctrl_ratio']     = tcp_flags / (pkt_total + eps)
    df['throughput']         = bytes_total / (df['duration'] + eps)
    df['tcp_win_ratio']      = df['tcp_win_fwd_init'] / (df['tcp_win_bwd_init'] + eps)

    log_sources = (
        'duration', 'pkt_count_fwd', 'pkt_count_bwd', 'rate_fwd_pkts', 'rate_bwd_pkts',
        'rate_fwd_bytes', 'rate_bwd_bytes', 'payload_fwd_mean', 'payload_bwd_mean',
        'tcp_win_fwd_init', 'tcp_win_bwd_init', 'iat_avg_packets',
    )
    for col in log_sources:
        df[col + '_log1p'] = np.log1p(df[col].fillna(0))
    return df

# Work on copies so the frames loaded above are not modified.
X    = make_features(X.copy())
test = make_features(test.copy())

# 4) Preprocessing pipeline
numeric_cols = [
    'port_src','port_dst','duration','pkt_count_fwd','pkt_count_bwd',
    'rate_fwd_pkts','rate_bwd_pkts','rate_fwd_bytes','rate_bwd_bytes',
    'payload_fwd_mean','payload_bwd_mean','tcp_win_fwd_init','tcp_win_bwd_init',
    'tcp_syn_count','tcp_psh_count','tcp_rst_count','iat_avg_packets',
    'pkt_count_total','avg_pkt_size','byte_ratio_fwd','iat_pkt_rate_ratio',
    'tcp_ctrl_ratio','throughput','tcp_win_ratio'
] + [f'{c}_log1p' for c in ['duration','pkt_count_fwd','pkt_count_bwd',
                            'rate_fwd_pkts','rate_bwd_pkts','rate_fwd_bytes',
                            'rate_bwd_bytes','payload_fwd_mean','payload_bwd_mean',
                            'tcp_win_fwd_init','tcp_win_bwd_init','iat_avg_packets']]
categorical_features = ['protocol','port_src_grp','port_dst_grp','subnet_src','subnet_dst']

num_pipe = Pipeline([
    ('imputer', IterativeImputer(random_state=42, max_iter=10, sample_posterior=True)),
    ('scaler',  StandardScaler(with_mean=False))
])
cat_pipe = Pipeline([
    # `sparse` was deprecated in scikit-learn 1.2 and removed in 1.4;
    # `sparse_output` is the replacement keyword.
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])
preproc = ColumnTransformer([
    ('num', num_pipe, numeric_cols),
    ('cat', cat_pipe, categorical_features)
], sparse_threshold=1.0)

# 5) Stacking ensemble
# StackingClassifier fits each base estimator with fit(X, y) only, so no
# eval_set ever reaches them. XGBoost raises "Must have at least 1 validation
# dataset for early stopping" when early_stopping_rounds is set without an
# eval_set, which made every fit fail. It is therefore not set on the
# XGBClassifier, which now trains the full n_estimators rounds.
estimators = [
    ('xgb', XGBClassifier(
        n_estimators=1000, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8,
        use_label_encoder=False, eval_metric='mlogloss',
        random_state=42, n_jobs=4
    )),
    ('lgb', LGBMClassifier(
        n_estimators=1000, learning_rate=0.05, num_leaves=50,
        subsample=0.8, colsample_bytree=0.8,
        class_weight='balanced', random_state=42, n_jobs=4
    )),
    ('cat', CatBoostClassifier(
        iterations=1000, learning_rate=0.05, depth=6,
        auto_class_weights='Balanced',
        # NOTE(review): no validation set reaches CatBoost here either, so
        # early_stopping_rounds presumably has no effect -- confirm.
        early_stopping_rounds=50, random_seed=42,
        verbose=0
    ))
]
# The meta-learner sees the base models' predictions plus the original
# features (passthrough=True).
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,
    passthrough=True,
    n_jobs=-1
)

pipeline = Pipeline([
    ('pre',   preproc),
    ('stack', stack)
])

# 6) Hyperparameter search space (keys address the nested pipeline steps)
param_dist = {
    'stack__xgb__learning_rate': [0.01, 0.05, 0.1],
    'stack__xgb__max_depth':     [4, 6, 8],
    'stack__lgb__num_leaves':    [31, 50, 100],
    'stack__cat__depth':         [4, 6, 8],
    'stack__final_estimator__C': [0.01, 0.1, 1, 10],
}

search_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
macro_f1 = make_scorer(f1_score, average='macro')
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring=macro_f1,
    cv=search_cv,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

# 7) Fit and tune
search.fit(X, y)
print("Best CV Macro F1:", search.best_score_)
print("Best Params:", search.best_params_)

# 8) Predict with the refitted best pipeline and write the submission
best = search.best_estimator_
pred_int = best.predict(test)
pred_label = le.inverse_transform(pred_int)  # integer codes -> original labels

submission['attack_type'] = pred_label
submission.to_csv('stacking_tuned_submission.csv', index=False)
print("Completed: stacking_tuned_submission.csv")


In [None]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 1) Load data
INPUT = '/kaggle/input/open123123121212'
train, test, submission = (
    pd.read_csv(f'{INPUT}/{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

# 2) Prepare X, y and encode labels (string labels -> integer codes)
X     = train.drop(columns=['ID', 'attack_type'])
y_raw = train['attack_type']
le    = LabelEncoder().fit(y_raw)
y     = le.transform(y_raw)

# 3) 피처 엔지니어링
def port_group(p):
    """Bucket a port number into a named range.

    Args:
        p: Port number, or a missing value (None / NaN).

    Returns:
        'well_known' for ports <= 1023, 'registered' for ports <= 49151,
        'dynamic' for larger ports, and 'unknown' when the port is missing.
    """
    # NaN is the only value that is not equal to itself. Without this guard a
    # NaN port fails both comparisons below and is silently labelled
    # 'dynamic', and None raises TypeError.
    if p is None or p != p:
        return 'unknown'
    if p <= 1023:
        return 'well_known'
    elif p <= 49151:
        return 'registered'
    else:
        return 'dynamic'

def make_features(df):
    """Append the engineered columns to ``df`` (in place) and return it.

    Adds port-range buckets, two-octet subnet strings, a set of sum/ratio
    features, and ``<col>_log1p`` transforms of selected raw columns (missing
    values are replaced by 0 before the log).

    Args:
        df: DataFrame holding the raw columns read below.

    Returns:
        The same ``df`` object with the new columns added.
    """
    eps = 1e-6  # keeps every denominator away from zero

    def _first_two_octets(column):
        # A missing address becomes '0.0.0.0' before the string is split on dots.
        parts = df[column].fillna('0.0.0.0').astype(str).str.split(pat='.', n=3)
        return parts.str[:2].str.join('.')

    for side in ('src', 'dst'):
        df[f'port_{side}_grp'] = df[f'port_{side}'].apply(port_group)
    for side in ('src', 'dst'):
        df[f'subnet_{side}'] = _first_two_octets(f'ip_{side}')

    df['pkt_count_total'] = df['pkt_count_fwd'] + df['pkt_count_bwd']
    pkt_total = df['pkt_count_total']
    fwd_bytes = df['rate_fwd_bytes']
    bytes_total = fwd_bytes + df['rate_bwd_bytes']
    pkt_rate_total = df['rate_fwd_pkts'] + df['rate_bwd_pkts']
    tcp_flags = df['tcp_syn_count'] + df['tcp_psh_count'] + df['tcp_rst_count']

    df['avg_pkt_size']       = bytes_total / (pkt_total + eps)
    df['byte_ratio_fwd']     = fwd_bytes / (bytes_total + eps)
    df['iat_pkt_rate_ratio'] = df['iat_avg_packets'] / (pkt_rate_total + eps)
    df['tcp_ctrl_ratio']     = tcp_flags / (pkt_total + eps)
    df['throughput']         = bytes_total / (df['duration'] + eps)
    df['tcp_win_ratio']      = df['tcp_win_fwd_init'] / (df['tcp_win_bwd_init'] + eps)

    log_sources = (
        'duration', 'pkt_count_fwd', 'pkt_count_bwd', 'rate_fwd_pkts', 'rate_bwd_pkts',
        'rate_fwd_bytes', 'rate_bwd_bytes', 'payload_fwd_mean', 'payload_bwd_mean',
        'tcp_win_fwd_init', 'tcp_win_bwd_init', 'iat_avg_packets',
    )
    for col in log_sources:
        df[col + '_log1p'] = np.log1p(df[col].fillna(0))
    return df

# Work on copies so the frames loaded above are not modified.
X    = make_features(X.copy())
test = make_features(test.copy())

# 4) Preprocessing pipeline
numeric_cols = [
    'port_src','port_dst','duration','pkt_count_fwd','pkt_count_bwd',
    'rate_fwd_pkts','rate_bwd_pkts','rate_fwd_bytes','rate_bwd_bytes',
    'payload_fwd_mean','payload_bwd_mean','tcp_win_fwd_init','tcp_win_bwd_init',
    'tcp_syn_count','tcp_psh_count','tcp_rst_count','iat_avg_packets',
    'pkt_count_total','avg_pkt_size','byte_ratio_fwd','iat_pkt_rate_ratio',
    'tcp_ctrl_ratio','throughput','tcp_win_ratio'
] + [f'{c}_log1p' for c in ['duration','pkt_count_fwd','pkt_count_bwd',
                            'rate_fwd_pkts','rate_bwd_pkts','rate_fwd_bytes',
                            'rate_bwd_bytes','payload_fwd_mean','payload_bwd_mean',
                            'tcp_win_fwd_init','tcp_win_bwd_init','iat_avg_packets']]
categorical_features = ['protocol','port_src_grp','port_dst_grp','subnet_src','subnet_dst']

num_pipe = Pipeline([
    ('imputer', IterativeImputer(random_state=42, max_iter=10, sample_posterior=True)),
    ('scaler',  StandardScaler(with_mean=False))
])
cat_pipe = Pipeline([
    # `sparse` was deprecated in scikit-learn 1.2 and removed in 1.4;
    # `sparse_output` is the replacement keyword.
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])
preproc = ColumnTransformer([
    ('num', num_pipe, numeric_cols),
    ('cat', cat_pipe, categorical_features)
], sparse_threshold=1.0)

# 5) Stacking ensemble
# StackingClassifier fits each base estimator with fit(X, y) only, so no
# eval_set ever reaches them. XGBoost raises "Must have at least 1 validation
# dataset for early stopping" when early_stopping_rounds is set without an
# eval_set, which made every fit fail. It is therefore not set on the
# XGBClassifier, which now trains the full n_estimators rounds.
estimators = [
    ('xgb', XGBClassifier(
        n_estimators=1000, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8,
        use_label_encoder=False, eval_metric='mlogloss',
        random_state=42, n_jobs=4
    )),
    ('lgb', LGBMClassifier(
        n_estimators=1000, learning_rate=0.05, num_leaves=50,
        subsample=0.8, colsample_bytree=0.8,
        class_weight='balanced', random_state=42, n_jobs=4
    )),
    ('cat', CatBoostClassifier(
        iterations=1000, learning_rate=0.05, depth=6,
        auto_class_weights='Balanced',
        # NOTE(review): no validation set reaches CatBoost here either, so
        # early_stopping_rounds presumably has no effect -- confirm.
        early_stopping_rounds=50, random_seed=42,
        verbose=0
    ))
]
# The meta-learner sees the base models' predictions plus the original
# features (passthrough=True).
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,
    passthrough=True,
    n_jobs=-1
)
pipeline = Pipeline([
    ('pre',   preproc),
    ('stack', stack)
])

# 6) Hyperparameter search space (keys address the nested pipeline steps)
param_dist = {
    'stack__xgb__learning_rate': [0.01, 0.05, 0.1],
    'stack__xgb__max_depth':     [4, 6, 8],
    'stack__lgb__num_leaves':    [31, 50, 100],
    'stack__cat__depth':         [4, 6, 8],
    'stack__final_estimator__C': [0.01, 0.1, 1, 10],
}
search_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
macro_f1 = make_scorer(f1_score, average='macro')
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring=macro_f1,
    cv=search_cv,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

# 7) Fit and tune
search.fit(X, y)
print("Best CV Macro F1:", search.best_score_)
print("Best Params:", search.best_params_)

# 8) Predict with the refitted best pipeline and save to the Kaggle working dir
best = search.best_estimator_
pred_int = best.predict(test)
pred_label = le.inverse_transform(pred_int)  # integer codes -> original labels
submission['attack_type'] = pred_label
submission.to_csv('/kaggle/working/stacking_tuned_submission.csv', index=False)
print("Completed: /kaggle/working/stacking_tuned_submission.csv")


In [None]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 1) Load data
INPUT = '/kaggle/input/open123123121212'
train, test, submission = (
    pd.read_csv(f'{INPUT}/{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

# 2) Prepare X, y and encode labels (string labels -> integer codes)
X     = train.drop(columns=['ID', 'attack_type'])
y_raw = train['attack_type']
le    = LabelEncoder().fit(y_raw)
y     = le.transform(y_raw)

# 3) 피처 엔지니어링
def port_group(p):
    """Bucket a port number into a named range.

    Args:
        p: Port number, or a missing value (None / NaN).

    Returns:
        'well_known' for ports <= 1023, 'registered' for ports <= 49151,
        'dynamic' for larger ports, and 'unknown' when the port is missing.
    """
    # NaN is the only value that is not equal to itself. Without this guard a
    # NaN port fails both comparisons below and is silently labelled
    # 'dynamic', and None raises TypeError.
    if p is None or p != p:
        return 'unknown'
    if p <= 1023:
        return 'well_known'
    elif p <= 49151:
        return 'registered'
    else:
        return 'dynamic'

def make_features(df):
    """Append the engineered columns to ``df`` (in place) and return it.

    Adds port-range buckets, two-octet subnet strings, a set of sum/ratio
    features, and ``<col>_log1p`` transforms of selected raw columns (missing
    values are replaced by 0 before the log).

    Args:
        df: DataFrame holding the raw columns read below.

    Returns:
        The same ``df`` object with the new columns added.
    """
    eps = 1e-6  # keeps every denominator away from zero

    def _first_two_octets(column):
        # A missing address becomes '0.0.0.0' before the string is split on dots.
        parts = df[column].fillna('0.0.0.0').astype(str).str.split(pat='.', n=3)
        return parts.str[:2].str.join('.')

    for side in ('src', 'dst'):
        df[f'port_{side}_grp'] = df[f'port_{side}'].apply(port_group)
    for side in ('src', 'dst'):
        df[f'subnet_{side}'] = _first_two_octets(f'ip_{side}')

    df['pkt_count_total'] = df['pkt_count_fwd'] + df['pkt_count_bwd']
    pkt_total = df['pkt_count_total']
    fwd_bytes = df['rate_fwd_bytes']
    bytes_total = fwd_bytes + df['rate_bwd_bytes']
    pkt_rate_total = df['rate_fwd_pkts'] + df['rate_bwd_pkts']
    tcp_flags = df['tcp_syn_count'] + df['tcp_psh_count'] + df['tcp_rst_count']

    df['avg_pkt_size']       = bytes_total / (pkt_total + eps)
    df['byte_ratio_fwd']     = fwd_bytes / (bytes_total + eps)
    df['iat_pkt_rate_ratio'] = df['iat_avg_packets'] / (pkt_rate_total + eps)
    df['tcp_ctrl_ratio']     = tcp_flags / (pkt_total + eps)
    df['throughput']         = bytes_total / (df['duration'] + eps)
    df['tcp_win_ratio']      = df['tcp_win_fwd_init'] / (df['tcp_win_bwd_init'] + eps)

    log_sources = (
        'duration', 'pkt_count_fwd', 'pkt_count_bwd', 'rate_fwd_pkts', 'rate_bwd_pkts',
        'rate_fwd_bytes', 'rate_bwd_bytes', 'payload_fwd_mean', 'payload_bwd_mean',
        'tcp_win_fwd_init', 'tcp_win_bwd_init', 'iat_avg_packets',
    )
    for col in log_sources:
        df[col + '_log1p'] = np.log1p(df[col].fillna(0))
    return df

# Work on copies so the frames loaded above are not modified.
X    = make_features(X.copy())
test = make_features(test.copy())

# 4) Preprocessing pipeline
numeric_cols = [
    'port_src','port_dst','duration','pkt_count_fwd','pkt_count_bwd',
    'rate_fwd_pkts','rate_bwd_pkts','rate_fwd_bytes','rate_bwd_bytes',
    'payload_fwd_mean','payload_bwd_mean','tcp_win_fwd_init','tcp_win_bwd_init',
    'tcp_syn_count','tcp_psh_count','tcp_rst_count','iat_avg_packets',
    'pkt_count_total','avg_pkt_size','byte_ratio_fwd','iat_pkt_rate_ratio',
    'tcp_ctrl_ratio','throughput','tcp_win_ratio'
] + [f'{c}_log1p' for c in ['duration','pkt_count_fwd','pkt_count_bwd',
                            'rate_fwd_pkts','rate_bwd_pkts','rate_fwd_bytes',
                            'rate_bwd_bytes','payload_fwd_mean','payload_bwd_mean',
                            'tcp_win_fwd_init','tcp_win_bwd_init','iat_avg_packets']]
categorical_features = ['protocol','port_src_grp','port_dst_grp','subnet_src','subnet_dst']

num_pipe = Pipeline([
    ('imputer', IterativeImputer(random_state=42, max_iter=10, sample_posterior=True)),
    ('scaler',  StandardScaler(with_mean=False))
])
cat_pipe = Pipeline([
    # `sparse` was deprecated in scikit-learn 1.2 and removed in 1.4;
    # `sparse_output` is the replacement keyword.
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])
preproc = ColumnTransformer([
    ('num', num_pipe, numeric_cols),
    ('cat', cat_pipe, categorical_features)
], sparse_threshold=1.0)

# 5) Stacking ensemble
# StackingClassifier fits each base estimator with fit(X, y) only, so no
# eval_set ever reaches them. XGBoost raises "Must have at least 1 validation
# dataset for early stopping" when early_stopping_rounds is set without an
# eval_set, which made every fit fail. It is therefore not set on the
# XGBClassifier, which now trains the full n_estimators rounds.
estimators = [
    ('xgb', XGBClassifier(
        n_estimators=1000, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8,
        use_label_encoder=False, eval_metric='mlogloss',
        random_state=42, n_jobs=4
    )),
    ('lgb', LGBMClassifier(
        n_estimators=1000, learning_rate=0.05, num_leaves=50,
        subsample=0.8, colsample_bytree=0.8,
        class_weight='balanced', random_state=42, n_jobs=4
    )),
    ('cat', CatBoostClassifier(
        iterations=1000, learning_rate=0.05, depth=6,
        auto_class_weights='Balanced',
        # NOTE(review): no validation set reaches CatBoost here either, so
        # early_stopping_rounds presumably has no effect -- confirm.
        early_stopping_rounds=50, random_seed=42,
        verbose=0
    ))
]
# The meta-learner sees the base models' predictions plus the original
# features (passthrough=True).
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,
    passthrough=True,
    n_jobs=-1
)
pipeline = Pipeline([
    ('pre',   preproc),
    ('stack', stack)
])

# 6) Hyperparameter search space (keys address the nested pipeline steps)
param_dist = {
    'stack__xgb__learning_rate': [0.01, 0.05, 0.1],
    'stack__xgb__max_depth':     [4, 6, 8],
    'stack__lgb__num_leaves':    [31, 50, 100],
    'stack__cat__depth':         [4, 6, 8],
    'stack__final_estimator__C': [0.01, 0.1, 1, 10],
}
search_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
macro_f1 = make_scorer(f1_score, average='macro')
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring=macro_f1,
    cv=search_cv,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

# 7) Fit and tune
search.fit(X, y)
print("Best CV Macro F1:", search.best_score_)
print("Best Params:", search.best_params_)

# 8) Predict with the refitted best pipeline and save to the Kaggle working dir
best = search.best_estimator_
pred_int = best.predict(test)
pred_label = le.inverse_transform(pred_int)  # integer codes -> original labels
submission['attack_type'] = pred_label
submission.to_csv('/kaggle/working/stacking_tuned_submission.csv', index=False)
print("Completed: /kaggle/working/stacking_tuned_submission.csv")


In [None]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 1) Load data
INPUT = '/kaggle/input/open123123121212'
train, test, submission = (
    pd.read_csv(f'{INPUT}/{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

# 2) Prepare X, y and encode labels (string labels -> integer codes)
X     = train.drop(columns=['ID', 'attack_type'])
y_raw = train['attack_type']
le    = LabelEncoder().fit(y_raw)
y     = le.transform(y_raw)

# 3) 피처 엔지니어링
def port_group(p):
    """Bucket a port number into a named range.

    Args:
        p: Port number, or a missing value (None / NaN).

    Returns:
        'well_known' for ports <= 1023, 'registered' for ports <= 49151,
        'dynamic' for larger ports, and 'unknown' when the port is missing.
    """
    # NaN is the only value that is not equal to itself. Without this guard a
    # NaN port fails both comparisons below and is silently labelled
    # 'dynamic', and None raises TypeError.
    if p is None or p != p:
        return 'unknown'
    if p <= 1023:
        return 'well_known'
    elif p <= 49151:
        return 'registered'
    else:
        return 'dynamic'

def make_features(df):
    """Add the engineered flow features to ``df`` and return it.

    The frame that is passed in is modified; callers that need the original
    should hand over a copy. Columns are created in this order:

    * ``port_src_grp`` / ``port_dst_grp``: ``port_group`` applied to each port.
    * ``subnet_src`` / ``subnet_dst``: the first two dot-separated parts of the
      IP string (a missing IP is treated as ``'0.0.0.0'``).
    * ``pkt_count_total`` and the ratio features ``avg_pkt_size``,
      ``byte_ratio_fwd``, ``iat_pkt_rate_ratio``, ``tcp_ctrl_ratio``,
      ``throughput`` and ``tcp_win_ratio``.
    * ``<col>_log1p`` for every column in the log list, with missing values
      replaced by 0 before the transform.
    """
    eps = 1e-6  # added to every denominator so none of them is exactly zero

    for grp_col, port_col in (('port_src_grp', 'port_src'),
                              ('port_dst_grp', 'port_dst')):
        df[grp_col] = df[port_col].apply(port_group)

    for subnet_col, ip_col in (('subnet_src', 'ip_src'),
                               ('subnet_dst', 'ip_dst')):
        ip_parts = df[ip_col].fillna('0.0.0.0').astype(str).str.split(pat='.', n=3)
        df[subnet_col] = ip_parts.str[:2].str.join('.')

    # Sums shared by several of the ratios below.
    bytes_both = df['rate_fwd_bytes'] + df['rate_bwd_bytes']
    pkt_rate_both = df['rate_fwd_pkts'] + df['rate_bwd_pkts']

    df['pkt_count_total'] = df['pkt_count_fwd'] + df['pkt_count_bwd']
    pkts_total = df['pkt_count_total'] + eps

    df['avg_pkt_size'] = bytes_both / pkts_total
    df['byte_ratio_fwd'] = df['rate_fwd_bytes'] / (bytes_both + eps)
    df['iat_pkt_rate_ratio'] = df['iat_avg_packets'] / (pkt_rate_both + eps)
    tcp_flags = df['tcp_syn_count'] + df['tcp_psh_count'] + df['tcp_rst_count']
    df['tcp_ctrl_ratio'] = tcp_flags / pkts_total
    df['throughput'] = bytes_both / (df['duration'] + eps)
    df['tcp_win_ratio'] = df['tcp_win_fwd_init'] / (df['tcp_win_bwd_init'] + eps)

    log_sources = (
        'duration', 'pkt_count_fwd', 'pkt_count_bwd', 'rate_fwd_pkts', 'rate_bwd_pkts',
        'rate_fwd_bytes', 'rate_bwd_bytes', 'payload_fwd_mean', 'payload_bwd_mean',
        'tcp_win_fwd_init', 'tcp_win_bwd_init', 'iat_avg_packets',
    )
    for name in log_sources:
        df[f'{name}_log1p'] = np.log1p(df[name].fillna(0))
    return df

# Engineer features on copies so the frames loaded above are not modified.
X    = make_features(X.copy())
test = make_features(test.copy())

# 4) Preprocessing pipeline
# Numeric inputs: the raw numeric columns, the derived ratio features and the
# log1p columns added by make_features.
numeric_cols = [
    'port_src','port_dst','duration','pkt_count_fwd','pkt_count_bwd',
    'rate_fwd_pkts','rate_bwd_pkts','rate_fwd_bytes','rate_bwd_bytes',
    'payload_fwd_mean','payload_bwd_mean','tcp_win_fwd_init','tcp_win_bwd_init',
    'tcp_syn_count','tcp_psh_count','tcp_rst_count','iat_avg_packets',
    'pkt_count_total','avg_pkt_size','byte_ratio_fwd','iat_pkt_rate_ratio',
    'tcp_ctrl_ratio','throughput','tcp_win_ratio'
] + [f'{c}_log1p' for c in ['duration','pkt_count_fwd','pkt_count_bwd',
                            'rate_fwd_pkts','rate_bwd_pkts','rate_fwd_bytes',
                            'rate_bwd_bytes','payload_fwd_mean','payload_bwd_mean',
                            'tcp_win_fwd_init','tcp_win_bwd_init','iat_avg_packets']]
categorical_features = ['protocol','port_src_grp','port_dst_grp','subnet_src','subnet_dst']

# Numeric branch: iterative (model-based) imputation followed by scaling
# without centring (with_mean=False).
num_pipe = Pipeline([
    ('imputer', IterativeImputer(random_state=42, max_iter=10, sample_posterior=True)),
    ('scaler',  StandardScaler(with_mean=False))
])
# Categorical branch: one-hot encoding; categories not seen during fit are
# ignored at transform time. The encoder already returns a sparse matrix by
# default, so the `sparse=True` keyword (deprecated in scikit-learn 1.2 and
# removed in 1.4, where it raises TypeError) is intentionally not passed.
cat_pipe = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Columns not listed in either branch (e.g. ID, ip_src, ip_dst) are dropped
# by ColumnTransformer's default remainder handling.
preproc = ColumnTransformer([
    ('num', num_pipe, numeric_cols),
    ('cat', cat_pipe, categorical_features)
], sparse_threshold=1.0)

# 5) Stacking ensemble (no early stopping inside CV)
# NOTE(review): the search (n_jobs=-1), the stacker (n_jobs=-1) and the
# boosters (n_jobs=4) all request parallelism; confirm this does not
# oversubscribe the available CPUs.
estimators = [
    ('xgb', XGBClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8,
        use_label_encoder=False, eval_metric='mlogloss',
        random_state=42, n_jobs=4
    )),
    ('lgb', LGBMClassifier(
        n_estimators=300, learning_rate=0.05, num_leaves=50,
        subsample=0.8, colsample_bytree=0.8,
        class_weight='balanced',
        random_state=42, n_jobs=4
    )),
    ('cat', CatBoostClassifier(
        iterations=300, learning_rate=0.05, depth=6,
        auto_class_weights='Balanced',
        random_seed=42,
        verbose=0
    ))
]
# passthrough=True: the final estimator sees the preprocessed features in
# addition to the base models' predictions.
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,
    passthrough=True,
    n_jobs=-1
)
pipeline = Pipeline([
    ('pre',   preproc),
    ('stack', stack)
])

# 6) Hyperparameter search
# Keys follow the <pipeline step>__<estimator name>__<parameter> convention.
param_dist = {
    'stack__xgb__learning_rate': [0.01, 0.05, 0.1],
    'stack__xgb__max_depth':      [4, 6, 8],
    'stack__lgb__num_leaves':     [31, 50, 100],
    'stack__cat__depth':          [4, 6, 8],
    'stack__final_estimator__C':  [0.01, 0.1, 1, 10],
}

# 20 sampled candidates, each scored with macro-F1 on 3 stratified folds.
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    scoring=make_scorer(f1_score, average='macro'),
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# 7) Fit and tune
search.fit(X, y)
print("Best CV Macro F1:", search.best_score_)
print("Best Params:", search.best_params_)

# 8) Final predict and save
best = search.best_estimator_
pred_int   = best.predict(test)
# Map the integer predictions back to the original attack_type labels.
pred_label = le.inverse_transform(pred_int)
submission['attack_type'] = pred_label
submission.to_csv('/kaggle/working/stacking_tuned_submission.csv', index=False)
print("Done: /kaggle/working/stacking_tuned_submission.csv")


In [None]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 1) Load the data
# The three competition files are read in one pass, in the order
# train / test / sample submission.
INPUT = '/kaggle/input/open123123121212'
train, test, submission = map(
    pd.read_csv,
    (f'{INPUT}/train.csv', f'{INPUT}/test.csv', f'{INPUT}/sample_submission.csv'),
)

# 2) Build X / y and integer-encode the labels
# `le` is kept so predictions can be mapped back to the original label strings.
le      = LabelEncoder()
y_raw   = train['attack_type']
y       = le.fit_transform(y_raw)
X       = train.drop(columns=['ID', 'attack_type'])

# 3) Feature engineering
def port_group(p):
    """Return the port-range bucket for a single port number.

    Parameters
    ----------
    p : int or float
        Port number. May be missing (NaN / None / pd.NA).

    Returns
    -------
    str
        'well_known' for p <= 1023, 'registered' for p <= 49151,
        'dynamic' for anything larger, and 'unknown' when p is missing.
    """
    # A NaN fails both comparisons below and would otherwise be reported as
    # 'dynamic'; None would raise TypeError. Give missing ports their own label.
    if pd.isna(p):
        return 'unknown'
    if p <= 1023:
        return 'well_known'
    if p <= 49151:
        return 'registered'
    return 'dynamic'

def make_features(df):
    """Add the engineered flow features to ``df`` and return it.

    The frame that is passed in is modified; callers that need the original
    should hand over a copy. Columns are created in this order:

    * ``port_src_grp`` / ``port_dst_grp``: ``port_group`` applied to each port.
    * ``subnet_src`` / ``subnet_dst``: the first two dot-separated parts of the
      IP string (a missing IP is treated as ``'0.0.0.0'``).
    * ``pkt_count_total`` and the ratio features ``avg_pkt_size``,
      ``byte_ratio_fwd``, ``iat_pkt_rate_ratio``, ``tcp_ctrl_ratio``,
      ``throughput`` and ``tcp_win_ratio``.
    * ``<col>_log1p`` for every column in the log list, with missing values
      replaced by 0 before the transform.
    """
    eps = 1e-6  # added to every denominator so none of them is exactly zero

    for grp_col, port_col in (('port_src_grp', 'port_src'),
                              ('port_dst_grp', 'port_dst')):
        df[grp_col] = df[port_col].apply(port_group)

    for subnet_col, ip_col in (('subnet_src', 'ip_src'),
                               ('subnet_dst', 'ip_dst')):
        ip_parts = df[ip_col].fillna('0.0.0.0').astype(str).str.split(pat='.', n=3)
        df[subnet_col] = ip_parts.str[:2].str.join('.')

    # Sums shared by several of the ratios below.
    bytes_both = df['rate_fwd_bytes'] + df['rate_bwd_bytes']
    pkt_rate_both = df['rate_fwd_pkts'] + df['rate_bwd_pkts']

    df['pkt_count_total'] = df['pkt_count_fwd'] + df['pkt_count_bwd']
    pkts_total = df['pkt_count_total'] + eps

    df['avg_pkt_size'] = bytes_both / pkts_total
    df['byte_ratio_fwd'] = df['rate_fwd_bytes'] / (bytes_both + eps)
    df['iat_pkt_rate_ratio'] = df['iat_avg_packets'] / (pkt_rate_both + eps)
    tcp_flags = df['tcp_syn_count'] + df['tcp_psh_count'] + df['tcp_rst_count']
    df['tcp_ctrl_ratio'] = tcp_flags / pkts_total
    df['throughput'] = bytes_both / (df['duration'] + eps)
    df['tcp_win_ratio'] = df['tcp_win_fwd_init'] / (df['tcp_win_bwd_init'] + eps)

    log_sources = (
        'duration', 'pkt_count_fwd', 'pkt_count_bwd', 'rate_fwd_pkts', 'rate_bwd_pkts',
        'rate_fwd_bytes', 'rate_bwd_bytes', 'payload_fwd_mean', 'payload_bwd_mean',
        'tcp_win_fwd_init', 'tcp_win_bwd_init', 'iat_avg_packets',
    )
    for name in log_sources:
        df[f'{name}_log1p'] = np.log1p(df[name].fillna(0))
    return df

# Engineer features on copies so the frames loaded above are not modified.
X    = make_features(X.copy())
test = make_features(test.copy())

# 4) Preprocessing pipeline
# Numeric inputs: the raw numeric columns, the derived ratio features and the
# log1p columns added by make_features.
numeric_cols = [
    'port_src','port_dst','duration','pkt_count_fwd','pkt_count_bwd',
    'rate_fwd_pkts','rate_bwd_pkts','rate_fwd_bytes','rate_bwd_bytes',
    'payload_fwd_mean','payload_bwd_mean','tcp_win_fwd_init','tcp_win_bwd_init',
    'tcp_syn_count','tcp_psh_count','tcp_rst_count','iat_avg_packets',
    'pkt_count_total','avg_pkt_size','byte_ratio_fwd','iat_pkt_rate_ratio',
    'tcp_ctrl_ratio','throughput','tcp_win_ratio'
] + [f'{c}_log1p' for c in ['duration','pkt_count_fwd','pkt_count_bwd',
                            'rate_fwd_pkts','rate_bwd_pkts','rate_fwd_bytes',
                            'rate_bwd_bytes','payload_fwd_mean','payload_bwd_mean',
                            'tcp_win_fwd_init','tcp_win_bwd_init','iat_avg_packets']]
categorical_features = ['protocol','port_src_grp','port_dst_grp','subnet_src','subnet_dst']

# Numeric branch: iterative (model-based) imputation followed by scaling
# without centring (with_mean=False).
num_pipe = Pipeline([
    ('imputer', IterativeImputer(random_state=42, max_iter=10, sample_posterior=True)),
    ('scaler',  StandardScaler(with_mean=False))
])
# Categorical branch: one-hot encoding; categories not seen during fit are
# ignored at transform time. The encoder already returns a sparse matrix by
# default, so the `sparse=True` keyword (deprecated in scikit-learn 1.2 and
# removed in 1.4, where it raises TypeError) is intentionally not passed.
cat_pipe = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Columns not listed in either branch (e.g. ID, ip_src, ip_dst) are dropped
# by ColumnTransformer's default remainder handling.
preproc = ColumnTransformer([
    ('num', num_pipe, numeric_cols),
    ('cat', cat_pipe, categorical_features)
], sparse_threshold=1.0)

# 5) Stacking ensemble (no early stopping inside CV)
# NOTE(review): the search (n_jobs=-1), the stacker (n_jobs=-1) and the
# boosters (n_jobs=4) all request parallelism; confirm this does not
# oversubscribe the available CPUs.
estimators = [
    ('xgb', XGBClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8,
        use_label_encoder=False, eval_metric='mlogloss',
        random_state=42, n_jobs=4
    )),
    ('lgb', LGBMClassifier(
        n_estimators=300, learning_rate=0.05, num_leaves=50,
        subsample=0.8, colsample_bytree=0.8,
        class_weight='balanced',
        random_state=42, n_jobs=4
    )),
    ('cat', CatBoostClassifier(
        iterations=300, learning_rate=0.05, depth=6,
        auto_class_weights='Balanced',
        random_seed=42,
        verbose=0
    ))
]
# passthrough=True: the final estimator sees the preprocessed features in
# addition to the base models' predictions.
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,
    passthrough=True,
    n_jobs=-1
)
pipeline = Pipeline([
    ('pre',   preproc),
    ('stack', stack)
])

# 6) Hyperparameter search
# Keys follow the <pipeline step>__<estimator name>__<parameter> convention.
param_dist = {
    'stack__xgb__learning_rate': [0.01, 0.05, 0.1],
    'stack__xgb__max_depth':      [4, 6, 8],
    'stack__lgb__num_leaves':     [31, 50, 100],
    'stack__cat__depth':          [4, 6, 8],
    'stack__final_estimator__C':  [0.01, 0.1, 1, 10],
}

# 20 sampled candidates, each scored with macro-F1 on 3 stratified folds.
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    scoring=make_scorer(f1_score, average='macro'),
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# 7) Fit and tune
search.fit(X, y)
print("Best CV Macro F1:", search.best_score_)
print("Best Params:", search.best_params_)

# 8) Final predict and save
best = search.best_estimator_
pred_int   = best.predict(test)
# Map the integer predictions back to the original attack_type labels.
pred_label = le.inverse_transform(pred_int)
submission['attack_type'] = pred_label
submission.to_csv('/kaggle/working/stacking_tuned_submission.csv', index=False)
print("Done: /kaggle/working/stacking_tuned_submission.csv")


In [None]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 1) Load the data
# The three competition files are read in one pass, in the order
# train / test / sample submission.
INPUT = '/kaggle/input/open123123121212'
train, test, submission = map(
    pd.read_csv,
    (f'{INPUT}/train.csv', f'{INPUT}/test.csv', f'{INPUT}/sample_submission.csv'),
)

# 2) Build X / y and integer-encode the labels
# `le` is kept so predictions can be mapped back to the original label strings.
le      = LabelEncoder()
y_raw   = train['attack_type']
y       = le.fit_transform(y_raw)
X       = train.drop(columns=['ID','attack_type'])

# 3) Feature engineering
def port_group(p):
    """Return the port-range bucket for a single port number.

    Parameters
    ----------
    p : int or float
        Port number. May be missing (NaN / None / pd.NA).

    Returns
    -------
    str
        'well_known' for p <= 1023, 'registered' for p <= 49151,
        'dynamic' for anything larger, and 'unknown' when p is missing.
    """
    # A NaN fails both comparisons below and would otherwise be reported as
    # 'dynamic'; None would raise TypeError. Give missing ports their own label.
    if pd.isna(p):
        return 'unknown'
    if p <= 1023:
        return 'well_known'
    if p <= 49151:
        return 'registered'
    return 'dynamic'

def make_features(df):
    """Add the engineered flow features to ``df`` and return it.

    The frame that is passed in is modified; callers that need the original
    should hand over a copy. Columns are created in this order:

    * ``port_src_grp`` / ``port_dst_grp``: ``port_group`` applied to each port.
    * ``subnet_src`` / ``subnet_dst``: the first two dot-separated parts of the
      IP string (a missing IP is treated as ``'0.0.0.0'``).
    * ``pkt_count_total`` and the ratio features ``avg_pkt_size``,
      ``byte_ratio_fwd``, ``iat_pkt_rate_ratio``, ``tcp_ctrl_ratio``,
      ``throughput`` and ``tcp_win_ratio``.
    * ``<col>_log1p`` for every column in the log list, with missing values
      replaced by 0 before the transform.
    """
    eps = 1e-6  # added to every denominator so none of them is exactly zero

    for grp_col, port_col in (('port_src_grp', 'port_src'),
                              ('port_dst_grp', 'port_dst')):
        df[grp_col] = df[port_col].apply(port_group)

    for subnet_col, ip_col in (('subnet_src', 'ip_src'),
                               ('subnet_dst', 'ip_dst')):
        ip_parts = df[ip_col].fillna('0.0.0.0').astype(str).str.split(pat='.', n=3)
        df[subnet_col] = ip_parts.str[:2].str.join('.')

    # Sums shared by several of the ratios below.
    bytes_both = df['rate_fwd_bytes'] + df['rate_bwd_bytes']
    pkt_rate_both = df['rate_fwd_pkts'] + df['rate_bwd_pkts']

    df['pkt_count_total'] = df['pkt_count_fwd'] + df['pkt_count_bwd']
    pkts_total = df['pkt_count_total'] + eps

    df['avg_pkt_size'] = bytes_both / pkts_total
    df['byte_ratio_fwd'] = df['rate_fwd_bytes'] / (bytes_both + eps)
    df['iat_pkt_rate_ratio'] = df['iat_avg_packets'] / (pkt_rate_both + eps)
    tcp_flags = df['tcp_syn_count'] + df['tcp_psh_count'] + df['tcp_rst_count']
    df['tcp_ctrl_ratio'] = tcp_flags / pkts_total
    df['throughput'] = bytes_both / (df['duration'] + eps)
    df['tcp_win_ratio'] = df['tcp_win_fwd_init'] / (df['tcp_win_bwd_init'] + eps)

    # log1p transforms
    log_sources = (
        'duration', 'pkt_count_fwd', 'pkt_count_bwd', 'rate_fwd_pkts', 'rate_bwd_pkts',
        'rate_fwd_bytes', 'rate_bwd_bytes', 'payload_fwd_mean', 'payload_bwd_mean',
        'tcp_win_fwd_init', 'tcp_win_bwd_init', 'iat_avg_packets',
    )
    for name in log_sources:
        df[f'{name}_log1p'] = np.log1p(df[name].fillna(0))
    return df

# Engineer features on copies so the frames loaded above are not modified.
X    = make_features(X.copy())
test = make_features(test.copy())

# 4) Preprocessing pipeline
# Numeric inputs: the raw numeric columns, the derived ratio features and the
# log1p columns added by make_features.
numeric_cols = [
    'port_src','port_dst','duration','pkt_count_fwd','pkt_count_bwd',
    'rate_fwd_pkts','rate_bwd_pkts','rate_fwd_bytes','rate_bwd_bytes',
    'payload_fwd_mean','payload_bwd_mean','tcp_win_fwd_init','tcp_win_bwd_init',
    'tcp_syn_count','tcp_psh_count','tcp_rst_count','iat_avg_packets',
    'pkt_count_total','avg_pkt_size','byte_ratio_fwd','iat_pkt_rate_ratio',
    'tcp_ctrl_ratio','throughput','tcp_win_ratio'
] + [f'{c}_log1p' for c in ['duration','pkt_count_fwd','pkt_count_bwd',
                            'rate_fwd_pkts','rate_bwd_pkts','rate_fwd_bytes',
                            'rate_bwd_bytes','payload_fwd_mean','payload_bwd_mean',
                            'tcp_win_fwd_init','tcp_win_bwd_init','iat_avg_packets']]
categorical_features = ['protocol','port_src_grp','port_dst_grp','subnet_src','subnet_dst']

# Numeric branch: iterative (model-based) imputation with library defaults,
# followed by scaling without centring (with_mean=False).
num_pipe = Pipeline([
    ('imputer', IterativeImputer(random_state=42)),
    ('scaler',  StandardScaler(with_mean=False))
])
# Categorical branch: one-hot encoding; categories not seen during fit are
# ignored at transform time. The encoder already returns a sparse matrix by
# default, so the `sparse=True` keyword (deprecated in scikit-learn 1.2 and
# removed in 1.4, where it raises TypeError) is intentionally not passed.
cat_pipe = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Columns not listed in either branch (e.g. ID, ip_src, ip_dst) are dropped
# by ColumnTransformer's default remainder handling.
preproc = ColumnTransformer([
    ('num', num_pipe, numeric_cols),
    ('cat', cat_pipe, categorical_features)
], sparse_threshold=1.0)

# 5) Stacking ensemble
# NOTE(review): the search (n_jobs=-1), the stacker (n_jobs=-1) and the
# boosters (n_jobs=4) all request parallelism; confirm this does not
# oversubscribe the available CPUs.
estimators = [
    ('xgb', XGBClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8,
        use_label_encoder=False, eval_metric='mlogloss',
        random_state=42, n_jobs=4
    )),
    ('lgb', LGBMClassifier(
        n_estimators=300, learning_rate=0.05, num_leaves=50,
        subsample=0.8, colsample_bytree=0.8,
        class_weight='balanced',
        random_state=42, n_jobs=4
    )),
    ('cat', CatBoostClassifier(
        iterations=300, learning_rate=0.05, depth=6,
        auto_class_weights='Balanced',
        random_seed=42,
        verbose=0
    ))
]
# passthrough=True: the final estimator sees the preprocessed features in
# addition to the base models' predictions.
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,
    passthrough=True,
    n_jobs=-1
)
pipeline = Pipeline([
    ('pre',   preproc),
    ('stack', stack)
])

# 6) Hyperparameter search
# Keys follow the <pipeline step>__<estimator name>__<parameter> convention.
param_dist = {
    'stack__xgb__learning_rate': [0.01, 0.05, 0.1],
    'stack__xgb__max_depth':      [4,  6,  8],
    'stack__lgb__num_leaves':     [31, 50, 100],
    'stack__cat__depth':          [4,  6,  8],
    'stack__final_estimator__C':  [0.01, 0.1, 1, 10],
}
# 20 sampled candidates, each scored with macro-F1 on 3 stratified folds.
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    scoring=make_scorer(f1_score, average='macro'),
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# 7) Fit & tune
search.fit(X, y)
print("Best CV Macro F1:", search.best_score_)
print("Best Params:", search.best_params_)

# 8) Predict and save the submission file
best = search.best_estimator_
pred_int   = best.predict(test)
# Map the integer predictions back to the original attack_type labels.
pred_label = le.inverse_transform(pred_int)
submission['attack_type'] = pred_label
submission.to_csv('/kaggle/working/stacking_tuned_submission.csv', index=False)
print("Saved to /kaggle/working/stacking_tuned_submission.csv")


In [None]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.ensemble import VotingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# 1) Load the data
# The three competition files are read in one pass, in the order
# train / test / sample submission.
INPUT = '/kaggle/input/open123123121212'
train, test, submission = map(
    pd.read_csv,
    (f'{INPUT}/train.csv', f'{INPUT}/test.csv', f'{INPUT}/sample_submission.csv'),
)

# 2) Build X / y and integer-encode the labels
# `le` is kept so predictions can be mapped back to the original label strings.
le      = LabelEncoder()
y_raw   = train['attack_type']
y       = le.fit_transform(y_raw)
X       = train.drop(columns=['ID','attack_type'])

# 3) Derived variables + preprocessing (same as above)
def port_group(p):
    """Return the port-range bucket for a single port number.

    Parameters
    ----------
    p : int or float
        Port number. May be missing (NaN / None / pd.NA).

    Returns
    -------
    str
        'well_known' for p <= 1023, 'registered' for p <= 49151,
        'dynamic' for anything larger, and 'unknown' when p is missing.
    """
    # A NaN fails both comparisons below and would otherwise be reported as
    # 'dynamic'; None would raise TypeError. Give missing ports their own label.
    if pd.isna(p):
        return 'unknown'
    if p <= 1023:
        return 'well_known'
    if p <= 49151:
        return 'registered'
    return 'dynamic'

def make_features(df):
    """Add the reduced ("lite") set of engineered features to ``df`` and return it.

    The frame that is passed in is modified; callers that need the original
    should hand over a copy. Columns added:

    * ``port_src_grp`` / ``port_dst_grp``: ``port_group`` applied to each port.
    * ``subnet_src`` / ``subnet_dst``: the first two dot-separated parts of the
      IP string (a missing IP is treated as ``'0.0.0.0'``).
    * ``pkt_count_total``, ``avg_pkt_size``, ``byte_ratio_fwd``, ``throughput``.
    * ``<col>_log1p`` for the seven duration / count / rate columns, with
      missing values replaced by 0 before the transform.
    """
    df['port_src_grp'] = df['port_src'].apply(port_group)
    df['port_dst_grp'] = df['port_dst'].apply(port_group)
    # `n` is passed by keyword: pandas 2.0 made every str.split argument after
    # `pat` keyword-only, so the positional form `.str.split('.', 3)` raises
    # TypeError there.
    df['subnet_src'] = df['ip_src'].fillna('0.0.0.0').astype(str).str.split(pat='.', n=3).str[:2].str.join('.')
    df['subnet_dst'] = df['ip_dst'].fillna('0.0.0.0').astype(str).str.split(pat='.', n=3).str[:2].str.join('.')
    # The 1e-6 terms keep the denominators from being exactly zero.
    df['pkt_count_total']    = df['pkt_count_fwd'] + df['pkt_count_bwd']
    df['avg_pkt_size']       = (df['rate_fwd_bytes'] + df['rate_bwd_bytes'])/(df['pkt_count_total']+1e-6)
    df['byte_ratio_fwd']     = df['rate_fwd_bytes']/(df['rate_fwd_bytes']+df['rate_bwd_bytes']+1e-6)
    df['throughput']         = (df['rate_fwd_bytes']+df['rate_bwd_bytes'])/(df['duration']+1e-6)
    for c in ['duration','pkt_count_fwd','pkt_count_bwd','rate_fwd_pkts','rate_bwd_pkts',
              'rate_fwd_bytes','rate_bwd_bytes']:
        df[f'{c}_log1p'] = np.log1p(df[c].fillna(0))
    return df

# Engineer features on copies so the frames loaded above are not modified.
X    = make_features(X.copy())
test = make_features(test.copy())

# numeric / categorical lists (log columns included)
categorical = ['protocol','port_src_grp','port_dst_grp','subnet_src','subnet_dst']
# Numeric columns are "everything else". The derived string columns
# (port_*_grp, subnet_*) must be excluded as well; otherwise they are routed
# to IterativeImputer, which cannot convert them to float.
numeric_cols = [c for c in X.columns
                if c not in ['ID','attack_type','ip_src','ip_dst'] + categorical]

# The encoder already returns a sparse matrix by default, so the `sparse=True`
# keyword (deprecated in scikit-learn 1.2 and removed in 1.4, where it raises
# TypeError) is intentionally not passed.
preproc = ColumnTransformer([
    ('num', Pipeline([
        ('imp',   IterativeImputer(random_state=42)),
        ('scale', StandardScaler(with_mean=False))
    ]), numeric_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
], sparse_threshold=1.0)

# 4) Model definition
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
    n_jobs=4
)
lgb = LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=50,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight='balanced',
    random_state=42,
    n_jobs=4
)

# Soft voting with equal weights for the two boosters.
voting = VotingClassifier(
    estimators=[('xgb', xgb), ('lgb', lgb)],
    voting='soft',
    weights=[1,1],
    n_jobs=-1
)

pipeline = Pipeline([
    ('pre', preproc),
    ('clf', voting)
])

# 5) Hyperparameter search (5 iters × 3-fold)
# Keys follow the <pipeline step>__<estimator name>__<parameter> convention.
param_dist = {
    'clf__xgb__learning_rate': [0.01, 0.05, 0.1],
    'clf__xgb__max_depth':      [4, 6, 8],
    'clf__lgb__num_leaves':     [31, 50, 100]
}

search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=5,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    scoring=make_scorer(f1_score, average='macro'),
    n_jobs=-1,
    random_state=42,
    verbose=2
)
search.fit(X, y)
print("Lite CV Macro F1:", search.best_score_)
print("Best Params:", search.best_params_)

# 6) Final prediction & save
best = search.best_estimator_
pred_int   = best.predict(test)
# Map the integer predictions back to the original attack_type labels.
pred_label = le.inverse_transform(pred_int)
submission['attack_type'] = pred_label
submission.to_csv('/kaggle/working/lite_ensemble_submission.csv', index=False)
print("Saved:", '/kaggle/working/lite_ensemble_submission.csv')
