In [16]:
import pandas as pd
import numpy as np
import os
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_validate, StratifiedKFold
from sklearn.metrics import (
    roc_auc_score, average_precision_score, precision_recall_curve, roc_curve,
    confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, make_scorer 
)

#df_train = pd.read_csv("C:\\Users\\Gökhan\\Desktop\\nids-adversarial\\data\\UNSW_NB15_train.csv", low_memory=False)
#df_test = pd.read_csv("C:\\Users\\Gökhan\\Desktop\\nids-adversarial\\data\\UNSW_NB15_test.csv", low_memory=False)

df1 = pd.read_csv("C:\\Users\\Gökhan\\Desktop\\nids-adversarial\\data\\UNSW-NB15_1.csv", low_memory=False)
df2 = pd.read_csv("C:\\Users\\Gökhan\\Desktop\\nids-adversarial\\data\\UNSW-NB15_2.csv", low_memory=False)
df3 = pd.read_csv("C:\\Users\\Gökhan\\Desktop\\nids-adversarial\\data\\UNSW-NB15_3.csv", low_memory=False)
df4 = pd.read_csv("C:\\Users\\Gökhan\\Desktop\\nids-adversarial\\data\\UNSW-NB15_4.csv", low_memory=False)
feat = pd.read_csv("C:\\Users\\Gökhan\\Desktop\\nids-adversarial\\data\\NUSW-NB15_features.csv", encoding="latin1",low_memory=False)

#df_train.columns = [c.lower() for c in df_train.columns]
#df_test.columns = [c.lower() for c in df_test.columns],

print("Input dataframe has been successfully uploaded.")
print("Initial first train size: ", df1.shape)
print("Initial second train size: ", df2.shape)
print("Initial third train size: ", df3.shape)
print("Initial fourth train size: ", df4.shape)
print("Initial features size: ", feat.shape)

TARGET = "label" # 1=attack, 0=normal
RANDOM_STATE = 42

Input dataframe has been successfully uploaded.
Initial first train size:  (700000, 49)
Initial second train size:  (700000, 49)
Initial third train size:  (700000, 49)
Initial fourth train size:  (440043, 49)
Initial features size:  (49, 4)


In [17]:
# --- 1) features dosyasından sütun adlarını çek ---
feature_names = feat["Name"].astype(str).str.strip().tolist()
print("Sütun adı sayısı:", len(feature_names))

# --- 2) subsetlerin kolon adlarını ata ---
for d in (df1, df2, df3, df4):
    d.columns = feature_names

# --- 3) hepsini birleştir ---
df_full = pd.concat([df1, df2, df3, df4], ignore_index=True)
print("Birleşik şekil:", df_full.shape)

# --- 4) kaydetmek istersen ---
df_full.to_csv("UNSW-NB15_full.csv", index=False, encoding="utf-8")

Sütun adı sayısı: 49
Birleşik şekil: (2540043, 49)


In [18]:
df_full.head(10)

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
1,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0
2,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,,0
3,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,,0
4,59.166.0.0,32119,149.171.126.9,111,udp,CON,0.078339,568,312,31,...,0,2,4,2,3,1,1,2,,0
5,59.166.0.6,2142,149.171.126.4,53,udp,CON,0.001134,132,164,31,...,0,12,7,1,2,2,1,1,,0
6,10.40.182.3,0,10.40.182.3,0,arp,INT,0.0,46,0,0,...,0,2,2,2,2,2,2,2,,0
7,59.166.0.5,40726,149.171.126.6,53,udp,CON,0.001126,146,178,31,...,0,6,7,3,1,1,1,1,,0
8,59.166.0.7,12660,149.171.126.4,53,udp,CON,0.001167,132,164,31,...,0,6,7,2,1,1,1,1,,0
9,10.40.170.2,0,10.40.170.2,0,arp,INT,0.0,46,0,0,...,0,2,2,2,2,2,2,2,,0


In [19]:
df_full.replace(['', '-', 'no', '.'], np.nan, inplace=True)

NAs = pd.concat([df_full.isnull().sum()], axis=1, keys=["# Missing Data"])
print(NAs[NAs["# Missing Data"] > 0])

                  # Missing Data
sport                          2
dsport                         7
state                          8
service                  1246395
ct_flw_http_mthd         1348143
is_ftp_login             1429877
attack_cat               2218760


In [20]:
print(df_full.columns.tolist())


['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime', 'Ltime', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat', 'Label']


In [21]:
df_full = df_full.drop(columns=['service'])
df_full = df_full.drop(columns=['srcip'])
df_full = df_full.drop(columns=['dstip'])
df_full = df_full.drop(columns=['ct_flw_http_mthd'])
df_full = df_full.drop(columns=['is_ftp_login'])
df_full = df_full.dropna(subset=["state"])
df_full = df_full.dropna(subset=["sport"])
df_full = df_full.dropna(subset=["dsport"])

In [22]:
print("Train shape:", df_full.shape)

Train shape: (2540027, 44)


In [23]:
df_full.replace(['', '-', 'no', '.'], np.nan, inplace=True)


NAs = pd.concat([df_full.isnull().sum()], axis=1, keys=["# Missing Data"])
print(NAs[NAs["# Missing Data"] > 0])
print("------------")


            # Missing Data
attack_cat         2218744
------------


In [24]:
df_full['attack_cat'] = df_full['attack_cat'].fillna('Normal')

In [25]:
df_full.replace(['', '-', 'no', '.'], np.nan, inplace=True)


NAs = pd.concat([df_full.isnull().sum()], axis=1, keys=["# Missing Data"])
print(NAs[NAs["# Missing Data"] > 0])
print("------------")


Empty DataFrame
Columns: [# Missing Data]
Index: []
------------


In [26]:
# --- 2) Low-variance (sabit) sütun tespiti (train'e göre fit) ---
# eşiği ayarlayabilirsin; örn 0.95 = tek bir değer tüm satırların %95'inde mevcutsa sabit kabul et
LOW_VAR_THRESHOLD = 0.95 

# sütunları korumak istiyorsan ekle (etiket vs.)
protected_cols = {'label', 'attack_cat'}

low_var_cols = []
for c in df_full.columns:
    if c in protected_cols:
        continue
    nunique = df_full[c].nunique(dropna=True)
    if nunique <= 1:
        # tamamen sabit veya tek değerli
        low_var_cols.append(c)
        continue
    # top value oranı
    top_pct = df_full[c].value_counts(normalize=True, dropna=True).iloc[0]
    if top_pct >= LOW_VAR_THRESHOLD:
        low_var_cols.append(c)

# Eğer hiçbir düşük varyans sütunu yoksa liste boş olur
print(f"-> Train üzerinde düşük varyans adayları (threshold={LOW_VAR_THRESHOLD}): {len(low_var_cols)} sütun")
print(low_var_cols)

-> Train üzerinde düşük varyans adayları (threshold=0.95): 1 sütun
['is_sm_ips_ports']


In [27]:
# --- 3) Bu sütunları çıkar (varsa) ---
# (önce güvenlik: protected sütunların listeye girmediğinden emin ol)
low_var_cols_to_drop = [c for c in low_var_cols if c not in protected_cols]

dropped_cols = []

# drop et
if low_var_cols_to_drop:
    df_full = df_full.drop(columns=low_var_cols_to_drop, errors='ignore')
    dropped_cols += low_var_cols_to_drop

print("-> Drop işleminden sonra yeni şekiller:", df_full.shape)
print("-> Silinen sütunlar:", dropped_cols)


-> Drop işleminden sonra yeni şekiller: (2540027, 43)
-> Silinen sütunlar: ['is_sm_ips_ports']


In [28]:
df_full.replace(['', '-', 'no', '.'], np.nan, inplace=True)


NAs = pd.concat([df_full.isnull().sum()], axis=1, keys=["# Missing Data"])
print(NAs[NAs["# Missing Data"] > 0])
print("------------")


Empty DataFrame
Columns: [# Missing Data]
Index: []
------------


In [29]:
df_full.to_csv("C:\\Users\\Gökhan\\Desktop\\nids-adversarial\\data\\clear_data_full.csv", index=False, encoding="utf-8-sig")
#data seti kaydet