In [169]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

In [None]:
train_path = os.path.join('Training and Testing Sets', 'UNSW_NB15_training-set.csv')
df_train = pd.read_csv(train_path, low_memory=False).drop('id', axis=1)

In [None]:
num_cols = df_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = df_train.select_dtypes(include=['object']).columns.tolist()


40


In [117]:
# print(df_train.info())

In [153]:
df_num = df_train[num_cols].copy()

corr_matrix = df_num.corr().abs()


np.fill_diagonal(corr_matrix.values, 0)
correlation_matrix = df_num.corr()
correlation_with_target = correlation_matrix['label'].sort_values(ascending=False)
# print(correlation_with_target)

threshold = 0.9
high_corr_num_pairs = np.where(corr_matrix > threshold)

high_corr_features = [(corr_matrix.index[i], corr_matrix.columns[j], corr_matrix.iloc[i, j]) 
                      for i, j in zip(*high_corr_num_pairs) if i < j]
high_corr_num_features = list()

for f1, f2, corr_value in high_corr_features:
    corr_f1 = abs(correlation_with_target[f1])
    corr_f2 = abs(correlation_with_target[f2])

    # print(f"{f1} ↔ {f2}: {corr_value:.3f}")
    # print(f"{f1} correlation with target: {corr_f1:.3f}")
    # print(f"{f2} correlation with target: {corr_f2:.3f}")

    # Drop the feature with lower correlation with the target
    if corr_f1 > corr_f2:
        high_corr_num_features.append(f2)
    else:
        high_corr_num_features.append(f1)
    # print(f"Dropping {f2 if corr_f1 > corr_f2 else f1}\n")



low_corr_num_features = correlation_with_target[correlation_with_target.abs() < 0.03].index.tolist()

# print(f"High correlation features: {high_corr_num_features}")
# print(f"Low correlation features: {low_corr_num_features}")

# print(f"Number of features to drop: {len(set(high_corr_num_features + low_corr_num_features))}")


In [145]:
low_corr_num_features = correlation_with_target[correlation_with_target.abs() < 0.03].index.tolist()

# df_num_decor = df_num.drop(columns=low_corr_num_features)
# print(df_num_decor.columns)



In [133]:
df_cat = df_train[cat_cols].copy()
low_var_cats = {feature : df_cat[feature].value_counts(normalize=True).max() for feature in df_cat.columns}
sorted_vars = dict(sorted(low_var_cats.items(), key=lambda item: item[1], reverse = True))


# Display features with very low variance
print("Low-variance categorical features (over 95% same value):")
for k, v in sorted_vars.items():
    print(f"{k}: {v:.2%} same value")

Low-variance categorical features (over 95% same value):
service: 53.71% same value
state: 46.92% same value
proto: 45.59% same value
attack_cat: 31.94% same value


In [164]:
values_nb = [len(df_cat[feature].value_counts()) for feature in df_cat.columns]
print(sum(values_nb))

165


In [123]:
def drop_low_corr_features(X,cols):
    return X.drop(columns=cols, errors='ignore')


In [174]:
num_transformer = Pipeline(steps=[
    ('drop_low_corr', FunctionTransformer(drop_low_corr_features, kw_args={'cols': high_corr_num_features+low_corr_num_features}, validate=False)),
    ('scaler', StandardScaler())
])
cat_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer,  [col for col in num_cols if col != 'label']),
        ('cat', cat_transformer, cat_cols)
    ]
)



In [None]:
# preprocessor.fit(df_train)
# X_num_dropped = drop_low_corr_features(df_train[num_cols], 
#                                        cols=high_corr_num_features + low_corr_num_features)
# remaining_num_cols = X_num_dropped.columns.tolist()
# cat_feature_names = preprocessor.named_transformers_['cat']\
#     .named_steps['onehot'].get_feature_names_out(cat_cols)
# print(len(cat_feature_names))
# all_features = remaining_num_cols + cat_feature_names.tolist()

# print("Final features after preprocessing:")
# print(len(remaining_num_cols))

Final features after preprocessing:
20


In [177]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(n_estimators=100,verbose=1, random_state=42, n_jobs=-1))
]
)

In [179]:
X_train, y_train = df_train.drop(columns='label'), df_train['label']

In [184]:
pipeline.fit(X_train, y_train)




[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.4s finished


In [None]:
test_path = os.path.join('Training and Testing Sets', 'UNSW_NB15_testing-set.csv')
df_test = pd.read_csv(train_path, low_memory=False).drop('id', axis=1)
X_test, y_test = df_test.drop(columns='label'), df_test['label']


Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat'],
      dtype='object')


In [187]:
print("Accuracy:", pipeline.score(X_test, y_test))

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.1s


Accuracy: 1.0


[Parallel(n_jobs=6)]: Done 100 out of 100 | elapsed:    0.5s finished
