In [5]:
import pandas as pd
import glob
import os
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.utils import shuffle

In [9]:
# Load the benign capture plus every other raw CSV found in `data_dir`.
# All non-benign files are pooled into a single "malicious" frame.
data_dir = 'data'

benign_df = pd.read_csv(f'{data_dir}/benign_train.csv')

# The exclusion list holds the benign file and the four CSVs that the export
# step later in this notebook writes into the same directory, so a re-run does
# not ingest its own outputs as malicious data.
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of `malicious_df` (and therefore any seeded sampling from it) reproducible.
csv_files = sorted(
    f for f in os.listdir(data_dir)
    if f.endswith('.csv') and f not in [
        'benign_train.csv', 'train_X.csv', 'X_test.csv', 'y_test.csv', 'val_X.csv']
)

csv_dfs = []
for f in csv_files:
    df = pd.read_csv(os.path.join(data_dir, f))
    if 'label' in df.columns:
        # Missing labels are filled with 1, then only label == 1 rows are kept.
        df['label'] = df['label'].fillna(1)
        df = df[df['label'] == 1]
    # Files without a 'label' column are kept in full.
    csv_dfs.append(df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the directory holds no candidate files.
if not csv_dfs:
    raise ValueError(f"No malicious CSV files found in {data_dir!r}")

malicious_df = pd.concat(csv_dfs, ignore_index=True)
print("Benign shape:", benign_df.shape)
print("Malicious shape:", malicious_df.shape)

Benign shape: (15334261, 18)
Malicious shape: (11045641, 18)


In [None]:
# Split the benign data with two successive 80/20 splits:
# 20% held out for test, then the remaining 80% split again into
# train (64% of the total) and validation (16% of the total).
train_val_benign, test_benign = train_test_split(benign_df, test_size=0.2, random_state=42)
train_benign, val_benign = train_test_split(train_val_benign, test_size=0.2, random_state=42)

# Draw as many malicious rows as there are benign test rows so the test set is
# balanced. DataFrame.sample() raises if asked for more rows than exist, so the
# request is capped at the number of malicious rows actually available.
n_test_malicious = min(len(test_benign), len(malicious_df))
if n_test_malicious < len(test_benign):
    print(f"Only {n_test_malicious} malicious rows available; test set will not be balanced.")
test_malicious = malicious_df.sample(n=n_test_malicious, random_state=42)

# Labels: 0 = benign, 1 = malicious. Rows and labels are shuffled together so
# they stay aligned.
X_test = pd.concat([test_benign, test_malicious], ignore_index=True)
y_test = [0] * len(test_benign) + [1] * len(test_malicious)
X_test, y_test = shuffle(X_test, y_test, random_state=42)

In [None]:
# Impute missing values, min-max scale, and export the train / validation /
# test feature matrices. All statistics (medians, min/max) are learned from the
# benign training split only and then applied to the other splits.

# The training frame defines the feature set and column order.
# NOTE(review): if a 'label' column is present in the benign data it is being
# used as an input feature here - confirm that is intended.
train_columns = train_benign.columns

# fillna() already returns a new frame, so no explicit .copy() of the
# multi-million-row inputs is needed; the source splits are left unmodified.
median_values = train_benign.median()
train_X_df = train_benign.fillna(median_values)
# The test frame is a concat of benign and malicious rows; align it to the
# training columns before imputing.
X_test_df = X_test[train_columns].fillna(median_values)
val_benign_df = val_benign[train_columns].fillna(median_values)

# Fit on train only. With the default MinMaxScaler settings, validation/test
# values outside the training range are not clipped and may fall outside [0, 1].
scaler = MinMaxScaler()
train_X_scaled = scaler.fit_transform(train_X_df)
X_test_scaled = scaler.transform(X_test_df)
val_X_scaled = scaler.transform(val_benign_df)

# Sanity checks on the training matrix.
assert np.all(train_X_scaled >= 0) and np.all(train_X_scaled <= 1), "Train_X not scaled properly"
assert not np.isnan(train_X_scaled).any(), "NaNs found in scaled train_X"

# The scaled matrices are plain ndarrays, so the CSV headers are positional
# integers (0..n-1) rather than the original column names.
pd.DataFrame(train_X_scaled).to_csv(f"{data_dir}/train_X.csv", index=False)
pd.DataFrame(X_test_scaled).to_csv(f"{data_dir}/X_test.csv", index=False)
pd.Series(y_test).to_csv(f"{data_dir}/y_test.csv", index=False)
pd.DataFrame(val_X_scaled).to_csv(f"{data_dir}/val_X.csv", index=False)

print("Train X shape:", train_X_scaled.shape, "| min:", train_X_scaled.min(), "| max:", train_X_scaled.max())
print("Test X shape:", X_test_scaled.shape)
print("Val X shape:", val_X_scaled.shape)
print("Test label distribution:")
print(pd.Series(y_test).value_counts())

Train X shape: (9813926, 18) | min: 0.0 | max: 1.0
Test X shape: (6133706, 18)
Val X shape: (2453482, 18)
Test label distribution:
0    3066853
1    3066853
Name: count, dtype: int64
