# 02_Preprocessing

## Imports

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from sklearn.preprocessing import RobustScaler

## Load Data

In [25]:
df= pd.read_csv('../data/processed/windowed_dataset.csv')

## Clean columns

In [26]:
# Borrar las columnas sin variabilidad
df = df.loc[:, df.nunique(dropna=False) > 1]

In [27]:
# Cuando solo existe una conexi칩n, la desviaci칩n est치ndar es NaN, lo cambiamos por 0
std_cols = [c for c in df.columns if c.endswith('_std')]
df[std_cols] = df[std_cols].fillna(0.0)

# Rellenar con 0 ya que representan las primeras conexiones, no existe last connexion
cols = [
    'time_since_last_conn_mean',
    'time_since_last_conn_max'
]

df[cols] = df[cols].fillna(0.0)


In [28]:
null_counts = df.isnull().sum()

null_percent = df.isnull().mean() * 100

null_summary = pd.DataFrame({
    'nulos': null_counts,
    'porcentaje_nulos (%)': null_percent
}).sort_values(by='porcentaje_nulos (%)', ascending=False)

print(null_summary)

                              nulos  porcentaje_nulos (%)
id.orig_h                         0                   0.0
ip_first_seen_hours_ago_mean      0                   0.0
is_known_ip_std                   0                   0.0
is_known_ip_mean                  0                   0.0
unique_ja3_from_ip_std            0                   0.0
...                             ...                   ...
conn_interval_mean                0                   0.0
conn_state_encoded_max            0                   0.0
conn_state_encoded_std            0                   0.0
conn_state_encoded_mean           0                   0.0
is_attack                         0                   0.0

[83 rows x 2 columns]


## Drop main columns

In [29]:
id_cols = ['id.orig_h', 'window_start', 'window_end']
label_col = ('is_attack_any', 'is_attack_majority', 'is_attack_ratio', 'is_attack_count', 'attack_phase_nunique')

In [30]:
X = df.drop(columns=id_cols + list(label_col))
y = df['is_attack']  # SOLO para evaluaci칩n
print("Features shape:", X.shape)
print(X['n_connections'].unique())

Features shape: (24005, 75)
[ 7  2  5 14  4  3  1  6  9 11 12]


## Normalization & Standarization

In [31]:
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
X_scaled_df.to_csv('../data/processed/windowed_dataset_cleaned.csv', index=False)
print(X_scaled_df['n_connections'].unique())

[ 6.  1.  4. 13.  3.  2.  0.  5.  8. 10. 11.]
