# 01_EDA

## Imports

In [3]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import umap

## Read data

In [4]:
# Load the raw capture and its windowed counterpart, then report their sizes.
raw_df, windowed_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/dataset_10k_final.csv',
        '../data/processed/windowed_dataset.csv',
    )
)
print("Raw dataset shape:", raw_df.shape)
print("Windowed dataset shape:", windowed_df.shape)


Raw dataset shape: (9277, 33)
Windowed dataset shape: (24005, 94)


In [5]:
# Share of normal (0) vs. attack (1) rows in each dataset. Reindexing on [0, 1]
# keeps both classes present (as 0) even if one of them never occurs.
prop_raw = raw_df['is_attack'].value_counts(normalize=True).reindex([0, 1], fill_value=0)
prop_win = windowed_df['is_attack'].value_counts(normalize=True).reindex([0, 1], fill_value=0)

datasets = ['Raw Dataset', 'Windowed Dataset']
normal_vals = [prop_raw[0], prop_win[0]]
attack_vals = [prop_raw[1], prop_win[1]]

# Grouped bars: one group per dataset, one bar per class.
fig = go.Figure()
fig.add_bar(x=datasets, y=normal_vals, name='Normal')
fig.add_bar(x=datasets, y=attack_vals, name='Ataque')

fig.update_layout(
    title="Proporción de ataques en los datasets",
    barmode='group',
    yaxis=dict(range=[0, 1]),  # proportions, so pin the axis to the full 0-1 range
)

# The export fails if the target folder is missing (e.g. on a fresh checkout),
# so create it before writing the image.
figure_path = Path("../reports/figures/dataset_proportion.png")
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_image(
    str(figure_path),
    width=1000,  # layout width in pixels
    height=600,  # layout height in pixels
    scale=2,     # resolution multiplier applied on top of width/height
)
fig.show()

## Null values

In [6]:
# Boolean mask of missing cells, computed once and reused for both statistics.
null_mask = windowed_df.isnull()

# Absolute number of nulls per column.
null_counts = null_mask.sum()

# Percentage of nulls per column.
null_percent = null_mask.mean() * 100

# Combine both into one table, most incomplete columns first.
null_summary = pd.DataFrame({
    'nulos': null_counts,
    'porcentaje_nulos (%)': null_percent
}).sort_values(by='porcentaje_nulos (%)', ascending=False)

# Last expression of the cell: rendered with the notebook's rich display.
null_summary

                             nulos  porcentaje_nulos (%)
time_since_last_conn_std     21937             91.385128
bytes_ratio_std              21936             91.380962
interval_stddev_std          21936             91.380962
conn_count_60s_std           21936             91.380962
ip_first_seen_hours_ago_std  21936             91.380962
...                            ...                   ...
conn_count_10s_max               0              0.000000
conn_count_10s_mean              0              0.000000
conn_interval_max                0              0.000000
conn_interval_mean               0              0.000000
is_attack                        0              0.000000

[94 rows x 2 columns]


## Data structure visualization

In [11]:
# Load the cleaned windowed dataset and split it into features and target.
df = pd.read_csv('../data/processed/windowed_dataset_cleaned.csv')
X = df.drop(columns=['is_attack'])
y = df['is_attack']

# NOTE(review): the features are passed to UMAP as loaded and the metric is
# euclidean; if the cleaned CSV is not already scaled, large-magnitude columns
# may dominate the embedding -- confirm this is intended.

# -----------------------------
# UMAP embedding
# -----------------------------
reducer = umap.UMAP(
    n_neighbors=30,
    min_dist=0.1,
    n_components=2,
    metric='euclidean',
    random_state=42,  # fixed seed; UMAP then runs single-threaded (see the n_jobs warning in the output)
)

X_umap = reducer.fit_transform(X)

# -----------------------------
# Build the DataFrame for Plotly
# -----------------------------
df_plot = pd.DataFrame({
    'UMAP1': X_umap[:, 0],
    'UMAP2': X_umap[:, 1],
    'is_attack': y
})

# Human-readable class names for the legend.
df_plot['Label'] = df_plot['is_attack'].map({0: 'Normal', 1: 'Attack'})

# -----------------------------
# Interactive scatter plot
# -----------------------------
fig = px.scatter(
    df_plot,
    x='UMAP1',
    y='UMAP2',
    color='Label',
    color_discrete_map={'Normal': '#1f77b4', 'Attack': '#ff7f0e'},
    hover_data=['is_attack'],
    title="UMAP projection – colored by is_attack",
    width=900,
    height=700
)

fig.update_traces(marker=dict(size=6, opacity=0.8))
fig.update_layout(legend_title_text='Class')
fig.show()


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


Graph is not fully connected, spectral embedding may not work as expected.

