In [7]:
# ----------------------------
# 0. Uvoz knjižnic
# ----------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# ----------------------------
# 1. Load the data
# ----------------------------
df = pd.read_csv("../data/simulation_security_labels_n-1.csv")

# Parse the timestamp column, then print a few sanity checks on it:
# covered period, the first five values, and whether the rows are
# ordered by increasing time.
df['timestamp'] = pd.to_datetime(df['timestamp'])
ts = df['timestamp']
ts_first, ts_last = ts.min(), ts.max()
print("Časovno obdobje:", ts_first, "→", ts_last)
print("Prvih 5 timestampov:")
print(ts.head(5))
print("Ali so naraščajoči?", ts.is_monotonic_increasing)

# ----------------------------
# 2. Basic cleaning
# ----------------------------

# Drop 'timestamp'; it is not used as a model feature.
df = df.drop(columns=['timestamp'])

# Report missing values BEFORE any imputation. Running this check after
# fillna() would always print an empty result and hide the real gaps.
missing = df.isnull().sum()
print("Stolpci z manjkajočimi vrednostmi:\n", missing[missing > 0])

# Encode the label numerically. Series.map turns every value that is not a
# key of the dict (including a missing label) into NaN.
df['status'] = df['status'].map({'secure': 0, 'insecure': 1})

# Make dropped rows visible instead of discarding them silently.
n_unlabeled = int(df['status'].isna().sum())
if n_unlabeled:
    print("Odstranjene vrstice brez veljavne oznake 'status':", n_unlabeled)

df = (
    df.dropna(subset=['status'])       # rows without a valid label are unusable
      .astype({'status': 'int64'})     # no NaN left, so keep an integer label dtype
      .fillna(-1)                      # remaining gaps are in features: -1 placeholder
)

# ----------------------------
# 3. Separate features (X) and label (y)
# ----------------------------
# y is the 'status' column; X is every other column of df.
y = df['status']
X = df.drop('status', axis=1)

# ----------------------------
# 4. Train/test split
# ----------------------------
# 20% of the rows go to the test set; the split is stratified on y and
# seeded with random_state=42.
split_parts = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

for caption, features in (("X_train shape:", X_train), ("X_test shape:", X_test)):
    print(caption, features.shape)
print("y_train razmerje:\n", y_train.value_counts())

# ----------------------------
# (5) Save the prepared data (optional)
# ----------------------------
# Each part is written to its own CSV file without the index column.
outputs = (
    (X_train, "../data/X_train.csv"),
    (X_test, "../data/X_test.csv"),
    (y_train, "../data/y_train.csv"),
    (y_test, "../data/y_test.csv"),
)
for part, path in outputs:
    part.to_csv(path, index=False)

print("Podatki pripravljeni in shranjeni.")


Časovno obdobje: 2023-01-01 00:00:00 → 2023-12-31 23:00:00
Prvih 5 timestampov:
0   2023-01-01 00:00:00
1   2023-01-01 01:00:00
2   2023-01-01 02:00:00
3   2023-01-01 03:00:00
4   2023-01-01 04:00:00
Name: timestamp, dtype: datetime64[ns]
Ali so naraščajoči? False
Stolpci z manjkajočimi vrednostmi:
 Series([], dtype: int64)
X_train shape: (7015, 271)
X_test shape: (1754, 271)
y_train razmerje:
 status
0    3597
1    3418
Name: count, dtype: int64
Podatki pripravljeni in shranjeni.
