# Network attacks - Preparation de données

## I - Modules

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os



## II - Charger donnée

In [2]:
def load_dataset_keeping_proportions(file_path, size):
    def stratified_sample(df, size_frac):
        if 'label' in df.columns:
            # Use train_test_split from sklearn to reduce the dataset while keeping label proportions
            _, df_reduced = train_test_split(df, test_size=size_frac, stratify=df['label'], random_state=42)
            return df_reduced
        else:
            # If 'label' column is not present, return a regular sampled subset
            return df.sample(frac=size_frac, random_state=42)

    # Map size input to fractions
    size_map = {"full": 1, "/2": 0.5, "/4": 0.25, "/10": 0.1, "/100": 0.01, "/1000": 0.001}

    # Load the dataset
    df = pd.read_csv(file_path)

    # Apply stratified sampling if size is not 'full'
    size_frac = size_map.get(size, 1)  # Default to 'full' if size not recognized
    if size != "full":
        df = stratified_sample(df, size_frac)

    return df



In [3]:
# Applying the function to each file
df_normal = load_dataset_keeping_proportions("network/normal.csv", "/1000")  
df_attack_1 = load_dataset_keeping_proportions("network/attack_1.csv", "/1000")
df_attack_2 = load_dataset_keeping_proportions("network/attack_2.csv", "/1000")
df_attack_3 = load_dataset_keeping_proportions("network/attack_3.csv", "/1000")
df_attack_4 = load_dataset_keeping_proportions("network/attack_4.csv", "/1000")

#all_data_concat_df = pd.concat([df_normal, df_attack_1, df_attack_2, df_attack_3, df_attack_4], ignore_index=True)

## III - Nettoyer donnée

Nous allons nettoyer la donnée avec les informations obtenus à partir de l'exploration

### 1 - Supprimer espaces superflus

Certaines colonnes sont écrites " sport" et d'autres "sport", supprimons les espaces.

In [7]:
def del_espace(string):
    ret = ""
    for i in range(len(string)):
        if string[i]!=" ":
            ret+=string[i]
    return ret


In [8]:
def clean_espace(df):
    df_ret = df.copy()
    for col in df_ret.columns:
        df_ret = df_ret.rename(columns={col:del_espace(col)})
    return df_ret

In [None]:
df_attack_1.columns

Index(['Time', ' mac_s', ' mac_d', ' ip_s', ' ip_d', ' sport', ' dport',
       ' proto', ' flags', ' size', ' modbus_fn', ' n_pkt_src', ' n_pkt_dst',
       ' modbus_response', ' label_n', ' label'],
      dtype='object')

In [None]:
cleaned_attack_1 = clean_espace(df_attack_1)
cleaned_attack_1.columns

Index(['Time', 'mac_s', 'mac_d', 'ip_s', 'ip_d', 'sport', 'dport', 'proto',
       'flags', 'size', 'modbus_fn', 'n_pkt_src', 'n_pkt_dst',
       'modbus_response', 'label_n', 'label'],
      dtype='object')

Cela a fonctionné. Applicons cela à tous les dataset.

In [None]:
cleaned_normal = clean_espace(df_normal)
cleaned_attack_2 = clean_espace(df_attack_2)
cleaned_attack_3 = clean_espace(df_attack_3)
cleaned_attack_4 = clean_espace(df_attack_4)
#all_data_concat_df = clean_espace(all_data_concat_df)

In [None]:
# on drop ce dont on a pas besoin de la ram
del df_normal
del df_attack_1
del df_attack_2
del df_attack_3
del df_attack_4

Vérifions que tous les dataset ont désormais les mêmes colonnes : 

In [None]:
print(list(cleaned_normal.columns)==list(cleaned_attack_2.columns) and
      list(cleaned_attack_1.columns)==list(cleaned_attack_2.columns) and 
      list(cleaned_attack_1.columns)==list(cleaned_attack_3.columns) and 
      list(cleaned_attack_1.columns)==list(cleaned_attack_3.columns))

True


### 2 - Supprimer de colonnes (par exemple catégorielles, temporelles, ou fortement corrélées)

Cela peut être utile de supprimer les données catégorielles qui ne peuvent pas être prises en compte par certains algorithms

In [9]:
def clean_category(df):
    df_ret = df.drop("Time",axis=1)
    df_ret = df_ret.drop("mac_s",axis=1)
    df_ret = df_ret.drop("mac_d",axis=1)
    df_ret = df_ret.drop("ip_s",axis=1)
    df_ret = df_ret.drop("ip_d",axis=1)
    df_ret = df_ret.drop("modbus_fn",axis=1)
    df_ret = df_ret.drop("modbus_response",axis=1)
    df_ret = df_ret.drop("label_n",axis=1)
    df_ret = df_ret.drop("proto",axis=1)

    # Tout ça c'est corrélé à s_port
    df_ret = df_ret.drop("n_pkt_src",axis=1)
    df_ret = df_ret.drop("n_pkt_dst",axis=1)
    df_ret = df_ret.drop("dport",axis=1)

    


    return df_ret

In [None]:
cleaned_normal = clean_category(cleaned_normal)
cleaned_attack_1 = clean_category(cleaned_attack_1)
cleaned_attack_2 = clean_category(cleaned_attack_2)
cleaned_attack_3 = clean_category(cleaned_attack_3)
cleaned_attack_4 = clean_category(cleaned_attack_4)
#all_data_concat_df = clean_category(all_data_concat_df)

Vérifions :

In [None]:
cleaned_normal.head()

Unnamed: 0,sport,flags,size,label
5473660,61516,11000,66,normal
4654798,502,11000,65,normal
1307506,61517,11000,66,normal
1575326,502,11000,65,normal
7189223,502,11000,64,normal


In [None]:
cleaned_attack_1.head()

Unnamed: 0,sport,flags,size,label
2987365,502.0,11000.0,64,MITM
3302474,56667.0,11000.0,66,normal
2289618,502.0,11000.0,65,normal
3634355,502.0,11000.0,65,physical fault
1682246,502.0,11000.0,64,normal


### 3 - Supprimer les NaN

In [None]:
for k in cleaned_attack_1.columns:
    print(k, cleaned_attack_1[k].isna().any())

sport False
flags False
size False
label False


In [None]:
# A voir pour ne pas drop tous les NaN, ils sont peut être porteur d'information
cleaned_normal = cleaned_normal.dropna(subset=["sport"])
cleaned_attack_1 = cleaned_attack_1.dropna(subset=["sport"])
cleaned_attack_2 = cleaned_attack_2.dropna(subset=["sport"])
cleaned_attack_3 = cleaned_attack_3.dropna(subset=["sport"])
cleaned_attack_4 = cleaned_attack_4.dropna(subset=["sport"])
#all_data_concat_df = all_data_concat_df.dropna(subset=["sport"])

Vérifions que ça a retiré tous les NaN :

In [None]:
# On ne retire pas forcément tous les NaN 
# for k in cleaned_attack_1.columns:
#     print(k, cleaned_attack_1[k].isna().any())

In [None]:
# On s'assure que la colonne d'indice n'ets pas compté comme une colonne
cleaned_normal.head()
cleaned_attack_1.reset_index(drop=True, inplace=True)
cleaned_attack_2.reset_index(drop=True, inplace=True)
cleaned_attack_3.reset_index(drop=True, inplace=True)
cleaned_attack_4.reset_index(drop=True, inplace=True)
#all_data_concat_df.reset_index(drop=True, inplace=True)



In [10]:
# Definition ud'un fonction de préparation (qui effectue toutes les opérations de nettoyage)

def prepare(df):
    def_ret = clean_espace(df)
    def_ret = clean_category(def_ret)
    def_ret = def_ret.dropna(subset=["sport"])
    def_ret.reset_index(drop=True, inplace=True) 

    return def_ret

## IV - Enregistrer données nettoyées

In [11]:
# create the directory if it doesn't exist
if not os.path.exists("network/all_data_concat"):
    os.makedirs("network/all_data_concat")

if not os.path.exists("network/all_data_concat/full_size"):
    os.makedirs("network/all_data_concat/full_size")

if not os.path.exists("network/all_data_concat/divided_by_10"):
    os.makedirs("network/all_data_concat/divided_by_10")

if not os.path.exists("network/all_data_concat/divided_by_100"):
    os.makedirs("network/all_data_concat/divided_by_100")

if not os.path.exists("network/all_data_concat/divided_by_1000"):
    os.makedirs("network/all_data_concat/divided_by_1000")

if not os.path.exists("preparation/network"):
    os.makedirs("preparation/network")

if not os.path.exists("preparation/network/all_data_concat"):
    os.makedirs("preparation/network/all_data_concat")


In [None]:
cleaned_normal.to_csv("preparation/network_normal.csv", index=False)
cleaned_attack_1.to_csv("preparation/network_1.csv", index=False)
cleaned_attack_2.to_csv("preparation/network_2.csv", index=False)
cleaned_attack_3.to_csv("preparation/network_3.csv", index=False)
cleaned_attack_4.to_csv("preparation/network_4.csv", index=False)

In [None]:
# drop the datasets that are not needed anymore
del cleaned_normal
del cleaned_attack_2
del cleaned_attack_3
del cleaned_attack_4

In [None]:
df = pd.read_csv("network/attack_1.csv")

In [12]:
# enregistrement des données concaténées de toutes les tailles

#sizes = ["full", "/10", "/100", "/1000"]
sizes = ["/1000"]
#file_names = ["full_size", "divided_by_10", "divided_by_100", "divided_by_1000"]
file_names = ["divided_by_1000"]

size_to_name = dict(zip(sizes, file_names))

file_paths = [
    "network/normal.csv",
    "network/attack_1.csv",
    "network/attack_2.csv",
    "network/attack_3.csv",
    "network/attack_4.csv"
]

for size in sizes:
    # Initialize an empty DataFrame for concatenated data
    cleaned_all_data_concat = pd.DataFrame()

    for file_path in file_paths:
        # Load and prepare each dataset
        df = load_dataset_keeping_proportions(file_path, size)
        prepared_df = prepare(df)

        # Concatenate to the cumulative DataFrame
        cleaned_all_data_concat = pd.concat([cleaned_all_data_concat, prepared_df], ignore_index=True)
    
    # Save the concatenated and prepared data to CSV
    #cleaned_all_data_concat.to_csv("preparation/network/all_data_concat/" + size_to_name[size] + "all_data_concat.csv", index=False)
    cleaned_all_data_concat.to_csv("preparation/network/all_data_concat/all_data_concat_" + size_to_name[size] + ".csv", index=False)
    del cleaned_all_data_concat
    

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.