# Network attacks - Préparation des données

## I - Modules

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split

## II - Charger les données

In [None]:
def load_dataset_keeping_proportions(size, dataset_path):
    """Load a CSV dataset, optionally reduced while keeping label proportions.

    The whole file is read first: the label proportions must be known before
    they can be preserved, which can be memory-heavy on large files.

    Args:
        size: One of "full", "/2", "/4", "/10", "/100", "/1000". Any other
            value is treated like "full".
        dataset_path: Path of the CSV file, handed to ``pd.read_csv``.

    Returns:
        A DataFrame with either every row or the requested fraction of rows.
        When a label column is found, the reduction is stratified on it with
        sklearn's ``train_test_split``; otherwise rows are drawn with
        ``DataFrame.sample``. Both use ``random_state=42``.
    """
    # Map size input to fractions
    size_map = {"full": 1.0, "/2": 0.5, "/4": 0.25, "/10": 0.1, "/100": 0.01, "/1000": 0.001}

    def find_label_column(df):
        # Prefer an exact 'label' column; otherwise accept a name that only
        # differs by surrounding whitespace (e.g. ' label').
        if 'label' in df.columns:
            return 'label'
        return next((col for col in df.columns if str(col).strip() == 'label'), None)

    def stratified_sample(df, size_frac):
        label_col = find_label_column(df)
        if label_col is None:
            # No label column: fall back to a plain random subset
            return df.sample(frac=size_frac, random_state=42)
        # The "test" half returned by train_test_split is the reduced dataset
        _, df_reduced = train_test_split(
            df, test_size=size_frac, stratify=df[label_col], random_state=42
        )
        return df_reduced

    # Load the dataset
    df = pd.read_csv(dataset_path)

    # Unrecognised sizes default to the full dataset
    size_frac = size_map.get(size, 1.0)

    # A fraction of 1 means "keep everything" and must never reach
    # train_test_split, where an integer test_size of 1 means "one row".
    if size_frac >= 1:
        return df

    return stratified_sample(df, size_frac)


In [None]:
# df_normal = load_dataset_keeping_proportions("/100", "network/normal.csv")
# df_attack_1 = load_dataset_keeping_proportions("/100", "network/attack_1.csv")
# df_attack_2 = load_dataset_keeping_proportions("/100", "network/attack_2.csv")
# df_attack_3 = load_dataset_keeping_proportions("/100", "network/attack_3.csv")
# df_attack_4 = load_dataset_keeping_proportions("/100", "network/attack_4.csv")

In [22]:
def load_data(size, data_dir="network"):
    """Load the normal capture and the four attack captures, optionally thinned.

    Thinning is systematic, not random: for a step of N, only the file lines
    whose 0-based index is a multiple of N are read. Line 0 (the header) is
    therefore always kept, and the data rows read are file lines N, 2N, 3N, ...

    Args:
        size: "full" to read everything, or one of "/2", "/4", "/10", "/100",
            "/1000" to keep one line out of 2, 4, 10, 100 or 1000.
        data_dir: Directory containing normal.csv and attack_1.csv through
            attack_4.csv. Defaults to "network".

    Returns:
        A tuple ``(df_normal, df_attack_1, df_attack_2, df_attack_3,
        df_attack_4)`` of DataFrames.

    Raises:
        ValueError: If ``size`` is not one of the supported values.
    """
    steps = {"full": 1, "/2": 2, "/4": 4, "/10": 10, "/100": 100, "/1000": 1000}
    if size not in steps:
        raise ValueError(
            f"Unsupported size {size!r}; expected one of {list(steps)}"
        )
    step = steps[size]

    # "full" reads the files without any skiprows filter.
    read_options = {} if step == 1 else {"skiprows": lambda row: row % step != 0}

    file_names = ("normal.csv", "attack_1.csv", "attack_2.csv", "attack_3.csv", "attack_4.csv")
    return tuple(
        pd.read_csv(f"{data_dir}/{file_name}", **read_options)
        for file_name in file_names
    )

In [23]:
# "/100" makes load_data keep one file line out of every 100, so the rest of
# the notebook works on a reduced sample of each capture.
df_normal, df_attack_1, df_attack_2, df_attack_3, df_attack_4 = load_data("/100")

## III - Nettoyer les données

Nous allons nettoyer les données à l'aide des informations obtenues lors de l'exploration.

### 1 - Supprimer espaces superflus

Certaines colonnes sont écrites " sport" et d'autres "sport", supprimons les espaces.

In [24]:
def del_espace(string):
    """Return ``string`` with every space character (" ") removed.

    Only the plain space is dropped, wherever it appears; tabs, newlines and
    any other whitespace are left in place.
    """
    return "".join(char for char in string if char != " ")


In [25]:
def clean_espace(df):
    """Return a copy of ``df`` whose column names have all spaces removed.

    The input DataFrame is left untouched. Each column label is passed through
    ``del_espace``, so " sport" becomes "sport".

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        A new DataFrame with the same data and the cleaned column names.
    """
    # rename() returns a new DataFrame and applies the callable to every
    # column label, so the frame is rebuilt once instead of once per column.
    return df.rename(columns=del_espace)

In [26]:
# Display the raw column names; in the output below every name except 'Time'
# starts with a space.
df_attack_1.columns

Index(['Time', ' mac_s', ' mac_d', ' ip_s', ' ip_d', ' sport', ' dport',
       ' proto', ' flags', ' size', ' modbus_fn', ' n_pkt_src', ' n_pkt_dst',
       ' modbus_response', ' label_n', ' label'],
      dtype='object')

In [27]:
# Try the cleaning on a single dataset first and display the resulting names.
cleaned_attack_1 = clean_espace(df_attack_1)
cleaned_attack_1.columns

Index(['Time', 'mac_s', 'mac_d', 'ip_s', 'ip_d', 'sport', 'dport', 'proto',
       'flags', 'size', 'modbus_fn', 'n_pkt_src', 'n_pkt_dst',
       'modbus_response', 'label_n', 'label'],
      dtype='object')

Cela a fonctionné. Appliquons cela à tous les datasets.

In [28]:
# Remove the spaces from the column names of the four datasets that have not
# been cleaned yet (df_attack_1 was handled in the cell above).
cleaned_normal, cleaned_attack_2, cleaned_attack_3, cleaned_attack_4 = [
    clean_espace(frame)
    for frame in (df_normal, df_attack_2, df_attack_3, df_attack_4)
]

Vérifions que tous les datasets ont désormais les mêmes colonnes :

In [29]:
# All five cleaned datasets must expose the same columns, in the same order.
# cleaned_attack_1 is the reference and every other dataset, including
# cleaned_attack_4, is compared against it.
reference_columns = list(cleaned_attack_1.columns)
print(all(
    list(frame.columns) == reference_columns
    for frame in (cleaned_normal, cleaned_attack_2, cleaned_attack_3, cleaned_attack_4)
))

True


### 2 - Supprimer des colonnes (par exemple catégorielles, temporelles, ou fortement corrélées)

Il peut être utile de supprimer les données catégorielles, car certains algorithmes ne peuvent pas les prendre en compte.

In [30]:
def clean_category(df):
    """Return a copy of ``df`` without the columns excluded from later steps.

    The input DataFrame is not modified. "Time", "proto" and "flags" are
    currently kept: their removal was considered by the original author
    ("flags" because it probably reveals the attack type) but is disabled.

    Args:
        df: DataFrame containing every column listed below.

    Returns:
        A new DataFrame with those columns removed.

    Raises:
        KeyError: If one of the columns to remove is missing from ``df``.
    """
    dropped_columns = [
        # Address / protocol-detail columns and the numeric label
        "mac_s",
        "mac_d",
        "ip_s",
        "ip_d",
        "modbus_fn",
        "modbus_response",
        "label_n",
        # Described by the original author as correlated with sport
        "n_pkt_src",
        "n_pkt_dst",
        "dport",
    ]
    return df.drop(columns=dropped_columns)

In [31]:
# Remove the unwanted columns from every dataset.
# NOTE(review): the results are bound back to the same names, so running this
# cell twice in a row fails because the columns to drop are already gone.
(
    cleaned_normal,
    cleaned_attack_1,
    cleaned_attack_2,
    cleaned_attack_3,
    cleaned_attack_4,
) = [
    clean_category(frame)
    for frame in (
        cleaned_normal,
        cleaned_attack_1,
        cleaned_attack_2,
        cleaned_attack_3,
        cleaned_attack_4,
    )
]

Vérifions :

In [32]:
# Preview the first rows of the normal capture after column removal.
cleaned_normal.head()

Unnamed: 0,Time,sport,proto,flags,size,label
0,2021-04-09 11:30:52.761572,502,Modbus,11000,65,normal
1,2021-04-09 11:30:52.808170,61517,Modbus,11000,66,normal
2,2021-04-09 11:30:52.848609,502,Modbus,11000,65,normal
3,2021-04-09 11:30:52.889732,502,Modbus,11000,65,normal
4,2021-04-09 11:30:52.936601,502,Modbus,11000,64,normal


In [33]:
# Preview the first rows of attack_1 after column removal.
# NOTE(review): sport and flags display as floats here but as integers for the
# normal capture; presumably because these two columns contain NaN in this
# dataset (see the NaN check below) - confirm.
cleaned_attack_1.head()

Unnamed: 0,Time,sport,proto,flags,size,label
0,2021-04-09 18:23:28.430018,56666.0,Modbus,11000.0,66,normal
1,2021-04-09 18:23:28.475587,502.0,Modbus,11000.0,64,normal
2,2021-04-09 18:23:28.523120,56668.0,Modbus,11000.0,66,normal
3,2021-04-09 18:23:28.568588,502.0,Modbus,11000.0,65,normal
4,2021-04-09 18:23:28.617394,502.0,Modbus,11000.0,65,normal


### 3 - Supprimer les NaN

In [34]:
# Report, column by column, whether cleaned_attack_1 contains at least one NaN.
for column_name in cleaned_attack_1.columns:
    has_nan = cleaned_attack_1[column_name].isna().any()
    print(column_name, has_nan)

Time False
sport True
proto False
flags True
size False
label False


In [35]:
# On drop pas les NaN restants, ils sont peut être porteur d'information
# cleaned_normal = cleaned_normal.dropna(subset=["sport"])
# cleaned_attack_1 = cleaned_attack_1.dropna(subset=["sport"])
# cleaned_attack_2 = cleaned_attack_2.dropna(subset=["sport"])
# cleaned_attack_3 = cleaned_attack_3.dropna(subset=["sport"])
# cleaned_attack_4 = cleaned_attack_4.dropna(subset=["sport"])

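Les NaN ne sont finalement pas supprimés (la cellule précédente est commentée) ; la sortie ci-dessous montre que `sport` et `flags` en contiennent toujours :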

In [36]:
# On ne retire pas forcément tous les NaN 
# for k in cleaned_attack_1.columns:
#     print(k, cleaned_attack_1[k].isna().any())

Time False
sport True
proto False
flags True
size False
label False


## IV - Enregistrer données nettoyées

In [37]:
# Write each cleaned dataset to its CSV file under preparation/.
# NOTE(review): to_csv is called with its defaults, so the row index is also
# written as an unnamed first column - confirm that downstream readers expect it.
for frame, destination in (
    (cleaned_normal, "preparation/network_normal.csv"),
    (cleaned_attack_1, "preparation/network_1.csv"),
    (cleaned_attack_2, "preparation/network_2.csv"),
    (cleaned_attack_3, "preparation/network_3.csv"),
    (cleaned_attack_4, "preparation/network_4.csv"),
):
    frame.to_csv(destination)