# Network attacks - Exploration des données

## I - Modules

In [2]:
import pandas as pd

## II - Chargement des données

In [3]:
def load_data(size):
    """Load the five network captures, optionally keeping one line out of n.

    Parameters
    ----------
    size : str
        Sampling level. ``"full"`` keeps every line; ``"/2"``, ``"/4"``,
        ``"/10"``, ``"/100"`` and ``"/1000"`` keep one line out of 2, 4, 10,
        100 and 1000 respectively.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(normal, attack_1, attack_2, attack_3, attack_4)``, each read from
        ``network/<name>.csv`` relative to the current working directory.

    Raises
    ------
    ValueError
        If ``size`` is not one of the supported keys.
    """
    # Sampling step for each accepted size: keep one line out of every `step`
    step_by_size = {
        "full": 1,
        "/2": 2,
        "/4": 4,
        "/10": 10,
        "/100": 100,
        "/1000": 1000
    }

    if size not in step_by_size:
        raise ValueError(f"Invalid size: {size}")

    step = step_by_size[size]

    if step == 1:
        skiprows = None
    else:
        # pandas calls this with the 0-based line index and skips the line when
        # it returns True. Line 0 (the header) is always kept because
        # 0 % step == 0. A modulo test needs no pre-count of the file, so it
        # works for files of any length and costs O(1) per line.
        def skiprows(line_index):
            return line_index % step != 0

    file_names = ["normal", "attack_1", "attack_2", "attack_3", "attack_4"]
    return tuple(
        pd.read_csv(f"network/{name}.csv", skiprows=skiprows)
        for name in file_names
    )

# Load the five captures with the "/1000" sampling level
df_normal, df_attack_1, df_attack_2, df_attack_3, df_attack_4 = load_data("/1000")

ParserError: Error tokenizing data. C error: no error message set

In [6]:
# Same load as the previous cell (the "/1000" sampling level), run again
df_normal, df_attack_1, df_attack_2, df_attack_3, df_attack_4 = load_data("/1000")

Afficher la taille mémoire totale occupée par l'ensemble des dataframes

In [None]:
# Stack the five captures into a single frame, then report its deep memory footprint
df = pd.concat(
    (df_normal, df_attack_1, df_attack_2, df_attack_3, df_attack_4),
    ignore_index=True,
)
df.info(memory_usage='deep')

## III - Analyse

In [3]:
# Preview the first rows of the attack_1 capture
df_attack_1.head()

Unnamed: 0,Time,mac_s,mac_d,ip_s,ip_d,sport,dport,proto,flags,size,modbus_fn,n_pkt_src,n_pkt_dst,modbus_response,label_n,label
0,2021-04-09 18:23:28.385003,74:46:a0:bd:a7:1b,0a:fe:ec:47:74:fb,84.3.251.20,84.3.251.102,56667.0,502.0,Modbus,11000.0,66,Read Coils Request,0.0,0.0,,0,normal
1,2021-04-09 18:23:28.385005,74:46:a0:bd:a7:1b,e6:3f:ac:c9:a8:8c,84.3.251.20,84.3.251.101,56666.0,502.0,Modbus,11000.0,66,Read Coils Request,1.0,0.0,,0,normal
2,2021-04-09 18:23:28.385006,74:46:a0:bd:a7:1b,fa:00:bc:90:d7:fa,84.3.251.20,84.3.251.103,56668.0,502.0,Modbus,11000.0,66,Read Coils Request,2.0,0.0,,0,normal
3,2021-04-09 18:23:28.385484,0a:fe:ec:47:74:fb,74:46:a0:bd:a7:1b,84.3.251.102,84.3.251.20,502.0,56667.0,Modbus,11000.0,64,Read Coils Response,0.0,0.0,[0],0,normal
4,2021-04-09 18:23:28.385486,fa:00:bc:90:d7:fa,74:46:a0:bd:a7:1b,84.3.251.103,84.3.251.20,502.0,56668.0,Modbus,11000.0,64,Read Coils Response,0.0,1.0,[0],0,normal


### 1 - Tailles

In [5]:
# Row count of each capture, one line per capture
for size_label, capture in (
    ("normal -", df_normal),
    ("attaque 1 -", df_attack_1),
    ("attaque 2 -", df_attack_2),
    ("attaque 3 -", df_attack_3),
    ("attaque 4 -", df_attack_4),
):
    print(size_label, len(capture))

normal - 7757289
attaque 1 - 5527409
attaque 2 - 5159469
attaque 3 - 5862547
attaque 4 - 5522490


### 2 - Colonnes

#### A - Cohérence

In [6]:
# True only if the five frames expose the same column names in the same order
reference_columns = list(df_attack_1.columns)
print(all(
    list(frame.columns) == reference_columns
    for frame in (df_normal, df_attack_2, df_attack_3, df_attack_4)
))

False

Les colonnes ne sont pas identiques. Voyons pourquoi.

In [7]:
# Print each capture's column names to see where they differ
for capture in (df_normal, df_attack_1, df_attack_2, df_attack_3, df_attack_4):
    print(capture.columns.tolist())

['Time', 'mac_s', 'mac_d', 'ip_s', 'ip_d', 'sport', 'dport', 'proto', 'flags', 'size', 'modbus_fn', 'n_pkt_src', 'n_pkt_dst', 'modbus_response', 'label_n', 'label']
['Time', ' mac_s', ' mac_d', ' ip_s', ' ip_d', ' sport', ' dport', ' proto', ' flags', ' size', ' modbus_fn', ' n_pkt_src', ' n_pkt_dst', ' modbus_response', ' label_n', ' label']
['Time', ' mac_s', ' mac_d', ' ip_s', ' ip_d', ' sport', ' dport', ' proto', ' flags', ' size', ' modbus_fn', ' n_pkt_src', ' n_pkt_dst', ' modbus_response', ' label_n', ' label']
['Time', ' mac_s', ' mac_d', ' ip_s', ' ip_d', ' sport', ' dport', ' proto', ' flags', ' size', ' modbus_fn', ' n_pkt_src', ' n_pkt_dst', ' modbus_response', ' label_n', ' label']
['Time', 'mac_s', 'mac_d', 'ip_s', 'ip_d', 'sport', 'dport', 'proto', 'flags', 'size', 'modbus_fn', 'modbus_response', 'n_pkt_src', 'n_pkt_dst', 'label_n', 'label']


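Mêmes colonnes, mais certaines ont un espace en tête de nom (attaques 1 à 3) et l'ordre diffère pour l'attaque 4 (`modbus_response` y est placée avant `n_pkt_src`). Il faudra le prendre en compte lors de la préparation des données.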

In [8]:
# Column dtypes of the attack_1 capture
df_attack_1.dtypes

Time                 object
 mac_s               object
 mac_d               object
 ip_s                object
 ip_d                object
 sport              float64
 dport              float64
 proto               object
 flags              float64
 size                 int64
 modbus_fn           object
 n_pkt_src          float64
 n_pkt_dst          float64
 modbus_response     object
 label_n              int64
 label               object
dtype: object

In [10]:
# For every column of attack_1, show at most five of its distinct values
for column in df_attack_1.columns:
    distinct_values = df_attack_1[column].unique()
    print(column, list(distinct_values)[:5])

Time ['2021-04-09 18:23:28.385003', '2021-04-09 18:23:28.385005', '2021-04-09 18:23:28.385006', '2021-04-09 18:23:28.385484', '2021-04-09 18:23:28.385486']
 mac_s ['74:46:a0:bd:a7:1b', '0a:fe:ec:47:74:fb', 'fa:00:bc:90:d7:fa', 'e6:3f:ac:c9:a8:8c', '00:80:f4:03:fb:12']
 mac_d ['0a:fe:ec:47:74:fb', 'e6:3f:ac:c9:a8:8c', 'fa:00:bc:90:d7:fa', '74:46:a0:bd:a7:1b', '00:80:f4:03:fb:12']
 ip_s ['84.3.251.20', '84.3.251.102', '84.3.251.103', '84.3.251.101', '84.3.251.18']
 ip_d ['84.3.251.102', '84.3.251.101', '84.3.251.103', '84.3.251.20', '84.3.251.18']
 sport [56667.0, 56666.0, 56668.0, 502.0, 56665.0]
 dport [502.0, 56667.0, 56668.0, 56666.0, 56665.0]
 proto ['Modbus', 'TCP', 'ARP', 'ICMP']
 flags [11000.0, 10.0, 10010.0, 10000.0, 10001.0]
 size [66, 64, 65, 74, 78]
 modbus_fn ['Read Coils Request', 'Read Coils Response', 'Read Holding Registers', 'Read Holding Registers Response', nan]
 n_pkt_src [0.0, 1.0, 2.0, 3.0, 4.0]
 n_pkt_dst [0.0, 1.0, 2.0, 3.0, 4.0]
 modbus_response [nan, '[0]', '[

Certaines colonnes, comme les adresses IP, sont des données catégorielles, voire des valeurs presque toutes différentes. Il faudra peut-être s'en passer.

#### B - Étude des NaN

In [10]:
# Report, column by column, whether the normal capture holds at least one NaN
for column, values in df_normal.items():
    print(column, values.isna().any())

Time False
mac_s False
mac_d False
ip_s False
ip_d False
sport False
dport False
proto False
flags False
size False
modbus_fn True
n_pkt_src False
n_pkt_dst False
modbus_response True
label_n False
label False


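Pas de problème sur le jeu de données sans attaque, à part `modbus_fn` et `modbus_response` : `modbus_response` est NaN car il ne peut y avoir de valeur que dans les réponses à des requêtes Modbus, et `modbus_fn` est vraisemblablement NaN pour les paquets qui ne sont pas du Modbus.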

In [11]:
# Report, column by column, whether the attack_1 capture holds at least one NaN
for column, values in df_attack_1.items():
    print(column, values.isna().any())

Time False
 mac_s False
 mac_d False
 ip_s True
 ip_d True
 sport True
 dport True
 proto False
 flags True
 size False
 modbus_fn True
 n_pkt_src True
 n_pkt_dst True
 modbus_response True
 label_n False
 label False


La date d'acquisition, la taille du paquet (`size`), les adresses MAC, le protocole et les labels sont les seules colonnes sans NaN : elles sont renseignées pour tous les paquets, ce qui peut indiquer leur importance.

Voyons ce qu'il se passe si on supprime les colonnes catégorielles et temporelles difficilement utilisables, pour garder les données numériques et booléennes : 

In [12]:
# Columns left out of the NaN study: timestamp, MAC/IP addresses, Modbus
# function and response, numeric label and protocol name. The names keep the
# leading space present in this capture's header.
dropped_columns = [
    "Time",
    " mac_s",
    " mac_d",
    " ip_s",
    " ip_d",
    " modbus_fn",
    " modbus_response",
    " label_n",
    " proto",
]
# One drop call builds a single new frame instead of nine successive copies
df_temp = df_attack_1.drop(columns=dropped_columns)

# Report, for each remaining column, whether it holds at least one NaN
for k in df_temp.columns:
    print(k, df_temp[k].isna().any())

 sport True
 dport True
 flags True
 size False
 n_pkt_src True
 n_pkt_dst True
 label False


Analysons les NaN sur ces données numériques

In [17]:
# Count the NaN of each port / packet-counter column. Series.sum() counts the
# True flags in vectorised code, whereas the builtin sum() would iterate over
# every row in Python.
for column in (" sport", " dport", " n_pkt_src", " n_pkt_dst"):
    print("nombre de NaN : ", df_temp[column].isna().sum())

nombre de NaN :  515
nombre de NaN :  515
nombre de NaN :  475
nombre de NaN :  475


Unnamed: 0,sport,dport,flags,size,n_pkt_src,n_pkt_dst,label
65920,,,,60,,,anomaly
65932,,,,60,,,anomaly
65933,,,,60,,,anomaly
70594,,,,60,,,anomaly
70608,,,,60,,,anomaly


Très peu de valeurs NaN. Correspondent-elles aux mêmes lignes ?

In [20]:
# Among the rows whose n_pkt_src is NaN, count those whose sport is NaN too.
# A single .loc selection avoids chained indexing, and Series.sum() avoids a
# Python-level loop.
n_pkt_src_missing = df_temp[" n_pkt_src"].isna()
print(df_temp.loc[n_pkt_src_missing, " sport"].isna().sum())

475


On peut voir que, pour toutes les lignes où `n_pkt_src` est NaN (475 sur 475), `sport` est aussi NaN. L'inverse n'est pas vrai : `sport` compte 515 NaN, soit 40 lignes de plus.

### 3 - PCA

In [5]:
from sklearn.decomposition import PCA

In [4]:
# Read the five prepared datasets (normal traffic, then attacks 1 to 4)
(
    worked_df_normal,
    worked_df_attack_1,
    worked_df_attack_2,
    worked_df_attack_3,
    worked_df_attack_4,
) = map(
    pd.read_csv,
    (
        "preparation/network_normal.csv",
        "preparation/network_1.csv",
        "preparation/network_2.csv",
        "preparation/network_3.csv",
        "preparation/network_4.csv",
    ),
)

In [34]:
def print_pca(df, rate=0.99):
    """Print how many principal components are needed to reach ``rate``.

    Fits a PCA on every column of ``df`` except ``"label"`` and prints the
    smallest number of leading components whose cumulative explained-variance
    ratio is at least ``rate``. If the cumulative sum never reaches ``rate``
    (for example because of floating-point rounding when ``rate`` is 1.0), the
    total number of components is printed instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``"label"`` column; all other columns are given to
        ``PCA.fit``. Assumes they are numeric and NaN-free (TODO confirm
        against the preparation step).
    rate : float, default 0.99
        Target cumulative explained-variance ratio.

    Returns
    -------
    None
        The component count is printed, not returned.
    """
    # NOTE(review): PCA is variance-based, so an unscaled column with a large
    # range can dominate on its own. Confirm the input was standardised.
    features = df.drop(columns="label")  # drop() already returns a new frame
    pca = PCA()
    pca.fit(features)

    cumulative_ratio = 0
    for n_components, ratio in enumerate(pca.explained_variance_ratio_, start=1):
        cumulative_ratio += ratio
        if cumulative_ratio >= rate:
            print(n_components)
            break
    else:
        # Threshold never reached: every component is needed
        print(len(pca.explained_variance_ratio_))


In [35]:
# Print the component count reported by print_pca for the prepared attack_1 frame
print_pca(worked_df_attack_1)

1
