In [52]:
import pandas as pd
from datetime import datetime
from datetime import timedelta
import random
import math
import concurrent.futures
from multiprocessing import Manager

## Almost all configurable variables are kept here

In [53]:
days_to_go_back = 1
frequency = '1S'   #every second

mac_list=[
    "F6:B8:15:1E:A8:02",
    "6C:D0:7E:63:10:F1",
    "97:46:62:3B:15:2C",
    "05:A6:2E:30:98:E2"
]
#one flag for each mac

#should be between 9 (am) and 16 (pm) to be safe
list_attack_down=[
    {
        "start":datetime(2023,12,6,12,0,0),
        "stop":datetime(2023,12,6,12,15,0),
        "plc":[1,3]
    }
]
#should be between 21 (pm) and (6am) to be safe
list_attack_up=[
    {
        "start":datetime(2023,12,6,3,0,0),
        "stop":datetime(2023,12,6,3,15,0),
        "plc":[2]
    }
]

plc_state=["ok","ok","ok","ok"]
plc_counter=[0,0,0,0]
max_cycle_time=[41,48,34,74]
#mu is the mean
mu_cycle_time=[24,27,21,38]
mu_ram=[7464,8197,6465,9883]
mu_packets=20
mu_packets_size=50
#sigma is the standard deviation
sigma_cycle_time=4
sigma_ram=256
sigma_packets=3
sigma_packets_size=15

In [54]:
#day of weeks
#Monday =0 .. Sunday=6
work_days=range(5)

#window_check must be in seconds and 
new_day_check_time = datetime(1,1,1,0,0,0,0)

In [55]:
## those variables are for future features
#down period is in seconds
down_for_all_day=False
#up period is in seconds
up_period=3600

start_working_time=8
end_working_time=17

packets_nmap_probability=0.01
packets_big_probability=0.01
#probabilità che i sistemi siano spenti durante un giorno normale
random_down_for_all_day_probability=0.01
#probabilità che i sistemi siano accesi un ora durante un giorno di shutdown
random_up_probability=0.01

## variables regarding the temporal space to generate the data

In [56]:
start_date = (datetime.today() - timedelta(days = days_to_go_back)).replace(hour=0,minute=0,second=0, microsecond=0)
stop_date = datetime.today().replace(hour=0,minute=0,second=0, microsecond=0)

datelist = pd.date_range(start_date, stop_date, freq='24H').tolist()

In [57]:
# how many rows are created
print(len(datelist))
#to check that the first line is actually the day you chose
print(datelist[0])

2
2023-12-06 00:00:00


## Functions to simplify the code down and making easier to mantain

In [58]:
def generate_cpu_cycle_time(date, mac_index):
    return math.floor(random.normalvariate(mu=mu_cycle_time[mac_index],sigma=sigma_cycle_time))

In [59]:
def generate_ram_usage(date, mac_index):
    return math.floor(random.normalvariate(mu=mu_ram[mac_index],sigma=sigma_ram))

### ping packets are 56 bytes

In [60]:
def generate_rx_size(packet_n, date, mac_index):
    result = 0
    for i in range(packet_n):
        result += random.normalvariate(mu=mu_packets_size,sigma=sigma_packets_size)
    return math.floor(result)

In [61]:
def generate_tx_size(packet_n, date, mac_index):
    result = 0
    for i in range(packet_n):
        result += random.normalvariate(mu=mu_packets_size,sigma=sigma_packets_size)
    return math.floor(result)

## randomize the starting time with a margin of 15 minnutes, to make shure the data are not too predictable

In [62]:
#TODO: refactoring with variables and not static integers

def randomize_start_and_stop():
    day_start_time = datetime(1,1,1,0,0,0)
    day_stop_time = datetime(1,1,1,0,0,0)

    day_start_time = day_start_time.replace(hour=random.randint(7,8))
    if (day_start_time.hour == 8):
        day_start_time = day_start_time.replace(minute=random.randint(0,15))
    else:
        day_start_time = day_start_time.replace(minute=59-random.randint(0,15))

    day_stop_time = day_start_time.replace(hour=random.randint(16,17))
    if (day_stop_time.hour == 17):
        day_stop_time = day_stop_time.replace(minute=random.randint(0,15))
    else:
        day_stop_time = day_stop_time.replace(minute=59-random.randint(0,15))

    return day_start_time, day_stop_time

In [63]:
def check_plc_state(day_start_time, day_stop_time, date, plc_index):
    # Check if the current date is within the working hours
    if date.dayofweek in work_days and day_start_time.time() <= date.time() <= day_stop_time.time():
        # Check for attack_down
        #attack_down = any(att['start'] <= date <= att['stop'] and att['plc'] == plc_index for att in list_attack_down)
        attack_down = any(att['start'] <= date <= att['stop'] and plc_index in att['plc'] for att in list_attack_down)

        if not attack_down:
            plc_state[plc_index] = "ok"
        else:
            plc_state[plc_index] = "atk_down"
    else:
        # Check for attack_up
        #attack_up = any(att['start'] <= date <= att['stop'] and att['plc'] == plc_index for att in list_attack_up)
        attack_up = any(att['start'] <= date <= att['stop'] and plc_index in att['plc'] for att in list_attack_up)

        if not attack_up:
            plc_state[plc_index] = "off"
        else:
            plc_state[plc_index] = "atk_up"

In [64]:
#TODO: make the function right and add in the upper cell
def check_plc_status_old(date, plc_index):

    if(date.dayofweek in work_days):
        if (plc_state[plc_index]==False):
            if (plc_counter[plc_index]>=up_period):
                plc_state[plc_index]=True
            plc_counter += 1

        elif (random.random()<=random_up_probability):
            plc_state[plc_index]=False

    elif (plc_state[plc_index]==True):
        if (plc_counter[plc_index]>=down_for_all_day):
            plc_state[plc_index]=False
        plc_counter += 1
    
    elif (random.random()<=random_down_for_all_day_probability):
            plc_state[plc_index]=True
            plc_counter += 1

## REAL process function that generate the data
This function get a date, devide it with a frequency that can be 10 or 1 second (manual mod), generate the rows of each subdivision, append all of them in a list, and then return the result.

In [65]:
#columns names
columns =["timestamp","mac_address","cpu_max_cycle","cpu_current_cycle","ram_usage","rx_packets","rx_bytes","tx_packets","tx_bytes","flag"]

In [66]:
def process_data(date):

    sublist = pd.date_range(date, date + timedelta(hours=23,minutes=59,seconds=59), freq=frequency).tolist()

    result = []

    day_start_time, day_stop_time = randomize_start_and_stop()
    
    for subdate in sublist:
            
        for plc in range(0,len(mac_list)):
            check_plc_state(day_start_time, day_stop_time, subdate, plc)

            if plc_state[plc] in ["ok", "atk_up"]:
                cpu_time = generate_cpu_cycle_time(subdate, plc)
                ram_usage = generate_ram_usage(subdate, plc)
                tx_packets = math.floor(random.normalvariate(mu_packets, sigma_packets))
                tx_bytes = generate_tx_size(tx_packets, subdate, plc)
                rx_packets = math.floor(random.normalvariate(mu_packets, sigma_packets))
                rx_bytes = generate_rx_size(rx_packets, subdate, plc)
                new_row = {'timestamp': subdate,
                        'mac_address': mac_list[plc],
                        'cpu_max_cycle': max_cycle_time[plc],
                        'cpu_current_cycle': cpu_time,
                        'ram_usage': ram_usage,
                        'rx_packets': rx_packets,
                        'rx_bytes': rx_bytes,
                        'tx_packets': tx_packets,
                        'tx_bytes': tx_bytes,
                        'flag': plc_state[plc]}
                result.append(new_row)
            else:
                new_row = {'timestamp': subdate,
                        'mac_address': mac_list[plc],
                        'cpu_max_cycle': 0,
                        'cpu_current_cycle': 0,
                        'ram_usage': 0,
                        'rx_packets': 0,
                        'rx_bytes': 0,
                        'tx_packets': 0,
                        'tx_bytes': 0,
                        'flag': plc_state[plc]}
                result.append(new_row)
    return result

## Parallelising part

In [67]:
# Use a ThreadPoolExecutor for parallel processing
max_process_workers = 8

In [68]:
with concurrent.futures.ProcessPoolExecutor(max_workers=max_process_workers) as executor:
    futures2 = [executor.submit(process_data, date) for date in datelist]
    
# Wait for all tasks to complete
results2 = [future.result() for future in concurrent.futures.as_completed(futures2)]
# Flatten the list of lists into a single list of dictionaries
flat_results = [item for sublist in results2 for item in sublist]
# Create a DataFrame from the results
df = pd.DataFrame(flat_results)

In [69]:
df.set_index('timestamp', inplace=True)
print(df.shape)
df.head()

(691200, 9)


Unnamed: 0_level_0,mac_address,cpu_max_cycle,cpu_current_cycle,ram_usage,rx_packets,rx_bytes,tx_packets,tx_bytes,flag
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-12-07 00:00:00,F6:B8:15:1E:A8:02,0,0,0,0,0,0,0,off
2023-12-07 00:00:00,6C:D0:7E:63:10:F1,0,0,0,0,0,0,0,off
2023-12-07 00:00:00,97:46:62:3B:15:2C,0,0,0,0,0,0,0,off
2023-12-07 00:00:00,05:A6:2E:30:98:E2,0,0,0,0,0,0,0,off
2023-12-07 00:00:01,F6:B8:15:1E:A8:02,0,0,0,0,0,0,0,off


In [70]:
df.to_csv("norm_zero_{}_{}d_with_attacks.csv".format(frequency,days_to_go_back))