In [1]:
import numpy as np
import pandas as pd
import matplotlib as mp

## Método para extraer Carácteristicas

El método get_vector_features es el encargado de analizar los registros en un intervalo de tiempo y extraer el vector de 11 carácteristicas

In [108]:
def get_vector_features(trafic_device,label):
    
    trafic_device = trafic_device.reset_index()
    #Calcule Active/Sleep Time by IoT generate traffic in every Second
    start_time = trafic_device.iloc[0]['TIME']
    end_time = trafic_device.iloc[-1]['TIME']
    #print(start_time)
    #print(end_time)
    #print(end_time-start_time)
    
    active_time = []
    sleep_time = []
    mean_rate = []
    f_trigger = 0
    count_sleep_time = 0
    count_active_time = 0
    for time in range(start_time,end_time+1):
        #The traces of the second i
        second_traces = trafic_device[trafic_device["TIME"] == time]
        if len(second_traces)>0:            
            #Se deberían incluir los 0 ? porque si no hay tráfico no se debería analizar.
            mean_rate.append(second_traces["Size"].sum())
            #It's a trigger ?
            trigger_traces = second_traces[(second_traces['port.dst'] != 53) & 
                                           (second_traces['port.dst'] != 123) &
                                           (second_traces['port.src'] != 53) & 
                                           (second_traces['port.src'] != 123)
                                          ]
            if len(trigger_traces)>0:
                f_trigger+= 1
            
            #Reset sleep time count
            if count_sleep_time > 0:
                sleep_time.append(count_sleep_time)
                count_sleep_time = 0
            count_active_time +=1
        else:
            #Reset active time count
            if count_active_time > 0:
                active_time.append(count_active_time)
                count_active_time = 0
            count_sleep_time+=1
    
    #Calcule Active Volume
    f_trigger = f_trigger/(end_time - start_time)
    x_frame = 79 #Maximum frame overhead
    x_payload = 1 #Heartbeat payload
    active_volume = f_trigger*(x_frame+x_payload)
    
    #N servers and protocols
    n_servers = trafic_device[(trafic_device['IP.dst'] != "255.255.255.255")]
    n_protocols = trafic_device["IP.proto"].nunique()
    
    #Calcule DNS Interval and Unique Request
    dns_request = trafic_device[(trafic_device['port.dst'] == 53)]
    
    dns_interval = []
    
    if len(dns_request) > 1:
        times_dns_interval = []
        for index, row in dns_request.iterrows():
            times_dns_interval.append(row["TIME"])
        for i in range(len(times_dns_interval)-1):
            dns_interval.append(times_dns_interval[i+1]-times_dns_interval[i])
            
    else:
        dns_interval.append(0)
    
    #Calcule NTP Traffic
    ntp_interval = []
    
    npt_request = trafic_device[(trafic_device['port.dst'] == 123)]
    
    if len(npt_request) > 1:
        times_ntp_interval = []
        for index, row in npt_request.iterrows():
            times_ntp_interval.append(row["TIME"])
        for i in range(len(times_ntp_interval)-1):
            ntp_interval.append(times_ntp_interval[i+1]-times_ntp_interval[i])
            
    else:
        ntp_interval.append(0)
    
    
    #Features
    if np.sum(active_time) > 0:
        mean_active_time = round(np.mean(active_time),3)
    else:
        mean_active_time = 0
    
    
    if np.sum(sleep_time) > 0:
        mean_sleep_time = round(np.mean(sleep_time),3)
    else:
        mean_sleep_time = 0
    
    if trafic_device["Size"].sum() > 0:
        size_mean = round(trafic_device["Size"].mean(),3)
    else: 
        size_mean = 0
    
    if np.sum(mean_rate) > 0:
        mean_mean_rate = round(np.mean(mean_rate),3)
        peak_mean_rate = round(np.max(mean_rate)/np.mean(mean_rate),3)
    else:
        mean_mean_rate = 0
        peak_mean_rate = 0
    
    if np.sum(dns_interval) > 0:
        mean_dns_interval = round(np.mean(dns_interval),3)
    else:
        mean_dns_interval = 0
    
    if np.sum(ntp_interval) > 0:
        mean_ntp_interval = round(np.mean(ntp_interval),3)
    else:
        mean_ntp_interval = 0
        
    
    
    #Generate Feature Vectors.
    feature = [mean_active_time,mean_sleep_time,active_volume,size_mean,
               mean_mean_rate,peak_mean_rate,n_servers['device_dest'].nunique(),n_protocols,dns_request['IP.dst'].nunique(),
               mean_dns_interval,mean_ntp_interval]
    
    #Remove Inf values
    for i in range(len(feature)):
        if feature[i] == np.inf:
            feature[i]=0
    
    return feature
    

## Método para procesar archivo csv

Este metodo permite procesar un archivo csv, se encarga de analizar etiqueta por etiqueta extrallendo el intervalo de tiempo donde se encuentran registros del dispositivo en el archivo, posteriormente calcula si puede generar intervalos de 5 minutos como mínimo (300 segundos) para extraer un vector de carácteristicas. 

In [1]:
def read_crude_csv(day,total_vectors=48,time_step=300):    
    
    #Parameters
    #48*11= 528
    #seg - 5 min    
    
    print("Day: "+day)
    
    features = []
    
    dataset = pd.read_csv('dataset/crude/f-'+day)
    
    for label in range(1,22):

        data_device = dataset[(dataset["device_src"] == label) | (dataset["device_dest"] == label)].loc[:,"Packet ID":]
        data_device = data_device.sort_values(by=['TIME'])
        total_vector_generate = 0
        vector_generate = 0
        print("label: "+str(label))
        
        if len(data_device) >0:

            start_time =  data_device.iloc[0]['TIME']
            end_time = data_device.iloc[-1]['TIME']
            

            total_seconds = end_time - start_time
            
            iterations = int(total_seconds/time_step)

            feature = [label]

            for i in range(0,iterations):
                for windows in range(0,300,60):                   
                    if ((start_time+((i+1)*time_step)+windows) <= end_time):
                        #print("windows: "+str(windows))
                        #print("i "+str(i))
                        #print("start: "+str((start_time+(i*time_step)+windows)))
                        #print("end: " + str((start_time+((i+1)*time_step)+windows)))
                        data_windows_time = data_device[(data_device.TIME >= (start_time+(i*time_step)+windows)) & (data_device.TIME<(start_time+((i+1)*time_step)+windows))]
                        if len(data_windows_time)>0:
                            vector = get_vector_features(data_windows_time,label)
                            feature = np.concatenate((feature,vector))
                            vector_generate+=1
                            total_vector_generate+=1
                        if vector_generate == 48:
                            features.append(feature.tolist())
                            vector_generate = 0
                            feature = [label]

                    else:
                        break
        
        
        print("Features: "+str(int(total_vector_generate/total_vectors)))

    generate_df = pd.DataFrame(features)
    
    return generate_df

## Procesar 20 días 
En las siguientes lineas se hace el llamado dia por día y se procesa cada día extrallendo vectores de carácteristicas para cada dispositivo, posteriormente se guarda como un archivo procesado csv. 

In [None]:
days = ['16-09-23.csv','16-09-24.csv','16-09-25.csv','16-09-26.csv','16-09-27.csv','16-09-28.csv',
        '16-09-29.csv','16-09-30.csv','16-10-01.csv','16-10-02.csv','16-10-03.csv','16-10-04.csv',
        '16-10-05.csv','16-10-06.csv','16-10-07.csv','16-10-08.csv','16-10-09.csv','16-10-10.csv',
        '16-10-11.csv','16-10-12.csv']

for day in days:    
    generate_df = read_crude_csv(day)
    generate_df.to_csv('dataset/p-'+day)

Day: 16-09-23.csv
label: 1
Features: 29
label: 2
Features: 29
label: 3
Features: 29
label: 4


In [94]:
vector

[1.118, 8.529, 9.411764705882353, 72.757, 86.322, 4.425, 1, 2, 1, 0, 0]

## References 

- https://erg.abdn.ac.uk/users/gorry/course/lan-pages/enet-calc.html
    

In [97]:
column_names =  ['active_time', 'sleep_time', 'active_volume',
                 'avg_pack_size','mean_rate','peak_mean_rate','n_servers',
                 'n_proto','unique_dns_request','dns_interval','ntp_interval','dispositive']

39
