In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import time
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [2]:
from tqdm.autonotebook import tqdm
tqdm.pandas()

  """Entry point for launching an IPython kernel.


In [3]:
def openCSV(filename):
    '''
    Takes in filename string (with position relative to this location)
    Returns dataframe
    '''
    file = pd.read_csv(filename)
    return pd.DataFrame(file)

In [4]:
# Open all the files
dos_df = openCSV("./DoS_dataset.csv")
fuzzy_df = openCSV("./Fuzzy_dataset.csv")
gear_df = openCSV("./gear_dataset.csv")
rpm_df = openCSV("./RPM_dataset.csv")

# DOS
1. Reformat the data to combine the data payload and to equalize the 8bit and 2bit data
2. Encode to matrix of binary
3. Apply KNN

In [5]:
dos_df.head()

Unnamed: 0,1478198376.389427,0316,8,05,21,68,09,21.1,21.2,00,6f,R
0,1478198000.0,018f,8,fe,5b,00,00,0,3c,00,00,R
1,1478198000.0,0260,8,19,21,22,30,8,8e,6d,3a,R
2,1478198000.0,02a0,8,64,00,9a,1d,97,02,bd,00,R
3,1478198000.0,0329,8,40,bb,7f,14,11,20,00,14,R
4,1478198000.0,0545,8,d8,00,00,8a,0,00,00,00,R


In [6]:
'''
T are aTtacks
R are Regular
nan is because the table is much shorter with only 2 bytes of data
'''
Counter(dos_df["R"])

Counter({'R': 3047061, nan: 31188, 'T': 587521})

In [7]:
# Add label to the data
dos_df.columns = ["Timestamp","CAN ID","Data size (bytes)","[P","a","Y","L","O","A","D","data]","Label"]
dos_df.head()

Unnamed: 0,Timestamp,CAN ID,Data size (bytes),[P,a,Y,L,O,A,D,data],Label
0,1478198000.0,018f,8,fe,5b,00,00,0,3c,00,00,R
1,1478198000.0,0260,8,19,21,22,30,8,8e,6d,3a,R
2,1478198000.0,02a0,8,64,00,9a,1d,97,02,bd,00,R
3,1478198000.0,0329,8,40,bb,7f,14,11,20,00,14,R
4,1478198000.0,0545,8,d8,00,00,8a,0,00,00,00,R


In [8]:
# Splitting the 2 bytes and 8 bytes data
twob_dos_df = dos_df[dos_df["Label"].isnull()]
eightb_dos_df = dos_df[dos_df["Label"].isnull()  == False]

In [9]:
# Combine payload
eightb_dos_df["data"] = eightb_dos_df['[P'] + eightb_dos_df["a"] + eightb_dos_df["Y"] + eightb_dos_df["L"]  + eightb_dos_df["O"]  + eightb_dos_df["A"]  + eightb_dos_df["D"]  + eightb_dos_df["data]"]
eightb_dos_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Timestamp,CAN ID,Data size (bytes),[P,a,Y,L,O,A,D,data],Label,data
0,1478198000.0,018f,8,fe,5b,00,00,0,3c,00,00,R,fe5b0000003c0000
1,1478198000.0,0260,8,19,21,22,30,8,8e,6d,3a,R,19212230088e6d3a
2,1478198000.0,02a0,8,64,00,9a,1d,97,02,bd,00,R,64009a1d9702bd00
3,1478198000.0,0329,8,40,bb,7f,14,11,20,00,14,R,40bb7f1411200014
4,1478198000.0,0545,8,d8,00,00,8a,0,00,00,00,R,d800008a00000000


In [10]:
# Clean up unused columns
eightb_dos_df = eightb_dos_df.drop(columns=['[P',"a","Y","L","O","A","D","data]"])
eightb_dos_df.head()

Unnamed: 0,Timestamp,CAN ID,Data size (bytes),Label,data
0,1478198000.0,018f,8,R,fe5b0000003c0000
1,1478198000.0,0260,8,R,19212230088e6d3a
2,1478198000.0,02a0,8,R,64009a1d9702bd00
3,1478198000.0,0329,8,R,40bb7f1411200014
4,1478198000.0,0545,8,R,d800008a00000000


In [11]:
# Label and clean up the 2 bytes entries
twob_dos_df = twob_dos_df.dropna(axis='columns')
twob_dos_df.columns = ["Timestamp","CAN ID","Data size (bytes)","[Payload","Data]","Label"]
twob_dos_df.head()

Unnamed: 0,Timestamp,CAN ID,Data size (bytes),[Payload,Data],Label
35,1478198000.0,05f0,2,1,0,R
134,1478198000.0,05f0,2,1,0,R
226,1478198000.0,05f0,2,1,0,R
319,1478198000.0,05f0,2,1,0,R
411,1478198000.0,05f0,2,1,0,R


In [12]:
# Combine payload for 2 bytes entries
twob_dos_df["data"] = twob_dos_df['[Payload'] + twob_dos_df["Data]"]
twob_dos_df.head()

Unnamed: 0,Timestamp,CAN ID,Data size (bytes),[Payload,Data],Label,data
35,1478198000.0,05f0,2,1,0,R,100
134,1478198000.0,05f0,2,1,0,R,100
226,1478198000.0,05f0,2,1,0,R,100
319,1478198000.0,05f0,2,1,0,R,100
411,1478198000.0,05f0,2,1,0,R,100


In [13]:
# Clean up unused label 
twob_dos_df = twob_dos_df.drop(columns=['[Payload',"Data]"])
twob_dos_df.head()

Unnamed: 0,Timestamp,CAN ID,Data size (bytes),Label,data
35,1478198000.0,05f0,2,R,100
134,1478198000.0,05f0,2,R,100
226,1478198000.0,05f0,2,R,100
319,1478198000.0,05f0,2,R,100
411,1478198000.0,05f0,2,R,100


In [14]:
# Combine the 8bytes and 2bytes entries together
frames = [eightb_dos_df,twob_dos_df]
dos_formatted = pd.concat(frames)

In [15]:
dos_formatted.head()

Unnamed: 0,Timestamp,CAN ID,Data size (bytes),Label,data
0,1478198000.0,018f,8,R,fe5b0000003c0000
1,1478198000.0,0260,8,R,19212230088e6d3a
2,1478198000.0,02a0,8,R,64009a1d9702bd00
3,1478198000.0,0329,8,R,40bb7f1411200014
4,1478198000.0,0545,8,R,d800008a00000000


In [16]:
# Return dataframe to the same order as original based on timestamp
dos_formatted = dos_formatted.sort_values(by=['Timestamp'])
dos_formatted.tail()

Unnamed: 0,Timestamp,CAN ID,Data size (bytes),Label,data
3665765,1478201000.0,018f,8,R,fe59000000410000
3665766,1478201000.0,0260,8,R,18212130088f6d19
3665767,1478201000.0,02a0,8,R,24009a1d9702bd00
3665768,1478201000.0,0329,8,R,dcb77f1411200014
3665769,1478201000.0,0545,8,R,d800008b00000000


In [17]:
# Randomize to break locality and then split data to 70:30 
train, test = np.split(dos_formatted.sample(frac=1), [int(.7*len(dos_formatted))])
train.head()

Unnamed: 0,Timestamp,CAN ID,Data size (bytes),Label,data
743482,1478199000.0,0000,8,T,0000000000000000
68311,1478198000.0,02c0,8,R,1400000000000000
1051654,1478199000.0,043f,8,R,004060ff7ed70800
698067,1478199000.0,0131,8,R,0e800000247f057c
2178975,1478200000.0,0000,8,T,0000000000000000


In [18]:
print(train.shape)
print(test.shape)

(2566039, 5)
(1099731, 5)


In [19]:
def createIntLabel(df):
    '''
    Input is df
    Output 0/1 version of the label
    '''
    label = df[df["Label"] == "T"]
    int_label = []
    for x in label:
        if x == True:
            int_label.append(1)
        else:
            int_label.append(0)
    return int_label

In [29]:
# Prepare data for 
train_data = train.drop(columns="Label")
train_label = createIntLabel(train)
test_data = test.drop(columns="Label")
test_label = createIntLabel(test)

In [30]:
# Encode string data to integer 
le = LabelEncoder()
train_data = train_data.apply(le.fit_transform)
test_data = test_data.apply(le.fit_transform)
train_data.head()

Unnamed: 0,Timestamp,CAN ID,Data size (bytes),data
743482,520021,0,1,0
68311,47729,12,1,25661
1051654,736072,18,1,1680
698067,488223,5,1,14627
2178975,1524919,0,1,0


In [31]:
# Encode data to matrix of integers
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(test_data)
test_data = enc.transform(test_data).toarray()

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
enc.fit(train_data)
train_data = enc.transform(train_data).toarray()

In [28]:
#Let's try...KNN (it should be able to detect DOS)
classifier = KNeighborsClassifier(n_neighbors = 5)
start_time = time.time()
classifier.fit(train_data,train_label)
elapsed_time = time.time() - start_time

TypeError: float() argument must be a string or a number, not 'OneHotEncoder'

In [None]:
elapsed_time

In [None]:
guesses = classifier.predict(test_data)
guesses

In [None]:
classifier.score(test_data,test_label)

In [176]:
print("Accuracy:",accuracy_score(test_label,guesses))
print("Recall:",recall_score(test_label,guesses))
print("Precision:",precision_score(test_label,guesses))
print("F1:",f1_score(test_label,guesses))

Accuracy: 0.8864840583742751
Recall: 0.406304163126593
Precision: 0.7818054995476987
F1: 0.5347163468168451


In [17]:
fuzzy_df.head()

Unnamed: 0,1478195721.903877,0545,8,d8,00,00.1,8a,00.2,00.3,00.4,00.5,R
0,1478196000.0,02b0,5,ff,7f,0,05,49,R,,,
1,1478196000.0,0002,8,00,00,0,00,00,01,7.0,15,R
2,1478196000.0,0153,8,00,21,10,ff,00,ff,0.0,00,R
3,1478196000.0,0130,8,19,80,0,ff,fe,7f,7.0,60,R
4,1478196000.0,0131,8,17,80,0,00,65,7f,7.0,9f,R


In [67]:
fuzzy_df[fuzzy_df['R'] == "T"].head(10)

Unnamed: 0,1478195721.903877,0545,8,d8,00,00.1,8a,00.2,00.3,00.4,00.5,R
1566,1478196000.0,00df,8,8c,ab,f2,26,7a,29,1a,0c,T
1567,1478196000.0,06ea,8,25,10,9c,ed,5b,16,2c,18,T
1568,1478196000.0,02fd,8,3f,bd,68,f3,c3,4f,28,d4,T
1575,1478196000.0,012d,8,35,45,99,cf,09,80,c7,77,T
1579,1478196000.0,033a,8,1c,04,86,90,7f,08,7e,c9,T
1583,1478196000.0,039f,8,0e,2c,1c,49,e7,58,f6,6d,T
1585,1478196000.0,07e3,8,7e,67,f3,ce,18,db,45,23,T
1587,1478196000.0,0491,8,57,b5,20,cc,ad,83,d4,ef,T
1588,1478196000.0,0522,8,04,72,b8,73,8c,bc,1a,79,T
1589,1478196000.0,02fd,8,02,fd,0b,1b,94,54,94,54,T


In [62]:
fuzzy_df[fuzzy_df['R'].isnull()].head()

Unnamed: 0,1478195721.903877,0545,8,d8,00,00.1,8a,00.2,00.3,00.4,00.5,R
0,1478196000.0,02b0,5,ff,7f,00,5.0,49,R,,,
18,1478196000.0,02b0,5,ff,7f,00,5.0,7a,R,,,
41,1478196000.0,02b0,5,ff,7f,00,5.0,6b,R,,,
59,1478196000.0,02b0,5,ff,7f,00,5.0,1c,R,,,
79,1478196000.0,05f0,2,00,00,R,,,,,,


In [65]:
fuzzy_df[fuzzy_df['00.3'] == "T"].head()

Unnamed: 0,1478195721.903877,0545,8,d8,00,00.1,8a,00.2,00.3,00.4,00.5,R


In [66]:
fuzzy_df[fuzzy_df['00.1'] == "T"].head()

Unnamed: 0,1478195721.903877,0545,8,d8,00,00.1,8a,00.2,00.3,00.4,00.5,R


In [18]:
gear_df.head()

Unnamed: 0,1478193190.056566,0140,8,00,00.1,00.2,00.3,10,29,2a,24,R
0,1478193000.0,02c0,8,15,0,0,00,00,00,0,00,R
1,1478193000.0,0350,8,05,20,44,68,77,00,0,7e,R
2,1478193000.0,0370,8,00,20,0,00,00,00,0,00,R
3,1478193000.0,043f,8,10,40,60,ff,78,c4,8,00,R
4,1478193000.0,0440,8,ff,0,0,00,ff,c4,8,00,R


In [58]:
gear_df[gear_df['R'] == "T"].head()

Unnamed: 0,1478193190.056566,0140,8,00,00.1,00.2,00.3,10,29,2a,24,R
2139,1478193000.0,043f,8,1,45,60,ff,6b,0,0,0,T
2140,1478193000.0,043f,8,1,45,60,ff,6b,0,0,0,T
2141,1478193000.0,043f,8,1,45,60,ff,6b,0,0,0,T
2143,1478193000.0,043f,8,1,45,60,ff,6b,0,0,0,T
2149,1478193000.0,043f,8,1,45,60,ff,6b,0,0,0,T


In [19]:
rpm_df.head()

Unnamed: 0,1478191030.045114,0316,8,05,22,68,09,22.1,20,00,75,R
0,1478191000.0,018f,8,fe,3b,00,00,00,3c,00,00,R
1,1478191000.0,0260,8,19,22,22,30,ff,8f,6e,3f,R
2,1478191000.0,02a0,8,60,00,83,1d,96,02,bd,00,R
3,1478191000.0,0329,8,dc,b8,7e,14,11,20,00,14,R
4,1478191000.0,0545,8,d8,00,00,83,00,00,00,00,R


In [59]:
rpm_df[rpm_df['R'] == "T"].head()

Unnamed: 0,1478191030.045114,0316,8,05,22,68,09,22.1,20,00,75,R
1707,1478191000.0,316,8,45,29,24,ff,29,24,0,ff,T
1710,1478191000.0,316,8,45,29,24,ff,29,24,0,ff,T
1711,1478191000.0,316,8,45,29,24,ff,29,24,0,ff,T
1712,1478191000.0,316,8,45,29,24,ff,29,24,0,ff,T
1715,1478191000.0,316,8,45,29,24,ff,29,24,0,ff,T
