# Loading data and overview of summary statistics:

In [1]:
import pandas as pd
df = pd.read_csv (r'E:\NMT MS\Research\Data\Processed_datasets\Processed_datasets\Processed_IoT_dataset\IoT_Fridge.csv')
df.head()

Unnamed: 0,date,time,fridge_temperature,temp_condition,label,type
0,31-Mar-19,12:36:52,13.1,high,0,normal
1,31-Mar-19,12:36:53,8.65,high,0,normal
2,31-Mar-19,12:36:54,2.0,low,0,normal
3,31-Mar-19,12:36:55,4.8,low,0,normal
4,31-Mar-19,12:36:56,10.7,high,0,normal


In [71]:
df.dtypes

date                   object
time                   object
fridge_temperature    float64
temp_condition         object
label                   int64
type                   object
dtype: object

In [72]:
df.shape

(587076, 6)

## Summary for numerical variables:

In [73]:
df.describe()

Unnamed: 0,fridge_temperature,label
count,587076.0,587076.0
mean,7.146993,0.146913
std,3.624818,0.354019
min,1.0,0.0
25%,4.2,0.0
50%,6.7,0.0
75%,10.25,0.0
max,14.0,1.0


## Summary of categorical variables

In [74]:
df["temp_condition"].value_counts()

high      193129
low       150183
high      114287
low        89420
high       22563
low        17494
Name: temp_condition, dtype: int64

In [75]:
df["type"].value_counts()

normal        500827
backdoor       35568
password       28425
ddos           10233
injection       7079
ransomware      2902
xss             2042
Name: type, dtype: int64

# Checking missing values:

In [76]:
# Find the column
df.columns[df.isna().any()].tolist()

[]

In [77]:
df = df.dropna()
df.shape

(587076, 6)

# Feature engineering:

In [2]:
df['month'] = pd.DatetimeIndex(df['date']).month

In [3]:
df.tail()

Unnamed: 0,date,time,fridge_temperature,temp_condition,label,type,month
587071,27-Apr-19,12:41:17,4.0,low,0,normal,4
587072,27-Apr-19,12:41:17,6.05,high,0,normal,4
587073,27-Apr-19,12:41:17,12.8,high,0,normal,4
587074,27-Apr-19,12:41:18,6.5,high,0,normal,4
587075,27-Apr-19,12:41:20,5.3,low,0,normal,4


In [4]:
df['time'] = df['time'].str.replace(' ','')

In [5]:
df['time_hour'] = pd.to_datetime(df['time'],format='%H:%M:%S').dt.hour

In [6]:
df.head()

Unnamed: 0,date,time,fridge_temperature,temp_condition,label,type,month,time_hour
0,31-Mar-19,12:36:52,13.1,high,0,normal,3,12
1,31-Mar-19,12:36:53,8.65,high,0,normal,3,12
2,31-Mar-19,12:36:54,2.0,low,0,normal,3,12
3,31-Mar-19,12:36:55,4.8,low,0,normal,3,12
4,31-Mar-19,12:36:56,10.7,high,0,normal,3,12


In [7]:
df = df[['type','month','time_hour','fridge_temperature','temp_condition','label']]

In [8]:
df.head()

Unnamed: 0,type,month,time_hour,fridge_temperature,temp_condition,label
0,normal,3,12,13.1,high,0
1,normal,3,12,8.65,high,0
2,normal,3,12,2.0,low,0
3,normal,3,12,4.8,low,0
4,normal,3,12,10.7,high,0


# K-means clustering:

In [15]:
#Now let's open it with pandas
import pandas as pd
from pandas import Series,DataFrame
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import numpy as np
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

import warnings
warnings.filterwarnings('ignore')

## Encoding

In [24]:
#get all categorical columns
#cat_columns = df.select_dtypes(['object']).columns
cat_columns = ['temp_condition']
#convert all categorical columns to numeric
df[cat_columns] = df[cat_columns].apply(lambda x: pd.factorize(x)[0])

#view updated DataFrame
df

Unnamed: 0,type,month,time_hour,fridge_temperature,temp_condition,label
0,normal,3,12,13.10,0,0
1,normal,3,12,8.65,0,0
2,normal,3,12,2.00,1,0
3,normal,3,12,4.80,1,0
4,normal,3,12,10.70,0,0
...,...,...,...,...,...,...
587071,normal,4,12,4.00,4,0
587072,normal,4,12,6.05,5,0
587073,normal,4,12,12.80,5,0
587074,normal,4,12,6.50,5,0


## Train test splitting:

In [16]:
X = df.iloc[:, 1:6] # first two columns of data frame with all rows
y = df.iloc[:,0]

In [17]:
data = pd.DataFrame(X)

In [88]:
# from sklearn.cluster import KMeans 
clusters = 7
  
kmeans = KMeans(n_clusters = clusters) 
kmeans.fit(data) 

labels = kmeans.labels_

In [89]:
#Glue back to originaal data
data['clusters'] = labels

In [90]:
data["Actual_cat"] = y

In [91]:
data.columns = ['month','time_hour','fridge_temperature','temp_condition','label',"Predicted","Actual"]

In [92]:
pd.crosstab(index=data['Actual'], columns=data['Predicted'])

Predicted,0,1,2,3,4,5,6
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
backdoor,7331,6444,8158,2379,7167,1976,2113
ddos,852,792,3950,427,3458,376,378
injection,3620,3044,0,394,0,20,1
normal,66846,54671,49413,93134,40888,89526,106349
password,2474,1999,8033,3062,6935,2859,3063
ransomware,0,0,0,1128,0,900,874
xss,0,0,0,814,0,626,602


# SOM

In [18]:
def maha(wt,data):
    # import the math module 
    import math
    diff = data-wt
    left_term = np.dot(diff, IV)
    mahal = np.dot(left_term, diff.T)
    dist = math.sqrt(mahal)
    return dist

In [25]:
X = df.iloc[:, 1:6] # first two columns of data frame with all rows
y = df.iloc[:,0]

In [26]:
X.head()

Unnamed: 0,month,time_hour,fridge_temperature,temp_condition,label
0,3,12,13.1,0,0
1,3,12,8.65,0,0
2,3,12,2.0,1,0
3,3,12,4.8,1,0
4,3,12,10.7,0,0


In [27]:
X = pd.DataFrame(X)

In [28]:
cov_X = np.cov(X.T)
IV = np.linalg.inv(cov_X)

In [29]:
X = np.array(X)

#Self organizing map clustering algorithm
# The Academician
import numpy as np, numpy.random
from scipy.spatial import distance
np.set_printoptions(suppress=True) #Force-suppress all exponential notation

k = 7
p = 1
alpha = 0.5 # Initial learning rate


# Print the number of data and dimension 
n = len(X)
d = len(X[0])
addZeros = np.zeros((n, 1))
X = np.append(X, addZeros, axis=1)
print("The SOM algorithm: \n")
#print("The training data: \n", X)
print("\nTotal number of data: ",n)
print("Total number of features: ",d)
print("Total number of Clusters k: ",k)

The SOM algorithm: 


Total number of data:  587076
Total number of features:  5
Total number of Clusters k:  7


In [30]:
C = np.zeros((k,d+1))

weight = np.random.rand(len(X[0])-1,k)
print("\nThe initial weight: \n", np.round(weight,2))


The initial weight: 
 [[0.57 0.1  0.85 0.58 0.13 0.08 0.22]
 [0.92 0.04 0.48 0.53 0.71 0.27 0.63]
 [0.76 0.77 0.82 0.55 0.25 0.44 0.02]
 [0.7  0.55 0.13 0.02 0.36 0.07 0.5 ]
 [0.94 0.48 0.62 0.61 0.36 0.17 0.56]]


In [31]:
weight.shape

(5, 7)

In [32]:
for it in range(100): # Total number of iterations
    for i in range(n):
        distMin = 99999999
        for j in range(k):
            #dist = np.square(distance.euclidean(weight[:,j], X[i,0:d]))
            dist = maha(wt = weight[:,j], data = X[i,0:d])
            if distMin>dist:
                distMin = dist
                jMin = j
        weight[:,jMin] = weight[:,jMin]*(1-alpha) + alpha*X[i,0:d]   
    alpha = 0.5*alpha
    
print("\nThe final weight: \n",np.round(weight,4))


The final weight: 
 [[0.5694 0.1024 3.924  0.5808 0.1329 0.0786 0.2245]
 [0.9173 0.0366 9.6214 0.5332 0.7111 0.2704 0.6251]
 [0.7558 0.7706 7.1465 0.5523 0.2458 0.4398 0.0219]
 [0.6984 0.5507 3.3149 0.0227 0.3634 0.0695 0.4973]
 [0.9394 0.4805 0.1769 0.6084 0.3571 0.165  0.5624]]


In [43]:
for i in range(len(X[0])-1):    
    cNumber = np.where(weight[i] == np.amax(weight[i]))
    X[i,d] = cNumber[0]

In [41]:
range(len(X[0]))

range(0, 6)

In [44]:
predicted = pd.DataFrame(X)

In [45]:
predicted.head()

Unnamed: 0,0,1,2,3,4,5
0,3.0,12.0,13.1,0.0,0.0,2.0
1,3.0,12.0,8.65,0.0,0.0,2.0
2,3.0,12.0,2.0,1.0,0.0,2.0
3,3.0,12.0,4.8,1.0,0.0,2.0
4,3.0,12.0,10.7,0.0,0.0,0.0


In [36]:
predicted["Actual_cat"] = y

In [38]:
predicted.columns = ['month','time_hour','fridge_temperature','temp_condition','label',"Predicted","Actual"]

In [39]:
pd.crosstab(index=predicted['Actual'], columns=predicted['Predicted'])

Predicted,0.0,2.0
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
backdoor,35568,0
ddos,10233,0
injection,7079,0
normal,500823,4
password,28425,0
ransomware,2902,0
xss,2042,0
