# Read data from R and process 

In [3]:
df <- read.csv("C:/Users/stati/OneDrive/Desktop/Research/windows10_dataset.csv")
df <- na.omit(df)

In [4]:
df = Filter(function(x)(length(unique(x))>1), df)

In [5]:
write.csv(df,'C:/Users/stati/OneDrive/Desktop/Research/windows10_dataset_processed.csv')

# Required Libraries

We are going to use Windows 10 system generated values to predict whether the system is under cyber attack or not !

In [1]:
import numpy as np
from numpy.ma.core import ceil
from scipy.spatial import distance #distance calculation
from sklearn.preprocessing import MinMaxScaler #normalisation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score #scoring
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from matplotlib import animation, colors

# Loading the data and EDA


In [5]:
# Set the working directory
import os
os.chdir("C:/Users/stati/OneDrive/Desktop/Research")

In [18]:
import pandas as pd
df = pd.read_csv('windows10_dataset_processed.csv',encoding='latin-1')

In [23]:
#df = df.drop(columns=df.columns[0], axis=1, inplace=True)
df.columns

Index(['ï..ts', 'Processor_DPC_Rate', 'Processor_pct_.Idle_Time',
       'Processor_pct_.Interrupt_Time', 'Processor_pct_.User_Time',
       'Processor_pct_.C1_Time', 'Processor_pct_.Processor_Time',
       'Processor_C1_ransitions_sec', 'Processor_pct_.DPC_Time',
       'Processor_pct_.Privileged_Time',
       ...
       'LogicalDisk._Total..Avg..Disk.Write.Queue.Length',
       'LogicalDisk._Total..Avg..Disk.Queue.Length',
       'LogicalDisk._Total..pct_.Disk.Read.Time',
       'LogicalDisk._Total..Disk.Write.Bytes.sec',
       'LogicalDisk._Total..Disk.Transfers.sec',
       'LogicalDisk._Total..Avg..Disk.Bytes.Transfer',
       'LogicalDisk._Total..pct_.Disk.Write.Time',
       'LogicalDisk._Total..Avg..Disk.sec.Transfer', 'label', 'type'],
      dtype='object', length=108)

In [24]:
df = df.drop('ï..ts', axis=1)

In [26]:
# Attack counts by type:
pd.crosstab(index=df['label'], columns=df['type'])

type,ddos,dos,injection,mitm,normal,password,scanning,xss
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,0,0,0,24283,0,0,0
1,4539,505,606,13,0,3594,434,1240


In [27]:
# Remove the label column:
df = df.drop(['type'],axis=1)

In [28]:
df.head()

Unnamed: 0,Processor_DPC_Rate,Processor_pct_.Idle_Time,Processor_pct_.Interrupt_Time,Processor_pct_.User_Time,Processor_pct_.C1_Time,Processor_pct_.Processor_Time,Processor_C1_ransitions_sec,Processor_pct_.DPC_Time,Processor_pct_.Privileged_Time,Processor_DPCs_Queued_sec,...,LogicalDisk._Total..Avg..Disk.Bytes.Read,LogicalDisk._Total..Avg..Disk.Write.Queue.Length,LogicalDisk._Total..Avg..Disk.Queue.Length,LogicalDisk._Total..pct_.Disk.Read.Time,LogicalDisk._Total..Disk.Write.Bytes.sec,LogicalDisk._Total..Disk.Transfers.sec,LogicalDisk._Total..Avg..Disk.Bytes.Transfer,LogicalDisk._Total..pct_.Disk.Write.Time,LogicalDisk._Total..Avg..Disk.sec.Transfer,label
0,4,29.908172,0.07824,61.027509,29.908172,66.200149,480.094991,0.07824,5.163866,568.322711,...,20546.65089,0.134876,0.163098,2.822198,1924988.0,402.582784,9100.481592,13.487612,0.000405,0
1,9,31.751682,0.312521,59.144594,31.751682,66.247735,427.041284,0.07813,7.109852,512.249521,...,25742.31139,0.157564,0.189279,3.171506,1497233.0,355.134332,11153.55449,15.756429,0.000533,0
2,5,29.495167,1.168222,46.02794,29.495167,66.900381,1159.426821,0.233644,20.87223,917.339935,...,35969.10535,0.401774,0.782472,38.06981,1854229.0,760.431768,26994.26595,40.177378,0.001028,0
3,12,18.224375,1.097192,47.806219,18.224375,79.545208,736.622011,0.313483,31.740194,894.340589,...,40019.44778,0.259062,0.768934,50.987167,15912390.0,699.499954,53141.6638,25.906233,0.001099,0
4,12,14.861187,1.562431,41.873151,14.861187,82.735137,896.108598,0.703094,40.857571,1381.842235,...,32025.59015,0.103836,1.08173,97.789353,23609300.0,1184.550425,40988.68444,10.383619,0.000913,0


# Making the data balance

In [76]:
freq = df['label'].value_counts()
freq

0    24283
1    10931
Name: label, dtype: int64

In [75]:
delete_frac = round((freq[0] - freq[1])/freq[0],2)

In [79]:
df = df.drop(df.query('label == 0').sample(frac= delete_frac).index)

In [80]:
df['label'].value_counts()

1    10931
0    10927
Name: label, dtype: int64

## Checking column types and converting them in the right format:

In [81]:
print(df.info())
print(df.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21858 entries, 3 to 35213
Columns: 106 entries, Processor_DPC_Rate to label
dtypes: float64(74), int64(32)
memory usage: 17.8 MB
None
(21858, 106)


So, there a total of 125 features presented in this data file and number of rows are 35,975!

In [82]:
columns = df.applymap(np.isreal).all()
print((columns).value_counts())

True    106
dtype: int64


So, there are a total of 69 categorical and 56 numerical columns available in this data file. We need to investigate that, the 69 categorical variables whether they are really categorical or there are some problem in data types for them. 

In [83]:
# Now considering blank space as missing entry and recategorizing the numerical object columns to actual number columns
df.replace(r'^\s*$', np.nan, regex=True)

Unnamed: 0,Processor_DPC_Rate,Processor_pct_.Idle_Time,Processor_pct_.Interrupt_Time,Processor_pct_.User_Time,Processor_pct_.C1_Time,Processor_pct_.Processor_Time,Processor_C1_ransitions_sec,Processor_pct_.DPC_Time,Processor_pct_.Privileged_Time,Processor_DPCs_Queued_sec,...,LogicalDisk._Total..Avg..Disk.Bytes.Read,LogicalDisk._Total..Avg..Disk.Write.Queue.Length,LogicalDisk._Total..Avg..Disk.Queue.Length,LogicalDisk._Total..pct_.Disk.Read.Time,LogicalDisk._Total..Disk.Write.Bytes.sec,LogicalDisk._Total..Disk.Transfers.sec,LogicalDisk._Total..Avg..Disk.Bytes.Transfer,LogicalDisk._Total..pct_.Disk.Write.Time,LogicalDisk._Total..Avg..Disk.sec.Transfer,label
3,12,18.224375,1.097192,47.806219,18.224375,79.545208,736.622011,0.313483,31.740194,894.340589,...,40019.44778,0.259062,0.768934,50.987167,1.591239e+07,699.499954,53141.663800,25.906233,0.001099,0
5,19,22.657346,0.781280,39.454652,22.657346,74.061496,807.794015,0.234384,34.610715,955.502633,...,36368.13317,0.151794,0.804626,65.283183,1.283909e+07,788.780033,47320.635370,15.179443,0.001021,0
6,9,19.296817,2.734334,35.077595,19.296817,75.781616,1665.275721,0.624991,40.702510,1290.768735,...,16248.93350,0.072069,1.082287,101.021753,2.044406e+07,1103.615271,32610.079580,7.206940,0.000981,0
11,13,48.525733,0.625974,29.029545,48.525733,48.200650,1782.461297,0.078247,19.170454,1832.138254,...,15671.41307,0.000504,0.499560,49.905618,2.953696e+04,1653.962233,15656.057160,0.050356,0.000302,0
13,17,50.849749,0.469484,32.942132,50.849749,46.009332,1281.802760,0.156495,13.067307,903.912323,...,91405.79228,0.007903,0.553646,54.574281,3.325003e+05,711.010923,86504.199460,0.790296,0.000779,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35206,2,18.293129,0.078248,71.362589,18.293129,80.437887,117.170943,0.391242,9.076821,165.741804,...,0.00000,0.000421,0.000421,0.000000,1.394675e+04,0.901315,15473.777780,0.014043,0.000467,0
35208,2,47.080046,0.000000,46.498185,47.080046,51.235517,180.027051,0.156296,4.767041,105.615870,...,0.00000,0.000087,0.000087,0.000000,9.422216e+03,0.400060,23552.000000,0.002909,0.000218,0
35211,0,95.294715,0.000000,2.108141,95.294715,2.088548,263.642684,0.000000,0.078079,29.482408,...,0.00000,0.000168,0.000168,0.000000,9.824538e+03,0.499702,19660.800000,0.005594,0.000336,0
35212,0,94.927619,0.000000,0.859393,94.927619,2.810513,342.110790,0.000000,1.953165,43.001356,...,0.00000,0.000440,0.000440,0.000000,2.129987e+04,1.500047,14199.466670,0.014671,0.000293,0


In [84]:
# Keeping the object columns only to convert them as numberical because they actualy is a numerical column but
# because of space available in the entry python considering this column as an object type.

object_colmns = list(df.select_dtypes(include=['object']).columns)
#object_colmns.remove('type')

In [85]:
df[object_colmns] = df[object_colmns].apply(pd.to_numeric, errors='coerce')

In [86]:
# Checking:
columns = df.applymap(np.isreal).all()
print((columns).value_counts())

True    106
dtype: int64


In [87]:
df.head()

Unnamed: 0,Processor_DPC_Rate,Processor_pct_.Idle_Time,Processor_pct_.Interrupt_Time,Processor_pct_.User_Time,Processor_pct_.C1_Time,Processor_pct_.Processor_Time,Processor_C1_ransitions_sec,Processor_pct_.DPC_Time,Processor_pct_.Privileged_Time,Processor_DPCs_Queued_sec,...,LogicalDisk._Total..Avg..Disk.Bytes.Read,LogicalDisk._Total..Avg..Disk.Write.Queue.Length,LogicalDisk._Total..Avg..Disk.Queue.Length,LogicalDisk._Total..pct_.Disk.Read.Time,LogicalDisk._Total..Disk.Write.Bytes.sec,LogicalDisk._Total..Disk.Transfers.sec,LogicalDisk._Total..Avg..Disk.Bytes.Transfer,LogicalDisk._Total..pct_.Disk.Write.Time,LogicalDisk._Total..Avg..Disk.sec.Transfer,label
3,12,18.224375,1.097192,47.806219,18.224375,79.545208,736.622011,0.313483,31.740194,894.340589,...,40019.44778,0.259062,0.768934,50.987167,15912390.0,699.499954,53141.6638,25.906233,0.001099,0
5,19,22.657346,0.78128,39.454652,22.657346,74.061496,807.794015,0.234384,34.610715,955.502633,...,36368.13317,0.151794,0.804626,65.283183,12839090.0,788.780033,47320.63537,15.179443,0.001021,0
6,9,19.296817,2.734334,35.077595,19.296817,75.781616,1665.275721,0.624991,40.70251,1290.768735,...,16248.9335,0.072069,1.082287,101.021753,20444060.0,1103.615271,32610.07958,7.20694,0.000981,0
11,13,48.525733,0.625974,29.029545,48.525733,48.20065,1782.461297,0.078247,19.170454,1832.138254,...,15671.41307,0.000504,0.49956,49.905618,29536.96,1653.962233,15656.05716,0.050356,0.000302,0
13,17,50.849749,0.469484,32.942132,50.849749,46.009332,1281.80276,0.156495,13.067307,903.912323,...,91405.79228,0.007903,0.553646,54.574281,332500.3,711.010923,86504.19946,0.790296,0.000779,0


Now, only the response variable seems categorical. So, we are good about our actual column types for the dataframe.

## Summary statistics for numerical columns

In [36]:
df.describe()

Unnamed: 0,Processor_DPC_Rate,Processor_pct_.Idle_Time,Processor_pct_.Interrupt_Time,Processor_pct_.User_Time,Processor_pct_.C1_Time,Processor_pct_.Processor_Time,Processor_C1_ransitions_sec,Processor_pct_.DPC_Time,Processor_pct_.Privileged_Time,Processor_DPCs_Queued_sec,...,LogicalDisk._Total..Avg..Disk.Bytes.Read,LogicalDisk._Total..Avg..Disk.Write.Queue.Length,LogicalDisk._Total..Avg..Disk.Queue.Length,LogicalDisk._Total..pct_.Disk.Read.Time,LogicalDisk._Total..Disk.Write.Bytes.sec,LogicalDisk._Total..Disk.Transfers.sec,LogicalDisk._Total..Avg..Disk.Bytes.Transfer,LogicalDisk._Total..pct_.Disk.Write.Time,LogicalDisk._Total..Avg..Disk.sec.Transfer,label
count,35214.0,35214.0,35214.0,35214.0,35214.0,35214.0,35214.0,35214.0,35214.0,35214.0,...,35214.0,35214.0,35214.0,35214.0,35214.0,35214.0,35214.0,35214.0,35214.0,35214.0
mean,5.668314,72.035438,0.167905,17.098864,72.035438,25.426923,551.725609,0.513604,8.32507,379.435748,...,14597.567963,0.020388,0.137814,4.391958,1177482.0,234.204762,25353.973162,1.576697,0.000643,0.310416
std,18.716827,20.546655,0.359206,13.988805,20.546655,20.338239,844.635668,1.233437,10.807045,963.693771,...,23113.333385,0.101185,0.417481,14.523762,3328933.0,720.44034,49029.709022,9.926539,0.000945,0.462671
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.794126,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,62.589187,0.0,5.318951,62.589187,8.429947,154.499255,0.0,1.326054,25.561099,...,0.0,0.000361,0.000418,0.0,11469.07,0.898604,11638.779422,0.012078,0.000307,0.0
50%,0.0,74.940266,0.0,14.509127,74.940266,22.928763,307.176978,0.078124,2.968731,38.258653,...,4096.0,0.000799,0.001174,0.001352,27077.18,2.600066,16384.0,0.02693,0.000417,0.0
75%,1.0,89.113131,0.234013,25.117544,89.113131,34.163972,421.401051,0.390012,12.748875,248.320166,...,22753.681877,0.004369,0.007208,0.053539,202979.4,16.820502,24551.12671,0.148555,0.000656,1.0
max,195.0,99.638956,10.641964,91.56642,99.638956,100.0,10318.52139,21.953888,81.182168,8141.157271,...,944383.0623,1.836753,4.364882,294.529448,60513200.0,7170.887411,977715.2,183.675284,0.045244,1.0


We are observing that the range of the columns are highly fluctuating for the columns. So, definitely the data needs to be standardize or we have to normalize the columns before feeding them in the model. But before that, we are going to observe the missing value distribution over the columns.

## Missing value % investigation and imputation:

In [37]:
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)

In [38]:
missing_value_df.tail()

Unnamed: 0,column_name,percent_missing
Process_Thread.Count,Process_Thread.Count,0.0
Process_IO.Data.Operations_sec,Process_IO.Data.Operations_sec,0.0
Process_IO.Other.Operations_sec,Process_IO.Other.Operations_sec,0.0
Process_Working.Set,Process_Working.Set,0.0
label,label,0.0


The highest missing percentage is around 1.5% for Process_IO Read Bytes_sec column. So, we are good to go for missing value imputation. We will replace that missing entries by mean at this stage. Later we will check the performance after replacing with Median. 

In [17]:
df.fillna(df.select_dtypes(include='number').mean(), inplace=True)

In [18]:
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)

In [19]:
missing_value_df.tail()

Unnamed: 0,column_name,percent_missing
Process_ID Process,Process_ID Process,0.0
Process_Page Faults_sec,Process_Page Faults_sec,0.0
Process_Working Set,Process_Working Set,0.0
Network_I(Intel R _82574L_GNC) Bytes Received sec,Network_I(Intel R _82574L_GNC) Bytes Received sec,0.0
label,label,0.0


## Seperating independent and dependent variables:

In [88]:
# Removing Y and Id
X = df.iloc [: ,:-1].values    # independent variables
# Keeping only Y
y = df.iloc [: , -1].values    # dependent variables

## Handling Inf values:


In [89]:
import numpy as np

# find min and max values for each column, ignoring nan, -inf, and inf
mins = [np.nanmin(X[:, i][X[:, i] != -np.inf]) for i in range(X.shape[1])]
maxs = [np.nanmax(X[:, i][X[:, i] != np.inf]) for i in range(X.shape[1])]

# go through matrix one column at a time and replace  + and -infinity 
# with the max or min for that column
for i in range(X.shape[1]):
    X[:, i][X[:, i] == -np.inf] = mins[i]
    X[:, i][X[:, i] == np.inf] = maxs[i]

## Standardizing all numerical columns 

In [90]:
# feature Scaling
sc = MinMaxScaler(feature_range = (0,1))
X = sc.fit_transform(X)

# Train test Split

In [91]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,test_size=0.2)

# Unsupervised modelling:

## SOM

In [92]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Minisom library and module is used for performing Self Organizing Maps
from minisom import MiniSom

In [93]:
def classify(som, data):
    """Classifies each sample in data in one of the classes definited
    using the method labels_map.
    Returns a list of the same length of data where the i-th element
    is the class assigned to data[i].
    """
    winmap = som.labels_map(X_train, y_train)
    default_class = np.sum(list(winmap.values())).most_common()[0][0]
    result = []
    for d in data:
        win_position = som.winner(d)
        if win_position in winmap:
            result.append(winmap[win_position].most_common()[0][0])
        else:
            result.append(default_class)
    return result

In [94]:
# I use linear som topography
som_shape = (1, 2)
som = MiniSom(som_shape[0], som_shape[1], X_train.shape[1],sigma = 3, learning_rate=0.5, 
              neighborhood_function='triangle')
som.pca_weights_init(X_train)
som.train_random(X_train, 100, verbose=False)

print(classification_report(y_test, classify(som, X_test)))

              precision    recall  f1-score   support

           0       0.80      0.72      0.76      2186
           1       0.74      0.82      0.78      2186

    accuracy                           0.77      4372
   macro avg       0.77      0.77      0.77      4372
weighted avg       0.77      0.77      0.77      4372



## K-means clustering 

In [95]:
#Now let's open it with pandas
import pandas as pd
from pandas import Series,DataFrame
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import numpy as np
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn import metrics 


import warnings
warnings.filterwarnings('ignore')

In [96]:
def k_means_fit(covariates,centroid_method):
    data = pd.DataFrame(covariates)
    # from sklearn.cluster import KMeans 
    clusters = 2
    number_of_iteration = 100
    kmeans = KMeans(n_clusters = clusters,n_init= number_of_iteration,init = centroid_method,algorithm = "full") 
    kmeans.fit(data) 
    
    y_labels_test = kmeans.predict(X_test)
    tab = pd.crosstab(index=y_test, columns=y_labels_test)
    print (metrics.classification_report(y_test,y_labels_test))

In [97]:
k_means_fit(X_train,centroid_method = "random")

              precision    recall  f1-score   support

           0       1.00      0.68      0.81      2186
           1       0.76      1.00      0.86      2186

    accuracy                           0.84      4372
   macro avg       0.88      0.84      0.84      4372
weighted avg       0.88      0.84      0.84      4372



# Result

For both SOM and K-means clustering giving us similar measure of accuracy measrure and between them k-means is slightly better and it's around 84%.