# IS-02 Machine Learning - Data and Web Science
## Final Project
## Problem 1 - Unlabeled Clustering
### <i>Avgitidis Konstantinos AM: 65</i>

In [1]:
#importing necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans, Birch

The problem asks us to fit a model on unlabeled data and then predict the labels of a separate,
labeled dataset. This is a clustering problem, so we should first inspect the data and then
decide which clustering algorithms to use.

In [2]:
# Read the training and test CSV files, in that order, into DataFrames
X_train, X_test = map(pd.read_csv, ('NSL-KDDTrain.csv', 'NSL-KDDTest.csv'))

In [3]:
# Preview the first rows of the training frame (head() defaults to 5 rows)
X_train.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0
1,0,udp,other,SF,146,0,0,0,0,0,...,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0
2,0,tcp,private,S0,0,0,0,0,0,0,...,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0
3,0,tcp,http,SF,232,8153,0,0,0,0,...,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Preview the first rows of the test frame (head() defaults to 5 rows)
X_test.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,target
0,0,tcp,private,REJ,0,0,0,0,0,0,...,10,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,attack
1,0,tcp,private,REJ,0,0,0,0,0,0,...,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,attack
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,attack
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,86,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,attack


We can see that the test dataset contains one extra column, "target", which appears to hold
the type of communication (normal or attack). We can confirm that by running the following code:

In [5]:
# Count how many test rows carry each value of the 'target' column
X_test['target'].value_counts()

attack    12833
normal     9711
Name: target, dtype: int64

So it appears that the test set contains more attacks than normal communications. If we assume that the test set is a
representative sample of the full dataset (train + test), we can infer that the training set also contains
more attacks than normal communications. This is an assumption rather than something we can verify, because the
training data is unlabeled. We will use it later on to decide which cluster corresponds to attacks.

Moving on, let's check whether the training data contains any NaN values.

In [6]:
# Total number of missing cells across the whole training frame (0 means none)
missing_cells = int(X_train.isna().to_numpy().sum())
print(missing_cells)

0


There are no NaN values in the training data, so no imputation is needed.

Now let's look at the column types of the training dataset.

In [7]:
# Show the dtype of every training column; 'object' columns are the
# categorical ones that have to be encoded before clustering.
X_train.dtypes

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          

We will use one-hot encoding to replace all categorical (object) columns with numeric indicator columns.
Because the two sets are encoded separately, after the encoding we must also restrict the training set
to the columns that are present in the test set.

In [8]:
# One-hot encode the categorical (object) columns of both frames.
# get_dummies replaces each listed column with its indicator columns, so the
# original string columns need no separate drop afterwards.
X_train, X_test = (
    pd.get_dummies(frame, columns=['protocol_type','service','flag'])
    for frame in (X_train, X_test)
)

In [9]:
# Pull the ground-truth labels out of the test frame in one step: pop() hands
# back the 'target' column and removes it from X_test in place, leaving
# features only. Xtarget is kept for scoring the clusters later.
Xtarget = X_test.pop('target')

In [10]:
# Give the training frame exactly the test frame's columns, in the same order.
# The two frames were one-hot encoded separately, so their dummy columns can
# differ: train-only columns are dropped, and a column that exists only in the
# test set is added to train as all zeros (that category never occurred there)
# instead of raising a KeyError.
X_train = X_train.reindex(columns=X_test.columns, fill_value=0)

In [11]:
# Standardise every feature. The scaling statistics are learned from the
# training frame only and then applied unchanged to both frames, which from
# here on are plain NumPy arrays rather than DataFrames.
scaler = StandardScaler().fit(X_train.to_numpy())
X_train, X_test = scaler.transform(X_train.to_numpy()), scaler.transform(X_test.to_numpy())

In [12]:
# Reduce the feature matrices to 20 principal components.
# The projection is fitted on the training matrix only and then reused,
# unchanged, to transform the test matrix.
pca = PCA(n_components=20,svd_solver="full")
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [13]:
# define the models
# Every model looks for two clusters. A fixed seed makes the randomised
# centroid initialisation repeatable, so a fresh "Restart & Run All"
# reproduces the same clusters and accuracies.
SEED = 42
models = [KMeans(n_clusters=2,n_init=100,init="k-means++",max_iter=1000000,random_state=SEED),
          KMeans(n_clusters=2,n_init=20,init="random",max_iter=1000000,random_state=SEED),
          MiniBatchKMeans(n_clusters=2,max_iter=1000000,random_state=SEED),
          # Birch takes no random_state parameter, so it is left as it was.
          Birch(n_clusters=2),]

In [14]:
# Fit each model on the training matrix, decide which cluster id stands for
# "attack", then score the model's predictions on the labelled test set.
for model in models:
    train_results = model.fit_predict(X_train)

    # The larger training cluster is taken to be "attack"; a tie goes to id 1.
    # NOTE(review): the recorded run shows accuracies well below 0.5 for the
    # k-means variants, which for a two-class problem hints that this
    # majority-cluster rule may assign the labels the wrong way round —
    # confirm against the real class balance of the training data.
    zeros = int((train_results == 0).sum())
    ones = int((train_results == 1).sum())
    attack_id = 0 if zeros > ones else 1
    normal_id = 1 - attack_id

    # Encode the ground-truth strings with the same ids the model emits.
    Y_test = [normal_id if label == 'normal' else attack_id for label in Xtarget.values]
    Y_pred = model.predict(X_test)

    # Accuracy = share of test rows whose predicted id equals the encoded truth.
    results = [int(truth == guess) for truth, guess in zip(Y_test, Y_pred)]
    accuracy = sum(results) / len(results)
    print(f'Model: {str(model)}, Accuracy = {accuracy}')

Model: KMeans(max_iter=1000000, n_clusters=2, n_init=100), Accuracy = 0.26534776437189495
Model: KMeans(init='random', max_iter=1000000, n_clusters=2, n_init=20), Accuracy = 0.26534776437189495
Model: MiniBatchKMeans(max_iter=1000000, n_clusters=2), Accuracy = 0.2708037615330021
Model: Birch(n_clusters=2), Accuracy = 0.5693310858765082
