In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import numpy as np
import pandas as pd


In [2]:
# Lookup from each raw label string (note the trailing dot) to the coarse
# traffic category used as the prediction target further down.
Y_LABELS = {
    raw_label: category
    for category, raw_labels in (
        ('Normal', ('normal.',)),
        ('DoS', (
            'back.', 'land.', 'neptune.', 'pod.', 'smurf.', 'teardrop.',
        )),
        ('Probe', ('ipsweep.', 'nmap.', 'portsweep.', 'satan.')),
        ('R2L', (
            'ftp_write.', 'guess_passwd.', 'imap.', 'multihop.',
            'phf.', 'spy.', 'warezclient.', 'warezmaster.',
        )),
        ('U2R', ('buffer_overflow.', 'loadmodule.', 'perl.', 'rootkit.')),
    )
    for raw_label in raw_labels
}

In [3]:
class LabelEncoder(object):
    """Encode values as consecutive integer codes and decode them again.

    Codes are assigned in the sorted order returned by ``np.unique``, so the
    smallest distinct value becomes 0, the next one 1, and so on.

    Attributes:
        mapping: dict from original value to integer code.
        inverse_mapping: dict from integer code back to original value.
    """

    def __init__(self):
        self.mapping = {}
        self.inverse_mapping = {}

    def fit(self, data):
        """Learn the value <-> code tables from ``data``.

        Anything learnt by a previous ``fit`` is discarded, so one encoder
        instance can be reused for several columns without stale entries
        leaking from one column into the next.

        Args:
            data: array-like of values accepted by ``np.unique``.
        """
        unique_values = np.unique(data)
        # Rebuild both tables from scratch instead of updating in place.
        self.mapping = {
            value: code for code, value in enumerate(unique_values)
        }
        self.inverse_mapping = dict(enumerate(unique_values))

    def transform(self, data):
        """Return the integer codes for ``data`` as a numpy array.

        Raises:
            KeyError: if a value was not present in the data passed to ``fit``.
        """
        return np.array([self.mapping[value] for value in data])

    def fit_transform(self, data):
        """Fit on ``data`` and return its encoded form."""
        self.fit(data)
        return self.transform(data)

    def inverse_transform(self, data):
        """Map integer codes back to the original values.

        Raises:
            KeyError: if a code was not produced by the last ``fit``.
        """
        return np.array([self.inverse_mapping[value] for value in data])

In [4]:
class Cluster(object):
    """Label-constrained, k-means style classifier.

    During fitting every point is attached to its nearest centroid subject to
    two constraints enforced by ``violated``: a cluster only holds points of a
    single label, and a label only lives in a single cluster. After fitting,
    each remaining centroid carries exactly one label and ``predict`` returns
    the label of the nearest centroid.

    Args:
        n_clusters: number of centroids to start with.
        max_iter: maximum number of assignment/update rounds.
        eps: stop early once no centroid coordinate moves by ``eps`` or more.
        distance: name of the distance function; only ``'euclidean'`` is
            registered. An unknown name raises ``KeyError``.
        random_state: optional seed for the initial centroid draw. ``None``
            (the default) uses numpy's global random state.

    Attributes set by ``fit``:
        label: array with the label attached to each kept centroid.
        Mu: array of centroids, one row per entry of ``label``.
    """

    def __init__(
            self,
            n_clusters=4,
            max_iter=10,
            eps=1e-6,
            distance='euclidean',
            random_state=None
    ):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.eps = eps
        # Keep the name next to the callable so saveModel can write something
        # readable instead of a function repr.
        self.distance_name = distance
        self.distance = {
            'euclidean': self.euclidean,
        }[distance]
        self.random_state = random_state

    def fit(self, X, y):
        """Fit centroids and their labels.

        Args:
            X: 2-D array-like of numeric features, one row per sample.
            y: 1-D array-like of labels aligned with the rows of ``X``.

        Points whose label cannot be placed in any cluster without breaking
        the constraints (for example when there are more distinct labels than
        clusters) are left out of that round's centroid update.
        """
        # Float centroids: with an integer X the means would be truncated
        # when written back into Mu.
        X = np.asarray(X, dtype=float)
        # Positional access regardless of any pandas index on y.
        y = np.asarray(y)
        rng = (
            np.random if self.random_state is None
            else np.random.RandomState(self.random_state)
        )
        Mu = X[rng.choice(X.shape[0], self.n_clusters, replace=False)]
        for _ in range(self.max_iter):
            Mu_copy = Mu.copy()
            # C[j]: row indices assigned to cluster j in this round.
            C = [[] for _ in range(self.n_clusters)]
            # label[j]: set of labels present in cluster j (at most one).
            label = np.array([set() for _ in range(self.n_clusters)])
            for i in range(X.shape[0]):
                dist = np.array([self.distance(X[i], mu) for mu in Mu])
                # Try clusters from nearest to farthest until one accepts.
                for _attempt in range(self.n_clusters):
                    r = np.argmin(dist)
                    if not self.violated(y[i], r, label):
                        C[r].append(i)
                        label[r].add(y[i])
                        break
                    dist[r] = np.inf
            # Per-round trace of which label each cluster ended up with.
            print(label)
            for j in range(self.n_clusters):
                # An empty cluster keeps its previous centroid; averaging an
                # empty selection would turn it into NaN.
                if C[j]:
                    Mu[j] = np.mean(X[C[j]], axis=0)
            if np.all(np.abs(Mu_copy - Mu) < self.eps):
                break
        # Clusters that never received a point have no label to predict, so
        # they are dropped from the fitted model.
        labelled = [j for j in range(self.n_clusters) if label[j]]
        self.label = np.array([next(iter(label[j])) for j in labelled])
        self.Mu = Mu[labelled]

    def predict(self, X):
        """Return, for each row of ``X``, the label of its nearest centroid."""
        X = np.asarray(X)
        y_pred = np.array([
            self.label[np.argmin(
                [self.distance(X[i], mu) for mu in self.Mu]
            )] for i in range(X.shape[0])
        ])
        return y_pred

    def fit_predict(self, X, y):
        """Fit on ``(X, y)`` and return the predictions for ``X``."""
        self.fit(X, y)
        return self.predict(X)

    def score(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``."""
        y_pred = self.predict(X)
        return np.mean(y_pred == y)

    @staticmethod
    def euclidean(a, b):
        """Euclidean (L2) distance between vectors ``a`` and ``b``."""
        return np.linalg.norm(a - b)

    def violated(self, i_label, r, labels):
        """Tell whether putting a point with ``i_label`` into cluster ``r`` is forbidden.

        Args:
            i_label: label of the point being assigned.
            r: index of the candidate cluster.
            labels: per-cluster sets of labels assigned so far.

        Returns:
            True if cluster ``r`` already holds a different label, or if it is
            still empty but ``i_label`` already belongs to another cluster.
        """
        if labels[r]:
            return i_label not in labels[r]
        return any(i_label in labels[j] for j in range(self.n_clusters))

    def saveModel(self):
        """Write the hyper-parameters, labels and centroids to a text file.

        The target is ``Models/cluster_<version>.csv`` where ``<version>`` is
        the integer read from the version file.
        """
        # NOTE(review): 'verison' looks like a typo for 'version'; the path is
        # left unchanged because the file on disk may really have this name.
        with open('Models/verison', 'r') as f:
            version = int(f.read())
        with open(f'Models/cluster_{version}.csv', 'w') as f:
            f.write(f'{self.n_clusters}\n')
            f.write(f'{self.max_iter}\n')
            f.write(f'{self.eps}\n')
            # The distance name, not the callable: a function repr embeds a
            # memory address and cannot be read back.
            f.write(f'{self.distance_name}\n')
            f.write(f'{self.label}\n')
            f.write(f'{self.Mu}\n')
            

In [5]:
def min_max_normalize(data):
    """Rescale each column of ``data`` linearly so its values span [0, 1].

    Minima and maxima are taken along axis 0. A column whose minimum equals
    its maximum is divided by zero; no special handling is applied here.
    """
    lower = data.min(axis=0)
    span = data.max(axis=0) - lower
    return (data - lower) / span

def z_score_normalize(data):
    """Standardise each column of ``data``: subtract its mean, divide by its std.

    Both statistics are computed along axis 0 with ``data``'s own ``mean`` and
    ``std`` methods, so the degrees-of-freedom convention is whatever that
    container uses by default.
    """
    centred = data - data.mean(axis=0)
    return centred / data.std(axis=0)

In [6]:
# The raw KDD Cup file has no header row; without header=None the first
# record would be consumed as column names and lost from the data.
df = pd.read_csv('kddcup.data.gz', header=None)
# Copy so that the column assignments below write to an independent frame
# rather than to a slice of df (avoids SettingWithCopyWarning).
X = df.iloc[:, :-1].copy()
y = df.iloc[:, -1]

# Integer-encode every text column so the features are fully numeric.
string_columns = X.select_dtypes(include=['object']).columns
for col in string_columns:
    # Fresh encoder per column: the codes of one column must not be mixed
    # with values learnt from another.
    encoder = LabelEncoder()
    X[col] = encoder.fit_transform(X[col])

# Constant columns become NaN under min-max scaling (0 / 0) and are dropped.
X = min_max_normalize(X).dropna(axis=1)
# Collapse the raw labels into the coarse categories defined in Y_LABELS.
y = y.map(Y_LABELS)

In [7]:
# Training set: the leading 5/7 of the rows, taken in file order.
X_train = X.iloc[:len(X) * 5 // 7].reset_index(drop=True)
y_train = y.iloc[:len(y) * 5 // 7].reset_index(drop=True)

In [8]:
# Fit the constrained clustering model on the training split. The initial
# centroids are drawn at random, so repeated runs can differ.
model = Cluster(
    n_clusters=4,
    max_iter=10,
    eps=1e-6,
    distance='euclidean',
)
model.fit(X_train.to_numpy(), y_train)

[{'R2L'} {'Normal'} {'DoS'} {'U2R'}]
[{'Normal'} {'R2L'} {'DoS'} {'U2R'}]
[{'R2L'} {'Normal'} {'DoS'} {'U2R'}]
[{'Normal'} {'R2L'} {'DoS'} {'U2R'}]
[{'R2L'} {'Normal'} {'DoS'} {'U2R'}]
[{'Normal'} {'R2L'} {'DoS'} {'U2R'}]
[{'R2L'} {'Normal'} {'DoS'} {'U2R'}]
[{'Normal'} {'R2L'} {'DoS'} {'U2R'}]
[{'R2L'} {'Normal'} {'DoS'} {'U2R'}]
[{'Normal'} {'R2L'} {'DoS'} {'U2R'}]


In [9]:
# Test set: the remaining trailing 2/7 of the rows.
X_test = X.iloc[len(X) * 5 // 7:].reset_index(drop=True)
y_test = y.iloc[len(y) * 5 // 7:].reset_index(drop=True)

In [10]:
pred = model.predict(X_test.to_numpy())
# Number of correct predictions, counted with the vectorised .sum() rather
# than Python's builtin sum, which would iterate element by element.
print((pred == y_test).sum())
# Total number of test rows, for comparison with the count above.
print(X_test.shape[0])
pred

1326576
1399552


array(['DoS', 'DoS', 'DoS', ..., 'Normal', 'Normal', 'Normal'],
      dtype='<U6')

In [11]:
# Show the label attached to each centroid, then the centroids themselves.
print(model.label, model.Mu, sep='\n')

['Normal' 'R2L' 'DoS' 'U2R']
[[1.37211822e-04 5.29773605e-01 3.74176851e-01 8.73407478e-01
  8.60111466e-07 3.02006365e-06 0.00000000e+00 0.00000000e+00
  1.85023864e-06 6.29951837e-04 2.04179275e-05 8.76674194e-01
  3.15767192e-06 3.96168744e-04 7.54244339e-05 5.91494576e-06
  1.17007978e-04 1.34849745e-04 5.73090255e-04 1.52372594e-06
  3.71179638e-03 1.94200511e-02 2.59357089e-02 1.56773114e-03
  1.90665350e-03 3.30300357e-02 3.32114048e-02 9.95535285e-01
  6.92853421e-03 1.39722941e-01 5.28982940e-01 8.73399683e-01
  9.22416492e-01 1.13133146e-02 8.21265729e-02 2.40095934e-02
  1.94287247e-03 9.37289535e-04 3.44659798e-02 3.26716401e-02]
 [1.69653573e-02 5.00000000e-01 3.43314192e-01 8.44182390e-01
  3.48578258e-04 9.72041697e-05 0.00000000e+00 0.00000000e+00
  3.36927224e-04 7.63701707e-02 1.76100629e-02 8.63207547e-01
  8.19905799e-06 6.28930818e-03 7.86163522e-04 1.20008893e-05
  1.13353810e-03 2.35849057e-03 1.22292103e-03 0.00000000e+00
  2.37421384e-01 2.54464670e-03 6.083151

In [12]:
# pred
# # pred = np.array(list(p) for p in pred)
# pred = np.array([list(p)[0] if len(p) == 1 else 'DoS' for p in pred])
# print(sum(pred == y_test.to_numpy()))

In [13]:
# YY_test = y_test.iloc[:, 0].map({
#     'Probe': 0,
#     'R2L': 1,
#     'DoS': 2,
#     'Normal': 3
# })
# print(sum(pred == YY_test))

In [14]:
    # def fit(self, X, y):
    #     self.Mu = X.sample(n=self.n_clusters, replace=False)
    #     for _ in range(self.max_iter):
    #         Mu_copy = self.Mu.copy()
    #         self.C = {cluster: [] for cluster in range(self.n_clusters)}
    #         for i in X.index:
    #             dist = np.array([self.distance(X.loc[i], mu) for mu in self.Mu.itertuples(index=False)])
    #             for _ in range(self.n_clusters):
    #                 r = np.argmin(dist)
    #                 if not self.violated(i, r):
    #                     self.C[r].append(i)
    #                     break
    #                 else:
    #                     dist[r] = np.inf
    #         for j in range(self.n_clusters):
    #             self.Mu.iloc[j] = X.loc[self.C[j]].mean()
    #         if np.all(np.abs(Mu_copy.values - self.Mu.values) < self.eps):
    #             break