In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import numpy as np
import pandas as pd


In [4]:
Y_LABELS = {
    'normal.': 'Normal',
    
    'back.': 'DoS',
    'land.': 'DoS',
    'neptune.': 'DoS',
    'pod.': 'DoS',
    'smurf.': 'DoS',
    'teardrop.': 'DoS',
    
    'ipsweep.': 'Probe',
    'nmap.': 'Probe',
    'portsweep.': 'Probe',
    'satan.': 'Probe',
    
    'ftp_write.': 'R2L',
    'guess_passwd.': 'R2L',
    'imap.': 'R2L',
    'multihop.': 'R2L',
    'phf.': 'R2L',
    'spy.': 'R2L',
    'warezclient.': 'R2L',
    'warezmaster.': 'R2L',
    
    'buffer_overflow.': 'U2R',
    'loadmodule.': 'U2R',
    'perl.': 'U2R',
    'rootkit.': 'U2R'
}

In [5]:
class LabelEncoder(object):
    def __init__(self):
        self.mapping = {}
        self.inverse_mapping = {}

    def fit(self, data):
        unique_values = np.unique(data)
        for i, value in enumerate(unique_values):
            self.mapping[value] = i
            self.inverse_mapping[i] = value

    def transform(self, data):
        return np.array([self.mapping[value] for value in data])

    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

    def inverse_transform(self, data):
        return np.array([self.inverse_mapping[value] for value in data])

In [6]:
class Cluster(object):
    def __init__(
            self, 
            n_clusters=4, 
            max_iter=100, 
            eps=1e-6, 
            distance='euclidean'
    ):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.eps = eps
        self.distance = {
            'euclidean': self.euclidean,
        }[distance]
    
    def fit(self, X, y):
        # print(y)
        self.Mu = X[np.random.choice(
            X.shape[0], self.n_clusters, replace=False
        )]
        for _ in range(self.max_iter):
            Mu_copy = self.Mu.copy()
            C = [[] for _ in range(self.n_clusters)]
            label = np.array([set() for _ in range(self.n_clusters)])
            for i in range(X.shape[0]):
                dist = np.array([self.distance(X[i], mu) for mu in self.Mu])
                for _ in range(self.n_clusters):
                    r = np.argmin(dist)
                    if not self.violated(y[i], r, label):
                        C[r].append(i)
                        label[r].add(y[i])
                        break
                    else:
                        dist[r] = np.inf
            # print(C)
            print(label)
            for j in range(self.n_clusters):
                self.Mu[j] = np.mean(X[C[j]], axis=0)
            if np.all(np.abs(Mu_copy - self.Mu) < self.eps):
                break
    
    def predict(self, X):
        y_pred = np.array([np.argmin([self.distance(X[i], mu) for mu in self.Mu]) for i in range(X.shape[0])])
        return y_pred
    
    def fit_predict(self, X, y):
        self.fit(X, y)
        return self.predict(X)
    
    def score(self, X, y):
        y_pred = self.predict(X)
        return np.mean(y_pred == y)
    
    def get_cluster_centers(self):
        return self.Mu
        
    @staticmethod
    def euclidean(a, b):
        return np.linalg.norm(a - b)
    
    def violated(self, i_label, r, labels):
        if labels[r]:
            if i_label not in labels[r]:
                return True
            else:
                ...
        else:
            for j in range(self.n_clusters):
                if i_label in labels[j]:
                    # print(i_label, labels[j], j)
                    return True
        return False


In [7]:
def min_max_normalize(data):
    min_val = data.min(axis=0)
    max_val = data.max(axis=0)
    return (data - min_val) / (max_val - min_val)

def z_score_normalize(data):
    mean = data.mean(axis=0)
    std = data.std(axis=0)
    return (data - mean) / std

In [8]:
df = pd.read_csv('kddcup.data.gz')
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

string_columns = X.select_dtypes(include=['object']).columns
encoder = LabelEncoder()
for col in string_columns:
    X[col] = encoder.fit_transform(X[col])

X = min_max_normalize(X).dropna(axis=1)
y = y.map(Y_LABELS)

In [9]:
train_slice = np.random.choice(X.shape[0], 10000, replace=False)
X_train = X.iloc[train_slice, :].reset_index(drop=True)
y_train = pd.DataFrame(y.iloc[train_slice].reset_index(drop=True))
# X_test = X.iloc[~train_slice, :]
# y_test = y.iloc[8000:]
X_train
y_train
y_train.value_counts()

Unnamed: 0,0,tcp,http,SF,215,45076,0.1,0.2,0.3,0.4,...,0.16,0.17,0.00.6,0.00.7,0.00.8,0.00.9,0.00.10,0.00.11,0.00.12,0.00.13
0,0.0,0.5,0.710145,0.5,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,...,1.000000,0.023529,0.02,0.07,0.00,0.00,1.00,1.0,0.0,0.00
1,0.0,0.0,0.217391,0.9,7.478457e-07,0.000000,0.0,0.0,0.0,0.0,...,1.000000,1.000000,1.00,0.00,1.00,0.00,0.00,0.0,0.0,0.00
2,0.0,0.0,0.217391,0.9,7.478457e-07,0.000000,0.0,0.0,0.0,0.0,...,1.000000,1.000000,1.00,0.00,1.00,0.00,0.00,0.0,0.0,0.00
3,0.0,0.5,0.710145,0.5,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,...,1.000000,0.011765,0.01,0.07,0.00,0.00,1.00,1.0,0.0,0.00
4,0.0,0.0,0.217391,0.9,7.478457e-07,0.000000,0.0,0.0,0.0,0.0,...,1.000000,1.000000,1.00,0.00,1.00,0.00,0.00,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.217391,0.9,7.478457e-07,0.000000,0.0,0.0,0.0,0.0,...,1.000000,1.000000,1.00,0.00,1.00,0.00,0.00,0.0,0.0,0.00
9996,0.0,0.5,0.347826,0.9,1.768162e-07,0.000006,0.0,0.0,0.0,0.0,...,1.000000,1.000000,1.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00
9997,0.0,0.5,0.347826,0.9,2.405860e-07,0.000002,0.0,0.0,0.0,0.0,...,0.494118,1.000000,1.00,0.00,0.01,0.01,0.01,0.0,0.0,0.01
9998,0.0,0.0,0.217391,0.9,7.478457e-07,0.000000,0.0,0.0,0.0,0.0,...,1.000000,1.000000,1.00,0.00,1.00,0.00,0.00,0.0,0.0,0.00


Unnamed: 0,normal.
0,DoS
1,DoS
2,DoS
3,DoS
4,DoS
...,...
9995,DoS
9996,Normal
9997,Normal
9998,DoS


normal.
DoS        7896
Normal     2006
Probe        95
R2L           3
Name: count, dtype: int64

In [10]:
model = Cluster(n_clusters=4, max_iter=100, eps=1e-6, distance='euclidean')
model.fit(X_train.to_numpy(), y_train.iloc[:, 0].to_numpy())
# y_pred = model.predict(X_train)
# model.score(X_train, y_train)

[{'Probe'} {'R2L'} {'DoS'} {'Normal'}]
[{'Probe'} {'R2L'} {'DoS'} {'Normal'}]


In [12]:
test_slice = np.random.choice(X.shape[0], 10000, replace=False)
X_test = X.iloc[train_slice, :].reset_index(drop=True)
y_test = pd.DataFrame(y.iloc[train_slice].reset_index(drop=True))
# X_test = X.iloc[~train_slice, :]
# y_test = y.iloc[8000:]
X_test
y_test
y_test.value_counts()

Unnamed: 0,0,tcp,http,SF,215,45076,0.1,0.2,0.3,0.4,...,0.16,0.17,0.00.6,0.00.7,0.00.8,0.00.9,0.00.10,0.00.11,0.00.12,0.00.13
0,0.0,0.5,0.710145,0.5,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,...,1.000000,0.023529,0.02,0.07,0.00,0.00,1.00,1.0,0.0,0.00
1,0.0,0.0,0.217391,0.9,7.478457e-07,0.000000,0.0,0.0,0.0,0.0,...,1.000000,1.000000,1.00,0.00,1.00,0.00,0.00,0.0,0.0,0.00
2,0.0,0.0,0.217391,0.9,7.478457e-07,0.000000,0.0,0.0,0.0,0.0,...,1.000000,1.000000,1.00,0.00,1.00,0.00,0.00,0.0,0.0,0.00
3,0.0,0.5,0.710145,0.5,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,...,1.000000,0.011765,0.01,0.07,0.00,0.00,1.00,1.0,0.0,0.00
4,0.0,0.0,0.217391,0.9,7.478457e-07,0.000000,0.0,0.0,0.0,0.0,...,1.000000,1.000000,1.00,0.00,1.00,0.00,0.00,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.217391,0.9,7.478457e-07,0.000000,0.0,0.0,0.0,0.0,...,1.000000,1.000000,1.00,0.00,1.00,0.00,0.00,0.0,0.0,0.00
9996,0.0,0.5,0.347826,0.9,1.768162e-07,0.000006,0.0,0.0,0.0,0.0,...,1.000000,1.000000,1.00,0.00,0.00,0.00,0.00,0.0,0.0,0.00
9997,0.0,0.5,0.347826,0.9,2.405860e-07,0.000002,0.0,0.0,0.0,0.0,...,0.494118,1.000000,1.00,0.00,0.01,0.01,0.01,0.0,0.0,0.01
9998,0.0,0.0,0.217391,0.9,7.478457e-07,0.000000,0.0,0.0,0.0,0.0,...,1.000000,1.000000,1.00,0.00,1.00,0.00,0.00,0.0,0.0,0.00


Unnamed: 0,normal.
0,DoS
1,DoS
2,DoS
3,DoS
4,DoS
...,...
9995,DoS
9996,Normal
9997,Normal
9998,DoS


normal.
DoS        7896
Normal     2006
Probe        95
R2L           3
Name: count, dtype: int64

In [14]:
pred = model.predict(X_test.to_numpy())
pred

array([2, 2, 2, ..., 3, 2, 2], dtype=int64)

In [16]:
YY_test = y_test.iloc[:, 0].map({
    'Probe': 0,
    'R2L': 1,
    'DoS': 2,
    'Normal': 3
})
print(sum(pred == YY_test))

9227


In [11]:
    # def fit(self, X, y):
    #     self.Mu = X.sample(n=self.n_clusters, replace=False)
    #     for _ in range(self.max_iter):
    #         Mu_copy = self.Mu.copy()
    #         self.C = {cluster: [] for cluster in range(self.n_clusters)}
    #         for i in X.index:
    #             dist = np.array([self.distance(X.loc[i], mu) for mu in self.Mu.itertuples(index=False)])
    #             for _ in range(self.n_clusters):
    #                 r = np.argmin(dist)
    #                 if not self.violated(i, r):
    #                     self.C[r].append(i)
    #                     break
    #                 else:
    #                     dist[r] = np.inf
    #         for j in range(self.n_clusters):
    #             self.Mu.iloc[j] = X.loc[self.C[j]].mean()
    #         if np.all(np.abs(Mu_copy.values - self.Mu.values) < self.eps):
    #             break