In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import numpy as np
import pandas as pd

In [107]:
Y_LABELS = {
    'normal.': 'Normal',
    
    'back.': 'DoS',
    'land.': 'DoS',
    'neptune.': 'DoS',
    'pod.': 'DoS',
    'smurf.': 'DoS',
    'teardrop.': 'DoS',
    
    'ipsweep.': 'Probe',
    'nmap.': 'Probe',
    'portsweep.': 'Probe',
    'satan.': 'Probe',
    
    'ftp_write.': 'R2L',
    'guess_passwd.': 'R2L',
    'imap.': 'R2L',
    'multihop.': 'R2L',
    'phf.': 'R2L',
    'spy.': 'R2L',
    'warezclient.': 'R2L',
    'warezmaster.': 'R2L',
    
    'buffer_overflow.': 'U2R',
    'loadmodule.': 'U2R',
    'perl.': 'U2R',
    'rootkit.': 'U2R'
}

In [3]:
class LabelEncoder(object):
    def __init__(self):
        self.mapping = {}
        self.inverse_mapping = {}

    def fit(self, data):
        unique_values = np.unique(data)
        for i, value in enumerate(unique_values):
            self.mapping[value] = i
            self.inverse_mapping[i] = value

    def transform(self, data):
        return np.array([self.mapping[value] for value in data])

    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

    def inverse_transform(self, data):
        return np.array([self.inverse_mapping[value] for value in data])

In [109]:
class Cluster(object):
    def __init__(
            self, 
            n_clusters=5, 
            max_iter=100, 
            eps=1e-6, 
            distance='euclidean',
            random_state=None,
            **kwargs
    ):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.eps = eps
        self.distance = {
            'euclidean': self.euclidean,
        }[distance]
        self.random_state = random_state
        for k, v in kwargs.items():
            # print(k, v)
            if type(v) == str:
                v = eval(v)
            setattr(self, k, v)
    
    def to_DataFrame(self):
        kv = self.__dict__.copy()
        kv['distance'] = self.distance.__name__
        kv['Mu'] = kv['Mu'].tolist()
        kv['label'] = kv['label'].tolist()
        return pd.DataFrame([kv])
    
    def fit(self, X, y):
        np.random.seed(self.random_state)
        Mu = X[np.random.choice(
            X.shape[0], self.n_clusters, replace=False
        )]
        
        mp = {
            y.unique()[i]: i
            for i in range(y.unique().shape[0])
        }
        mp.update({
            i: y.unique()[i]
            for i in range(y.unique().shape[0])
        })
        label_ = np.array([
            np.array([0] * y.unique().shape[0])
            for _ in range(self.n_clusters)
        ])
        
        # label_ = {
        #     i : [0] * self.n_clusters
        #     for i in range(y.unique().shape[0])
        # }
        # label_ = pd.DataFrame(label_, index=y.unique())
        
        # print(y.values)
        for _ in range(self.max_iter):
            Mu_copy = Mu.copy()
            C = [[] for _ in range(self.n_clusters)]
            label = label_.copy()
            # print(label)
            for i in range(X.shape[0]):
                dist = np.array([self.distance(X[i], mu) for mu in Mu])
                for _ in range(self.n_clusters):
                    r = np.argmin(dist)
                    if not self.violated(mp[y[i]], r, label):
                        C[r].append(i)
                        # label[r].add(y[i])
                        label[r][mp[y[i]]] += 1
                        break
                    else:
                        dist[r] = np.inf
            print(label)
            for j in range(self.n_clusters):
                Mu[j] = np.mean(X[C[j]], axis=0)
            if np.all(np.abs(Mu_copy - Mu) < self.eps):
                break
            
        self.label = np.array(
            [mp[np.random.choice(np.where(s == s.max())[0], 1)[0]] for s in label]
        )
        self.Mu = Mu
    
    def predict(self, X):
        y_pred = np.array([
            self.label[np.argmin(
                [self.distance(X[i], mu) for mu in self.Mu]
            )] for i in range(X.shape[0])
        ])
        return y_pred
    
    def fit_predict(self, X, y):
        self.fit(X, y)
        return self.predict(X)
    
    def score(self, X, y):
        y_pred = self.predict(X)
        self.accuracy = np.mean(y_pred == y)
        return self.accuracy
        
    @staticmethod
    def euclidean(a, b):
        return np.linalg.norm(a - b)
    
    def violated(self, i_label, r, labels):
        if not labels[r].sum():
            return False
        return not labels[r][i_label] and np.sum(labels[:, i_label])
    
    def saveModel(self):
        with open('Models/version_C', 'r') as f:
            version = int(f.read())
        version += 1
        with open('Models/version_C', 'w') as f:
            f.write(str(version))
        self.to_DataFrame().to_csv(f'Models/cluster_{version}.csv', index=False)
    
    def loadModel(self, version):
        with open(f'Models/cluster_{version}.csv', 'r') as f:
            kv = pd.read_csv(f).iloc[0].to_dict()
        self.__init__(**kv)
    
    def loadFromDataFrame(self, df):
        kv = df.iloc[0].to_dict()
        self.__init__(**kv)

In [5]:
def min_max_normalize(data):
    min_val = data.min(axis=0)
    max_val = data.max(axis=0)
    return (data - min_val) * 1_000_000 / (max_val - min_val)

def z_score_normalize(data):
    mean = data.mean(axis=0)
    std = data.std(axis=0)
    return (data - mean) / std

In [6]:
df = pd.read_csv('kddcup.data.gz')
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

encoder = LabelEncoder()
for col in X.select_dtypes(include=['object']).columns:
    X[col] = encoder.fit_transform(X[col])

X = min_max_normalize(X).dropna(axis=1)
# y = y.map(Y_LABELS)
y.value_counts()

normal.
smurf.              2807886
neptune.            1072017
normal.              972780
satan.                15892
ipsweep.              12481
portsweep.            10413
nmap.                  2316
back.                  2203
warezclient.           1020
teardrop.               979
pod.                    264
guess_passwd.            53
buffer_overflow.         30
land.                    21
warezmaster.             20
imap.                    12
rootkit.                 10
loadmodule.               9
ftp_write.                8
multihop.                 7
phf.                      4
perl.                     3
spy.                      2
Name: count, dtype: int64

In [112]:
X_train = X.iloc[:X.shape[0] * 4 // 5, :].reset_index(drop=True)
y_train = y.iloc[:y.shape[0] * 4 // 5].reset_index(drop=True)

In [113]:
model = Cluster(
    n_clusters=12, max_iter=10, 
    eps=1e-6, distance='euclidean', 
    random_state=36,
)
model.fit(X_train.to_numpy(), y_train)

[[ 766475      27       0    1101   19600]
 [      0       0 1901937       0       0]
 [      0       0       0       0   18368]
 [  13098       0       0       0       0]
 [      0       0  344595       0       0]
 [      0       8       0       0       0]
 [    537       0  852974       0       0]
 [      0       0       0      24       0]]
[[ 701947      17       0       0       0]
 [      0       0 2236285       0       0]
 [   6696       0       0       0       0]
 [  64917       0       0       0   34918]
 [      0       0    7294       0       0]
 [      0      18       0       0       0]
 [      0       0       0       0    3050]
 [   6550       0  855927    1125       0]]
[[ 660329       0       0       0       0]
 [      0       0 2246400       0       0]
 [  45362       0       0       0   37968]
 [  58353       0       0    1125       0]
 [   3461       0       0       0       0]
 [  12511      35       0       0       0]
 [      0       0     433       0       0]
 [     94

In [114]:
X_test = X.iloc[X.shape[0] * 4 // 5:, :].reset_index(drop=True)
y_test = y.iloc[y.shape[0] * 4 // 5:].reset_index(drop=True)

In [115]:
# pred = model.predict(X_test.to_numpy())
# print(np.mean(pred == y_test.to_numpy()))
# pred

In [116]:
score = model.score(X_test.to_numpy(), y_test)
print(score)
model.saveModel()
# print(X_test.shape[0])
# pred

0.7871328160247263


In [117]:
# model_test = Cluster()
# model_test.loadModel(version=11)
# model_test.score(X_test.to_numpy(), y_test)
# pred = model_test.predict(X_test.to_numpy())
# pred