In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import numpy as np
import pandas as pd

In [3]:
Y_LABELS = {
    'normal.': 'Normal',
    
    'back.': 'DoS',
    'land.': 'DoS',
    'neptune.': 'DoS',
    'pod.': 'DoS',
    'smurf.': 'DoS',
    'teardrop.': 'DoS',
    
    'ipsweep.': 'Probe',
    'nmap.': 'Probe',
    'portsweep.': 'Probe',
    'satan.': 'Probe',
    
    'ftp_write.': 'R2L',
    'guess_passwd.': 'R2L',
    'imap.': 'R2L',
    'multihop.': 'R2L',
    'phf.': 'R2L',
    'spy.': 'R2L',
    'warezclient.': 'R2L',
    'warezmaster.': 'R2L',
    
    'buffer_overflow.': 'U2R',
    'loadmodule.': 'U2R',
    'perl.': 'U2R',
    'rootkit.': 'U2R'
}

In [4]:
class LabelEncoder(object):
    def __init__(self):
        self.mapping = {}
        self.inverse_mapping = {}
    def fit(self, data):
        unique_values = np.unique(data)
        for i, value in enumerate(unique_values):
            self.mapping[value] = i
            self.inverse_mapping[i] = value
    def transform(self, data):
        return np.array([self.mapping[value] for value in data])

    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

    def inverse_transform(self, data):
        return np.array([self.inverse_mapping[value] for value in data])

In [5]:
class Cluster(object):
    def __init__(
            self, 
            n_clusters=5, 
            max_iter=100, 
            eps=1e-6, 
            distance='euclidean',
            random_state=None,
            **kwargs
    ):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.eps = eps
        self.distance = {
            'euclidean': self.euclidean,
            'manhattan': self.manhattan,
        }[distance]
        self.random_state = random_state
        for k, v in kwargs.items():
            if type(v) == str:
                v = eval(v)
            setattr(self, k, v)
    
    def to_DataFrame(self):
        kv = self.__dict__.copy()
        kv['distance'] = self.distance.__name__
        kv['Mu'] = kv['Mu'].tolist()
        kv['label'] = kv['label'].tolist()
        return pd.DataFrame([kv])
    
    def fit(self, X, y):
        np.random.seed(self.random_state)
        Mu = X[np.random.choice(
            X.shape[0], self.n_clusters, replace=False
        )]
        
        mp = {
            y.unique()[i]: i
            for i in range(y.unique().shape[0])
        }
        mp.update({
            i: y.unique()[i]
            for i in range(y.unique().shape[0])
        })
        label_ = np.array([
            np.array([0] * y.unique().shape[0])
            for _ in range(self.n_clusters)
        ])
        
        for n_iter in range(self.max_iter):
            Mu_copy = Mu.copy()
            C = [[] for _ in range(self.n_clusters)]
            label = label_.copy()
        
            for i in range(X.shape[0]):
                dist = np.array([self.distance(X[i], mu) for mu in Mu])
                for _ in range(self.n_clusters):
                    r = np.argmin(dist)
                    if not self.violated(mp[y[i]], r, label):
                        C[r].append(i)
                        label[r][mp[y[i]]] += 1
                        break
                    else:
                        dist[r] = np.inf
            print(f'iter: {n_iter + 1}')
            print(label)
            for j in range(self.n_clusters):
                Mu[j] = np.mean(X[C[j]], axis=0)
            if np.all(np.abs(Mu_copy - Mu) < self.eps):
                break
        
        self.label = np.array(
            [mp[np.random.choice(np.where(s == s.max())[0], 1)[0]] for s in label]
        )
        self.Mu = Mu
    
    def predict(self, X):
        y_pred = np.array([
            self.label[np.argmin(
                [self.distance(X[i], mu) for mu in self.Mu]
            )] for i in range(X.shape[0])
        ])
        return y_pred
    
    def fit_predict(self, X, y):
        self.fit(X, y)
        return self.predict(X)
    
    def score(self, X, y):
        y_pred = self.predict(X)
        self.accuracy = np.mean(y_pred == y)
        return self.accuracy
    
    @staticmethod
    def euclidean(a, b):
        return np.linalg.norm(a - b)
    
    @staticmethod
    def manhattan(a, b):
        return np.sum(np.abs(a - b))
    
    @staticmethod
    def violated(i_label, r, labels):
        if not labels[r].sum():
            return False
        return not labels[r][i_label] and np.sum(labels[:, i_label])
    
    def saveModel(self):
        with open('Models/version_C', 'r') as f:
            version = int(f.read())
        version += 1
        with open('Models/version_C', 'w') as f:
            f.write(str(version))
        self.to_DataFrame().to_csv(f'Models/cluster_{version}.csv', index=False)
    
    def loadModel(self, version):
        with open(f'Models/cluster_{version}.csv', 'r') as f:
            kv = pd.read_csv(f).iloc[0].to_dict()
        self.__init__(**kv)
    
    def loadFromDataFrame(self, df):
        kv = df.iloc[0].to_dict()
        self.__init__(**kv)

In [6]:
def OverSample(X: pd.DataFrame, y: pd.Series, random_state=42):
    np.random.seed(random_state)
    unique, counts = np.unique(y, return_counts=True)
    MAX = counts.max()
    for i in range(unique.shape[0]):
        if counts[i] < MAX:
            idx = np.where(y == unique[i])[0]
            idx = np.random.choice(idx, (MAX - counts[i]) // 10)
            X = pd.concat([X, X.iloc[idx, :]], axis=0)
            y = pd.concat([y, y.iloc[idx]], axis=0)
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)
    return X, y

def min_max_normalize(data):
    min_val = data.min(axis=0)
    max_val = data.max(axis=0)
    return (data - min_val) * 1_000_000 / (max_val - min_val)

In [7]:
df = pd.read_csv('kddcup.data.gz')
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

encoder = LabelEncoder()
for col in X.select_dtypes(include=['object']).columns:
    X[col] = encoder.fit_transform(X[col])

X = min_max_normalize(X).dropna(axis=1)
y = y.map(Y_LABELS)
y.value_counts()

normal.
DoS       3883370
Normal     972780
Probe       41102
R2L          1126
U2R            52
Name: count, dtype: int64

In [8]:
X_train = X.iloc[:X.shape[0] * 3 // 5, :].reset_index(drop=True)
y_train = y.iloc[:y.shape[0] * 3 // 5].reset_index(drop=True)
y_train.value_counts().to_frame()

X_train, y_train = OverSample(X_train, y_train, random_state=42)
y_train.value_counts().to_frame()

X_test = X.iloc[X.shape[0] * 3 // 5:, :].reset_index(drop=True)
y_test = y.iloc[y.shape[0] * 3 // 5:].reset_index(drop=True)

Unnamed: 0_level_0,count
normal.,Unnamed: 1_level_1
DoS,2192451
Normal,712266
Probe,33187
R2L,1125
U2R,29


Unnamed: 0_level_0,count
normal.,Unnamed: 1_level_1
DoS,2192451
Normal,860284
Probe,249113
R2L,220257
U2R,219271


In [161]:
# model = Cluster(
#     n_clusters=6, max_iter=20, 
#     eps=1e-6, distance='euclidean', 
#     random_state=24,
# )
# model.fit(X_train.to_numpy(), y_train)

iter: 1
[[      0       0 1705940       0       0]
 [   4637       0       0       0       0]
 [      0       0  484848       0       0]
 [ 792977       0       0       0  237484]
 [      0       0       0       0   11629]
 [  62670  219271    1663  220257       0]]
iter: 2
[[      0       0 1780891       0       0]
 [  63016       0       0       0       0]
 [    110       0  411560       0       0]
 [ 684385       0       0       0  249113]
 [  41702       0       0       0       0]
 [  71071  219271       0  220257       0]]
iter: 3
[[      0       0 1781124       0       0]
 [  74844       0       0       0       0]
 [     60       0  411327       0       0]
 [ 657462       0       0       0       0]
 [  39101       0       0       0  249113]
 [  88817  219271       0  220257       0]]
iter: 4
[[      0       0 1781126       0       0]
 [  82404       0       0       0       0]
 [     71       0  411325       0       0]
 [ 650909       0       0       0       0]
 [  39535       0  

In [162]:
# model.score(X_test.to_numpy(), y_test)
# model.saveModel()

0.8703666276745814

In [10]:
from sklearn.metrics import fowlkes_mallows_score, rand_score, jaccard_score, silhouette_score

model = Cluster()
model.loadModel(version=30)
y_pred = model.predict(X_test.to_numpy())

In [None]:
from sklearn.metrics import silhouette_score
silhouette_score(X_test.to_numpy(), y_pred)
silhouette_score(X, y)

In [10]:
print(f'FM: {fowlkes_mallows_score(y_test, y_pred)}')
print(f'Rand: {rand_score(y_test, y_pred)}')
print(f'Jaccard: {jaccard_score(y_test, y_pred, average="weighted")}')

FM: 0.8831637364293012
Rand: 0.8322279594153843
Jaccard: 0.8678624839493645


In [22]:
result = pd.DataFrame(columns=['FM', 'Rand', 'Jaccard'], index=['Normal', 'DoS', 'Probe', 'R2L', 'U2R', 'All'])
# result.index = ['Normal', 'DoS', 'Probe', 'R2L', 'U2R', 'All']
for name in ['Normal', 'DoS', 'Probe', 'R2L', 'U2R']:
    part = np.where(y_test == name)
    y_test_part = y_test.iloc[part]
    y_pred_part = y_pred[part]
    result.loc[name, 'FM'] = fowlkes_mallows_score(y_test_part, y_pred_part)
    result.loc[name, 'Rand'] = rand_score(y_test_part, y_pred_part)
    result.loc[name, 'Jaccard'] = jaccard_score(y_test_part, y_pred_part, average="weighted")

result.loc['All', 'FM'] = fowlkes_mallows_score(y_test, y_pred)
result.loc['All', 'Rand'] = rand_score(y_test, y_pred)
result.loc['All', 'Jaccard'] = jaccard_score(y_test, y_pred, average="weighted")
result


Unnamed: 0,FM,Rand,Jaccard
Normal,0.829213,0.687594,0.819326
DoS,0.886631,0.786114,0.878334
Probe,0.702985,0.494188,0.505749
R2L,0.0,1.0,0.0
U2R,0.955533,0.913043,0.0
All,0.882689,0.831585,0.866473


In [163]:
# test_model = Cluster()
# test_model.loadModel(version=30)
# test_model.score(X_test.to_numpy(), y_test)