In [12]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import numpy as np
import pandas as pd


In [13]:
Y_LABELS = {
    'normal.': 'Normal',
    
    'back.': 'DoS',
    'land.': 'DoS',
    'neptune.': 'DoS',
    'pod.': 'DoS',
    'smurf.': 'DoS',
    'teardrop.': 'DoS',
    
    'ipsweep.': 'Probe',
    'nmap.': 'Probe',
    'portsweep.': 'Probe',
    'satan.': 'Probe',
    
    'ftp_write.': 'R2L',
    'guess_passwd.': 'R2L',
    'imap.': 'R2L',
    'multihop.': 'R2L',
    'phf.': 'R2L',
    'spy.': 'R2L',
    'warezclient.': 'R2L',
    'warezmaster.': 'R2L',
    
    'buffer_overflow.': 'U2R',
    'loadmodule.': 'U2R',
    'perl.': 'U2R',
    'rootkit.': 'U2R'
}

In [14]:
class LabelEncoder(object):
    def __init__(self):
        self.mapping = {}
        self.inverse_mapping = {}

    def fit(self, data):
        unique_values = np.unique(data)
        for i, value in enumerate(unique_values):
            self.mapping[value] = i
            self.inverse_mapping[i] = value

    def transform(self, data):
        return np.array([self.mapping[value] for value in data])

    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

    def inverse_transform(self, data):
        return np.array([self.inverse_mapping[value] for value in data])

In [15]:
class Cluster(object):
    def __init__(
            self, 
            n_clusters=4, 
            max_iter=10, 
            eps=1e-6, 
            distance='euclidean'
    ):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.eps = eps
        self.distance = {
            'euclidean': self.euclidean,
        }[distance]
    
    def fit(self, X: pd.DataFrame, y):
        Mu = X.sample(n=self.n_clusters, replace=False)
        for _ in range(self.max_iter):
            Mu_copy = Mu.copy()
            
            C = pd.DataFrame({
                'index': [np.array([]) for _ in range(self.n_clusters)], 
                'label': [set() for _ in range(self.n_clusters)]
            })
            
            for i in X.index:
                dist = np.array([
                    self.distance(X.iloc[i], mu) for mu in Mu.values
                ])
                for _ in range(self.n_clusters):
                    r = np.argmin(dist)
                    if not self.violated(y.iloc[i], r, C['label']):
                        C['index'][r] = np.append(C['index'][r], i)
                        C['label'][r].add(y.iloc[i])
                        break
                    else:
                        dist[r] = np.inf
            for j in range(self.n_clusters):
                Mu.iloc[j] = X.iloc[C['index'][j]].mean()
            if np.all(np.abs(Mu_copy.values - Mu.values) < self.eps):
                break
            C['label']
        
        self.C = pd.DataFrame({
            'Mu': Mu, 
            'label': [list(label)[0] for label in C['label']]
        })
        
        
    def predict(self, X: pd.DataFrame):
        y_pred = np.array([
            self.C.at[[
                np.argmin(
                    [self.distance(x, mu) 
                    for mu in self.C['Mu'].itertuples(index=False)]
                )
            ],'label']
            for x in X.itertuples(index=False)
        ])
        return y_pred
    
    def fit_predict(self, X, y):
        self.fit(X, y)
        return self.predict(X)
    
    def score(self, X, y):
        y_pred = self.predict(X)
        return np.mean(y_pred == y)
        
    @staticmethod
    def euclidean(a, b):
        return np.linalg.norm(a - b)
    
    def violated(self, i_label, r, labels):
        if labels[r]:
            if i_label not in labels[r]:
                return True
            else:
                ...
        else:
            for j in range(self.n_clusters):
                if i_label in labels[j]:
                    return True
        return False
    
    def saveModel(self):
        with open('Models/version', 'r') as f:
            version = int(f.read())
        with open(f'Models/cluster_{version}.csv', 'w') as f:
            f.write(f'{self.n_clusters}\n')
            f.write(f'{self.max_iter}\n')
            f.write(f'{self.eps}\n')
            f.write(f'{self.distance}\n')
            

In [16]:
def min_max_normalize(data):
    min_val = data.min(axis=0)
    max_val = data.max(axis=0)
    return (data - min_val) / (max_val - min_val)

def z_score_normalize(data):
    mean = data.mean(axis=0)
    std = data.std(axis=0)
    return (data - mean) / std

In [17]:
df = pd.read_csv('kddcup.data.gz')
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

string_columns = X.select_dtypes(include=['object']).columns
encoder = LabelEncoder()
for col in string_columns:
    X[col] = encoder.fit_transform(X[col])

X = min_max_normalize(X).dropna(axis=1)
y = y.map(Y_LABELS)

In [18]:
X_train = X.iloc[:X.shape[0] * 5 // 7, :].reset_index(drop=True)
y_train = y.iloc[:y.shape[0] * 5 // 7].reset_index(drop=True)
y_train.value_counts()

normal.
DoS       2689737
Normal     771198
Probe       36785
R2L          1125
U2R            33
Name: count, dtype: int64

In [19]:
X_train.index
X_train.shape

RangeIndex(start=0, stop=3498878, step=1)

(3498878, 40)

In [20]:
model = Cluster(n_clusters=2, max_iter=10, eps=1e-6, distance='euclidean')
model.fit(X_train, y_train)


KeyboardInterrupt



In [None]:
X_test = X.iloc[X.shape[0] * 5 // 7:, :].reset_index(drop=True)
y_test = y.iloc[y.shape[0] * 5 // 7:].reset_index(drop=True)

In [None]:
pred = model.predict(X_test)
print(sum(pred == y_test))
print(X_test.shape[0])
pred

In [None]:
print(model.label)
print(model.Mu)

In [None]:
# pred
# # pred = np.array(list(p) for p in pred)
# pred = np.array([list(p)[0] if len(p) == 1 else 'DoS' for p in pred])
# print(sum(pred == y_test.to_numpy()))

In [None]:
# YY_test = y_test.iloc[:, 0].map({
#     'Probe': 0,
#     'R2L': 1,
#     'DoS': 2,
#     'Normal': 3
# })
# print(sum(pred == YY_test))