In [3]:
import pandas as pd
import numpy as np
import hdbscan
import random
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.cluster import DBSCAN, KMeans, OPTICS
import matplotlib.pyplot as plt

pd.options.mode.chained_assignment = None 

class AbstractResult:
    def __init__(self, name, X, features):
        self.name = name
        self.features = features
        self.X = X
        self.X_scaled = self.preprocess()

    def preprocess(self):
        print(f"Preprocessing data for {self.name}")
        X_scaled = self.X[self.features]
        # encode
        label_encoder = LabelEncoder()
        X_scaled["encoded_country"] = label_encoder.fit_transform(self.X["country"])
        # scale
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_scaled) 
        print(f"Preprocessing done...")
        return X_scaled

    def show_overview(self):
        print(f"Model {self.name} overview")
        cluster_no = len(np.unique(self.labels))
        if cluster_no == 1:
            print(f"Only one cluster found for {method}")
        else:
            print(f"Number of clusters: {cluster_no}")
        noise = self.X[self.labels == -1].shape[0]
    
        print(f"Noise points: {noise}. Calculating metrics...")
    
        no_noise_X = self.X_scaled[self.labels != -1]
        no_noise_labels = self.labels[self.labels != -1]
        #sil_score = silhouette_score(no_noise_X, no_noise_labels)
        #print(f"Silhouette Score: {sil_score}")
        ch_index = calinski_harabasz_score(no_noise_X, no_noise_labels)
        print(f"Calinski-Harabasz Index: {ch_index}")
        #db_index = davies_bouldin_score(no_noise_X, no_noise_labels)
        #print(f"Davies-Bouldin Index: {db_index}")

        
    def write_random_groups(self, proxy_folder=""):
        print(f"Extracting random groups for {self.name}...")
        random_groups = np.random.choice(self.labels[self.labels != -1], size=40, replace=True)
        path = self.name if proxy_folder == "" else f"{self.name}/{proxy_folder}"  
        #for group in random_groups:
        #    group_df = self.X[self.labels == group]
        #    group_df.to_csv(f"results/{path}/groups/group_{group}.csv")
        #print("Groups extracted!")
        pd.DataFrame(self.labels).to_csv(f"results/{path}/labels.csv")
        print("Labels written.")

    
        
class ModelFramework(AbstractResult):
    def __init__(self, name, X, features, model):
        self.model = model 
        super().__init__(name, X, features)

    def train(self, output_result=True):
        print(f"Start training {self.name}")
        self.labels = self.model.fit_predict(self.X_scaled)
        print(f"Training done")
        self.show_overview()
        if output_result: 
            self.write_random_groups()

class RandomisedKMeansRunner(AbstractResult):
    def train(self, k_min=1000, k_max=20000, samples=20):
        random_ks = random.sample(range(k_min, k_max+1), samples)
        for k in random_ks:
            print(f"[NEW RUN] KMeans with {k} clusters.")
            kmeans = KMeans(n_clusters=k)
            self.labels = kmeans.fit_predict(self.X_scaled)
            self.show_overview()




In [4]:
df = pd.read_csv("3_weeks.csv")

features_active_days = ['subnet', 'asn', 'active_days', 'scan_length_seconds', 'median_time_diff', 'distinct_src_ports', 'distinct_dest_ports', 'top_port', 'total_hits', 'distinct_ips', 'top_fingerprint', 'generation_algorithm']
features_hours = ['subnet', 'asn', 'top_start_hour', 'top_end_hour', 'scan_length_seconds', 'median_time_diff', 'distinct_src_ports', 'distinct_dest_ports', 'top_port', 'total_hits', 'distinct_ips', 'top_fingerprint', 'generation_algorithm'] 
features_quartiles = ['subnet', 'asn', 'active_days', 'scan_length_seconds', 'median_time_diff', 'distinct_src_ports', 'avg_payload_length', 'distinct_dest_ports', 'top_port', 'total_hits', 'distinct_ips', 'top_fingerprint', 'q1_prev_ip', 'median_prev_ip', 'q3_prev_ip'] 

hdb_days = ModelFramework("hdb_v1", df, features_active_days, hdbscan.HDBSCAN(min_cluster_size=10, gen_min_span_tree=True))
hdb_hours = ModelFramework("hdb_v2", df, features_hours, hdbscan.HDBSCAN(min_cluster_size=10, gen_min_span_tree=True))


Preprocessing data for hdb_v1
Preprocessing done...
Preprocessing data for hdb_v2
Preprocessing done...


In [3]:
features_quartiles_hits = ['subnet', 'asn', 'active_days', 'scan_length_seconds', 'median_time_diff', 'distinct_src_ports', 'avg_payload_length', 'distinct_dest_ports', 'total_hits', 'distinct_ips','top_port', 'top_fingerprint', 'q1_prev_ip', 'median_prev_ip', 'q3_prev_ip']
hdb_fs1 = ModelFramework("hdb_fs1", df, features_quartiles_hits, hdbscan.HDBSCAN(min_cluster_size=2, gen_min_span_tree=True))
hdb_fs1.train()

Preprocessing data for hdb_fs1
Preprocessing done...
Start training hdb_fs1




Training done
Model hdb_fs1 overview
Number of clusters: 115249
Noise points: 316442. Calculating metrics...
Calinski-Harabasz Index: 1171.744651069235
Extracting random groups for hdb_fs1...
Labels written.


In [4]:
features_quartiles_nohits = ['subnet', 'asn', 'active_days', 'scan_length_seconds', 'median_time_diff', 'distinct_src_ports', 'avg_payload_length', 'distinct_dest_ports','top_port', 'top_fingerprint', 'q1_prev_ip', 'median_prev_ip', 'q3_prev_ip']
hdb_fs2 = ModelFramework("hdb_fs2", df, features_quartiles_nohits, hdbscan.HDBSCAN(min_cluster_size=2, gen_min_span_tree=True))
hdb_fs2.train()

Preprocessing data for hdb_fs2
Preprocessing done...
Start training hdb_fs2




Training done
Model hdb_fs2 overview
Number of clusters: 116333
Noise points: 316265. Calculating metrics...
Calinski-Harabasz Index: 1868.9511963100351
Extracting random groups for hdb_fs2...
Labels written.


In [None]:
features_quartiles_hits = ['subnet', 'asn', 'active_days', 'scan_length_seconds', 'median_time_diff', 'distinct_src_ports', 'avg_payload_length', 'distinct_dest_ports', 'total_hits', 'distinct_ips','top_port', 'top_fingerprint', 'q1_prev_ip', 'median_prev_ip', 'q3_prev_ip']
for i in range(3):
    epsilon = 0.5+0.05*i
    print(f"Epsilon={epsilon}")
    db = ModelFramework("db_eps_test", df, features_quartiles_hits, DBSCAN(eps=epsilon, min_samples=2))
    db.train(False)

Epsilon=0.5
Preprocessing data for db_eps_test
Preprocessing done...
Start training db_eps_test
Training done
Model db_eps_test overview
Number of clusters: 10178
Noise points: 53615. Calculating metrics...
Calinski-Harabasz Index: 138.85137342364882
Epsilon=0.55
Preprocessing data for db_eps_test
Preprocessing done...
Start training db_eps_test
Training done
Model db_eps_test overview
Number of clusters: 8329
Noise points: 43923. Calculating metrics...
Calinski-Harabasz Index: 163.8052464182611
Epsilon=0.6
Preprocessing data for db_eps_test
Preprocessing done...
Start training db_eps_test


In [3]:
features_quartiles_nohits = ['subnet', 'asn', 'active_days', 'scan_length_seconds', 'median_time_diff', 'distinct_src_ports', 'avg_payload_length', 'distinct_dest_ports','top_port', 'top_fingerprint', 'q1_prev_ip', 'median_prev_ip', 'q3_prev_ip']
for i in range(12):
    epsilon = 0.05+0.05*i
    print(f"Epsilon={epsilon}")
    db = ModelFramework("db_eps_test", df, features_quartiles_nohits, DBSCAN(eps=epsilon, min_samples=2))
    db.train(False)

Epsilon=0.05
Preprocessing data for db_eps_test
Preprocessing done...
Start training db_eps_test
Training done
Model db_eps_test overview
Number of clusters: 48031
Noise points: 469659. Calculating metrics...
Calinski-Harabasz Index: 391.3572604852084
Epsilon=0.1
Preprocessing data for db_eps_test
Preprocessing done...
Start training db_eps_test
Training done
Model db_eps_test overview
Number of clusters: 48082
Noise points: 360209. Calculating metrics...
Calinski-Harabasz Index: 281.17330421181595
Epsilon=0.15000000000000002
Preprocessing data for db_eps_test
Preprocessing done...
Start training db_eps_test
Training done
Model db_eps_test overview
Number of clusters: 43385
Noise points: 280721. Calculating metrics...
Calinski-Harabasz Index: 173.97649081251697
Epsilon=0.2
Preprocessing data for db_eps_test
Preprocessing done...
Start training db_eps_test
Training done
Model db_eps_test overview
Number of clusters: 38360
Noise points: 220419. Calculating metrics...
Calinski-Harabasz In

In [6]:
features_quartiles_hits = ['subnet', 'asn', 'active_days', 'scan_length_seconds', 'median_time_diff', 'distinct_src_ports', 'avg_payload_length', 'distinct_dest_ports', 'total_hits', 'distinct_ips','top_port', 'top_fingerprint', 'q1_prev_ip', 'median_prev_ip', 'q3_prev_ip']
db = ModelFramework("db_fs1", df, features_quartiles_hits, DBSCAN(eps=0.4, min_samples=2))
db.train()

Preprocessing data for db_fs1
Preprocessing done...
Start training db_fs1
Training done
Model db_fs1 overview
Number of clusters: 15092
Noise points: 82745. Calculating metrics...
Calinski-Harabasz Index: 103.18848580282702
Extracting random groups for db_fs1...
Labels written.


In [8]:
features_quartiles_nohits = ['subnet', 'asn', 'active_days', 'scan_length_seconds', 'median_time_diff', 'distinct_src_ports', 'avg_payload_length', 'distinct_dest_ports','top_port', 'top_fingerprint', 'q1_prev_ip', 'median_prev_ip', 'q3_prev_ip']
db = ModelFramework("db_fs2", df, features_quartiles_nohits, DBSCAN(eps=0.4, min_samples=2))
db.train()

Preprocessing data for db_fs2
Preprocessing done...
Start training db_fs2
Training done
Model db_fs2 overview
Number of clusters: 14797
Noise points: 79674. Calculating metrics...
Calinski-Harabasz Index: 98.53820172115266
Extracting random groups for db_fs2...
Labels written.


In [9]:
features_quartiles_hits = ['subnet', 'asn', 'active_days', 'scan_length_seconds', 'median_time_diff', 'distinct_src_ports', 'avg_payload_length', 'distinct_dest_ports', 'total_hits', 'distinct_ips','top_port', 'top_fingerprint', 'q1_prev_ip', 'median_prev_ip', 'q3_prev_ip']
hdb_fs1_10pts = ModelFramework("hdb_fs1_10pts", df, features_quartiles_hits, hdbscan.HDBSCAN(min_cluster_size=10, gen_min_span_tree=True))
hdb_fs1_10pts.train()

Preprocessing data for hdb_fs1_10pts
Preprocessing done...
Start training hdb_fs1_10pts




Training done
Model hdb_fs1_10pts overview
Number of clusters: 10398
Noise points: 493000. Calculating metrics...
Calinski-Harabasz Index: 3649.642678757402
Extracting random groups for hdb_fs1_10pts...
Labels written.


In [10]:
features_quartiles_nohits = ['subnet', 'asn', 'active_days', 'scan_length_seconds', 'median_time_diff', 'distinct_src_ports', 'avg_payload_length', 'distinct_dest_ports','top_port', 'top_fingerprint', 'q1_prev_ip', 'median_prev_ip', 'q3_prev_ip']
hdb_fs2_10pts = ModelFramework("hdb_fs2_10pts", df, features_quartiles_nohits, hdbscan.HDBSCAN(min_cluster_size=10, gen_min_span_tree=True))
hdb_fs2_10pts.train()

Preprocessing data for hdb_fs2_10pts
Preprocessing done...
Start training hdb_fs2_10pts




Training done
Model hdb_fs2_10pts overview
Number of clusters: 10440
Noise points: 493702. Calculating metrics...
Calinski-Harabasz Index: 3671.302359745206
Extracting random groups for hdb_fs2_10pts...
Labels written.


In [11]:
features_quartiles_hits = ['subnet', 'asn', 'active_days', 'scan_length_seconds', 'median_time_diff', 'distinct_src_ports', 'avg_payload_length', 'distinct_dest_ports', 'total_hits', 'distinct_ips','top_port', 'top_fingerprint', 'q1_prev_ip', 'median_prev_ip', 'q3_prev_ip']
db_fs1_eps01 = ModelFramework("db_fs1_eps01", df, features_quartiles_hits, DBSCAN(eps=0.1, min_samples=2))
db_fs1_eps01.train()

Preprocessing data for db_fs1_eps01
Preprocessing done...
Start training db_fs1_eps01
Training done
Model db_fs1_eps01 overview
Number of clusters: 47770
Noise points: 362851. Calculating metrics...
Calinski-Harabasz Index: 294.4206785244829
Extracting random groups for db_fs1_eps01...
Labels written.


In [12]:
features_quartiles_nohits = ['subnet', 'asn', 'active_days', 'scan_length_seconds', 'median_time_diff', 'distinct_src_ports', 'avg_payload_length', 'distinct_dest_ports','top_port', 'top_fingerprint', 'q1_prev_ip', 'median_prev_ip', 'q3_prev_ip']
db_fs2_eps01 = ModelFramework("db_fs2_eps01", df, features_quartiles_nohits, DBSCAN(eps=0.1, min_samples=2))
db_fs2_eps01.train()

Preprocessing data for db_fs2_eps01
Preprocessing done...
Start training db_fs2_eps01
Training done
Model db_fs2_eps01 overview
Number of clusters: 48082
Noise points: 360209. Calculating metrics...
Calinski-Harabasz Index: 281.17330421181595
Extracting random groups for db_fs2_eps01...
Labels written.
