# Подключение библиотек

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs, make_circles, make_moons
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import MeanShift, estimate_bandwidth, AgglomerativeClustering, DBSCAN, AffinityPropagation
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, adjusted_rand_score
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import plotly.express as px

# Чтение данных из файла

In [10]:
pd.set_option('display.max_columns', None)
data = pd.read_parquet(r'data18.parquet')

In [11]:
data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,Label
84672,0,tcp,http,SF,200,563,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,10,0.0,0.0,0.0,0.0,1.0,0.0,0.3,8,255,1.0,0.0,0.12,0.03,0.0,0.02,0.0,0.0,normal.
104569,0,tcp,http,SF,167,1578,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,255,1.0,0.0,0.5,0.08,0.0,0.0,0.0,0.0,normal.
147646,0,tcp,smtp,SF,2030,332,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,144,170,0.71,0.05,0.01,0.01,0.0,0.0,0.0,0.0,normal.
37376,0,tcp,http,SF,339,2037,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,56,255,1.0,0.0,0.02,0.04,0.0,0.0,0.0,0.0,normal.
4166,0,udp,domain_u,SF,44,69,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,22,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,215,0.84,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.


# Выборка данных для обучения модели

In [17]:
X = data[['src_bytes', 'dst_bytes']].values
y = data[['count', 'srv_count', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate']].values

# Функция генерации и трансформации данных

In [18]:
def generate_and_transform_data():
    scaler = StandardScaler()
    X_normalized = scaler.fit_transform(X)

    tsne = TSNE(n_components=2, random_state=42)
    X_tsne = tsne.fit_transform(X_normalized)

    return X_normalized, y, X_tsne

# Функция кластеризации и визуализации

In [25]:
def apply_clustering_and_visualize(X, y_true, X_tsne, labels, algorithm_name, n_clusters=4):

    silhouette = silhouette_score(X, labels)
    dbi = davies_bouldin_score(X, labels)
    chi = calinski_harabasz_score(X, labels)
    rand_index = adjusted_rand_score(y_true.reshape(-1, 0), labels)

    print(f"{algorithm_name} Clustering")
    print("Silhouette Score:", silhouette)
    print("Davies-Bouldin Index:", dbi)
    print("Calinski-Harabasz Index:", chi)
    print("Rand Index:", rand_index)

    marker_symbols = ['circle', 'square', 'triangle-up', 'diamond']
    fig = go.Figure()


    for i, symbol in zip(np.unique(y_true), marker_symbols):
        cluster_data = X_tsne[y_true == i, :]
        cluster_labels = labels[y_true == i]

        colorscale = px.colors.sample_colorscale(px.colors.sequential.Jet, n_clusters)
        color_dict = dict(zip(set(cluster_labels), colorscale))
        colors = [color_dict[l] for l in cluster_labels]

        fig.add_trace(go.Scatter(x=cluster_data[:, 0], y=cluster_data[:, 1], mode='markers',
                                 marker=dict(symbol=symbol, size=10, color=colors,),
                                 text=[f'Кластер {l}' for l in cluster_labels], line=dict(color='black', width=1),
                                 name=f'Истинный кластер {i}'))

    fig.update_layout(
        height=800,
        title=f't-SNE визуализация {algorithm_name}, истинные кластера обозначены формой, предсказанные - цветом',
        xaxis_title='Компонент 1',
        yaxis_title='Компонент 2',
        legend_title="Кластеры"
    )
    fig.show()

In [23]:
X, y_true, X_tsne = generate_and_transform_data()

In [26]:
bandwidth = estimate_bandwidth(X)
model = MeanShift(bandwidth=bandwidth, bin_seeding=False, min_bin_freq=5)

labels = model.fit_predict(X)
apply_clustering_and_visualize(X, y_true, X_tsne, labels, 'MeanShift')

ValueError: cannot reshape array of size 25735 into shape (0)