In [17]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

In [18]:
class LabelEncoder(object):
    def __init__(self):
        self.mapping = {}
        self.inverse_mapping = {}

    def fit(self, data):
        unique_values = np.unique(data)
        for i, value in enumerate(unique_values):
            self.mapping[value] = i
            self.inverse_mapping[i] = value

    def transform(self, data):
        return np.array([self.mapping[value] for value in data])

    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

    def inverse_transform(self, data):
        return np.array([self.inverse_mapping[value] for value in data])

In [19]:
# 读取数据集
data = pd.read_csv('kddcup.data.gz').iloc[:10000, :]
X = data.iloc[:, :-1]
labels = data.iloc[:, -1]

string_columns = X.select_dtypes(include=['object']).columns
encoder = LabelEncoder()
for col in string_columns:
    X[col] = encoder.fit_transform(X[col])

X

Unnamed: 0,0,tcp,http,SF,215,45076,0.1,0.2,0.3,0.4,...,0.16,0.17,0.00.6,0.00.7,0.00.8,0.00.9,0.00.10,0.00.11,0.00.12,0.00.13
0,0,1,6,3,162,4528,0,0,0,0,...,1,1,1.0,0.0,1.00,0.00,0.0,0.0,0.0,0.0
1,0,1,6,3,236,1228,0,0,0,0,...,2,2,1.0,0.0,0.50,0.00,0.0,0.0,0.0,0.0
2,0,1,6,3,233,2032,0,0,0,0,...,3,3,1.0,0.0,0.33,0.00,0.0,0.0,0.0,0.0
3,0,1,6,3,239,486,0,0,0,0,...,4,4,1.0,0.0,0.25,0.00,0.0,0.0,0.0,0.0
4,0,1,6,3,238,1282,0,0,0,0,...,5,5,1.0,0.0,0.20,0.00,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,1,6,3,184,7575,0,0,0,0,...,57,255,1.0,0.0,0.02,0.07,0.0,0.0,0.0,0.0
9996,0,1,6,3,240,312,0,0,0,0,...,58,255,1.0,0.0,0.02,0.07,0.0,0.0,0.0,0.0
9997,0,1,6,3,244,2194,0,0,0,0,...,59,255,1.0,0.0,0.02,0.07,0.0,0.0,0.0,0.0
9998,0,1,6,3,239,691,0,0,0,0,...,60,255,1.0,0.0,0.02,0.07,0.0,0.0,0.0,0.0


In [20]:
def min_max_normalize(data):
    min_val = data.min(axis=0)
    max_val = data.max(axis=0)
    return (data - min_val) / (max_val - min_val)

def z_score_normalize(data):
    mean = data.mean(axis=0)
    std = data.std(axis=0)
    return (data - mean) / std

In [21]:
X = min_max_normalize(X).dropna(axis=1)
X

Unnamed: 0,0,tcp,http,SF,215,45076,0.3,0.4,1,0.6,...,0.00.4,0.00.5,0.16,0.17,0.00.6,0.00.7,0.00.8,0.00.9,0.00.10,0.00.11
0,0.0,0.5,0.666667,1.0,0.008215,0.017023,0.0,0.0,1.0,0.0,...,0.0,0.0,0.000000,0.000000,1.0,0.0,1.00,0.00,0.0,0.0
1,0.0,0.5,0.666667,1.0,0.011967,0.004617,0.0,0.0,1.0,0.0,...,0.0,0.0,0.003937,0.003937,1.0,0.0,0.50,0.00,0.0,0.0
2,0.0,0.5,0.666667,1.0,0.011815,0.007639,0.0,0.0,1.0,0.0,...,0.0,0.0,0.007874,0.007874,1.0,0.0,0.33,0.00,0.0,0.0
3,0.0,0.5,0.666667,1.0,0.012119,0.001827,0.0,0.0,1.0,0.0,...,0.0,0.0,0.011811,0.011811,1.0,0.0,0.25,0.00,0.0,0.0
4,0.0,0.5,0.666667,1.0,0.012068,0.004820,0.0,0.0,1.0,0.0,...,0.0,0.0,0.015748,0.015748,1.0,0.0,0.20,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.5,0.666667,1.0,0.009330,0.028478,0.0,0.0,1.0,0.0,...,0.0,0.0,0.220472,1.000000,1.0,0.0,0.02,0.07,0.0,0.0
9996,0.0,0.5,0.666667,1.0,0.012170,0.001173,0.0,0.0,1.0,0.0,...,0.0,0.0,0.224409,1.000000,1.0,0.0,0.02,0.07,0.0,0.0
9997,0.0,0.5,0.666667,1.0,0.012373,0.008248,0.0,0.0,1.0,0.0,...,0.0,0.0,0.228346,1.000000,1.0,0.0,0.02,0.07,0.0,0.0
9998,0.0,0.5,0.666667,1.0,0.012119,0.002598,0.0,0.0,1.0,0.0,...,0.0,0.0,0.232283,1.000000,1.0,0.0,0.02,0.07,0.0,0.0


In [33]:
import numpy as np

class SemiSupervisedKMeans:
    def __init__(self, n_clusters=8, max_iter=300, tol=1e-4):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol

    def fit(self, X, y):
        # 初始化聚类中心
        labeled_centers = []
        for label in np.unique(y):
            labeled_centers.append(np.mean(X[y == label], axis=0))
        unlabeled_centers = X[np.random.choice(len(X), self.n_clusters - len(labeled_centers), replace=False)]
        centers = np.vstack((labeled_centers, unlabeled_centers))
        
        for _ in range(self.max_iter):
            # 分配数据点到最近的聚类中心
            distances = np.linalg.norm(X[:, np.newaxis] - centers, axis=2)
            labels = np.argmin(distances, axis=1)
            
            # 更新聚类中心
            new_centers = np.array([np.mean(X[labels == k], axis=0) for k in range(self.n_clusters)])
            
            # 计算聚类中心的变化量
            center_shift = np.linalg.norm(new_centers - centers)
            
            # 更新聚类中心
            centers = new_centers
            
            # 如果聚类中心的变化量小于阈值，停止迭代
            if center_shift < self.tol:
                break
        
        self.labels_ = labels
        self.cluster_centers_ = centers

# 示例数据
X = np.array([[1, 2],
              [1.5, 1.8],
              [5, 8],
              [8, 8],
              [1, 0.6],
              [9, 11]])

# 示例数据：标签信息（部分数据点）
y = np.array([0, 0, 1, 1, 0, -1])

# 使用自定义的 SemiSupervisedKMeans 算法进行聚类
kmeans = SemiSupervisedKMeans(n_clusters=3)
kmeans.fit(X, y)

# 获取聚类结果
labels = kmeans.labels_

print("聚类结果：", labels)


聚类结果： [1 1 2 2 1 0]


In [22]:
class Cluster(object):
    def __init__(self):
        ...
    def fit(self, X):
        ...
    def predict(self, X):
        ...
    def fit_predict(self, X):
        self.fit(X)
        return self.predict(X)
    def estimate(self, X, y):
        y_pred = self.predict(X)
        return np.mean(y_pred == y)

In [23]:
class KMeans(Cluster):
    def __init__(self, k=2, max_iter=100, tol=1e-4):
        self.k = k
        self.max_iter = max_iter
        self.tol = tol
        super(KMeans, self).__init__()

    def fit(self, X):
        n_samples, n_features = X.shape
        self.centroids = np.zeros((self.k, n_features))
        self.labels = np.zeros(n_samples)
        self.inertia = 0

        for i in range(self.max_iter):
            # 计算每个样本到各个中心的距离
            distances = np.linalg.norm(X - self.centroids, axis=1)
            # 确定样本所属的簇
            self.labels = np.argmin(distances, axis=1)
            # 重新计算各个中心
            for j in range(self.k):
                self.centroids[j] = X[self.labels == j].mean(axis=0)
            # 计算轮廓系数
            new_inertia = np.sum(np.min(distances, axis=1))
            if abs(new_inertia - self.inertia) < self.tol:
                break
            self.inertia = new_inertia

    def predict(self, X):
        distances = np.linalg.norm(X - self.centroids, axis=1)
        return np.argmin(distances, axis=1)

In [24]:
kmeans = KMeans(k=2)
kmeans.fit(X)
kmeans.estimate(X, labels)

ValueError: Unable to coerce to DataFrame, shape must be (10000, 32): given (2, 32)