In [1]:
import hues
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Load the tab-separated sample points (the file has no header row) and pre-process them.
# df_0 keeps the data exactly as read; df is the working copy that is clustered below.
# Alternative input that was used at some point: df_0 = pd.read_pickle('./202205.pickle')
df_0 = pd.read_csv('./二维平面/kmeans_data.txt', sep='\t', header=None)
df = df_0.copy(deep=True)


# Only relevant for the pickle input above: df.drop('ID', axis=1, inplace=True)

class KMeans(object):
    """
    K-Means clustering over the rows of a pandas DataFrame.

    Every column of ``df`` other than the ``near_center_id`` label column is
    used as a numeric feature. The frame is NOT copied: ``run()`` and
    ``calc_distance()`` add/overwrite the ``near_center_id`` column on the
    very object that was passed in.

    Attributes:
        df: the input frame (m rows), extended with ``near_center_id``.
        K: number of clusters.
        iterate_num: upper bound on refinement iterations.
        random_state: optional seed for the initial center selection.
        centers: K x n float DataFrame of current cluster centers; row ``i``
            (index label ``i``) is the center of cluster ``i``. ``None``
            until ``make_random_center()`` has been called.
    """

    # Name of the column that stores, for every row, the id of its nearest center.
    _LABEL_COLUMN = 'near_center_id'

    def __init__(self, df, K, iterate_num=50, random_state=None):
        """
        Store the clustering configuration; no computation happens here.

        :param df: input matrix, m rows x n feature columns
        :param K: number of clusters
        :param iterate_num: maximum number of refinement iterations (default 50)
        :param random_state: optional seed for choosing the initial centers.
            ``None`` (default) draws from numpy's global random state, exactly
            as before this parameter existed.
        """
        self.df = df
        self.K = K
        self.iterate_num = iterate_num
        self.random_state = random_state
        self.centers = None  # cluster centers: random at first, then refined iteratively

    def _feature_frame(self):
        """
        Return the feature columns of ``self.df``, i.e. everything except the label column.

        Selecting by name (instead of "all but the last column") keeps the
        methods correct when the label column is missing or is not the last one.
        """
        return self.df.drop(columns=[self._LABEL_COLUMN], errors='ignore')

    def run(self):
        """
        Entry point: initialise, cluster until convergence (or ``iterate_num``), then plot.
        """
        hues.info(f'{self.__class__.__name__}已运行...')
        # 1. Add the label column and pick K random rows as the initial centers.
        self.df[self._LABEL_COLUMN] = np.nan
        self.make_random_center()
        # 2. Assign every point to its nearest provisional center.
        self.calc_distance()
        # 3. Move each center to the mean of the points assigned to it.
        self.update_centers()
        # 4. Repeat assignment/update until the centers stop moving.
        self.do_iteration()
        # 5. Draw the result (uses the first two columns only).
        self.plot_scatter_2D()

    def make_random_center(self):
        """
        Pick K distinct rows of the feature data at random as the initial centers.

        Rows are chosen by position, so any row index (non-default, string,
        shuffled, ...) is supported. The resulting ``self.centers`` is a float
        frame re-indexed 0..K-1 so that row ``i`` is cluster ``i``.

        :raises ValueError: if K is not between 1 and the number of rows.
        """
        features = self._feature_frame()
        n_rows = features.shape[0]
        if not 1 <= self.K <= n_rows:
            raise ValueError(
                f'K must be between 1 and the number of rows ({n_rows}), got {self.K}'
            )

        if self.random_state is None:
            # Global numpy state: honours any np.random.seed() set by the caller.
            shuffled_positions = np.random.permutation(n_rows)
        else:
            shuffled_positions = np.random.default_rng(self.random_state).permutation(n_rows)

        chosen = features.iloc[shuffled_positions[:self.K], :]
        # Float dtype up front so that fractional means can be stored later
        # even when the features are integers.
        self.centers = chosen.astype(float).reset_index(drop=True)

    def calc_distance(self):
        """
        Assign every point to its nearest center (Euclidean distance).

        With m points and K centers of n features each, an m x K distance
        matrix is built via broadcasting (an m x K x n intermediate array) and
        the column index of the row-wise minimum is written to
        ``near_center_id``.
        """
        points = self._feature_frame().to_numpy(dtype=float)   # m x n
        centers = self.centers.to_numpy(dtype=float)           # K x n
        deltas = points[:, np.newaxis, :] - centers[np.newaxis, :, :]
        distances = np.sqrt((deltas ** 2).sum(axis=2))          # m x K

        self.df[self._LABEL_COLUMN] = np.argmin(distances, axis=1)

    def update_centers(self):
        """
        Move each center to the mean of the points currently assigned to it.

        A cluster that received no points keeps its previous center. Writing
        the mean of an empty selection would store NaN, and because argmin
        returns the position of a NaN, every point would then be assigned to
        that empty cluster on the next pass.
        """
        features = self._feature_frame()
        labels = self.df[self._LABEL_COLUMN].to_numpy()
        for cluster_id in range(self.K):
            members = features[labels == cluster_id]
            if members.empty:
                continue
            self.centers.iloc[cluster_id, :] = members.mean().to_numpy()

    def do_iteration(self):
        """
        Alternate assignment and center update until the centers stop changing.

        Stops early (with a success log) as soon as an update leaves the
        centers exactly unchanged; logs a warning if ``iterate_num`` passes
        are used up without reaching that point.
        """
        for iteration in range(self.iterate_num):
            self.calc_distance()
            # Snapshot before updating: identical centers afterwards mean convergence.
            previous_centers = self.centers.copy(deep=True)
            self.update_centers()
            if previous_centers.equals(self.centers):
                hues.success(f'结束运行, 共进行了【{iteration}】次迭代.')
                break
        else:
            hues.warn(f'已达到最大迭代次数【{self.iterate_num}】, 簇中心仍未收敛.')

    def plot_scatter_2D(self):
        """
        Scatter-plot the first two columns coloured by cluster, with the centers as stars.

        Intended for 2-D data (n = 2). For n > 2 only the first two columns
        are drawn, so reduce the dimensionality beforehand if a faithful
        picture is needed.
        """
        # Requires an IPython kernel: get_ipython is only defined there.
        get_ipython().run_line_magic('matplotlib', 'notebook')
        ax = sns.scatterplot(
            x=self.df.iloc[:, 0],
            y=self.df.iloc[:, 1],
            hue=self.df[self._LABEL_COLUMN],
        )
        # Draw the centers on the same axes explicitly rather than relying on
        # matplotlib's implicit "current axes".
        sns.scatterplot(
            x=self.centers.iloc[:, 0],
            y=self.centers.iloc[:, 1],
            marker="*",
            s=500,
            ax=ax,
        )


# Cluster the loaded points into four groups. run() logs its progress, writes the
# near_center_id column onto df itself and draws the scatter plot.
demo = KMeans(df=df, K=4)
demo.run()
demo.df  # last expression of the cell: show the frame with its cluster labels
# Leftovers from the run on the '202205' dataset, kept for reference
# (note: the KMeans class above defines no return_df method):
# df_new = demo.return_df()
# df_new['ID'] = df_0['ID']
# df[['ID', 'near_center_id']]

[35m20:52:10[0m - [36mINFO[0m - [39mKMeans已运行...[0m
[35m20:52:10[0m - [32mSUCCESS[0m - [39m结束运行, 共进行了【2】次迭代.[0m


<IPython.core.display.Javascript object>

Unnamed: 0,0,1,near_center_id
0,1.658985,4.285136,3
1,-3.453687,3.424321,0
2,4.838138,-1.151539,2
3,-5.379713,-3.362104,1
4,0.972564,2.924086,3
...,...,...,...
75,-2.793241,-2.149706,1
76,2.884105,3.043438,3
77,-2.967647,2.848696,0
78,4.479332,-1.764772,2


In [3]:
# Load the pickled measurement table and display it.
# This rebinds the notebook-level name `df`, which an earlier cell used for the K-Means input.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
df = pd.read_pickle('./202205.pickle')
df

Unnamed: 0,ID,pH,DO,TN,NO2-,NO3-,NH4+,COD,PO43-,TP
0,N1,8.13,7.64,5.78,0.0228,0.109,0.928,8.41,0.013,0.21
1,N2,8.2,7.43,5.71,0.0206,0.105,0.888,8.2,0.016,0.15
2,N3,8.43,7.35,5.54,0.0234,0.13,0.809,8.59,0.015,0.24
3,N4,8.44,7.16,5.72,0.0236,0.141,1.061,8.01,0.015,0.09
4,N5,8.3,7.59,5.78,0.0228,0.107,0.689,8.19,0.018,0.26
5,N6,8.39,8.14,5.38,0.0242,0.132,0.742,8.54,0.016,0.13
6,N7,8.65,7.98,5.36,0.0255,0.151,0.981,8.32,0.018,0.24
7,N8,8.38,7.82,5.7,0.0326,0.15,0.53,8.23,0.019,0.2
8,N9,8.17,7.32,4.97,0.0284,0.126,0.703,8.26,0.013,0.16
9,N10,8.25,7.52,5.49,0.0222,0.123,0.875,8.46,0.01,0.15


In [4]:
# Consistency check: list the rows where NO2-, NO3- and NH4+ together exceed the TN value.
nitrogen_parts_sum = df[['NO2-', 'NO3-', 'NH4+']].sum(axis=1)
exceeds_tn = nitrogen_parts_sum > df['TN']
df.loc[exceeds_tn]

Unnamed: 0,ID,pH,DO,TN,NO2-,NO3-,NH4+,COD,PO43-,TP


In [5]:
# Demonstrates that a deep copy is independent of the original: the TP column of df
# is blanked after the copy was taken, and the df_0 displayed below is unaffected.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
df = pd.read_pickle('./202205.pickle')
df_0 = df.copy(deep=True)
# To compare whole frames use df.equals(df_0); `if df_0 == df:` raises, because the
# element-wise comparison result has no single truth value.
# Bracket assignment always targets the column. Attribute assignment (df.TP = ...)
# would merely set a Python attribute if no 'TP' column existed.
df['TP'] = np.nan
df_0

Unnamed: 0,ID,pH,DO,TN,NO2-,NO3-,NH4+,COD,PO43-,TP
0,N1,8.13,7.64,5.78,0.0228,0.109,0.928,8.41,0.013,0.21
1,N2,8.2,7.43,5.71,0.0206,0.105,0.888,8.2,0.016,0.15
2,N3,8.43,7.35,5.54,0.0234,0.13,0.809,8.59,0.015,0.24
3,N4,8.44,7.16,5.72,0.0236,0.141,1.061,8.01,0.015,0.09
4,N5,8.3,7.59,5.78,0.0228,0.107,0.689,8.19,0.018,0.26
5,N6,8.39,8.14,5.38,0.0242,0.132,0.742,8.54,0.016,0.13
6,N7,8.65,7.98,5.36,0.0255,0.151,0.981,8.32,0.018,0.24
7,N8,8.38,7.82,5.7,0.0326,0.15,0.53,8.23,0.019,0.2
8,N9,8.17,7.32,4.97,0.0284,0.126,0.703,8.26,0.013,0.16
9,N10,8.25,7.52,5.49,0.0222,0.123,0.875,8.46,0.01,0.15
