# d_zonalst&cluster_data_combine_visualize is designed to combine Cluster and building parameters for easier visualization of subsequent indicators.

This code is intended to extract X, Y, and Cluster data separately, and then combine it with canopy and building parameters to generate the final_cities_cluster_with_index.shp file.

Here is the specific process breakdown:

#### 10 Data Preprocessing
> 10_01 Extract X, Y, 'Place', 'City Name', 'Place ID','Sum ID' and 'Cluster' data separately to obtain an original CSV file.
> 10_02 csv file into shapefile
> 10_03 Convert the shapefile to WGS 1984.

#### 11 Indicator Visualization
> 11_01 This code aims to normalize the 'MEAN', 'FAR', and 'biodensity' indicators.
> 11_01 shapefile into csv
> 11_02 Plot the corresponding indicator images.

# 10_01 Extract X, Y, 'Place', 'City Name', 'Place ID','Sum ID' and 'Cluster' data separately to obtain an original CSV file.

In [1]:
import pandas as pd
import os
import numpy as np
K_range = [11]
for K in K_range:
    cluster_origin_file=rf'D:\file\d_som\223_cities_combined_moving_ave_cluster_sort.csv'
    output_file_path=rf'D:\file\d_som'
    # 创建输出文件所在的目录
    os.makedirs(output_file_path, exist_ok=True)

    data = pd.read_csv(cluster_origin_file)
    data_xy = data.iloc[:, 0:2]  # Assuming that the first two columns are not features
    # 将结果保存为CSV文件，并加入前两列数据
    df = pd.DataFrame(data_xy)
    # 确保这些列存在
    for col in ['Place', 'City Name', 'Place ID','Sum ID', 'Cluster']:
        if col in data.columns:
            df[col] = data[col]

    output_file =  os.path.join(output_file_path,rf'223_cities_combined_moving_ave_cluster_sort_xy.csv')
    df.to_csv(output_file, index=False)

# 10_02 csv file into shapefile

In [2]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

# 定义 K 值

# 定义文件路径
csv_file = r"D:\file\d_som/223_cities_combined_moving_ave_cluster_sort_xy.csv"
shp_file = r"D:\file\d_som/223_cities_combined_moving_ave_cluster_sort_xy.shp"

# 读取 CSV 文件
df = pd.read_csv(csv_file)

# 创建 GeoDataFrame
geometry = [Point(xy) for xy in zip(df['X'], df['Y'])]
gdf = gpd.GeoDataFrame(df, geometry=geometry)

# 定义坐标系
# 使用 PROJ 字符串定义 Sinusoidal 投影
sinusoidal_proj = "+proj=sinu +R=6371007.181 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs"
gdf.set_crs(sinusoidal_proj, inplace=True)  # 设置输入坐标系为 Sinusoidal

# 将坐标系转换为 WGS 1984
gdf = gdf.to_crs(epsg=4326)  # WGS 1984 的 EPSG 代码是 4326

# 保存为 Shapefile
gdf.to_file(shp_file)

print("Shapefile 生成完成：", shp_file)


Shapefile 生成完成： D:\file\d_som/223_cities_combined_moving_ave_cluster_sort_xy.shp


# 10_03 Convert the shapefile to WGS 1984.

In [3]:
import geopandas as gpd

# 设置输入和输出文件路径
input_shapefile = r"D:\file\e_index\223cities_output_file(MEAN_BD_FAR_CD_CR_DistWB_DistGL_DistMT).shp"
output_shapefile = r"D:\file\e_index\223_cities_cluster_with_index.shp"
projection_file = r"D:\file\d_som\223_cities_combined_moving_ave_cluster_sort_xy.shp"

# 读取输入 Shapefile
gdf = gpd.read_file(input_shapefile)

# 读取投影 Shapefile
proj_gdf = gpd.read_file(projection_file)

# 1. 将 gdf 和 proj_gdf 先转换为 UTM 坐标系（以第一个点的 UTM 区域为基准）
utm_crs = gdf.estimate_utm_crs()
gdf = gdf.to_crs(utm_crs)
proj_gdf = proj_gdf.to_crs(utm_crs)

# 3. 进行空间连接
gdf = gdf.sjoin(proj_gdf[['geometry', 'Cluster', 'Place', 'Place ID', 'Sum ID', 'City Name']], how='left', predicate='intersects')

# 将 Cluster 字段重命名为 K_11
gdf.rename(columns={'Cluster': 'Cluster'}, inplace=True)

# 4. 转换回 WGS 84
gdf = gdf.to_crs("EPSG:4326")

# 删除不需要的 sjoin 产生的索引列
gdf.drop(columns=['index_right'], inplace=True, errors='ignore')

# 保存最终结果
gdf.to_file(output_shapefile)
print(f"已成功进行空间连接，并保存到 {output_shapefile}")

已成功进行空间连接，并保存到 D:\file\e_index\223_cities_cluster_with_index.shp


In [13]:
import geopandas as gpd

# 设置输入和输出文件路径
input_shapefile = r"D:\file\e_index\223cities_output_file(MEAN_BD_FAR_CD_CR_DistWB_DistGL_DistMT).shp"
output_shapefile = r"D:\file\e_index\223_cities_cluster_with_index_original.shp"
projection_file = r"D:\file\d_som\223_cities_combined_moving_ave_cluster_sort_xy.shp"

# 读取输入 Shapefile
gdf = gpd.read_file(input_shapefile)

# 读取投影 Shapefile
proj_gdf = gpd.read_file(projection_file)

utm_crs = gdf.estimate_utm_crs()
gdf = gdf.to_crs(utm_crs)
proj_gdf = proj_gdf.to_crs(utm_crs)

# 用缓冲区进行新的匹配
gdf_buffered = gdf.copy()
gdf_buffered['geometry'] = gdf_buffered.geometry.buffer(5)  # 5米缓冲区


gdf = gdf_buffered.sjoin(proj_gdf[['geometry', 'Cluster', 'Place', 'Place ID', 'Sum ID', 'City Name']], how='left', predicate='intersects')

# 5. 替换 "Sum ID" 为空的情况，并同步修改 'Cluster', 'Place', 'Place ID'
fill_columns = ['Sum ID', 'Cluster', 'Place', 'Place ID']
for col in fill_columns:
    gdf[col] = gdf[col].fillna(-9999)

# 6. 转换回 WGS 84 坐标系
gdf = gdf.to_crs("EPSG:4326")

# 7. 保存到 Shapefile
gdf.to_file(output_shapefile)

print(f"已成功进行空间连接，未匹配数据设为 -9999，并保存到 {output_shapefile}")

  gdf.to_file(output_shapefile)


已成功进行空间连接，未匹配数据设为 -9999，并保存到 D:\file\e_index\223_cities_cluster_with_index_original.shp


In [14]:
import geopandas as gpd

# 读取 shapefile
shp_path = r"D:\file\e_index\223_cities_cluster_with_index_original.shp"
gdf = gpd.read_file(shp_path)

print("'Sum ID' 列统计情况：")
print(gdf['Sum ID'].describe())  # 统计描述
print("\n唯一值数量：", gdf['Sum ID'].nunique())  # 统计唯一值个数
print("\n前10个 Sum ID 频次：")
print(gdf['Sum ID'].value_counts().head(10))  # 查看最常见的 10 个值及其数量

'Sum ID' 列统计情况：
count    151352.000000
mean      75674.694731
std       43693.313136
min       -9999.000000
25%       37836.750000
50%       75674.500000
75%      113513.250000
max      151353.000000
Name: Sum ID, dtype: float64

唯一值数量： 151349

前10个 Sum ID 频次：
-9999.0      3
 91495.0     2
 0.0         1
 143259.0    1
 143260.0    1
 143261.0    1
 143262.0    1
 143263.0    1
 143264.0    1
 143265.0    1
Name: Sum ID, dtype: int64


In [None]:
'''
这里需要手动删除重复值 和 nodata值
将 223_cities_cluster_with_index_original.shp
->>> 打开arcgis 把Sum ID = 91495的右边的那个栅格删了
#########? 还有三个没有匹配的数据？
223_cities_cluster_with_index.shp
'''

In [3]:
import geopandas as gpd

# 读取 shapefile
shp_path = r"D:\file\e_index\223_cities_cluster_with_index.shp"
gdf = gpd.read_file(shp_path)

print("'Sum ID' 列统计情况：")
print(gdf['Sum ID'].describe())  # 统计描述
print("\n唯一值数量：", gdf['Sum ID'].nunique())  # 统计唯一值个数
print("\n前10个 Sum ID 频次：")
print(gdf['Sum ID'].value_counts().head(10))  # 查看最常见的 10 个值及其数量

'Sum ID' 列统计情况：
count    151351.000000
mean      75674.590204
std       43693.438558
min       -9999.000000
25%       37836.500000
50%       75674.000000
75%      113513.500000
max      151353.000000
Name: Sum ID, dtype: float64

唯一值数量： 151349

前10个 Sum ID 频次：
-9999.0      3
 0.0         1
 143255.0    1
 143257.0    1
 143258.0    1
 143259.0    1
 143260.0    1
 143261.0    1
 143262.0    1
 143263.0    1
Name: Sum ID, dtype: int64


# 11_01 This code aims to normalize the ‘MEAN’, ‘FAR’, and ‘biodensity’ indicators.

In [5]:
# import geopandas as gpd
# from sklearn.preprocessing import MinMaxScaler
#
# # 读取 Shapefile 文件
# input_shp_path = r"D:\file\e_index\223_cities_cluster_with_index.shp"
# output_shp_path = r"D:\file\e_index\223_cities_cluster_with_index_normal.shp"
#
# # 读取数据
# gdf = gpd.read_file(input_shp_path)
#
# # 选择需要归一化的列
# columns_to_normalize = ['MEAN', 'FAR', 'Candensity','CanovRatio']
#
# # 初始化 MinMaxScaler
# scaler = MinMaxScaler()
#
# # 进行归一化处理
# gdf[columns_to_normalize] = scaler.fit_transform(gdf[columns_to_normalize])
#
# # 保存结果到新的 Shapefile 文件
# gdf.to_file(output_shp_path)
#
# print(f"归一化处理完成，结果已保存到 {output_shp_path}")


归一化处理完成，结果已保存到 D:\file\e_index\223_cities_cluster_with_index_normal.shp


In [6]:
# import pandas as pd
# import geopandas as gpd
# from sklearn.preprocessing import MinMaxScaler
#
# # 读取 Shapefile 文件
# input_shp_path = r"D:\file\e_index\223_cities_cluster_with_index.shp"
# output_shp_path = r"D:\file\e_index\223_cities_cluster_with_index_normal.shp"
#
# # 读取数据
# gdf = gpd.read_file(input_shp_path)
#
# # 选择需要归一化的列
# columns_to_normalize = ['MEAN', 'FAR', 'Candensity', 'CanovRatio']
#
# # 归一化前检查数据
# print("归一化前数据统计：")
# print(gdf[columns_to_normalize].describe())
#
# # 初始化 MinMaxScaler
# scaler = MinMaxScaler()
#
# # 进行归一化处理
# gdf[columns_to_normalize] = scaler.fit_transform(gdf[columns_to_normalize])
#
# # 归一化后检查数据
# print("\n归一化后数据统计：")
# print(gdf[columns_to_normalize].describe())
#
# # 保存结果到新的 Shapefile 文件
# gdf.to_file(output_shp_path)
#
# print(f"归一化处理完成，结果已保存到 {output_shp_path}")


归一化前数据统计：
                MEAN           FAR     Candensity     CanovRatio
count  150651.000000  1.506200e+05  150616.000000  150616.000000
mean        1.771554  4.506156e-01       0.583514       2.331681
std         1.672250  5.395749e-01       0.238900       2.454623
min         0.000148  1.956096e-08       0.000726       0.000007
25%         0.569411  7.443486e-02       0.397272       0.435813
50%         1.313960  2.610610e-01       0.608617       1.381586
75%         2.363189  6.102097e-01       0.782196       3.590397
max        16.534384  5.286934e+00       1.149691      19.507178

归一化后数据统计：
                MEAN            FAR     Candensity     CanovRatio
count  150651.000000  150620.000000  150616.000000  150616.000000
mean        0.107136       0.085232       0.507229       0.119529
std         0.101139       0.102058       0.207926       0.125832
min         0.000000       0.000000       0.000000       0.000000
25%         0.034429       0.014079       0.345133       0.02234

# 11_01 shapefile into csv

In [15]:
import geopandas as gpd
import pandas as pd
# 读取 Shapefile 文件
shp_path = r"D:\file\e_index\223_cities_cluster_with_index.shp"
output_path = r"D:\file\e_index\223_cities_cluster_with_index.csv"

# # 使用 GeoPandas 读取 Shapefile
gdf = gpd.read_file(shp_path)

# 将 GeoDataFrame 转换为 DataFrame
df = pd.DataFrame(gdf)

# 保存为 CSV 文件
df.to_csv(output_path, index=False)

print(f"Shapefile 已成功转换为 CSV，结果已保存到 {output_path}")

Shapefile 已成功转换为 CSV，结果已保存到 D:\file\e_index\223_cities_cluster_with_index.csv


In [8]:
# import geopandas as gpd
#
# # # 读取 Shapefile 文件
# # shp_path = r"D:\file\e_index\223_cities_cluster_with_index.shp"#_normal
# # output_path = r"D:\file\e_index/223_cities_cluster_with_index.csv"
# # 读取 Shapefile 文件
# shp_path = r"D:\file\e_index\223_cities_cluster_with_index_normal.shp"#_normal
# output_path = r"D:\file\e_index/223_cities_cluster_with_index_normal.csv"
# # 使用 GeoPandas 读取 Shapefile
# gdf = gpd.read_file(shp_path)
#
# # 将 GeoDataFrame 转换为 DataFrame
# df = pd.DataFrame(gdf)
#
# # 保存为 CSV 文件
# df.to_csv(output_path, index=False)
#
# print(f"Shapefile 已成功转换为 CSV，结果已保存到 {output_path}")


Shapefile 已成功转换为 CSV，结果已保存到 D:\file\e_index/223_cities_cluster_with_index_normal.csv


# 11_02 Plot the corresponding indicator images.


In [36]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import os
#
# def plot_cluster_histograms(input_csv_path, columns_to_plot, K):
#     # 读取数据
#     df = pd.read_csv(input_csv_path)
#
#     # 创建绘图
#     for cluster_id in range(K):
#         # 选择属于当前 cluster_id 的数据
#         cluster_data = df[df[f'Cluster'] == cluster_id]
#         # 过滤掉值为 -9999 的数据
#         cluster_data = cluster_data[~cluster_data[columns_to_plot].isin([-9999]).any(axis=1)]
#
#
#         # **输出当前 cluster 变量的统计信息**
#         print(f"\n==== Cluster {cluster_id} (K={K}) data ====")
#         print(cluster_data[columns_to_plot].describe())
#
#         # 创建直方图图像
#         plt.figure(figsize=(10, 6))
#
#         # 绘制每个列的直方图
#         for column in columns_to_plot:
#             sns.histplot(cluster_data[column], bins=10, kde=False, label=f'{column} Histogram', stat="density")
#             #print(f"绘制直方图: {column} (Cluster {cluster_id}) - 数据点数: {cluster_data[column].count()}")
#
#         # 设置图像标题和标签
#         plt.title(f'Cluster {cluster_id} Histograms (K={K})')
#         plt.xlabel('Value')
#         plt.ylabel('Density')
#
#         # # 固定横纵坐标范围
#         # plt.xlim(-0.5,20)
#         # plt.ylim(0, 5)
#
#         plt.legend()
#
#         # 保存直方图图像
#         output_hist_path = os.path.join (output_folder, rf'K={K}_histogram_cluster_{cluster_id}.png')
#         plt.savefig(output_hist_path)
#         plt.close()
#         #print(f"直方图已保存: {output_hist_path}")
#
#         # 创建 KDE 图像
#         plt.figure(figsize=(10, 6))
#
#         # 绘制每个列的 KDE 图
#         for column in columns_to_plot:
#             sns.kdeplot(cluster_data[column], label=f'{column} KDE')
#             #print(f"绘制 KDE: {column} (Cluster {cluster_id}) - 数据点数: {cluster_data[column].count()}")
#
#         # 设置图像标题和标签
#         plt.title(f'Cluster {cluster_id} KDE Plots (K={K})')
#         plt.xlabel('Value')
#         plt.ylabel('Density')
#
#         # # 固定横纵坐标范围
#         # plt.xlim(-0.5,20)
#         # plt.ylim(0, 5)
#
#         plt.legend()
#
#         # 保存 KDE 图像
#         output_kde_path = os.path.join (output_folder, rf'K={K}_kdeplot_cluster_{cluster_id}.png')
#         plt.savefig(output_kde_path)
#         plt.close()
#         #print(f"KDE 图像已保存: {output_kde_path}")
#
# def plot_cluster_analysis(input_csv_path, columns_to_plot, K):
#     # 读取数据
#     df = pd.read_csv(input_csv_path)
#
#     # 创建绘图
#     for cluster_id in range(K):
#         # 选择属于当前 cluster_id 的数据
#         cluster_data = df[df[f'Cluster'] == cluster_id]
#         # 过滤掉值为 -9999 的数据
#         cluster_data = cluster_data[~cluster_data[columns_to_plot].isin([-9999]).any(axis=1)]
#
#         # **输出统计信息**
#         #print(f"\n==== Cluster {cluster_id} (K={K}) data ====")
#         #print(cluster_data[columns_to_plot].describe())
#
#         # 将数据从宽格式转换为长格式
#         cluster_data_melted = cluster_data.melt(id_vars=[f'Cluster'], value_vars=columns_to_plot, var_name='Variable', value_name='Value')
#
#         # 创建一个新的图像
#         plt.figure(figsize=(12, 8))
#
#         # 绘制小提琴图
#         sns.violinplot(x='Variable', y='Value', data=cluster_data_melted, inner=None, scale='width', palette='Set3')
#
#         # 绘制箱线图
#         sns.boxplot(x='Variable', y='Value', data=cluster_data_melted, whis=1.5, width=0.2, palette='Set2', fliersize=0)
#
#         # **输出每个变量的统计信息**
#         # for column in columns_to_plot:
#         #     print(f"绘制箱线图 & 小提琴图: {column} (Cluster {cluster_id}) - 数据点数: {cluster_data[column].count()}")
#
#         # 设置图像标题和标签
#         plt.title(f'Cluster {cluster_id} Data Distribution')
#         plt.xlabel('Variable')
#         plt.ylabel('Value')
#         # plt.ylim(-1.5, 14)
#         # 调整布局
#         plt.tight_layout()
#
#         # 保存图像
#         output_image_path = os.path.join (output_folder, rf'K={K}_boxplot_cluster_{cluster_id}.png')
#
#         plt.savefig(output_image_path)
#         plt.close()
#         #print(f"箱线图 & 小提琴图已保存: {output_image_path}")


In [62]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_cluster_histograms(input_csv_path, columns_to_plot, K, output_folder):
    # 读取数据
    df = pd.read_csv(input_csv_path)

    # 创建一个空的 DataFrame 用于存储所有 cluster 的统计数据
    all_cluster_stats = []

    # 遍历每个 cluster 进行绘制
    for cluster_id in range(K):
        # 选择属于当前 cluster_id 的数据
        cluster_data = df[df[f'Cluster'] == cluster_id]
        # 过滤掉值为 -9999 的数据
        cluster_data = cluster_data[~cluster_data[columns_to_plot].isin([-9999]).any(axis=1)]
        # 输出 describe() 的 count 为整数
        describe_output = cluster_data[columns_to_plot].describe().transpose()
        describe_output['count'] = describe_output['count'].astype(int)

        # 记录 cluster 统计信息
        describe_output['Cluster'] = cluster_id
        all_cluster_stats.append(describe_output)

        # 打印统计信息
        print(f"\n==== Cluster {cluster_id} (K={K}) data ====")
        print(describe_output)

        # 创建图像
        plt.figure(figsize=(10, 6))

        for column in columns_to_plot:
            # 绘制直方图
            sns.histplot(cluster_data[column], bins=10, kde=False, label=f'{column} Histogram', stat="proportion", alpha=0.3)

            # 计算直方图数据
            counts, bin_edges = np.histogram(cluster_data[column], bins=10, density=False)
            total_count = counts.sum()  # 总样本数
            proportions = counts / total_count  # 计算 proportion

            # 计算 bin 的中心位置
            bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

            # 绘制 proportion 曲线
            plt.plot(bin_centers, proportions, marker='o', linestyle='-', label=f'{column} Proportion Curve')

        plt.title(f'Cluster {cluster_id} Histogram with Proportion Curve')
        plt.xlabel('Value')
        plt.ylabel('Proportion')
        plt.xlim(-0.5, 10)
        plt.ylim(0, 1)
        plt.legend()

        # 保存直方图图像
        output_hist_path = os.path.join(output_folder, rf'K={K}_histogram_cluster_{cluster_id}.png')
        plt.savefig(output_hist_path)
        plt.close()  # 关闭图像，避免影响下一个 cluster


        # # 创建 KDE 图像
        # plt.figure(figsize=(10, 6))
        # for column in columns_to_plot:
        #     sns.kdeplot(cluster_data[column], label=f'{column} KDE')
        #
        # plt.title(f'Cluster {cluster_id} KDE Plots (K={K})')
        # plt.xlabel('Value')
        # plt.ylabel('proportion')
        # plt.xlim(0, 10)
        # plt.ylim(0, 1)
        # plt.legend()
        #
        # # 保存 KDE 图像
        # output_kde_path = os.path.join(output_folder, rf'K={K}_kdeplot_cluster_{cluster_id}.png')
        # plt.savefig(output_kde_path)
        # plt.close()
        #
        # # 创建折线图
        # plt.figure(figsize=(10, 6))
        # for column in columns_to_plot:
        #     value_counts = cluster_data[column].value_counts().sort_index()
        #     plt.plot(value_counts.index, value_counts.values, label=f'{column} Lineplot', alpha=0.5)
        #
        # plt.title(f'Cluster {cluster_id} Line Plots (K={K})')
        # plt.xlabel('Value')
        # plt.ylabel('Count')
        # plt.xlim(0, 16)
        # plt.ylim(0, 1)
        # plt.legend()
        #
        # # 保存折线图像
        # output_line_path = os.path.join(output_folder, rf'K={K}_lineplot_cluster_{cluster_id}.png')
        # plt.savefig(output_line_path)
        # plt.close()

    # 合并所有统计数据到一个 DataFrame
    stats_df = pd.concat(all_cluster_stats)
    stats_df.to_csv(os.path.join(output_folder, 'cluster_stats.csv'))

def plot_cluster_analysis(input_csv_path, columns_to_plot, K, output_folder):
    # 读取数据
    df = pd.read_csv(input_csv_path)

    # 创建绘图
    for cluster_id in range(K):
        # 选择属于当前 cluster_id 的数据
        cluster_data = df[df[f'Cluster'] == cluster_id]
        # 过滤掉值为 -9999 的数据
        cluster_data = cluster_data[~cluster_data[columns_to_plot].isin([-9999]).any(axis=1)]

        # 将数据从宽格式转换为长格式
        cluster_data_melted = cluster_data.melt(id_vars=[f'Cluster'], value_vars=columns_to_plot, var_name='Variable', value_name='Value')

        # 创建一个新的图像
        plt.figure(figsize=(12, 8))

        # 绘制小提琴图
        sns.violinplot(x='Variable', y='Value', data=cluster_data_melted, inner=None, scale='width', palette='Set3')

        # 绘制箱线图
        sns.boxplot(x='Variable', y='Value', data=cluster_data_melted, whis=1.5, width=0.2, palette='Set2', fliersize=0)

        # 设置图像标题和标签
        plt.title(f'Cluster {cluster_id} Data Distribution')
        plt.xlabel('Variable')
        plt.ylabel('Value')
        plt.ylim(-1, 15)
        plt.tight_layout()

        # 保存图像
        output_image_path = os.path.join(output_folder, rf'K={K}_boxplot_cluster_{cluster_id}.png')

        plt.savefig(output_image_path)
        plt.close()

# 运行代码
input_csv_path = r"D:\file\e_index/223_cities_cluster_with_index.csv"
columns_to_plot = ['Buiheight', 'CoverRatio', 'FAR', 'GreenRatio', 'GrVolRatio']

for K in range(11, 12):
    output_folder = rf'D:\file\e_index\fig\bui'
    os.makedirs(output_folder, exist_ok=True)

    plot_cluster_histograms(input_csv_path, columns_to_plot, K, output_folder)
    plot_cluster_analysis(input_csv_path, columns_to_plot, K, output_folder)



==== Cluster 0 (K=11) data ====
            count      mean       std           min       25%       50%  \
Buiheight    4157  1.371703  1.320751  5.800000e-04  0.326603  0.971972   
CoverRatio   4157  0.144457  0.097610  6.275001e-05  0.066631  0.127305   
FAR          4157  0.106000  0.159919  2.358936e-08  0.007178  0.040911   
GreenRatio   4157  0.571945  0.291409  2.667314e-03  0.278519  0.650296   
GrVolRatio   4157  3.566075  3.318765  5.334628e-05  0.241232  3.175530   

                 75%        max  Cluster  
Buiheight   1.954704  10.281525        0  
CoverRatio  0.198991   0.480845        0  
FAR         0.129787   1.296226        0  
GreenRatio  0.832622   1.000000        0  
GrVolRatio  6.037145  16.135795        0  

==== Cluster 1 (K=11) data ====
            count      mean       std           min       25%       50%  \
Buiheight   20087  1.121766  1.253995  7.250000e-04  0.279879  0.667863   
CoverRatio  20087  0.125210  0.088932  3.486112e-06  0.059809  0.104024   



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='Variable', y='Value', data=cluster_data_melted, inner=None, scale='width', palette='Set3')

The `scale` parameter has been renamed and will be removed in v0.15.0. Pass `density_norm='width'` for the same effect.
  sns.violinplot(x='Variable', y='Value', data=cluster_data_melted, inner=None, scale='width', palette='Set3')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Variable', y='Value', data=cluster_data_melted, whis=1.5, width=0.2, palette='Set2', fliersize=0)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='Variable', y='Value',

In [64]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

def plot_cluster_histograms(input_csv_path, columns_to_plot, K, output_folder):
    # 读取数据
    df = pd.read_csv(input_csv_path)

    # 用来存储统计信息
    stats_df = pd.DataFrame()

    # 遍历每个 cluster 进行绘制
    for cluster_id in range(K):
        # 选择属于当前 cluster_id 的数据
        cluster_data = df[df['Cluster'] == cluster_id]
        # 过滤掉值为 -9999 的数据
        cluster_data = cluster_data[~cluster_data[columns_to_plot].isin([-9999]).any(axis=1)]
        # 过滤距离大于 1000 的数据
        cluster_data = cluster_data[(cluster_data[columns_to_plot] <= 1000).all(axis=1)]

        # 计算统计信息
        describe_output = cluster_data[columns_to_plot].describe().transpose()
        describe_output['Cluster'] = cluster_id
        stats_df = pd.concat([stats_df, describe_output])

        # 创建绘图
        plt.figure(figsize=(10, 6))

        for column in columns_to_plot:
            # 计算直方图数据
            counts, bin_edges = np.histogram(cluster_data[column], bins=10, density=False)
            total_count = counts.sum()
            proportions = counts / total_count  # 计算 proportion
            bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2  # 计算 bin 的中心位置

            # 绘制直方图 (proportion)
            sns.histplot(cluster_data[column], bins=10, stat="proportion", alpha=0.3, label=f'{column} Histogram')

            # 绘制 proportion 曲线
            plt.plot(bin_centers, proportions, marker='o', linestyle='-', label=f'{column} Proportion Curve')

        # 设置标题和坐标轴
        plt.title(f'Cluster {cluster_id} Histogram with Proportion Curve (K={K})')
        plt.xlabel('Value')
        plt.ylabel('Proportion')
        plt.ylim(0, 1)
        plt.legend()

        # 保存图像
        output_hist_path = os.path.join(output_folder, rf'K={K}_histogram_cluster_{cluster_id}.png')
        plt.savefig(output_hist_path)
        plt.close()

        # # 创建 KDE 图像
        # plt.figure(figsize=(10, 6))
        # for column in columns_to_plot:
        #     sns.kdeplot(cluster_data[column], label=f'{column} KDE')
        #
        # plt.title(f'Cluster {cluster_id} KDE Plots (K={K})')
        # plt.xlabel('Value')
        # plt.ylabel('Density')
        # plt.xlim(0, 1000)
        # plt.ylim(0, 0.025)
        # plt.legend()
        #
        # # 保存 KDE 图像
        # output_kde_path = os.path.join(output_folder, rf'K={K}_kdeplot_cluster_{cluster_id}.png')
        # plt.savefig(output_kde_path)
        # plt.close()

    # 将统计信息导出为 CSV 文件
    stats_output_path = os.path.join(output_folder, rf'K={K}_cluster_stats.csv')
    stats_df.to_csv(stats_output_path)
    print(f"统计结果已保存: {stats_output_path}")

def plot_cluster_analysis(input_csv_path, columns_to_plot, K, output_folder):
    # 读取数据
    df = pd.read_csv(input_csv_path)

    # 创建绘图
    for cluster_id in range(K):
        # 选择属于当前 cluster_id 的数据
        cluster_data = df[df[f'Cluster'] == cluster_id]
        # 过滤掉值为 -9999 的数据
        cluster_data = cluster_data[~cluster_data[columns_to_plot].isin([-9999]).any(axis=1)]
        # 假设你的列值表示距离，可以进行如下条件过滤
        cluster_data = cluster_data[(cluster_data[columns_to_plot] <= 1000).all(axis=1)]

        # 将数据从宽格式转换为长格式
        cluster_data_melted = cluster_data.melt(id_vars=[f'Cluster'], value_vars=columns_to_plot, var_name='Variable', value_name='Value')

        # 创建一个新的图像
        plt.figure(figsize=(12, 8))

        # 绘制小提琴图
        sns.violinplot(x='Variable', y='Value', data=cluster_data_melted, inner=None, scale='width', palette='Set3')

        # 绘制箱线图
        sns.boxplot(x='Variable', y='Value', data=cluster_data_melted, whis=1.5, width=0.2, palette='Set2', fliersize=0)

        # 设置图像标题和标签
        plt.title(f'Cluster {cluster_id} Data Distribution')
        plt.xlabel('Variable')
        plt.ylabel('Value')
        # 调整布局
        plt.tight_layout()
        plt.ylim(-200, 1200)

        # 保存图像
        output_image_path = os.path.join(output_folder, rf'K={K}_boxplot_cluster_{cluster_id}.png')

        plt.savefig(output_image_path)
        plt.close()


# 运行代码
input_csv_path = r"D:\file\e_index/223_cities_cluster_with_index.csv"
columns_to_plot = ['MIN_DistWB', 'MIN_DistGL', 'MIN_DistMT']

for K in range(11, 12):
    output_folder = rf'D:\file\e_index\fig\dist'
    os.makedirs(output_folder, exist_ok=True)

    plot_cluster_histograms(input_csv_path, columns_to_plot, K, output_folder)
    plot_cluster_analysis(input_csv_path, columns_to_plot, K, output_folder)


统计结果已保存: D:\file\e_index\fig\dist\K=11_cluster_stats.csv



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='Variable', y='Value', data=cluster_data_melted, inner=None, scale='width', palette='Set3')

The `scale` parameter has been renamed and will be removed in v0.15.0. Pass `density_norm='width'` for the same effect.
  sns.violinplot(x='Variable', y='Value', data=cluster_data_melted, inner=None, scale='width', palette='Set3')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Variable', y='Value', data=cluster_data_melted, whis=1.5, width=0.2, palette='Set2', fliersize=0)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='Variable', y='Value',

In [33]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import os
#
# def plot_cluster_histograms(input_csv_path, columns_to_plot, K, output_folder):
#     # 读取数据
#     df = pd.read_csv(input_csv_path)
#
#     # 用来存储统计信息
#     stats_df = pd.DataFrame()
#
#     # 创建绘图
#     for cluster_id in range(K):
#         # 选择属于当前 cluster_id 的数据
#         cluster_data = df[df[f'Cluster'] == cluster_id]
#         # 过滤掉值为 -9999 的数据
#         cluster_data = cluster_data[~cluster_data[columns_to_plot].isin([-9999]).any(axis=1)]
#         # 假设你的列值表示距离，可以进行如下条件过滤
#         cluster_data = cluster_data[(cluster_data[columns_to_plot] ).all(axis=1)]
#
#         # 获取统计信息并添加到 DataFrame 中
#         describe_output = cluster_data[columns_to_plot].describe().transpose()
#         describe_output['Cluster'] = cluster_id  # 添加 Cluster 信息
#         stats_df = pd.concat([stats_df, describe_output])
#
#         # 创建直方图图像
#         plt.figure(figsize=(10, 6))
#
#         # 绘制每个列的直方图
#         for column in columns_to_plot:
#             sns.histplot(cluster_data[column], bins=10, kde=False, label=f'{column} Histogram', stat="density")
#
#         # 设置图像标题和标签
#         plt.title(f'Cluster {cluster_id} Histograms (K={K})')
#         plt.xlabel('Value')
#         plt.ylabel('Density')
#
#         # 固定横纵坐标范围
#         plt.xlim(-0.5, 5000)
#         plt.ylim(0, 0.005)
#
#         plt.legend()
#
#         # 保存直方图图像
#         output_hist_path = os.path.join(output_folder, rf'K={K}_histogram_cluster_{cluster_id}.png')
#         plt.savefig(output_hist_path)
#         plt.close()
#
#     # 将统计信息导出为 CSV 文件
#     stats_output_path = os.path.join(output_folder, rf'K={K}_cluster_stats.csv')
#     stats_df.to_csv(stats_output_path)
#     print(f"统计结果已保存: {stats_output_path}")
#
# def plot_cluster_analysis(input_csv_path, columns_to_plot, K, output_folder):
#     # 读取数据
#     df = pd.read_csv(input_csv_path)
#
#     # 创建绘图
#     for cluster_id in range(K):
#         # 选择属于当前 cluster_id 的数据
#         cluster_data = df[df[f'Cluster'] == cluster_id]
#         # 过滤掉值为 -9999 的数据
#         cluster_data = cluster_data[~cluster_data[columns_to_plot].isin([-9999]).any(axis=1)]
#         # 假设你的列值表示距离，可以进行如下条件过滤
#         cluster_data = cluster_data[(cluster_data[columns_to_plot] ).all(axis=1)]
#
#         # 将数据从宽格式转换为长格式
#         cluster_data_melted = cluster_data.melt(id_vars=[f'Cluster'], value_vars=columns_to_plot, var_name='Variable', value_name='Value')
#
#         # 创建一个新的图像
#         plt.figure(figsize=(12, 8))
#
#         # 绘制小提琴图
#         sns.violinplot(x='Variable', y='Value', data=cluster_data_melted, inner=None, scale='width', palette='Set3')
#
#         # 绘制箱线图
#         sns.boxplot(x='Variable', y='Value', data=cluster_data_melted, whis=1.5, width=0.2, palette='Set2', fliersize=0)
#
#         # 设置图像标题和标签
#         plt.title(f'Cluster {cluster_id} Data Distribution')
#         plt.xlabel('Variable')
#         plt.ylabel('Value')
#         # 调整布局
#         plt.tight_layout()
#         plt.ylim(-500, 50000)
#
#         # 保存图像
#         output_image_path = os.path.join(output_folder, rf'K={K}_boxplot_cluster_{cluster_id}.png')
#
#         plt.savefig(output_image_path)
#         plt.close()
#
# # 运行代码
# input_csv_path = r"D:\file\e_index/223_cities_cluster_with_index.csv"
# columns_to_plot = ['MIN_DistWB', 'MIN_DistGL', 'MIN_DistMT']
#
# for K in range(11, 12):
#     output_folder = rf'D:\file\e_index\fig\dist_original'
#     os.makedirs(output_folder, exist_ok=True)
#
#     plot_cluster_histograms(input_csv_path, columns_to_plot, K, output_folder)
#     plot_cluster_analysis(input_csv_path, columns_to_plot, K, output_folder)


统计结果已保存: D:\file\e_index\fig\dist_original\K=11_cluster_stats.csv



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='Variable', y='Value', data=cluster_data_melted, inner=None, scale='width', palette='Set3')

The `scale` parameter has been renamed and will be removed in v0.15.0. Pass `density_norm='width'` for the same effect.
  sns.violinplot(x='Variable', y='Value', data=cluster_data_melted, inner=None, scale='width', palette='Set3')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Variable', y='Value', data=cluster_data_melted, whis=1.5, width=0.2, palette='Set2', fliersize=0)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='Variable', y='Value',