In [None]:
import rasterio
import numpy as np
import glob
import pickle
import os
from tqdm import tqdm

# -------------------------------
# Parameters
# -------------------------------
pv_change_file = "提取结果_2有无_3无有_4有.tif"
driver_folder = "Factor"   # folder holding the driver-factor rasters
sample_rate_all = 0.01     # share of pixels kept for the overall sample (1%)
sample_rate_cls = 0.5      # share of pixels kept for each of classes 2/3/4 (50%)
# -------------------------------

# Load band 1 of the PV change raster
with rasterio.open(pv_change_file) as src:
    pv = src.read(1)

# Pixel positions as index tuples (one index array per raster axis)
idx_all = np.nonzero(pv > 0)   # every pixel with a positive code: pool for the overall sample
idx_2, idx_3, idx_4 = (np.nonzero(pv == code) for code in (2, 3, 4))

# --------------------------
# Overall proportional sample
# --------------------------
num_all = idx_all[0].size
sample_size_all = max(1, int(num_all * sample_rate_all))
print(f"总体像元：{num_all}, 抽样 {sample_rate_all*100:.1f}% → {sample_size_all}")

# Seed NumPy's global generator, shuffle the positions and keep the leading slice
np.random.seed(42)
perm = np.random.permutation(num_all)
sample_idx = perm[:sample_size_all]
idx_all_sampled = tuple(axis[sample_idx] for axis in idx_all)

# --------------------------
# Output containers
# --------------------------
all_samples = {}                      # raster file name -> values at the overall sample positions
cls_samples = {2: {}, 3: {}, 4: {}}   # class code -> {raster file name -> sampled values}


def sample_class(idx, prop):
    """Randomly keep a proportion of the pixel positions in ``idx``.

    Parameters
    ----------
    idx : tuple of index arrays, as returned by ``np.where``.
    prop : fraction of the positions to keep.

    Returns
    -------
    A tuple of index arrays holding ``max(1, int(len * prop))`` randomly
    chosen positions, or ``idx`` itself when it has no more positions
    than that (this includes an empty ``idx``).
    """
    total = len(idx[0])
    n = max(1, int(total * prop))
    if total > n:
        sel = np.random.permutation(total)[:n]
        return tuple(axis[sel] for axis in idx)
    return idx


# --------------------------
# Per-class proportional sample
# --------------------------
# Drawn once, before the loop: every driver raster is then read at the same
# pixels (as is already the case for the overall sample), the permutation is
# not recomputed per file, and the result no longer depends on file order.
idx_2_sampled = sample_class(idx_2, sample_rate_cls)
idx_3_sampled = sample_class(idx_3, sample_rate_cls)
idx_4_sampled = sample_class(idx_4, sample_rate_cls)

# --------------------------
# Loop over the driver rasters
# --------------------------
# glob gives no ordering guarantee; sort so that runs are repeatable.
driver_files = sorted(glob.glob(os.path.join(driver_folder, "*.tif")))
print("\n读取驱动态图层...")

for f in tqdm(driver_files):
    name = os.path.basename(f)

    with rasterio.open(f) as src:
        arr = src.read(1)

    # The sample positions index the PV raster grid; a raster with another
    # shape would be read at the wrong pixels (or fail with an IndexError).
    if arr.shape != pv.shape:
        raise ValueError(
            f"{name}: raster shape {arr.shape} does not match "
            f"the PV change raster shape {pv.shape}"
        )

    # ---------------- overall sample
    all_samples[name] = arr[idx_all_sampled].astype(float)

    # ---------------- classes 2/3/4
    cls_samples[2][name] = arr[idx_2_sampled].astype(float)
    cls_samples[3][name] = arr[idx_3_sampled].astype(float)
    cls_samples[4][name] = arr[idx_4_sampled].astype(float)

# --------------------------
# Persist the samples
# --------------------------
output_file = "driver_distribution_samples2.pkl"
payload = {"all": all_samples, "cls": cls_samples}   # overall and per-class samples in one object
with open(output_file, "wb") as f:
    pickle.dump(payload, f)

print(f"\n样本提取完成 → {output_file}")


In [None]:
import pickle
import pandas as pd
import numpy as np
from scipy.stats import gaussian_kde
import os

# --------------------------
# Parameters
# --------------------------
pkl_file = "driver_distribution_samples2.pkl"
excel_folder = "./Excel_Data"
max_bee_samples = 3000   # cap on values per group in the boxplot / beeswarm sheet
kde_points = 1000        # number of x positions at which each density is evaluated

os.makedirs(excel_folder, exist_ok=True)

# --------------------------
# Load the pickled samples
# --------------------------
# pickle executes arbitrary code on load: only open files produced by a trusted run.
with open(pkl_file, "rb") as f:
    data = pickle.load(f)

all_samples, cls_samples = data["all"], data["cls"]

# --------------------------
# Export plotting data to Excel
# --------------------------
kde_min_samples = 50                      # fewer finite values than this -> no density curve
_sample_rng = np.random.default_rng(42)   # fixed seed: the exported subsample is reproducible


def sample(a, n=max_bee_samples):
    """Return the finite values of ``a``, randomly thinned to at most ``n``."""
    a = a[np.isfinite(a)]
    if len(a) > n:
        return a[_sample_rng.choice(len(a), n, replace=False)]
    return a


def _kde_usable(a):
    """True when the finite array ``a`` can be fed to ``gaussian_kde``.

    Besides the minimum size, the values must not all be equal: constant
    data has a singular covariance, which ``gaussian_kde`` cannot handle.
    """
    return bool(len(a) >= kde_min_samples and np.ptp(a) > 0)


def kde_grid(arrays):
    """Build one x grid shared by the density curves of all ``arrays``.

    The grid runs from the lowest 1st percentile to the highest 99th
    percentile among the arrays that are usable for a KDE. When none is
    usable, a grid of NaN with ``kde_points`` entries is returned.
    """
    lows, highs = [], []
    for a in arrays:
        a = a[np.isfinite(a)]
        if _kde_usable(a):
            lo, hi = np.percentile(a, [1, 99])
            lows.append(lo)
            highs.append(hi)
    if not lows:
        return np.full(kde_points, np.nan)
    return np.linspace(min(lows), max(highs), kde_points)


def compute_kde(a, x=None):
    """Evaluate a Gaussian KDE of the finite values of ``a``.

    Parameters
    ----------
    a : array of values; non-finite entries are ignored.
    x : optional grid to evaluate on. When omitted, the grid spans the
        1st-99th percentile of ``a`` with ``kde_points`` entries.

    Returns
    -------
    ``(x, y)``. ``y`` is all NaN when ``a`` has too few or constant values;
    in that case ``x`` is NaN as well if no grid was supplied.
    """
    a = a[np.isfinite(a)]
    usable = _kde_usable(a)
    if x is None:
        if not usable:
            return np.full(kde_points, np.nan), np.full(kde_points, np.nan)
        x = np.linspace(np.percentile(a, 1), np.percentile(a, 99), kde_points)
    if not usable:
        return x, np.full(len(x), np.nan)
    return x, gaussian_kde(a)(x)


def describe(values):
    """Summary statistics of ``values`` as a dict (NaN for an empty group)."""
    if len(values) == 0:
        # Explicit NaN instead of relying on NumPy's empty-array behaviour.
        return {"N": 0, "Mean": np.nan, "Median": np.nan,
                "Q1": np.nan, "Q3": np.nan, "Std": np.nan}
    return {
        "N": len(values),
        "Mean": np.mean(values),
        "Median": np.median(values),
        "Q1": np.percentile(values, 25),
        "Q3": np.percentile(values, 75),
        "Std": np.std(values),
    }


for driver_name in all_samples.keys():
    print(f">>> 处理 {driver_name}")

    groups = {
        "All": all_samples[driver_name],
        "Class2": cls_samples[2][driver_name],
        "Class3": cls_samples[3][driver_name],
        "Class4": cls_samples[4][driver_name],
    }

    # ---------------- Boxplot / beeswarm data (finite values, thinned)
    plot_vals = {label: sample(vals) for label, vals in groups.items()}

    # Long-format table: one row per value, tagged with its group
    df_box = pd.DataFrame({
        "Value": np.concatenate(list(plot_vals.values())),
        "Class": [label for label, vals in plot_vals.items() for _ in range(len(vals))],
    })

    # ---------------- KDE data
    # All four curves are evaluated on the SAME grid, so that every density
    # column really corresponds to the "x" column it is stored next to.
    x_grid = kde_grid(groups.values())
    kde_columns = {"x": x_grid}
    for label, vals in groups.items():
        kde_columns[label] = compute_kde(vals, x_grid)[1]
    df_kde = pd.DataFrame(kde_columns)

    # ---------------- Stats (describe the values exported in Boxplot_Data)
    df_stats = pd.DataFrame(
        [{"Class": label, **describe(vals)} for label, vals in plot_vals.items()]
    )

    # ---------------- Write the workbook
    excel_path = os.path.join(excel_folder, f"{driver_name}.xlsx")
    with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
        df_box.to_excel(writer, sheet_name="Boxplot_Data", index=False)
        df_kde.to_excel(writer, sheet_name="KDE_Data", index=False)
        df_stats.to_excel(writer, sheet_name="Stats", index=False)

    print(f"✔ Excel 已生成: {excel_path}")

print("===== 全部绘图数据 Excel 导出完成 =====")


In [None]:
import os
import numpy as np
import pandas as pd
import rasterio
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

# ================================
# Parameters
# ================================
result_tif = r"提取结果_2有无_3无有_4有.tif"
driver_folder = r"Factor"     # folder holding the driver-factor rasters
output_folder = r"plots"

# Plot styling
sns.set(style="whitegrid", font_scale=1.5)

# One output sub-folder per plot type
folders = dict(
    box="01_Boxplot",
    kde="02_KDE",
    beeswarm="03_Beeswarm",
    hist="04_Hist",
    violin="05_Violin",
)
for f in folders.values():
    target_dir = os.path.join(output_folder, f)
    os.makedirs(target_dir, exist_ok=True)

# ================================
# Load the classification raster (codes 2, 3, 4)
# ================================
with rasterio.open(result_tif) as src:
    cls_arr = src.read(1)

# True wherever the pixel carries one of the three class codes
valid_mask = (cls_arr == 2) | (cls_arr == 3) | (cls_arr == 4)

# ================================
# 读取驱动力因子
# ================================
def read_raster_as_array(path):
    """Read band 1 of the raster at ``path`` and return it converted to float."""
    with rasterio.open(path) as dataset:
        band = dataset.read(1)
    return band.astype(float)

# Sorted so that the processing order (and hence the seeded sampling below)
# is the same on every run; os.listdir gives no ordering guarantee.
driver_files = sorted(
    os.path.join(driver_folder, f)
    for f in os.listdir(driver_folder)
    if f.endswith(".tif")
)

driver_names = [os.path.splitext(os.path.basename(f))[0] for f in driver_files]

# Per-driver value collections
all_samples = {}                      # driver name -> random sample of all finite pixels
cls_samples = {2: {}, 3: {}, 4: {}}   # class code -> {driver name -> finite class pixels}

max_all_samples = 200000                   # cap on the size of the overall sample
_sample_rng = np.random.default_rng(42)    # fixed seed: figures are reproducible

# The class masks do not depend on the driver raster: build them once.
cls_masks = {code: cls_arr == code for code in cls_samples}

for name, file in zip(driver_names, driver_files):
    arr = read_raster_as_array(file)

    # Overall sample: random draw (without replacement) from the finite pixels
    all_valid = arr[np.isfinite(arr)]
    sample_size = min(max_all_samples, len(all_valid))
    all_samples[name] = _sample_rng.choice(all_valid, sample_size, replace=False)

    # Class values: drop NaN/inf here as well, otherwise the KDE and
    # histogram steps fail on them while the overall series is clean.
    for code, mask in cls_masks.items():
        values = arr[mask]
        cls_samples[code][name] = values[np.isfinite(values)]

# ================================
# 绘图函数
# ================================
def plot_box(name):
    """Save a boxplot of driver ``name`` for the overall sample and classes 2/3/4.

    Reads the module-level ``all_samples`` / ``cls_samples`` and writes
    ``<name>_box.png`` into the boxplot output folder.
    """
    groups = [
        ("All", all_samples[name]),
        ("Class2", cls_samples[2][name]),
        ("Class3", cls_samples[3][name]),
        ("Class4", cls_samples[4][name]),
    ]

    # Long format: all values stacked, each tagged with its group label
    values = np.concatenate([vals for _, vals in groups])
    labels = []
    for label, vals in groups:
        labels.extend([label] * len(vals))
    df = pd.DataFrame({"Value": values, "Class": labels})

    plt.figure(figsize=(10, 6))
    sns.boxplot(data=df, x="Class", y="Value")
    plt.title(f"Boxplot - {name}")
    plt.tight_layout()
    out_path = os.path.join(output_folder, folders["box"], f"{name}_box.png")
    plt.savefig(out_path)
    plt.close()


def plot_kde(name):
    """Save Gaussian-KDE density curves of driver ``name`` for each group.

    One curve is drawn per group (overall sample, classes 2/3/4), each over
    the 1st-99th percentile range of its own data, and the figure is written
    to ``<name>_kde.png`` in the KDE output folder.

    A group is skipped (with a message) when it has fewer than two finite
    values or when all its finite values are equal: ``gaussian_kde`` raises
    on such input, which would otherwise abort the whole plotting run.
    """
    groups = (
        ("All", all_samples[name]),
        ("Class2", cls_samples[2][name]),
        ("Class3", cls_samples[3][name]),
        ("Class4", cls_samples[4][name]),
    )

    plt.figure(figsize=(10, 6))
    drawn = 0
    for label, data in groups:
        data = np.asarray(data, dtype=float)
        data = data[np.isfinite(data)]   # NaN/inf would poison both the KDE and the percentiles
        if data.size < 2 or np.ptp(data) == 0:
            print(f"  KDE skipped for {label} ({name}): not enough distinct finite values")
            continue
        kde = gaussian_kde(data)
        x_min, x_max = np.percentile(data, [1, 99])
        xs = np.linspace(x_min, x_max, 400)
        plt.plot(xs, kde(xs), label=label)
        drawn += 1

    plt.title(f"KDE - {name}")
    if drawn:
        # Calling legend() without any labelled curve only triggers a warning.
        plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, folders["kde"], f"{name}_kde.png"))
    plt.close()


def plot_beeswarm(name, max_points=2000):
    """Save a beeswarm plot of driver ``name`` for the overall sample and classes 2/3/4.

    Parameters
    ----------
    name : driver name, a key of ``all_samples`` / ``cls_samples[...]``.
    max_points : maximum number of points drawn per group (default 2000).

    Groups larger than ``max_points`` are thinned by a seeded random draw
    rather than by taking the leading values, which for the class arrays are
    the first pixels in raster order. The label column is built from the
    actual number of values kept, so groups with fewer than ``max_points``
    values no longer make the DataFrame construction fail on mismatched
    column lengths.
    """
    groups = (
        ("All", all_samples[name]),
        ("Class2", cls_samples[2][name]),
        ("Class3", cls_samples[3][name]),
        ("Class4", cls_samples[4][name]),
    )

    rng = np.random.default_rng(0)   # fixed seed: the same points are drawn on every run
    values, labels = [], []
    for label, data in groups:
        data = np.asarray(data)
        if len(data) > max_points:
            data = data[rng.choice(len(data), max_points, replace=False)]
        values.append(data)
        labels.extend([label] * len(data))

    df = pd.DataFrame({"Value": np.concatenate(values), "Class": labels})

    plt.figure(figsize=(10, 6))
    sns.swarmplot(data=df, x="Class", y="Value", size=3)
    plt.title(f"Beeswarm - {name}")
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, folders["beeswarm"], f"{name}_beeswarm.png"))
    plt.close()


def plot_hist(name):
    """Save overlaid histograms of driver ``name`` for the overall sample and classes 2/3/4.

    All groups share one set of 40 bin edges computed from the pooled finite
    values, so bars of different groups cover the same intervals and can be
    compared directly. Non-finite values are dropped first, because
    ``plt.hist`` cannot derive a range from data containing NaN.
    The figure is written to ``<name>_hist.png`` in the histogram folder.
    """
    groups = []
    for label, data in (
        ("All", all_samples[name]),
        ("Class2", cls_samples[2][name]),
        ("Class3", cls_samples[3][name]),
        ("Class4", cls_samples[4][name]),
    ):
        data = np.asarray(data, dtype=float)
        groups.append((label, data[np.isfinite(data)]))

    pooled = np.concatenate([vals for _, vals in groups])
    # With no data at all there is nothing to derive edges from: keep the plain bin count.
    bins = np.histogram_bin_edges(pooled, bins=40) if pooled.size else 40

    plt.figure(figsize=(10, 6))
    for label, vals in groups:
        plt.hist(vals, bins=bins, alpha=0.5, label=label)
    plt.title(f"Histogram - {name}")
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, folders["hist"], f"{name}_hist.png"))
    plt.close()


def plot_violin(name):
    """Save a violin plot of driver ``name`` for the overall sample and classes 2/3/4.

    Reads the module-level ``all_samples`` / ``cls_samples`` and writes
    ``<name>_violin.png`` into the violin output folder.
    """
    series = {
        "All": all_samples[name],
        "Class2": cls_samples[2][name],
        "Class3": cls_samples[3][name],
        "Class4": cls_samples[4][name],
    }

    # Stack the values and repeat each group label once per value
    stacked = np.concatenate(list(series.values()))
    tags = [label for label, vals in series.items() for _ in range(len(vals))]
    df = pd.DataFrame({"Value": stacked, "Class": tags})

    plt.figure(figsize=(10, 6))
    sns.violinplot(data=df, x="Class", y="Value")
    plt.title(f"Violin - {name}")
    plt.tight_layout()
    target = os.path.join(output_folder, folders["violin"], f"{name}_violin.png")
    plt.savefig(target)
    plt.close()


# ================================
# Main loop: draw every plot type for every driver factor
# ================================
plotters = (plot_box, plot_kde, plot_beeswarm, plot_hist, plot_violin)

for name in driver_names:
    print(f"绘图中: {name} …")
    for draw in plotters:
        draw(name)

print("✔ 所有图已生成完毕！")
