# 省内城市年均温与月均温离散度分析

本笔记本使用 `station.csv` 与 `dump/china_data_insert.sql` 中的 `City_Temp` 数据，
计算省内各城市年均温与各月月均温的离散程度（以总体标准差衡量），并输出：

- 年均温差异最大的两个省份的各市全年均温柱状图
- 单月月均温离散度最高的省份中，差异最大的 3 个月份的各市月均温柱状图

输出图片同时保存为 PNG 与 SVG（位于 `Figure/png` 与 `Figure/svg`）。


In [None]:
from __future__ import annotations

import csv
import re
from collections import defaultdict
from statistics import mean, pstdev

import matplotlib.pyplot as plt

from config import FIGURE_PNG_DIR, FIGURE_SVG_DIR, SQL_PATH, STATION_PATH


In [None]:
def load_city_provinces(station_path: str):
    """Build a city -> province lookup from the station CSV file.

    The first row is treated as a header and skipped. The province is read
    from the third column and the city from the fourth, both stripped of
    surrounding whitespace. Rows with fewer than four columns, or with a
    blank city or province, are ignored. When a city appears more than
    once, the province from its first occurrence is kept.

    Args:
        station_path: Path to the UTF-8 encoded station CSV file.

    Returns:
        A dict mapping city name to province name.
    """
    mapping = {}
    with open(station_path, newline="", encoding="utf-8") as handle:
        rows = csv.reader(handle)
        next(rows, None)  # discard the header row (no-op on an empty file)
        for fields in rows:
            if len(fields) >= 4:
                province_name = fields[2].strip()
                city_name = fields[3].strip()
                if city_name and province_name:
                    # setdefault keeps the first province seen for a city.
                    mapping.setdefault(city_name, province_name)
    return mapping


def load_city_temps(sql_path: str, city_to_province: dict[str, str]):
    """Yield ``(province, city, month, temp)`` rows from the SQL dump.

    Every ``INSERT INTO City_Temp`` statement in the file is scanned, whether
    it is written on one line or spread over several. A statement is taken to
    end on the first line that contains ``;``. Lines outside such statements
    are ignored, so rows belonging to other tables are never parsed.

    Value tuples are expected in the form ``(<month>, '<city>', <temp>)``.
    Tuples whose city has no entry in ``city_to_province`` are skipped.

    Args:
        sql_path: Path to the UTF-8 encoded SQL dump.
        city_to_province: Lookup from city name to province name.

    Yields:
        Tuples of ``(province, city, month, temp)`` with ``month`` as ``int``
        and ``temp`` as ``float``.
    """
    row_pattern = re.compile(r"\((\d+),\s*'([^']+)',\s*([-\d.]+)\)")
    # Tolerate leading whitespace, flexible spacing and a quoted/backticked
    # table name; \b keeps tables such as City_Temp_Backup from matching.
    insert_start = re.compile(r'\s*insert\s+into\s+[`"]?city_temp\b', re.IGNORECASE)
    inside_statement = False

    with open(sql_path, encoding="utf-8") as file:
        for line in file:
            if not inside_statement:
                if not insert_start.match(line):
                    continue
                inside_statement = True

            for match in row_pattern.finditer(line):
                city = match.group(2)
                province = city_to_province.get(city)
                if not province:
                    continue
                yield province, city, int(match.group(1)), float(match.group(3))

            # Check the terminator on every line of the statement, including
            # the INSERT line itself, so a single-line statement does not
            # pull the following statement into the scan.
            if ";" in line:
                inside_statement = False


def build_city_monthly_map(city_month_temps):
    """Group temperature rows into a province -> city -> month -> temp map.

    Args:
        city_month_temps: Iterable of ``(province, city, month, temp)``
            tuples.

    Returns:
        A nested ``defaultdict``; missing provinces and cities are created
        on access. If the same (province, city, month) occurs more than
        once, the last value wins.
    """
    nested = defaultdict(lambda: defaultdict(dict))
    for record in city_month_temps:
        province, city, month, temp = record
        monthly = nested[province][city]
        monthly[month] = temp
    return nested


def compute_annual_means(province_city_month):
    """Average each city's monthly temperatures into one annual mean.

    Args:
        province_city_month: Mapping of province -> city -> month -> temp.

    Returns:
        A ``defaultdict(dict)`` mapping province -> city -> mean of that
        city's monthly values. Cities with no monthly values are left out,
        and a province only appears if at least one of its cities has data.
    """
    annual = defaultdict(dict)
    for province, cities in province_city_month.items():
        averaged = {
            city: mean(monthly.values())
            for city, monthly in cities.items()
            if monthly
        }
        # Only register the province once it has at least one usable city.
        if averaged:
            annual[province] = averaged
    return annual


def compute_top_provinces(province_city_annual, count=2):
    """Return the provinces whose city annual means are most spread out.

    Spread is the population standard deviation of a province's city annual
    means. Provinces with fewer than two cities are excluded. Ties in spread
    are ordered by province name, descending.

    Args:
        province_city_annual: Mapping of province -> city -> annual mean.
        count: Maximum number of provinces to return.

    Returns:
        A list of province names, largest spread first.
    """
    ranked = sorted(
        (
            (pstdev(city_means.values()), province)
            for province, city_means in province_city_annual.items()
            if len(city_means) >= 2
        ),
        reverse=True,
    )
    return [name for _spread, name in ranked[:count]]


def compute_monthly_dispersion(province_city_month):
    """Rank every (province, month) pair by the spread of its city values.

    For each province, the temperatures of all its cities are pooled per
    month and their population standard deviation is computed. Months with
    fewer than two city values are skipped.

    Args:
        province_city_month: Mapping of province -> city -> month -> temp.

    Returns:
        A list of ``(dispersion, province, month)`` tuples sorted in
        descending order.
    """
    results = []
    for province, cities in province_city_month.items():
        # Pool the readings of all cities in this province by month.
        by_month = defaultdict(list)
        for monthly in cities.values():
            for month, temp in monthly.items():
                by_month[month].append(temp)
        results.extend(
            (pstdev(readings), province, month)
            for month, readings in by_month.items()
            if len(readings) >= 2
        )
    return sorted(results, reverse=True)


In [None]:
def plot_horizontal_bar(labels, values, title, xlabel, output_basename):
    """Draw a horizontal bar chart and save it as both PNG and SVG.

    Args:
        labels: Bar labels, one per bar.
        values: Bar lengths, in the same order as ``labels``.
        title: Chart title.
        xlabel: Label for the x axis.
        output_basename: File name without extension; ``.png`` and ``.svg``
            are appended for the two outputs.

    Returns:
        A ``(png_path, svg_path)`` tuple of the files written under
        ``FIGURE_PNG_DIR`` and ``FIGURE_SVG_DIR``.
    """
    # Grow the figure with the number of bars, but never below a height of 4.
    height = max(4, 0.35 * len(labels))
    figure, axes = plt.subplots(figsize=(12, height))
    axes.barh(labels, values, color="#4C78A8")
    axes.invert_yaxis()
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    figure.tight_layout()

    # Make sure both output directories exist before saving.
    for directory in (FIGURE_PNG_DIR, FIGURE_SVG_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    png_path = FIGURE_PNG_DIR / f"{output_basename}.png"
    svg_path = FIGURE_SVG_DIR / f"{output_basename}.svg"
    figure.savefig(png_path, dpi=150)
    figure.savefig(svg_path)

    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(figure)

    return png_path, svg_path


In [None]:
# Load the city -> province lookup, then the temperature rows whose city
# appears in that lookup.
city_to_province = load_city_provinces(STATION_PATH)
city_month_temps = list(load_city_temps(SQL_PATH, city_to_province))
# Reshape into province -> city -> month -> temp and derive each city's
# annual mean from its monthly values.
province_city_month = build_city_monthly_map(city_month_temps)
province_city_annual = compute_annual_means(province_city_month)

# The two provinces whose city annual means have the largest population
# standard deviation.
top_provinces = compute_top_provinces(province_city_annual, count=2)
# (dispersion, province, month) tuples, largest dispersion first.
monthly_dispersion = compute_monthly_dispersion(province_city_month)

if not monthly_dispersion:
    raise ValueError("No monthly dispersion data found.")

# Pick the province that owns the single most dispersed month, then take its
# three most dispersed months (the list is already sorted, so order is kept).
selected_province = monthly_dispersion[0][1]
months_for_selected = [month for _, province, month in monthly_dispersion if province == selected_province][:3]

# Bare expression: displayed as the cell output when run in a notebook.
top_provinces, selected_province, months_for_selected


In [None]:
# Render the charts (this writes PNG and SVG files).

# One chart per top-ranked province: each city's annual mean, highest first.
for province in top_provinces:
    city_means = province_city_annual[province]
    sorted_items = sorted(
        city_means.items(), key=lambda pair: pair[1], reverse=True
    )
    labels = [pair[0] for pair in sorted_items]
    values = [pair[1] for pair in sorted_items]
    plot_horizontal_bar(
        labels,
        values,
        f"{province} 各市全年均温",
        "全年均温（原始单位）",
        f"annual_mean_{province}",
    )

# One chart per selected month: each city's mean for that month, highest first.
for month in months_for_selected:
    # Keep only the cities that actually have a value for this month.
    city_temps = {
        city: temps[month]
        for city, temps in province_city_month[selected_province].items()
        if temps.get(month) is not None
    }
    sorted_items = sorted(
        city_temps.items(), key=lambda pair: pair[1], reverse=True
    )
    labels = [pair[0] for pair in sorted_items]
    values = [pair[1] for pair in sorted_items]
    plot_horizontal_bar(
        labels,
        values,
        f"{selected_province} {month}月各市月均温",
        f"{month}月均温（原始单位）",
        f"monthly_mean_{selected_province}_{month}",
    )
