#### 1.1 Reorganize all csv files.

In [10]:
import os
import shutil
from glob import glob  
from tqdm import tqdm
from pathlib import Path

# Flatten every "城市*" folder under the data root, drop the "china_cities_"
# prefix from its CSV file names, and rename the folder to characters 3-6 of
# its name (e.g. "城市_20250101-20250329" -> "2025").
data_root = r".\Data"
directories = glob(os.path.join(data_root, r'城市*'))
print(directories)
for directory in tqdm(directories):
    folder_name = Path(directory).name
    # Handle the layout where the CSVs sit in a nested folder that has the
    # same name as the outer one.
    inner_directory = os.path.join(directory, folder_name)
    if os.path.exists(inner_directory):
        for csv_path in glob(os.path.join(inner_directory, '*.csv')):
            shutil.move(csv_path,
                        os.path.join(directory, Path(csv_path).name.replace("china_cities_", "")))
        # The inner folder is a directory, so os.remove() would raise here;
        # os.rmdir() removes it and still fails loudly if any non-CSV file
        # was left behind instead of silently deleting it.
        os.rmdir(inner_directory)
    else:
        for csv_path in glob(os.path.join(directory, '*.csv')):
            # Strip the prefix from the file name only, never from the
            # directory part of the path.
            os.rename(csv_path,
                      os.path.join(directory, Path(csv_path).name.replace("china_cities_", "")))
    os.rename(directory, os.path.join(Path(directory).parent, folder_name[3:7]))

['.\\Data\\城市_20250101-20250329']


100%|██████████| 1/1 [00:00<00:00, 142.03it/s]


#### 1.2 Get a directory tree.

In [None]:
from rich.tree import Tree
from rich import print
import os

def build_tree(path, tree):
    """Recursively add the contents of ``path`` as children of ``tree``.

    Entries are visited in sorted name order. A directory becomes a branch
    labelled ``[bold blue]<name>/`` whose own contents are added beneath it;
    anything else becomes a leaf labelled with its bare name.

    Args:
        path: Directory whose entries are listed.
        tree: Node exposing ``add(label)``, which returns the new child node.
    """
    for name in sorted(os.listdir(path)):
        child_path = os.path.join(path, name)
        if not os.path.isdir(child_path):
            tree.add(name)
            continue
        # Descend into the sub-directory, attaching its entries to the new branch.
        build_tree(child_path, tree.add(f"[bold blue]{name}/"))

# Build a tree rooted at the data folder and print it with rich.
root_path = r".\Data"
root_label = f"[bold green]{os.path.basename(root_path)}/"
tree = Tree(root_label)
build_tree(root_path, tree)
print(tree)

#### 1.3 Convert all CSV files from wide to long format (city columns become rows)

In [14]:
import pandas as pd
from glob import glob
from pathlib import Path
from tqdm import tqdm

# Input root: every "<root>/<subdir>/*.csv" file is converted from wide to
# long format and written next to the original as "<stem>_long.csv".
input_dir_root = "./Data"
# Skip "*_long.csv" files so that re-running this cell does not melt its own
# output again and produce "*_long_long.csv" files.
csvs = [
    path
    for path in glob(os.path.join(input_dir_root, '*/*.csv'))
    if not Path(path).stem.endswith("_long")
]

# Stays None when nothing was converted, so the final message can be skipped.
output_csv_path = None

for csv_path in tqdm(csvs):
    output_csv_path = os.path.join(Path(csv_path).parent, f"{Path(csv_path).stem}_long.csv")

    # Read the original wide-format CSV.
    df = pd.read_csv(csv_path)

    # Every column from the 4th one (index 3) onwards is treated as a city column.
    city_columns = df.columns[3:]

    # Wide -> long: one row per (date, hour, type, city) combination.
    long_df = pd.melt(
        df,
        id_vars=["date", "hour", "type"],
        value_vars=city_columns,
        var_name="city",
        value_name="value"
    )

    # Drop the rows whose value is NaN.
    long_df = long_df.dropna(subset=["value"])

    # Save the long-format table as a new CSV file.
    long_df.to_csv(output_csv_path, index=False, encoding="utf-8")

# Report the last file written; guarded so an empty input folder does not
# raise NameError on an undefined output path.
if output_csv_path is not None:
    print(f"✅ 转换成功，保存为：{output_csv_path}")


100%|██████████| 3958/3958 [07:37<00:00,  8.64it/s]

✅ 转换成功，保存为：Data/2016/20160810_long.csv





#### 1.4 Build auxiliary JSON index files (dates → cities, cities → dates)

In [None]:
import os
import pandas as pd
import json
from glob import glob
from tqdm import tqdm
from pathlib import Path

# Input root: every "<root>/<subdir>/*long.csv" file is indexed.
input_dir_root = "./Data"
# Sorted so files are visited in a reproducible order; glob() alone returns
# them in arbitrary order, which made the city -> date mapping below
# depend on the file system.
csvs = sorted(glob(os.path.join(input_dir_root, '*/*long.csv')))

long_suffix = "_long"
valid_dates = dict()   # date -> [[city, ...]]
valid_cities = dict()  # city -> date

for csv_path in tqdm(csvs):
    stem = Path(csv_path).stem
    # str.strip('_long') removes any of the characters "_", "l", "o", "n", "g"
    # from both ends rather than the literal suffix, so cut the suffix explicitly.
    date = stem[:-len(long_suffix)] if stem.endswith(long_suffix) else stem

    df = pd.read_csv(csv_path)
    # Sorted (instead of raw set order) so the JSON output is stable between runs.
    cities = sorted(set(df['city']), key=str)

    # NOTE(review): each city keeps only the date of the last file (in sorted
    # path order) that contains it - presumably the most recent date given
    # YYYYMMDD-style names. If every date is wanted, this should collect a
    # list instead; confirm with the consumers of cities_to_dates_index.json.
    for city in cities:
        valid_cities[city] = date

    # NOTE(review): the city list is wrapped in an outer one-element list,
    # exactly as before, to keep the JSON shape unchanged - confirm whether
    # the extra nesting is intended.
    valid_dates[date] = [cities]

# ensure_ascii=False writes non-ASCII city names verbatim, so the files must be
# opened as UTF-8 explicitly instead of relying on the platform default encoding.
with open('./dates_to_cities_index.json', 'w', encoding='utf-8') as f:
    json.dump(valid_dates, f, ensure_ascii=False, indent=4)
with open('./cities_to_dates_index.json', 'w', encoding='utf-8') as f:
    json.dump(valid_cities, f, ensure_ascii=False, indent=4)

100%|██████████| 3958/3958 [02:40<00:00, 24.67it/s]


#### 2. Because of the large amount of data, we average the 24 hourly air-quality readings of each day into a single daily value