In [None]:
import time
from typing import Generator

import pandas as pd

from utils import chinese_date_to_date, get_all_csv_filepaths
import cpca


def get_chunks(file_path: str, chunk_size: int) -> Generator[pd.DataFrame, None, None]:
    """Lazily yield successive DataFrame chunks read from a CSV file.

    Args:
        file_path: Path of the CSV file to read.
        chunk_size: Maximum number of rows per yielded chunk.

    Yields:
        DataFrames of at most ``chunk_size`` rows, in file order.
    """
    # Using the reader as a context manager closes the underlying file handle
    # as soon as the generator finishes, is closed, or fails, instead of
    # leaving it open until garbage collection.
    with pd.read_csv(file_path, chunksize=chunk_size) as reader:
        yield from reader


def process_chunk(chunk: pd.DataFrame) -> pd.DataFrame:
    """Filter one chunk of court documents and add date and region columns.

    A row is kept only if all of the following hold:

    * "案件类型" is "刑事案件" or "民事案件";
    * "审理程序" contains "一审";
    * "文书内容" is non-null and contains a Chinese-numeral date that
      ``chinese_date_to_date`` plus ``pd.to_datetime`` turn into a valid date.

    Added columns:

    * "判决时间": the first matching date in "文书内容", as a datetime.
    * "审判法院所在地址": the first run of one to three segments ending in
      省/市/区/县 that is immediately followed by "人民法院" (NaN if none).
    * The columns returned by ``cpca.transform`` for that address.

    Args:
        chunk: Raw rows containing at least the "案件类型", "审理程序" and
            "文书内容" columns. It is not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    # Cast to object so the .str accessor also works when every value in the
    # chunk is NaN (pandas would otherwise infer float dtype and raise).
    procedure = chunk["审理程序"].astype(object)
    keep = (
        chunk["案件类型"].isin(["刑事案件", "民事案件"])
        # na=False treats missing procedures as "no match", which also covers
        # the explicit not-null filter.
        & procedure.str.contains("一审", na=False)
        & chunk["文书内容"].notna()
    )
    # Work on explicit copies: assigning columns on a boolean-indexed slice
    # triggers SettingWithCopyWarning.
    rows = chunk[keep].copy()

    # Extract the judgment date written in Chinese numerals.
    rows["判决时间"] = (
        rows["文书内容"]
        .astype(object)
        .str.extract(
            r"(二〇[一二三四五六七八九十〇]{2}年[一二三四五六七八九十〇]{1,2}月[一二三四五六七八九十〇]{1,3}日)",
            expand=False,
        )
    )
    rows = rows[rows["判决时间"].notna()].copy()

    # Convert the extracted text to datetimes; unparseable values become NaT
    # and are dropped.
    rows["判决时间"] = pd.to_datetime(
        rows["判决时间"].apply(chinese_date_to_date),
        format="%Y-%m-%d",
        errors="coerce",
    )
    rows = rows[rows["判决时间"].notna()].copy()

    # Extract the location text that precedes "人民法院".
    rows["审判法院所在地址"] = (
        rows["文书内容"]
        .astype(object)
        .str.extract(r"((?:[^省市区县]*[省市区县]){1,3})(?=人民法院)", expand=False)
    )

    # Both frames get a fresh positional index so the column-wise concat
    # lines up row by row.
    rows = rows.reset_index(drop=True)
    regions = cpca.transform(rows["审判法院所在地址"]).reset_index(drop=True)
    return pd.concat([rows, regions], axis=1)


def process_file(file_path: str, chunk_size: int) -> pd.DataFrame:
    """Process one CSV file chunk by chunk and return the combined result.

    Args:
        file_path: Path of the CSV file to process.
        chunk_size: Number of rows read and processed at a time.

    Returns:
        The processed chunks concatenated with a fresh index, or an empty
        DataFrame when the file yields no chunks.
    """
    # Concatenate once at the end: calling pd.concat inside the loop re-copies
    # everything accumulated so far for every chunk.
    processed = [process_chunk(chunk) for chunk in get_chunks(file_path, chunk_size)]
    if not processed:
        # pd.concat raises on an empty list.
        return pd.DataFrame()
    return pd.concat(processed, ignore_index=True)


# Process every CSV file, merge the results, save them, and report the elapsed time.
start_time = time.time()
file_paths = get_all_csv_filepaths()
# Collect the per-file results and concatenate once; concatenating inside the
# loop would re-copy the accumulated frame for every file.
per_file_frames = []
for file_path in file_paths:
    result_df = process_file(file_path, chunk_size=10000)
    per_file_frames.append(result_df)
# pd.concat raises on an empty list, so fall back to an empty frame.
final_df = (
    pd.concat(per_file_frames, ignore_index=True)
    if per_file_frames
    else pd.DataFrame()
)
# Drop the extra references so the per-file frames can be garbage collected.
del per_file_frames
# Save as a CSV file
final_df.to_csv("processed_data.csv", index=False)
end_time = time.time()
print(f"Total time: {end_time - start_time} seconds.")

In [None]:
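# Optional: uncomment the lines below to reload previously saved results
# from disk instead of re-running the processing step.
# import pandas as pd

# # Load the processed data
# final_df = pd.read_csv("processed_data.csv")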

In [None]:

from pyecharts import options as opts
from pyecharts.charts import Map

# Count the number of cases per city
city_counts = final_df["市"].value_counts().reset_index()
city_counts.columns = ["市", "案件数量"]

# pyecharts expects a list of [name, value] pairs
city_case_pairs = [
    [city, case_count]
    for city, case_count in zip(city_counts["市"], city_counts["案件数量"])
]

# Build the chart step by step: one series on the city-level China map,
# with per-region labels hidden.
map_chart = Map()
map_chart.add(
    "案件数量",
    city_case_pairs,
    "china-cities",
    label_opts=opts.LabelOpts(is_show=False),
)
map_chart.set_global_opts(
    title_opts=opts.TitleOpts(title="各市案件数量分布"),
    visualmap_opts=opts.VisualMapOpts(),
)

map_chart.render_notebook()
