In [1]:
import pandas as pd
import os
from pathlib import Path
import time
import requests

# Local cache directory for the downloaded CSV files; created if missing.
DATA_DIR = Path("covid_data")
DATA_DIR.mkdir(exist_ok=True)

# The time-series files are fetched through the jsDelivr CDN mirror of the
# CSSEGISandData/COVID-19 repository instead of connecting to GitHub directly.
_CSSE_TIME_SERIES_BASE = (
    "https://cdn.jsdelivr.net/gh/CSSEGISandData/COVID-19@master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
)

# Maps each series name to the URL of its global time-series CSV.
MIRROR_URLS = {
    series: f"{_CSSE_TIME_SERIES_BASE}time_series_covid19_{series}_global.csv"
    for series in ("confirmed", "deaths", "recovered")
}

def robust_download(url, filename, max_retries=3):
    """Download a CSV into the local cache and return it as a DataFrame.

    A cached copy under ``DATA_DIR`` that is less than three days old is
    returned without touching the network. Otherwise the URL is fetched up
    to ``max_retries`` times, waiting 5 s, 10 s, 15 s, ... between attempts.
    Each payload is first written to a temporary ``.part`` file and parsed;
    only then does it replace the cached file, so a truncated or non-CSV
    response can never overwrite a usable older copy.

    Args:
        url: Address of the CSV file to fetch.
        filename: Name of the cache file inside ``DATA_DIR``.
        max_retries: Maximum number of download attempts.

    Returns:
        A DataFrame parsed from the fresh download, or from the stale cached
        copy when every attempt failed.

    Raises:
        ConnectionError: If no attempt succeeded and no cached copy exists.
    """
    filepath = DATA_DIR / filename

    # Reuse the cached file when it is recent enough (3 days = 259200 s).
    if filepath.exists() and (time.time() - os.path.getmtime(filepath)) < 259200:
        print(f"使用本地缓存: {filename}")
        return pd.read_csv(filepath)

    # Download next to the target so os.replace stays on one filesystem.
    tmp_path = filepath.with_name(filepath.name + ".part")
    last_error = None

    for attempt in range(max_retries):
        try:
            print(f"尝试下载 {filename} (尝试 {attempt+1}/{max_retries})...")

            # requests surfaces HTTP status errors explicitly.
            response = requests.get(url, timeout=15)
            response.raise_for_status()

            with open(tmp_path, 'wb') as f:
                f.write(response.content)

            # Parse before promoting: a bad payload must not clobber the
            # existing cache that the fallback below relies on.
            data = pd.read_csv(tmp_path)
            os.replace(tmp_path, filepath)
        except Exception as e:
            # Broad on purpose: network, HTTP, disk and parse failures are
            # all handled the same way (retry, then fall back).
            last_error = e
            print(f"下载失败: {str(e)}")
            try:
                tmp_path.unlink()
            except OSError:
                pass  # best-effort cleanup; the file may never have been created
            if attempt < max_retries - 1:
                wait_time = 5 * (attempt + 1)  # linear back-off: 5 s, 10 s, ...
                print(f"等待 {wait_time}秒后重试...")
                time.sleep(wait_time)
        else:
            print(f"下载成功! 保存到: {filepath}")
            return data

    # Every attempt failed (or max_retries <= 0): fall back to stale data
    # instead of silently returning None.
    print("达到最大重试次数")
    if filepath.exists():
        print("使用旧版本本地数据")
        return pd.read_csv(filepath)
    raise ConnectionError(f"无法下载数据且无本地缓存: {filename}") from last_error

# Load the three datasets, framing the progress output with banner lines.
banner = "=" * 50

print(banner)
print("开始加载COVID-19数据集")
print(banner)

# Each call prints its own progress and returns the parsed DataFrame.
confirmed = robust_download(MIRROR_URLS["confirmed"], "confirmed_global.csv")
deaths = robust_download(MIRROR_URLS["deaths"], "deaths_global.csv")
recovered = robust_download(MIRROR_URLS["recovered"], "recovered_global.csv")

print(banner)
print("数据集加载完成!")
print(banner)

# Sanity check: report the (rows, columns) shape of every dataset.
print("\n数据概览:")
print(f"确诊数据: {confirmed.shape}")
print(f"死亡数据: {deaths.shape}")
print(f"康复数据: {recovered.shape}")

开始加载COVID-19数据集
尝试下载 confirmed_global.csv (尝试 1/3)...
下载成功! 保存到: covid_data\confirmed_global.csv
尝试下载 deaths_global.csv (尝试 1/3)...
下载成功! 保存到: covid_data\deaths_global.csv
尝试下载 recovered_global.csv (尝试 1/3)...
下载成功! 保存到: covid_data\recovered_global.csv
数据集加载完成!

数据概览:
确诊数据: (289, 1147)
死亡数据: (289, 1147)
康复数据: (274, 1147)
