In [21]:

import pandas as pd
from pymongo import MongoClient
from tqdm.notebook import tqdm # 在 Jupyter 中使用 tqdm.notebook 获得更好的进度条体验
from datetime import date
# Database connection settings.

# MongoDB URI handed to MongoClient when the connection is opened.
MONGO_CONNECTION_STRING = "mongodb://localhost:27017/"
# Name of the database selected from the client.
DB_NAME = "barra_financial_data"


# 创建一个辅助函数，用于将集合加载到 DataFrame

def load_filtered_collection_to_df(db, collection_name: str, query: dict, projection: dict) -> pd.DataFrame:
    """Load the documents matching ``query`` from one collection into a DataFrame.

    Args:
        db: Database handle (or any mapping) that yields a collection object
            via ``db[collection_name]``.
        collection_name: Name of the collection to read.
        query: Filter document passed to ``find``.
        projection: Projection document passed to ``find``.

    Returns:
        A DataFrame with one row per returned document. When nothing matches
        and ``projection`` is an inclusion-style dict, the empty frame still
        carries the projected column names, so column access downstream does
        not raise ``KeyError``.
    """
    print(f"正在从 '{collection_name}' 加载精确数据...")
    cursor = db[collection_name].find(query, projection)
    records = list(cursor)
    df = pd.DataFrame(records)

    if not records and isinstance(projection, dict):
        # With zero documents pandas has no keys to infer columns from, so
        # rebuild them from the fields the projection explicitly includes.
        columns = [field for field, flag in projection.items() if flag and field != "_id"]
        # MongoDB returns ``_id`` alongside included fields unless it is switched off.
        if columns and projection.get("_id", 1):
            columns.insert(0, "_id")
        # An exclusion-only projection does not name the returned fields;
        # in that case the column-less empty frame is kept as before.
        if columns:
            df = pd.DataFrame(columns=columns)

    print(f"-> 成功加载 {len(df):,} 行数据。")
    return df

# Connect to the database.
client = MongoClient(MONGO_CONNECTION_STRING)
# MongoClient connects lazily: the constructor returns even when no server is
# reachable. Round-trip a cheap command so the success message below is only
# printed once a server has actually answered.
client.admin.command("ping")
db = client[DB_NAME]
print("成功连接到 MongoDB。")

成功连接到 MongoDB。


In [None]:
# Date window as YYYYMMDD strings: fixed start, open-ended up to today.
start_date_str = "20200101"
end_date_str = f"{date.today():%Y%m%d}"
print(f"数据筛选范围: {start_date_str} 到 {end_date_str}")

# ----------------------------------------------------
# 1. Filters
# ----------------------------------------------------
# Daily prices: keep rows whose trade_date lies inside the window (inclusive).
daily_prices_query = {"trade_date": {"$gte": start_date_str, "$lte": end_date_str}}

# Financial data is filtered on the announcement date (ann_date).
# Reports may be announced somewhat late, so the lower bound is moved earlier
# to make sure the data for early 2020 is covered.
financial_ann_start_date = "20191201"
financial_query = {"ann_date": {"$gte": financial_ann_start_date, "$lte": end_date_str}}

# ----------------------------------------------------
# 2. Projections - select only the columns that are needed
# ----------------------------------------------------
# This can greatly reduce memory usage.
daily_prices_projection = {
    **dict.fromkeys(
        ("ts_code", "trade_date", "close", "turnover_rate", "total_mv", "circ_mv"), 1
    ),
    "_id": 0,
}

# Financial data: only a few key indicators, as an example.
financial_projection = {
    **dict.fromkeys(
        (
            "ts_code", "ann_date", "end_date", "q_profit_yoy", "roe",
            "debt_to_assets", "n_cashflow_act", "total_revenue",
        ),
        1,
    ),
    "_id": 0,
}

数据筛选范围: 20200101 到 20251017


In [23]:
# Pull the filtered, projected daily price rows into a DataFrame.
daily_prices_df = load_filtered_collection_to_df(
    db=db,
    collection_name="daily_prices",
    query=daily_prices_query,
    projection=daily_prices_projection,
)

正在从 'daily_prices' 加载精确数据...
-> 成功加载 6,598,385 行数据。


In [27]:
# Show the rows of a single stock (000001.SZ), sorted by trade_date ascending.
daily_prices_df.loc[daily_prices_df["ts_code"].eq("000001.SZ")].sort_values(by="trade_date")

Unnamed: 0,ts_code,trade_date,close,turnover_rate,total_mv,circ_mv
1399,000001.SZ,20200102,16.87,0.7885,3.273778e+07,3.273750e+07
1398,000001.SZ,20200103,17.18,0.5752,3.333937e+07,3.333908e+07
1397,000001.SZ,20200106,17.07,0.4442,3.312590e+07,3.312562e+07
1396,000001.SZ,20200107,17.15,0.3755,3.328115e+07,3.328087e+07
1395,000001.SZ,20200108,16.66,0.4369,3.233026e+07,3.232998e+07
...,...,...,...,...,...,...
3,000001.SZ,20251010,11.43,0.5606,2.218096e+07,2.218060e+07
2,000001.SZ,20251013,11.40,0.6023,2.212275e+07,2.212238e+07
1,000001.SZ,20251014,11.57,0.9499,2.245265e+07,2.245228e+07
0,000001.SZ,20251015,11.40,0.6550,2.212275e+07,2.212238e+07


In [19]:
# Pull the filtered, projected financial indicator rows into a DataFrame.
financial_df = load_filtered_collection_to_df(
    db=db,
    collection_name="financial_indicators",
    query=financial_query,
    projection=financial_projection,
)

正在从 'financial_indicators' 加载精确数据...
-> 成功加载 124,903 行数据。


In [20]:
# Bare expression: the notebook renders the loaded financial indicator frame.
financial_df

Unnamed: 0,ts_code,ann_date,end_date,roe,debt_to_assets,q_profit_yoy
0,000001.SZ,20250823,20250630,4.9497,91.3180,-1.5803
1,000001.SZ,20250419,20250331,2.8165,91.2405,-5.5987
2,000001.SZ,20250315,20241231,9.2038,91.4228,-29.9267
3,000001.SZ,20241019,20240930,8.2528,91.4641,-2.7934
4,000001.SZ,20240816,20240630,5.4242,91.6255,1.5021
...,...,...,...,...,...,...
124898,689009.SH,20200930,20191231,,35.9494,
124899,689009.SH,20201028,20190930,,,
124900,689009.SH,20200930,20190630,,,
124901,689009.SH,20200930,20181231,,187.5630,


In [12]:
# ----------------------------------------------------
# 1. 导入库并配置连接 (不变)
# ----------------------------------------------------
import pandas as pd
from pymongo import MongoClient
from datetime import date

# Connection target and source collection for the chunked load.
MONGO_CONNECTION_STRING = "mongodb://localhost:27017/"
DB_NAME = "barra_financial_data"
COLLECTION_NAME = "daily_prices"

# ----------------------------------------------------
# 2. Query window and required fields (unchanged)
# ----------------------------------------------------
start_date_str = "20200101"
end_date_str = f"{date.today():%Y%m%d}"

# Inclusive trade_date range, expressed as YYYYMMDD strings.
query = {"trade_date": {"$gte": start_date_str, "$lte": end_date_str}}
# Return only these four fields and suppress _id.
projection = {
    **dict.fromkeys(("ts_code", "trade_date", "close", "total_mv"), 1),
    "_id": 0,
}

# ----------------------------------------------------
# 3. 【核心修正】使用正确的、高效的分块加载逻辑
# ----------------------------------------------------
all_dataframes = []
chunk = []
chunk_size = 500000  # rows buffered before they are turned into a DataFrame

try:
    # NOTE: this rebinds the notebook-level names `client` and `db`, and the
    # client is closed when the `with` block exits.
    with MongoClient(MONGO_CONNECTION_STRING) as client:
        db = client[DB_NAME]
        collection = db[COLLECTION_NAME]
        print("正在执行高效索引查询...")

        # 1. Open the cursor once.
        #    - no_cursor_timeout=True asks the server not to reap the cursor
        #      for inactivity; PyMongo warns that without an explicit session
        #      such a cursor may still time out, hence the session.
        #    - Using the cursor as a context manager closes it even if the
        #      loop aborts, so no non-expiring cursor is left on the server.
        with client.start_session() as session:
            with collection.find(
                query, projection, no_cursor_timeout=True, session=session
            ) as cursor:
                # 2. Iterate document by document and cut chunks by hand.
                print("开始从数据库流式加载数据并分块...")
                for doc in cursor:
                    chunk.append(doc)
                    if len(chunk) >= chunk_size:
                        all_dataframes.append(pd.DataFrame(chunk))
                        print(f"已处理一个 {len(chunk):,} 行的数据块...")
                        chunk = []  # start a fresh buffer for the next chunk

        # 3. Flush the last chunk, which may hold fewer than chunk_size rows.
        if chunk:
            all_dataframes.append(pd.DataFrame(chunk))
            print(f"已处理最后一个 {len(chunk):,} 行的数据块...")

        print("\n所有数据块加载完毕，正在合并...")
        if all_dataframes:
            final_df = pd.concat(all_dataframes, ignore_index=True)
        else:
            # pd.concat raises ValueError on an empty list; a query that
            # matches nothing is not an error, so yield an empty frame.
            final_df = pd.DataFrame()
        print(f"成功合并成一个 DataFrame，总行数: {len(final_df):,}")

except Exception as e:
    # Best-effort cell: report the failure and leave an empty frame behind so
    # that later cells still find `final_df` defined.
    print(f"查询时发生错误: {e}")
    final_df = pd.DataFrame()

# ----------------------------------------------------
# 4. 后续处理 (不变)

正在执行高效索引查询...
开始从数据库流式加载数据并分块...


  return Cursor(self, *args, **kwargs)


已处理一个 500,000 行的数据块...
已处理一个 500,000 行的数据块...
已处理一个 500,000 行的数据块...
已处理一个 500,000 行的数据块...
已处理一个 500,000 行的数据块...
已处理一个 500,000 行的数据块...
已处理一个 500,000 行的数据块...
已处理一个 500,000 行的数据块...
已处理一个 500,000 行的数据块...
已处理一个 500,000 行的数据块...
已处理一个 500,000 行的数据块...
已处理一个 500,000 行的数据块...
已处理一个 500,000 行的数据块...
已处理最后一个 92,957 行的数据块...

所有数据块加载完毕，正在合并...
成功合并成一个 DataFrame，总行数: 6,592,957


In [14]:
# Bare expression: the notebook renders the merged frame built by the chunked load.
final_df

Unnamed: 0,ts_code,trade_date,close,total_mv
0,000001.SZ,20251015,11.40,2.212275e+07
1,000001.SZ,20251014,11.57,2.245265e+07
2,000001.SZ,20251013,11.40,2.212275e+07
3,000001.SZ,20251010,11.43,2.218096e+07
4,000001.SZ,20251009,11.40,2.212275e+07
...,...,...,...,...
6592952,600101.SH,20200108,7.23,3.046958e+05
6592953,600101.SH,20200107,7.35,3.097530e+05
6592954,600101.SH,20200106,7.34,3.093316e+05
6592955,600101.SH,20200103,7.37,3.105959e+05
