# 金融‘五篇大文章’示例建模（评分 + 简单预测）

本 Notebook 演示如何基于采集的世界银行公开指标构建一个可复用的小模型：
- 评分模型：对多指标进行标准化并加权求和，得到年度综合评分（0-100）。
- 简单预测：对某个指标进行时间序列预测并做基本验证（MAPE）。

运行环境：
- Python 3.10+；在项目根目录执行 `pip install -r requirements.txt`。

运行顺序：
1. 读取最新运行产物（优先使用 runs/manifest_*.json 中记录的路径；未找到时回退到 data/wb/ 下最新的 worldbank_*.csv）。
2. 指标宽表构建与清洗。
3. 评分模型计算（z-score 标准化、方向调整、加权求和、0-100 归一化）。
4. 时间序列预测与验证（MAPE）。
5. 保存结果到 data/model/。


In [None]:
import os, json, math
from glob import glob
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Optional, Tuple
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import plotly.express as px

# Optional: pin the project root by hand (absolute path) through the
# PROJECT_ROOT environment variable. It takes priority over auto-detection;
# an unset or blank value yields None.
PROJECT_OVERRIDE = os.environ.get("PROJECT_ROOT", "").strip() or None


def detect_project_root() -> Path:
    """Locate the project root robustly when running inside a notebook.

    The working directory and its nearest ancestors (five candidates in
    total, nearest first) are inspected; the first one holding a
    requirements.txt together with a data/ or src/ entry is returned.
    Failing that, if the working directory is named "notebook" and its
    parent has a data/ entry, the parent is returned. Otherwise the working
    directory itself is returned.
    """
    cwd = Path.cwd()

    def looks_like_root(folder: Path) -> bool:
        # A root needs requirements.txt plus at least one of data/ or src/.
        if not (folder / "requirements.txt").exists():
            return False
        return (folder / "data").exists() or (folder / "src").exists()

    # Candidates: the working directory followed by its closest ancestors.
    nearby = (cwd, *cwd.parents)[:5]
    root = next((folder for folder in nearby if looks_like_root(folder)), None)
    if root is not None:
        return root

    # Common case: the notebook is executed from inside notebook/.
    inside_notebook_dir = cwd.name == "notebook" and (cwd.parent / "data").exists()
    return cwd.parent if inside_notebook_dir else cwd

# An explicit override wins; otherwise fall back to auto-detection.
if PROJECT_OVERRIDE:
    PROJECT = Path(PROJECT_OVERRIDE)
else:
    PROJECT = detect_project_root()
DATA_DIR = PROJECT / 'data'
MODEL_DIR = DATA_DIR / 'model'
# Create the model output directory (and any missing parents) up front.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Indicator ids that make up the default wide table.
DEFAULT_INDICATORS = [
    'IP.PAT.RESD',
    'EN.ATM.CO2E.PC',
    'SP.POP.65UP.TO.ZS',
    'IT.NET.USER.ZS',
]
# Scoring direction per indicator: +1 higher is better, -1 lower is better.
DIRECTION = {
    'IP.PAT.RESD': +1,
    'IT.NET.USER.ZS': +1,
    'EN.ATM.CO2E.PC': -1,
    # Neutral: excluded from the score; switch to +1/-1 if it should count.
    'SP.POP.65UP.TO.ZS': 0,
}
# Equal raw weight for every scored indicator, zero for neutral ones.
WEIGHTS = {k: float(DIRECTION.get(k, 0) != 0) for k in DEFAULT_INDICATORS}
# Normalise so the weights sum to 1; a zero total is replaced by 1.0 to
# avoid dividing by zero.
w_sum = sum(WEIGHTS.values())
if not w_sum:
    w_sum = 1.0
WEIGHTS = {k: v / w_sum for k, v in WEIGHTS.items()}

def _find_latest(pattern:str)->Optional[str]:
    files = glob(pattern)
    if not files: return None
    files.sort(key=os.path.getmtime, reverse=True)
    return files[0]

def _from_manifest()->Tuple[Optional[str], Optional[str]]:
    """Read the newest run manifest and return two of its output entries.

    Looks for the most recently modified runs/manifest_*.json under PROJECT
    and returns its outputs['worldbank'] and outputs['gov_news'] values, in
    that order. A missing key gives None for that entry; (None, None) is
    returned when no manifest file exists.
    """
    manifest_path = _find_latest(str(PROJECT / 'runs' / 'manifest_*.json'))
    if not manifest_path:
        return None, None
    with open(manifest_path, 'r', encoding='utf-8') as fh:
        outputs = json.load(fh).get('outputs', {})
    return outputs.get('worldbank'), outputs.get('gov_news')

def load_worldbank(csv_path:str, country_iso3:Optional[str]='CHN')->pd.DataFrame:
    """Load a World Bank indicator CSV as a cleaned long-format frame.

    The 'date' column is parsed as a number and renamed to 'year'; rows
    whose year cannot be parsed are dropped and the rest are cast to int.
    The 'value' column is parsed as a number, with unparseable entries
    becoming NaN (those rows are kept).

    Args:
        csv_path: Path of the CSV file; it must contain 'date' and 'value'
            columns.
        country_iso3: ISO3 country code to keep when the file has a
            'countryiso3code' column. Defaults to 'CHN', the previously
            hard-coded value. Pass None to keep every country.

    Returns:
        The cleaned DataFrame (a copy, safe to mutate).
    """
    df = pd.read_csv(csv_path)
    df['date'] = pd.to_numeric(df['date'], errors='coerce')
    df = df.rename(columns={'date':'year'})
    df['value'] = pd.to_numeric(df['value'], errors='coerce')
    # Drop unparseable years before the int cast: NaN cannot be cast to int.
    df = df.dropna(subset=['year']).copy()
    df['year'] = df['year'].astype(int)
    if country_iso3 is not None and 'countryiso3code' in df.columns:
        df = df[df['countryiso3code']==country_iso3].copy()
    return df

def to_wide(df:pd.DataFrame, indicators:list)->pd.DataFrame:
    """Pivot long-format indicator rows into a year-by-indicator table.

    Only rows whose 'indicator_id' is listed in `indicators` are used.
    Duplicate (year, indicator) pairs are averaged, and the result is
    indexed by 'year' in ascending order with one column per indicator.
    """
    wanted = df['indicator_id'].isin(indicators)
    table = df[wanted].copy().pivot_table(
        values='value',
        index='year',
        columns='indicator_id',
        aggfunc='mean',
    )
    return table.sort_index()

def zscore(x:pd.Series)->pd.Series:
    """Standardise a series to zero mean and unit sample standard deviation.

    Missing values are ignored when computing the mean and standard
    deviation and stay missing in the result.

    When the spread is undefined or (near) zero, i.e. fewer than two
    non-missing values or a constant series, there is nothing to scale by,
    so every non-missing entry maps to 0 instead of NaN or inf.

    Args:
        x: Numeric series to standardise.

    Returns:
        A series with the same index as `x`.
    """
    m = x.mean(skipna=True)
    s = x.std(skipna=True)
    # std() is NaN with fewer than two observations. NaN is truthy and not
    # close to 0, so it has to be tested explicitly or the division below
    # would turn the whole series into NaN.
    if pd.isna(s) or np.isclose(s, 0):
        return x*0
    return (x-m)/s

def compute_score(wide:pd.DataFrame, direction:dict, weights:dict)->pd.DataFrame:
    """Compute a yearly composite score from a wide indicator table.

    Each scored column is forward-filled, z-scored and sign-adjusted by its
    direction. The columns are then combined as a weighted sum and rescaled
    to the 0-100 range by min-max normalisation.

    Args:
        wide: Table indexed by year with one column per indicator.
        direction: Maps indicator id to a direction: > 0 means higher is
            better, < 0 means lower is better, 0 or absent means the column
            is left out of the score.
        weights: Maps indicator id to its weight; absent ids weigh 0.

    Returns:
        DataFrame with the same index as `wide` and two columns rounded to
        four decimals: 'score_raw' (weighted z-score sum) and 'score'
        (0-100). When all raw scores are equal, 'score' is 50 everywhere.
    """
    Z = pd.DataFrame(index=wide.index)
    for col in wide.columns:
        sign = direction.get(col, 0)
        if sign == 0:
            continue
        # Series.ffill() replaces fillna(method='ffill'), whose `method`
        # keyword is deprecated in current pandas.
        Z[col] = zscore(wide[col].ffill()).mul(1 if sign > 0 else -1)
    # Weighted sum. Entries still missing after the forward fill (years
    # before a column's first observation) contribute 0 via fill_value.
    score_raw = pd.Series(0.0, index=wide.index)
    for col in Z.columns:
        w = weights.get(col, 0.0)
        score_raw = score_raw.add(Z[col]*w, fill_value=0.0)
    # Min-max rescale to 0-100; a flat raw score maps to the midpoint.
    s_min, s_max = score_raw.min(), score_raw.max()
    if s_max > s_min:
        score = (score_raw - s_min)/(s_max-s_min) * 100.0
    else:
        score = score_raw*0+50
    out = pd.DataFrame({'score_raw':score_raw, 'score':score}).round(4)
    return out

def ensure_dir(p:str):
    """Create directory `p`, including missing parents; a no-op if it already exists."""
    os.makedirs(name=p, exist_ok=True)


In [None]:
# 1) Load the data (with path diagnostics)
print("Project root:", PROJECT)
print("Data dir:", DATA_DIR)
wb_path, news_path = _from_manifest()
if wb_path is not None and not os.path.exists(wb_path):
    # The manifest entry may be relative to the project root while the
    # notebook runs from another working directory, or it may point at a
    # file that has since been removed. Try it under PROJECT before giving
    # up on it, instead of letting read_csv fail on a bad path.
    rebased = PROJECT / wb_path
    if rebased.exists():
        wb_path = str(rebased)
    else:
        print("Manifest CSV not found, falling back to newest file under data/wb:", wb_path)
        wb_path = None
if wb_path is None:
    # Fallback: newest worldbank_*.csv under data/wb.
    wb_path = _find_latest(str(DATA_DIR / 'wb' / 'worldbank_*.csv'))
if wb_path is None:
    raise FileNotFoundError(
        "未找到世界银行CSV。请先运行采集脚本，或设置环境变量 PROJECT_ROOT=项目根路径 后重启内核。"
    )
print("Using WB CSV:", wb_path)
df = load_worldbank(wb_path)
df.head()

AssertionError: 未找到世界银行CSV，请先运行采集脚本

In [None]:
# 2) Build the wide table + 3) run the scoring model
inds = DEFAULT_INDICATORS
wide = to_wide(df, indicators=inds)
score_df = compute_score(wide, direction=DIRECTION, weights=WEIGHTS)
display(wide.tail())
display(score_df.tail())
# Persist both tables under the model directory.
ensure_dir(str(MODEL_DIR))
wide.to_csv(MODEL_DIR / 'wb_wide.csv')
score_csv = MODEL_DIR / 'scoring_by_year.csv'
score_df.to_csv(score_csv)
print('Saved:', score_csv)
# Plot the composite score by year.
score_plot_df = score_df.reset_index()
fig = px.line(
    score_plot_df,
    x='year',
    y='score',
    markers=True,
    title='综合评分(0-100)',
)
fig.show()


In [None]:
# 4) Simple forecast (internet usage rate IT.NET.USER.ZS as the example)
target = 'IT.NET.USER.ZS'
ts = wide[target].dropna()
# Raise instead of assert: assert statements are stripped under `python -O`.
if len(ts) < 6:
    raise ValueError('可用于预测的数据点太少')
# Train/validation split: the last 3 observations are held out.
split_n = 3
train, test = ts.iloc[:-split_n], ts.iloc[-split_n:]
model = ExponentialSmoothing(train, trend='add', seasonal=None, initialization_method='estimated')
res = model.fit()
# Re-label the forecast with the validation years. The labels statsmodels
# puts on a forecast are not guaranteed to be those years (an integer year
# index may be treated as unsupported and replaced by positions). In that
# case `test - pred` would align on mismatched labels and give a NaN MAPE,
# and the saved 'year' column would hold positions instead of years.
pred = pd.Series(np.asarray(res.forecast(split_n)), index=test.index, name=target)
mape = round(float(((test - pred) / test).abs().mean() * 100), 2)
print('MAPE(%) =', mape)
# Combined plot: training history followed by the forecast, with the
# held-out actuals overlaid.
pred_full = pd.concat([train, pred])
fig2 = px.line(pred_full.reset_index(), x='year', y=target, title=f'{target} 预测（验证期MAPE={mape}%）')
fig2.add_scatter(x=test.index, y=test.values, mode='markers+lines', name='实际(验证)')
fig2.show()
# Save the forecast and the validation metric.
pred_df = pred.reset_index(); pred_df.columns = ['year','forecast']
pred_df.to_csv(MODEL_DIR / f'forecast_{target}.csv', index=False)
with open(MODEL_DIR / 'metrics.json','w',encoding='utf-8') as f:
    json.dump({'target':target,'mape_pct':float(mape)}, f, ensure_ascii=False, indent=2)
print('Saved:', MODEL_DIR / f'forecast_{target}.csv')


## 特征与可调参数说明
- 指标：DEFAULT_INDICATORS；可按主题替换或扩展。
- 方向 DIRECTION：+1 越大越好；-1 越小越好；0 表示不纳入评分。
- 权重 WEIGHTS：默认对纳入指标等权；可自定义后归一。
- 评分：各指标 z-score 处理（按方向取正/负）后加权求和；再 0-100 归一。
- 预测：ExponentialSmoothing（加性趋势，无季节项），以最近 3 年作为验证期，评估指标为 MAPE（平均绝对百分比误差）。

> 该 Notebook 仅作方法演示；口径、方向与权重请结合业务含义调整。
