In [2]:
!pip install yfinance



## Input the Target code and Time period, use the following code the get each stock's information (Including open, high, low, close and volumn)

In [5]:
# ======================================================
# 超稳版：TSE 日股数据抓取（Yahoo优先，Stooq兜底）
# 严格单只串行 + 强等待 + 指数退避 + 断点续跑
# ======================================================

import os, time, random, math
from datetime import datetime
import pandas as pd
import yfinance as yf

# 备源
try:
    from pandas_datareader import data as pdr
    HAVE_PDR = True
except Exception:
    HAVE_PDR = False

# -------------------------
# 输入
# -------------------------
companies = {
    "8927": "明豊エンタープライズ",
    "8869": "明和地所",
    "8871": "ゴールドクレスト",
    "8877": "エスリード",
    "3489": "フェイスネットワーク",
    "146A": "コロンビア・ワークス",         # 非4位数字 -> 跳过
    "3236": "プロパスト",
    "3242": "アーバネットコーポレーション",
    "3238": "セントラル総合開発",
    "3245": "ディア・ライフ",
    "3246": "コーセーアールイー",
    "3486": "グローバル・リンク・マネジメント",
    "3498": "霞ヶ関キャピタル",
    "391A": "山忠",                       # 非4位数字 -> 跳过
    "8844": "コスモスイニシア",
    "8881": "日神グループホールディングス",
    "8887": "シーラホールディングス",
    "8892": "日本エスコン"
}
start_date = "2015-10-01"
end_date   = "2025-10-01"

# -------------------------
# 参数（可按需调大/调小）
# -------------------------
YF_MAX_RETRIES   = 7          # yfinance 最大重试
BASE_SLEEP       = 2.5        # 指数退避基准秒
BETWEEN_TICKERS  = (12, 20)   # 每只之间的强制等待区间（秒）
AUTO_ADJUST      = True       # True=复权价，False=原始价
OUT_DIR          = "data"
os.makedirs(OUT_DIR, exist_ok=True)

# -------------------------
# 辅助函数
# -------------------------
def human_sleep(seconds: float):
    seconds = max(0.5, float(seconds))
    time.sleep(seconds)

def sleep_between_tickers():
    wait = random.uniform(*BETWEEN_TICKERS)
    print(f"⏳ 等待 {wait:.1f}s 再抓下一只…")
    human_sleep(wait)

def save_one_df(df: pd.DataFrame, code: str, name: str, source: str):
    if df is None or df.empty:
        return False
    df = df.copy()
    df["Code"] = code
    df["Company"] = name
    df["Source"] = source
    # 统一列名
    df.columns = [c.title() if isinstance(c, str) else c for c in df.columns]
    # 落地单文件
    fn = os.path.join(OUT_DIR, f"{code}_{source}_{start_date}_to_{end_date}.csv")
    df.to_csv(fn, encoding="utf-8-sig")
    print(f"✅ [{code}] {name} -> 已保存单只数据：{fn}（{len(df):,} 行）")
    return True

def fetch_yahoo(code: str):
    """用 yfinance 串行抓取单只，指数退避重试"""
    tk = f"{code}.T"
    attempt = 0
    while attempt < YF_MAX_RETRIES:
        try:
            print(f"  • Yahoo 抓取 {tk}（尝试 {attempt+1}/{YF_MAX_RETRIES}）")
            df = yf.download(
                tickers=[tk],
                start=start_date,
                end=end_date,
                auto_adjust=AUTO_ADJUST,
                group_by="ticker",
                threads=False,
                progress=False,
                interval="1d",
            )
            # yfinance 单只时可能返回普通列或多重列
            if df is None or df.empty:
                raise RuntimeError("empty dataframe")
            if isinstance(df.columns, pd.MultiIndex):
                if tk in df.columns.get_level_values(0):
                    df = df[tk].copy()
                else:
                    # 多重列但不含目标 -> 视作失败
                    raise RuntimeError("missing ticker in returned columns")
            return df
        except Exception as e:
            msg = str(e)
            # 粗判限流/网络
            if any(k in msg for k in ["Too Many Requests", "Rate", "429", "timed out"]):
                sleep_s = BASE_SLEEP * (2 ** attempt) + random.uniform(0, 1.5)
                print(f"    ⚠️ 限流/网络问题：{msg} -> {sleep_s:.1f}s 后重试")
                human_sleep(sleep_s)
                attempt += 1
            else:
                print(f"    ❌ 非限流错误：{msg}")
                break
    return None

def fetch_stooq(code: str):
    """用 Stooq 兜底（覆盖有限）—— 代码形如 8927.jp"""
    if not HAVE_PDR:
        return None
    sym = f"{code}.jp"
    try:
        print(f"  • Stooq 兜底 {sym}")
        df = pdr.DataReader(sym, "stooq", start=start_date, end=end_date)
        # Stooq 列名是 Open/High/Low/Close/Volume 等（倒序或正序）
        if df is None or df.empty:
            return None
        # 统一按日期升序
        df = df.sort_index()
        return df
    except Exception as e:
        print(f"    ❌ Stooq 失败：{e}")
        return None

# -------------------------
# 主流程：严格单只串行
# -------------------------
numeric = {c: n for c, n in companies.items() if c.isdigit() and len(c) == 4}
skipped = {c: n for c, n in companies.items() if c not in numeric}

if skipped:
    print("⏭️ 已跳过（非4位数字）：")
    for c, n in skipped.items():
        print(f"  - {c}: {n}")

success_codes = []
fail_codes    = []

for code, name in numeric.items():
    print(f"\n==============================")
    print(f"▶︎ 抓取 {code} — {name}")
    # 如果该代码已有成功落地文件，跳过（断点续跑）
    already = [f for f in os.listdir(OUT_DIR) if f.startswith(f"{code}_") and f.endswith(".csv")]
    if already:
        print(f"  🔁 检测到已有文件，跳过抓取：{already[0]}")
        success_codes.append(code)
        continue

    df = fetch_yahoo(code)
    if df is not None and save_one_df(df, code, name, "yahoo"):
        success_codes.append(code)
        sleep_between_tickers()
        continue

    # Yahoo 失败 -> 尝试 Stooq
    df2 = fetch_stooq(code)
    if df2 is not None and save_one_df(df2, code, name, "stooq"):
        success_codes.append(code)
        sleep_between_tickers()
        continue

    print(f"  ❗最终失败：{code} {name}")
    fail_codes.append(code)
    sleep_between_tickers()

# -------------------------
# 汇总合并（把 data/ 下的单只CSV 合为一份）
# -------------------------
all_paths = [os.path.join(OUT_DIR, f) for f in os.listdir(OUT_DIR) if f.endswith(".csv")]
frames = []
for path in all_paths:
    try:
        frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))
    except Exception:
        # Stooq 文件可能列名是 'Date' 或 'Date' 已在索引；上面的 to_csv 保证有 Date 列
        frames.append(pd.read_csv(path))

if frames:
    combined = pd.concat(frames, ignore_index=True)
    # 规范列顺序
    front = [c for c in ["Company", "Code", "Source"] if c in combined.columns]
    rest  = [c for c in combined.columns if c not in front]
    combined = combined[front + rest]
    out_all = f"jp_tse_prices_combined_{start_date}_to_{end_date}.csv"
    combined.sort_values(["Code","Date"], inplace=True, ignore_index=True)
    combined.to_csv(out_all, encoding="utf-8-sig", index=False)
    print(f"\n📦 合并完成：{out_all}（{len(combined):,} 行）")
else:
    print("\n⚠️ 没有任何成功文件可合并。")

print("\n—— 抓取结果 ——")
print(f"✅ 成功：{len(success_codes)} | ❌ 失败：{len(fail_codes)}")
if fail_codes:
    print("失败代码：", ", ".join(fail_codes))


⏭️ 已跳过（非4位数字）：
  - 146A: コロンビア・ワークス
  - 391A: 山忠

▶︎ 抓取 8927 — 明豊エンタープライズ
  • Yahoo 抓取 8927.T（尝试 1/7）



1 Failed download:
['8927.T']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


    ❌ 非限流错误：empty dataframe
  • Stooq 兜底 8927.jp
✅ [8927] 明豊エンタープライズ -> 已保存单只数据：data\8927_stooq_2015-10-01_to_2025-10-01.csv（2,442 行）
⏳ 等待 16.0s 再抓下一只…

▶︎ 抓取 8869 — 明和地所
  • Yahoo 抓取 8869.T（尝试 1/7）



1 Failed download:
['8869.T']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


    ❌ 非限流错误：empty dataframe
  • Stooq 兜底 8869.jp
✅ [8869] 明和地所 -> 已保存单只数据：data\8869_stooq_2015-10-01_to_2025-10-01.csv（2,442 行）
⏳ 等待 19.7s 再抓下一只…

▶︎ 抓取 8871 — ゴールドクレスト
  • Yahoo 抓取 8871.T（尝试 1/7）



1 Failed download:
['8871.T']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


    ❌ 非限流错误：empty dataframe
  • Stooq 兜底 8871.jp
✅ [8871] ゴールドクレスト -> 已保存单只数据：data\8871_stooq_2015-10-01_to_2025-10-01.csv（2,442 行）
⏳ 等待 14.2s 再抓下一只…

▶︎ 抓取 8877 — エスリード
  • Yahoo 抓取 8877.T（尝试 1/7）



1 Failed download:
['8877.T']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


    ❌ 非限流错误：empty dataframe
  • Stooq 兜底 8877.jp
✅ [8877] エスリード -> 已保存单只数据：data\8877_stooq_2015-10-01_to_2025-10-01.csv（2,442 行）
⏳ 等待 16.8s 再抓下一只…

▶︎ 抓取 3489 — フェイスネットワーク
  • Yahoo 抓取 3489.T（尝试 1/7）



1 Failed download:
['3489.T']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


    ❌ 非限流错误：empty dataframe
  • Stooq 兜底 3489.jp
✅ [3489] フェイスネットワーク -> 已保存单只数据：data\3489_stooq_2015-10-01_to_2025-10-01.csv（1,841 行）
⏳ 等待 20.0s 再抓下一只…

▶︎ 抓取 3236 — プロパスト
  • Yahoo 抓取 3236.T（尝试 1/7）



1 Failed download:
['3236.T']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


    ❌ 非限流错误：empty dataframe
  • Stooq 兜底 3236.jp
✅ [3236] プロパスト -> 已保存单只数据：data\3236_stooq_2015-10-01_to_2025-10-01.csv（2,442 行）
⏳ 等待 17.4s 再抓下一只…

▶︎ 抓取 3242 — アーバネットコーポレーション
  • Yahoo 抓取 3242.T（尝试 1/7）



1 Failed download:
['3242.T']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


    ❌ 非限流错误：empty dataframe
  • Stooq 兜底 3242.jp
✅ [3242] アーバネットコーポレーション -> 已保存单只数据：data\3242_stooq_2015-10-01_to_2025-10-01.csv（2,442 行）
⏳ 等待 19.4s 再抓下一只…

▶︎ 抓取 3238 — セントラル総合開発
  • Yahoo 抓取 3238.T（尝试 1/7）



1 Failed download:
['3238.T']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


    ❌ 非限流错误：empty dataframe
  • Stooq 兜底 3238.jp
✅ [3238] セントラル総合開発 -> 已保存单只数据：data\3238_stooq_2015-10-01_to_2025-10-01.csv（2,441 行）
⏳ 等待 14.7s 再抓下一只…

▶︎ 抓取 3245 — ディア・ライフ
  • Yahoo 抓取 3245.T（尝试 1/7）



1 Failed download:
['3245.T']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


    ❌ 非限流错误：empty dataframe
  • Stooq 兜底 3245.jp
✅ [3245] ディア・ライフ -> 已保存单只数据：data\3245_stooq_2015-10-01_to_2025-10-01.csv（2,442 行）
⏳ 等待 13.5s 再抓下一只…

▶︎ 抓取 3246 — コーセーアールイー
  • Yahoo 抓取 3246.T（尝试 1/7）



1 Failed download:
['3246.T']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


    ❌ 非限流错误：empty dataframe
  • Stooq 兜底 3246.jp
✅ [3246] コーセーアールイー -> 已保存单只数据：data\3246_stooq_2015-10-01_to_2025-10-01.csv（2,442 行）
⏳ 等待 16.0s 再抓下一只…

▶︎ 抓取 3486 — グローバル・リンク・マネジメント
  • Yahoo 抓取 3486.T（尝试 1/7）



1 Failed download:
['3486.T']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


    ❌ 非限流错误：empty dataframe
  • Stooq 兜底 3486.jp
✅ [3486] グローバル・リンク・マネジメント -> 已保存单只数据：data\3486_stooq_2015-10-01_to_2025-10-01.csv（1,903 行）
⏳ 等待 12.7s 再抓下一只…

▶︎ 抓取 3498 — 霞ヶ関キャピタル
  • Yahoo 抓取 3498.T（尝试 1/7）



1 Failed download:
['3498.T']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


    ❌ 非限流错误：empty dataframe
  • Stooq 兜底 3498.jp
✅ [3498] 霞ヶ関キャピタル -> 已保存单只数据：data\3498_stooq_2015-10-01_to_2025-10-01.csv（1,667 行）
⏳ 等待 17.8s 再抓下一只…

▶︎ 抓取 8844 — コスモスイニシア
  • Yahoo 抓取 8844.T（尝试 1/7）



1 Failed download:
['8844.T']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


    ❌ 非限流错误：empty dataframe
  • Stooq 兜底 8844.jp
✅ [8844] コスモスイニシア -> 已保存单只数据：data\8844_stooq_2015-10-01_to_2025-10-01.csv（2,442 行）
⏳ 等待 16.0s 再抓下一只…

▶︎ 抓取 8881 — 日神グループホールディングス
  • Yahoo 抓取 8881.T（尝试 1/7）



1 Failed download:
['8881.T']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


    ❌ 非限流错误：empty dataframe
  • Stooq 兜底 8881.jp
✅ [8881] 日神グループホールディングス -> 已保存单只数据：data\8881_stooq_2015-10-01_to_2025-10-01.csv（2,442 行）
⏳ 等待 16.2s 再抓下一只…

▶︎ 抓取 8887 — シーラホールディングス
  • Yahoo 抓取 8887.T（尝试 1/7）



1 Failed download:
['8887.T']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


    ❌ 非限流错误：empty dataframe
  • Stooq 兜底 8887.jp
✅ [8887] シーラホールディングス -> 已保存单只数据：data\8887_stooq_2015-10-01_to_2025-10-01.csv（2,442 行）
⏳ 等待 12.0s 再抓下一只…

▶︎ 抓取 8892 — 日本エスコン
  • Yahoo 抓取 8892.T（尝试 1/7）



1 Failed download:
['8892.T']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


    ❌ 非限流错误：empty dataframe
  • Stooq 兜底 8892.jp
✅ [8892] 日本エスコン -> 已保存单只数据：data\8892_stooq_2015-10-01_to_2025-10-01.csv（2,442 行）
⏳ 等待 15.0s 再抓下一只…


  frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))
  frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))
  frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))
  frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))
  frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))
  frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))
  frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))
  frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))
  frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))
  frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))
  frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))



📦 合并完成：jp_tse_prices_combined_2015-10-01_to_2025-10-01.csv（37,156 行）

—— 抓取结果 ——
✅ 成功：16 | ❌ 失败：0


  frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))
  frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))
  frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))
  frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))
  frames.append(pd.read_csv(path, parse_dates=["Date"], infer_datetime_format=True))


# The stock including Alphabet may not be catched perfectedly, so try the following code and get the stock information.

In [9]:
import pandas_datareader.data as web
import datetime

start = datetime.date(2015, 10, 1)
end   = datetime.date(2025, 10, 1)

for code, name in [("146A", "コロンビア・ワークス"), ("391A", "山忠")]:
    symbol = f"{code}.jp"
    try:
        df = web.DataReader(symbol, "stooq", start, end)
        df = df.sort_index()
        print(f"✅ {code} {name} 数据行数: {len(df)}")
        display(df.head())
        df.to_csv(f"{code}_stooq.csv", encoding="utf-8-sig")
    except Exception as e:
        print(f"❌ {code} {name} 获取失败: {e}")


✅ 146A コロンビア・ワークス 数据行数: 371


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-03-28,1792.17,2022.56,1704.86,1806.72,4914169.0
2024-03-29,1906.15,2138.96,1857.65,2078.34,4042875.0
2024-04-01,2214.14,2417.86,2214.14,2342.68,2801499.0
2024-04-02,2371.78,2415.43,2109.86,2134.11,1720319.0
2024-04-03,2131.69,2294.18,1957.08,2073.48,2108134.0


✅ 391A 山忠 数据行数: 0


# Combined all the csv files into 1 csv

In [17]:
import pandas as pd
import tkinter as tk
from tkinter import filedialog

# 打开文件选择窗口
root = tk.Tk()
root.withdraw()  # 不显示主窗口
files = filedialog.askopenfilenames(
    title="选择要合并的CSV文件",
    filetypes=[("CSV Files", "*.csv"), ("All Files", "*.*")]
)

if not files:
    print("❌ 没有选择任何文件。")
else:
    print(f"✅ 已选择 {len(files)} 个文件：")
    for f in files:
        print(" -", f)

    # 读取并合并
    dfs = [pd.read_csv(f, encoding="utf-8-sig") for f in files]
    combined = pd.concat(dfs, ignore_index=True)

    # 去重（如果有 Code + Date 列）
    if {"Code", "Date"}.issubset(combined.columns):
        combined = combined.sort_values(["Code", "Date"]).drop_duplicates(["Code", "Date"])

    # 保存
    combined.to_csv("combined_stocks.csv", index=False, encoding="utf-8-sig")
    print(f"\n💾 已保存为 combined_stocks.csv，共 {len(combined):,} 行。")


✅ 已选择 17 个文件：
 - D:/jupyter/Job/Restar/開発計画/XU/data/146A_stooq_2015-10-01_to_2025-10-01.csv
 - D:/jupyter/Job/Restar/開発計画/XU/data/3236_stooq_2015-10-01_to_2025-10-01.csv
 - D:/jupyter/Job/Restar/開発計画/XU/data/3238_stooq_2015-10-01_to_2025-10-01.csv
 - D:/jupyter/Job/Restar/開発計画/XU/data/3242_stooq_2015-10-01_to_2025-10-01.csv
 - D:/jupyter/Job/Restar/開発計画/XU/data/3245_stooq_2015-10-01_to_2025-10-01.csv
 - D:/jupyter/Job/Restar/開発計画/XU/data/3246_stooq_2015-10-01_to_2025-10-01.csv
 - D:/jupyter/Job/Restar/開発計画/XU/data/3486_stooq_2015-10-01_to_2025-10-01.csv
 - D:/jupyter/Job/Restar/開発計画/XU/data/3489_stooq_2015-10-01_to_2025-10-01.csv
 - D:/jupyter/Job/Restar/開発計画/XU/data/3498_stooq_2015-10-01_to_2025-10-01.csv
 - D:/jupyter/Job/Restar/開発計画/XU/data/8844_stooq_2015-10-01_to_2025-10-01.csv
 - D:/jupyter/Job/Restar/開発計画/XU/data/8869_stooq_2015-10-01_to_2025-10-01.csv
 - D:/jupyter/Job/Restar/開発計画/XU/data/8871_stooq_2015-10-01_to_2025-10-01.csv
 - D:/jupyter/Job/Restar/開発計画/XU/data/8877_stooq_2