## データのフェッチと解凍

In [None]:
import sys
import os

# Resolve the parent directory once, *before* changing into it. Evaluating
# os.path.abspath("..") again after the chdir would point one level further
# up and put the wrong directory on sys.path.
# NOTE(review): assumes the notebook starts one level below the project root;
# re-running this cell moves up another level.
project_root = os.path.abspath("..")
os.chdir(project_root)
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
# Import the raw data: hand the downloaded EDINET code-list archive to the
# project helper together with the target directory data/raw
# (presumably it extracts the zip there — see scripts.data_fetcher).
import pandas as pd
from scripts.data_fetcher import unzip_file

unzip_file("data/raw/Edinetcode_20250625.zip", "data/raw")

In [None]:
import sys

sys.path.append("../")  # go up one level so the project packages can be imported

from scripts.data_fetcher import fetch_and_save_snapshots

# Fetch and save the yearly snapshots.
# NOTE(review): the original comment said "2017-2024" while end_year=2025 is
# passed; whether end_year is inclusive depends on fetch_and_save_snapshots —
# confirm which years are actually written.
fetch_and_save_snapshots(start_year=2017, end_year=2025)

## 読み込み

In [None]:
import pandas as pd

# 2017 TOPIX snapshot saved under data/raw/topix_growth_snapshots.
df_topix_2017 = pd.read_csv("data/raw/topix_growth_snapshots/2017_topix_growth.csv")
# EDINET code list: cp932-encoded; skiprows=1 drops the first line of the
# file so the second line is used as the header (presumably a metadata line
# precedes the header — verify against the file).
df_edinet = pd.read_csv("data/raw/EdinetcodeDlInfo.csv", encoding="cp932", skiprows=1)

In [None]:
df_topix_2017.head()

In [None]:
df_edinet.head()

In [None]:
# Turn the EDINET securities code into a nullable integer column; anything
# that cannot be parsed as a number becomes <NA>.
securities_code = pd.to_numeric(df_edinet["証券コード"], errors="coerce")
df_edinet["証券コード"] = securities_code.astype("Int64")

# Left join on the securities code: every TOPIX row is kept, and rows without
# an EDINET counterpart get missing values in the EDINET columns.
df_merged2017 = df_topix_2017.merge(
    df_edinet, how="left", left_on="Code", right_on="証券コード"
)

In [None]:
# Rows for which the securities-code join found no EDINET entry.
_df_unmatched = df_merged2017[df_merged2017["証券コード"].isna()]
matches = []

# For each unmatched row, look for EDINET filers whose name contains the
# company name.
for _, row in _df_unmatched.iterrows():
    name = row["CompanyName"]
    if not isinstance(name, str) or not name:
        # A missing name cannot be searched for, and an empty one would
        # "match" every filer.
        continue
    # regex=False: the name is literal text. Characters such as "(", "." or
    # "+" in a company name must not be interpreted as a regular expression.
    hit = df_edinet[
        df_edinet["提出者名"].str.contains(name, na=False, regex=False)
    ]
    if hit.empty:
        continue
    # Only the first hit is used (multiple hits are not disambiguated).
    merged_row = row.to_dict()
    merged_row.update(hit.iloc[0].to_dict())
    matches.append(merged_row)

# Collect the matches into a DataFrame.
matched_df = pd.DataFrame(matches)

In [None]:
df_merged2017[df_merged2017["Code"] == 26510]

In [None]:
# 1. Columns present in both frames are the candidates for back-filling.
cols_to_update = list(set(matched_df.columns) & set(df_merged2017.columns))

# 2. Back-fill row by row, keyed on CompanyName.
for _, row in matched_df.iterrows():
    company_name = row["CompanyName"]
    # Rows of the merged table belonging to this company; this does not
    # depend on the column being filled, so it is computed once per company.
    same_company = df_merged2017["CompanyName"] == company_name
    for col in cols_to_update:
        # Only cells that are still missing are overwritten.
        mask = same_company & df_merged2017[col].isna()
        df_merged2017.loc[mask, col] = row[col]

In [None]:
import uuid
import pandas as pd

# Reload the processed 2017 table from disk and unify the column names in one
# step.
# NOTE(review): this replaces the in-memory df_merged2017 with the file
# contents — confirm the intended cell execution order.
df_merged2017 = pd.read_csv("data/processed/topix_companies_2017.csv").rename(
    columns={
        "提出者名": "name_ja",
        "提出者法人番号": "corp_number",
        "ＥＤＩＮＥＴコード": "edinet_code",
        # "Code" is the securities code column obtained from J-Quants.
        "Code": "security_code",
        "提出者業種": "sector_33",
        "Sector17CodeName": "sector_17",
        "ScaleCategory": "scale_category",
        "MarketCode": "market_code",
        "提出者名（英字）": "name_en",
        "所在地": "address",
    }
)

In [None]:
df_merged2017.columns

In [None]:
# Write the table back to the processed CSV; index=False keeps the row index
# out of the file.
df_merged2017.to_csv("data/processed/topix_companies_2017.csv", index=False)

In [None]:
import pandas as pd
import uuid
from psycopg2.extras import execute_values
from db.connection import get_connection
from scripts.preprocess import overwrite_company_ids_if_exists

# ----------- (1) Target year & input path ----------
YEAR = 2017
CSV_PATH = f"data/processed/topix_companies_{YEAR}.csv"

# ----------- (2) Load the CSV & unify column names ----------
# Everything is read as text (so codes keep any leading zeros) and missing
# cells become "" instead of NaN.
df = pd.read_csv(CSV_PATH, dtype=str).fillna("")

# Source headers -> column names used in the rest of this cell. rename()
# ignores keys that are absent from the file.
column_map = {
    "提出者名": "name_ja",
    "提出者法人番号": "corp_number",
    "ＥＤＩＮＥＴコード": "edinet_code",
    "Code": "security_code",
    "提出者業種": "sector_33",
    "Sector17CodeName": "sector_17",
    "ScaleCategory": "scale_category",
    "MarketCode": "market_code",
    "提出者名（英字）": "name_en",
    "所在地": "address",
}
df = df.rename(columns=column_map)

keep_cols = [
    "security_code",
    "name_ja",
    "name_en",
    "sector_17",
    "sector_33",
    "scale_category",
    "market_code",
    "edinet_code",
    "address",
    "corp_number",
]
# .copy() so the column assignments below write to an independent frame.
df = df[keep_cols].copy()

# Formatting: left-pad the codes to their fixed widths. Blank cells stay
# blank — zfill on "" would otherwise fabricate "00000" / "000000" codes.
for code_col, width in (("security_code", 5), ("edinet_code", 6)):
    raw_codes = df[code_col]
    df[code_col] = raw_codes.str.zfill(width).where(raw_codes != "", "")

# corp_number goes through float -> int before padding to 13 digits
# (presumably because the CSV may hold it in float notation such as
# "1234567890123.0"); blanks stay blank.
df["corp_number"] = df["corp_number"].apply(
    lambda x: str(int(float(x))).zfill(13) if x != "" else ""
)

# ----------- (3) Open the DB connection & overwrite company_id ----------
conn = get_connection()
cur = conn.cursor()
# Project helper; the code after this cell section reads df["company_id"],
# so it is expected to return the frame with that column — TODO confirm how
# existing ids are matched.
df = overwrite_company_ids_if_exists(df, cur)


# ----------- ④ UUIDバリデーション & 重複チェック ----------
def is_valid_uuid(val: str) -> bool:
    """Return True when ``str(val)`` can be parsed by ``uuid.UUID``.

    Any spelling ``uuid.UUID`` accepts counts as valid (with or without
    hyphens, braces, ``urn:uuid:`` prefix); anything else, such as ``""`` or
    ``"nan"``, yields False.
    """
    try:
        uuid.UUID(str(val))
    except ValueError:
        return False
    else:
        return True


# SQL for the two upserts, kept outside the try block for readability.
companies_upsert_sql = """
    INSERT INTO companies (company_id, corp_number, edinet_code, security_code)
    VALUES %s
    ON CONFLICT (company_id) DO UPDATE
    SET corp_number = EXCLUDED.corp_number,
        edinet_code = EXCLUDED.edinet_code,
        security_code = EXCLUDED.security_code;
"""
profiles_upsert_sql = """
    INSERT INTO company_profiles (
        company_id, year, name_ja, name_en,
        sector_17, sector_33, scale_category, market_code, address
    ) VALUES %s
    ON CONFLICT (company_id, year) DO UPDATE
    SET name_ja = EXCLUDED.name_ja,
        name_en = EXCLUDED.name_en,
        sector_17 = EXCLUDED.sector_17,
        sector_33 = EXCLUDED.sector_33,
        scale_category = EXCLUDED.scale_category,
        market_code = EXCLUDED.market_code,
        address = EXCLUDED.address;
"""

# Everything that uses the open connection runs inside try/finally so a
# failure cannot leave the cursor/connection (and its transaction) dangling.
try:
    df["company_id"] = df["company_id"].astype(str)
    valid_mask = df["company_id"].apply(is_valid_uuid)

    # Report rows whose company_id is not a parseable UUID.
    invalid_df = df[~valid_mask]
    if not invalid_df.empty:
        print("🟥 無効な company_id を持つレコード:")
        display(invalid_df)

    # Report company_ids that occur more than once in this year's file.
    # NOTE(review): duplicates are only shown, not removed. PostgreSQL rejects
    # an ON CONFLICT DO UPDATE statement that touches the same key twice, so
    # clean them up in the CSV if the upsert below fails.
    duplicates = df[valid_mask].duplicated(subset=["company_id"], keep=False)
    if duplicates.any():
        print("⚠️ company_id が重複しているレコード（同一年度内）:")
        display(df[valid_mask][duplicates])

    # Keep only the rows with a valid UUID.
    df = df[valid_mask].copy()

    print("🔍 company_idが有効な件数:", len(df))
    print("✅ ユニークな company_id 件数:", df["company_id"].nunique())

    # ----------- (5) Upsert into companies first (needed for the foreign key) ----------
    company_cols = ["company_id", "corp_number", "edinet_code", "security_code"]
    company_records = list(df[company_cols].itertuples(index=False, name=None))
    execute_values(cur, companies_upsert_sql, company_records)

    # ----------- (6) Insert or update company_profiles ----------
    profile_cols = [
        "company_id",
        "name_ja",
        "name_en",
        "sector_17",
        "sector_33",
        "scale_category",
        "market_code",
        "address",
    ]
    profile_records = [
        (cid, YEAR, name_ja, name_en, s17, s33, scale, market.zfill(3), address)
        for cid, name_ja, name_en, s17, s33, scale, market, address in df[
            profile_cols
        ].itertuples(index=False, name=None)
    ]
    execute_values(cur, profiles_upsert_sql, profile_records)

    # ----------- (7) Commit ----------
    conn.commit()
except Exception:
    # Discard the partial write before propagating the error.
    conn.rollback()
    raise
finally:
    cur.close()
    conn.close()

print(f"✅ {len(df)} 件のデータを {YEAR} 年分として挿入・更新しました。")

In [None]:
## 2018
# Build the 2018 table from the TOPIX snapshot CSV and the EDINET code list
# with the project helper (its combining logic lives in scripts.preprocess),
# then save it under data/processed without the row index.
from scripts.preprocess import load_topix_and_edinet

df_2018 = load_topix_and_edinet(
    "data/raw/topix_growth_snapshots/2018_topix_growth.csv",
    "data/raw/EdinetcodeDlInfo.csv",
)
df_2018.to_csv("data/processed/topix_companies_2018.csv", index=False)

In [None]:
## ここで手動で補完を行う

In [None]:
import pandas as pd
import uuid
from psycopg2.extras import execute_values
from db.connection import get_connection
from scripts.preprocess import overwrite_company_ids_if_exists

# ----------- (1) Target year & input path ----------
YEAR = 2018
CSV_PATH = f"data/processed/topix_companies_{YEAR}.csv"

# ----------- (2) Load the CSV & unify column names ----------
# Everything is read as text (so codes keep any leading zeros) and missing
# cells become "" instead of NaN.
df = pd.read_csv(CSV_PATH, dtype=str).fillna("")

# Source headers -> column names used in the rest of this cell. rename()
# ignores keys that are absent from the file.
column_map = {
    "提出者名": "name_ja",
    "提出者法人番号": "corp_number",
    "ＥＤＩＮＥＴコード": "edinet_code",
    "Code": "security_code",
    "提出者業種": "sector_33",
    "Sector17CodeName": "sector_17",
    "ScaleCategory": "scale_category",
    "MarketCode": "market_code",
    "提出者名（英字）": "name_en",
    "所在地": "address",
}
df = df.rename(columns=column_map)

keep_cols = [
    "security_code",
    "name_ja",
    "name_en",
    "sector_17",
    "sector_33",
    "scale_category",
    "market_code",
    "edinet_code",
    "address",
    "corp_number",
]
# .copy() so the column assignments below write to an independent frame.
df = df[keep_cols].copy()

# Formatting: left-pad the codes to their fixed widths. Blank cells stay
# blank — zfill on "" would otherwise fabricate "00000" / "000000" codes.
for code_col, width in (("security_code", 5), ("edinet_code", 6)):
    raw_codes = df[code_col]
    df[code_col] = raw_codes.str.zfill(width).where(raw_codes != "", "")

# corp_number goes through float -> int before padding to 13 digits
# (presumably because the CSV may hold it in float notation such as
# "1234567890123.0"); blanks stay blank.
df["corp_number"] = df["corp_number"].apply(
    lambda x: str(int(float(x))).zfill(13) if x != "" else ""
)

# ----------- (3) Open the DB connection & overwrite company_id ----------
conn = get_connection()
cur = conn.cursor()
# Project helper; the code after this cell section reads df["company_id"],
# so it is expected to return the frame with that column — TODO confirm how
# existing ids are matched.
df = overwrite_company_ids_if_exists(df, cur)


# ----------- ④ UUIDバリデーション & 重複チェック ----------
def is_valid_uuid(val: str) -> bool:
    """Return True when ``str(val)`` can be parsed by ``uuid.UUID``.

    Any spelling ``uuid.UUID`` accepts counts as valid (with or without
    hyphens, braces, ``urn:uuid:`` prefix); anything else, such as ``""`` or
    ``"nan"``, yields False.
    """
    try:
        uuid.UUID(str(val))
    except ValueError:
        return False
    else:
        return True


# SQL for the two upserts, kept outside the try block for readability.
companies_upsert_sql = """
    INSERT INTO companies (company_id, corp_number, edinet_code, security_code)
    VALUES %s
    ON CONFLICT (company_id) DO UPDATE
    SET corp_number = EXCLUDED.corp_number,
        edinet_code = EXCLUDED.edinet_code,
        security_code = EXCLUDED.security_code;
"""
profiles_upsert_sql = """
    INSERT INTO company_profiles (
        company_id, year, name_ja, name_en,
        sector_17, sector_33, scale_category, market_code, address
    ) VALUES %s
    ON CONFLICT (company_id, year) DO UPDATE
    SET name_ja = EXCLUDED.name_ja,
        name_en = EXCLUDED.name_en,
        sector_17 = EXCLUDED.sector_17,
        sector_33 = EXCLUDED.sector_33,
        scale_category = EXCLUDED.scale_category,
        market_code = EXCLUDED.market_code,
        address = EXCLUDED.address;
"""

# Everything that uses the open connection runs inside try/finally so a
# failure cannot leave the cursor/connection (and its transaction) dangling.
try:
    df["company_id"] = df["company_id"].astype(str)
    valid_mask = df["company_id"].apply(is_valid_uuid)

    # Report rows whose company_id is not a parseable UUID.
    invalid_df = df[~valid_mask]
    if not invalid_df.empty:
        print("🟥 無効な company_id を持つレコード:")
        display(invalid_df)

    # Report company_ids that occur more than once in this year's file.
    # NOTE(review): duplicates are only shown, not removed. PostgreSQL rejects
    # an ON CONFLICT DO UPDATE statement that touches the same key twice, so
    # clean them up in the CSV if the upsert below fails.
    duplicates = df[valid_mask].duplicated(subset=["company_id"], keep=False)
    if duplicates.any():
        print("⚠️ company_id が重複しているレコード（同一年度内）:")
        display(df[valid_mask][duplicates])

    # Keep only the rows with a valid UUID.
    df = df[valid_mask].copy()

    print("🔍 company_idが有効な件数:", len(df))
    print("✅ ユニークな company_id 件数:", df["company_id"].nunique())

    # ----------- (5) Upsert into companies first (needed for the foreign key) ----------
    company_cols = ["company_id", "corp_number", "edinet_code", "security_code"]
    company_records = list(df[company_cols].itertuples(index=False, name=None))
    execute_values(cur, companies_upsert_sql, company_records)

    # ----------- (6) Insert or update company_profiles ----------
    profile_cols = [
        "company_id",
        "name_ja",
        "name_en",
        "sector_17",
        "sector_33",
        "scale_category",
        "market_code",
        "address",
    ]
    profile_records = [
        (cid, YEAR, name_ja, name_en, s17, s33, scale, market.zfill(3), address)
        for cid, name_ja, name_en, s17, s33, scale, market, address in df[
            profile_cols
        ].itertuples(index=False, name=None)
    ]
    execute_values(cur, profiles_upsert_sql, profile_records)

    # ----------- (7) Commit ----------
    conn.commit()
except Exception:
    # Discard the partial write before propagating the error.
    conn.rollback()
    raise
finally:
    cur.close()
    conn.close()

print(f"✅ {len(df)} 件のデータを {YEAR} 年分として挿入・更新しました。")

In [None]:
### 2019
# Build the 2019 table from the TOPIX snapshot CSV and the EDINET code list
# with the project helper (its combining logic lives in scripts.preprocess),
# then save it under data/processed without the row index.
from scripts.preprocess import load_topix_and_edinet

df_2019 = load_topix_and_edinet(
    "data/raw/topix_growth_snapshots/2019_topix_growth.csv",
    "data/raw/EdinetcodeDlInfo.csv",
)
df_2019.to_csv("data/processed/topix_companies_2019.csv", index=False)

In [None]:
## 手動で補完、LINE yahooも処理必要

In [None]:
import pandas as pd
import uuid
from psycopg2.extras import execute_values
from db.connection import get_connection
from scripts.preprocess import overwrite_company_ids_if_exists

# ----------- (1) Target year & input path ----------
YEAR = 2019
CSV_PATH = f"data/processed/topix_companies_{YEAR}.csv"

# ----------- (2) Load the CSV & unify column names ----------
# Everything is read as text (so codes keep any leading zeros) and missing
# cells become "" instead of NaN.
df = pd.read_csv(CSV_PATH, dtype=str).fillna("")

# Source headers -> column names used in the rest of this cell. rename()
# ignores keys that are absent from the file.
column_map = {
    "提出者名": "name_ja",
    "提出者法人番号": "corp_number",
    "ＥＤＩＮＥＴコード": "edinet_code",
    "Code": "security_code",
    "提出者業種": "sector_33",
    "Sector17CodeName": "sector_17",
    "ScaleCategory": "scale_category",
    "MarketCode": "market_code",
    "提出者名（英字）": "name_en",
    "所在地": "address",
}
df = df.rename(columns=column_map)

keep_cols = [
    "security_code",
    "name_ja",
    "name_en",
    "sector_17",
    "sector_33",
    "scale_category",
    "market_code",
    "edinet_code",
    "address",
    "corp_number",
]
# .copy() so the column assignments below write to an independent frame.
df = df[keep_cols].copy()

# Formatting: left-pad the codes to their fixed widths. Blank cells stay
# blank — zfill on "" would otherwise fabricate "00000" / "000000" codes.
for code_col, width in (("security_code", 5), ("edinet_code", 6)):
    raw_codes = df[code_col]
    df[code_col] = raw_codes.str.zfill(width).where(raw_codes != "", "")

# corp_number goes through float -> int before padding to 13 digits
# (presumably because the CSV may hold it in float notation such as
# "1234567890123.0"); blanks stay blank.
df["corp_number"] = df["corp_number"].apply(
    lambda x: str(int(float(x))).zfill(13) if x != "" else ""
)

# ----------- (3) Open the DB connection & overwrite company_id ----------
conn = get_connection()
cur = conn.cursor()
# Project helper; the code after this cell section reads df["company_id"],
# so it is expected to return the frame with that column — TODO confirm how
# existing ids are matched.
df = overwrite_company_ids_if_exists(df, cur)


# ----------- ④ UUIDバリデーション & 重複チェック ----------
def is_valid_uuid(val: str) -> bool:
    """Return True when ``str(val)`` can be parsed by ``uuid.UUID``.

    Any spelling ``uuid.UUID`` accepts counts as valid (with or without
    hyphens, braces, ``urn:uuid:`` prefix); anything else, such as ``""`` or
    ``"nan"``, yields False.
    """
    try:
        uuid.UUID(str(val))
    except ValueError:
        return False
    else:
        return True


# SQL for the two upserts, kept outside the try block for readability.
companies_upsert_sql = """
    INSERT INTO companies (company_id, corp_number, edinet_code, security_code)
    VALUES %s
    ON CONFLICT (company_id) DO UPDATE
    SET corp_number = EXCLUDED.corp_number,
        edinet_code = EXCLUDED.edinet_code,
        security_code = EXCLUDED.security_code;
"""
profiles_upsert_sql = """
    INSERT INTO company_profiles (
        company_id, year, name_ja, name_en,
        sector_17, sector_33, scale_category, market_code, address
    ) VALUES %s
    ON CONFLICT (company_id, year) DO UPDATE
    SET name_ja = EXCLUDED.name_ja,
        name_en = EXCLUDED.name_en,
        sector_17 = EXCLUDED.sector_17,
        sector_33 = EXCLUDED.sector_33,
        scale_category = EXCLUDED.scale_category,
        market_code = EXCLUDED.market_code,
        address = EXCLUDED.address;
"""

# Everything that uses the open connection runs inside try/finally so a
# failure cannot leave the cursor/connection (and its transaction) dangling.
try:
    df["company_id"] = df["company_id"].astype(str)
    valid_mask = df["company_id"].apply(is_valid_uuid)

    # Report rows whose company_id is not a parseable UUID.
    invalid_df = df[~valid_mask]
    if not invalid_df.empty:
        print("🟥 無効な company_id を持つレコード:")
        display(invalid_df)

    # Report company_ids that occur more than once in this year's file.
    # NOTE(review): duplicates are only shown, not removed. PostgreSQL rejects
    # an ON CONFLICT DO UPDATE statement that touches the same key twice, so
    # clean them up in the CSV if the upsert below fails.
    duplicates = df[valid_mask].duplicated(subset=["company_id"], keep=False)
    if duplicates.any():
        print("⚠️ company_id が重複しているレコード（同一年度内）:")
        display(df[valid_mask][duplicates])

    # Keep only the rows with a valid UUID.
    df = df[valid_mask].copy()

    print("🔍 company_idが有効な件数:", len(df))
    print("✅ ユニークな company_id 件数:", df["company_id"].nunique())

    # ----------- (5) Upsert into companies first (needed for the foreign key) ----------
    company_cols = ["company_id", "corp_number", "edinet_code", "security_code"]
    company_records = list(df[company_cols].itertuples(index=False, name=None))
    execute_values(cur, companies_upsert_sql, company_records)

    # ----------- (6) Insert or update company_profiles ----------
    profile_cols = [
        "company_id",
        "name_ja",
        "name_en",
        "sector_17",
        "sector_33",
        "scale_category",
        "market_code",
        "address",
    ]
    profile_records = [
        (cid, YEAR, name_ja, name_en, s17, s33, scale, market.zfill(3), address)
        for cid, name_ja, name_en, s17, s33, scale, market, address in df[
            profile_cols
        ].itertuples(index=False, name=None)
    ]
    execute_values(cur, profiles_upsert_sql, profile_records)

    # ----------- (7) Commit ----------
    conn.commit()
except Exception:
    # Discard the partial write before propagating the error.
    conn.rollback()
    raise
finally:
    cur.close()
    conn.close()

print(f"✅ {len(df)} 件のデータを {YEAR} 年分として挿入・更新しました。")

In [None]:
### 2020
# Build the 2020 table from the TOPIX snapshot CSV and the EDINET code list
# with the project helper (its combining logic lives in scripts.preprocess),
# then save it under data/processed without the row index.
from scripts.preprocess import load_topix_and_edinet

df_2020 = load_topix_and_edinet(
    "data/raw/topix_growth_snapshots/2020_topix_growth.csv",
    "data/raw/EdinetcodeDlInfo.csv",
)
df_2020.to_csv("data/processed/topix_companies_2020.csv", index=False)

In [None]:
## 手動で

In [None]:
import pandas as pd
import uuid
from psycopg2.extras import execute_values
from db.connection import get_connection
from scripts.preprocess import overwrite_company_ids_if_exists

# ----------- (1) Target year & input path ----------
YEAR = 2020
CSV_PATH = f"data/processed/topix_companies_{YEAR}.csv"

# ----------- (2) Load the CSV & unify column names ----------
# Everything is read as text (so codes keep any leading zeros) and missing
# cells become "" instead of NaN.
df = pd.read_csv(CSV_PATH, dtype=str).fillna("")

# Source headers -> column names used in the rest of this cell. rename()
# ignores keys that are absent from the file.
column_map = {
    "提出者名": "name_ja",
    "提出者法人番号": "corp_number",
    "ＥＤＩＮＥＴコード": "edinet_code",
    "Code": "security_code",
    "提出者業種": "sector_33",
    "Sector17CodeName": "sector_17",
    "ScaleCategory": "scale_category",
    "MarketCode": "market_code",
    "提出者名（英字）": "name_en",
    "所在地": "address",
}
df = df.rename(columns=column_map)

keep_cols = [
    "security_code",
    "name_ja",
    "name_en",
    "sector_17",
    "sector_33",
    "scale_category",
    "market_code",
    "edinet_code",
    "address",
    "corp_number",
]
# .copy() so the column assignments below write to an independent frame.
df = df[keep_cols].copy()

# Formatting: left-pad the codes to their fixed widths. Blank cells stay
# blank — zfill on "" would otherwise fabricate "00000" / "000000" codes.
for code_col, width in (("security_code", 5), ("edinet_code", 6)):
    raw_codes = df[code_col]
    df[code_col] = raw_codes.str.zfill(width).where(raw_codes != "", "")

# corp_number goes through float -> int before padding to 13 digits
# (presumably because the CSV may hold it in float notation such as
# "1234567890123.0"); blanks stay blank.
df["corp_number"] = df["corp_number"].apply(
    lambda x: str(int(float(x))).zfill(13) if x != "" else ""
)

# ----------- (3) Open the DB connection & overwrite company_id ----------
conn = get_connection()
cur = conn.cursor()
# Project helper; the code after this cell section reads df["company_id"],
# so it is expected to return the frame with that column — TODO confirm how
# existing ids are matched.
df = overwrite_company_ids_if_exists(df, cur)


# ----------- ④ UUIDバリデーション & 重複チェック ----------
def is_valid_uuid(val: str) -> bool:
    """Return True when ``str(val)`` can be parsed by ``uuid.UUID``.

    Any spelling ``uuid.UUID`` accepts counts as valid (with or without
    hyphens, braces, ``urn:uuid:`` prefix); anything else, such as ``""`` or
    ``"nan"``, yields False.
    """
    try:
        uuid.UUID(str(val))
    except ValueError:
        return False
    else:
        return True


# SQL for the two upserts, kept outside the try block for readability.
companies_upsert_sql = """
    INSERT INTO companies (company_id, corp_number, edinet_code, security_code)
    VALUES %s
    ON CONFLICT (company_id) DO UPDATE
    SET corp_number = EXCLUDED.corp_number,
        edinet_code = EXCLUDED.edinet_code,
        security_code = EXCLUDED.security_code;
"""
profiles_upsert_sql = """
    INSERT INTO company_profiles (
        company_id, year, name_ja, name_en,
        sector_17, sector_33, scale_category, market_code, address
    ) VALUES %s
    ON CONFLICT (company_id, year) DO UPDATE
    SET name_ja = EXCLUDED.name_ja,
        name_en = EXCLUDED.name_en,
        sector_17 = EXCLUDED.sector_17,
        sector_33 = EXCLUDED.sector_33,
        scale_category = EXCLUDED.scale_category,
        market_code = EXCLUDED.market_code,
        address = EXCLUDED.address;
"""

# Everything that uses the open connection runs inside try/finally so a
# failure cannot leave the cursor/connection (and its transaction) dangling.
try:
    df["company_id"] = df["company_id"].astype(str)
    valid_mask = df["company_id"].apply(is_valid_uuid)

    # Report rows whose company_id is not a parseable UUID.
    invalid_df = df[~valid_mask]
    if not invalid_df.empty:
        print("🟥 無効な company_id を持つレコード:")
        display(invalid_df)

    # Report company_ids that occur more than once in this year's file.
    # NOTE(review): duplicates are only shown, not removed. PostgreSQL rejects
    # an ON CONFLICT DO UPDATE statement that touches the same key twice, so
    # clean them up in the CSV if the upsert below fails.
    duplicates = df[valid_mask].duplicated(subset=["company_id"], keep=False)
    if duplicates.any():
        print("⚠️ company_id が重複しているレコード（同一年度内）:")
        display(df[valid_mask][duplicates])

    # Keep only the rows with a valid UUID.
    df = df[valid_mask].copy()

    print("🔍 company_idが有効な件数:", len(df))
    print("✅ ユニークな company_id 件数:", df["company_id"].nunique())

    # ----------- (5) Upsert into companies first (needed for the foreign key) ----------
    company_cols = ["company_id", "corp_number", "edinet_code", "security_code"]
    company_records = list(df[company_cols].itertuples(index=False, name=None))
    execute_values(cur, companies_upsert_sql, company_records)

    # ----------- (6) Insert or update company_profiles ----------
    profile_cols = [
        "company_id",
        "name_ja",
        "name_en",
        "sector_17",
        "sector_33",
        "scale_category",
        "market_code",
        "address",
    ]
    profile_records = [
        (cid, YEAR, name_ja, name_en, s17, s33, scale, market.zfill(3), address)
        for cid, name_ja, name_en, s17, s33, scale, market, address in df[
            profile_cols
        ].itertuples(index=False, name=None)
    ]
    execute_values(cur, profiles_upsert_sql, profile_records)

    # ----------- (7) Commit ----------
    conn.commit()
except Exception:
    # Discard the partial write before propagating the error.
    conn.rollback()
    raise
finally:
    cur.close()
    conn.close()

print(f"✅ {len(df)} 件のデータを {YEAR} 年分として挿入・更新しました。")

In [None]:
### 2021
# Build the 2021 table from the TOPIX snapshot CSV and the EDINET code list
# with the project helper (its combining logic lives in scripts.preprocess),
# then save it under data/processed without the row index.
from scripts.preprocess import load_topix_and_edinet

df_2021 = load_topix_and_edinet(
    "data/raw/topix_growth_snapshots/2021_topix_growth.csv",
    "data/raw/EdinetcodeDlInfo.csv",
)
df_2021.to_csv("data/processed/topix_companies_2021.csv", index=False)

In [None]:
# 手動処理

In [None]:
import pandas as pd
import uuid
from psycopg2.extras import execute_values
from db.connection import get_connection
from scripts.preprocess import overwrite_company_ids_if_exists

# ----------- (1) Target year & input path ----------
YEAR = 2021
CSV_PATH = f"data/processed/topix_companies_{YEAR}.csv"

# ----------- (2) Load the CSV & unify column names ----------
# Everything is read as text (so codes keep any leading zeros) and missing
# cells become "" instead of NaN.
df = pd.read_csv(CSV_PATH, dtype=str).fillna("")

# Source headers -> column names used in the rest of this cell. rename()
# ignores keys that are absent from the file.
column_map = {
    "提出者名": "name_ja",
    "提出者法人番号": "corp_number",
    "ＥＤＩＮＥＴコード": "edinet_code",
    "Code": "security_code",
    "提出者業種": "sector_33",
    "Sector17CodeName": "sector_17",
    "ScaleCategory": "scale_category",
    "MarketCode": "market_code",
    "提出者名（英字）": "name_en",
    "所在地": "address",
}
df = df.rename(columns=column_map)

keep_cols = [
    "security_code",
    "name_ja",
    "name_en",
    "sector_17",
    "sector_33",
    "scale_category",
    "market_code",
    "edinet_code",
    "address",
    "corp_number",
]
# .copy() so the column assignments below write to an independent frame.
df = df[keep_cols].copy()

# Formatting: left-pad the codes to their fixed widths. Blank cells stay
# blank — zfill on "" would otherwise fabricate "00000" / "000000" codes.
for code_col, width in (("security_code", 5), ("edinet_code", 6)):
    raw_codes = df[code_col]
    df[code_col] = raw_codes.str.zfill(width).where(raw_codes != "", "")

# corp_number goes through float -> int before padding to 13 digits
# (presumably because the CSV may hold it in float notation such as
# "1234567890123.0"); blanks stay blank.
df["corp_number"] = df["corp_number"].apply(
    lambda x: str(int(float(x))).zfill(13) if x != "" else ""
)

# ----------- (3) Open the DB connection & overwrite company_id ----------
conn = get_connection()
cur = conn.cursor()
# Project helper; the code after this cell section reads df["company_id"],
# so it is expected to return the frame with that column — TODO confirm how
# existing ids are matched.
df = overwrite_company_ids_if_exists(df, cur)


# ----------- ④ UUIDバリデーション & 重複チェック ----------
def is_valid_uuid(val: str) -> bool:
    """Return True when ``str(val)`` can be parsed by ``uuid.UUID``.

    Any spelling ``uuid.UUID`` accepts counts as valid (with or without
    hyphens, braces, ``urn:uuid:`` prefix); anything else, such as ``""`` or
    ``"nan"``, yields False.
    """
    try:
        uuid.UUID(str(val))
    except ValueError:
        return False
    else:
        return True


# SQL for the two upserts, kept outside the try block for readability.
companies_upsert_sql = """
    INSERT INTO companies (company_id, corp_number, edinet_code, security_code)
    VALUES %s
    ON CONFLICT (company_id) DO UPDATE
    SET corp_number = EXCLUDED.corp_number,
        edinet_code = EXCLUDED.edinet_code,
        security_code = EXCLUDED.security_code;
"""
profiles_upsert_sql = """
    INSERT INTO company_profiles (
        company_id, year, name_ja, name_en,
        sector_17, sector_33, scale_category, market_code, address
    ) VALUES %s
    ON CONFLICT (company_id, year) DO UPDATE
    SET name_ja = EXCLUDED.name_ja,
        name_en = EXCLUDED.name_en,
        sector_17 = EXCLUDED.sector_17,
        sector_33 = EXCLUDED.sector_33,
        scale_category = EXCLUDED.scale_category,
        market_code = EXCLUDED.market_code,
        address = EXCLUDED.address;
"""

# Everything that uses the open connection runs inside try/finally so a
# failure cannot leave the cursor/connection (and its transaction) dangling.
try:
    df["company_id"] = df["company_id"].astype(str)
    valid_mask = df["company_id"].apply(is_valid_uuid)

    # Report rows whose company_id is not a parseable UUID.
    invalid_df = df[~valid_mask]
    if not invalid_df.empty:
        print("🟥 無効な company_id を持つレコード:")
        display(invalid_df)

    # Report company_ids that occur more than once in this year's file.
    # NOTE(review): duplicates are only shown, not removed. PostgreSQL rejects
    # an ON CONFLICT DO UPDATE statement that touches the same key twice, so
    # clean them up in the CSV if the upsert below fails.
    duplicates = df[valid_mask].duplicated(subset=["company_id"], keep=False)
    if duplicates.any():
        print("⚠️ company_id が重複しているレコード（同一年度内）:")
        display(df[valid_mask][duplicates])

    # Keep only the rows with a valid UUID.
    df = df[valid_mask].copy()

    print("🔍 company_idが有効な件数:", len(df))
    print("✅ ユニークな company_id 件数:", df["company_id"].nunique())

    # ----------- (5) Upsert into companies first (needed for the foreign key) ----------
    company_cols = ["company_id", "corp_number", "edinet_code", "security_code"]
    company_records = list(df[company_cols].itertuples(index=False, name=None))
    execute_values(cur, companies_upsert_sql, company_records)

    # ----------- (6) Insert or update company_profiles ----------
    profile_cols = [
        "company_id",
        "name_ja",
        "name_en",
        "sector_17",
        "sector_33",
        "scale_category",
        "market_code",
        "address",
    ]
    profile_records = [
        (cid, YEAR, name_ja, name_en, s17, s33, scale, market.zfill(3), address)
        for cid, name_ja, name_en, s17, s33, scale, market, address in df[
            profile_cols
        ].itertuples(index=False, name=None)
    ]
    execute_values(cur, profiles_upsert_sql, profile_records)

    # ----------- (7) Commit ----------
    conn.commit()
except Exception:
    # Discard the partial write before propagating the error.
    conn.rollback()
    raise
finally:
    cur.close()
    conn.close()

print(f"✅ {len(df)} 件のデータを {YEAR} 年分として挿入・更新しました。")

In [None]:
## 2022
# Build the 2022 table from the TOPIX snapshot CSV and the EDINET code list
# with the project helper (its combining logic lives in scripts.preprocess),
# then save it under data/processed without the row index.
from scripts.preprocess import load_topix_and_edinet

df_2022 = load_topix_and_edinet(
    "data/raw/topix_growth_snapshots/2022_topix_growth.csv",
    "data/raw/EdinetcodeDlInfo.csv",
)
df_2022.to_csv("data/processed/topix_companies_2022.csv", index=False)

In [None]:
## 手動処理

In [None]:
import pandas as pd
import uuid
from psycopg2.extras import execute_values
from db.connection import get_connection
from scripts.preprocess import overwrite_company_ids_if_exists

# ----------- (1) Target year & input path ----------
YEAR = 2022
CSV_PATH = f"data/processed/topix_companies_{YEAR}.csv"

# ----------- (2) Load the CSV & unify column names ----------
# Everything is read as text (so codes keep any leading zeros) and missing
# cells become "" instead of NaN.
df = pd.read_csv(CSV_PATH, dtype=str).fillna("")

# Source headers -> column names used in the rest of this cell. rename()
# ignores keys that are absent from the file.
column_map = {
    "提出者名": "name_ja",
    "提出者法人番号": "corp_number",
    "ＥＤＩＮＥＴコード": "edinet_code",
    "Code": "security_code",
    "提出者業種": "sector_33",
    "Sector17CodeName": "sector_17",
    "ScaleCategory": "scale_category",
    "MarketCode": "market_code",
    "提出者名（英字）": "name_en",
    "所在地": "address",
}
df = df.rename(columns=column_map)

keep_cols = [
    "security_code",
    "name_ja",
    "name_en",
    "sector_17",
    "sector_33",
    "scale_category",
    "market_code",
    "edinet_code",
    "address",
    "corp_number",
]
# .copy() so the column assignments below write to an independent frame.
df = df[keep_cols].copy()

# Formatting: left-pad the codes to their fixed widths. Blank cells stay
# blank — zfill on "" would otherwise fabricate "00000" / "000000" codes.
for code_col, width in (("security_code", 5), ("edinet_code", 6)):
    raw_codes = df[code_col]
    df[code_col] = raw_codes.str.zfill(width).where(raw_codes != "", "")

# corp_number goes through float -> int before padding to 13 digits
# (presumably because the CSV may hold it in float notation such as
# "1234567890123.0"); blanks stay blank.
df["corp_number"] = df["corp_number"].apply(
    lambda x: str(int(float(x))).zfill(13) if x != "" else ""
)

# ----------- (3) Open the DB connection & overwrite company_id ----------
conn = get_connection()
cur = conn.cursor()
# Project helper; the code after this cell section reads df["company_id"],
# so it is expected to return the frame with that column — TODO confirm how
# existing ids are matched.
df = overwrite_company_ids_if_exists(df, cur)


# ----------- ④ UUIDバリデーション & 重複チェック ----------
def is_valid_uuid(val: str) -> bool:
    """Return True when ``str(val)`` can be parsed by ``uuid.UUID``.

    Any spelling ``uuid.UUID`` accepts counts as valid (with or without
    hyphens, braces, ``urn:uuid:`` prefix); anything else, such as ``""`` or
    ``"nan"``, yields False.
    """
    try:
        uuid.UUID(str(val))
    except ValueError:
        return False
    else:
        return True


# SQL for the two upserts, kept outside the try block for readability.
companies_upsert_sql = """
    INSERT INTO companies (company_id, corp_number, edinet_code, security_code)
    VALUES %s
    ON CONFLICT (company_id) DO UPDATE
    SET corp_number = EXCLUDED.corp_number,
        edinet_code = EXCLUDED.edinet_code,
        security_code = EXCLUDED.security_code;
"""
profiles_upsert_sql = """
    INSERT INTO company_profiles (
        company_id, year, name_ja, name_en,
        sector_17, sector_33, scale_category, market_code, address
    ) VALUES %s
    ON CONFLICT (company_id, year) DO UPDATE
    SET name_ja = EXCLUDED.name_ja,
        name_en = EXCLUDED.name_en,
        sector_17 = EXCLUDED.sector_17,
        sector_33 = EXCLUDED.sector_33,
        scale_category = EXCLUDED.scale_category,
        market_code = EXCLUDED.market_code,
        address = EXCLUDED.address;
"""

# Everything that uses the open connection runs inside try/finally so a
# failure cannot leave the cursor/connection (and its transaction) dangling.
try:
    df["company_id"] = df["company_id"].astype(str)
    valid_mask = df["company_id"].apply(is_valid_uuid)

    # Report rows whose company_id is not a parseable UUID.
    invalid_df = df[~valid_mask]
    if not invalid_df.empty:
        print("🟥 無効な company_id を持つレコード:")
        display(invalid_df)

    # Report company_ids that occur more than once in this year's file.
    # NOTE(review): duplicates are only shown, not removed. PostgreSQL rejects
    # an ON CONFLICT DO UPDATE statement that touches the same key twice, so
    # clean them up in the CSV if the upsert below fails.
    duplicates = df[valid_mask].duplicated(subset=["company_id"], keep=False)
    if duplicates.any():
        print("⚠️ company_id が重複しているレコード（同一年度内）:")
        display(df[valid_mask][duplicates])

    # Keep only the rows with a valid UUID.
    df = df[valid_mask].copy()

    print("🔍 company_idが有効な件数:", len(df))
    print("✅ ユニークな company_id 件数:", df["company_id"].nunique())

    # ----------- (5) Upsert into companies first (needed for the foreign key) ----------
    company_cols = ["company_id", "corp_number", "edinet_code", "security_code"]
    company_records = list(df[company_cols].itertuples(index=False, name=None))
    execute_values(cur, companies_upsert_sql, company_records)

    # ----------- (6) Insert or update company_profiles ----------
    profile_cols = [
        "company_id",
        "name_ja",
        "name_en",
        "sector_17",
        "sector_33",
        "scale_category",
        "market_code",
        "address",
    ]
    profile_records = [
        (cid, YEAR, name_ja, name_en, s17, s33, scale, market.zfill(3), address)
        for cid, name_ja, name_en, s17, s33, scale, market, address in df[
            profile_cols
        ].itertuples(index=False, name=None)
    ]
    execute_values(cur, profiles_upsert_sql, profile_records)

    # ----------- (7) Commit ----------
    conn.commit()
except Exception:
    # Discard the partial write before propagating the error.
    conn.rollback()
    raise
finally:
    cur.close()
    conn.close()

print(f"✅ {len(df)} 件のデータを {YEAR} 年分として挿入・更新しました。")

In [None]:
## 2023
# Build the 2023 table from the TOPIX snapshot CSV and the EDINET code list
# with the project helper (its combining logic lives in scripts.preprocess),
# then save it under data/processed without the row index.
from scripts.preprocess import load_topix_and_edinet

df_2023 = load_topix_and_edinet(
    "data/raw/topix_growth_snapshots/2023_topix_growth.csv",
    "data/raw/EdinetcodeDlInfo.csv",
)
df_2023.to_csv("data/processed/topix_companies_2023.csv", index=False)

In [None]:
## 手動

In [None]:
import pandas as pd
import uuid
from psycopg2.extras import execute_values
from db.connection import get_connection
from scripts.preprocess import overwrite_company_ids_if_exists

# ----------- (1) Target year & input path ----------
YEAR = 2023
CSV_PATH = f"data/processed/topix_companies_{YEAR}.csv"

# ----------- (2) Load the CSV & unify column names ----------
# Everything is read as text (so codes keep any leading zeros) and missing
# cells become "" instead of NaN.
df = pd.read_csv(CSV_PATH, dtype=str).fillna("")

# Source headers -> column names used in the rest of this cell. rename()
# ignores keys that are absent from the file.
column_map = {
    "提出者名": "name_ja",
    "提出者法人番号": "corp_number",
    "ＥＤＩＮＥＴコード": "edinet_code",
    "Code": "security_code",
    "提出者業種": "sector_33",
    "Sector17CodeName": "sector_17",
    "ScaleCategory": "scale_category",
    "MarketCode": "market_code",
    "提出者名（英字）": "name_en",
    "所在地": "address",
}
df = df.rename(columns=column_map)

keep_cols = [
    "security_code",
    "name_ja",
    "name_en",
    "sector_17",
    "sector_33",
    "scale_category",
    "market_code",
    "edinet_code",
    "address",
    "corp_number",
]
# .copy() so the column assignments below write to an independent frame.
df = df[keep_cols].copy()

# Formatting: left-pad the codes to their fixed widths. Blank cells stay
# blank — zfill on "" would otherwise fabricate "00000" / "000000" codes.
for code_col, width in (("security_code", 5), ("edinet_code", 6)):
    raw_codes = df[code_col]
    df[code_col] = raw_codes.str.zfill(width).where(raw_codes != "", "")

# corp_number goes through float -> int before padding to 13 digits
# (presumably because the CSV may hold it in float notation such as
# "1234567890123.0"); blanks stay blank.
df["corp_number"] = df["corp_number"].apply(
    lambda x: str(int(float(x))).zfill(13) if x != "" else ""
)

# ----------- (3) Open the DB connection & overwrite company_id ----------
conn = get_connection()
cur = conn.cursor()
# Project helper; the code after this cell section reads df["company_id"],
# so it is expected to return the frame with that column — TODO confirm how
# existing ids are matched.
df = overwrite_company_ids_if_exists(df, cur)


# ----------- ④ UUIDバリデーション & 重複チェック ----------
def is_valid_uuid(val: str) -> bool:
    """Return True when ``str(val)`` can be parsed by ``uuid.UUID``.

    Any spelling ``uuid.UUID`` accepts counts as valid (with or without
    hyphens, braces, ``urn:uuid:`` prefix); anything else, such as ``""`` or
    ``"nan"``, yields False.
    """
    try:
        uuid.UUID(str(val))
    except ValueError:
        return False
    else:
        return True


# SQL for the two upserts, kept outside the try block for readability.
companies_upsert_sql = """
    INSERT INTO companies (company_id, corp_number, edinet_code, security_code)
    VALUES %s
    ON CONFLICT (company_id) DO UPDATE
    SET corp_number = EXCLUDED.corp_number,
        edinet_code = EXCLUDED.edinet_code,
        security_code = EXCLUDED.security_code;
"""
profiles_upsert_sql = """
    INSERT INTO company_profiles (
        company_id, year, name_ja, name_en,
        sector_17, sector_33, scale_category, market_code, address
    ) VALUES %s
    ON CONFLICT (company_id, year) DO UPDATE
    SET name_ja = EXCLUDED.name_ja,
        name_en = EXCLUDED.name_en,
        sector_17 = EXCLUDED.sector_17,
        sector_33 = EXCLUDED.sector_33,
        scale_category = EXCLUDED.scale_category,
        market_code = EXCLUDED.market_code,
        address = EXCLUDED.address;
"""

# Everything that uses the open connection runs inside try/finally so a
# failure cannot leave the cursor/connection (and its transaction) dangling.
try:
    df["company_id"] = df["company_id"].astype(str)
    valid_mask = df["company_id"].apply(is_valid_uuid)

    # Report rows whose company_id is not a parseable UUID.
    invalid_df = df[~valid_mask]
    if not invalid_df.empty:
        print("🟥 無効な company_id を持つレコード:")
        display(invalid_df)

    # Report company_ids that occur more than once in this year's file.
    # NOTE(review): duplicates are only shown, not removed. PostgreSQL rejects
    # an ON CONFLICT DO UPDATE statement that touches the same key twice, so
    # clean them up in the CSV if the upsert below fails.
    duplicates = df[valid_mask].duplicated(subset=["company_id"], keep=False)
    if duplicates.any():
        print("⚠️ company_id が重複しているレコード（同一年度内）:")
        display(df[valid_mask][duplicates])

    # Keep only the rows with a valid UUID.
    df = df[valid_mask].copy()

    print("🔍 company_idが有効な件数:", len(df))
    print("✅ ユニークな company_id 件数:", df["company_id"].nunique())

    # ----------- (5) Upsert into companies first (needed for the foreign key) ----------
    company_cols = ["company_id", "corp_number", "edinet_code", "security_code"]
    company_records = list(df[company_cols].itertuples(index=False, name=None))
    execute_values(cur, companies_upsert_sql, company_records)

    # ----------- (6) Insert or update company_profiles ----------
    profile_cols = [
        "company_id",
        "name_ja",
        "name_en",
        "sector_17",
        "sector_33",
        "scale_category",
        "market_code",
        "address",
    ]
    profile_records = [
        (cid, YEAR, name_ja, name_en, s17, s33, scale, market.zfill(3), address)
        for cid, name_ja, name_en, s17, s33, scale, market, address in df[
            profile_cols
        ].itertuples(index=False, name=None)
    ]
    execute_values(cur, profiles_upsert_sql, profile_records)

    # ----------- (7) Commit ----------
    conn.commit()
except Exception:
    # Discard the partial write before propagating the error.
    conn.rollback()
    raise
finally:
    cur.close()
    conn.close()

print(f"✅ {len(df)} 件のデータを {YEAR} 年分として挿入・更新しました。")

In [None]:
## 2024
# Build the 2024 table from the TOPIX snapshot CSV and the EDINET code list
# with the project helper (its combining logic lives in scripts.preprocess),
# then save it under data/processed without the row index.
from scripts.preprocess import load_topix_and_edinet

df_2024 = load_topix_and_edinet(
    "data/raw/topix_growth_snapshots/2024_topix_growth.csv",
    "data/raw/EdinetcodeDlInfo.csv",
)
df_2024.to_csv("data/processed/topix_companies_2024.csv", index=False)

In [None]:
## 手動で

In [None]:
import pandas as pd
import uuid
from psycopg2.extras import execute_values
from db.connection import get_connection
from scripts.preprocess import overwrite_company_ids_if_exists

# ----------- (1) Target year & input path ----------
YEAR = 2024
CSV_PATH = f"data/processed/topix_companies_{YEAR}.csv"

# ----------- (2) Load the CSV & unify column names ----------
# Everything is read as text (so codes keep any leading zeros) and missing
# cells become "" instead of NaN.
df = pd.read_csv(CSV_PATH, dtype=str).fillna("")

# Source headers -> column names used in the rest of this cell. rename()
# ignores keys that are absent from the file.
column_map = {
    "提出者名": "name_ja",
    "提出者法人番号": "corp_number",
    "ＥＤＩＮＥＴコード": "edinet_code",
    "Code": "security_code",
    "提出者業種": "sector_33",
    "Sector17CodeName": "sector_17",
    "ScaleCategory": "scale_category",
    "MarketCode": "market_code",
    "提出者名（英字）": "name_en",
    "所在地": "address",
}
df = df.rename(columns=column_map)

keep_cols = [
    "security_code",
    "name_ja",
    "name_en",
    "sector_17",
    "sector_33",
    "scale_category",
    "market_code",
    "edinet_code",
    "address",
    "corp_number",
]
# .copy() so the column assignments below write to an independent frame.
df = df[keep_cols].copy()

# Formatting: left-pad the codes to their fixed widths. Blank cells stay
# blank — zfill on "" would otherwise fabricate "00000" / "000000" codes.
for code_col, width in (("security_code", 5), ("edinet_code", 6)):
    raw_codes = df[code_col]
    df[code_col] = raw_codes.str.zfill(width).where(raw_codes != "", "")

# corp_number goes through float -> int before padding to 13 digits
# (presumably because the CSV may hold it in float notation such as
# "1234567890123.0"); blanks stay blank.
df["corp_number"] = df["corp_number"].apply(
    lambda x: str(int(float(x))).zfill(13) if x != "" else ""
)

# ----------- (3) Open the DB connection & overwrite company_id ----------
conn = get_connection()
cur = conn.cursor()
# Project helper; the code after this cell section reads df["company_id"],
# so it is expected to return the frame with that column — TODO confirm how
# existing ids are matched.
df = overwrite_company_ids_if_exists(df, cur)


# ----------- ④ UUIDバリデーション & 重複チェック ----------
def is_valid_uuid(val: str) -> bool:
    try:
        uuid.UUID(str(val))
        return True
    except ValueError:
        return False


df["company_id"] = df["company_id"].astype(str)
valid_mask = df["company_id"].apply(is_valid_uuid)

# 無効なUUID検出
invalid_df = df[~valid_mask]
if not invalid_df.empty:
    print("🟥 無効な company_id を持つレコード:")
    display(invalid_df)

# 重複検出
duplicates = df[valid_mask].duplicated(subset=["company_id"], keep=False)
if duplicates.any():
    print("⚠️ company_id が重複しているレコード（同一年度内）:")
    display(df[valid_mask][duplicates])

# 有効なUUIDのみ残す
df = df[valid_mask].copy()

print("🔍 company_idが有効な件数:", len(df))
print("✅ ユニークな company_id 件数:", df["company_id"].nunique())

# ----------- ⑤ companies テーブルに先に挿入（外部キー用） ----------
company_records = [
    (row["company_id"], row["corp_number"], row["edinet_code"], row["security_code"])
    for _, row in df.iterrows()
]
execute_values(
    cur,
    """
    INSERT INTO companies (company_id, corp_number, edinet_code, security_code)
    VALUES %s
    ON CONFLICT (company_id) DO UPDATE
    SET corp_number = EXCLUDED.corp_number,
        edinet_code = EXCLUDED.edinet_code,
        security_code = EXCLUDED.security_code;
""",
    company_records,
)

# ----------- ⑥ company_profiles テーブルに挿入 or 更新 ----------
profile_records = [
    (
        row["company_id"],
        YEAR,
        row["name_ja"],
        row["name_en"],
        row["sector_17"],
        row["sector_33"],
        row["scale_category"],
        row["market_code"].zfill(3),
        row["address"],
    )
    for _, row in df.iterrows()
]
execute_values(
    cur,
    """
    INSERT INTO company_profiles (
        company_id, year, name_ja, name_en,
        sector_17, sector_33, scale_category, market_code, address
    ) VALUES %s
    ON CONFLICT (company_id, year) DO UPDATE
    SET name_ja = EXCLUDED.name_ja,
        name_en = EXCLUDED.name_en,
        sector_17 = EXCLUDED.sector_17,
        sector_33 = EXCLUDED.sector_33,
        scale_category = EXCLUDED.scale_category,
        market_code = EXCLUDED.market_code,
        address = EXCLUDED.address;
""",
    profile_records,
)

# ----------- ⑦ コミット & 終了処理 ----------
conn.commit()
cur.close()
conn.close()

print(f"✅ {len(df)} 件のデータを {YEAR} 年分として挿入・更新しました。")

In [None]:
# ---- 2025: build the processed company list ----
from scripts.preprocess import load_topix_and_edinet

# Inputs: the 2025 TOPIX snapshot and the EDINET code list CSV.
_topix_csv_2025 = "data/raw/topix_growth_snapshots/2025_topix_growth.csv"
_edinet_csv = "data/raw/EdinetcodeDlInfo.csv"

# NOTE(review): presumably joins the two files on the security code — confirm
# in scripts.preprocess.
df_2025 = load_topix_and_edinet(_topix_csv_2025, _edinet_csv)

# Saved without the index column; the 2025 upsert cell reads this path back.
df_2025.to_csv("data/processed/topix_companies_2025.csv", index=False)

In [None]:
## 手動チェック

In [None]:
import pandas as pd
import uuid
from psycopg2.extras import execute_values
from db.connection import get_connection
from scripts.preprocess import overwrite_company_ids_if_exists

# ----------- ① Target year & input file ----------
YEAR = 2025
CSV_PATH = f"data/processed/topix_companies_{YEAR}.csv"

# ----------- ② Column mapping, load & normalisation ----------
# Source CSV header -> column name used when building the DB records.
column_map = {
    "提出者名": "name_ja",
    "提出者法人番号": "corp_number",
    "ＥＤＩＮＥＴコード": "edinet_code",
    "Code": "security_code",
    "提出者業種": "sector_33",
    "Sector17CodeName": "sector_17",
    "ScaleCategory": "scale_category",
    "MarketCode": "market_code",
    "提出者名（英字）": "name_en",
    "所在地": "address",
}

# Columns carried forward, in this order; every other column is dropped.
keep_cols = [
    "security_code",
    "name_ja",
    "name_en",
    "sector_17",
    "sector_33",
    "scale_category",
    "market_code",
    "edinet_code",
    "address",
    "corp_number",
]

# Read every cell as text, turn missing cells into "", rename, then select.
df = (
    pd.read_csv(CSV_PATH, dtype=str)
    .fillna("")
    .rename(columns=column_map)[keep_cols]
)


def _format_corp_number(raw):
    """Return *raw* as a 13-character zero-padded integer string ("" stays "").

    The value is passed through ``float`` and then ``int``, so text such as
    "123.0" is accepted and its fractional part is discarded.
    """
    if pd.isnull(raw) or raw == "":
        return ""
    return str(int(float(raw))).zfill(13)


# Left-pad the code columns to their fixed widths.
for _col, _width in (("security_code", 5), ("edinet_code", 6)):
    df[_col] = df[_col].str.zfill(_width)
df["corp_number"] = df["corp_number"].apply(_format_corp_number)

# ----------- ③–⑦ DB connection, company_id validation, upsert, commit ----------
# The SQL is defined up front so the statements stay byte-for-byte unchanged
# even though the DB work below is nested inside a try block.
COMPANIES_UPSERT_SQL = """
    INSERT INTO companies (company_id, corp_number, edinet_code, security_code)
    VALUES %s
    ON CONFLICT (company_id) DO UPDATE
    SET corp_number = EXCLUDED.corp_number,
        edinet_code = EXCLUDED.edinet_code,
        security_code = EXCLUDED.security_code;
"""

COMPANY_PROFILES_UPSERT_SQL = """
    INSERT INTO company_profiles (
        company_id, year, name_ja, name_en,
        sector_17, sector_33, scale_category, market_code, address
    ) VALUES %s
    ON CONFLICT (company_id, year) DO UPDATE
    SET name_ja = EXCLUDED.name_ja,
        name_en = EXCLUDED.name_en,
        sector_17 = EXCLUDED.sector_17,
        sector_33 = EXCLUDED.sector_33,
        scale_category = EXCLUDED.scale_category,
        market_code = EXCLUDED.market_code,
        address = EXCLUDED.address;
"""


def is_valid_uuid(val: str) -> bool:
    """Return True if ``str(val)`` can be parsed by ``uuid.UUID``, else False."""
    try:
        uuid.UUID(str(val))
    except ValueError:
        return False
    return True


conn = get_connection()
cur = None
try:
    # ----------- ③ Cursor & company_id overwrite ----------
    cur = conn.cursor()
    # The returned frame is expected to carry a "company_id" column.
    # NOTE(review): presumably reuses IDs already stored in the DB — confirm
    # in scripts.preprocess.
    df = overwrite_company_ids_if_exists(df, cur)

    # ----------- ④ UUID validation & duplicate check ----------
    df["company_id"] = df["company_id"].astype(str)
    valid_mask = df["company_id"].apply(is_valid_uuid)

    # Report rows whose company_id is not a valid UUID; they are dropped below.
    invalid_df = df[~valid_mask]
    if not invalid_df.empty:
        print("🟥 無効な company_id を持つレコード:")
        display(invalid_df)

    # Report company_ids that occur more than once among the valid rows.
    # NOTE(review): duplicates are only reported, not removed. PostgreSQL
    # rejects an INSERT ... ON CONFLICT DO UPDATE that would touch the same
    # row twice in one statement, so duplicates that land in the same
    # execute_values batch will make the upsert below fail — resolve them
    # in the input before running this cell.
    valid_df = df[valid_mask]
    duplicates = valid_df.duplicated(subset=["company_id"], keep=False)
    if duplicates.any():
        print("⚠️ company_id が重複しているレコード（同一年度内）:")
        display(valid_df[duplicates])

    # Keep only the rows with a valid UUID.
    df = valid_df.copy()

    print("🔍 company_idが有効な件数:", len(df))
    print("✅ ユニークな company_id 件数:", df["company_id"].nunique())

    # ----------- ⑤ companies first (referenced by company_profiles) ----------
    company_records = list(
        df[["company_id", "corp_number", "edinet_code", "security_code"]].itertuples(
            index=False, name=None
        )
    )
    execute_values(cur, COMPANIES_UPSERT_SQL, company_records)

    # ----------- ⑥ company_profiles: insert or update per (company_id, year) ----------
    profile_columns = [
        "company_id",
        "name_ja",
        "name_en",
        "sector_17",
        "sector_33",
        "scale_category",
        "market_code",
        "address",
    ]
    profile_records = [
        # YEAR is injected as the second field; market_code is padded to 3 chars.
        (cid, YEAR, name_ja, name_en, s17, s33, scale, market.zfill(3), address)
        for cid, name_ja, name_en, s17, s33, scale, market, address in df[
            profile_columns
        ].itertuples(index=False, name=None)
    ]
    execute_values(cur, COMPANY_PROFILES_UPSERT_SQL, profile_records)

    # ----------- ⑦ Commit ----------
    conn.commit()
except Exception:
    # Undo any partial writes so both tables stay consistent, then re-raise.
    conn.rollback()
    raise
finally:
    # Always release the cursor and connection, even when a step above failed;
    # otherwise every failed re-run of this cell leaks an open connection.
    if cur is not None:
        cur.close()
    conn.close()

print(f"✅ {len(df)} 件のデータを {YEAR} 年分として挿入・更新しました。")