In [4]:
import sys
import os

# Run the notebook from the parent of the directory the kernel starts in, so
# relative paths such as "data/processed/..." resolve, and make that same
# directory importable.
#
# The root is resolved exactly once, *before* chdir: os.path.abspath("..") is
# relative to the current working directory, so evaluating it again after the
# chdir would yield the grandparent instead. The globals() guard keeps a
# re-run of this cell from climbing one more level each time.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("..")
    os.chdir(PROJECT_ROOT)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# EDINET Risk Text UPSERT

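This notebook loads the per-document risk-section JSON files (year folders from 2018 onward) and upserts them into the `edinet_documents` table in PostgreSQL. New documents are inserted; for a `doc_id` that already exists, only `risk_text` and `risk_count` are updated.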

In [7]:
import os, json
from glob import glob

def load_risk_data(base_dir, min_year=2018):
    """Collect risk-section records from ``<base_dir>/<year>/*.json``.

    Only sub-directories of ``base_dir`` whose name parses as an integer that
    is at least ``min_year`` are scanned. Year folders and the files inside
    them are visited in sorted order so the returned lists are reproducible
    between runs (``os.listdir`` and ``glob`` give no ordering guarantee).

    Args:
        base_dir: Directory containing one sub-directory per year.
        min_year: Smallest year (inclusive) to load. Defaults to 2018.

    Returns:
        A ``(records, errors)`` tuple. ``records`` is a list of the parsed
        JSON objects that have a truthy ``doc_id`` and ``risk_text``; a
        ``risk_count`` key is added with value ``None`` when absent.
        ``errors`` is a list of ``(filepath, message)`` tuples, one per file
        that was skipped.

    Raises:
        OSError: If ``base_dir`` cannot be listed (propagated from
            ``os.listdir``). Problems with individual files are never raised;
            they are reported through ``errors``.
    """
    records, errors = [], []
    for year_folder in sorted(os.listdir(base_dir)):
        # Entries whose name is not a year (e.g. ".DS_Store") are ignored.
        try:
            year = int(year_folder)
        except ValueError:
            continue
        if year < min_year:
            continue
        year_dir = os.path.join(base_dir, year_folder)
        if not os.path.isdir(year_dir):
            continue
        for filepath in sorted(glob(os.path.join(year_dir, "*.json"))):
            try:
                # Zero-byte files get their own message instead of a JSON
                # decoder error.
                if os.stat(filepath).st_size == 0:
                    errors.append((filepath, "Empty file"))
                    continue
                with open(filepath, "r", encoding="utf-8") as f:
                    data = json.load(f)
                # Valid JSON that is not an object (list, string, number...)
                # cannot carry the expected keys.
                if not isinstance(data, dict):
                    errors.append((filepath, "Top-level JSON value is not an object"))
                    continue
                doc_id = data.get("doc_id")
                risk_text = data.get("risk_text")
                if not doc_id or not risk_text:
                    errors.append((filepath, "Missing doc_id or risk_text"))
                    continue
                # Guarantee the key exists so every record exposes risk_count.
                data.setdefault("risk_count", None)
                records.append(data)
            except Exception as e:
                # Best effort: one unreadable or malformed file must not
                # abort the whole load; record the reason and move on.
                errors.append((filepath, str(e)))
    return records, errors

# Relative path: resolved against the working directory chosen by the first
# cell of the notebook.
base_dir = "data/processed/risk_sections"
records, errors = load_risk_data(base_dir)
print(f"Loaded {len(records)} records, {len(errors)} errors")
# Show why files were skipped instead of only counting them. The listing is
# capped so a badly broken data directory does not flood the cell output.
if errors:
    print("\n".join(f"  {path}: {reason}" for path, reason in errors[:20]))
    if len(errors) > 20:
        print(f"  ... and {len(errors) - 20} more")

Loaded 7832 records, 12 errors


In [8]:
import os

from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Connection settings come from the environment; load_dotenv() (python-dotenv)
# additionally loads them from a .env file, so no credentials are written in
# the notebook itself.
load_dotenv()

USER = os.getenv("POSTGRES_USER")
PASS = os.getenv("POSTGRES_PASSWORD")
HOST = os.getenv("POSTGRES_HOST")
PORT = os.getenv("POSTGRES_PORT")
DB   = os.getenv("POSTGRES_DB")

# Fail here with a readable message instead of building a URL that contains
# the literal text "None" and failing later at connect time.
_missing = [
    name
    for name, value in (
        ("POSTGRES_USER", USER),
        ("POSTGRES_HOST", HOST),
        ("POSTGRES_DB", DB),
    )
    if not value
]
if _missing:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing)
    )

# Build the URL from its components rather than an f-string: characters such
# as "@", "/", ":" or "%" in the user name or password would otherwise be
# parsed as URL syntax. URL.create() takes the raw values and handles escaping.
engine = create_engine(
    URL.create(
        "postgresql+psycopg2",
        username=USER,
        password=PASS,
        host=HOST,
        port=int(PORT) if PORT else None,  # unset port -> driver default
        database=DB,
    )
)
print("Engine created:", engine)

Engine created: Engine(postgresql+psycopg2://junxzi:***@localhost:5432/yuho)


In [9]:
from sqlalchemy import text

# Parameterized PostgreSQL upsert for edinet_documents, keyed on doc_id.
# - The :name placeholders are bind parameters; values are supplied at
#   execution time, never interpolated into the SQL text.
# - If no row with the given doc_id exists, a full row is inserted.
# - If one exists, only risk_text and risk_count are overwritten (EXCLUDED
#   refers to the row that was proposed for insertion); every other column of
#   the stored row is left as it is.
# NOTE(review): ON CONFLICT (doc_id) presumably relies on a unique or
# primary-key constraint on edinet_documents.doc_id -- confirm against the schema.
upsert_sql = text("""
INSERT INTO edinet_documents 
  (doc_id, company_id, edinet_code, doc_type_code, submit_date, fiscal_year, description, risk_text, risk_count)
VALUES 
  (:doc_id, :company_id, :edinet_code, :doc_type_code, :submit_date, :fiscal_year, :description, :risk_text, :risk_count)
ON CONFLICT (doc_id)
DO UPDATE SET
  risk_text  = EXCLUDED.risk_text,
  risk_count = EXCLUDED.risk_count;
""")
print("UPSERT SQL prepared")

UPSERT SQL prepared


In [10]:
from sqlalchemy.exc import SQLAlchemyError

# Write all loaded records in a single transaction: engine.begin() commits
# when the block exits normally and rolls back if it raises, so a failed
# batch leaves the table unchanged.
if not records:
    # An empty list carries no bind values, so the statement would be run
    # without its required parameters; there is nothing to write anyway.
    print("No records to upsert")
else:
    try:
        with engine.begin() as conn:
            # A list of dicts makes SQLAlchemy execute the statement once per
            # record (executemany); each dict supplies the bind parameters.
            conn.execute(upsert_sql, records)
        print("✅ UPSERT completed for", len(records), "records")
    except SQLAlchemyError as e:
        print("❌ UPSERT failed:", e)
        # Re-raise so the cell is marked as failed and "Run All" stops here
        # instead of continuing as if the load had succeeded.
        raise

✅ UPSERT completed for 7832 records
