In [1]:
# import libraries and set parameters 

import os
import io
import re
import time
import zipfile 
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor 

import dart_fss
import OpenDartReader

API_key = '0d67945133e224c451452e071e0d8349969353e1' 
dart = OpenDartReader(API_key)
dart_fss.set_api_key(API_key)

bsns_year = '2024'
reprt_code = '11011'

# call most recent annual/semi-annual report as quarterly may exclude audit info due to corporate disclosure form preparation standards 

Error occurred during getting browser(s): random, but was suppressed with fallback.


In [2]:
dart = OpenDartReader(API_key)
dart_fss.set_api_key(API_key)
print("OpenDart API initialized.")

OpenDart API initialized.


In [59]:
BASE_DATA_DIR = os.path.join('data', 'raw')
os.makedirs(BASE_DATA_DIR, exist_ok=True)
print(f"Ensured raw data directory exists at: {BASE_DATA_DIR}")

Ensured raw data directory exists at: data\raw


In [15]:
def save_df_to_csv(df: pd.DataFrame, file_path: str, index: bool = False):
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    try:
        df.to_csv(file_path, index=index)
        print(f"DataFrame saved to {file_path}")
    except Exception as e:
        print(f"Error saving DataFrame to CSV {file_path}: {e}")

In [None]:
# get all listed corp codes (keys to access opendart) 
# opendart link: https://engopendart.fss.or.kr/guide/detail.do?apiGrpCd=DE001&apiId=AE00004

def get_corp_code(api_key: str, output_dir: str = BASE_DATA_DIR) -> pd.DataFrame:
    url_code = f'https://opendart.fss.or.kr/api/corpCode.xml?crtfc_key={api_key}'
    response = requests.get(url_code) 

    # check that the target directory exists 
    os.makedirs('dart_data', exist_ok=True)

    # unzip and extract CORPCODE.xml
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        z.extractall('dart_data')
        xml_path = os.path.join('dart_data', 'CORPCODE.xml')

    # parse XML
    tree = ET.parse(xml_path)
    root = tree.getroot()

    # collect listed companies (6-digit stock code only)
    corp_list = []
    for corp in root.findall('list'):
        stock_code = corp.findtext('stock_code')
        if stock_code and len(stock_code) == 6:
            corp_list.append({
                'corp_code': corp.findtext('corp_code'),
                'corp_name': corp.findtext('corp_name'),
                'corp_eng_name': corp.findtext('corp_eng_name'),
                'stock_code': stock_code
            })

    corp_codes_df = pd.DataFrame(corp_list)
    output_filepath = os.path.join(output_dir, 'listed_corp_codes.csv')
    save_df_to_csv(corp_codes_df, output_filepath)
    return corp_codes_df

In [None]:
def get_kospi_company_info(api_key: str, corp_codes_df: pd.DataFrame, output_dir: str = BASE_DATA_DIR) -> pd.DataFrame:
    """
    Fetches detailed company information for KOSPI-listed companies from OpenDartReader.
    Filters for corp_cls 'Y' (Yoo-ga-jeung-kwon Market, i.e., KOSPI).
    Saves the results to a CSV file.

    Args:
        dart_reader: An initialized OpenDartReader instance.
        corp_codes_df: DataFrame containing corp_code and other identifiers.
        output_dir: Directory to save the resulting CSV.

    Returns:
        A pandas DataFrame with detailed information for KOSPI companies.
    """
    data = []

    api_endpoint = "https://engopendart.fss.or.kr/engapi/company.json"

    for i, row in corp_codes_df.iterrows():
        corp_code = row['corp_code']
        corp_name = row['corp_name']

        params = {
            'crtfc_key': API_key,
            'corp_code': corp_code
        }
        try:
            response = requests.get(api_endpoint, params=params)
            response.raise_for_status() # Raise HTTPError for bad responses 
            info = response.json()

            # Filter for KOSPI companies ('Y' indicates Yoo-ga-jeung-kwon Market)
            if info and info.get('corp_cls') == 'Y':
                data.append({
                    'corp_name': info.get('corp_name'),
                    'corp_code': info.get('corp_code'),
                    'stock_code': info.get('stock_code'),
                    'ceo_name': info.get('ceo_nm'),
                    'industry_code': info.get('induty_code'),
                    'established_date': info.get('est_dt'),
                    'ir_url': info.get('ir_url'),
                    'corp_reg_number': info.get('jurir_no'), 
                    'business_no': info.get('bizr_no')
                })
            time.sleep(0.01) 
        except Exception as e:
            print(f"Failed to fetch company info for {corp_name} ({corp_code}): {e}")
            continue

    kospi_codes_df = pd.DataFrame(data)
    output_filepath = os.path.join(output_dir, 'kospi_company_info.csv')
    save_df_to_csv(kospi_codes_df, output_filepath)
    return kospi_codes_df

In [5]:
def get_executive_status_data(api_key: str, kospi_codes_df: pd.DataFrame, bsns_year: int, reprt_code: str, output_dir: str = os.path.join('data', 'raw')) -> pd.DataFrame:
    """
    Fetches executive status data for a list of KOSPI corporations from OpenDART API.

    Args:
        api_key: Your OpenDART API key.
        kospi_codes_df: DataFrame containing 'corp_code' and 'corp_name' for KOSPI companies.
        bsns_year: Business year.
        reprt_code: Report code ('11011' for Annual Report, '11012' for Half-Year Report, etc.).
        output_dir: Directory to save the resulting CSV.

    Returns:
        A pandas DataFrame containing combined executive status data for all fetched companies.
    """
    print(f"Fetching executive status data for business year {bsns_year} and report code {reprt_code}...")
    results = []
    api_endpoint = "https://opendart.fss.or.kr/api/exctvSttus.json"
    total_corps = len(kospi_codes_df)

    for idx, row in kospi_codes_df.iterrows():
        corp_code = row['corp_code']
        corp = str(corp_code).zfill(8) # Ensure corp_code is 8-digits, padded with leading zeros
        corp_name = row['corp_name']

        params = {
            'crtfc_key': api_key,
            'corp_code': corp,
            'bsns_year': bsns_year,
            'reprt_code': reprt_code
        }

        try:
            response = requests.get(api_endpoint, params=params)
            response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
            data = response.json()

            if data['status'] == '000': # Success
                if 'list' in data and data['list']:
                    df = pd.DataFrame(data['list'])
                    df['original_corp_code'] = corp_code # Keep the original corp_code if zfill modified it
                    df['corp_name_from_input'] = corp_name
                    results.append(df)
            elif data['status'] == '013':
                # No data available for this corp/year (status 013: No data)
                print(f"No executive data available for {corp_name} ({corp_code}) for {bsns_year}/{reprt_code}.")
            else:
                # Other API errors
                print(f"API Error for {corp_name} ({corp_code}): Status {data.get('status')}, Message: {data.get('message')}")

        except Exception as e:
            print(f"An unexpected error occurred for {corp_name} ({corp_code}): {e}")

        time.sleep(0.7) 

    if results:
        executive_status_df = pd.concat(results, ignore_index=True)
        output_filepath = os.path.join(output_dir, f'executive_status_{bsns_year}_{reprt_code}.csv')
        save_df_to_csv(executive_status_df, output_filepath)
        print(f"\nSuccessfully fetched and saved executive status for {len(executive_status_df)} records.")
        return executive_status_df
    else:
        print("\nNo executive status data was retrieved.")
        return pd.DataFrame() 

In [48]:
kospi_company_info_df.head(2)

Unnamed: 0,corp_name,corp_code,stock_code,ceo_name,industry_code,established_date,ir_url,corp_reg_number,business_no
0,미래에셋맵스 아시아퍼시픽 부동산공모 1호 투자회사,600013,94800,미래에셋자산운용(주),642,20070109,,1101114000000.0,1078686211
1,맥쿼리한국인프라투융자회사,435297,88980,맥쿼리자산운용(주),64201,20021212,,1101113000000.0,1048177509


In [None]:
def get_total_assets(kospi_company_info_df: pd.DataFrame, bsns_year: str, reprt_code: str, API_key: str, output_dir: str = os.path.join('data', 'raw')) -> pd.DataFrame:
    """
    Fetches Total Assets (자산총계) for a list of companies from the DART API.

    Args:
        kospi_company_info_df (pd.DataFrame): DataFrame with 'corp_code' column.
        bsns_year (str): Business year (e.g., '2023').
        reprt_code (str): Report code (e.g., '11011' for Annual Report).
        API_key (str): Your DART API key.
        output_dir (str): Directory to save the output CSV.

    Returns:
        pd.DataFrame: A DataFrame containing 'Corp Code', 'Total Assets (YYYY)',
                      'Total Assets (YYYY-1)', 'Total Assets (YYYY-2)' for each company.
    """
    
    # Initialize list to store results for all companies
    assets_info = []

    # DART API endpoint for single account financial statements (Korean API)
    # Using Korean API for consistency with '자산총계'
    api_url = 'https://opendart.fss.or.kr/api/fnlttSinglAcntAll.json'

    # Define the account to fetch
    # As discussed, '자산총계' is generally used for both CFS and OFS totals.
    TARGET_ACCOUNT_NAME = ["자산총계"] 
    TARGET_SJ_DIV = "BS" # Balance Sheet

    year = int(bsns_year) # Convert business year to integer for calculation

    for corp in kospi_company_info_df['corp_code']:
        corp_code = str(corp).zfill(8)  # Ensure corp_code is 8-digits, padded with leading zeros
        
        print(f"Fetching assets for corp_code: {corp_code}")

        fetched_df = pd.DataFrame()
        source = None # To track if data came from CFS or OFS

        # Attempt to fetch CFS data first
        cfs_params = {
            'crtfc_key': API_key,
            'corp_code': corp_code,
            'bsns_year': bsns_year,
            'reprt_code': reprt_code,
            'fs_div': 'CFS'
        }
        try:
            res_cfs = requests.get(api_url, params=cfs_params)
            res_cfs.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
            data_cfs = res_cfs.json()

            if data_cfs.get('status') == '000' and 'list' in data_cfs and data_cfs['list']:
                fetched_df = pd.DataFrame(data_cfs['list'])
                source = 'CFS'
            else:
                pass # Will attempt OFS next
        except requests.exceptions.RequestException as e:
            pass # Will attempt OFS next
        except ValueError as e: # JSON decoding error
            # print(f"  JSON decoding error for CFS {corp_code}: {e} - Response: {res_cfs.text[:100]}...")
            pass


        # If CFS fails or returns no usable data, fallback to OFS
        if fetched_df.empty:
            ofs_params = cfs_params.copy() # Start with CFS params and just change fs_div
            ofs_params['fs_div'] = 'OFS'
            try:
                res_ofs = requests.get(api_url, params=ofs_params)
                res_ofs.raise_for_status()
                data_ofs = res_ofs.json()

                if data_ofs.get('status') == '000' and 'list' in data_ofs and data_ofs['list']:
                    fetched_df = pd.DataFrame(data_ofs['list'])
                    source = 'OFS'
                else:
                    print(f"  No data (CFS/OFS) found for {corp_code}. Status: {data_ofs.get('status')}, Message: {data_ofs.get('message', 'No message')}")
            except requests.exceptions.RequestException as e:
                print(f"  Request failed for OFS {corp_code}: {e}")
            except ValueError as e: # JSON decoding error
                print(f"  JSON decoding error for OFS {corp_code}: {e} - Response: {res_ofs.text[:100]}...")


        current_company_assets = {
            'rcept_no': None, 
            'Corp Code': corp_code,
            f'Total Assets ({year})': None,
            f'Total Assets ({year - 1})': None,
            f'Total Assets ({year - 2})': None
        }

        if not fetched_df.empty:
            # Clean and convert amount columns to numeric
            for col in ['thstrm_amount', 'frmtrm_amount', 'bfefrmtrm_amount']:
                if col in fetched_df.columns:
                    # Replace commas, handle hyphens (often for 'nil'), and convert to numeric
                    fetched_df[col] = fetched_df[col].astype(str).str.replace(',', '', regex=False).replace('-', pd.NA).apply(pd.to_numeric, errors='coerce')

            total_assets_row = fetched_df[
                (fetched_df['sj_div'] == TARGET_SJ_DIV) &
                (fetched_df['account_nm'].isin(TARGET_ACCOUNT_NAME))
            ]

            if not total_assets_row.empty:
                row_data = total_assets_row.iloc[0]
                current_company_assets['rcept_no'] = row_data.get('rcept_no', None)
                current_company_assets[f'Total Assets ({year})'] = row_data.get('thstrm_amount', None)
                current_company_assets[f'Total Assets ({year - 1})'] = row_data.get('frmtrm_amount', None)
                current_company_assets[f'Total Assets ({year - 2})'] = row_data.get('bfefrmtrm_amount', None)
                # print(f"  Successfully fetched assets for {corp_code} from {source}.")
            else:
                print(f"  '{TARGET_ACCOUNT_NAME[0]}' not found in {source} for {corp_code}.")
        else:
            print(f"  No usable financial data found for {corp_code}.")

        assets_info.append(current_company_assets)
        time.sleep(0.7)  # to avoid request throttling

    assets_df = pd.DataFrame(assets_info)
    
    # Ensure numeric types for asset columns. 'Int64' for nullable integer.
    for y_offset in [0, 1, 2]:
        col_name = f'Total Assets ({year - y_offset})'
        if col_name in assets_df.columns:
            assets_df[col_name] = pd.to_numeric(assets_df[col_name], errors='coerce').astype('Int64')

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"assets_{bsns_year}_{reprt_code}.csv")
    assets_df.to_csv(output_path, index=False, encoding='utf-8-sig')

    return assets_df

In [61]:
get_total_assets(kospi_company_info_df, bsns_year, reprt_code, API_key, output_dir=BASE_DATA_DIR)

Fetching assets for corp_code: 00600013
  No data (CFS/OFS) found for 00600013. Status: 013, Message: 조회된 데이타가 없습니다.
  No usable financial data found for 00600013.
Fetching assets for corp_code: 00435297
  No data (CFS/OFS) found for 00435297. Status: 013, Message: 조회된 데이타가 없습니다.
  No usable financial data found for 00435297.
Fetching assets for corp_code: 00173944
Fetching assets for corp_code: 00109286
Fetching assets for corp_code: 00129350
Fetching assets for corp_code: 00144252
Fetching assets for corp_code: 00303217
Fetching assets for corp_code: 01036446
Fetching assets for corp_code: 00112970
Fetching assets for corp_code: 00129554
Fetching assets for corp_code: 00163691
Fetching assets for corp_code: 00260392
Fetching assets for corp_code: 00153524
Fetching assets for corp_code: 00105156
Fetching assets for corp_code: 00146649
Fetching assets for corp_code: 00148443
Fetching assets for corp_code: 00148939
  '자산총계' not found in OFS for 00148939.
Fetching assets for corp_code: 0

Unnamed: 0,rcept_no,Corp Code,Total Assets (2024),Total Assets (2023),Total Assets (2022)
0,,00600013,,,
1,,00435297,,,
2,20250313001154,00173944,276510432217,240301076155,227053375662
3,20250318001356,00109286,2132627316122,1956376110942,1713773205581
4,20250313000953,00129350,328114248167,308031065017,287453832625
...,...,...,...,...,...
844,20250320001708,00108746,213873418416,220114642087,192687257308
845,20250320001204,01262032,950573044038,944050170382,871931521611
846,20250319000718,00136776,38915573941,48355454222,48387026626
847,20250317000666,00114154,558566600775,573648655244,520217537029


In [66]:
assets_df = pd.read_csv(r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw\financial_metrics_2024_11011.csv')
assets_df.head(30)

Unnamed: 0,rcept_no,Corp Code,Total Assets (2024),Total Assets (2023),Total Assets (2022)
0,,600013,,,
1,,435297,,,
2,20250310000000.0,173944,276510400000.0,240301100000.0,227053400000.0
3,20250320000000.0,109286,2132627000000.0,1956376000000.0,1713773000000.0
4,20250310000000.0,129350,328114200000.0,308031100000.0,287453800000.0
5,20250320000000.0,144252,305473800000.0,382846300000.0,376111700000.0
6,20250320000000.0,303217,329391200000.0,318384800000.0,285245000000.0
7,20250330000000.0,1036446,2953842000000.0,2357851000000.0,1930332000000.0
8,20250310000000.0,112970,185398500000.0,180718500000.0,210639600000.0
9,20250320000000.0,129554,69828890000.0,90043250000.0,57863010000.0


# 8/5 START HERE ^


In [None]:
def get_total_assets(kospi_company_info_df: pd.DataFrame, bsns_year: str, reprt_code: str, API_key: str, output_dir: str = os.path.join('data', 'raw')) -> pd.DataFrame:

    for corp in kospi_company_info_df['corp_code']:
        corp_code = str(corp).zfill(8)  # Ensure corp_code is 8-digits, padded with leading zeros
        url = 'https://engopendart.fss.or.kr/engapi/fnlttSinglAcntAll.json'
        df = dart.finstate(name, bsns_year) 

        cfs_params = {
            'crtfc_key': API_key,
            'corp_code': code_str,
            'bsns_year': bsns_year,
            'reprt_code': reprt_code,
            'fs_div': 'CFS'
        }

        ofs_params = cfs_params.copy()
        ofs_params['fs_div'] = 'OFS'

        res = requests.get(url, params=cfs_params)

        # If CFS fails or returns no usable data, fallback to OFS
        if not res.ok or res.json().get('status') != '000' or not res.json().get('list'):
            res = requests.get(url, params=ofs_params)
            source = 'OFS'
            target = '자산'
        else:
            source = 'CFS'
            target = '자산총계'

        data = res.json()

        if data['status'] != '000':
            print(f"Error: {data['message']}")
        else:
            df = pd.DataFrame(data['list'])
            total_assets_row = df[(df['sj_div'] == 'BS') & (df['account_nm'] == target)]

            if not total_assets_row.empty:
                total_assets = total_assets_row.iloc[0]['thstrm_amount']
                prior_year_assets = total_assets_row.iloc[0]['frmtrm_amount'] if 'frmtrm_amount' in total_assets_row.columns else None
                two_years_ago_assets = total_assets_row.iloc[0]['bfefrmtrm_amount'] if 'bfefrmtrm_amount' in total_assets_row.columns else None
                assets_info.append({
                    'Corp Code': code,
                    'Total Assets': total_assets,
                    'Prior Year Assets': prior_year_assets,
                    'Two Years Ago Assets': two_years_ago_assets})
            else:
                print(f"No financial statement data retrieved for corp code {code_str} or API error.")
        
        time.sleep(0.7) 

    assets_df = pd.DataFrame(assets_info)
    assets_df['Total Assets'] = (
        assets_df['Total Assets']
        .str.replace(',', '', regex=False)  # remove commas
        .astype('Int64')                    # convert to nullable integer for consistent formatting 
    )

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"financial_metrics_{bsns_year}_{reprt_code}.csv")
    assets_df.to_csv(output_path, index=False, encoding='utf-8-sig')

    return assets_df

SyntaxError: unterminated string literal (detected at line 5) (3869788136.py, line 5)

In [44]:
financial_metrics_df = get_total_assets(kospi_company_info_df, bsns_year, reprt_code, API_key) # Execution Time ~ 11 minutes 

KeyboardInterrupt: 

In [42]:
financial_metrics_df

Unnamed: 0,corp_code,Total Assets (2024),Total Assets (2023),Total Assets (2022)
0,600013,,,
1,435297,,,
2,173944,,,
3,109286,,,
4,129350,,,
...,...,...,...,...
844,108746,,,
845,1262032,,,
846,136776,,,
847,114154,,,


In [None]:
def get_major_shareholder_data(api_key: str, kospi_codes_df: pd.DataFrame, bsns_year: int, reprt_code: str, output_dir: str = os.path.join('data', 'raw')) -> pd.DataFrame:
    # pulled from: https://opendart.fss.or.kr/guide/detail.do?apiGrpCd=DS002&apiId=2019007
    """
    Fetches major shareholder status data for a list of KOSPI corporations from OpenDART API.

    Args:
        api_key: Your OpenDART API key.
        kospi_codes_df: DataFrame containing 'corp_code' and 'corp_name' for KOSPI companies.
        bsns_year: Business year.
        reprt_code: Report code ('11011' for Annual Report, '11012' for Half-Year Report, etc.).
        output_dir: Directory to save the resulting CSV.

    Returns:
        A pandas DataFrame containing combined major shareholder data for all fetched companies.
    """
    print(f"Fetching major shareholder data for business year {bsns_year} and report code {reprt_code}...")
    results = []
    api_endpoint = "https://opendart.fss.or.kr/api/hyslrSttus.json"
    total_corps = len(kospi_codes_df)

    for idx, row in kospi_codes_df.iterrows():
        corp_code = row['corp_code']
        corp = str(corp_code).zfill(8)
        corp_name = row['corp_name']

        params = {
            'crtfc_key': api_key,
            'corp_code': corp,
            'bsns_year': bsns_year,
            'reprt_code': reprt_code
        }

        try:
            response = requests.get(api_endpoint, params=params)
            response.raise_for_status()
            data = response.json()

            if data['status'] == '000':
                if 'list' in data and data['list']:
                    df = pd.DataFrame(data['list'])
                    results.append(df)
            elif data['status'] == '013':
                print(f"No shareholder data available for {corp_name} ({corp_code}) for {bsns_year}/{reprt_code}.")
            else:
                print(f"API Error for {corp_name} ({corp_code}): Status {data.get('status')}, Message: {data.get('message')}")

        except Exception as e:
            print(f"An unexpected error occurred for {corp_name} ({corp_code}): {e}")

        time.sleep(0.7)

    if results:
        shareholder_df = pd.concat(results, ignore_index=True)
        output_filepath = os.path.join(output_dir, f'major_shareholders_{bsns_year}_{reprt_code}.csv')
        save_df_to_csv(shareholder_df, output_filepath)
        print(f"\nSuccessfully fetched and saved major shareholder data for {len(shareholder_df)} records.")
        return shareholder_df
    else:
        print("\nNo major shareholder data was retrieved.")
        return pd.DataFrame()

In [None]:
# RUNNING BLOCK 

# 1. Contruct Files: if running for the first time 
all_corp_codes_df = get_corp_code(api_key=API_key)
kospi_company_info_df = get_kospi_company_info(api_key=API_key, corp_codes_df=all_corp_codes_df)
kospi_company_info_df["corp_code"] = kospi_company_info_df["corp_code"].astype(str).str.zfill(8)
executive_status_data_df = get_executive_status_data(api_key=API_key, kospi_codes_df=kospi_company_info_df, bsns_year=bsns_year, reprt_code=reprt_code) # Execution Time ~ 11 minutes 

major_shareholder_df = get_major_shareholder_data(api_key=API_key, kospi_codes_df=kospi_company_info_df, bsns_year=bsns_year, reprt_code=reprt_code) # Execution Time ~ 11 minutes 

# 2. Read in Files: if run before, read in the files 
all_corp_codes_df = pd.read_csv(r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw\listed_corp_codes.csv')
kospi_company_info_df = pd.read_csv(r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw\kospi_company_info.csv')
executive_status_data_df = pd.read_csv(r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw\executive_status_2024_11011.csv')
major_shareholder_df = pd.read_csv(r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw\major_shareholders_2024_11011.csv')

In [3]:
all_corp_codes_df = pd.read_csv(r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw\listed_corp_codes.csv')
kospi_company_info_df = pd.read_csv(r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw\kospi_company_info.csv')
executive_status_data_df = pd.read_csv(r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw\executive_status_2024_11011.csv')

In [None]:
# Note: OPENDART has a financial doc data BUT doesn't indicate net values as loss or gains for some cases 

In [None]:
'''  
08/02 TODO: 
# financials: because it's not the same format and doesn't indicate when 'net' income is loss or profit, only within the summary financial information sheet -> check through scraping 

- complete data extraction 
    - make sure that financial metrics is pulling correctly: should have complete data from ALL 
    - check that major shareholder status + is being pulled correcting 
- work on pre-processing 
    - with new financials, recheck audit committee requirement check 
    - audit committee corrections 
    - tenure standardization 
    - background parsing 
    - remove '/n' lines from all exec info 

- over next week: scoring/pre-processing 

'''

# instead of columns, print flags 
# column of audit committee flag type: 
    # 1 - assets over 2 trillion won but no audit committee listed 
    # 2 - failed audit committee requirement check
    # 3 - audit committee was updated from executive disclosure (ie individuals weren't identified as AC members though required to have one, so relies on business report to update)
# for the audit check: parse on corporation level. for those that were updated, mark from where 
# notes section: audit committee was not listed in executive disclosure and instead updated from financials. 
# for individual: individual was not listed under executive disclosure as an audit committee but detected in business report as so 
# audit committee was not listed for this corporation though required to 

'''
Documentation Format: 
- the produced data frames
- the full code and explaining decision processes (ie why est/exact for salary, the flags for audit committee, assets check from 2 years ago), what columns were droppped, etc 

Proposed Next Steps:

Useful OpenDART links: 
- https://opendart.fss.or.kr/guide/detail.do?apiGrpCd=DE004&apiId=AE00041 Ownership change (most recent data)
'''

'\nDocumentation Format: \n- the produced data frames\n- the full code and explaining decision processes (ie why est/exact for salary, the flags for audit committee, assets check from 2 years ago), what columns were droppped, etc \n\nProposed Next Steps:\n\nUseful OpenDART links: \n- https://opendart.fss.or.kr/guide/detail.do?apiGrpCd=DE004&apiId=AE00041 Ownership change (most recent data)\n'

In [None]:
# FIX: 
# pass through the list of corp codes from kospi_company_info_df
# to get the urls with opendart reader, it requires the disclosure number (rcept_no) which is currently only available in the executive status data 
# scrape the summary and audit urls for financial data (assets) for the past 3 years, and the audit committee members

# produce two csv files: one for financial metrics and one for audit committee members 

In [None]:
# pulling from opendart sub docs requires the disclosure number (rcept_no) which is currently only available in the executive status data

In [27]:
executive_status_data_df[executive_status_data_df['corp_code'] == 102858]['rcept_no'].values[0]

np.int64(20250331003521)

In [49]:
dart.finstate('00126380,00164779,00164742', 2021).head(2)

Unnamed: 0,rcept_no,reprt_code,bsns_year,corp_code,stock_code,fs_div,fs_nm,sj_div,sj_nm,account_nm,...,thstrm_dt,thstrm_amount,frmtrm_nm,frmtrm_dt,frmtrm_amount,bfefrmtrm_nm,bfefrmtrm_dt,bfefrmtrm_amount,ord,currency
0,20220308000798,11011,2021,126380,5930,CFS,연결재무제표,BS,재무상태표,유동자산,...,2021.12.31 현재,218163185000000,제 52 기,2020.12.31 현재,198215579000000,제 51 기,2019.12.31 현재,181385260000000,1,KRW
1,20220308000798,11011,2021,126380,5930,CFS,연결재무제표,BS,재무상태표,비유동자산,...,2021.12.31 현재,208457973000000,제 52 기,2020.12.31 현재,180020139000000,제 51 기,2019.12.31 현재,171179237000000,3,KRW


In [33]:
def get_financial_and_audit_data(api_key: str, kospi_company_info_df: pd.DataFrame, bsns_year: int, reprt_code: str, output_dir: str = os.path.join('data', 'raw')) -> pd.DataFrame:
    def get_urls():
        results = []
        for idx, row in kospi_company_info_df.iterrows():
            corp_name = row['corp_name']
            corp_code = row['corp_code']

            try:
                rcp = executive_status_data_df[executive_status_data_df['corp_code'] == corp_code]['rcept_no'].values[0]
                subdocs = dart.sub_docs(str(rcp))

                summary_match = subdocs[subdocs['title'].str.contains("요약재무정보", na=False)]
                summary_url = summary_match.iloc[0]['url'] if not summary_match.empty else None

                audit_match = subdocs[subdocs['title'].str.contains("감사제도에 관한 사항", na=False)]
                audit_url = audit_match.iloc[0]['url'] if not audit_match.empty else None

            except Exception as e:
                print(f"Failed to fetch for {corp_name} ({corp_code}): {e}")
                rcp = None
                summary_url = None
                audit_url = None

            results.append({
                'corp_code': str(corp_code),
                'company': corp_name,
                'rcept_no': rcp,
                'summary_url': summary_url,
                'audit_url': audit_url
            })

            time.sleep(0.7)

        return pd.DataFrame(results)

    def parse_summary_table(summary_url):
        try:
            r = requests.get(summary_url)
            soup = BeautifulSoup(r.content, 'html.parser')
            unit_check = soup.find(string=lambda t: '단위' in t and '백만' in t)
            if not unit_check:
                return None

            table = soup.find('table')
            df = pd.read_html(str(table))[0]

            def extract_value(df, label):
                row = df[df.iloc[:, 0].astype(str).str.contains(label)]
                if not row.empty:
                    val = row.iloc[0, 1]
                    if isinstance(val, str) and '손실' in label:
                        return -float(val.replace(',', '').replace(')', '').replace('(', '-'))
                    return float(str(val).replace(',', '').replace(')', '').replace('(', '-'))
                return None

            return {
                'total_assets': extract_value(df, '자산총계'),
                'total_liabilities': extract_value(df, '부채총계'),
                'total_equity': extract_value(df, '자본총계'),
                'net_income': extract_value(df, '당기순이익'),
                'operating_profit': extract_value(df, '영업이익'),
                'sales_revenue': extract_value(df, '매출액')
            }
        except Exception as e:
            print(f"Error parsing summary table: {e}")
            return None

    def parse_audit_info(audit_url):
        try:
            r = requests.get(audit_url)
            soup = BeautifulSoup(r.content, 'html.parser')

            tables = soup.find_all('table')
            audit_members = []
            for i, table in enumerate(tables):
                df = pd.read_html(str(table))[0]
                if df.shape[1] >= 3 and df.columns[0].startswith('성명') and '사외이사' in df.columns[1]:
                    for j in range(len(df)):
                        name = df.iloc[j, 0]
                        is_outside = df.iloc[j, 1]
                        career = df.iloc[j, 2]

                        appoint_data = {'reappointment': None, 'rationale': None, 'nominator': None, 'transactions': None, 'relation_to_shareholder': None}
                        for k in range(i + 1, min(i + 4, len(tables))):
                            appoint_df = pd.read_html(str(tables[k]))[0]
                            if name in appoint_df.astype(str).to_string():
                                for col in appoint_df.columns:
                                    if appoint_df.iloc[:, 0].astype(str).str.contains('연임여부').any():
                                        appoint_data['reappointment'] = appoint_df.iloc[0, 1]
                                    if appoint_df.iloc[:, 0].astype(str).str.contains('지명사유').any():
                                        appoint_data['rationale'] = appoint_df.iloc[0, 1]
                                    if appoint_df.iloc[:, 0].astype(str).str.contains('지명주체').any():
                                        appoint_data['nominator'] = appoint_df.iloc[0, 1]
                                    if appoint_df.iloc[:, 0].astype(str).str.contains('거래내역').any():
                                        appoint_data['transactions'] = appoint_df.iloc[0, 1]
                                    if appoint_df.iloc[:, 0].astype(str).str.contains('최대주주와의 관계').any():
                                        appoint_data['relation_to_shareholder'] = appoint_df.iloc[0, 1]

                        audit_members.append({
                            'name': name,
                            'is_outside': is_outside,
                            'career': career,
                            **appoint_data
                        })
            return audit_members
        except Exception as e:
            print(f"Error parsing audit info: {e}")
            return None

    url_df = get_urls()

    records = []
    for idx, row in url_df.iterrows():
        summary_data = parse_summary_table(row['summary_url']) if row['summary_url'] else {}
        audit_data = parse_audit_info(row['audit_url']) if row['audit_url'] else []

        records.append({
            'corp_code': row['corp_code'],
            'company': row['company'],
            'rcept_no': row['rcept_no'],
            **(summary_data if summary_data else {}),
            'audit_info': audit_data
        })

    time.sleep(0.7)
    
    result_df = pd.DataFrame(records)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    result_df.to_csv(os.path.join(output_dir, f'financial_audit_data_{bsns_year}_{reprt_code}.csv'), index=False)

    return result_df

In [None]:
def get_json(url, params):
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        if data.get('status') == '013' or 'list' not in data:
            return []
        return data['list']
    except Exception as e:
        print(f"Request failed for {url}: {e}")
        return []

def consolidate_salary_data_api(corp_code):
    endpoints = {
        'individual': 'https://opendart.fss.or.kr/api/hmvAuditIndvdlBySttus.json',
        'unregistered': 'https://opendart.fss.or.kr/api/unrstExctvMendngSttus.json',
        'grouped': 'https://opendart.fss.or.kr/api/drctrAdtAllMendngSttusMendngPymntamtTyCl.json'
    }
    params = {
        'crtfc_key': API_key,
        'corp_code': corp_code,
        'bsns_year': bsns_year,
        'reprt_code': reprt_code
    }

    results = []

    for row in get_json(endpoints['individual'], params):
        results.append({
            'corp_code': corp_code,
            'name': row.get('nm'),
            'ofcps': row.get('ofcps'),
            'salary': row.get('mendng_totamt'),
            'benefits': row.get('mendng_totamt_ct_incls_mendng'),
            'salary_source': '개인별보수',
            'salary_type': 'exact'
        })

    for row in get_json(endpoints['unregistered'], params):
        results.append({
            'corp_code': corp_code,
            'name': '',
            'ofcps': row.get('se'),
            'salary': row.get('jan_salary_am'),
            'benefits': None,
            'salary_source': '미등기임원',
            'salary_type': 'est'
        })

    for row in get_json(endpoints['grouped'], params):
        results.append({
            'corp_code': corp_code,
            'name': '',
            'ofcps': row.get('se'),
            'salary': row.get('psn1_avrg_pymntamt'),
            'benefits': None,
            'salary_source': '임원전체보수유형',
            'salary_type': 'est'
        })

    return results

corp_list = ['00126380', '00141780', '00151000']  # Replace with your own list or read from CSV
all_data = []

for idx, corp_code in enumerate(corp_list):
    print(f"Processing {idx+1}/{len(corp_list)}: {corp_code}")
    data = consolidate_salary_data_api(corp_code)
    all_data.extend(data)
    time.sleep(0.7)

df = pd.DataFrame(all_data)
output_filepath = os.path.join(output_dir, f'salary_data_{bsns_year}_{reprt_code}.csv')
df.to_csv(output_filepath, index=False, encoding='utf-8-sig')

print(f"\nSuccessfully saved salary data for {len(df)} rows to:\n{output_filepath}")


In [49]:
exec_df = corp_df.drop(columns=['corp_cls', 'birth_ym', 'fte_at', 'tenure_end_on', 'stlm_dt'], errors='ignore')

exec_df = exec_df.rename(columns={
    'rcept_no': 'Disclosure',
    'corp_name': 'Company',
    'corp_code': 'Corp Code',
    'nm': 'Name',
    'sexdstn': 'Gender',
    'ofcps': 'Position',
    'rgist_exctv_at': 'Registered Officer Status',
    'chrg_job': 'Responsibilities',
    'main_career': 'Professional Background',
    'mxmm_shrholdr_relate': 'Shareholder Relation',
    'hffc_pd': 'Period of employment'
})

In [9]:
kospi_codes['corp_code'] = kospi_codes['corp_code'].astype(str).str.zfill(8)
exec_df['Corp Code'] = exec_df['Corp Code'].astype(str).str.zfill(8)

company_df_unique = kospi_codes[['corp_code', 'industry_code']].drop_duplicates(subset='corp_code')

exec_df = exec_df.merge(
    company_df_unique,
    left_on='Corp Code',
    right_on='corp_code',
    how='left'
)

exec_df.drop(columns='corp_code', inplace=True)

In [None]:
# --- OPENDART LOAD IN: UPDATE with most recent executive status report --- 
# exec_df2 = pd.read_excel('C:\Program Files\Git\OPENDART\.vscode\BusinessReport(Executive status_2025_First Quarterly).xls')
# exec_df['Total'] = exec_df.groupby('Item code')['Item code'].transform('count')
# exec_df.to_csv('kospi_exec_df.csv', index=False)

  exec_df = pd.read_excel('C:\Program Files\Git\OPENDART\.vscode\BusinessReport(Executive status_2025_First Quarterly).xls')


SECTION 2: Categorize and Split into Individual/Grouped Data

The below functions produce: 
    1. updated exec_df 
        with additional columns reflecting audit committee member/auditor status 
    2. individual_df 
        dataframe of each individual executive's information (professional background info to be added)
    3. summary_df 
        grouped dataframe by company, summing across all category types

In [None]:
# --- 1. Indivualized Dataframe where each row represents a unique executive --- 

# from the original exec_df dataframe, identify members of the audit committee and auditors 

# because audit committee member can also be extracted from the corrected executive data: update such that it also clears afterwards 

def is_audit_committee_member(responsibility, position):

    responsibility_cleaned = re.sub(r'\s', '', responsibility)
    
    return bool(re.search(r'감사위원회위원|감사위원|감사위원장', responsibility_cleaned)) 

def is_auditor_exclusive(responsibility, position): #isolated capture to not overlap with is_audit_committee_member
    if is_audit_committee_member(responsibility, position):
        return False
    
    responsibility_check = False
    if isinstance(responsibility, str):
        responsibility_cleaned = re.sub(r'\s', '', responsibility)

        responsibility_check = '감사' in responsibility_cleaned and not re.search(r'감사위원회위원|감사위원', responsibility_cleaned) 
    position_check = False 


    return responsibility_check or position_check

# apply updated audit membership 
exec_df['is_audit_committee_member'] = exec_df.apply(
    lambda row: is_audit_committee_member(row['Responsibilities'], row['Registered Officer Status']), axis=1
)

exec_df['is_auditor'] = exec_df.apply(
    lambda row: is_auditor_exclusive(row['Responsibilities'], row['Registered Officer Status']), axis=1
)

# build the individual-level dataframe

Constructing summary_df of grouped data by corp

In [None]:
exec_df.to_csv('exec_df', index = False)

# fix exec_df and save such that each individual is checked for audit committee membership before saving and reading into parsing notebook 

In [30]:
# --- 2. Grouped summary dataframe where each row represents a corporation --- 

def extract_summary(group):
    voting_directors_group = group[~group['Registered Officer Status'].isin(['미등기', '감사'])]

    female_voting = (voting_directors_group['Gender'] == '여').sum()
    male_voting = (voting_directors_group['Gender'] == '남').sum()
    
    return pd.Series({
        
        # total counts for all individuals 
        'Audit Committee': group['is_audit_committee_member'].sum(),
        'Audit Committee Outside Directors': ((group['is_audit_committee_member'] == True) & (group['Registered Officer Status'] == '사외이사')).sum(),
        'Inside Directors': group['Registered Officer Status'].isin(['사내이사', '대표집행임원']).sum(),
        'Outside Directors': (group['Registered Officer Status'] == '사외이사').sum(),
        'Other Non Exec Directors': (group['Registered Officer Status'] == '기타비상무이사').sum(),
        'Auditors': group['is_auditor'].sum(),
        
        # counts for Voting Directors subset
        'Female Voting': female_voting,
        'Male Voting': male_voting,
        'Voting Directors': female_voting + male_voting,

        # count for Non-Registered (separate)
        'Non Registered': (group['Registered Officer Status'] == '미등기').sum()
    })


# merge with kospi_codes to append corp_code, required for financial statment search in next section 
# Item code and stock_code are interchangeable 
summary_df = exec_df.groupby(['Company', 'Corp Code', 'Disclosure']).apply(extract_summary).reset_index()
summary_df

  summary_df = exec_df.groupby(['Company', 'Corp Code', 'Disclosure']).apply(extract_summary).reset_index()


Unnamed: 0,Company,Corp Code,Disclosure,Audit Committee,Audit Committee Outside Directors,Inside Directors,Outside Directors,Other Non Exec Directors,Auditors,Female Voting,Male Voting,Voting Directors,Non Registered
0,AJ네트웍스,00365387,20250325000220,0,0,3,2,1,1,0,6,6,12
1,AK홀딩스,00125080,20250318001263,3,3,4,3,1,0,1,7,8,1
2,BGF,00219097,20250318001407,0,0,2,2,0,1,0,4,4,0
3,BGF리테일,01263022,20250318000733,3,3,2,5,0,0,1,6,7,0
4,BNK금융지주,00858364,20250318001176,4,4,1,7,0,1,1,7,8,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
866,휴니드테크놀러지스,00111421,20250310000706,0,0,3,1,0,1,0,4,4,16
867,휴비스,00362238,20250317000508,0,0,1,3,2,0,0,6,6,9
868,휴스틸,00156488,20250318001178,3,3,3,3,1,0,0,7,7,9
869,흥국화재,00103176,20250320001444,3,3,3,4,0,3,0,7,7,19


In [None]:
summary_df['Disclosure'] = summary_df['Disclosure'].astype(str)
summary_df = summary_df.drop_duplicates(subset='Corp Code', keep='first')

In [13]:
search_list = summary_df['Corp Code']

In [14]:
# Assets Check: For each KOSPI corporation, check whether their total assets exceed the $2T minimum for a required audit committee 
# execution time ~ 11 minutes 
asset_year_int = int(asset_year)

search_list = summary_df['Corp Code'].astype(str) # define the list of corp_codes that we need to search, which dart/finstate requires as str type 

assets_info = []

for code in search_list: 
    code_str = str(code).zfill(8) # format corp code as an 8-digit string, add leading 0s where needed 

    # call the dart_finstate function for the current company
    url = 'https://engopendart.fss.or.kr/engapi/fnlttSinglAcntAll.json'

    # Try CFS first
    cfs_params = {
        'crtfc_key': API_key,
        'corp_code': code_str,
        'bsns_year': asset_year_int,
        'reprt_code': reprt_code,
        'fs_div': 'CFS'
    }

    ofs_params = cfs_params.copy()
    ofs_params['fs_div'] = 'OFS'

    res = requests.get(url, params=cfs_params)

    # If CFS fails or returns no usable data, fallback to OFS
    if not res.ok or res.json().get('status') != '000' or not res.json().get('list'):
        res = requests.get(url, params=ofs_params)
        source = 'OFS'
    else:
        source = 'CFS'

    data = res.json()

    if data['status'] != '000':
        print(f"Error: {data['message']}")
    else:
        df = pd.DataFrame(data['list'])
        total_assets_row = df[(df['sj_div'] == 'BS') & (df['account_nm'] == '자산총계')]

        if not total_assets_row.empty:
            total_assets = total_assets_row.iloc[0]['thstrm_amount']
            assets_info.append({
                'Corp Code': code,
                'Total Assets': total_assets})
        else:
            print(f"No financial statement data retrieved for corp code {code_str} or API error.")
    
    time.sleep(0.7) 

assets_df = pd.DataFrame(assets_info)
assets_df['Total Assets'] = (
    assets_df['Total Assets']
    .str.replace(',', '', regex=False)  # remove commas
    .astype('Int64')                    # convert to nullable integer for consistent formatting 
)

No financial statement data retrieved for corp code 00109693 or API error.
Error: No data viewed.
Error: No data viewed.
Error: No data viewed.
No financial statement data retrieved for corp code 00356361 or API error.
No financial statement data retrieved for corp code 00139889 or API error.
Error: No data viewed.
No financial statement data retrieved for corp code 00113492 or API error.
Error: No data viewed.
Error: No data viewed.
No financial statement data retrieved for corp code 00118026 or API error.
No financial statement data retrieved for corp code 00113562 or API error.
No financial statement data retrieved for corp code 01258507 or API error.
No financial statement data retrieved for corp code 00165413 or API error.
No financial statement data retrieved for corp code 00122694 or API error.
Error: No data viewed.
No financial statement data retrieved for corp code 00896285 or API error.
Error: No data viewed.
Error: No data viewed.
Error: No data viewed.
Error: No data viewe

In [15]:
summary_df['Corp Code'] = summary_df['Corp Code'].astype(str).str.zfill(8)
assets_df['Corp Code'] = assets_df['Corp Code'].astype(str).str.zfill(8)

summary_df = summary_df.merge(
    assets_df[['Corp Code', 'Total Assets']],
    on='Corp Code',
    how='left'
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary_df['Corp Code'] = summary_df['Corp Code'].astype(str).str.zfill(8)


In [None]:
# for the committee check, 

# dissonance 

In [16]:
def check_governance_compliance(df):
    """
    Evaluates governance compliance based on board and audit committee rules.
    Returns:
        pd.DataFrame with flagged issues for non-compliant companies.
    """
    flagged = []

    for index, row in df.iterrows():
        corp_code = row['Corp Code']
        total_assets = row['Total Assets']
        num_audit_committee = row['Audit Committee']
        num_outside_directors = row['Outside Directors']
        num_voting_directors = row['Voting Directors']
        num_outside_committee = row['Audit Committee Outside Directors']

        failures = []

        # Check audit committee compliances
        if pd.notna(total_assets) and total_assets > 2_000_000_000_000:
            if pd.isna(num_audit_committee) or num_audit_committee == 0:
                failures.append("No Audit Committee listed")
            elif num_audit_committee < 3:
                failures.append(f"Audit Committee has fewer than 3 members ({num_audit_committee})")

            if pd.notna(num_audit_committee) and num_audit_committee > 0:
                if pd.isna(num_outside_committee):
                    failures.append("Missing count of Audit Committee Outside Directors.")
                elif num_outside_committee < (2/3) * num_audit_committee:
                    failures.append(f"Audit Committee Outside Directors ({num_outside_committee}) < 2/3 of Audit Committee ({num_audit_committee})")
            if num_outside_directors < (1/4) * num_voting_directors:
                failures.append(f"Outside Directors ({num_outside_directors}) < 1/4 of Voting Directors ({num_voting_directors})")

        if failures:
            flagged.append({'Corp Code': corp_code, 'Flagged Conditions': "; ".join(failures)})

    failed_df = pd.DataFrame(flagged)
    failed_df['Corp Code'] = failed_df['Corp Code'].astype(str).str.zfill(8)

    # Merge with original data
    return pd.merge(failed_df, df, on='Corp Code', how='inner')

flagged = check_governance_compliance(summary_df) 

# 101 flagged 

In [None]:
# for corp listed in summary_df, pull a list of unreported corps (where audit committee = 0) 
# and parse through each of their subdocs using their disclosure number. the goal is, for each that has not reported their audit committee to 
# go through a more rigorous check through the 'VI. 이사회 등 회사의 기관에 관한 사항' report and update the summary_df with the findings on audit committee members, 
# specifically how large the committee is and how much of it is composed of outside directors 

# opendart reader documentation for subdocs: 
# 7. 확장 기능 of https://nbviewer.org/github/FinanceData/OpenDartReader/blob/master/docs/OpenDartReader_users_guide.ipynb

# excecution time: ~1 min

def missing_acm_urls(flagged_df):
    results = []

    for idx, row in flagged_df.iterrows():
        company = row['Company']
        corp = row['Corp Code']
        rcp = row['Disclosure']
        assets = row['Total Assets']

        try:
            subdocs = dart.sub_docs(str(rcp))  # rcept_no must be string
            match = subdocs[subdocs['title'].str.contains("이사회 등 회사의 기관에 관한 사항")]

            if not match.empty:
                url = match.iloc[0]['url']
            else:
                url = None

        except Exception as e:
            print(f"Failed to fetch for {corp} ({rcp}): {e}")
            url = None

        results.append({
            'corp_code': str(corp),
            'company': company,
            'rcept_no': rcp,
            'url': url,
            'assets': assets
        })

        time.sleep(0.7)

    return pd.DataFrame(results)

# 3. Retrieve target audit committee document URLs for flagged cases
audit_targets_df = missing_acm_urls(flagged).drop_duplicates(subset=['corp_code'])
audit_targets_df

Unnamed: 0,corp_code,company,rcept_no,url,assets
0,00219097,BGF,20250318001407,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,2171266453121
1,00303873,CJ CGV,20250321001474,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,3194214710432
2,00115694,DB증권,20250317000605,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,10211959858311
3,00116949,DN오토모티브,20250313001452,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,4567386501437
4,01437186,ESR켄달스퀘어리츠,20241031000319,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,2343918337381
...,...,...,...,...,...
72,00148610,한화투자증권,20250311001013,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,13250183608173
73,00164478,현대건설,20250312001136,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,23714495000000
74,00145880,현대제철,20250317000668,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,35218841930044
75,00117188,효성,20250313001006,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,5088858301845


In [None]:
# TO DO: fix so that it pulls info from the immediate table following:  <p>가. 감사위원 현황</p>
# <span style="font-family:DartNBSP, &quot;바탕&quot;, Batang;font-size:12pt;line-height:1.6em;font-weight:bold;color:#000000;">가. 감사위원 현황<br></span>

# make sure its not drawing from another table ie not captured in flags but incorrect count nonetheless 

# fix the structure of this notebook 

# also make sure that they're extracting names properly ie print out some examples and cross check manually 

In [18]:
# execution time ~ 2 minutes 

def parse_and_update_audit_members(audit_targets_df, exec_df, summary_df):
    updated_count = 0
    summary_updates = {}

    for idx, row in audit_targets_df.iterrows():
        corp_code = row['corp_code']
        company = row['company']
        url = row['url']
        rcept_no = row['rcept_no']

        if pd.isna(url) or not isinstance(url, str):
            continue

        try:
            response = requests.get(url, timeout=20)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            anchor = soup.find('a', attrs={'name': 'toc3'}, string='2. 감사제도에 관한 사항')
            anchor_p = anchor.find_parent('p') if anchor else None

            table = None
            candidate = anchor_p.find_next_sibling() if anchor_p else None

            while candidate:
                if candidate.name == 'table':
                    header_rows = []
                    for tr in candidate.find_all('tr'):
                        ths = tr.find_all('th')
                        if len(ths) == 1 and ths[0].has_attr('colspan'):
                            continue
                        header_rows.append(tr)
                        if len(header_rows) >= 2:
                            break

                    if len(header_rows) < 2:
                        candidate = candidate.find_next_sibling()
                        continue

                    headers = []
                    for tr in header_rows:
                        headers.extend([
                            th.get_text(strip=True).replace('\xa0', '').replace('\n', '')
                            for th in tr.find_all(['th', 'td'])
                        ])

                    name_idx = next((i for i, h in enumerate(headers) if '성명' in h), None)
                    outside_idx = next((i for i, h in enumerate(headers) if '사외이사' in h), None)

                    if name_idx is not None and outside_idx is not None:
                        table = candidate
                        break

                candidate = candidate.find_next_sibling() # iterate to the next table

            if table:
                members = []
                data_rows = table.find_all('tbody')[0].find_all('tr') if table.find('tbody') else table.find_all('tr')[len(header_rows):]

                for tr in data_rows:
                    tds = tr.find_all(['td', 'th'])
                    if len(tds) <= max(name_idx, outside_idx):
                        continue

                    name = tds[name_idx].get_text(strip=True)
                    is_outside = tds[outside_idx].get_text(strip=True)

                    if not name or name == '-' or not is_outside or is_outside == '-':
                        continue

                    is_outside_flag = '예' in is_outside or 'O' in is_outside
                    members.append((name, is_outside_flag))

                    mask = (
                        (exec_df['Corp Code'] == corp_code) &
                        (exec_df['Name'] == name)
                    )
                    if not exec_df.loc[mask].empty:
                        exec_df.loc[mask, 'is_audit_committee_member'] = True
                        updated_count += 1

                total_members = len(members)
                outside_members = sum(1 for _, flag in members if flag)

                if total_members == 1 and outside_members == 0:
                    total_members = 0
                    outside_members = 0

                if total_members > 0:
                    summary_updates[corp_code] = {
                        'Audit Committee': total_members,
                        'Audit Committee Outside Directors': outside_members
                    }
                    print(f"Updated {corp_code} - {company} - {rcept_no} - {summary_updates[corp_code]}")
                else:
                    summary_updates[corp_code] = {
                        'Audit Committee': 0,
                        'Audit Committee Outside Directors': 0
                    }
                    print(f"{corp_code} - {company} - {rcept_no}: Valid table found but no valid members.")

            else:
                print(f"{corp_code} - {company} - {rcept_no}: No valid audit committee table found.")

        except Exception as e:
            print(f"Exception occurred for {corp_code} - {company}: {e}")

        time.sleep(1.5)

    for corp_code, update in summary_updates.items():
        summary_df.loc[summary_df['Corp Code'] == corp_code, 'Audit Committee'] = update['Audit Committee']
        summary_df.loc[summary_df['Corp Code'] == corp_code, 'Audit Committee Outside Directors'] = update['Audit Committee Outside Directors']

    return exec_df, summary_df

exec_df_updated, summary_df_updated = parse_and_update_audit_members(audit_targets_df, exec_df, summary_df)

00219097 - BGF - 20250318001407: No valid audit committee table found.
Updated 00303873 - CJ CGV - 20250321001474 - {'Audit Committee': 4, 'Audit Committee Outside Directors': 4}
Updated 00115694 - DB증권 - 20250317000605 - {'Audit Committee': 3, 'Audit Committee Outside Directors': 3}
00116949 - DN오토모티브 - 20250313001452: No valid audit committee table found.
01437186 - ESR켄달스퀘어리츠 - 20241031000319: Valid table found but no valid members.
Updated 01568413 - F&F - 20250318001144 - {'Audit Committee': 3, 'Audit Committee Outside Directors': 3}
Updated 00500254 - GS - 20250318001410 - {'Audit Committee': 3, 'Audit Committee Outside Directors': 3}
Updated 01310269 - HDC현대산업개발 - 20250318000895 - {'Audit Committee': 3, 'Audit Committee Outside Directors': 3}
Updated 00164830 - HD한국조선해양 - 20250318001131 - {'Audit Committee': 3, 'Audit Committee Outside Directors': 3}
Updated 01390344 - HD현대중공업 - 20250318000897 - {'Audit Committee': 3, 'Audit Committee Outside Directors': 3}
Updated 01316254 - HS

In [24]:
for idx, row in audit_targets_df.iterrows():
    corp_code = row['corp_code']
    company = row['company']
    rcept_no = str(row['rcept_no'])  # Ensure it's a string
    audit = row['url']

    try:
        subdocs = dart.sub_docs(rcept_no)

        match = subdocs[subdocs['title'].str.contains("이사회 등 회사의 기관에 관한 사항")]

        if not match.empty:
            print(f"{corp_code} - {company} - {rcept_no} - {audit}")
        else:
            print(f"{corp_code} - {company} - {rcept_no}: Governance section not found.")
    except Exception as e:
        print(f"{corp_code} - {company} - {rcept_no}: Error - {e}")

00219097 - BGF - 20250318001407 - http://dart.fss.or.kr/report/viewer.do?rcpNo=20250318001407&dcmNo=10425605&eleId=119&offset=3170425&length=89257&dtd=dart4.xsd
00303873 - CJ CGV - 20250321001474 - http://dart.fss.or.kr/report/viewer.do?rcpNo=20250321001474&dcmNo=10448443&eleId=112&offset=4292328&length=114906&dtd=dart4.xsd
00115694 - DB증권 - 20250317000605 - http://dart.fss.or.kr/report/viewer.do?rcpNo=20250317000605&dcmNo=10414192&eleId=38&offset=3871883&length=156053&dtd=dart4.xsd
00116949 - DN오토모티브 - 20250313001452 - http://dart.fss.or.kr/report/viewer.do?rcpNo=20250313001452&dcmNo=10405111&eleId=125&offset=3259998&length=68432&dtd=dart4.xsd
01437186 - ESR켄달스퀘어리츠 - 20241031000319 - http://dart.fss.or.kr/report/viewer.do?rcpNo=20241031000319&dcmNo=10164914&eleId=42&offset=604202&length=34536&dtd=dart4.xsd
01568413 - F&F - 20250318001144 - http://dart.fss.or.kr/report/viewer.do?rcpNo=20250318001144&dcmNo=10423669&eleId=111&offset=2976222&length=77180&dtd=dart4.xsd
00500254 - GS - 2025

In [33]:
filtered_df = summary_df_updated[summary_df_updated['Corp Code'] == '00219097']
filtered_df

Unnamed: 0,Company,Corp Code,Disclosure,Audit Committee,Audit Committee Outside Directors,Inside Directors,Outside Directors,Other Non Exec Directors,Auditors,Female Voting,Male Voting,Voting Directors,Non Registered,Disclosure Date,Total Assets
2,BGF,219097,20250318001407,0,0,2,2,0,1,0,4,4,0,2025-03-18,2171266453121


In [34]:
filtered_df = exec_df_updated[exec_df_updated['Corp Code'] == '00219097']
filtered_df

Unnamed: 0,Disclosure,Corp Code,Company,Name,Gender,Position,Registered Officer Status,Responsibilities,Professional Background,Shareholder Relation,Period of employment,industry_code,is_audit_committee_member,is_auditor
353,20250318001407,219097,BGF,홍정국,남,부회장,사내이사,대표이사\n\n사외이사후보 \n추천위원회,·美스탠포드대학교 경제학과 졸업·美스탠포드대학교 산업공학 석사·美와튼 경영대학원(M...,최대주주의 자/임원,11년\n7개월,64992,False,False
354,20250318001407,219097,BGF,류철한,남,전무,사내이사,재경담당\n\n내부거래위원회,·동국대학교 무역학과 졸업·舊비지에프리테일 재무팀장 ...,임원,32년\n1개월,64992,False,False
355,20250318001407,219097,BGF,성영훈,남,이사,사외이사,자문역\n\n내부거래위원회\n\n사외이사후보\n추천위원회,·연세대학교 법학과 졸업\n·연세대학교 법과대학원 석사\n·연세대학교 법과대학원 박...,임원,4년\n10개월,64992,False,False
356,20250318001407,219097,BGF,김봉환,남,이사,사외이사,자문역\n\n내부거래위원회\n\n사외이사후보\n추천위원회,·서울대학교 경제학과 졸업\n·미시건대학 경영학과(MBA) 졸업\n·워싱턴대학 경영...,임원,1년\n10개월,64992,False,False
357,20250318001407,219097,BGF,전 홍,남,감사,감사,감사,·서울대학교 경영학과 졸업\n·서울대학교 회계학 석사\n·삼일회계법인\n·비지에프리...,임원,5년\n10개월,64992,False,True


In [64]:
exec_df_updated.to_csv('exec_df_updated', index = False)

In [65]:
second_flag = check_governance_compliance(summary_df_updated)

In [68]:
filtered_df = exec_df_updated[exec_df_updated['Corp Code'] == '00860730']
filtered_df

Unnamed: 0,Disclosure,Corp Code,Company,Name,Gender,Position,Registered Officer Status,Responsibilities,Professional Background,Shareholder Relation,Period of employment,industry_code,is_audit_committee_member,is_auditor
7,20250314000900,860730,에이리츠,김종국,남,대표이사,사내이사,총괄,- KB투자증권\n- 유진투자증권 \n- 삼성증권\n- 한국공인회계사,본인,2010.11~현재,68121,False,False
8,20250314000900,860730,에이리츠,김춘희,여,사내이사,사내이사,경영,- 유진투자증권\n- 하이투자증권\n- 교보증권,-,2011.01~현재,68121,False,False
9,20250314000900,860730,에이리츠,임일수,남,사외이사,사외이사,-,- 한화투자증권 대표이사\n- 푸르덴셜투자증권 대표이사,-,2018.03~현재,68121,False,False
10,20250314000900,860730,에이리츠,김기태,남,사외이사,사외이사,-,- 現)시리우스 인베스트먼트 대표이사\n- 화천종합기획실 부사장\n- (주)알피바이...,-,2018.11~현재,68121,False,False
11,20250314000900,860730,에이리츠,김홍구,남,감사,감사,-,- 現) 롯데관광개발 감사\n- 오성회계법인 대표\n- 윈즈회계사무소 운영\n- 한...,-,2019.03~현재,68121,False,False
12,20250314000900,860730,에이리츠,우성민,남,상무,미등기,준법\n 감시인,- 벨에포크자산운용\n- IBK 투자증권\n- 삼성증권,-,2020.02~현재,68121,False,False
13,20250314000900,860730,에이리츠,황성재,남,상무,미등기,투자사업\n 본부장,- 현대BS&C- 호야씨앤티- 학운산업개발- 세중아이앤디\n- 호야건설,-,2022.02~현재,68121,False,False


In [17]:
summary_df_updated = pd.read_csv('summary_df_updated')
summary_df_updated

Unnamed: 0,Company,Corp Code,Disclosure,Audit Committee,Audit Committee Outside Directors,Inside Directors,Outside Directors,Other Non Exec Directors,Auditors,Female Voting,Male Voting,Voting Directors,Non Registered,Disclosure Date,Total Assets
0,디씨엠,177199,20250718000228,0,0,3,1,0,0,0,4,4,8,2025-07-18,2.776427e+11
1,신도리코,135795,20250715000025,3,3,2,3,0,0,0,5,5,5,2025-07-15,1.078783e+12
2,조선선재,796994,20250715000164,3,3,1,3,0,0,0,4,4,0,2025-07-15,1.555382e+11
3,한일철강,163196,20250715000144,0,0,3,1,0,1,2,2,4,9,2025-07-15,3.310290e+11
4,우진아이엔에스,143262,20250711000031,0,0,2,2,0,1,0,4,4,7,2025-07-11,1.134977e+11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
840,오리엔트바이오,141626,20240620000211,0,0,3,1,0,1,1,3,4,5,2024-06-20,8.105022e+10
841,동원금속,118008,20240619000177,0,0,3,3,0,0,0,6,6,20,2024-06-19,4.878675e+11
842,대동전자,157104,20240618000424,3,3,2,3,0,0,0,5,5,0,2024-06-18,2.511756e+11
843,기신정기,106614,20240613000239,0,0,3,1,0,2,0,4,4,3,2024-06-13,2.651461e+11


In [None]:
# TODO

# to do: correct to cover the below cases or ammend the changes

# update summary_df in full and save as updated csv 

# correct comments andd sectioning

# work on individual_df

In [None]:
'''
Undercounted: 
00159698 - 지역난방공사 - 20250327000192 - 2
00261285 - 한국가스공사 - 20250325000742 - 2
00255619 - 강원랜드 - 20250318001106 - 2

Counts 2 when there are none: 
00195229 - 미스토홀딩스 - 20250321001837 - 2: 
'''

In [24]:
rcept_no = "20250312001136"  # Make sure it's a string

# Fetch sub-document metadata as a DataFrame
subdocs = dart.sub_docs(rcept_no)

# Search for the subdocument titled "이사회 등 회사의 기관에 관한 사항"
match = subdocs[subdocs['title'].str.contains("이사회 등 회사의 기관에 관한 사항")]

if not match.empty:
    print("Governance section URL:", match.iloc[0]["url"])
else:
    print("Governance section not found.")

Governance section URL: http://dart.fss.or.kr/report/viewer.do?rcpNo=20250312001136&dcmNo=10399030&eleId=122&offset=6215580&length=152371&dtd=dart4.xsd


In [None]:
rcept_no = "20250328002285"  # Make sure it's a string

# Fetch sub-document metadata as a DataFrame
subdocs = dart.sub_docs(rcept_no)

# Search for the subdocument titled "이사회 등 회사의 기관에 관한 사항"
match = subdocs[subdocs['title'].str.contains("이사회 등 회사의 기관에 관한 사항")]

if not match.empty:
    print("Governance section URL:", match.iloc[0]["url"])
else:
    print("Governance section not found.")

In [101]:
summary_df_updated.to_csv('summary_df_updated', index=False)

In [None]:
summary_df_updated

Unnamed: 0,Company,Corp Code,Disclosure,Audit Committee,Audit Committee Outside Directors,Inside Directors,Outside Directors,Other Non Exec Directors,Auditors,Female Voting,Male Voting,Voting Directors,Non Registered,Disclosure Date,Total Assets
0,디씨엠,00177199,20250718000228,0,0,3,1,0,0,0,4,4,8,2025-07-18,277642743004
1,신도리코,00135795,20250715000025,3,3,2,3,0,0,0,5,5,5,2025-07-15,1078782998517
2,조선선재,00796994,20250715000164,3,3,1,3,0,0,0,4,4,0,2025-07-15,155538184310
3,한일철강,00163196,20250715000144,0,0,3,1,0,1,2,2,4,9,2025-07-15,331028995726
4,우진아이엔에스,00143262,20250711000031,0,0,2,2,0,1,0,4,4,7,2025-07-11,113497686732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
840,오리엔트바이오,00141626,20240620000211,0,0,3,1,0,1,1,3,4,5,2024-06-20,81050220067
841,동원금속,00118008,20240619000177,0,0,3,3,0,0,0,6,6,20,2024-06-19,487867520934
842,대동전자,00157104,20240618000424,3,3,2,3,0,0,0,5,5,0,2024-06-18,251175551401
843,기신정기,00106614,20240613000239,0,0,3,1,0,2,0,4,4,3,2024-06-13,265146139041


SECTION 3: Checks on Grouped Company Data

OVERVIEW
1. Overlap: check that there are no overlapping cases where an executive is both an audit committee member AND auditor. 
    t/f check
2. Assets: For each company, parse through their financial statements to see if their assets > $2T KRW threshold and are subject to a mandatory audit committee.
    800 API requests, ~8 min to run assets call
    assets appeneded to summary_df
3. Governance Check 1: 
    For All Corporations:
    - outside directors make up >= 1/4 voting directors 
    For required audit committees: 
    - audit committee > 3
    - outside directors >= 2/3 audit committee 
4. Governance Check 2: 
    For each flagged case, go through a more rigourous check for false negatives and correct any excluded information.

RESOURCES 
Retrieve Total Assets from OPENDART
    OPENDART Financial Statement Documentation: https://engopendart.fss.or.kr/guide/detail.do?apiGrpCd=DE003&apiId=AE00033
    Dart Package Documentation: https://nbviewer.org/github/FinanceData/OpenDartReader/blob/master/docs/OpenDartReader_reference_manual.ipynb 


USAGE 
Update dart.finstate() bsns_year and reprt_code
- First Quarterly Report : 11013
- Semi-annual Report : 11012
- Third Quarterly Report : 11014
- Annual Report : 11011
Validate flagged governance cases 

In [254]:
# check for overlapping cases of audit committee member and auditor 

overlap_cases = exec_df[(exec_df['is_audit_committee_member'] == True) & (exec_df['is_auditor'] == True)]

if not overlap_cases.empty:
    print("WARNING: Found instances where an individual is both an audit committee member and an exclusive auditor:")
    print(overlap_cases[['Name', 'Responsibilities', 'Position', 'is_audit_committee_member', 'is_auditor']])
else:
    print("No instances found where an individual is both an audit committee member and an exclusive auditor. The logic for non-overlap is working as expected.")

No instances found where an individual is both an audit committee member and an exclusive auditor. The logic for non-overlap is working as expected.


In [None]:
# 195229	Audit Committee Outside Directors (0) < 2/3 of...	미스토홀딩스	20250321001837 
# audit committee = 3 when none exists

# Counts Under: 
''' 
20250318000895
20250311001270
20250313000901	
20250318000465
20250318000789
20250327000192
20250331003380
'''

# Correct Under 3 
''' 
20250313000770
20250314001522	
20250306000580 
'''


In [None]:
# 0 Audit Committee Errors: 

# Different formatting of audit committe table: 20250318000789
'''   
http://dart.fss.or.kr/report/viewer.do?rcpNo=20250318000789&dcmNo=10421007&eleId=38&offset=2967448&length=134337&dtd=dart4.xsd 


성 명	주 요 경 력	    비 고
Name                   사외이사 Outside Director  
'''

'   \nhttp://dart.fss.or.kr/report/viewer.do?rcpNo=20250318000789&dcmNo=10421007&eleId=38&offset=2967448&length=134337&dtd=dart4.xsd \n\n\n성 명\t주 요 경 력\t    비 고\nName                   사외이사 Outside Director  \n'

SECTION 4: Fix Governance Check
Read in additional disclosure file to cross check false negatives that were flagged by the initial governance check 

Take in these columns to update outside director and audit committee member counts:
    'Category': 
        Registered director(Outside director, excluding members of the audit committee) -> excludes outside directors and audit committee members 
        Outside director (excluding members of the audit committee)
        Member of the audit committee or auditor 
        Auditor 
    'Headcount' 

To get audit committee: Member of the audit committee or auditor  - Auditor
To get outside directors in audit committee: Member of the audit committee or auditor - summary_df[outside directors] - outside director

In [129]:
summary_df.to_csv('by_corp_df', index = False)

Building individual_df

In [None]:
# BEFORE RUNNING: open the parsing notebook and run
# if run correctly, it will read in the exec_df file and save regex_parsed_df,
# a data frame of individual entries where their professional background has been parsed

regex_parsed_df = pd.read_csv('regex_parsed_df.csv')
exec_df = exec_df.reset_index(drop=True)
exec_df = pd.concat([exec_df, regex_parsed_df.drop(columns=['user_id'])], axis=1)
exec_df.to_csv('exec_df')

In [None]:
exec_df = pd.read_csv('exec_df')
exec_df = exec_df.drop('Unnamed: 0', axis = 1)
exec_df['Responsibilities'] = exec_df['Responsibilities'].str.replace('\n', ', ')

In [135]:
by_corp_df = pd.read_csv('by_corp_df')

In [137]:
by_corp_df

Unnamed: 0,corp_code,Name of company,Item code,Audit Committee,Audit Committee Outside Directors,Voting Directors,Inside Directors,Outside Directors,Other Non Exec Directors,Female Voting,Male Voting,Non Registered,total_assets,compliance_flag
0,684802,A Plus Asset Advisor,244920,3,3,7,2,3,2,0,7,0,4.792762e+11,Pass
1,860730,A SELF-ADMINISTERED REAL ESTATE INVESTMENT TRUST,140910,0,0,4,2,2,0,1,3,3,8.288426e+10,Pass
2,400857,ABLE C&C,78520,3,3,8,0,3,4,3,5,0,1.590859e+11,Pass
3,936787,AEKYUNG CHEMICAL,161000,0,0,8,4,3,1,2,6,8,1.356432e+12,Pass
4,365387,AJ Networks,95570,0,0,6,3,2,1,0,6,11,1.717770e+12,Pass
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
807,878915,iM Financial Group,139130,8,8,9,1,8,0,2,7,8,1.279968e+12,Pass
808,360142,iMarketKorea,122900,0,3,7,3,3,1,0,7,10,4.414712e+12,No Audit Committee listed
809,1244601,kakaopay,377300,3,3,6,1,4,1,1,5,10,4.350661e+11,Pass
810,372882,ktcs corporation,58850,3,3,7,2,3,2,2,5,0,4.699321e+11,Pass
