In [2]:
# import libraries and set parameters 

import os
import io
import re
import time
import zipfile 
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor 

import dart_fss
import OpenDartReader

# The OpenDART key is read from the environment so the credential is never
# stored in the notebook or committed to version control.
# NOTE(review): the key that used to be hardcoded on this line should be treated
# as exposed and rotated on the OpenDART site.
API_key = os.environ.get('OPENDART_API_KEY', '').strip()
if not API_key:
    raise RuntimeError(
        "OPENDART_API_KEY is not set. Export your OpenDART API key as an "
        "environment variable before running this notebook."
    )
dart = OpenDartReader(API_key)
dart_fss.set_api_key(API_key)

# Reporting period passed to every fetch function below.
bsns_year = '2024'
reprt_code = '11011'  # annual report

# call most recent annual/semi-annual report as quarterly may exclude audit info due to corporate disclosure form preparation standards 

Error occurred during getting browser(s): random, but was suppressed with fallback.


In [3]:
# Re-create the OpenDartReader client and re-register the key with dart_fss.
# NOTE(review): the first cell already runs these two initialization calls;
# this cell repeats them and prints a confirmation message.
dart = OpenDartReader(API_key)
dart_fss.set_api_key(API_key)
print("OpenDart API initialized.")

OpenDart API initialized.


In [None]:
# Raw-data directory, two levels above the notebook's working directory.
# Each component is joined separately so the platform's own separator is used;
# a literal '..\..' is a single (oddly named) directory on POSIX systems.
BASE_DATA_DIR = os.path.join('..', '..', 'data', 'raw')
os.makedirs(BASE_DATA_DIR, exist_ok=True)
print(f"Raw data directory exists at: {BASE_DATA_DIR}")

Raw data directory exists at: ..\..\data\raw


  BASE_DATA_DIR = os.path.join('..\..', 'data', 'raw')


In [5]:
def save_df_to_csv(df: pd.DataFrame, file_path: str, index: bool = False):
    """Write ``df`` to ``file_path`` as CSV, creating the parent directory first.

    Args:
        df: DataFrame to write.
        file_path: Destination path; may be a bare file name.
        index: Whether to write the DataFrame index (passed to ``DataFrame.to_csv``).

    Errors raised while writing the file are printed, not re-raised.
    """
    parent_dir = os.path.dirname(file_path)
    # dirname() is '' for a bare file name, and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when there is one to create.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    try:
        df.to_csv(file_path, index=index)
        print(f"DataFrame saved to {file_path}")
    except Exception as e:
        print(f"Error saving DataFrame to CSV {file_path}: {e}")

In [5]:
# get all listed corp codes (keys to access opendart) 
# opendart link: https://engopendart.fss.or.kr/guide/detail.do?apiGrpCd=DE001&apiId=AE00004

def get_corp_code(api_key: str, output_dir: str = BASE_DATA_DIR) -> pd.DataFrame:
    """Download the OpenDART corp-code archive and return the listed companies.

    The zipped ``CORPCODE.xml`` is extracted into a local ``dart_data`` directory,
    parsed, and filtered to entries whose ``stock_code`` has exactly six
    characters. The result is also saved as ``listed_corp_codes.csv`` in
    ``output_dir``.

    Args:
        api_key: OpenDART API key.
        output_dir: Directory that receives ``listed_corp_codes.csv``.

    Returns:
        DataFrame with columns ``corp_code``, ``corp_name``, ``corp_eng_name``
        and ``stock_code``.

    Raises:
        requests.exceptions.RequestException: on network failure, timeout or a
            4xx/5xx response.
        zipfile.BadZipFile: if the response body is not a zip archive.
    """
    # The key goes in `params` so it is URL-encoded by requests; the timeout
    # keeps a stalled connection from hanging the notebook indefinitely.
    response = requests.get(
        'https://opendart.fss.or.kr/api/corpCode.xml',
        params={'crtfc_key': api_key},
        timeout=30,
    )
    response.raise_for_status()

    # make sure the extraction directory exists
    extract_dir = 'dart_data'
    os.makedirs(extract_dir, exist_ok=True)

    # unzip and extract CORPCODE.xml
    try:
        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
            z.extractall(extract_dir)
    except zipfile.BadZipFile as exc:
        # A non-zip body is presumably an API error payload (e.g. a rejected
        # key); include the start of it so the failure can be diagnosed.
        raise zipfile.BadZipFile(
            f"corpCode response was not a zip archive: {response.text[:200]!r}"
        ) from exc
    xml_path = os.path.join(extract_dir, 'CORPCODE.xml')

    # parse XML
    root = ET.parse(xml_path).getroot()

    # collect listed companies (6-digit stock code only)
    columns = ['corp_code', 'corp_name', 'corp_eng_name', 'stock_code']
    corp_list = []
    for corp in root.findall('list'):
        stock_code = corp.findtext('stock_code')
        if stock_code and len(stock_code) == 6:
            corp_list.append({
                'corp_code': corp.findtext('corp_code'),
                'corp_name': corp.findtext('corp_name'),
                'corp_eng_name': corp.findtext('corp_eng_name'),
                'stock_code': stock_code
            })

    # Explicit columns keep the schema stable even when nothing matched.
    corp_codes_df = pd.DataFrame(corp_list, columns=columns)
    output_filepath = os.path.join(output_dir, 'listed_corp_codes.csv')
    save_df_to_csv(corp_codes_df, output_filepath)
    return corp_codes_df

In [None]:
def get_kospi_company_info(api_key: str, corp_codes_df: pd.DataFrame, output_dir: str = BASE_DATA_DIR) -> pd.DataFrame:
    """Fetch company overviews and keep the companies whose ``corp_cls`` is ``'Y'``.

    One request per row of ``corp_codes_df`` is sent to the OpenDART
    ``company.json`` endpoint. Matching companies are collected and saved as
    ``kospi_company_info.csv`` in ``output_dir``.

    Args:
        api_key: OpenDART API key used for every request.
        corp_codes_df: DataFrame with ``corp_code`` and ``corp_name`` columns.
        output_dir: Directory that receives ``kospi_company_info.csv``.

    Returns:
        DataFrame with columns ``corp_name``, ``corp_code``, ``stock_code``,
        ``ceo_name`` and ``industry_code``. Companies whose request fails are
        reported with ``print`` and skipped.
    """
    api_endpoint = "https://engopendart.fss.or.kr/engapi/company.json"
    columns = ['corp_name', 'corp_code', 'stock_code', 'ceo_name', 'industry_code']
    data = []

    for _, row in corp_codes_df.iterrows():
        # Codes reloaded from CSV may have been parsed as integers; restore the
        # 8-character zero-padded form used by the other fetch functions.
        corp_code = str(row['corp_code']).zfill(8)
        corp_name = row['corp_name']

        params = {
            # Use the key passed by the caller, not the notebook-level global.
            'crtfc_key': api_key,
            'corp_code': corp_code
        }
        try:
            response = requests.get(api_endpoint, params=params, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses
            info = response.json()

            # Filter for KOSPI companies ('Y' indicates Yoo-ga-jeung-kwon Market)
            if info and info.get('corp_cls') == 'Y':
                data.append({
                    'corp_name': info.get('corp_name'),
                    'corp_code': info.get('corp_code'),
                    'stock_code': info.get('stock_code'),
                    'ceo_name': info.get('ceo_nm'),
                    'industry_code': info.get('induty_code'),
                })
            time.sleep(0.01)
        except Exception as e:
            print(f"Failed to fetch company info for {corp_name} ({corp_code}): {e}")
            continue

    # Explicit columns keep the schema stable even when no company matched.
    kospi_codes_df = pd.DataFrame(data, columns=columns)
    output_filepath = os.path.join(output_dir, 'kospi_company_info.csv')
    save_df_to_csv(kospi_codes_df, output_filepath)
    return kospi_codes_df

In [None]:
def get_executive_status_data(api_key: str, kospi_codes_df: pd.DataFrame, bsns_year: int, reprt_code: str, output_dir: str = os.path.join('data', 'raw')) -> pd.DataFrame:
    """Fetch executive-status records (``exctvSttus``) for every company given.

    Args:
        api_key: OpenDART API key.
        kospi_codes_df: DataFrame with ``corp_code`` and ``corp_name`` columns.
        bsns_year: Business year sent as the ``bsns_year`` request parameter.
        reprt_code: Report code sent as the ``reprt_code`` request parameter.
        output_dir: Directory that receives
            ``executive_status_{bsns_year}_{reprt_code}.csv``.

    Returns:
        All returned records concatenated into one DataFrame, or an empty
        DataFrame (nothing is saved) when no company returned data.
    """
    results = []
    api_endpoint = "https://opendart.fss.or.kr/api/exctvSttus.json"

    for _, row in kospi_codes_df.iterrows():
        # Codes reloaded from CSV may have been parsed as integers; restore the
        # 8-character zero-padded form, as the salary/shareholder fetchers do.
        corp_code = str(row['corp_code']).zfill(8)
        corp_name = row['corp_name']

        params = {
            'crtfc_key': api_key,
            'corp_code': corp_code,
            'bsns_year': bsns_year,
            'reprt_code': reprt_code
        }

        try:
            # The timeout stops one stalled connection from hanging the loop.
            response = requests.get(api_endpoint, params=params, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            data = response.json()
            status = data.get('status')

            if status == '000':  # Success
                if data.get('list'):
                    results.append(pd.DataFrame(data['list']))
            elif status == '013':  # same "no data" status get_major_shareholder_data handles
                print(f"No executive data available for {corp_name} ({corp_code}) for {bsns_year}/{reprt_code}.")
            else:
                # Any other status is a real API error and must not be
                # reported as "no data".
                print(f"API Error for {corp_name} ({corp_code}): Status {status}, Message: {data.get('message')}")

        except Exception as e:
            print(f"An unexpected error occurred for {corp_name} ({corp_code}): {e}")

        time.sleep(0.07)

    if results:
        executive_status_df = pd.concat(results, ignore_index=True)
        output_filepath = os.path.join(output_dir, f'executive_status_{bsns_year}_{reprt_code}.csv')
        save_df_to_csv(executive_status_df, output_filepath)
        print(f"\nSuccessfully fetched and saved executive status for {len(executive_status_df)} records.")
        return executive_status_df
    else:
        print("\nNo executive status data was retrieved.")
        return pd.DataFrame()

In [None]:
def get_total_assets(kospi_company_info_df: pd.DataFrame, bsns_year: str, reprt_code: str, API_key: str, output_dir: str = os.path.join('data', 'raw')) -> pd.DataFrame:
    """Fetch total assets for each company from the ``fnlttSinglAcntAll`` endpoint.

    For every ``corp_code`` the ``CFS`` statements are requested first and
    ``OFS`` second. The first balance-sheet (``sj_div == "BS"``) row whose
    account name matches one of the total-asset labels and has a usable
    current-period amount supplies the three amounts.

    Args:
        kospi_company_info_df: DataFrame with a ``corp_code`` column.
        bsns_year: Business year; also used to label the output columns.
        reprt_code: Report code sent as the ``reprt_code`` request parameter.
        API_key: OpenDART API key.
        output_dir: Directory that receives ``assets_{bsns_year}_{reprt_code}.csv``.

    Returns:
        DataFrame with ``Corp Code``, ``rcept_no`` and one nullable-integer
        ``Total Assets (<year>)`` column for the business year and each of the
        two preceding years. Companies without a match keep missing values.
    """
    api_url = 'https://opendart.fss.or.kr/api/fnlttSinglAcntAll.json'
    target_sj_div = "BS"
    target_account_names = {"자산총계", "총자산", "자산"}  # possible labels for total assets
    year = int(bsns_year)

    def _to_amount(raw):
        """Convert an amount field to a number; missing or non-numeric values become NaN."""
        text = '' if raw is None else str(raw).replace(',', '').strip()
        if not text:
            return float('nan')
        return pd.to_numeric(text, errors='coerce')

    all_results = []

    for raw_code in kospi_company_info_df['corp_code']:
        # Codes reloaded from CSV may have been parsed as integers; restore the
        # 8-character zero-padded form, as the salary/shareholder fetchers do.
        corp_code = str(raw_code).zfill(8)
        rcept_no, assets, prior_assets, two_years_ago = None, None, None, None

        # Try CFS first, then OFS
        for fs_div in ['CFS', 'OFS']:
            params = {'crtfc_key': API_key, 'corp_code': corp_code, 'bsns_year': bsns_year, 'reprt_code': reprt_code, 'fs_div': fs_div}
            try:
                res = requests.get(api_url, params=params, timeout=10)
                res.raise_for_status()
                data = res.json()
            except (requests.exceptions.RequestException, ValueError):
                continue

            if data.get('status') != '000' or not data.get('list'):
                continue

            for item in data['list']:
                account = str(item.get('account_nm') or '').strip().replace(' ', '')
                if item.get('sj_div') != target_sj_div or account not in target_account_names:
                    continue
                current = _to_amount(item.get('thstrm_amount'))
                # A matching row with a blank amount is not a hit; keep
                # scanning for a row that actually carries the figure.
                if pd.isna(current):
                    continue
                rcept_no = item.get('rcept_no')
                assets = current
                prior_assets = _to_amount(item.get('frmtrm_amount'))
                two_years_ago = _to_amount(item.get('bfefrmtrm_amount'))
                break  # Found a match, stop scanning this statement

            if assets is not None:
                break  # Found assets, no need to try the next fs_div

        if assets is None:
            print(f"Total Assets not found for {corp_code}.")

        all_results.append({
            'Corp Code': corp_code,
            'rcept_no': rcept_no,
            f'Total Assets ({year})': assets,
            f'Total Assets ({year - 1})': prior_assets,
            f'Total Assets ({year - 2})': two_years_ago
        })

        time.sleep(0.07)

    assets_df = pd.DataFrame(all_results)

    # Nullable Int64 keeps whole-number amounts while allowing missing values.
    for y_offset in [0, 1, 2]:
        col_name = f'Total Assets ({year - y_offset})'
        if col_name in assets_df.columns:
            assets_df[col_name] = pd.to_numeric(assets_df[col_name], errors='coerce').astype('Int64')

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"assets_{bsns_year}_{reprt_code}.csv")
    assets_df.to_csv(output_path, index=False, encoding='utf-8-sig')

    return assets_df

In [None]:
def get_salary_type(kospi_company_info_df: pd.DataFrame, bsns_year: str, reprt_code: str, API_key: str, output_dir: str = os.path.join('data', 'raw')) -> pd.DataFrame:
    """Collect compensation records per company from three OpenDART endpoints.

    For each company the individual, unregistered-executive and grouped
    compensation endpoints are queried in that order and their rows are
    normalised into one schema. Rows from the individual endpoint are labelled
    ``salary_type='exact'``; rows from the other two are labelled ``'estimate'``.

    Args:
        kospi_company_info_df: DataFrame with a ``corp_code`` column.
        bsns_year: Business year sent as the ``bsns_year`` request parameter.
        reprt_code: Report code sent as the ``reprt_code`` request parameter.
        API_key: OpenDART API key.
        output_dir: Directory that receives
            ``salary_separate_{bsns_year}_{reprt_code}.csv``.

    Returns:
        DataFrame with columns ``corp_code``, ``name``, ``position``,
        ``compensation``, ``salary_source`` and ``salary_type``. When no company
        returned data an empty DataFrame with these columns is returned and no
        file is written.
    """
    columns = ['corp_code', 'name', 'position', 'compensation', 'salary_source', 'salary_type']

    # One entry per endpoint:
    # (url, name field or None, position field, compensation field, salary_source, salary_type)
    sources = (
        # 1. Individual executives (개인별 보수)
        ('https://opendart.fss.or.kr/api/hmvAuditIndvdlBySttus.json',
         'nm', 'ofcps', 'mendng_totamt', '개인별보수', 'exact'),
        # 2. Unregistered executives (미등기 임원)
        ('https://opendart.fss.or.kr/api/unrstExctvMendngSttus.json',
         None, 'se', 'jan_salary_am', '미등기임원', 'estimate'),
        # 3. Grouped executives (임원 전체 보수 유형)
        ('https://opendart.fss.or.kr/api/drctrAdtAllMendngSttusMendngPymntamtTyCl.json',
         None, 'se', 'psn1_avrg_pymntamt', '임원전체보수유형', 'estimate'),
    )

    # === Internal Helper Function: DART API JSON request ===
    def _get_json(url, corp_code):
        """Return the ``list`` payload for one request, or ``[]`` on any failure."""
        params = {
            'crtfc_key': API_key,
            'corp_code': corp_code,
            'bsns_year': bsns_year,
            'reprt_code': reprt_code
        }
        try:
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()

            if data.get('status') != '000' or 'list' not in data:
                if data.get('status') != '000':
                    print(f"OPENDART Error for {corp_code}: {data.get('message')}")
                return []
            return data['list']
        except Exception as e:
            print(f"Request failed for {url} with params {params}: {e}")
            return []

    # === Internal Helper Function: Fetches data for a single company ===
    def _get_salary_data_for_corp(corp_code):
        """Query every endpoint for one company and normalise the rows."""
        records = []
        for url, name_key, position_key, amount_key, source_label, type_label in sources:
            for row in _get_json(url, corp_code):
                records.append({
                    'corp_code': corp_code,
                    # Only the individual endpoint supplies a name field.
                    'name': row.get(name_key) if name_key else '',
                    'position': row.get(position_key),
                    'compensation': row.get(amount_key),
                    'salary_source': source_label,
                    'salary_type': type_label
                })
        return pd.DataFrame(records, columns=columns)

    # === Main loop to process all companies ===
    all_salary_data = []

    for corp_code in kospi_company_info_df['corp_code'].apply(lambda c: str(c).zfill(8)):
        df = _get_salary_data_for_corp(corp_code)

        if not df.empty:
            all_salary_data.append(df)

        # Respect DART API rate limits
        time.sleep(0.07)

    # pd.concat raises ValueError on an empty list, so handle "nothing fetched"
    # explicitly instead of crashing at the end of a long run.
    if not all_salary_data:
        print("No salary data was retrieved.")
        return pd.DataFrame(columns=columns)

    # Concatenate all individual DataFrames into one
    final_df = pd.concat(all_salary_data, ignore_index=True)

    # Save the final DataFrame to a CSV file
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"salary_separate_{bsns_year}_{reprt_code}.csv")
    final_df.to_csv(output_path, index=False, encoding='utf-8-sig')

    print(f"Salary data for {len(all_salary_data)} companies saved to: {output_path}")

    return final_df

In [8]:
def get_salary_total(kospi_company_info_df: pd.DataFrame, bsns_year: str, reprt_code: str, API_key: str, output_dir: str = os.path.join('data', 'raw')) -> pd.DataFrame:
    """Fetch the ``hmvAuditAllSttus`` records for every company and combine them.

    Args:
        kospi_company_info_df: DataFrame with a ``corp_code`` column.
        bsns_year: Business year sent as the ``bsns_year`` request parameter.
        reprt_code: Report code sent as the ``reprt_code`` request parameter.
        API_key: OpenDART API key.
        output_dir: Directory that receives
            ``salary_total_data_{bsns_year}_{reprt_code}.csv``.

    Returns:
        The concatenated records of all companies that returned data, or an
        empty DataFrame (no file written) when none did.
    """
    url = "https://opendart.fss.or.kr/api/hmvAuditAllSttus.json"

    # === Internal Helper Function: DART API JSON request ===
    def _get_json(url, corp_code):
        """Return the ``list`` payload for one request, or ``[]`` on any failure."""
        params = {
            'crtfc_key': API_key,
            'corp_code': corp_code,
            'bsns_year': bsns_year,
            'reprt_code': reprt_code
        }

        try:
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()

            if data.get('status') != '000' or 'list' not in data:
                if data.get('status') != '000':
                    print(f"DART API Error for {corp_code}: {data.get('message')}")
                return []
            return data['list']

        except Exception as e:
            print(f"Request failed for {url} with params {params}: {e}")
            return []

    salary_total = []

    for corp_code in kospi_company_info_df['corp_code'].apply(lambda c: str(c).zfill(8)):
        data_list = _get_json(url, corp_code)

        if data_list:
            salary_total.append(pd.DataFrame(data_list))

        time.sleep(0.07)

    # pd.concat raises ValueError on an empty list, so handle "nothing fetched"
    # explicitly instead of crashing at the end of a long run.
    if not salary_total:
        print("No salary total data was retrieved.")
        return pd.DataFrame()

    # concatenate all individual DataFrames into one
    final_df = pd.concat(salary_total, ignore_index=True)

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"salary_total_data_{bsns_year}_{reprt_code}.csv")
    final_df.to_csv(output_path, index=False, encoding='utf-8-sig')

    print(f"Salary data for {len(salary_total)} companies saved to: {output_path}")

    # The signature promises a DataFrame and the caller assigns the result.
    return final_df

In [15]:
def get_major_shareholder_data(api_key: str, kospi_codes_df: pd.DataFrame, bsns_year: int, reprt_code: str, output_dir: str = os.path.join('data', 'raw')) -> pd.DataFrame:
    # pulled from: https://opendart.fss.or.kr/guide/detail.do?apiGrpCd=DS002&apiId=2019007
    """
    Fetches major shareholder status data for a list of KOSPI corporations from OpenDART API.

    Args:
        api_key: Your OpenDART API key.
        kospi_codes_df: DataFrame containing 'corp_code' and 'corp_name' for KOSPI companies.
        bsns_year: Business year.
        reprt_code: Report code ('11011' for Annual Report, '11012' for Half-Year Report, etc.).
        output_dir: Directory to save the resulting CSV.

    Returns:
        A pandas DataFrame containing combined major shareholder data for all fetched companies,
        or an empty DataFrame (nothing is saved) when no company returned data.
    """
    print(f"Fetching major shareholder data for business year {bsns_year} and report code {reprt_code}...")
    results = []
    api_endpoint = "https://opendart.fss.or.kr/api/hyslrSttus.json"

    for _, row in kospi_codes_df.iterrows():
        # The API is queried with the 8-character zero-padded code; the same
        # value is used in log messages so they match what was requested.
        corp_code = str(row['corp_code']).zfill(8)
        corp_name = row['corp_name']

        params = {
            'crtfc_key': api_key,
            'corp_code': corp_code,
            'bsns_year': bsns_year,
            'reprt_code': reprt_code
        }

        try:
            # The timeout stops one stalled connection from hanging the loop.
            response = requests.get(api_endpoint, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
            status = data.get('status')

            if status == '000':
                if data.get('list'):
                    results.append(pd.DataFrame(data['list']))
            elif status == '013':
                print(f"No shareholder data available for {corp_name} ({corp_code}) for {bsns_year}/{reprt_code}.")
            else:
                print(f"API Error for {corp_name} ({corp_code}): Status {status}, Message: {data.get('message')}")

        except Exception as e:
            print(f"An unexpected error occurred for {corp_name} ({corp_code}): {e}")

        # NOTE(review): the other fetchers pause 0.07 s; confirm 0.7 s is intended.
        time.sleep(0.7)

    if results:
        shareholder_df = pd.concat(results, ignore_index=True)
        output_filepath = os.path.join(output_dir, f'major_shareholders_{bsns_year}_{reprt_code}.csv')
        save_df_to_csv(shareholder_df, output_filepath)
        print(f"\nSuccessfully fetched and saved major shareholder data for {len(shareholder_df)} records.")
        return shareholder_df
    else:
        print("\nNo major shareholder data was retrieved.")
        return pd.DataFrame()

In [None]:
# RUNNING BLOCK
# Runs every fetch function in dependency order: corp codes first, then the
# filtered company list, then each per-company dataset built from that list.

# 1. Construct Files: if running for the first time
all_corp_codes_df = get_corp_code(API_key)
kospi_company_info_df = get_kospi_company_info(api_key=API_key, corp_codes_df=all_corp_codes_df)
# Normalise corp codes to 8-character zero-padded strings before they are passed on.
kospi_company_info_df["corp_code"] = kospi_company_info_df["corp_code"].astype(str).str.zfill(8)
executive_status_data_df = get_executive_status_data(API_key, kospi_company_info_df, bsns_year, reprt_code) # Execution Time ~ 11 minutes 
assets_df = get_total_assets(kospi_company_info_df, bsns_year, reprt_code, API_key)
salary_separate_df = get_salary_type(kospi_company_info_df, bsns_year, reprt_code, API_key) 
salary_total_df = get_salary_total(kospi_company_info_df, bsns_year, reprt_code, API_key)
major_shareholder_df = get_major_shareholder_data(API_key, kospi_company_info_df, bsns_year, reprt_code) # Execution Time ~ 11 minutes 

# Total Execution Time: 49 minutes

DataFrame saved to data\raw\listed_corp_codes.csv
DataFrame saved to data\raw\kospi_company_info.csv
Fetching executive status data for business year 2024 and report code 11011...
No executive data available for 미래에셋맵스 아시아퍼시픽 부동산공모 1호 투자회사 (00600013) for 2024/11011.
No executive data available for 맥쿼리한국인프라투융자회사 (00435297) for 2024/11011.
No executive data available for 한국투자ANKOR유전해외자원개발특별자산투자회사1호(지분증권) (00907013) for 2024/11011.
No executive data available for 케이비발해인프라투융자회사 (01880801) for 2024/11011.
No executive data available for 주식회사 대신밸류리츠위탁관리부동산투자회사 (01885222) for 2024/11011.
No executive data available for 대한조선 주식회사 (00182696) for 2024/11011.
DataFrame saved to data\raw\executive_status_2024_11011.csv

Successfully fetched and saved executive status for 15082 records.
  Total Assets not found for 00600013.
  Total Assets not found for 00435297.
  Total Assets not found for 00907013.
  Total Assets not found for 01880801.
  Total Assets not found for 00112998.
  Total Assets not f

In [78]:
# 2. Read in Files: if run before, read in the files
# Single place to change when the project lives elsewhere.
RAW_DATA_DIR = r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw'
# Identifier columns are read as text. Left to type inference, a code such as
# '00126380' is parsed as the integer 126380 and loses its leading zeros.
# Keys naming a column that a given file does not contain are not applied.
ID_DTYPES = {'corp_code': str, 'stock_code': str, 'Corp Code': str}

all_corp_codes_df = pd.read_csv(os.path.join(RAW_DATA_DIR, 'listed_corp_codes.csv'), dtype=ID_DTYPES)
kospi_company_info_df = pd.read_csv(os.path.join(RAW_DATA_DIR, 'kospi_company_info.csv'), dtype=ID_DTYPES)
executive_status_data_df = pd.read_csv(os.path.join(RAW_DATA_DIR, 'executive_status_2024_11011.csv'), dtype=ID_DTYPES)
assets_df = pd.read_csv(os.path.join(RAW_DATA_DIR, 'assets_2024_11011.csv'), dtype=ID_DTYPES)
salary_total_df = pd.read_csv(os.path.join(RAW_DATA_DIR, 'salary_total_data_2024_11011.csv'), dtype=ID_DTYPES)
major_shareholder_df = pd.read_csv(os.path.join(RAW_DATA_DIR, 'major_shareholders_2024_11011.csv'), dtype=ID_DTYPES)

In [None]:
'''  
08/02 TODO: 
# financials: because it's not the same format and doesn't indicate when 'net' income is loss or profit, only within the summary financial information sheet -> check through scraping 

- complete data extraction 
    - make sure that financial metrics is pulling correctly: should have complete data from ALL 
    - check that major shareholder status is being pulled correctly 
- work on pre-processing 
    - with new financials, recheck audit committee requirement check 
    - audit committee corrections 
    - tenure standardization 
    - background parsing 
    - remove '/n' lines from all exec info 

- over next week: scoring/pre-processing 

'''

# instead of columns, print flags 
# column of audit committee flag type: 
    # 1 - assets over 2 trillion won but no audit committee listed 
    # 2 - failed audit committee requirement check
    # 3 - audit committee was updated from executive disclosure (ie individuals weren't identified as AC members though required to have one, so relies on business report to update)
# for the audit check: parse on corporation level. for those that were updated, mark from where 
# notes section: audit committee was not listed in executive disclosure and instead updated from financials. 
# for individual: individual was not listed under executive disclosure as an audit committee but detected in business report as so 
# audit committee was not listed for this corporation though required to 

'''
Documentation Format: 
- the produced data frames
- the full code, with explanations of the decision processes (i.e. why est/exact for salary, the flags for audit committee, the assets check from 2 years ago), which columns were dropped, etc. 

Proposed Next Steps:

Useful OpenDART links: 
- https://opendart.fss.or.kr/guide/detail.do?apiGrpCd=DE004&apiId=AE00041 Ownership change (most recent data)
'''

'\nDocumentation Format: \n- the produced data frames\n- the full code and explaining decision processes (ie why est/exact for salary, the flags for audit committee, assets check from 2 years ago), what columns were droppped, etc \n\nProposed Next Steps:\n\nUseful OpenDART links: \n- https://opendart.fss.or.kr/guide/detail.do?apiGrpCd=DE004&apiId=AE00041 Ownership change (most recent data)\n'

In [None]:
# FIX: 
# pass through the list of corp codes from kospi_company_info_df
# to get the urls with opendart reader, it requires the disclosure number (rcept_no) which is currently only available in the executive status data 
# scrape the summary and audit urls for financial data (assets) for the past 3 years, and the audit committee members

# produce two csv files: one for financial metrics and one for audit committee members 

In [None]:
# pulling from opendart sub docs requires the disclosure number (rcept_no) which is currently only available in the executive status data

In [None]:
# --- OPENDART LOAD IN: UPDATE with most recent executive status report --- 
# exec_df2 = pd.read_excel('C:\Program Files\Git\OPENDART\.vscode\BusinessReport(Executive status_2025_First Quarterly).xls')
# exec_df['Total'] = exec_df.groupby('Item code')['Item code'].transform('count')
# exec_df.to_csv('kospi_exec_df.csv', index=False)

  exec_df = pd.read_excel('C:\Program Files\Git\OPENDART\.vscode\BusinessReport(Executive status_2025_First Quarterly).xls')


SECTION 2: Categorize and Split into Individual/Grouped Data

The below functions produce: 
    1. updated exec_df 
        with additional columns reflecting audit committee member/auditor status 
    2. individual_df 
        dataframe of each individual executive's information (professional background info to be added)
    3. summary_df 
        grouped dataframe by company, summing across all category types

Constructing summary_df of grouped data by corp

SECTION 3: Checks on Grouped Company Data

OVERVIEW
1. Overlap: check that there are no overlapping cases where an executive is both an audit committee member AND auditor. 
    t/f check
2. Assets: For each company, parse its financial statements to check whether its total assets exceed the 2 trillion KRW threshold, which makes it subject to a mandatory audit committee.
    800 API requests, ~8 min to run assets call
    assets appended to summary_df
3. Governance Check 1: 
    For All Corporations:
    - outside directors make up >= 1/4 voting directors 
    For required audit committees: 
    - audit committee > 3
    - outside directors >= 2/3 audit committee 
4. Governance Check 2: 
    For each flagged case, go through a more rigorous check for false negatives and correct any excluded information.

RESOURCES 
Retrieve Total Assets from OPENDART
    OPENDART Financial Statement Documentation: https://engopendart.fss.or.kr/guide/detail.do?apiGrpCd=DE003&apiId=AE00033
    Dart Package Documentation: https://nbviewer.org/github/FinanceData/OpenDartReader/blob/master/docs/OpenDartReader_reference_manual.ipynb 


USAGE 
Update dart.finstate() bsns_year and reprt_code
- First Quarterly Report : 11013
- Semi-annual Report : 11012
- Third Quarterly Report : 11014
- Annual Report : 11011
Validate flagged governance cases 

SECTION 4: Fix Governance Check
Read in additional disclosure file to cross check false negatives that were flagged by the initial governance check 

Take in these columns to update outside director and audit committee member counts:
    'Category': 
        Registered director(Outside director, excluding members of the audit committee) -> excludes outside directors and audit committee members 
        Outside director (excluding members of the audit committee)
        Member of the audit committee or auditor 
        Auditor 
    'Headcount' 

To get audit committee: Member of the audit committee or auditor  - Auditor
To get outside directors in audit committee: Member of the audit committee or auditor - summary_df[outside directors] - outside director

Building individual_df