In [None]:
# import libraries and set parameters 

import os
import io
import re
import time
import zipfile 
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor 

import dart_fss
import OpenDartReader

API_key = '0d67945133e224c451452e071e0d8349969353e1' 
dart = OpenDartReader(API_key)
dart_fss.set_api_key(API_key)

current_year = 2025
bsns_year = '2024' # from most recent annual report
reprt_code = '11011'
asset_year = '2024' # set to a year before bsns year due to grace period for corps surpassing 2 trillion krw 
current_date = '2025.08.07'

# call most recent annual/semi-annual report as quarterly may exclude audit info due to corporate disclosure form preparation standards 

Error occurred during getting browser(s): random, but was suppressed with fallback.


In [2]:
# 2. Read in Files: if run before, read in the files 
all_corp_codes_df = pd.read_csv(r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw\listed_corp_codes.csv')
kospi_company_info_df = pd.read_csv(r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw\kospi_company_info.csv')
executive_status_data_df = pd.read_csv(r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw\executive_status_2024_11011.csv')
assets_df = pd.read_csv(r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw\assets_2024_11011.csv')
salary_type_df = pd.read_csv(r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw\salary_data_2024_11011.csv')
salary_total_df = pd.read_csv(r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw\salary_total_data_2024_11011.csv')
major_shareholder_df = pd.read_csv(r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw\major_shareholders_2024_11011.csv')

Take in Raw Data. Clean, standardize, and parse. Produce an individual_df and standardized_summary_df 

clean 
- handle cases where assets/salary/shareholder status not included (set as 0 or flag)
- executive status:
    - standardize tenure 
    - standardize career points (ie just point system for higher degree phd, masters, law? - figure out if possible to parse for department)
    - drop original and corp name input columns 


then group by exec_df to get:
- the summarized data ie for voting, audit commmittee, male/female, num outside directors, and total salary 

individual_df 
- assign salary
- check that salary numbers add up 

group_df 
- counts of all 
- governance check/flag 

# TODO: fully fix exec-df 
- standardize tenure 
- group data 
- make sure that audit commmittee membership is correct (governance check -> if same problems come up, extract audit committee membership from financial doc)
- standardize education 

# TODO: group data 
- sumarize the exec_df + get the counts 
- check that the total comp of summary_df = the salary_total comp vals 
- standardize tenure 

# At Large: 
- check that audit committee members are correct 

In [53]:
exec_df = executive_status_data_df.drop(columns=['corp_cls', 'birth_ym', 'fte_at', 'tenure_end_on', 'stlm_dt'], errors='ignore')

def is_audit_committee_member(responsibility, position):

    responsibility_cleaned = re.sub(r'\s', '', responsibility)
    
    return bool(re.search(r'감사위원회위원|감사위원|감사위원장', responsibility_cleaned)) 

def is_auditor_exclusive(responsibility, position): #isolated capture to not overlap with is_audit_committee_member
    if is_audit_committee_member(responsibility, position):
        return False
    
    responsibility_check = False
    if isinstance(responsibility, str):
        responsibility_cleaned = re.sub(r'\s', '', responsibility)

        responsibility_check = '감사' in responsibility_cleaned and not re.search(r'감사위원회위원|감사위원', responsibility_cleaned) 
    position_check = False 


    return responsibility_check or position_check

# apply updated audit membership 
exec_df['is_audit_committee_member'] = exec_df.apply(
    lambda row: is_audit_committee_member(row['chrg_job'], row['rgist_exctv_at']), axis=1
)

exec_df['is_auditor'] = exec_df.apply(
    lambda row: is_auditor_exclusive(row['chrg_job'], row['rgist_exctv_at']), axis=1
)

In [54]:
def assign_compensation(exec_df: pd.DataFrame, salary_type_df: pd.DataFrame) -> pd.DataFrame:
    """
    Assign salary from compensation_df to each executive in exec_df.
    Uses individual match first; falls back on average grouped position compensation if registered.
    Leaves '미등기' status executives empty unless exact match is found.
    """
    exec_df['salary'] = None
    exec_df['salary_source'] = None
    exec_df['salary_type'] = None

    for idx, row in exec_df.iterrows():
        corp_code = row['corp_code']
        name = row['nm']
        status = row.get('rgist_exctv_at')
        is_auditor = row.get('is_auditor', False)
        is_committee = row.get('is_audit_committee_member', False)

        # 1. Try to match by name
        match = salary_type_df[
            (salary_type_df['corp_code'] == corp_code) & 
            (salary_type_df['name'] == name)
        ]

        if not match.empty:
            row_data = match.iloc[0]

        else:
            if status == '미등기':
                label = '미등기임원' 
            # 2. Estimate fallback: build label
            elif is_auditor:
                label = '감사'
            elif status == '사외이사':
                label = '감사위원회 위원' if is_committee else '사외이사(감사위원회 위원 제외)'
            else:
                label = '등기이사(사외이사, 감사위원회 위원 제외)'

            group_match = salary_type_df[
                (salary_type_df['corp_code'] == corp_code) & 
                (salary_type_df['name'].isna()) & 
                (salary_type_df['position'] == label)
            ]

            row_data = group_match.iloc[0] if not group_match.empty else pd.Series(dtype='object')

        # 3. Assign if valid
        if not row_data.empty:
            exec_df.at[idx, 'salary'] = row_data.get('compensation')
            exec_df.at[idx, 'salary_source'] = row_data.get('salary_source')
            exec_df.at[idx, 'salary_type'] = row_data.get('salary_type')

    return exec_df

exec_df = assign_compensation(exec_df, salary_type_df)

exec_df = pd.merge(
    exec_df, 
    major_shareholder_df[['corp_code', 'nm', 'trmend_posesn_stock_qota_rt']],
    on=['corp_code', 'nm'],
    how='left'
)

In [56]:
import pandas as pd
import re
from datetime import datetime

def convert_tenure_to_months(tenure_str, current_date=None):
    """
    Converts a tenure string from various formats into a total number of months.
    
    Args:
        tenure_str (str): The string representing tenure (e.g., '2년 6개월', '2022.05', '11.3년').
        current_date (datetime): The reference date to calculate tenure from a start date.
                                 Defaults to today's date if not provided.

    Returns:
        float: The total tenure in months, or pd.NA if the format is not recognized.
    """
    if pd.isna(tenure_str) or not isinstance(tenure_str, str) or not tenure_str.strip():
        return pd.NA
        
    tenure_str = tenure_str.strip()
    
    # 1. Pattern: All date formats, including those with extra text
    date_match = re.search(r'(\d{2,4}[년\.\s]\d{1,2}[월]?(?:[년\.\s]\d{1,2}[일])?|\d{1,2}\.\d{1,2}\.\d{1,2})', tenure_str)
    
    if date_match:
        date_str = date_match.group(1).replace(' ', '').replace('년', '.').replace('월', '').replace('일', '')
        date_obj = pd.NaT
        
        # Now try to parse the cleaned date string with multiple formats
        date_formats = ['%Y.%m.%d', '%Y.%m', '%d.%m.%y', '%y.%m.%d', '%y.%m']
        for fmt in date_formats:
            try:
                date_obj = pd.to_datetime(date_str, format=fmt, errors='raise')
                break # Exit the loop if parsing is successful
            except (ValueError, TypeError):
                continue
    
        if pd.notna(date_obj):
            if current_date is None:
                current_date = datetime.now()
            
            total_months = (current_date.year - date_obj.year) * 12 + (current_date.month - date_obj.month)
            return float(max(0, total_months))
    
    # 2. Pattern: Decimal years (e.g., '11.3년', '22.5')
    match_deci = re.search(r'^(\d+(?:\.\d+)?)(?:년)?$', tenure_str)
    if match_deci:
        decimal_years = float(match_deci.group(1))
        return decimal_years * 12
    
    # 3. Pattern: Years and Months (e.g., "3년 6개월", "4년4개월")
    match_ym = re.search(r'(\d+)\s*년(?:[^\d]+)?\s*(\d+)\s*개월', tenure_str)
    if match_ym:
        years = int(match_ym.group(1))
        months = int(match_ym.group(2))
        return float(years * 12 + months)
        
    # 4. Pattern: Only Years (e.g., "3년")
    match_y = re.search(r'^(\d+)\s*년$', tenure_str)
    if match_y:
        years = int(match_y.group(1))
        return float(years * 12)
        
    # 5. Pattern: Only Months (e.g., "18개월")
    match_m = re.search(r'^(\d+)\s*개월$', tenure_str)
    if match_m:
        months = int(match_m.group(1))
        return float(months)
    
    return pd.NA

# Set a specific reference date for consistent tenure calculation
reference_date = datetime(2025, 8, 5)

# Apply the custom function to the column
exec_df['hffc_pd'] = exec_df['hffc_pd'].apply(
    lambda x: convert_tenure_to_months(x, current_date=reference_date)
)

In [58]:
exec_df.head(2)

Unnamed: 0,rcept_no,corp_code,corp_name,nm,sexdstn,ofcps,rgist_exctv_at,chrg_job,main_career,mxmm_shrholdr_relate,hffc_pd,is_audit_committee_member,is_auditor,salary,salary_source,salary_type,trmend_posesn_stock_qota_rt
0,20250313001154,173944,우진,이재원,남,이사회 의장,사내이사,경영전반 \n및 해외사업,아이닥 아이앤씨 대표이사\n現 ㈜우진 이사 (이사회 의장)\n現 TAKA INTER...,최대주주 본인,108.0,False,False,1297181000,개인별보수,exact,15.82
1,20250313001154,173944,우진,이재상,남,대표이사 사장,사내이사,대표이사,연세대학교 경영학 석사 \n㈜우진 상무이사\n現 ㈜우진 대표이사 사장,특수관계인,240.0,False,False,1337155000,개인별보수,exact,9.58


In [96]:
def extract_summary(group):
    voting_directors_group = group[~group['rgist_exctv_at'].isin(['미등기', '감사'])]

    female_voting = (voting_directors_group['sexdstn'] == '여').sum()
    male_voting = (voting_directors_group['sexdstn'] == '남').sum()
    
    return pd.Series({
        
        # total counts for all individuals 
        'Audit Committee': group['is_audit_committee_member'].sum(),
        'Audit Committee ODs': ((group['is_audit_committee_member'] == True) & (group['rgist_exctv_at'] == '사외이사')).sum(),
        'Inside Directors': group['rgist_exctv_at'].isin(['사내이사', '대표집행임원']).sum(),
        'Outside Directors': (group['rgist_exctv_at'] == '사외이사').sum(),
        'Other Non-Exec Directors': (group['rgist_exctv_at'] == '기타비상무이사').sum(),
        'Auditors': group['is_auditor'].sum(),
        
        # counts for Voting Directors subset
        'Female Voting': female_voting,
        'Male Voting': male_voting,
        'Voting Directors': female_voting + male_voting,

        # count for Non-Registered (separate)
        'Non Registered': (group['rgist_exctv_at'] == '미등기').sum()
    })


# merge with kospi_codes to append corp_code, required for financial statment search in next section 
# Item code and stock_code are interchangeable 
summary_df = exec_df.groupby(['corp_code', 'corp_name']).apply(extract_summary).reset_index()
summary_df = pd.merge(summary_df, assets_df[['Corp Code', 'Total Assets (2024)', 'Total Assets (2023)', 'Total Assets (2022)']], left_on='corp_code', right_on='Corp Code', how='left')
summary_df.drop(columns=['Corp Code'], inplace=True)

rcept_no = exec_df.groupby('corp_code')['rcept_no'].max().reset_index()
rcept_no.rename(columns={'rcept_no': 'rcept_no'}, inplace=True)
summary_df = pd.merge(summary_df, rcept_no, on='corp_code', how='left')

  summary_df = exec_df.groupby(['corp_code', 'corp_name']).apply(extract_summary).reset_index()


# CHECK on Audit Committee

In [98]:
def check_governance_compliance(df):
    """
    Evaluates governance compliance based on board and audit committee rules.
    Returns:
        pd.DataFrame with flagged issues for non-compliant companies.
    """
    flagged = []

    for index, row in df.iterrows():
        corp_code = row['corp_code']
        total_assets = row['Total Assets (2022)']
        num_audit_committee = row['Audit Committee']
        num_outside_directors = row['Outside Directors']
        num_voting_directors = row['Voting Directors']
        num_outside_committee = row['Audit Committee ODs']

        failures = []

        # Check audit committee compliances
        if pd.notna(total_assets) and total_assets > 2_000_000_000_000:
            if pd.isna(num_audit_committee) or num_audit_committee == 0:
                failures.append("No Audit Committee listed")
            elif num_audit_committee < 3:
                failures.append(f"Audit Committee has fewer than 3 members ({num_audit_committee})")

            if pd.notna(num_audit_committee) and num_audit_committee > 0:
                if pd.isna(num_outside_committee):
                    failures.append("Missing count of Audit Committee Outside Directors.")
                elif num_outside_committee < (2/3) * num_audit_committee:
                    failures.append(f"Audit Committee Outside Directors ({num_outside_committee}) < 2/3 of Audit Committee ({num_audit_committee})")
            if num_outside_directors < (1/4) * num_voting_directors:
                failures.append(f"Outside Directors ({num_outside_directors}) < 1/4 of Voting Directors ({num_voting_directors})")

        if failures:
            flagged.append({'corp_code': corp_code, 'Flagged Conditions': "; ".join(failures)})

    failed_df = pd.DataFrame(flagged)
    failed_df['corp_code'] = failed_df['corp_code']

    # Merge with original data
    return pd.merge(failed_df, df, on='corp_code', how='inner')

flagged = check_governance_compliance(summary_df) 

# ~ 75 flagged

# START from here: 
- to get the flagged cases' missing urls, pass in the corresponding rcpt_no from exec_df 
- make sure that the updates are accurate 
- print flags for those that persist 

- clean up exec and summary df's 
    - remove \n 
    - rename columns 

- clean notebook file 
- update documentation 

In [None]:
def missing_acm_urls(flagged_df):
    results = []

    for idx, row in flagged_df.iterrows():
        company = row['corp_name']
        corp = row['corp_code']
        rcp = row['rcept_no'] # update to append the disclosure from exec_df most recent 
        assets = row['Total Assets (2022)']

        try:
            subdocs = dart.sub_docs(str(rcp))  # rcept_no must be string
            match = subdocs[subdocs['title'].str.contains("감사제도에 관한 사항")]

            if not match.empty:
                url = match.iloc[0]['url']
            else:
                url = None

        except Exception as e:
            print(f"Failed to fetch for {corp} ({rcp}): {e}")
            url = None

        results.append({
            'corp_code': str(corp),
            'corp_name': company,
            'rcept_no': rcp,
            'url': url,
            'Total Assets (2022)': assets
        })

        time.sleep(0.7)

    return pd.DataFrame(results)

# 3. Retrieve target audit committee document URLs for flagged cases
audit_targets_df = missing_acm_urls(flagged).drop_duplicates(subset=['corp_code'])
audit_targets_df

Unnamed: 0,corp_code,corp_name,rcept_no,url,Total Assets (2022)
0,101220,KG케미칼,20250320001653,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,7.076647e+12
1,102432,계룡건설산업,20250318001013,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,2.560029e+12
2,104856,삼성증권,20250306000628,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,5.384826e+13
3,106669,세아베스틸지주,20250318000465,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,3.818404e+12
4,108135,녹십자홀딩스,20250318001422,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,3.592061e+12
...,...,...,...,...,...
71,1363818,롯데리츠,20250312000767,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,2.313651e+12
72,1390344,HD현대중공업,20250318000897,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,1.628940e+13
73,1415892,제이알글로벌리츠,20250321001699,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,2.056286e+12
74,1437186,ESR켄달스퀘어리츠,20250218000559,http://dart.fss.or.kr/report/viewer.do?rcpNo=2...,2.343918e+12


In [182]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re

def parse_and_update_audit_members(audit_targets_df, exec_df, summary_df):
    """
    Parses financial documents for flagged corporations to update audit committee
    member information in both exec_df and summary_df, using a more robust
    and flexible parsing strategy.

    Args:
        audit_targets_df (pd.DataFrame): DataFrame of corporations flagged for
                                         audit committee updates.
        exec_df (pd.DataFrame): DataFrame containing executive information.
        summary_df (pd.DataFrame): DataFrame containing summary information.

    Returns:
        tuple: The updated exec_df and summary_df DataFrames.
    """
    # Initialize the is_audit_committee_member column if it doesn't exist
    if 'is_audit_committee_member' not in exec_df.columns:
        exec_df['is_audit_committee_member'] = False

    updated_count = 0
    summary_updates = {}
    new_members_to_add = []

    for idx, row in audit_targets_df.iterrows():
        corp_code = row['corp_code']
        company = row['corp_name']
        url = row['url']
        rcept_no = row['rcept_no']

        if pd.isna(url) or not isinstance(url, str):
            continue

        try:
            response = requests.get(url, timeout=20)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # --- METHOD 1: Find the section header by its text content ---
            members_found = []
            
            header_node = soup.find(string=re.compile('2\.\s*감사제도에 관한 사항'))
            
            if header_node:
                section_container = header_node.find_parent('p') or header_node.find_parent('div')
                
                if section_container:
                    # Look for a table in the section
                    table = section_container.find_next_sibling('table')
                    
                    if table:
                        # --- METHOD 1A: Parse a table-based structure ---
                        header_rows = table.find_all('tr', limit=2)
                        
                        headers = [
                            th.get_text(strip=True).replace('\xa0', '').replace('\n', '')
                            for tr in header_rows for th in tr.find_all(['th', 'td'])
                        ]
                        
                        name_idx = next((i for i, h in enumerate(headers) if '성명' in h), None)
                        outside_idx = next((i for i, h in enumerate(headers) if '사외이사' in h), None)
                        
                        if name_idx is not None and outside_idx is not None:
                            data_rows = table.find_all('tbody')[0].find_all('tr') if table.find('tbody') else table.find_all('tr')[len(header_rows):]
                            for tr in data_rows:
                                tds = tr.find_all(['td', 'th'])
                                if len(tds) > max(name_idx, outside_idx):
                                    name = tds[name_idx].get_text(strip=True)
                                    is_outside = tds[outside_idx].get_text(strip=True)
                                    if name and name != '-' and is_outside and is_outside != '-':
                                        is_outside_flag = '예' in is_outside or 'O' in is_outside
                                        members_found.append((name, is_outside_flag))

                    # If no table or table parsing failed, look for div-based structure
                    if not members_found:
                        # --- METHOD 1B: Parse a div/span-based structure ---
                        container_text = section_container.get_text()
                        member_matches = re.findall(r'([\w가-힣]+)\s*감사위원회\s*위원', container_text)
                        
                        for name in member_matches:
                            members_found.append((name, False)) # Can't determine 'is_outside'

            # --- Update DFs with found members from either method ---
            if members_found:
                for name, is_outside_flag in members_found:
                    # Correctly use 'nm' for the executive's name
                    existing_mask = (exec_df['corp_code'] == corp_code) & (exec_df['nm'] == name)
                    if not exec_df[existing_mask].empty:
                        exec_df.loc[existing_mask, 'is_audit_committee_member'] = True
                        updated_count += 1
                    else:
                        new_members_to_add.append({
                            'corp_code': corp_code,
                            'corp_name': company, # Use company from loop
                            'nm': name, # Corrected key for name
                            'chrg_job': '감사위원회 위원',
                            'is_audit_committee_member': True
                        })
                
                total_members = len(members_found)
                outside_members = sum(1 for _, flag in members_found if flag)
                
                summary_updates[corp_code] = {
                    'Audit Committee': total_members,
                    'Audit Committee ODs': outside_members # Corrected dictionary key
                }
                print(f"Updated {corp_code} - {company} with {total_members} members.")
            else:
                summary_updates[corp_code] = {'Audit Committee': 0, 'Audit Committee ODs': 0}
                print(f"{corp_code} - {company}: No valid audit committee info found.")

        except Exception as e:
            print(f"Exception occurred for {corp_code} - {company}: {e}")
        time.sleep(0.7)
    
    if new_members_to_add:
        exec_df = pd.concat([exec_df, pd.DataFrame(new_members_to_add)], ignore_index=True)
        summary_df['corp_code'] = summary_df['corp_code'].astype(str)

    for corp_code, update in summary_updates.items():
        summary_df.loc[summary_df['corp_code'] == corp_code, 'Audit Committee'] = update['Audit Committee']
        summary_df.loc[summary_df['corp_code'] == corp_code, 'Audit Committee ODs'] = update['Audit Committee ODs'] # Corrected key

    return exec_df, summary_df

  header_node = soup.find(string=re.compile('2\.\s*감사제도에 관한 사항'))


In [None]:
# Create test dataframes from your originals
audit_targets_test = audit_targets_df.copy()
exec_df_test = exec_df.copy()
summary_df_test = summary_df.copy()

# Run the function on the test data
# The function will return the modified test dataframes
exec_df_updated, summary_df_updated = parse_and_update_audit_members(
    audit_targets_test,
    exec_df_test,
    summary_df_test
)

# Now you can inspect exec_df_updated and summary_df_updated
# without your original data being changed.
print("Original exec_df is unchanged.")
print("Original summary_df is unchanged.")

In [191]:
# check_governance_compliance(summary_df_updated)

# print cases where both audit committee & auditors = 0 

# 08/08 TODO:
- fix auditor placeholder logic + flagging 
- fix exec updating 
- parse prof_background to separate edu / work experience
- rename columns 
- save all processed df's 
- clean notebooks 


# Documentation Format: 
- the produced data frames
- the full code and explaining decision processes (ie why est/exact for salary, the flags for audit committee, assets check from 2 years ago), what columns were droppped, etc 

# Useful OpenDART links: 
- https://opendart.fss.or.kr/guide/detail.do?apiGrpCd=DE004&apiId=AE00041 Ownership change (most recent data)


# AUDIT Governance Check: 
To Update: 

# Correct Flag. For the single audit cases, want to correct audit committee from 0 -> (1) to indicate that rather than audit committee, independent auditor stands 
    0	101220 - single audit - 1 
    1	108135 - single audit - 1 
    2	111810 - single audit, flags as one in audit committee - 1 
    3	113997 - single audit - 1 
    4	114792 - single audit - 1 
    5	116949 - single audit - 1 
    6	120182 - 2
    7	124197 - single audit under 가. 감사의 인적사항 
    8	125150 - single audit 
    10	138701 - single audit 
    11	140964 - single audit 
    12	v- single audit 
    13	149646 - single audit under 감사의 인적사항 
    14	152862 - single audit under 가. 감사의 인적사항 및 사외이사 여부
    16	162063 - single auditor under 나.  감사의 인적사항
    17	162993 - single auditor under 나.  감사의 인적사항
    18	176914 - single auditor under 나.  감사의 인적사항
    20	203315 - single auditor under 나. 감사의 인적사항
    21	225159 - single auditor 
    아니하며, 주주총회 결의에 의하여 선임된 감사 1명이 감사업무를 수행하고 있습니다. 
    24	377610 - single auditor under (1) 감사의 인적사항
    26	539274 - single auditor under ※ 감사의 인적사항
    27	545716 - 2 
    28	939331 - audit?/audit committee? under 가. 감사의 인적사항 
        kr kolmar 
    29	990165 - single auditor under 가. 감사위원회(감사) 설치여부, 구성방법 등
    31	1363818
    32	1415892 - single auditor under 감사의 인적사항 
    33	1437186 - single auditor under 나. 감사관련 사항
    34	1535150

# Incorrect Flag: 
    10	138701 - different formatting under 마. 감사위원 현황 

    15	159698 - scrapes 2/3 under (1) 감사위원 현황 
    https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20250327000192&dcmNo=10464392&eleId=68&offset=1754795&length=223978&dtd=dart3.xsd&detailYn=Y

    19	195229 - scrapes 2 when there's a single auditor 

    22	255619 - different formatting under 가. 감사위원회 위원 현황
    https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20250318001106&dcmNo=10423403&eleId=62&offset=1540190&length=171481&dtd=dart3.xsd&detailYn=Y

    23	261285 - scrapes 2/3 under 가. 감사위원 현황

    25	503668 - different formatting under 가. 감사위원회 위원의 인적사항 및 사외이사 여부

    30	1060744 - scrapes wrong table, should be 라. 감사위원 현황
    https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20250314001432&dcmNo=10410006&eleId=65&offset=2329678&length=198881&dtd=dart3.xsd&detailYn=Y

In [None]:
# --- 1. Indivualized Dataframe where each row represents a unique executive --- 

# from the original exec_df dataframe, identify members of the audit committee and auditors 

# because audit committee member can also be extracted from the corrected executive data: update such that it also clears afterwards 

def is_audit_committee_member(responsibility, position):

    responsibility_cleaned = re.sub(r'\s', '', responsibility)
    
    return bool(re.search(r'감사위원회위원|감사위원|감사위원장', responsibility_cleaned)) 

def is_auditor_exclusive(responsibility, position): #isolated capture to not overlap with is_audit_committee_member
    if is_audit_committee_member(responsibility, position):
        return False
    
    responsibility_check = False
    if isinstance(responsibility, str):
        responsibility_cleaned = re.sub(r'\s', '', responsibility)

        responsibility_check = '감사' in responsibility_cleaned and not re.search(r'감사위원회위원|감사위원', responsibility_cleaned)  
    position_check = False 

    return responsibility_check or position_check

# apply updated audit membership 
exec_df['is_audit_committee_member'] = exec_df.apply(
    lambda row: is_audit_committee_member(row['Responsibilities'], row['Registered Officer Status']), axis=1
)

exec_df['is_auditor'] = exec_df.apply(
    lambda row: is_auditor_exclusive(row['Responsibilities'], row['Registered Officer Status']), axis=1
)

# build the individual-level dataframe

In [24]:
def assign_compensation(exec_df: pd.DataFrame, salary_type_df: pd.DataFrame) -> pd.DataFrame:
    """
    Assign salary from compensation_df to each executive in exec_df.
    Uses individual match first; falls back on average grouped position compensation if registered.
    Leaves '미등기' status executives empty unless exact match is found.
    """
    exec_df['salary'] = None
    exec_df['salary_source'] = None
    exec_df['salary_type'] = None

    for idx, row in exec_df.iterrows():
        corp_code = row['corp_code']
        name = row['nm']
        status = row.get('rgist_exctv_at')
        is_auditor = row.get('is_auditor', False)
        is_committee = row.get('is_audit_committee_member', False)

        # 1. Try to match by name
        match = salary_type_df[
            (salary_type_df['corp_code'] == corp_code) & 
            (salary_type_df['name'] == name)
        ]

        if not match.empty:
            row_data = match.iloc[0]

        else:
            if status == '미등기':
                label = '미등기임원' 
            # 2. Estimate fallback: build label
            elif is_auditor:
                label = '감사'
            elif status == '사외이사':
                label = '감사위원회 위원' if is_committee else '사외이사(감사위원회 위원 제외)'
            else:
                label = '등기이사(사외이사, 감사위원회 위원 제외)'

            group_match = salary_type_df[
                (salary_type_df['corp_code'] == corp_code) & 
                (salary_type_df['name'].isna()) & 
                (salary_type_df['position'] == label)
            ]

            row_data = group_match.iloc[0] if not group_match.empty else pd.Series(dtype='object')

        # 3. Assign if valid
        if not row_data.empty:
            exec_df.at[idx, 'salary'] = row_data.get('salary')
            exec_df.at[idx, 'salary_source'] = row_data.get('salary_source')
            exec_df.at[idx, 'salary_type'] = row_data.get('salary_type')

    return exec_df

exec_df = assign_compensation(exec_df, salary_type_df)

In [35]:
exec_df = pd.merge(
    exec_df, 
    major_shareholder_df[['corp_code', 'nm', 'trmend_posesn_stock_qota_rt']],
    on=['corp_code', 'nm'],
    how='left'
)

In [None]:
def extract_summary(group):
    voting_directors_group = group[~group['Registered Officer Status'].isin(['미등기', '감사'])]

    female_voting = (voting_directors_group['Gender'] == '여').sum()
    male_voting = (voting_directors_group['Gender'] == '남').sum()
    
    return pd.Series({
        
        # total counts for all individuals 
        'Audit Committee': group['is_audit_committee_member'].sum(),
        'Audit Committee Outside Directors': ((group['is_audit_committee_member'] == True) & (group['Registered Officer Status'] == '사외이사')).sum(),
        'Inside Directors': group['Registered Officer Status'].isin(['사내이사', '대표집행임원']).sum(),
        'Outside Directors': (group['Registered Officer Status'] == '사외이사').sum(),
        'Other Non Exec Directors': (group['Registered Officer Status'] == '기타비상무이사').sum(),
        'Auditors': group['is_auditor'].sum(),
        
        # counts for Voting Directors subset
        'Female Voting': female_voting,
        'Male Voting': male_voting,
        'Voting Directors': female_voting + male_voting,

        # count for Non-Registered (separate)
        'Non Registered': (group['Registered Officer Status'] == '미등기').sum()
    })


# merge with kospi_codes to append corp_code, required for financial statment search in next section 
# Item code and stock_code are interchangeable 
summary_df = exec_df.groupby(['Company', 'Corp Code', 'Disclosure']).apply(extract_summary).reset_index()
summary_df

In [None]:
#TODO: add tenure parsing by yr 

In [None]:
# manual check for dart reports involving remuneration records 

dart.report('01263022', '임원전체보수유형', 2024, reprt_code='11011') #grouped 
dart.report('00126380', '개인별보수', 2024, reprt_code='00684802') #listed exec 
dart.report('00126380', '임원개인보수', 2024, reprt_code='00684802') #registered exec 

{'message': '정의되지 않은 오류가 발생하였습니다.', 'status': '900'}
{'message': '정의되지 않은 오류가 발생하였습니다.', 'status': '900'}


In [None]:
df.to_csv("compensation_df", index=False, encoding='utf-8-sig')

In [5]:
compensation_df = pd.read_csv('compensation_df')

In [None]:
# TODO: add in the case where a past exec member is no longer included in current exec_df but receives a salary 
# correct such that for unregistered, if not existing as exact number, it won't take on the estimate for registered 

In [6]:
compensation_df['corp_code'] = compensation_df['corp_code'].astype(str).str.zfill(8)

In [15]:
def assign_compensation(exec_df_updated: pd.DataFrame, compensation_df: pd.DataFrame) -> pd.DataFrame:
    """
    Assign salary from compensation_df to each executive in exec_df.
    Uses individual match first; falls back on average grouped position compensation if registered.
    Leaves '미등기' status executives empty unless exact match is found.
    """
    exec_df_updated = exec_df_updated.copy()
    exec_df_updated['Corp Code'] = exec_df_updated['Corp Code'].astype(str).str.zfill(8)

    exec_df_updated['Salary'] = None
    exec_df_updated['salary_source'] = None
    exec_df_updated['salary_type'] = None

    for idx, row in exec_df_updated.iterrows():
        corp_code = row['Corp Code']
        name = row['Name']
        status = row.get('Registered Officer Status', '')
        is_auditor = row.get('is_auditor', False)
        is_committee = row.get('is_audit_committee_member', False)

        # 1. Try to match by name
        match = compensation_df[
            (compensation_df['corp_code'] == corp_code) & 
            (compensation_df['name'] == name)
        ]

        if not match.empty:
            row_data = match.iloc[0]

        else:
            if status == '미등기':
                label = '미등기임원' 
            # 2. Estimate fallback: build label
            elif is_auditor:
                label = '감사'
            elif status == '사외이사':
                label = '감사위원회 위원' if is_committee else '사외이사(감사위원회 위원 제외)'
            else:
                label = '등기이사(사외이사, 감사위원회 위원 제외)'

            group_match = compensation_df[
                (compensation_df['corp_code'] == corp_code) & 
                (compensation_df['name'].isna()) & 
                (compensation_df['ofcps'] == label)
            ]

            row_data = group_match.iloc[0] if not group_match.empty else pd.Series(dtype='object')

        # 3. Assign if valid
        if not row_data.empty:
            exec_df_updated.at[idx, 'Salary'] = row_data.get('salary')
            exec_df_updated.at[idx, 'salary_source'] = row_data.get('salary_source')
            exec_df_updated.at[idx, 'salary_type'] = row_data.get('salary_type')

    return exec_df_updated
exec_df_updated = assign_compensation(exec_df_updated, compensation_df)

# execution time: 19 seconds 

In [17]:
exec_df_updated.to_csv('exec_df_updated')

In [18]:
individual_comps = compensation_df[
    (compensation_df['name'].notnull()) &
    (compensation_df['name'] != '') &
    (compensation_df['name'] != '-')
].copy()

# Build a composite key for matching
exec_keys = set(zip(exec_df_updated['Corp Code'], exec_df_updated['Name']))
comp_keys = set(zip(individual_comps['corp_code'], individual_comps['name']))

# Get the difference: people in comp_df but not exec_df
extra_comp_keys = comp_keys - exec_keys

# Filter those from comp_df
mask = individual_comps.apply(lambda row: (row['corp_code'], row['name']) in extra_comp_keys, axis=1)
extra_individuals = individual_comps[mask]

In [20]:
# manually check that the ind audit status has been properly updated 
# manually check that the compensation has been properly updated 
# manual check that compensation has been accurately assigned 
# unregistered members that are not on the audit commitee should = None 
# check that number of exact in exec_check + extra_check = comp_check 
# check outside dir & audit committee differentiations by the dart report calls:
    # dart.report('01263022', '임원전체보수유형', 2024, reprt_code='11011') #grouped 
    # dart.report('00126380', '개인별보수', 2024, reprt_code='11011') #listed exec 
    # dart.report('00126380', '임원개인보수', 2024, reprt_code='11011') #registered exec 

comp_check = compensation_df[compensation_df['corp_code'] == '00126380']
extra_check = extra_individuals[extra_individuals['corp_code'] == '00126380']
exec_check = exec_df_updated[exec_df_updated['Corp Code'] == '00126380']

In [None]:
dart.report('00126380', '임원전체보수유형', 2024, reprt_code='11011') #grouped 

Unnamed: 0,rcept_no,corp_cls,corp_code,corp_name,se,nmpr,pymnt_totamt,psn1_avrg_pymntamt,rm,stlm_dt
0,20250515001922,Y,126380,삼성전자,"등기이사(사외이사, 감사위원회 위원 제외)",-,-,-,-,2025-03-31
1,20250515001922,Y,126380,삼성전자,사외이사(감사위원회 위원 제외),-,-,-,-,2025-03-31
2,20250515001922,Y,126380,삼성전자,감사위원회 위원,-,-,-,-,2025-03-31
3,20250515001922,Y,126380,삼성전자,감사,-,-,-,-,2025-03-31


In [None]:
'''
경계현 8,036,000,000 # under extra 
이정배 6,950,000,000	
최시영 6,900,000,000
한종희 5,240,000,000
노태문 5,098,000,000	
박학규 3,346,000,000 # only additional from unregistered - under extra 
'''

In [None]:
dart.report('00126380', '개인별보수', 2024, reprt_code='11011') #indivudal registered directors + auditors

# 최시영 not appended 

Unnamed: 0,rcept_no,corp_cls,corp_code,corp_name,nm,ofcps,mendng_totamt,mendng_totamt_ct_incls_mendng,stlm_dt
0,20250311001085,Y,126380,삼성전자,경계현,고문,8036000000,-,2024-12-31
1,20250311001085,Y,126380,삼성전자,이정배,상담역,6950000000,-,2024-12-31
2,20250311001085,Y,126380,삼성전자,최시영,상담역,6900000000,-,2024-12-31
3,20250311001085,Y,126380,삼성전자,한종희,대표이사,5240000000,-,2024-12-31
4,20250311001085,Y,126380,삼성전자,노태문,이사,5098000000,-,2024-12-31


In [None]:
dart.report('00126380', '미등기임원보수', 2024, reprt_code='11011') #unregistered exec 

Unnamed: 0,rcept_no,corp_cls,corp_code,corp_name,nm,ofcps,mendng_totamt,mendng_totamt_ct_incls_mendng,stlm_dt
0,20250311001085,Y,126380,삼성전자,박학규,사장,3346000000,-,2024-12-31
1,20250311001085,Y,126380,삼성전자,노태문,이사,5098000000,-,2024-12-31
2,20250311001085,Y,126380,삼성전자,한종희,대표이사,5240000000,-,2024-12-31
3,20250311001085,Y,126380,삼성전자,이정배,상담역,6950000000,-,2024-12-31
4,20250311001085,Y,126380,삼성전자,경계현,고문,8036000000,-,2024-12-31


In [None]:
# TOTAL compensation: 임원전체보수 
# EXEC TYPE: 임원전체보수유형 : 
    # 등기이사(사외이사, 감사위원회 위원 제외): (registered director excluding outside directors and audit committee members)
    # 사외이사(감사위원회 위원 제외): outside (non audit committee)
    # 감사위원회 위원: audit committee (감사위원회 위원)
    # 감사: auditor 
# from opendart, we get exact numbers of registered inside/executive directors from: '임원개인보수' current 
# with the addition of '개인별보수' which includes ONEDs making over 5 million 
# '임원전체보수유형' which provides registered directors (non outside/ac), outside dir (excluding ac), ac, auditor 

# consider: having individual nums where possible OR all averaged 

In [None]:
# Appending shareholder status (relation/holding status)

# OPENDART link: https://opendart.fss.or.kr/guide/detail.do?apiGrpCd=DS002&apiId=2019007 
# API url: https://opendart.fss.or.kr/api/hyslrSttus.json
# nm, relate, stock_knd, trmend_posesn_stock_co	, trmend_posesn_stock_qota_rt	

In [161]:
exec_df_updated.head(1)

Unnamed: 0,Disclosure,Corp Code,Company,Name,Gender,Position,Registered Officer Status,Responsibilities,Professional Background,Shareholder Relation,Period of employment,industry_code,is_audit_committee_member,is_auditor,Salary,salary_source,salary_type
0,20250320001609,684802,에이플러스에셋,곽근호,남,총괄 대표이사\n 회장,사내이사,본인,"영남대학교 공업화학과 졸업영남대학교 명예 경영학 박사前) 삼성생명 지점장, 상무現)...",본인,17년\n 5개월,66202,False,False,985000000,개인별보수,exact


In [163]:
API_key = 'c59d8af8c96ea97002c35d4c160a8c8c5c5dbd3c'

In [176]:
shareholder_df_renamed = shareholder_df.rename(columns={
    'corp_code': 'Corp Code',
    'nm': 'Name',
    'trmend_posesn_stock_co': 'Shares Owned',
    'trmend_posesn_stock_qota_rt': 'Shares Ratio'
})

# Merge by corp_code and name
exec_df_updated = exec_df_updated.merge(
    shareholder_df_renamed[['Corp Code', 'Name', 'Shares Owned', 'Shares Ratio']],
    on=['Corp Code', 'Name'],
    how='left'
)

In [181]:
exec_df_updated.to_csv('exec_salary_shareholder', index = False)

In [1]:
exec_df_updated = pd.read_csv('exec_salary_shareholder')

NameError: name 'pd' is not defined

In [None]:
# 7/30 TODO: check that consolidated salaries/compensation are accurate 
# check that audit update is accurate (including for ind_df -> exec_df_updated)
# see if possible to standardize tenure by years employed 
# see if it's possible to map industry code to industry name, otherwise run k-means on industry code 
# update documentation (comments in notebook and seperate doc detailing transformations, dropped columns, sources, why API vs dart fss or opendart reader)

In [None]:
dart.report('01263022', '임원전체보수', 2024, reprt_code='11011')
dart.report('01263022', '임원전체보수유형', 2024, reprt_code='11011')

Unnamed: 0,rcept_no,corp_cls,corp_code,corp_name,nmpr,jan_avrg_mendng_am,mendng_totamt,rm,stlm_dt
0,20250318000733,Y,1263022,BGF리테일,7,425000000,2972000000,-,2024-12-31


In [None]:
# TOTAL compensation: 임원전체보수 
# EXEC TYPE: 임원전체보수유형 : 
    # 등기이사(사외이사, 감사위원회 위원 제외): (registered director excluding outside directors and audit committee members)
    # 사외이사(감사위원회 위원 제외): outside (non audit committee)
    # 감사위원회 위원: audit committee (감사위원회 위원)
    # 감사: auditor 
# from opendart, we get exact numbers of registered inside/executive directors from: '임원개인보수' current 
# with the addition of '개인별보수' which includes ONEDs making over 5 million 
# '임원전체보수유형' which provides registered directors (non outside/ac), outside dir (excluding ac), ac, auditor 

# consider: having individual nums where possible OR all averaged 

In [None]:
""""
Korean	      English
사모자금사용	Use of Private Fundraising Proceeds
공모자금사용	Use of Public Offering Proceeds
주식총수	Total Shares Outstanding

사외이사	Outside Directors https://engopendart.fss.or.kr/guide/detail.do?apiGrpCd=DE002&apiId=AE00027 
    - already has information for num directors, outside directors 

최대주주	Major Shareholder https://engopendart.fss.or.kr/guide/detail.do?apiGrpCd=DE002&apiId=AE00008 
- 최대주주변동	Change in Major Shareholder 
소액주주	Minority Shareholders https://engopendart.fss.or.kr/guide/detail.do?apiGrpCd=DE002&apiId=AE00010  

Compensation Types: 
    임원전체보수	Total Executive Compensation 
        임원전체보수유형	Types of Total Executive Compensation
    개인별보수	Individual Compensation Disclosure 
        임원개인보수	Individual Executive Compensation 


"""


In [None]:
# if this doesn't include the break down by executive position like https://engopendart.fss.or.kr/disclosureinfo/fnltt/singl/main.do does 

# cross check whether its concatonated under the other individual api call 

# if enough, merge on corp code 
# if not enough, scrape from url directly for grouped data 

# figure out how to extract industry name from code and group data from there 

# from the url link: pull the following info
# 1. total remuneration amount (with unit appended)
# 2. average per executive (general)
# 3. average per registered directors 
# 4. average per outside non audit committee directors 
# 5. average per audit committee members 

In [None]:
# From individual_df create a separate one of the registered executives 

# Append compensation from average remittance, stocks, relationship to largest shareholder  

In [None]:
# Try OpenDart Reader function: 
# 2. 사업보고서:  '임원개인보수', '임원전체보수', '개인별보수',

GOAL: Clean up all comments and scrap irrelevancy. Fix the building notebook audit update and governance check. Then correct the logic behind the remuneration data. Update and append any other relevant info.

If necessary, see if scraping directly will yield better results. Otherwise don't. See if opendart reader has a function to call in the data instead. 