In [2]:
# import libraries and set parameters 

import os
import io
import re
import time
import zipfile 
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor 

import dart_fss
import OpenDartReader

API_key = '0d67945133e224c451452e071e0d8349969353e1' 
dart = OpenDartReader(API_key)
dart_fss.set_api_key(API_key)

current_year = 2025
bsns_year = '2024' # from most recent annual report
reprt_code = '11013'
asset_year = '2024' # set to a year before bsns year due to grace period for corps surpassing 2 trillion krw 

# call most recent annual/semi-annual report as quarterly may exclude audit info due to corporate disclosure form preparation standards 

Error occurred during getting browser(s): random, but was suppressed with fallback.


In [None]:
def parse_and_update_audit_members(audit_targets_df, exec_df, summary_df):
    updated_count = 0
    summary_updates = {}

    for idx, row in audit_targets_df.iterrows():
        corp_code = row['corp_code']
        company = row['company']
        url = row['url']
        rcept_no = row['rcept_no']

        if pd.isna(url) or not isinstance(url, str):
            continue

        try:
            response = requests.get(url, timeout=20)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            anchor = soup.find('a', attrs={'name': 'toc3'}, string='2. 감사제도에 관한 사항')
            anchor_p = anchor.find_parent('p') if anchor else None

            table = None
            candidate = anchor_p.find_next_sibling() if anchor_p else None

            while candidate:
                if candidate.name == 'table':
                    header_rows = []
                    for tr in candidate.find_all('tr'):
                        ths = tr.find_all('th')
                        if len(ths) == 1 and ths[0].has_attr('colspan'):
                            continue
                        header_rows.append(tr)
                        if len(header_rows) >= 2:
                            break

                    if len(header_rows) < 2:
                        candidate = candidate.find_next_sibling()
                        continue

                    headers = []
                    for tr in header_rows:
                        headers.extend([
                            th.get_text(strip=True).replace('\xa0', '').replace('\n', '')
                            for th in tr.find_all(['th', 'td'])
                        ])

                    name_idx = next((i for i, h in enumerate(headers) if '성명' in h), None)
                    outside_idx = next((i for i, h in enumerate(headers) if '사외이사' in h), None)

                    if name_idx is not None and outside_idx is not None:
                        table = candidate
                        break

                candidate = candidate.find_next_sibling() # iterate to the next table

            if table:
                members = []
                data_rows = table.find_all('tbody')[0].find_all('tr') if table.find('tbody') else table.find_all('tr')[len(header_rows):]

                for tr in data_rows:
                    tds = tr.find_all(['td', 'th'])
                    if len(tds) <= max(name_idx, outside_idx):
                        continue

                    name = tds[name_idx].get_text(strip=True)
                    is_outside = tds[outside_idx].get_text(strip=True)

                    if not name or name == '-' or not is_outside or is_outside == '-':
                        continue

                    is_outside_flag = '예' in is_outside or 'O' in is_outside
                    members.append((name, is_outside_flag))

                    mask = (
                        (exec_df['Corp Code'] == corp_code) &
                        (exec_df['Name'] == name)
                    )
                    if not exec_df.loc[mask].empty:
                        exec_df.loc[mask, 'is_audit_committee_member'] = True
                        updated_count += 1

                total_members = len(members)
                outside_members = sum(1 for _, flag in members if flag)

                if total_members == 1 and outside_members == 0:
                    total_members = 0
                    outside_members = 0

                if total_members > 0:
                    summary_updates[corp_code] = {
                        'Audit Committee': total_members,
                        'Audit Committee Outside Directors': outside_members
                    }
                    print(f"Updated {corp_code} - {company} - {rcept_no} - {summary_updates[corp_code]}")
                else:
                    summary_updates[corp_code] = {
                        'Audit Committee': 0,
                        'Audit Committee Outside Directors': 0
                    }
                    print(f"{corp_code} - {company} - {rcept_no}: Valid table found but no valid members.")

            else:
                print(f"{corp_code} - {company} - {rcept_no}: No valid audit committee table found.")

        except Exception as e:
            print(f"Exception occurred for {corp_code} - {company}: {e}")

        time.sleep(1.5)

    for corp_code, update in summary_updates.items():
        summary_df.loc[summary_df['Corp Code'] == corp_code, 'Audit Committee'] = update['Audit Committee']
        summary_df.loc[summary_df['Corp Code'] == corp_code, 'Audit Committee Outside Directors'] = update['Audit Committee Outside Directors']

    return exec_df, summary_df

exec_df_updated, summary_df_updated = parse_and_update_audit_members(audit_targets_df, exec_df, summary_df)

In [3]:
exec_df_updated = pd.read_csv('exec_df_updated')
exec_df_updated['Corp Code'] = exec_df_updated['Corp Code'].astype(str).str.zfill(8)
kospi_codes = pd.read_csv('kospi_codes.csv')
kospi_codes['corp_code'] = kospi_codes['corp_code'].astype(str).str.zfill(8)

In [None]:
# --- 1. Indivualized Dataframe where each row represents a unique executive --- 

# from the original exec_df dataframe, identify members of the audit committee and auditors 

# because audit committee member can also be extracted from the corrected executive data: update such that it also clears afterwards 

def is_audit_committee_member(responsibility, position):

    responsibility_cleaned = re.sub(r'\s', '', responsibility)
    
    return bool(re.search(r'감사위원회위원|감사위원|감사위원장', responsibility_cleaned)) 

def is_auditor_exclusive(responsibility, position): #isolated capture to not overlap with is_audit_committee_member
    if is_audit_committee_member(responsibility, position):
        return False
    
    responsibility_check = False
    if isinstance(responsibility, str):
        responsibility_cleaned = re.sub(r'\s', '', responsibility)

        responsibility_check = '감사' in responsibility_cleaned and not re.search(r'감사위원회위원|감사위원', responsibility_cleaned) 
    position_check = False 


    return responsibility_check or position_check

# apply updated audit membership 
exec_df['is_audit_committee_member'] = exec_df.apply(
    lambda row: is_audit_committee_member(row['Responsibilities'], row['Registered Officer Status']), axis=1
)

exec_df['is_auditor'] = exec_df.apply(
    lambda row: is_auditor_exclusive(row['Responsibilities'], row['Registered Officer Status']), axis=1
)

# build the individual-level dataframe

In [None]:

def extract_summary(group):
    voting_directors_group = group[~group['Registered Officer Status'].isin(['미등기', '감사'])]

    female_voting = (voting_directors_group['Gender'] == '여').sum()
    male_voting = (voting_directors_group['Gender'] == '남').sum()
    
    return pd.Series({
        
        # total counts for all individuals 
        'Audit Committee': group['is_audit_committee_member'].sum(),
        'Audit Committee Outside Directors': ((group['is_audit_committee_member'] == True) & (group['Registered Officer Status'] == '사외이사')).sum(),
        'Inside Directors': group['Registered Officer Status'].isin(['사내이사', '대표집행임원']).sum(),
        'Outside Directors': (group['Registered Officer Status'] == '사외이사').sum(),
        'Other Non Exec Directors': (group['Registered Officer Status'] == '기타비상무이사').sum(),
        'Auditors': group['is_auditor'].sum(),
        
        # counts for Voting Directors subset
        'Female Voting': female_voting,
        'Male Voting': male_voting,
        'Voting Directors': female_voting + male_voting,

        # count for Non-Registered (separate)
        'Non Registered': (group['Registered Officer Status'] == '미등기').sum()
    })


# merge with kospi_codes to append corp_code, required for financial statment search in next section 
# Item code and stock_code are interchangeable 
summary_df = exec_df.groupby(['Company', 'Corp Code', 'Disclosure']).apply(extract_summary).reset_index()
summary_df

In [None]:
exec_df_updated

# to do: 
    # fix regex scraping: 

Unnamed: 0.1,Unnamed: 0,Disclosure,Corp Code,Company,Name,Gender,Position,Registered Officer Status,Responsibilities,Professional Background,Shareholder Relation,Period of employment,industry_code,is_audit_committee_member,is_auditor,Salary,salary_source,salary_type
0,0,20250320001609,00684802,에이플러스에셋,곽근호,남,총괄 대표이사\n 회장,사내이사,본인,"영남대학교 공업화학과 졸업영남대학교 명예 경영학 박사前) 삼성생명 지점장, 상무現)...",본인,17년\n 5개월,66202,False,False,985000000,개인별보수,exact
1,1,20250320001609,00684802,에이플러스에셋,황승목,남,대표이사,사내이사,대표이사,"서강대학교 철학과 졸업前) 삼성화재 대구사업부장, 상무現) \n(주)에이플러스에셋 ...",발행회사 임원,9년\n 1개월,66202,False,False,267000000,임원전체보수유형,est
2,2,20250320001609,00684802,에이플러스에셋,안영욱,남,기타비상무이사,기타비상무이사,경영자문,서울대학교 경영학과 졸업現) 스카이레이크 인베스트먼트 상무\n 現) \n(주)에이플...,발행회사 임원,4년\n 9개월,66202,False,False,267000000,임원전체보수유형,est
3,3,20250320001609,00684802,에이플러스에셋,김택군,남,기타비상무이사,기타비상무이사,경영자문,연세대학교 경영학과 졸업現) 스카이레이크 인베스트먼트 상무現) \n(주)에이플러스에...,발행회사 임원,4년\n 9개월,66202,False,False,267000000,임원전체보수유형,est
4,4,20250320001609,00684802,에이플러스에셋,서동진,남,사외이사,사외이사,감사위원,경북대학교 법학과 졸업前) 아람의료재단 이사장現) \n(주)에이플러스에셋 사외이사,발행회사 임원,4년\n 9개월,66202,True,False,18000000,임원전체보수유형,est
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15095,15095,20250318000822,00372873,KTis,이병무,남,기타비상무이사,기타비상무이사,평가 및 보상위원회\n위원,고려대학교 정치외교학과 졸업\nDuke University MBA\n前 KT 전략지...,최대주주(법인)의 임원,2024.03.28~\n2026년 정기주주총회일,63991,False,False,241000000,임원전체보수유형,est
15096,15096,20250318000822,00372873,KTis,강현구,남,기타비상무이사,기타비상무이사,평가 및 보상위원회\n위원,국민대학교 회계학과 졸업\n아주대학교 회계학과 석사\n前 KT 그룹경영실 그룹경영1...,최대주주(법인)의 임원,2024.03.28~\n2026년 정기주주총회일,63991,False,False,241000000,임원전체보수유형,est
15097,15097,20250318000822,00372873,KTis,김종만,남,전무,미등기,CV사업본부장,동국대학교 독어독문학과 졸업\n前 KT 강원고객본부 사업지원부장('18.11~'20...,계열회사 임원,2023.12.01~,63991,False,False,167000000,미등기임원,est
15098,15098,20250318000822,00372873,KTis,정영훈,남,상무,미등기,경영기획총괄,연세대학교 경제학과 졸업\n연세대학교 정보대학원 석사\n前 KT 그룹경영실 그룹경영...,계열회사 임원,2024.12.3.~,63991,False,False,167000000,미등기임원,est


In [None]:
education_pattern = re.compile(
    r"""
    (?P<university>
        (?:[가-힣A-Za-z\s.,&'()\-\[\]]*?)
        (?:
            [A-Z][a-z]+(?:\s+of\s+[A-Z][a-z]+)+  # University of X
            |
            [A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:University|College|School|Institute)
            |
            [가-힣]+대학교
            |
            [가-힣]+대학
        )
        (?:\s+[가-힣A-Za-z]+(?:학교|대학|스쿨|School|College))?  # trailing subunits
    )
    """,
    re.IGNORECASE | re.VERBOSE
)

degree_finder = re.compile(
    r"""
    (?P<full_degree>
        명예\s*(?P<hon_field>[가-힣A-Za-z\s]+?)\s*박사
        |
        Ph\.?D\.?|M\.?D\.?|MBA|B\.S\.?|B\.A\.?
        |
        박사|석사|학사
        |
        졸업
    )
    """,
    re.IGNORECASE | re.VERBOSE
)

standardize_degree = {
    '박사': 'PhD', '석사': 'Master', '학사': 'Bachelor',
    'Ph.D.': 'PhD', 'PhD': 'PhD', 'Ph.D': 'PhD',
    'MBA': 'MBA', 'MD': 'MD', 'BS': 'Bachelor', 'BA': 'Bachelor',
    'B.S.': 'Bachelor', 'B.A.': 'Bachelor',
    '졸업': 'Graduated'
}


def parse_professional_background(text: str):
    education_results = []
    work_experience_parts = []
    if not isinstance(text, str):
        return (education_results, "")

    segments = re.split(r'\*|\s*前\)?|\s*現\)?|\n', text)

    for seg in segments:
        seg = seg.strip()
        if not seg:
            continue

        unis = list(education_pattern.finditer(seg))
        degrees = list(degree_finder.finditer(seg))
        matches = sorted([(m, 'uni') for m in unis] + [(m, 'deg') for m in degrees], key=lambda x: x[0].start())

        if not matches:
            work_experience_parts.append(seg)
            continue

        last_edu_entity_end = max(m.end() for m, _ in matches)
        work_text = seg[last_edu_entity_end:].strip(' ,()')
        if work_text:
            work_experience_parts.append(work_text)

        processed_indices = set()
        i = 0
        while i < len(matches):
            if i in processed_indices:
                i += 1
                continue

            current_match, current_type = matches[i]

            # Case 1: university → university (college) → degree
            if current_type == 'uni' and (i + 2) < len(matches):
                if matches[i + 1][1] == 'uni' and matches[i + 2][1] == 'deg':
                    uni = matches[i][0].group("university").strip()
                    dept = matches[i + 1][0].group("university").strip()
                    raw_degree = matches[i + 2][0].group("full_degree").replace('.', '').strip()
                    degree = standardize_degree.get(raw_degree, raw_degree)
                    education_results.append({"university": uni, "department": dept, "degree": degree})
                    processed_indices.update([i, i + 1, i + 2])
                    i += 3
                    continue

            # Case 2: university → degree
            if current_type == 'uni' and (i + 1) < len(matches) and matches[i + 1][1] == 'deg':
                uni = current_match.group("university").strip()
                deg_match = matches[i + 1][0]
                raw_degree = deg_match.group("full_degree").replace('.', '').strip()
                degree = standardize_degree.get(raw_degree, raw_degree)
                dept = deg_match.group("hon_field").strip() if deg_match.group("hon_field") else 'unlisted'
                education_results.append({"university": uni, "department": dept, "degree": degree})
                processed_indices.update([i, i + 1])
                i += 2
                continue

            # Case 3: degree → university (e.g., "철학과 졸업 연세대학교")
            if current_type == 'deg' and (i + 1) < len(matches) and matches[i + 1][1] == 'uni':
                deg_match, uni_match = matches[i][0], matches[i + 1][0]
                uni = uni_match.group("university").strip()
                raw_degree = deg_match.group("full_degree").replace('.', '').strip()
                degree = standardize_degree.get(raw_degree, raw_degree)

                between_text = seg[deg_match.end():uni_match.start()].strip(' ,()')
                dept = deg_match.group("hon_field").strip() if deg_match.group("hon_field") else (between_text or 'unlisted')
                education_results.append({"university": uni, "department": dept, "degree": degree})
                processed_indices.update([i, i + 1])
                i += 2
                continue

            i += 1

        # Unmatched universities
        for j, (match, m_type) in enumerate(matches):
            if j not in processed_indices and m_type == 'uni':
                uni = match.group("university").strip()
                education_results.append({"university": uni, "department": 'unlisted', "degree": 'unlisted'})

    return (education_results, " ".join(filter(None, work_experience_parts)))


# --- Apply to exec_df ---

parsed_data = exec_df["Professional Background"].apply(parse_professional_background)

individual_df = pd.DataFrame({
    "user_id": exec_df.index,
    "University": parsed_data.apply(lambda x: [e["university"] for e in x[0]]),
    "Department": parsed_data.apply(lambda x: [e["department"] for e in x[0]]),
    "Degree": parsed_data.apply(lambda x: [e["degree"] for e in x[0]]),
    "Work Experience": parsed_data.apply(lambda x: x[1])
})

In [None]:
""""

standardize the tenure column of exec data into year format (in reference to current year being bsns_yr). 

then from the different types of dart compensation calls, create a consolidated list of salary that includes: 
corp_code, name (blank if entry is from the grouped exec comp), ofcps, mendng_totamt, mendng_totamt_ct_incls_mendng, salary_source, and salary_type (exact/est). 

for each corporation in exec_df, if the listed individual is in the consolidated corp_list, append their compensation. 
however, if they're not listed by name, use their registered director status as reference to append to average salary for their position. 

similarly, create a list of individual major shareholders from the shareholder report. 
for the listed executives, if they're included in the list - append the trmend_posesn_stock_qota_rt value. 

the resulting compensation df should include the following per individual: 
corp_code, company, industry, name, role, executive type(inside, outside, other non exec), compensation, source, salary_ type, tenure, shareholder relation, shares 

"""

In [None]:
#TODO: add tenure parsing by yr 

In [None]:
# manual check for dart reports involving remuneration records 

dart.report('01263022', '임원전체보수유형', 2024, reprt_code='11011') #grouped 
dart.report('00126380', '개인별보수', 2024, reprt_code='00684802') #listed exec 
dart.report('00126380', '임원개인보수', 2024, reprt_code='00684802') #registered exec 

{'message': '정의되지 않은 오류가 발생하였습니다.', 'status': '900'}
{'message': '정의되지 않은 오류가 발생하였습니다.', 'status': '900'}


In [None]:
corp_list = kospi_codes['corp_code']

# === UTILITY ===
def get_json(url, params):
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        if data.get('status') == '013' or 'list' not in data:
            return []
        return data['list']
    except Exception as e:
        print(f"Request failed for {url}: {e}")
        return []

# === MAIN FUNCTION ===
def consolidate_salary_data_api(corp_code):

    endpoints = {
    'individual': 'https://opendart.fss.or.kr/api/hmvAuditIndvdlBySttus.json',
    'unregistered': 'https://opendart.fss.or.kr/api/unrstExctvMendngSttus.json',
    'grouped': 'https://opendart.fss.or.kr/api/drctrAdtAllMendngSttusMendngPymntamtTyCl.json' 
}
    params = {
        'crtfc_key': API_key,
        'corp_code': corp_code,
        'bsns_year': bsns_year,
        'reprt_code': reprt_code
    }

    results = []

    # 1. Individual executives
    for row in get_json(endpoints['individual'], params):
        results.append({
            'corp_code': corp_code,
            'name': row.get('nm'),
            'ofcps': row.get('ofcps'),
            'salary': row.get('mendng_totamt'),
            'benefits': row.get('mendng_totamt_ct_incls_mendng'),
            'salary_source': '개인별보수',
            'salary_type': 'exact'
        })

    # 2. Unregistered executives
    for row in get_json(endpoints['unregistered'], params):
        results.append({
            'corp_code': corp_code,
            'name': '',
            'ofcps': row.get('se'),
            'salary': row.get('jan_salary_am'),
            'benefits': None,
            'salary_source': '미등기임원',
            'salary_type': 'est'
        })

    # 3. Grouped executives
    for row in get_json(endpoints['grouped'], params):
        results.append({
            'corp_code': corp_code,
            'name': '',
            'ofcps': row.get('se'),
            'salary': row.get('psn1_avrg_pymntamt'),
            'benefits': None,
            'salary_source': '임원전체보수유형',
            'salary_type': 'est'
        })

    return results

# === EXECUTION ===
if __name__ == "__main__":
    all_data = []

    for idx, corp_code in enumerate(corp_list):
        data = consolidate_salary_data_api(corp_code)
        all_data.extend(data)
        time.sleep(0.7)

    # Final dataframe
    df = pd.DataFrame(all_data)

# execution time: 13minutes 

In [None]:
df.to_csv("compensation_df", index=False, encoding='utf-8-sig')

In [5]:
compensation_df = pd.read_csv('compensation_df')

In [None]:
# TODO: add in the case where a past exec member is no longer included in current exec_df but receives a salary 
# correct such that for unregistered, if not existing as exact number, it won't take on the estimate for registered 

In [6]:
compensation_df['corp_code'] = compensation_df['corp_code'].astype(str).str.zfill(8)

In [9]:
compensation_df

Unnamed: 0,corp_code,name,ofcps,salary,benefits,salary_source,salary_type
0,00684802,곽근호,총괄 대표이사\n 회장,985000000,-,개인별보수,exact
1,00684802,,미등기임원,177000000,,미등기임원,est
2,00684802,,"등기이사(사외이사, 감사위원회 위원 제외)",267000000,,임원전체보수유형,est
3,00684802,,사외이사(감사위원회 위원 제외),-,,임원전체보수유형,est
4,00684802,,감사위원회 위원,18000000,,임원전체보수유형,est
...,...,...,...,...,...,...,...
5680,00372873,,미등기임원,167000000,,미등기임원,est
5681,00372873,,"등기이사(사외이사, 감사위원회 위원 제외)",241000000,,임원전체보수유형,est
5682,00372873,,사외이사(감사위원회 위원 제외),-,,임원전체보수유형,est
5683,00372873,,감사위원회 위원,44000000,,임원전체보수유형,est


In [15]:
def assign_compensation(exec_df_updated: pd.DataFrame, compensation_df: pd.DataFrame) -> pd.DataFrame:
    """
    Assign salary from compensation_df to each executive in exec_df.
    Uses individual match first; falls back on average grouped position compensation if registered.
    Leaves '미등기' status executives empty unless exact match is found.
    """
    exec_df_updated = exec_df_updated.copy()
    exec_df_updated['Corp Code'] = exec_df_updated['Corp Code'].astype(str).str.zfill(8)

    exec_df_updated['Salary'] = None
    exec_df_updated['salary_source'] = None
    exec_df_updated['salary_type'] = None

    for idx, row in exec_df_updated.iterrows():
        corp_code = row['Corp Code']
        name = row['Name']
        status = row.get('Registered Officer Status', '')
        is_auditor = row.get('is_auditor', False)
        is_committee = row.get('is_audit_committee_member', False)

        # 1. Try to match by name
        match = compensation_df[
            (compensation_df['corp_code'] == corp_code) & 
            (compensation_df['name'] == name)
        ]

        if not match.empty:
            row_data = match.iloc[0]

        else:
            if status == '미등기':
                label = '미등기임원' 
            # 2. Estimate fallback: build label
            elif is_auditor:
                label = '감사'
            elif status == '사외이사':
                label = '감사위원회 위원' if is_committee else '사외이사(감사위원회 위원 제외)'
            else:
                label = '등기이사(사외이사, 감사위원회 위원 제외)'

            group_match = compensation_df[
                (compensation_df['corp_code'] == corp_code) & 
                (compensation_df['name'].isna()) & 
                (compensation_df['ofcps'] == label)
            ]

            row_data = group_match.iloc[0] if not group_match.empty else pd.Series(dtype='object')

        # 3. Assign if valid
        if not row_data.empty:
            exec_df_updated.at[idx, 'Salary'] = row_data.get('salary')
            exec_df_updated.at[idx, 'salary_source'] = row_data.get('salary_source')
            exec_df_updated.at[idx, 'salary_type'] = row_data.get('salary_type')

    return exec_df_updated
exec_df_updated = assign_compensation(exec_df_updated, compensation_df)

# execution time: 19 seconds 

In [17]:
exec_df_updated.to_csv('exec_df_updated')

In [18]:
individual_comps = compensation_df[
    (compensation_df['name'].notnull()) &
    (compensation_df['name'] != '') &
    (compensation_df['name'] != '-')
].copy()

# Build a composite key for matching
exec_keys = set(zip(exec_df_updated['Corp Code'], exec_df_updated['Name']))
comp_keys = set(zip(individual_comps['corp_code'], individual_comps['name']))

# Get the difference: people in comp_df but not exec_df
extra_comp_keys = comp_keys - exec_keys

# Filter those from comp_df
mask = individual_comps.apply(lambda row: (row['corp_code'], row['name']) in extra_comp_keys, axis=1)
extra_individuals = individual_comps[mask]

In [20]:
# manually check that the ind audit status has been properly updated 
# manually check that the compensation has been properly updated 
# manual check that compensation has been accurately assigned 
# unregistered members that are not on the audit commitee should = None 
# check that number of exact in exec_check + extra_check = comp_check 
# check outside dir & audit committee differentiations by the dart report calls:
    # dart.report('01263022', '임원전체보수유형', 2024, reprt_code='11011') #grouped 
    # dart.report('00126380', '개인별보수', 2024, reprt_code='11011') #listed exec 
    # dart.report('00126380', '임원개인보수', 2024, reprt_code='11011') #registered exec 

comp_check = compensation_df[compensation_df['corp_code'] == '00126380']
extra_check = extra_individuals[extra_individuals['corp_code'] == '00126380']
exec_check = exec_df_updated[exec_df_updated['Corp Code'] == '00126380']

In [None]:
exec_check

In [33]:
comp_check

Unnamed: 0,corp_code,name,ofcps,salary,benefits,salary_source,salary_type
4334,126380,박학규,사장,3346000000,-,개인별보수,exact
4335,126380,노태문,이사,5098000000,-,개인별보수,exact
4336,126380,한종희,대표이사,5240000000,-,개인별보수,exact
4337,126380,이정배,상담역,6950000000,-,개인별보수,exact
4338,126380,경계현,고문,8036000000,-,개인별보수,exact
4339,126380,,미등기임원,671000000,,미등기임원,est
4340,126380,,"등기이사(사외이사, 감사위원회 위원 제외)",5734000000,,임원전체보수유형,est
4341,126380,,사외이사(감사위원회 위원 제외),247000000,,임원전체보수유형,est
4342,126380,,감사위원회 위원,120000000,,임원전체보수유형,est
4343,126380,,감사,-,,임원전체보수유형,est


In [34]:
exec_check

Unnamed: 0,Disclosure,Corp Code,Company,Name,Gender,Position,Registered Officer Status,Responsibilities,Professional Background,Shareholder Relation,Period of employment,industry_code,is_audit_committee_member,is_auditor,Salary,salary_source,salary_type
12041,20250311001085,126380,삼성전자,한종희,남,부회장,사내이사,대표이사(DX부문 경영전반 총괄),ㆍ인하대 전자공학 학사ㆍ삼성전자 DX부문장,계열회사 임원,58개월,264,False,False,5240000000,개인별보수,exact
12042,20250311001085,126380,삼성전자,노태문,남,사장,사내이사,MX사업부장,ㆍ포항공대 전자전기공학 박사ㆍ삼성전자 MX사업부장,계열회사 임원,34개월,264,False,False,5098000000,개인별보수,exact
12043,20250311001085,126380,삼성전자,이정배,남,상담역,사내이사,상담역,ㆍ서울대 전자공학 박사ㆍ삼성전자 메모리사업부장,계열회사 임원,34개월,264,False,False,6950000000,개인별보수,exact
12044,20250311001085,126380,삼성전자,김한조,남,이사,사외이사,이사회 의장감사위원회 위원장내부거래위원회 위원보상위원회 위원지속가능경영위원회 위원장,ㆍ연세대 불어불문학 학사ㆍ하나금융지주 부회장,계열회사 임원,70개월,264,True,False,120000000,임원전체보수유형,est
12045,20250311001085,126380,삼성전자,김준성,남,이사,사외이사,보상위원회 위원지속가능경영위원회 위원,ㆍCarnegie Mellon대 \n 경제학/산업공학 학사ㆍ싱가포르 투자청 \n...,계열회사 임원,34개월,264,False,False,247000000,임원전체보수유형,est
12046,20250311001085,126380,삼성전자,허은녕,남,이사,사외이사,내부거래위원회 위원장사외이사 후보추천위원회 위원지속가능경영위원회 위원,ㆍPennsylvania State대\n 자원경제학 박사ㆍ서울대 공과대학 교수,계열회사 임원,26개월,264,False,False,247000000,임원전체보수유형,est
12047,20250311001085,126380,삼성전자,유명희,여,이사,사외이사,감사위원회 위원내부거래위원회 위원사외이사 후보추천위원회 위원지속가능경영위원회 위원,ㆍVanderbilt대 법학 박사ㆍ산업통상자원부 \n 통상교섭본부 본부장,계열회사 임원,26개월,264,True,False,120000000,임원전체보수유형,est
12048,20250311001085,126380,삼성전자,신제윤,남,이사,사외이사,사외이사 후보추천위원회 위원장보상위원회 위원지속가능경영위원회 위원,ㆍ서울대 경제학 학사ㆍ금융위원회 위원장,계열회사 임원,10개월,264,False,False,247000000,임원전체보수유형,est
12049,20250311001085,126380,삼성전자,조혜경,여,이사,사외이사,감사위원회 위원지속가능경영위원회 위원,ㆍ서울대 로봇공학 박사ㆍ한성대 AI응용학과 교수,계열회사 임원,10개월,264,True,False,120000000,임원전체보수유형,est


In [29]:
extra_check

Unnamed: 0,corp_code,name,ofcps,salary,benefits,salary_source,salary_type
4334,126380,박학규,사장,3346000000,-,개인별보수,exact
4338,126380,경계현,고문,8036000000,-,개인별보수,exact


In [None]:
dart.report('00126380', '임원전체보수유형', 2024, reprt_code='11011') #grouped 

Unnamed: 0,rcept_no,corp_cls,corp_code,corp_name,se,nmpr,pymnt_totamt,psn1_avrg_pymntamt,rm,stlm_dt
0,20250515001922,Y,126380,삼성전자,"등기이사(사외이사, 감사위원회 위원 제외)",-,-,-,-,2025-03-31
1,20250515001922,Y,126380,삼성전자,사외이사(감사위원회 위원 제외),-,-,-,-,2025-03-31
2,20250515001922,Y,126380,삼성전자,감사위원회 위원,-,-,-,-,2025-03-31
3,20250515001922,Y,126380,삼성전자,감사,-,-,-,-,2025-03-31


In [None]:
'''
경계현 8,036,000,000 # under extra 
이정배 6,950,000,000	
최시영 6,900,000,000
한종희 5,240,000,000
노태문 5,098,000,000	
박학규 3,346,000,000 # only additional from unregistered - under extra 
'''

In [None]:
dart.report('00126380', '개인별보수', 2024, reprt_code='11011') #indivudal registered directors + auditors

# 최시영 not appended 

Unnamed: 0,rcept_no,corp_cls,corp_code,corp_name,nm,ofcps,mendng_totamt,mendng_totamt_ct_incls_mendng,stlm_dt
0,20250311001085,Y,126380,삼성전자,경계현,고문,8036000000,-,2024-12-31
1,20250311001085,Y,126380,삼성전자,이정배,상담역,6950000000,-,2024-12-31
2,20250311001085,Y,126380,삼성전자,최시영,상담역,6900000000,-,2024-12-31
3,20250311001085,Y,126380,삼성전자,한종희,대표이사,5240000000,-,2024-12-31
4,20250311001085,Y,126380,삼성전자,노태문,이사,5098000000,-,2024-12-31


In [None]:
dart.report('00126380', '미등기임원보수', 2024, reprt_code='11011') #unregistered exec 

Unnamed: 0,rcept_no,corp_cls,corp_code,corp_name,nm,ofcps,mendng_totamt,mendng_totamt_ct_incls_mendng,stlm_dt
0,20250311001085,Y,126380,삼성전자,박학규,사장,3346000000,-,2024-12-31
1,20250311001085,Y,126380,삼성전자,노태문,이사,5098000000,-,2024-12-31
2,20250311001085,Y,126380,삼성전자,한종희,대표이사,5240000000,-,2024-12-31
3,20250311001085,Y,126380,삼성전자,이정배,상담역,6950000000,-,2024-12-31
4,20250311001085,Y,126380,삼성전자,경계현,고문,8036000000,-,2024-12-31


In [None]:
# TOTAL compensation: 임원전체보수 
# EXEC TYPE: 임원전체보수유형 : 
    # 등기이사(사외이사, 감사위원회 위원 제외): (registered director excluding outside directors and audit committee members)
    # 사외이사(감사위원회 위원 제외): outside (non audit committee)
    # 감사위원회 위원: audit committee (감사위원회 위원)
    # 감사: auditor 
# from opendart, we get exact numbers of registered inside/executive directors from: '임원개인보수' current 
# with the addition of '개인별보수' which includes ONEDs making over 5 million 
# '임원전체보수유형' which provides registered directors (non outside/ac), outside dir (excluding ac), ac, auditor 

# consider: having individual nums where possible OR all averaged 

In [None]:
# Appending shareholder status (relation/holding status)

# OPENDART link: https://opendart.fss.or.kr/guide/detail.do?apiGrpCd=DS002&apiId=2019007 
# API url: https://opendart.fss.or.kr/api/hyslrSttus.json
# nm, relate, stock_knd, trmend_posesn_stock_co	, trmend_posesn_stock_qota_rt	

In [161]:
exec_df_updated.head(1)

Unnamed: 0,Disclosure,Corp Code,Company,Name,Gender,Position,Registered Officer Status,Responsibilities,Professional Background,Shareholder Relation,Period of employment,industry_code,is_audit_committee_member,is_auditor,Salary,salary_source,salary_type
0,20250320001609,684802,에이플러스에셋,곽근호,남,총괄 대표이사\n 회장,사내이사,본인,"영남대학교 공업화학과 졸업영남대학교 명예 경영학 박사前) 삼성생명 지점장, 상무現)...",본인,17년\n 5개월,66202,False,False,985000000,개인별보수,exact


In [163]:
API_key = 'c59d8af8c96ea97002c35d4c160a8c8c5c5dbd3c'

In [None]:
corp_list = kospi_codes['corp_code']
url = 'https://opendart.fss.or.kr/api/hyslrSttus.json'
shareholder_status = []

for corp_code in corp_list:
    params = {
        'crtfc_key': API_key,
        'corp_code': corp_code,
        'bsns_year': bsns_year,
        'reprt_code': reprt_code
    }

    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()

        if data['status'] != '000':
            print(f"{corp_code} - No data or error: {data['message']}")
            continue

        if 'list' in data:
            df = pd.DataFrame(data['list'])
            df['corp_code'] = corp_code  # Track origin
            shareholder_status.append(df)

    except Exception as e:
        print(f"Error for corp {corp_code}: {e}")

    time.sleep(0.7)  # Respect rate limit

    print(f"Appended data for: {corp_code}")

shareholder_df = pd.concat(shareholder_status, ignore_index=True)

# execution time: 11 minutes 

Appended data for: 00684802
Appended data for: 00860730
Appended data for: 00400857
Appended data for: 00936787
Appended data for: 00365387
Appended data for: 00111689
Appended data for: 00486705
Appended data for: 00125080
Appended data for: 00117027
Appended data for: 00583424
Appended data for: 00154462
Appended data for: 00138516
Appended data for: 01190568
Appended data for: 00990165
Appended data for: 00138701
Appended data for: 00138729
Appended data for: 00152729
Appended data for: 00143314
Appended data for: 00139454
Appended data for: 00651901
Appended data for: 00101044
Appended data for: 00152385
Appended data for: 00145437
Appended data for: 00138792
Appended data for: 00458234
Appended data for: 00122551
Appended data for: 00219097
Appended data for: 01263022
Appended data for: 00447609
Appended data for: 00124726
Appended data for: 00858364
Appended data for: 00123541
Appended data for: 00123107
Appended data for: 00123772
Appended data for: 00123718
Appended data for: 0

In [176]:
shareholder_df_renamed = shareholder_df.rename(columns={
    'corp_code': 'Corp Code',
    'nm': 'Name',
    'trmend_posesn_stock_co': 'Shares Owned',
    'trmend_posesn_stock_qota_rt': 'Shares Ratio'
})

# Merge by corp_code and name
exec_df_updated = exec_df_updated.merge(
    shareholder_df_renamed[['Corp Code', 'Name', 'Shares Owned', 'Shares Ratio']],
    on=['Corp Code', 'Name'],
    how='left'
)

In [181]:
exec_df_updated.to_csv('exec_salary_shareholder', index = False)

In [1]:
exec_df_updated = pd.read_csv('exec_salary_shareholder')

NameError: name 'pd' is not defined

In [None]:
exec_df_updated

Unnamed: 0,Disclosure,Corp Code,Company,Name,Gender,Position,Registered Officer Status,Responsibilities,Professional Background,Shareholder Relation,Period of employment,industry_code,is_audit_committee_member,is_auditor,Salary,salary_source,salary_type,Shares Owned,Shares Ratio
0,20250320001609,684802,에이플러스에셋,곽근호,남,총괄 대표이사\n 회장,사내이사,본인,"영남대학교 공업화학과 졸업영남대학교 명예 경영학 박사前) 삼성생명 지점장, 상무現)...",본인,17년\n 5개월,66202,False,False,985000000.0,개인별보수,exact,4535596,20.06
1,20250320001609,684802,에이플러스에셋,황승목,남,대표이사,사내이사,대표이사,"서강대학교 철학과 졸업前) 삼성화재 대구사업부장, 상무現) \n(주)에이플러스에셋 ...",발행회사 임원,9년\n 1개월,66202,False,False,267000000.0,임원전체보수유형,est,16245,0.07
2,20250320001609,684802,에이플러스에셋,안영욱,남,기타비상무이사,기타비상무이사,경영자문,서울대학교 경영학과 졸업現) 스카이레이크 인베스트먼트 상무\n 現) \n(주)에이플...,발행회사 임원,4년\n 9개월,66202,False,False,267000000.0,임원전체보수유형,est,,
3,20250320001609,684802,에이플러스에셋,김택군,남,기타비상무이사,기타비상무이사,경영자문,연세대학교 경영학과 졸업現) 스카이레이크 인베스트먼트 상무現) \n(주)에이플러스에...,발행회사 임원,4년\n 9개월,66202,False,False,267000000.0,임원전체보수유형,est,,
4,20250320001609,684802,에이플러스에셋,서동진,남,사외이사,사외이사,감사위원,경북대학교 법학과 졸업前) 아람의료재단 이사장現) \n(주)에이플러스에셋 사외이사,발행회사 임원,4년\n 9개월,66202,True,False,18000000.0,임원전체보수유형,est,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15144,20250318000822,372873,KTis,이병무,남,기타비상무이사,기타비상무이사,평가 및 보상위원회\n위원,고려대학교 정치외교학과 졸업\nDuke University MBA\n前 KT 전략지...,최대주주(법인)의 임원,2024.03.28~\n2026년 정기주주총회일,63991,False,False,241000000.0,임원전체보수유형,est,,
15145,20250318000822,372873,KTis,강현구,남,기타비상무이사,기타비상무이사,평가 및 보상위원회\n위원,국민대학교 회계학과 졸업\n아주대학교 회계학과 석사\n前 KT 그룹경영실 그룹경영1...,최대주주(법인)의 임원,2024.03.28~\n2026년 정기주주총회일,63991,False,False,241000000.0,임원전체보수유형,est,,
15146,20250318000822,372873,KTis,김종만,남,전무,미등기,CV사업본부장,동국대학교 독어독문학과 졸업\n前 KT 강원고객본부 사업지원부장('18.11~'20...,계열회사 임원,2023.12.01~,63991,False,False,167000000.0,미등기임원,est,,
15147,20250318000822,372873,KTis,정영훈,남,상무,미등기,경영기획총괄,연세대학교 경제학과 졸업\n연세대학교 정보대학원 석사\n前 KT 그룹경영실 그룹경영...,계열회사 임원,2024.12.3.~,63991,False,False,167000000.0,미등기임원,est,,


: 

In [None]:
exec_df_updated['Salary'] = (
    exec_df_updated['Salary']
    .astype(str)
    .str.replace(r'[^\d.]', '', regex=True)
)

# Convert to numeric (coerce errors to NaN)
exec_df_updated['Salary'] = pd.to_numeric(exec_df_updated['Salary'], errors='coerce')

# Drop rows where Salary is still NaN after conversion
exec_df_cleaned = exec_df_updated.dropna(subset=['Salary'])

# Now group and aggregate
industry_stats = exec_df_cleaned.groupby('industry_code').agg(
    median_salary=('Salary', 'median'),
    count=('Salary', 'count')
).sort_index()

print("Executive salary stats by industry code:")
print(industry_stats)

Executive salary stats by industry code:
               median_salary  count
industry_code                      
102              133000000.0     30
104              214851000.0     20
105              156375910.0     76
106               86333000.0      9
108              261000000.0    342
...                      ...    ...
75320            296000000.0     35
76110            300000000.0     28
76320            218000000.0     19
85120            161000000.0     10
91249            123000000.0     46

[308 rows x 2 columns]


In [None]:
# 7/30 TODO: check that consolidated salaries/compensation are accurate 
# check that audit update is accurate (including for ind_df -> exec_df_updated)
# see if possible to standardize tenure by years employed 
# see if it's possible to map industry code to industry name, otherwise run k-means on industry code 
# update documentation (comments in notebook and seperate doc detailing transformations, dropped columns, sources, why API vs dart fss or opendart reader)

In [None]:
dart.report('01263022', '임원전체보수', 2024, reprt_code='11011')
dart.report('01263022', '임원전체보수유형', 2024, reprt_code='11011')

Unnamed: 0,rcept_no,corp_cls,corp_code,corp_name,nmpr,jan_avrg_mendng_am,mendng_totamt,rm,stlm_dt
0,20250318000733,Y,1263022,BGF리테일,7,425000000,2972000000,-,2024-12-31


In [None]:
# TOTAL compensation: 임원전체보수 
# EXEC TYPE: 임원전체보수유형 : 
    # 등기이사(사외이사, 감사위원회 위원 제외): (registered director excluding outside directors and audit committee members)
    # 사외이사(감사위원회 위원 제외): outside (non audit committee)
    # 감사위원회 위원: audit committee (감사위원회 위원)
    # 감사: auditor 
# from opendart, we get exact numbers of registered inside/executive directors from: '임원개인보수' current 
# with the addition of '개인별보수' which includes ONEDs making over 5 million 
# '임원전체보수유형' which provides registered directors (non outside/ac), outside dir (excluding ac), ac, auditor 

# consider: having individual nums where possible OR all averaged 

In [None]:
""""
Korean	      English
사모자금사용	Use of Private Fundraising Proceeds
공모자금사용	Use of Public Offering Proceeds
주식총수	Total Shares Outstanding

사외이사	Outside Directors https://engopendart.fss.or.kr/guide/detail.do?apiGrpCd=DE002&apiId=AE00027 
    - already has information for num directors, outside directors 

최대주주	Major Shareholder https://engopendart.fss.or.kr/guide/detail.do?apiGrpCd=DE002&apiId=AE00008 
- 최대주주변동	Change in Major Shareholder 
소액주주	Minority Shareholders https://engopendart.fss.or.kr/guide/detail.do?apiGrpCd=DE002&apiId=AE00010  

Compensation Types: 
    임원전체보수	Total Executive Compensation 
        임원전체보수유형	Types of Total Executive Compensation
    개인별보수	Individual Compensation Disclosure 
        임원개인보수	Individual Executive Compensation 


"""


In [None]:
# if this doesn't include the break down by executive position like https://engopendart.fss.or.kr/disclosureinfo/fnltt/singl/main.do does 

# cross check whether its concatonated under the other individual api call 

# if enough, merge on corp code 
# if not enough, scrape from url directly for grouped data 

# figure out how to extract industry name from code and group data from there 

# from the url link: pull the following info
# 1. total remuneration amount (with unit appended)
# 2. average per executive (general)
# 3. average per registered directors 
# 4. average per outside non audit committee directors 
# 5. average per audit committee members 

In [None]:
# From individual_df create a separate one of the registered executives 

# Append compensation from average remittance, stocks, relationship to largest shareholder  

In [None]:
# Try OpenDart Reader function: 
# 2. 사업보고서:  '임원개인보수', '임원전체보수', '개인별보수',

GOAL: Clean up all comments and scrap irrelevancy. Fix the building notebook audit update and governance check. Then correct the logic behind the remuneration data. Update and append any other relevant info.

If necessary, see if scraping directly will yield better results. Otherwise don't. See if opendart reader has a function to call in the data instead. 