In [1]:
# import libraries and set parameters 

import os
import io
import re
import time
import zipfile 
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor 

import dart_fss
import OpenDartReader
from datetime import datetime

# OpenDART credentials. The OPENDART_API_KEY environment variable takes precedence so the
# key does not have to live in the notebook; the literal is kept only as a fallback so
# existing runs keep working.
# NOTE(review): the fallback key is stored in plain text -- rotate it and remove the literal.
API_key = os.environ.get('OPENDART_API_KEY', '0d67945133e224c451452e071e0d8349969353e1')
dart = OpenDartReader(API_key)
dart_fss.set_api_key(API_key)

current_year = 2025
bsns_year = '2024' # from most recent annual report
reprt_code = '11011'
# Intended to be a year before bsns_year due to the grace period for corps surpassing 2 trillion krw.
# NOTE(review): the value currently equals bsns_year ('2024'), which contradicts that intent -- confirm.
asset_year = '2024'
current_date = '2025.08.07'

# call most recent annual/semi-annual report as quarterly may exclude audit info due to corporate disclosure form preparation standards 

Error occurred during getting browser(s): random, but was suppressed with fallback.


Take in Raw Data. Clean, standardize, and parse. Produce an individual_df and standardized_summary_df 

individual_df 
- assign salary
- check that salary numbers add up 

group_df 
- counts of all 
- governance check/flag 

# TODO: fully fix exec-df 
- standardize tenure 
- group data 
- make sure that audit committee membership is correct (governance check -> if same problems come up, extract audit committee membership from financial doc)
- standardize education 

# TODO: group data 
- summarize the exec_df + get the counts 
- check that the total comp of summary_df = the salary_total comp vals 
- standardize tenure 

# At Large: 
- check that audit committee members are correct 

# 08/08 TODO:
- fix auditor placeholder logic + flagging 
- fix exec updating 
- parse prof_background to separate edu / work experience
- rename columns 
- save all processed df's 
- clean notebooks 


# Documentation Format: 
- the produced data frames
- the full code, with explanations of the decision processes (i.e. why est/exact for salary, the flags for audit committee, the assets check from 2 years ago), which columns were dropped, etc.

# Useful OpenDART links: 
- https://opendart.fss.or.kr/guide/detail.do?apiGrpCd=DE004&apiId=AE00041 Ownership change (most recent data)

In [2]:
# 2. Read in Files: if run before, read in the files 
# Location of the raw OpenDART extracts, defined once so it can be changed in one place.
RAW_DATA_DIR = r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\raw'

# Report-specific files are named <dataset>_<bsns_year>_<reprt_code>.csv, so the suffix is
# taken from the config cell instead of being repeated in every file name.
# NOTE(review): assumes the assets file follows bsns_year rather than asset_year -- confirm.
_report_suffix = f'{bsns_year}_{reprt_code}'


def _read_raw_csv(filename):
    """Load one CSV from RAW_DATA_DIR into a DataFrame."""
    return pd.read_csv(os.path.join(RAW_DATA_DIR, filename))


all_corp_codes_df = _read_raw_csv('listed_corp_codes.csv')
kospi_company_info_df = _read_raw_csv('kospi_company_info.csv')
executive_status_data_df = _read_raw_csv(f'executive_status_{_report_suffix}.csv')
assets_df = _read_raw_csv(f'assets_{_report_suffix}.csv')
salary_type_df = _read_raw_csv(f'salary_data_{_report_suffix}.csv')
salary_total_df = _read_raw_csv(f'salary_total_data_{_report_suffix}.csv')
major_shareholder_df = _read_raw_csv(f'major_shareholders_{_report_suffix}.csv')

In [3]:
# Work on a trimmed copy of the raw executive table; errors='ignore' skips any listed
# column that is not present instead of raising.
exec_df = executive_status_data_df.drop(columns=['corp_cls', 'birth_ym', 'fte_at', 'tenure_end_on', 'stlm_dt'], errors='ignore')

def is_audit_committee_member(responsibility, position):
    """
    Tell whether a responsibility text names an audit committee role.

    Args:
        responsibility: Free-text responsibility field. Anything that is not a string
            (NaN, None, numbers) is treated as "no role".
        position: Accepted for signature compatibility; not used in the decision.

    Returns:
        bool: True when the text, with all whitespace removed, contains '감사위원'.
    """
    # Missing cells are not strings; re.sub would raise TypeError on them.
    if not isinstance(responsibility, str):
        return False

    responsibility_cleaned = re.sub(r'\s', '', responsibility)

    # '감사위원회위원' and '감사위원장' both contain '감사위원', so one substring test
    # covers every alternative the previous pattern listed.
    return '감사위원' in responsibility_cleaned

def is_auditor_exclusive(responsibility, position): #isolated capture to not overlap with is_audit_committee_member
    """
    Tell whether a responsibility text names an auditor who is NOT an audit committee member.

    Args:
        responsibility: Free-text responsibility field. Non-string values (NaN, None)
            yield False.
        position: Forwarded to is_audit_committee_member; not otherwise used.

    Returns:
        bool: True when the whitespace-stripped text contains '감사' and
        is_audit_committee_member() does not claim the same text.
    """
    # Guard first: the committee check runs a regex substitution and cannot take non-strings.
    if not isinstance(responsibility, str):
        return False

    # Committee members are excluded so the two flags never overlap.
    if is_audit_committee_member(responsibility, position):
        return False

    return '감사' in re.sub(r'\s', '', responsibility)

# apply updated audit membership 
def _flag_rows(predicate):
    """Evaluate predicate(chrg_job, rgist_exctv_at) for every row of exec_df."""
    return exec_df.apply(
        lambda record: predicate(record['chrg_job'], record['rgist_exctv_at']),
        axis=1,
    )


# Both flags come from the same two columns; only the predicate differs.
exec_df['is_audit_committee_member'] = _flag_rows(is_audit_committee_member)
exec_df['is_auditor'] = _flag_rows(is_auditor_exclusive)

In [4]:
def assign_compensation(exec_df: pd.DataFrame, salary_type_df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill 'salary', 'salary_source' and 'salary_type' on exec_df from salary_type_df.

    For each executive, a row of salary_type_df with the same corp_code and name is
    used when one exists. Otherwise a position label is derived from the executive's
    status and audit flags, and the unnamed (name is NaN) row of the same company
    carrying that label is used. Executives with neither kind of match keep None.

    Args:
        exec_df: Executive table; reads 'corp_code', 'nm', 'rgist_exctv_at' and, when
            present, 'is_auditor' / 'is_audit_committee_member'. Modified in place.
        salary_type_df: Compensation table with 'corp_code', 'name', 'position',
            'compensation', 'salary_source' and 'salary_type'.

    Returns:
        pd.DataFrame: The same exec_df object, with the three columns populated.
    """
    def fallback_label(status, auditor_flag, committee_flag):
        # Order matters: non-registered status wins over the auditor flag, which wins
        # over the outside-director split.
        if status == '미등기':
            return '미등기임원'
        if auditor_flag:
            return '감사'
        if status == '사외이사':
            if committee_flag:
                return '감사위원회 위원'
            return '사외이사(감사위원회 위원 제외)'
        return '등기이사(사외이사, 감사위원회 위원 제외)'

    for column in ('salary', 'salary_source', 'salary_type'):
        exec_df[column] = None

    for idx, person in exec_df.iterrows():
        same_corp = salary_type_df['corp_code'] == person['corp_code']

        # Exact individual disclosure first.
        candidates = salary_type_df[same_corp & (salary_type_df['name'] == person['nm'])]

        if candidates.empty:
            # Fall back to the grouped row for this person's category.
            label = fallback_label(
                person.get('rgist_exctv_at'),
                person.get('is_auditor', False),
                person.get('is_audit_committee_member', False),
            )
            candidates = salary_type_df[
                same_corp
                & (salary_type_df['name'].isna())
                & (salary_type_df['position'] == label)
            ]

        if candidates.empty:
            continue

        chosen = candidates.iloc[0]
        exec_df.at[idx, 'salary'] = chosen.get('compensation')
        exec_df.at[idx, 'salary_source'] = chosen.get('salary_source')
        exec_df.at[idx, 'salary_type'] = chosen.get('salary_type')

    return exec_df

# Populate the salary columns, then left-join the shareholding column by company and name.
exec_df = assign_compensation(exec_df, salary_type_df)

# NOTE(review): a left merge repeats an executive once per matching shareholder row, so
# duplicate (corp_code, nm) pairs in major_shareholder_df would inflate later counts -- verify.
_ownership_columns = ['corp_code', 'nm', 'trmend_posesn_stock_qota_rt']
exec_df = exec_df.merge(
    major_shareholder_df[_ownership_columns],
    how='left',
    on=['corp_code', 'nm'],
)

In [6]:
def convert_tenure_to_months(tenure_str, current_date=None):
    """
    Converts a tenure string from various formats into a total number of months.

    Start dates are converted to the number of months elapsed up to current_date;
    durations are converted directly. A date-looking fragment that is immediately
    followed by '년' or '개월' (e.g. '11.3' in '11.3년', '12년6' in '12년6개월') is a
    duration, not a start date, and is handled by the duration patterns.

    Args:
        tenure_str (str): The string representing tenure (e.g., '2년 6개월', '2022.05', '11.3년').
        current_date (datetime): The reference date to calculate tenure from a start date.
                                 Defaults to today's date if not provided.

    Returns:
        float: The total tenure in months, or pd.NA if the input is not a non-blank
        string or its format is not recognized.
    """
    if not isinstance(tenure_str, str) or not tenure_str.strip():
        return pd.NA

    tenure_str = tenure_str.strip()

    # 1. Pattern: All date formats, including those with extra text
    date_match = re.search(r'(\d{2,4}[년\.\s]\d{1,2}[월]?(?:[년\.\s]\d{1,2}[일])?|\d{1,2}\.\d{1,2}\.\d{1,2})', tenure_str)

    # The date pattern also fits the digits of a duration. Without this check '11.3년'
    # is read as March 2011 and '12년6개월' as June 2012.
    if date_match and re.match(r'\s*(?:개월|년)', tenure_str[date_match.end():]):
        date_match = None

    if date_match:
        date_str = date_match.group(1).replace(' ', '').replace('년', '.').replace('월', '').replace('일', '')
        date_obj = pd.NaT

        # Try the cleaned date string against each supported layout, most specific first.
        date_formats = ['%Y.%m.%d', '%Y.%m', '%d.%m.%y', '%y.%m.%d', '%y.%m']
        for fmt in date_formats:
            try:
                date_obj = pd.to_datetime(date_str, format=fmt, errors='raise')
                break  # first layout that parses wins
            except (ValueError, TypeError):
                continue

        if pd.notna(date_obj):
            if current_date is None:
                current_date = datetime.now()

            total_months = (current_date.year - date_obj.year) * 12 + (current_date.month - date_obj.month)
            # A start date after the reference date counts as zero tenure.
            return float(max(0, total_months))
        # An unparseable date fragment falls through to the duration patterns.

    # 2. Pattern: Decimal years (e.g., '11.3년', '11.3 년', '3.5')
    match_deci = re.search(r'^(\d+(?:\.\d+)?)\s*(?:년)?$', tenure_str)
    if match_deci:
        decimal_years = float(match_deci.group(1))
        return decimal_years * 12

    # 3. Pattern: Years and Months (e.g., "3년 6개월", "4년4개월")
    match_ym = re.search(r'(\d+)\s*년(?:[^\d]+)?\s*(\d+)\s*개월', tenure_str)
    if match_ym:
        years = int(match_ym.group(1))
        months = int(match_ym.group(2))
        return float(years * 12 + months)

    # 4. Pattern: Only Years (e.g., "3년")
    match_y = re.search(r'^(\d+)\s*년$', tenure_str)
    if match_y:
        years = int(match_y.group(1))
        return float(years * 12)

    # 5. Pattern: Only Months (e.g., "18개월")
    match_m = re.search(r'^(\d+)\s*개월$', tenure_str)
    if match_m:
        months = int(match_m.group(1))
        return float(months)

    return pd.NA

# Fixed reference date so tenure derived from start dates is reproducible between runs.
# NOTE(review): this is 2025-08-05 while the config cell sets current_date = '2025.08.07' -- confirm.
reference_date = datetime(2025, 8, 5)


def _tenure_in_months(raw_value):
    """Convert one raw hffc_pd entry to months relative to reference_date."""
    return convert_tenure_to_months(raw_value, current_date=reference_date)


# The column is overwritten in place. Re-running this cell turns every value into pd.NA,
# because convert_tenure_to_months only accepts strings.
exec_df['hffc_pd'] = exec_df['hffc_pd'].apply(_tenure_in_months)

In [7]:
def extract_summary(group):
    """
    Collapse one company's executive rows into a Series of head counts.

    Reads 'rgist_exctv_at', 'sexdstn', 'is_audit_committee_member' and 'is_auditor'.
    The voting-director figures exclude rows whose status is '미등기' or '감사', and
    'Voting Directors' is the sum of the female and male counts within that subset.
    """
    status = group['rgist_exctv_at']
    on_committee = group['is_audit_committee_member']
    is_outside = status == '사외이사'

    # Gender values of everyone except non-registered executives and auditors.
    voting_gender = group.loc[~status.isin(['미등기', '감사']), 'sexdstn']
    n_female = (voting_gender == '여').sum()
    n_male = (voting_gender == '남').sum()

    counts = {}

    # Totals over every row of the company.
    counts['Audit Committee'] = on_committee.sum()
    counts['Audit Committee ODs'] = ((on_committee == True) & is_outside).sum()
    counts['Inside Directors'] = status.isin(['사내이사', '대표집행임원']).sum()
    counts['Outside Directors'] = is_outside.sum()
    counts['Other Non-Exec Directors'] = (status == '기타비상무이사').sum()
    counts['Auditors'] = group['is_auditor'].sum()

    # Voting-director subset.
    counts['Female Voting'] = n_female
    counts['Male Voting'] = n_male
    counts['Voting Directors'] = n_female + n_male

    # Non-registered executives, counted separately.
    counts['Non Registered'] = (status == '미등기').sum()

    return pd.Series(counts)


# merge with kospi_codes to append corp_code, required for financial statment search in next section 
# Item code and stock_code are interchangeable 
# Hand apply() only the columns extract_summary reads. Selecting them explicitly keeps
# the grouping columns out of each group frame, which newer pandas versions warn about
# (and later exclude) when apply() is called on the whole grouped frame.
_summary_inputs = ['rgist_exctv_at', 'sexdstn', 'is_audit_committee_member', 'is_auditor']
summary_df = (
    exec_df.groupby(['corp_code', 'corp_name'])[_summary_inputs]
    .apply(extract_summary)
    .reset_index()
)

# Attach the three asset columns; 'Corp Code' is only the join key and is dropped afterwards.
summary_df = pd.merge(summary_df, assets_df[['Corp Code', 'Total Assets (2024)', 'Total Assets (2023)', 'Total Assets (2022)']], left_on='corp_code', right_on='Corp Code', how='left')
summary_df = summary_df.drop(columns=['Corp Code'])

# One receipt number per company: the largest rcept_no among its executive rows.
rcept_no = exec_df.groupby('corp_code')['rcept_no'].max().reset_index()
summary_df = pd.merge(summary_df, rcept_no, on='corp_code', how='left')

  summary_df = exec_df.groupby(['corp_code', 'corp_name']).apply(extract_summary).reset_index()


In [8]:
def check_governance_compliance(df, asset_col='Total Assets (2022)', asset_threshold=2_000_000_000_000):
    """
    Evaluates governance compliance based on board and audit committee rules.

    Only companies whose asset_col value is present and above asset_threshold are
    checked. For those, a condition is flagged when:
      * no audit committee member is counted, or fewer than 3 are;
      * the committee's outside-director count is missing or below 2/3 of the committee;
      * outside directors number fewer than 1/4 of voting directors.
    NOTE(review): the 1/4 rule is applied only above the asset threshold -- confirm that
    this scoping is intended.

    Args:
        df (pd.DataFrame): Summary table with 'corp_code', 'Audit Committee',
            'Audit Committee ODs', 'Outside Directors', 'Voting Directors' and asset_col.
        asset_col (str): Column holding the asset figure used for the size test.
        asset_threshold (float): Asset level above which the rules apply.

    Returns:
        pd.DataFrame with 'corp_code', 'Flagged Conditions' and the remaining columns of
        df for every non-compliant company; an empty frame with those columns when no
        company is flagged.
    """
    flagged = []

    for _, row in df.iterrows():
        corp_code = row['corp_code']
        total_assets = row[asset_col]
        num_audit_committee = row['Audit Committee']
        num_outside_directors = row['Outside Directors']
        num_voting_directors = row['Voting Directors']
        num_outside_committee = row['Audit Committee ODs']

        failures = []

        # Check audit committee compliances
        if pd.notna(total_assets) and total_assets > asset_threshold:
            if pd.isna(num_audit_committee) or num_audit_committee == 0:
                failures.append("No Audit Committee listed")
            elif num_audit_committee < 3:
                failures.append(f"Audit Committee has fewer than 3 members ({num_audit_committee})")

            if pd.notna(num_audit_committee) and num_audit_committee > 0:
                if pd.isna(num_outside_committee):
                    failures.append("Missing count of Audit Committee Outside Directors.")
                elif num_outside_committee < (2/3) * num_audit_committee:
                    failures.append(f"Audit Committee Outside Directors ({num_outside_committee}) < 2/3 of Audit Committee ({num_audit_committee})")
            if num_outside_directors < (1/4) * num_voting_directors:
                failures.append(f"Outside Directors ({num_outside_directors}) < 1/4 of Voting Directors ({num_voting_directors})")

        if failures:
            flagged.append({'corp_code': corp_code, 'Flagged Conditions': "; ".join(failures)})

    # Naming the columns keeps 'corp_code' present when nothing was flagged; an empty
    # list would otherwise yield a column-less frame and a KeyError in the merge.
    failed_df = pd.DataFrame(flagged, columns=['corp_code', 'Flagged Conditions'])
    # Align the key dtype with df so the merge works for an empty result as well.
    failed_df['corp_code'] = failed_df['corp_code'].astype(df['corp_code'].dtype)

    # Merge with original data
    return pd.merge(failed_df, df, on='corp_code', how='inner')

# Run the compliance rules over every company summary; the result holds one row per
# company that has at least one flagged condition.
flagged = check_governance_compliance(summary_df) 

# ~ 75 flagged

In [9]:
def missing_acm_urls(flagged_df, pause=0.7, max_retries=2):
    """
    Look up, for each flagged company, the URL of the sub-document whose title contains
    '감사제도에 관한 사항' within the filing identified by its rcept_no.

    Args:
        flagged_df (pd.DataFrame): Rows with 'corp_name', 'corp_code', 'rcept_no' and
            'Total Assets (2022)'.
        pause (float): Seconds to wait after each company. A failed attempt waits
            pause * attempt_number before retrying.
        max_retries (int): Extra attempts after a failed dart.sub_docs call. Use 0 to
            try each company once.

    Returns:
        pd.DataFrame: One row per input row with 'corp_code' (as str), 'corp_name',
        'rcept_no', 'url' (None when not found or when every attempt failed) and
        'Total Assets (2022)'.
    """
    results = []

    for _, row in flagged_df.iterrows():
        company = row['corp_name']
        corp = row['corp_code']
        rcp = row['rcept_no'] # update to append the disclosure from exec_df most recent
        assets = row['Total Assets (2022)']

        url = None
        for attempt in range(max_retries + 1):
            try:
                subdocs = dart.sub_docs(str(rcp))  # rcept_no must be string
                # na=False keeps missing titles from breaking the boolean mask; the
                # search text is literal, so regex matching is switched off.
                title_hit = subdocs['title'].str.contains("감사제도에 관한 사항", regex=False, na=False)
                match = subdocs[title_hit]
                if not match.empty:
                    url = match.iloc[0]['url']
                break
            except Exception as e:
                if attempt < max_retries:
                    # Dropped connections are often transient: back off and try again.
                    time.sleep(pause * (attempt + 1))
                    continue
                print(f"Failed to fetch for {corp} ({rcp}): {e}")

        results.append({
            'corp_code': str(corp),
            'corp_name': company,
            'rcept_no': rcp,
            'url': url,
            'Total Assets (2022)': assets
        })

        time.sleep(pause)

    return pd.DataFrame(results)

# 3. Retrieve target audit committee document URLs for flagged cases
# missing_acm_urls makes one dart.sub_docs request per flagged row and sleeps between
# rows; afterwards only the first row per corp_code is kept.
audit_targets_df = missing_acm_urls(flagged).drop_duplicates(subset=['corp_code'])

Failed to fetch for 101220 (20250320001653): ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Failed to fetch for 102432 (20250318001013): ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Failed to fetch for 104856 (20250306000628): ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Failed to fetch for 106669 (20250318000465): ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Failed to fetch for 108135 (20250318001422): ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Failed to fetch for 108241 (20250312000711): ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Failed to fetch for 111810 (20250401003907): ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Failed to fetch for 113058 (202503

KeyboardInterrupt: 

In [None]:
def _extract_committee_members(soup):
    """Return [{'name', 'is_outside'}, ...] from the first table after an audit-committee heading."""
    # Headings are tried in this order; the first one followed by a table wins.
    header_patterns = [
        re.compile(r'감사위원\s*현황'),
        re.compile(r'감사위원회\s*위원의\s*인적사항'),
        re.compile(r'감사위원회\s*위원'),
        re.compile(r'감사기구\s*관련\s*사항')
    ]

    table = None
    for pattern in header_patterns:
        heading = soup.find(string=pattern)
        if heading:
            table = heading.find_next('table')
            if table:
                break

    if not table:
        return []

    # Column positions come from the cells of the first two rows, flattened in order.
    header_rows = table.find_all('tr', limit=2)
    headers = [
        cell.get_text(strip=True).replace('\xa0', '').replace('\n', '')
        for tr in header_rows
        for cell in tr.find_all(['th', 'td'])
    ]
    name_idx = next((i for i, h in enumerate(headers) if '성명' in h), None)
    outside_idx = next((i for i, h in enumerate(headers) if '사외이사' in h), None)
    if name_idx is None or outside_idx is None:
        return []

    # Prefer <tbody> rows; otherwise skip the rows already consumed as headers.
    body = table.find('tbody')
    data_rows = body.find_all('tr') if body else table.find_all('tr')[len(header_rows):]

    members = []
    for tr in data_rows:
        cells = tr.find_all(['td', 'th'])
        if len(cells) <= max(name_idx, outside_idx):
            continue
        name = cells[name_idx].get_text(strip=True)
        outside_text = cells[outside_idx].get_text(strip=True)
        # Skip placeholder rows ('-', separator dashes, blank names).
        if name and name != '-' and '---' not in name:
            members.append({
                'name': name,
                'is_outside': '예' in outside_text or 'O' in outside_text,
            })
    return members


def parse_and_update_audit_members(audit_targets_df, exec_df, summary_df):
    """
    Parses financial documents for flagged corporations to find the table under
    the '감사위원 현황' title (or a related heading) and updates audit committee
    member information.

    For every target with a URL the page is downloaded and parsed. Members already in
    exec_df (same corp_code and name) get is_audit_committee_member = True; members
    not present are appended as new rows. summary_df's 'Audit Committee' and
    'Audit Committee ODs' are overwritten with the parsed counts, or with 0 when the
    page yields no members. Targets without a URL, and targets whose download or
    parsing raises, leave both frames unchanged for that company.

    Side effect: the 'corp_code' column of the exec_df and summary_df objects passed in
    is converted to str in place.

    Args:
        audit_targets_df (pd.DataFrame): DataFrame of corporations flagged for
                                         audit committee updates ('corp_code',
                                         'corp_name', 'url').
        exec_df (pd.DataFrame): DataFrame containing executive information.
        summary_df (pd.DataFrame): DataFrame containing summary information.

    Returns:
        tuple: The updated exec_df and summary_df DataFrames. exec_df is a new object
        when rows were appended.
    """
    # Compare corp_code as text in both frames.
    for df in [exec_df, summary_df]:
        if 'corp_code' in df.columns:
            df['corp_code'] = df['corp_code'].astype(str)

    new_execs_to_add = []
    summary_updates = {}

    for _, row in audit_targets_df.iterrows():
        corp_code = str(row['corp_code'])
        company = row['corp_name']
        url = row['url']

        # Missing URLs arrive as None/NaN; nothing to download.
        if not isinstance(url, str):
            continue

        try:
            response = requests.get(url, timeout=20)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            members_found = _extract_committee_members(soup)

            if members_found:
                total_members = len(members_found)
                outside_members = sum(1 for member in members_found if member['is_outside'])

                for member in members_found:
                    name = member['name']
                    existing_mask = (exec_df['corp_code'] == corp_code) & (exec_df['nm'] == name)
                    if existing_mask.any():
                        exec_df.loc[existing_mask, 'is_audit_committee_member'] = True
                    else:
                        new_execs_to_add.append({
                            'corp_code': corp_code, 'corp_name': company, 'nm': name,
                            'chrg_job': '감사위원회 위원', 'is_audit_committee_member': True
                        })

                summary_updates[corp_code] = {
                    'Audit Committee': total_members, 'Audit Committee ODs': outside_members,
                }
            else:
                summary_updates[corp_code] = {
                    'Audit Committee': 0, 'Audit Committee ODs': 0,
                }

        except Exception as e:
            print(f"Exception occurred for {corp_code} - {company}: {e}")
        time.sleep(0.7)

    for corp_code, update in summary_updates.items():
        target_rows = summary_df['corp_code'] == corp_code
        summary_df.loc[target_rows, 'Audit Committee'] = update['Audit Committee']
        summary_df.loc[target_rows, 'Audit Committee ODs'] = update['Audit Committee ODs']

    # Members found in the filing but missing from exec_df were previously collected
    # and then discarded; append them so the executive table reflects the filing.
    if new_execs_to_add:
        additions = pd.DataFrame(new_execs_to_add).drop_duplicates(subset=['corp_code', 'nm'])
        if 'is_auditor' in exec_df.columns:
            # Keep the flag column free of NaN for the appended committee members.
            additions['is_auditor'] = False
        exec_df = pd.concat([exec_df, additions], ignore_index=True)

    return exec_df, summary_df

In [None]:
# Re-derive audit committee membership for the flagged companies from their filing pages;
# both frames are replaced by the versions the function returns.
exec_df, summary_df = parse_and_update_audit_members(
    audit_targets_df,
    exec_df,
    summary_df
)

❌ 101220 - KG케미칼: No valid audit info found.
✅ Updated 102432 - 계룡건설산업 with 3 audit committee members.
✅ Updated 104856 - 삼성증권 with 3 audit committee members.
✅ Updated 106669 - 세아베스틸지주 with 4 audit committee members.
❌ 108135 - 녹십자홀딩스: No valid audit info found.
✅ Updated 108241 - 농심 with 4 audit committee members.
✅ Updated 111810 - 대웅 with 1 audit committee members.
✅ Updated 113058 - 한화생명 with 3 audit committee members.
✅ Updated 113191 - 코리안리 with 3 audit committee members.
✅ Updated 113359 - 교보증권 with 3 audit committee members.
❌ 113997 - 롯데에너지머티리얼즈: No valid audit info found.
❌ 114792 - 동국홀딩스: No valid audit info found.
✅ Updated 115694 - DB증권 with 3 audit committee members.
✅ Updated 115977 - 아이에스동서 with 3 audit committee members.
❌ 116949 - DN오토모티브: No valid audit info found.
✅ Updated 117188 - 효성 with 3 audit committee members.
✅ Updated 117577 - 오리온홀딩스 with 3 audit committee members.
✅ Updated 120182 - NH투자증권 with 2 audit committee members.
✅ Updated 121941 - 대상 with 4 audit

In [None]:
# Re-run the compliance check on the updated summary and print three groups of flagged
# companies: auditor only, undersized committee, and neither committee nor auditor.
def _print_compliance_section(subset, found_title, none_title, describe):
    """Print a title and one line per row of subset (or none_title), then a divider."""
    if subset.empty:
        print(none_title)
    else:
        print(found_title)
        for _, record in subset.iterrows():
            print(describe(record))
    print("-----------------------------------------")


check_2 = check_governance_compliance(summary_df)

_committee_size = check_2['Audit Committee']
_auditor_count = check_2['Auditors']

print("--- Governance Compliance Check Results ---")

# No audit committee counted, but at least one auditor
no_ac_but_auditor = check_2[(_committee_size == 0) & (_auditor_count > 0)]
_print_compliance_section(
    no_ac_but_auditor,
    "\nCorporations with a single auditor (no audit committee):",
    "\nNo corporations found with a single auditor (no audit committee).",
    lambda record: f"  - {record['corp_name']} (Auditor: {record['Auditors']})",
)

# Committee present but with fewer than 3 members
partial_ac = check_2[(_committee_size > 0) & (_committee_size < 3)]
_print_compliance_section(
    partial_ac,
    "\nCorporations with a partial audit committee (< 3 members):",
    "\nNo corporations found with a partial audit committee.",
    lambda record: f"  - {record['corp_name']} (Members: {record['Audit Committee']})",
)

# Neither a committee nor an auditor counted
no_ac_no_auditor = check_2[(_committee_size == 0) & (_auditor_count == 0)]
_print_compliance_section(
    no_ac_no_auditor,
    "\nCorporations with no audit committee or auditor found:",
    "\nNo corporations found with neither an audit committee nor an auditor.",
    lambda record: f"  - {record['corp_name']}",
)

--- Governance Compliance Check Results ---

✅ Corporations with a single auditor (no audit committee):
  - KG케미칼 (Auditor: 1)
  - 녹십자홀딩스 (Auditor: 1)
  - 롯데에너지머티리얼즈 (Auditor: 1)
  - 동국홀딩스 (Auditor: 1)
  - DN오토모티브 (Auditor: 1)
  - 세아제강지주 (Auditor: 1)
  - SGC에너지 (Auditor: 1)
  - 아세아 (Auditor: 1)
  - 영원무역홀딩스 (Auditor: 1)
  - 제일기획 (Auditor: 1)
  - 기업은행 (Auditor: 1)
  - 코오롱 (Auditor: 1)
  - 한신공영 (Auditor: 1)
  - 한일홀딩스 (Auditor: 1)
  - 다우기술 (Auditor: 1)
  - 미스토홀딩스 (Auditor: 1)
  - 콘텐트리중앙 (Auditor: 1)
  - SNT홀딩스 (Auditor: 1)
  - 강원랜드 (Auditor: 1)
  - 세아홀딩스 (Auditor: 2)
  - 대상홀딩스 (Auditor: 1)
  - 한국콜마 (Auditor: 2)
  - 아세아시멘트 (Auditor: 1)
  - 제이알글로벌리츠 (Auditor: 2)
  - ESR켄달스퀘어리츠 (Auditor: 2)
-----------------------------------------

⚠️ Corporations with a partial audit committee (< 3 members):
  - 대웅 (Members: 1)
  - NH투자증권 (Members: 2)
  - 롯데렌탈 (Members: 2)
-----------------------------------------

❌ Corporations with no audit committee or auditor found:
  - 롯데리츠
  - SK리츠
------------------

In [114]:
def separate_career_refined(career_string):
    """
    Split a newline-separated career text into education and work-experience parts.

    Each non-blank line is classified by keyword: a job keyword sends it to work
    experience even if an education keyword is also present; otherwise an education
    keyword sends it to education; lines with neither default to work experience.
    Blank lines are ignored.

    Args:
        career_string: Career text with one entry per line. Missing or non-string
            values yield (np.nan, np.nan).

    Returns:
        tuple: (education, work_experience), each a newline-joined string or np.nan
        when no line fell into that category.
    """
    # NaN/None and other non-text values have nothing to split.
    if not isinstance(career_string, str):
        return np.nan, np.nan

    education = []
    work_experience = []

    # Prioritized keywords
    job_keywords = ['교수', '총장', '강사', '연구원', '학장', '팀장', '실장', '감사', '대표', '회장', '이사']
    edu_keywords = ['학사', '석사', '박사', '대학교', '법학', '대학원', '졸업', '수료', 'Univ.', 'School', 'College', 'MBA', 'U.', 'Institute', 'University']

    for raw_item in career_string.split('\n'):
        item = raw_item.strip()
        # Blank lines would otherwise be stored as empty work-experience entries.
        if not item:
            continue

        # Step 1: job keywords win
        if any(keyword in item for keyword in job_keywords):
            work_experience.append(item)
        # Step 2: education keywords
        elif any(keyword in item for keyword in edu_keywords):
            education.append(item)
        # Step 3: everything else counts as work experience
        else:
            work_experience.append(item)

    return (
        '\n'.join(education) if education else np.nan,
        '\n'.join(work_experience) if work_experience else np.nan
    )

def _split_career(career_text):
    """Wrap the (education, work experience) pair as a Series so apply() yields two columns."""
    return pd.Series(separate_career_refined(career_text))


exec_df[['Education', 'Work Experience']] = exec_df['main_career'].apply(_split_career)

In [119]:
# Readable column titles for the exported executive table, keyed by the raw column name.
new_names = dict([
    ('rcept_no', 'Disclosure'),
    ('nm', 'Name'),
    ('sexdstn', 'Gender'),
    ('ofcps', 'Position'),
    ('rgist_exctv_at', 'Status'),
    ('chrg_job', 'Responsibilities'),
    ('mxmm_shrholdr_relate', 'Relation to Largest Shareholder'),
    ('hffc_pd', 'Employment Period'),
    ('trmend_posesn_stock_qota_rt', 'Stock Owned'),
])

# main_career has already been split into 'Education' / 'Work Experience', so it is dropped.
exec_df = exec_df.drop(columns=['main_career']).rename(columns=new_names)

In [121]:
# Write both processed tables to the output folder, creating it when missing.
# utf-8-sig adds a byte-order mark to the written files.
output_folder = 'data/processed'
os.makedirs(output_folder, exist_ok=True)

exec_file_path = os.path.join(output_folder, 'exec_df.csv')
summary_file_path = os.path.join(output_folder, 'summary_df.csv')

for _frame, _destination in ((exec_df, exec_file_path), (summary_df, summary_file_path)):
    _frame.to_csv(_destination, index=False, encoding='utf-8-sig')

In [124]:
# Display the total-compensation table loaded from the salary_total_data CSV.
salary_total_df

Unnamed: 0,rcept_no,corp_cls,corp_code,corp_name,nmpr,jan_avrg_mendng_am,mendng_totamt,rm,stlm_dt
0,20250313001154,Y,173944,우진,7,712026000,4984181000,-,2024-12-31
1,20250318001356,Y,109286,대동,7,427000000,2992000000,-,2024-12-31
2,20250313000953,Y,129350,삼화콘덴서공업,9,235095000,2115856000,-,2024-12-31
3,20250320001552,Y,144252,유니온,5,286000000,1431000000,-,2024-12-31
4,20250321000814,Y,303217,우진플라임,5,188070000,940351000,-,2024-12-31
...,...,...,...,...,...,...,...,...,...
865,20250320001708,Y,108746,DKME,6,67000000,403000000,-,2024-12-31
866,20250320001204,Y,1262032,롯데이노베이트,7,416000000,2911000000,-,2024-12-31
867,20250319000718,Y,136776,제이준코스메틱,10,18730000,187300000,-,2024-12-31
868,20250317000666,Y,114154,디와이덕양,10,167000000,1667000000,-,2024-12-31


In [11]:
# Reload the processed executive table from disk, replacing the in-memory exec_df.
# NOTE(review): this is an absolute path, while the export cell writes to the relative
# folder 'data/processed'; the two only match when the working directory is the notebooks folder.
exec_df = pd.read_csv(r'C:\Program Files\Git\OPENDART\governance_scoring_proj\notebooks\data\processed\exec_df.csv')