In [17]:
import logging
import os
from urllib.parse import quote_plus

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Configure root logging for the notebook. force=True replaces any handlers already attached to the
# root logger, so the level and format take effect even when this cell is re-run or when logging was
# already set up earlier in the kernel session; without it basicConfig silently does nothing then.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

In [None]:
# Move the working directory one level up from wherever the kernel started; the functions defined
# later take os.getcwd() as their default root directory.
# NOTE(review): the path is relative, so re-running this cell climbs another level each time —
# run it once per kernel session.
path = '..'
os.chdir(path)

In [None]:
# Root of the service endpoints (scheme omitted; the request code prepends "http://").
baseURL = 'apis.data.go.kr/1160100/service'

# Services to collect, each mapped to the list of operations requested from it.
getApis = {
    # Sole-proprietor services, currently disabled:
    # 'GetSBProfileInfoService': ['getOtlInfo', 'getCsdoStatus'],  # sole proprietor: basic information
    # 'GetSBBankingInfoService': ['getGrnBalInfo', 'getDpstLoanInfo'],  # sole proprietor
    # 'GetSBFinanceInfoService': ['getFnafInfo', 'getSlsInfo', 'getDbtInfo'],  # sole proprietor
    # Corporate basic information
    'GetCorpBasicInfoService_V2': ['getAffiliate_V2', 'getConsSubsComp_V2', 'getCorpOutline_V2'],
    # Corporate financial information
    'GetFinaStatInfoService_V2': ['getIncoStat_V2', 'getBs_V2', 'getSummFinaStat_V2'],
}


In [None]:
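# Examples of a full service URL (service root + service name), the form getApiData expects in
# baseURL when `apis` is passed as a list rather than a dict:
# baseURL = 'apis.data.go.kr/1160100/service/GetSBProfileInfoService'
# baseURL = 'apis.data.go.kr/1160100/service/GetCorpBasicInfoService_V2'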

In [4]:
# Confirm the working directory after the chdir above; it is the default root under which the
# dataset/ tree is written and read.
os.getcwd()

'c:\\Users\\Pro\\Documents\\workspace\\analysis_soleProprietor'

In [None]:
def _extract_items(payload) -> list:
    """Return the record list found at response -> body -> items -> item in ``payload``.

    Any level that is missing or is not a mapping yields an empty list instead of raising, and a
    single record delivered as a mapping is wrapped in a one-element list.
    """
    node = payload
    for field in ('response', 'body', 'items'):
        if not isinstance(node, dict):
            return []
        node = node.get(field)
    if not isinstance(node, dict):
        return []
    records = node.get('item')
    if isinstance(records, dict):
        return [records]
    return records if isinstance(records, list) else []


def _redact_secret(text: str, secret: str) -> str:
    """Mask ``secret`` (raw and URL-encoded) inside ``text`` so it is never written to the log."""
    for form in {secret, quote_plus(secret)}:
        text = text.replace(form, '***')
    return text


def getApiData(apis: dict | list, baseURL: str = baseURL, start: int = 0, rows: int = 10000, scope: int = 100, current_path: str | None = None, timeout: float | tuple = (10, 120)):
    """Download paged JSON results and store every non-empty page as a CSV file.

    Pages are requested from ``http://{baseURL}/{service}/{operation}`` and written to
    ``{current_path}/dataset/{service}/{operation}/{operation}_{page:03d}.csv``, where ``page`` is
    the zero-based page index (the API itself is called with ``pageNo = page + 1``).

    Args:
        apis: Either a dict mapping a service name to its list of operation names, or a list of
            operation names for a single service. In the list form ``baseURL`` must end with the
            service name; its last path segment is split off and used as the service.
        baseURL: Host and path of the service root, without scheme.
        start: Zero-based index of the first page to request.
        rows: Page size sent as ``numOfRows``; a page holding fewer records ends the operation.
        scope: Maximum number of pages to request per operation.
        current_path: Root directory for the ``dataset`` tree. Defaults to the working directory
            at call time.
        timeout: Timeout passed to each request, in seconds or as ``(connect, read)``.

    Returns:
        None. Failures are logged and abort only the operation being collected; collection then
        continues with the next operation. If the ``KEY_DECODE`` environment variable is unset
        the function logs an error and returns immediately.
    """
    # Resolve the default here rather than in the signature, where it would be frozen at the
    # moment the cell defining this function was executed.
    if current_path is None:
        current_path = os.getcwd()

    key = os.getenv('KEY_DECODE')
    if not key:
        logging.error("API key not found in environment variables.")
        return

    # Normalise both accepted shapes of `apis` to (service, [operations]) pairs.
    if isinstance(apis, dict):
        targets = list(apis.items())
    else:
        baseURL, _, service = baseURL.rstrip('/').rpartition('/')
        # Keep the operations together as ONE list; pairing the service with the bare list
        # would make the inner loop iterate over the characters of the first operation name.
        operations = [apis] if isinstance(apis, str) else list(apis)
        targets = [(service, operations)]

    end_page = start + scope

    # Session with automatic retries on transient server errors; closed when the block exits.
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    with requests.Session() as session:
        session.mount('http://', HTTPAdapter(max_retries=retries))

        for base_item, api_list in targets:
            for api_item in api_list or ():
                url = f"http://{baseURL}/{base_item}/{api_item}"
                save_path = os.path.join(current_path, 'dataset', base_item, api_item)
                logging.info(f"[START] Collecting: {url}")

                for page in range(start, end_page):
                    page_no = page + 1
                    params = {
                        "serviceKey": key,
                        "pageNo": page_no,
                        "numOfRows": rows,
                        "resultType": "json"
                    }

                    try:
                        response = session.get(url, params=params, timeout=timeout)
                        response.raise_for_status()
                    except requests.exceptions.RequestException as e:
                        # The exception text embeds the full request URL, including serviceKey.
                        logging.error(f"Request failed for {url} (page {page_no}): {_redact_secret(str(e), key)}")
                        break

                    logging.info(f"Collecting page {page_no}...")

                    try:
                        response_data = _extract_items(response.json())
                    except ValueError as e:
                        logging.error(f"Invalid JSON response from {url}: {e}")
                        break

                    is_last_page = len(response_data) < rows

                    if response_data:
                        try:
                            df = pd.DataFrame(response_data)
                        except ValueError as e:
                            logging.error(f"Error converting response to DataFrame on page {page_no}: {e}")
                            break

                        try:
                            os.makedirs(save_path, exist_ok=True)
                        except OSError as e:
                            logging.error(f"Failed to create directory {save_path}: {e}")
                            break

                        # File names keep the zero-based index so they line up with files
                        # written by earlier runs.
                        file_name = os.path.join(save_path, f"{api_item}_{str(page).zfill(3)}.csv")
                        try:
                            df.to_csv(file_name, index=False)
                            logging.info(f"Saved: {file_name}")
                        except OSError as e:
                            # Covers permission problems as well as e.g. a full disk.
                            logging.error(f"Failed to save file {file_name}: {e}")
                            break
                    else:
                        logging.info(f"Page {page_no} returned no data.")

                    # A page with fewer than `rows` records is treated as the last one.
                    if is_last_page:
                        logging.info(f"end of pages: less then {rows} rows.")
                        break

In [None]:
# Download every operation listed in getApis with the default paging (start=0, scope=100, rows=10000).
# NOTE(review): the saved output below begins at page 101, so it was presumably produced by an
# earlier call with start=100 — re-run to refresh it.
getApiData(getApis)

INFO:root:[START] Collecting: http://apis.data.go.kr/1160100/service/GetCorpBasicInfoService_V2/getAffiliate_V2
INFO:root:Collecting page 101...
INFO:root:Saved: c:\Users\Pro\Documents\workspace\analysis_soleProprietor\dataset\GetCorpBasicInfoService_V2\getAffiliate_V2\getAffiliate_V2_100.csv
INFO:root:Collecting page 102...
INFO:root:Saved: c:\Users\Pro\Documents\workspace\analysis_soleProprietor\dataset\GetCorpBasicInfoService_V2\getAffiliate_V2\getAffiliate_V2_101.csv
INFO:root:Collecting page 103...
INFO:root:Saved: c:\Users\Pro\Documents\workspace\analysis_soleProprietor\dataset\GetCorpBasicInfoService_V2\getAffiliate_V2\getAffiliate_V2_102.csv
INFO:root:Collecting page 104...
INFO:root:Saved: c:\Users\Pro\Documents\workspace\analysis_soleProprietor\dataset\GetCorpBasicInfoService_V2\getAffiliate_V2\getAffiliate_V2_103.csv
INFO:root:Collecting page 105...
INFO:root:Saved: c:\Users\Pro\Documents\workspace\analysis_soleProprietor\dataset\GetCorpBasicInfoService_V2\getAffiliate_V2\ge

In [None]:
def process_csv_files(getApis, current_path: str = os.getcwd()):
    # Process each path to handle CSV files
    for key, values in getApis.items():
        if not values:  # If no values, skip
            logging.warning(f"The value for '{key}' is empty.")
            continue

        for value in values:
            full_path = f'{current_path}/dataset/{key}/{value}'

            # Retrieve list of CSV files in the directory
            try:
                if not os.path.exists(full_path):
                    logging.warning(f"Directory {full_path} does not exist. Skipping this path.")
                    continue  # Skip if the directory does not exist

                file_list = sorted([f for f in os.listdir(full_path) if f.endswith('.csv')])
                if not file_list:
                    logging.warning(f"There are no CSV files in {full_path}.")
                    continue  # Skip if no CSV files exist in the directory
            except Exception as e:
                logging.error(f"An error occurred with the directory {full_path}: {e}")
                continue  # Skip if there's an error with the directory

            # List to hold DataFrames
            all_data = []
            # Iterate over each file and read the data
            for filename in file_list:
                try:
                    file_path = os.path.join(full_path, filename)
                    tmp = pd.read_csv(file_path)
                    if tmp.empty:  # Handle empty files
                        logging.warning(f"The file {filename} is empty. Skipping this file.")
                        continue  # Skip empty files
                    all_data.append(tmp)  # Add file to the list
                except Exception as e:
                    logging.error(f"An error occurred while reading the file {filename}: {e}")

            # Combine all DataFrames into one
            if all_data:
                try:
                    df = pd.concat(all_data, ignore_index=True)
                    output_path = os.path.join(full_path, "all.csv")
                    df.to_csv(output_path, index=False)
                    logging.info(f"Successfully combined all data into 'all.csv' in {full_path}.")
                except Exception as e:
                    logging.error(f"An error occurred while merging all data: {e}")
            else:
                logging.warning(f"There are no valid CSV files in {full_path}.")

In [22]:
# Merge the per-page CSV files of every operation in getApis into one all.csv per operation directory.
# NOTE(review): the saved output below is in Korean and does not match the current log messages,
# so it presumably predates the last edit of process_csv_files — re-run to refresh it.
process_csv_files(getApis)

INFO:root:c:\Users\Pro\Documents\workspace\analysis_soleProprietor/dataset/GetCorpBasicInfoService_V2/getConsSubsComp_V2에 모든 데이터를 합쳐서 'all.csv'로 저장 완료.
INFO:root:c:\Users\Pro\Documents\workspace\analysis_soleProprietor/dataset/GetCorpBasicInfoService_V2/getCorpOutline_V2에 모든 데이터를 합쳐서 'all.csv'로 저장 완료.
INFO:root:c:\Users\Pro\Documents\workspace\analysis_soleProprietor/dataset/GetFinaStatInfoService_V2/getIncoStat_V2에 모든 데이터를 합쳐서 'all.csv'로 저장 완료.
INFO:root:c:\Users\Pro\Documents\workspace\analysis_soleProprietor/dataset/GetFinaStatInfoService_V2/getBs_V2에 모든 데이터를 합쳐서 'all.csv'로 저장 완료.
