In [1]:
import os
from dotenv import load_dotenv

import json
import requests
import xmltodict

import numpy as np
import pandas as pd
import time
import datetime

In [2]:
# Pull key/value pairs from a local .env file into the process environment.
load_dotenv()

# The API key is read from the environment so it is never hardcoded in the
# notebook; the value is None when DATA_SEOUL_APIKEY is not set.
api_key = os.environ.get("DATA_SEOUL_APIKEY")

In [14]:
# Dataset to query and the base URL for it. The helper functions below read
# these module-level names and append "<start>/<end>/<date>" to service_url.
service_name = "CardBusTimeNew"
service_url = f"http://openapi.seoul.go.kr:8088/{api_key}/xml/{service_name}/"

In [3]:
# Convert an API response from XML into a dictionary
def response_to_dict(res):
    """Parse the XML body of a response into plain nested dictionaries.

    Args:
        res: Response-like object exposing the XML payload as ``res.text``.

    Returns:
        The parsed document. The xmltodict result is round-tripped through
        JSON so that only JSON-native containers (dict, list) and values
        remain.
    """
    parsed_xml = xmltodict.parse(res.text)
    serialized = json.dumps(parsed_xml)
    return json.loads(serialized)

In [4]:
# Fetch the total number of records available for the given date
def get_list_total_count(date):
    """Return how many records the service reports for ``date``.

    A single-row request (``1/1/<date>``) is issued only to read the
    ``list_total_count`` field of the response.

    Args:
        date: Date string appended to the request path, e.g. ``'202301'``
            or ``'20231101'``.

    Returns:
        The reported record count as an ``int``.

    Raises:
        KeyError: If the response has no element named after the
            module-level ``service_name``. The message carries the
            response's ``RESULT`` element when present (otherwise the whole
            parsed response) so the API's own error is visible.
    """
    url = service_url + '1/1/' + date
    # Without a timeout, requests can block forever on a stalled connection.
    res = requests.get(url, timeout=30)
    res_dict = response_to_dict(res)

    if service_name not in res_dict:
        # Same exception type as the plain lookup would raise, but with the
        # API's answer attached instead of just the missing key.
        raise KeyError(
            f"'{service_name}' missing from response for {date}; "
            f"API returned: {res_dict.get('RESULT', res_dict)}"
        )

    return int(res_dict[service_name]['list_total_count'])

In [9]:
# Build the list of paging ranges needed to cover the total record count
def get_call_range_list(list_total_count, call_capacity=1000):
    """Split ``1..list_total_count`` into consecutive request ranges.

    Args:
        list_total_count: Total number of records to cover. Zero or a
            negative value produces an empty list.
        call_capacity: Maximum number of records per request (default 1000).
            NOTE(review): the recorded run that requested ``1/5000/`` failed
            with a KeyError, which suggests the API rejects larger pages;
            confirm before raising this.

    Returns:
        A list of ``'<start>/<end>/'`` strings with 1-based inclusive
        bounds; the last range is shortened to end at ``list_total_count``.

    Raises:
        ValueError: If ``call_capacity`` is not positive.
    """
    if call_capacity <= 0:
        raise ValueError("call_capacity must be a positive integer")

    # Stepping the start index by the page size covers both the full pages
    # and the final partial page; min() clips that last page to the total.
    return [
        f'{start_index}/{min(start_index + call_capacity - 1, list_total_count)}/'
        for start_index in range(1, list_total_count + 1, call_capacity)
    ]

In [10]:
# Build the list of date strings covering the span between two dates
def get_date_str_list(from_date, to_date, freq='M'):
    """List every period from ``from_date`` to ``to_date`` as a string.

    Args:
        from_date: Start of the range, passed to ``pd.period_range``.
        to_date: End of the range, passed to ``pd.period_range``.
        freq: Period frequency. ``'D'`` formats each entry as ``YYYYMMDD``;
            any other value formats it as ``YYYYMM``.

    Returns:
        A list of formatted date strings, one per period in the range.
    """
    # Only daily ranges carry the day component.
    date_format = '%Y%m%d' if freq == 'D' else '%Y%m'
    timestamps = pd.period_range(from_date, to_date, freq=freq).to_timestamp()
    return [timestamp.strftime(date_format) for timestamp in timestamps]

In [11]:
# Fetch all data for one date by paging through multiple API calls
def fetch_data_for_date(date):
    """Collect every record the active service returns for ``date``.

    The total count is looked up first, then one request is made per paging
    range. Pages whose response lacks the service element or its ``row``
    entry are skipped rather than raising.

    Args:
        date: Date string appended to each request path.

    Returns:
        A list of record dictionaries; empty when the service reports zero
        records.
    """
    list_total_count = get_list_total_count(date)
    if list_total_count == 0:
        return []

    all_data = []
    for call_range in get_call_range_list(list_total_count):
        url = service_url + call_range + date
        # Without a timeout, requests can block forever on a stalled connection.
        res = requests.get(url, timeout=30)
        res_dict = response_to_dict(res)

        # Look the payload up under the module-level service_name so this
        # keeps working when the notebook switches to another dataset.
        payload = res_dict.get(service_name)
        if not isinstance(payload, dict) or 'row' not in payload:
            continue

        rows = payload['row']
        # xmltodict yields a single mapping (not a list) when a page holds
        # exactly one <row>; extending with it would append its keys.
        if isinstance(rows, dict):
            rows = [rows]
        all_data.extend(rows)

    return all_data

In [12]:

def get_result(service_name, date_list):
    """Download every record for each date in ``date_list``.

    For each date the total record count is fetched, the paging ranges are
    derived from it, and every page is requested in turn. Progress is
    printed as it goes.

    Note that request URLs are built from the module-level ``service_url``
    and the count comes from ``get_list_total_count``; ``service_name`` is
    only used to locate the payload in each response, so it must name the
    same service that ``service_url`` points at.

    Args:
        service_name: Name of the element holding the rows in each response.
        date_list: Iterable of date strings appended to the request path.

    Returns:
        A single list with the record dictionaries of all dates, in request
        order.

    Raises:
        KeyError: If a response lacks the ``service_name`` element (the
            message includes the response's ``RESULT`` element when present)
            or the service element has no ``row`` entry.
        AssertionError: If the rows collected for a date do not add up to
            the count reported for it.
    """
    result_list = []
    for date_str in date_list:
        list_total_count = get_list_total_count(date_str)
        call_range_list = get_call_range_list(list_total_count)
        print(f"Start for {date_str}({list_total_count}개)...")

        previous_result_len = len(result_list)
        for call_range_str in call_range_list:
            print(call_range_str)
            url = service_url + call_range_str + date_str

            # Without a timeout, requests can block forever on a stalled connection.
            res = requests.get(url, timeout=30)
            res_dict = response_to_dict(res)

            if service_name not in res_dict:
                # Same exception type as the plain lookup, but with the
                # API's own answer attached for diagnosis.
                raise KeyError(
                    f"'{service_name}' missing from response for "
                    f"{call_range_str}{date_str}; "
                    f"API returned: {res_dict.get('RESULT', res_dict)}"
                )

            rows = res_dict[service_name]['row']
            # xmltodict yields a single mapping (not a list) when a page
            # holds exactly one <row>; extending with it would append its keys.
            if isinstance(rows, dict):
                rows = [rows]
            result_list.extend(rows)

            # Brief pause between requests (presumably to go easy on the API).
            time.sleep(0.3)

        fetched_count = len(result_list) - previous_result_len
        assert fetched_count == list_total_count, (
            f"{date_str}: fetched {fetched_count} rows, expected {list_total_count}"
        )
        print(f"Well done for {date_str}")

    return result_list

In [20]:
# Monthly request keys (YYYYMM) from 2023-01 through 2024-04
date_str_list = get_date_str_list(from_date='202301', to_date='202404', freq='M')
print(date_str_list)

['202301', '202302', '202303', '202304', '202305', '202306', '202307', '202308', '202309', '202310', '202311', '202312', '202401', '202402', '202403', '202404']


In [21]:
# Download every page of records for each month in date_str_list
all_data = get_result(service_name=service_name, date_list=date_str_list)

Start for 202301(41062개)...
1/5000/


KeyError: 'CardBusTimeNew'

In [None]:
# Build a table from the downloaded monthly records
df = pd.DataFrame(data=all_data)

In [None]:
# Save the table as CSV, leaving out the index column
df.to_csv(path_or_buf='time_dataset.csv', index=False)

In [13]:
# Switch to the "CardBusStatisticsServiceNew" dataset. The helper functions
# read both module-level names, so they are reassigned together.
service_name = "CardBusStatisticsServiceNew"
service_url = f"http://openapi.seoul.go.kr:8088/{api_key}/xml/{service_name}/"

In [20]:
# Daily request keys (YYYYMMDD) from 2023-11-01 through 2024-04-30
date_list = get_date_str_list(from_date='20231101', to_date='20240430', freq='D')

In [21]:
# Download every page of records for each day in date_list
bus_list = get_result(service_name=service_name, date_list=date_list)

Start for 20231101(40764개)...
1/1000/
1001/2000/
2001/3000/
3001/4000/
4001/5000/
5001/6000/
6001/7000/
7001/8000/
8001/9000/
9001/10000/
10001/11000/
11001/12000/
12001/13000/
13001/14000/
14001/15000/
15001/16000/
16001/17000/
17001/18000/
18001/19000/
19001/20000/
20001/21000/
21001/22000/
22001/23000/
23001/24000/
24001/25000/
25001/26000/
26001/27000/
27001/28000/
28001/29000/
29001/30000/
30001/31000/
31001/32000/
32001/33000/
33001/34000/
34001/35000/
35001/36000/
36001/37000/
37001/38000/
38001/39000/
39001/40000/
40001/40764/
Well done for 20231101
Start for 20231102(40783개)...
1/1000/
1001/2000/
2001/3000/
3001/4000/
4001/5000/
5001/6000/
6001/7000/
7001/8000/
8001/9000/
9001/10000/
10001/11000/
11001/12000/
12001/13000/
13001/14000/
14001/15000/
15001/16000/
16001/17000/
17001/18000/
18001/19000/
19001/20000/
20001/21000/
21001/22000/
22001/23000/
23001/24000/
24001/25000/
25001/26000/
26001/27000/
27001/28000/
28001/29000/
29001/30000/
30001/31000/
31001/32000/
32001/33000/

In [22]:
# Build a table from the downloaded daily records
df = pd.DataFrame(data=bus_list)

In [23]:
# Save the table as CSV, leaving out the index column
df.to_csv(path_or_buf='date_dataset.csv', index=False)