# 데이터 수집
   
<br>

데이터는 현대 비즈니스에서 가장 중요한 자산 중 하나로 자리매김하고 있다. 데이터 분석을 통한 지표 확인 및 인공지능 모델 학습까지 데이터의 중요성은 계속해서 커지고 있다.
   
이를 위해 데이터 수집은 필수적인 단계로 자리잡고 있으며, 다양한 소스로부터의 데이터를 효과적으로 수집하는 것은 매우 중요하다.
   

수업 내용을 바탕으로 실습한 해당 Jupyter Notebook에서는 데이터 수집의 다양한 측면을 다루고 있다. 먼저, Web Request를 통한 데이터 수집 방법을 알아볼 것이다. 기본적인 Web API 호출 방법에 대해 소개하려고 한다.

   
수집된 데이터의 포멧에 대해서 현업에서 가장 많이 사용되는 JSON과 XML 형식에 대한 이해를 바탕으로, 각각의 특징을 살펴볼 것이다.

   
이어서는 Open API를 활용하여 공공데이터 포털에서 데이터를 수집하는 방법을 다룬다. 다양한 도메인의 데이터에 접근하여 실제 활용 가능한 정보로 변환하는 과정을 살펴볼 것이다.

   
마지막으로, 수집한 데이터를 어떻게 저장하고 관리할지에 대대해서 MongoDB를 활용해 데이터를 저장하는 과정 다루어보려고 한다. MongoDB는 유연한 구조와 확장성을 제공하여 다양한 형태의 데이터를 효과적으로 저장할 수 있는 데이터베이스이다.

<br>

## 1. Web Request
- 기본적인 Web API를 호출하는 방법을 다룬다.

## 2. 데이터 포멧
- JSON 형식과 XML 형식에 대해서 다룬다.

## 3. Open API
- 공공데이터 포털의 데이터를 수집한다.

## 4. MongoDB
- 수집한 데이터를 MongoDB에 저장해본다


<br>
<hr>
<br>

### Web Request

In [1]:
import requests
r = requests.post("http://httpbin.org/post", data={"name":"js"})

In [2]:
r.json()

{'args': {},
 'data': '',
 'files': {},
 'form': {'name': 'js'},
 'headers': {'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate, br',
  'Content-Length': '7',
  'Content-Type': 'application/x-www-form-urlencoded',
  'Host': 'httpbin.org',
  'User-Agent': 'python-requests/2.28.1',
  'X-Amzn-Trace-Id': 'Root=1-6563afc8-21bb015105ef0a6d6f0a5de4'},
 'json': None,
 'origin': '61.73.68.53',
 'url': 'http://httpbin.org/post'}

In [3]:
r.status_code

200

### XML

In [4]:
input = '''
<students>               # root 태그는 하나만 허용.
    <student x="1">      # 속성은 시작태그에 적고, key="value"로 구성. value는 따옴표.
        <id>001</id>     # 계층적으로 태그 구성.
        <name>Kim</name>
    </student>
    <student x="2">
        <id>002</id>
        <name>Lee</name>
    </student>
</students>
'''

In [5]:
import lxml.etree
root=lxml.etree.fromstring(input)

from io import StringIO
tree=lxml.etree.parse(StringIO(input))
root=tree.getroot()

In [6]:
root.tag #XML 트리의 최상위 노드의 tag

'students'

In [7]:
#XML 트리에 어떤 태그가 있는지 확인
for e in root: 
    print(e.tag)

student
student


In [8]:
root.getchildren() #하위 요소를 모두 찾아 리스트로 만들어줌

[<Element student at 0x7fc6802f1a40>, <Element student at 0x7fc6802f13c0>]

In [9]:
root.getchildren()[1].attrib #2번째 student의 속성의 값 -> Dictionary

{'x': '2'}

In [10]:
root.getchildren()[1].attrib['x'] #Dictionary의 key를 통해서 value를 얻어옴

'2'

In [11]:
(root.getchildren()[1]).getchildren()[0].text #2번째 student의 id, name 중 첫번째 value

'002'

In [12]:
(root.getchildren()[1]).getchildren()[1].text #2번째 student의 id, name 중 두번째 value

'Lee'

In [13]:
for ee in root.getchildren():    # 하위 요소를 읽음
    for e in ee.getchildren():   # 하위의 하위 요소를 읽음
        if not e.text:
            text = "None"
        else:
            text = e.text
        print(e.tag + " => " + text)

id => 001
name => Kim
id => 002
name => Lee


### XML - XPath
xml의 데이터 추출 방식에 xpath, css, 정규식 등이 있다. 


In [14]:
root.xpath('//@x') #//@는 어느 수준에 있든지 x 속성을 검색해서 Value를 추출한다

['1', '2']

In [15]:
root.xpath('//*[@x="1"]/id/text()') 
#사각괄호를 사용하게 되면, 그 조건에 맞는 요소만 찾아낸다. [@x="1"]는 속성 x=1인 어떤 요소라도 (와일드카드 * 의미) 찾아내라는 의미이다.

['001']

### XML - CSS

In [16]:
from lxml.cssselect import CSSSelector

sel = CSSSelector('student') # construct a CSS Selector

print(sel)
print(sel.css)
print(sel.path)

<CSSSelector 7fc680308190 for 'student'>
student
descendant-or-self::student


### 인코딩

In [17]:
# [승인] 도로교통공단_사망교통사고정보서비스
# 신청일 2023-09-22 만료예정일 2025-09-22

In [18]:
import urllib

print (urllib.parse.urlencode({'#q':'한글'}))
print (urllib.parse.urlencode({'#q':u'한글'.encode('utf-8')}))

%23q=%ED%95%9C%EA%B8%80
%23q=%ED%95%9C%EA%B8%80


### 인증키 작성하기

In [19]:
# %%writefile src/key.properties
# dataseoul=YOUR_DS_KEY
# gokr=YOUR_GO_KEY
# mongo=YOUR_MONGO_KEY

### 라이브러리 만들기(키를 얻어내는 기능)

In [20]:
%%writefile src/mylib.py

def getKey(keyPath):
    d=dict()
    f=open(keyPath,'r')
    for line in f.readlines():
        row=line.split('=')
        row0=row[0]
        d[row0]=row[1].strip()
    return d

Overwriting src/mylib.py


In [21]:
import os
import src.mylib

keyPath=os.path.join(os.getcwd(), 'src', 'key.properties')
key=src.mylib.getKey(keyPath)

mongo_pwd = key['mongo']
mongo_uri = f'mongodb+srv://sjh9708:{mongo_pwd}@mycluster.tfiobq6.mongodb.net/?retryWrites=true&w=majority'
# print (key['dataseoul'])
# print(mongo_uri)

### JSON


In [22]:
#List of Dictionary

input = '''[
    { "id" : "001", "x" : "2", "name" : "Chuck"},
    { "id" : "009","x" : "7","name" : "Brent" }
]''' 

In [23]:
import json

info = json.loads(input) #String to JSON
info

[{'id': '001', 'x': '2', 'name': 'Chuck'},
 {'id': '009', 'x': '7', 'name': 'Brent'}]

In [24]:
info[0]

{'id': '001', 'x': '2', 'name': 'Chuck'}

In [25]:
info[0]['id']

'001'

In [26]:
print(f'User Count: {len(info)}')

User Count: 2


In [27]:
# Python Dictionary to JSON
import json
my=json.dumps(
    ['foo', {'bar': ('baz', None, 1.0, 2)}]
)

In [28]:
type(my)

str

### Request로 IP 주소를 통해 지역정보 알아오기

In [29]:
import requests

url='http://api.ipstack.com/125.176.129.177?access_key=d7cdd0602f68e39767eba8a0e245e849'

r=requests.get(url)

r

<Response [200]>

In [30]:
r.text

'{"ip": "125.176.129.177", "type": "ipv4", "continent_code": "AS", "continent_name": "Asia", "country_code": "KR", "country_name": "South Korea", "region_code": "11", "region_name": "Seoul", "city": "Seoul", "zip": "100-011", "latitude": 37.56100082397461, "longitude": 126.98265075683594, "location": {"geoname_id": 1835848, "capital": "Seoul", "languages": [{"code": "ko", "name": "Korean", "native": "\\ud55c\\uad6d\\uc5b4"}], "country_flag": "https://assets.ipstack.com/flags/kr.svg", "country_flag_emoji": "\\ud83c\\uddf0\\ud83c\\uddf7", "country_flag_emoji_unicode": "U+1F1F0 U+1F1F7", "calling_code": "82", "is_eu": false}}'

In [31]:
geo = json.loads(r.text)
geo

{'ip': '125.176.129.177',
 'type': 'ipv4',
 'continent_code': 'AS',
 'continent_name': 'Asia',
 'country_code': 'KR',
 'country_name': 'South Korea',
 'region_code': '11',
 'region_name': 'Seoul',
 'city': 'Seoul',
 'zip': '100-011',
 'latitude': 37.56100082397461,
 'longitude': 126.98265075683594,
 'location': {'geoname_id': 1835848,
  'capital': 'Seoul',
  'languages': [{'code': 'ko', 'name': 'Korean', 'native': '한국어'}],
  'country_flag': 'https://assets.ipstack.com/flags/kr.svg',
  'country_flag_emoji': '🇰🇷',
  'country_flag_emoji_unicode': 'U+1F1F0 U+1F1F7',
  'calling_code': '82',
  'is_eu': False}}

In [32]:
print(geo.get('ip'))
print(geo['ip'])
print(geo.get('location').get('capital'))
print(geo['location']['capital'])

125.176.129.177
125.176.129.177
Seoul
Seoul


### 공공데이터

In [33]:
# 열린데이터포털은 활용신청 필요없고, 공공데이터포털은 활용신청 받아야 함

In [34]:
import urllib

# URL 인코딩 : URLdms 128 ASCII 문자만 받도록 되어있다. 한글의 경우 퍼센트 인코딩으로 표현된다.
print (urllib.parse.urlencode({'#q':'한글'}))
print (urllib.parse.urlencode({'#q':u'한글'.encode('utf-8')}))

%23q=%ED%95%9C%EA%B8%80
%23q=%ED%95%9C%EA%B8%80


In [35]:
# 인증키 읽기

import os
import src.mylib

keyPath=os.path.join(os.getcwd(), 'src', 'key.properties')
key=src.mylib.getKey(keyPath)

# print (key['dataseoul'])
# print (key['gokr'])

서울시 지하철역 관련 정보를 수집해보자. 열린데이터 사이트에서 "서울교통공사 노선별 지하철역 정보"을 찾아보면, 그러한 정보를 제공하는 API 'SearchSTNBySubwayLineInfo'가 존재한다.

In [36]:
# 서울시 지하철역 정보 수집
# https://data.seoul.go.kr/dataList/OA-15442/S/1/datasetView.do
# 샘플URL : http://openapi.seoul.go.kr:8088/(인증키)/xml/SearchSTNBySubwayLineInfo/1/5/


dataseoul_key = key['dataseoul']
url=f'http://openapi.seoul.go.kr:8088/{dataseoul_key}/xml/SearchSTNBySubwayLineInfo/1/5/'

r=requests.get(url)

r.text # 지하철 정보 XML



'<?xml version="1.0" encoding="UTF-8"?>\n<SearchSTNBySubwayLineInfo>\n<list_total_count>777</list_total_count>\n<RESULT>\n<CODE>INFO-000</CODE>\n<MESSAGE>정상 처리되었습니다</MESSAGE>\n</RESULT>\n<row>\n<STATION_CD>1726</STATION_CD>\n<STATION_NM>직산</STATION_NM>\n<STATION_NM_ENG>Jiksan</STATION_NM_ENG>\n<LINE_NUM>01호선</LINE_NUM>\n<FR_CODE>P167</FR_CODE>\n<STATION_NM_CHN>稷山</STATION_NM_CHN>\n<STATION_NM_JPN>稷山</STATION_NM_JPN>\n</row>\n<row>\n<STATION_CD>1725</STATION_CD>\n<STATION_NM>성환</STATION_NM>\n<STATION_NM_ENG>Seonghwan</STATION_NM_ENG>\n<LINE_NUM>01호선</LINE_NUM>\n<FR_CODE>P166</FR_CODE>\n<STATION_NM_CHN>成歡</STATION_NM_CHN>\n<STATION_NM_JPN>成歡</STATION_NM_JPN>\n</row>\n<row>\n<STATION_CD>1724</STATION_CD>\n<STATION_NM>평택</STATION_NM>\n<STATION_NM_ENG>Pyeongtaek</STATION_NM_ENG>\n<LINE_NUM>01호선</LINE_NUM>\n<FR_CODE>P165</FR_CODE>\n<STATION_NM_CHN>平澤</STATION_NM_CHN>\n<STATION_NM_JPN>平澤</STATION_NM_JPN>\n</row>\n<row>\n<STATION_CD>1723</STATION_CD>\n<STATION_NM>평택지제</STATION_NM>\n<STATION_NM

In [37]:


#Parameter
KEY = key['dataseoul']
TYPE = 'json'
SERVICE = 'SearchSTNBySubwayLineInfo'
START_INDEX = str(0)
END_INDEX=str(20)
LINE_NUM=str(2)

params="/".join([KEY,TYPE,SERVICE,START_INDEX,END_INDEX,'','',LINE_NUM])
print(params[30:])


#Url
_url='http://openapi.seoul.go.kr:8088'
# url=urllib.parse.urljoin(_url, params) 인자가 합성되지만 생략된 인자가 생략됨. 따라서 urllib을 사용하면 제대로 검색할 수 없음
url = "/".join([_url, params])
# print(url)


r=requests.get(url)

stations = json.loads(r.text)
stations # 지하철 정보 Json

/json/SearchSTNBySubwayLineInfo/0/20///2


{'SearchSTNBySubwayLineInfo': {'list_total_count': 777,
  'RESULT': {'CODE': 'INFO-000', 'MESSAGE': '정상 처리되었습니다'},
  'row': [{'STATION_CD': '1726',
    'STATION_NM': '직산',
    'STATION_NM_ENG': 'Jiksan',
    'LINE_NUM': '01호선',
    'FR_CODE': 'P167',
    'STATION_NM_CHN': '稷山',
    'STATION_NM_JPN': '稷山'},
   {'STATION_CD': '1725',
    'STATION_NM': '성환',
    'STATION_NM_ENG': 'Seonghwan',
    'LINE_NUM': '01호선',
    'FR_CODE': 'P166',
    'STATION_NM_CHN': '成歡',
    'STATION_NM_JPN': '成歡'},
   {'STATION_CD': '1724',
    'STATION_NM': '평택',
    'STATION_NM_ENG': 'Pyeongtaek',
    'LINE_NUM': '01호선',
    'FR_CODE': 'P165',
    'STATION_NM_CHN': '平澤',
    'STATION_NM_JPN': '平澤'},
   {'STATION_CD': '1723',
    'STATION_NM': '평택지제',
    'STATION_NM_ENG': 'PyeongtaekJije',
    'LINE_NUM': '01호선',
    'FR_CODE': 'P164',
    'STATION_NM_CHN': '芝制',
    'STATION_NM_JPN': '芝制'},
   {'STATION_CD': '1722',
    'STATION_NM': '서정리',
    'STATION_NM_ENG': 'Seojeong-ri',
    'LINE_NUM': '01호선',
    '

In [38]:
for e in stations['SearchSTNBySubwayLineInfo']['row']:
    print ("{0:4s}\t{1:15s}\t{2:3s}\t{3:2s}".format(e['STATION_CD'], e['STATION_NM'], e['FR_CODE'], e['LINE_NUM']))

1726	직산             	P167	01호선
1725	성환             	P166	01호선
1724	평택             	P165	01호선
1723	평택지제           	P164	01호선
1722	서정리            	P163	01호선
1721	송탄             	P162	01호선
1718	오산대            	P159	01호선
1717	세마             	P158	01호선
0152	종각             	131	01호선
0151	시청             	132	01호선
1716	병점             	P157	01호선
1715	세류             	P156	01호선
1714	독산             	P143	01호선
1401	봉명             	P170	01호선
1402	쌍용             	P171	01호선
1403	아산             	P172	01호선
1404	탕정             	P173	01호선
1405	배방             	P174	01호선
1407	온양온천           	P176	01호선
1408	신창             	P177	01호선


### 지하철 스크립트 저장


In [39]:
%%writefile src/ds_open_subway_json.py
#!/usr/bin/env python
# coding: utf-8
import os
import requests
import urllib
import mylib # NO! src.mylib -> 상대적 디렉터리이기 때문. src 밑에 저장하기 때문

def doIt():
    keyPath=os.path.join(os.getcwd(), 'src', 'key.properties')
    key=mylib.getKey(keyPath)
    
    # (1) make params with resource IDs
    KEY=str(key['dataseoul'])
    TYPE='json'
    SERVICE='SearchSTNBySubwayLineInfo'
    LINE_NUM=str(2)
    START_INDEX=str(1)
    END_INDEX=str(10)
    
    #params=os.path.join(KEY,TYPE,SERVICE,START_INDEX,END_INDEX,LINE_NUM)
    params="/".join([KEY,TYPE,SERVICE,START_INDEX,END_INDEX,'','',LINE_NUM])
    
    # (2) make a full url
    _url='http://openAPI.seoul.go.kr:8088' #NOTE slash: do not use 'http://openAPI.seoul.go.kr:8088/'
    #url=urllib.parse.urljoin(_url,params)
    url="/".join([_url,params])
    #print(url)
    
    # (3) get data
    r=requests.get(url)
    #print(r)
    stations=r.json()
    #print(stations)
    for e in stations['SearchSTNBySubwayLineInfo']['row']:
        print (u"{0:4.0f}\t{1:15s}\t{2:3s}\t{3:2s}".format(e['STATION_CD'], e['STATION_NM'], e['FR_CODE'], e['LINE_NUM']))

if __name__ == "__main__":
    doIt()

Overwriting src/ds_open_subway_json.py


In [40]:
%%writefile src/ds_open_subway_iter_json.py
#!/usr/bin/env python
# coding: utf-8
import os
import requests
import urllib
import mylib # NO! src.mylib

def doIt():
    keyPath=os.path.join(os.getcwd(), 'src', 'key.properties')
    key=mylib.getKey(keyPath)
    # (1) make params with resource IDs
    KEY=str(key['dataseoul'])
    TYPE='json'
    #OLD: SERVICE='SearchSTNBySubwayLineService'
    SERVICE='SearchSTNBySubwayLineInfo'
    LINE_NUM=str(2)
    START_INDEX=str(1)
    END_INDEX=str(10)
    startIndex=1
    endIndex=10
    list_total_count=0    # set later
    while True:
        START_INDEX=str(startIndex)
        END_INDEX=str(endIndex)
        #params=os.path.join(KEY,TYPE,SERVICE,START_INDEX,END_INDEX,LINE_NUM)
        params="/".join([KEY,TYPE,SERVICE,START_INDEX,END_INDEX,'','',LINE_NUM])
        # (2) make a full url
        _url='http://openAPI.seoul.go.kr:8088' #NOTE slash: do not use 'http://openAPI.seoul.go.kr:8088/'
        #url=urllib.parse.urljoin(_url,params)
        url="/".join([_url,params])
        #print(url)
        # (3) get data
        r=requests.get(url)
        #print(r)
        stations=r.json()
        #print(stations)
        if(startIndex==1):
            list_total_count=stations['SearchSTNBySubwayLineInfo']['list_total_count']
            print("- Total Count: ", list_total_count)
        if(stations["SearchSTNBySubwayLineInfo"]["RESULT"]["CODE"]=="INFO-000"):
            for e in stations['SearchSTNBySubwayLineInfo']['row']:
                print (u"{0:3d}\t{1:4.0f}\t{2:15s}\t{3:3s}\t{4:2s}".format(
                    startIndex, e['STATION_CD'], e['STATION_NM'], e['FR_CODE'], e['LINE_NUM']
                ))
            startIndex+=10
            endIndex+=10
        else:
            print("Error: "+str(startIndex))
        if(endIndex > list_total_count):
            print("----- Ending endIndex=",endIndex)
            break  # exit from the while loop

if __name__ == "__main__":
    doIt()
    


Overwriting src/ds_open_subway_iter_json.py


In [41]:
!python3 src/ds_open_subway_iter_json.py

- Total Count:  777
Traceback (most recent call last):
  File "/Users/sojaehwi/Documents/GitHub/LECTURE_Bigdata/src/ds_open_subway_iter_json.py", line 54, in <module>
    doIt()
  File "/Users/sojaehwi/Documents/GitHub/LECTURE_Bigdata/src/ds_open_subway_iter_json.py", line 42, in doIt
    print (u"{0:3d}\t{1:4.0f}\t{2:15s}\t{3:3s}\t{4:2s}".format(
ValueError: Unknown format code 'f' for object of type 'str'


### MongoDB


In [42]:
import os
import src.mylib

keyPath=os.path.join(os.getcwd(), 'src', 'key.properties')
key=src.mylib.getKey(keyPath)

mongo_pwd = key['mongo']
uriCloud = f'mongodb+srv://sjh9708:{mongo_pwd}@mycluster.tfiobq6.mongodb.net/?retryWrites=true&w=majority'
# print(uriCloud)

In [43]:
import pymongo

Client = pymongo.MongoClient(uriCloud)

In [44]:
db=Client.myDB

In [45]:
_id=1
_name='js'
_age=22
_country='ko'

db.myPyCol.insert_one({
    "id": _id,
    "name": _name,
    "age": _age,
    "country": _country
})

<pymongo.results.InsertOneResult at 0x7fc698305940>

In [46]:
print(Client.list_database_names())

['ds_open_subwayPassengersDb', 'ds_sanggwun', 'midterm', 'myDB', 'myDB4', 'admin', 'local']


In [47]:
print(db.list_collection_names())

['myPyCol']


### 문제 3: 행정동별 서울생활인구 (단기체류 외국인)


In [48]:
# https://data.seoul.go.kr/dataList/OA-14993/S/1/datasetView.do


In [49]:
import os
import src.mylib
keyPath=os.path.join(os.getcwd(), 'src', 'key.properties')
key=src.mylib.getKey(keyPath)
KEY=str(key['dataseoul'])

TYPE='xml'
SERVICE='SPOP_FORN_TEMP_RESD_DONG'
START_INDEX=str(1)
END_INDEX=str(5)
STDR_DE_ID=str(20200617)

#params=os.path.join(KEY,TYPE,SERVICE,START_INDEX,END_INDEX)
params="/".join([KEY,TYPE,SERVICE,START_INDEX,END_INDEX,STDR_DE_ID])

print (params[30:])  # 인증키(30자리)를 제거하고 출력한다

/xml/SPOP_FORN_TEMP_RESD_DONG/1/5/20200617


In [50]:
import urllib
_url='http://openapi.seoul.go.kr:8088/'
url=urllib.parse.urljoin(_url,params)

# print(url)
# http://openapi.seoul.go.kr:8088/MY_KEY/xml/SPOP_FORN_TEMP_RESD_DONG/1/5/20200617

In [51]:
import requests

data=requests.get(url).text

print (data)

<?xml version="1.0" encoding="UTF-8"?>
<SPOP_FORN_TEMP_RESD_DONG>
<list_total_count>9849</list_total_count>
<RESULT>
<CODE>INFO-000</CODE>
<MESSAGE>정상 처리되었습니다</MESSAGE>
</RESULT>
<row>
<STDR_DE_ID>20200617</STDR_DE_ID>
<TMZON_PD_SE>00</TMZON_PD_SE>
<ADSTRD_CODE_SE>11110515</ADSTRD_CODE_SE>
<TOT_LVPOP_CO>248.866</TOT_LVPOP_CO>
<CHINA_STAYPOP_CO>86.4335</CHINA_STAYPOP_CO>
<ETC_STAYPOP_CO>162.432</ETC_STAYPOP_CO>
</row>
<row>
<STDR_DE_ID>20200617</STDR_DE_ID>
<TMZON_PD_SE>00</TMZON_PD_SE>
<ADSTRD_CODE_SE>11110530</ADSTRD_CODE_SE>
<TOT_LVPOP_CO>71.2628</TOT_LVPOP_CO>
<CHINA_STAYPOP_CO>0</CHINA_STAYPOP_CO>
<ETC_STAYPOP_CO>71.2627</ETC_STAYPOP_CO>
</row>
<row>
<STDR_DE_ID>20200617</STDR_DE_ID>
<TMZON_PD_SE>00</TMZON_PD_SE>
<ADSTRD_CODE_SE>11110540</ADSTRD_CODE_SE>
<TOT_LVPOP_CO>150.1777</TOT_LVPOP_CO>
<CHINA_STAYPOP_CO>86.2246</CHINA_STAYPOP_CO>
<ETC_STAYPOP_CO>63.9532</ETC_STAYPOP_CO>
</row>
<row>
<STDR_DE_ID>20200617</STDR_DE_ID>
<TMZON_PD_SE>00</TMZON_PD_SE>
<ADSTRD_CODE_SE>11110550</ADST

In [52]:
import lxml
import lxml.etree

tree = lxml.etree.fromstring(data.encode('utf-8'))
nodes = tree.xpath('//TOT_LVPOP_CO') #속성
print(nodes)

for node in nodes:
    print(node.text)


[<Element TOT_LVPOP_CO at 0x7fc6a0b97740>, <Element TOT_LVPOP_CO at 0x7fc6c05ac8c0>, <Element TOT_LVPOP_CO at 0x7fc6c05ac1c0>, <Element TOT_LVPOP_CO at 0x7fc6c059e840>, <Element TOT_LVPOP_CO at 0x7fc6c059ecc0>]
248.866
71.2628
150.1777
136.7137
174.824


In [53]:
%%writefile src/ds_open_foreigners_xml.py
import os
import urllib
import requests
import lxml
import lxml.etree
import mylib #from src import mylib

def doIt():
    keyPath=os.path.join(os.getcwd(), 'src', 'key.properties')
    key=mylib.getKey(keyPath)
    KEY=str(key['dataseoul'])
    TYPE='xml'
    SERVICE='SPOP_FORN_TEMP_RESD_DONG'
    START_INDEX=str(1)
    END_INDEX=str(10)
    STDR_DE_ID=str(20200617)
    #params="/".join([KEY,TYPE,SERVICE,START_INDEX,END_INDEX,STDR_DE_ID])

    #_url='http://openAPI.seoul.go.kr:8088/'
    _url='http://openAPI.seoul.go.kr:8088' # NOTE: the slash at the end removed
    #url=urllib.parse.urljoin(_url,params)
    url="/".join([_url,KEY,TYPE,SERVICE,START_INDEX,END_INDEX,STDR_DE_ID])
    data=requests.get(url).text
    #tree=lxml.etree.parse(StringIO.StringIO(data.encode('utf-8')))
    tree=lxml.etree.fromstring(data.encode('utf-8'))
    nodes=tree.xpath('//TOT_LVPOP_CO')
    for node in nodes:
        print (node.text)

if __name__ == "__main__":
    doIt()

Overwriting src/ds_open_foreigners_xml.py


In [54]:
!python3 src/ds_open_foreigners_xml.py

248.866
71.2628
150.1777
136.7137
174.824
0.4833
0.2163
35.2093
756.5433
87.6117


### 문제 4: 서울시 지하철역별 월별 승하차인원 구하기


In [55]:
import os
import src.mylib as mylib

keyPath = os.path.join(os.getcwd(), 'src', 'key.properties')
key = mylib.getKey(keyPath)
# key

In [56]:
_url='http://openAPI.seoul.go.kr:8088'
_key=str(key['dataseoul'])
_type='json'
_service='CardSubwayStatsNew'
_start_index=1
_end_index=5
_use_dt='20220801'

#### URL을 만드는 세 가지 방법

In [57]:
import urllib

_api0=os.path.join(_url,_key,_type,_service,str(_start_index),str(_end_index),_use_dt)
_api1="/".join([_url,_key,_type,_service,str(_start_index),str(_end_index),_use_dt])
_api2=urllib.parse.urljoin(_url,"/".join([_key,_type,_service,str(_start_index),str(_end_index),_use_dt]))
                           
# print("api0: ", _api0)
# print("api1: ", _api1)
# print("api2: ", _api2)
# api0:  http://openAPI.seoul.go.kr:8088/MY_KEY/json/CardSubwayStatsNew/1/5/20220801
# api1:  http://openAPI.seoul.go.kr:8088/MY_KEY/json/CardSubwayStatsNew/1/5/20220801
# api2:  http://openAPI.seoul.go.kr:8088/MY_KEY/json/CardSubwayStatsNew/1/5/20220801

#### 데이터 수집

In [58]:
import requests
response = requests.get(_api1)

print(response.text)

{"CardSubwayStatsNew":{"list_total_count":605,"RESULT":{"CODE":"INFO-000","MESSAGE":"정상 처리되었습니다"},"row":[{"USE_DT":"20220801","LINE_NUM":"1호선","SUB_STA_NM":"서울역","RIDE_PASGR_NUM":47711.0,"ALIGHT_PASGR_NUM":45432.0,"WORK_DT":"20220804"},{"USE_DT":"20220801","LINE_NUM":"1호선","SUB_STA_NM":"시청","RIDE_PASGR_NUM":21830.0,"ALIGHT_PASGR_NUM":22357.0,"WORK_DT":"20220804"},{"USE_DT":"20220801","LINE_NUM":"1호선","SUB_STA_NM":"종각","RIDE_PASGR_NUM":35945.0,"ALIGHT_PASGR_NUM":35727.0,"WORK_DT":"20220804"},{"USE_DT":"20220801","LINE_NUM":"1호선","SUB_STA_NM":"종로3가","RIDE_PASGR_NUM":19983.0,"ALIGHT_PASGR_NUM":18539.0,"WORK_DT":"20220804"},{"USE_DT":"20220801","LINE_NUM":"1호선","SUB_STA_NM":"종로5가","RIDE_PASGR_NUM":18002.0,"ALIGHT_PASGR_NUM":18063.0,"WORK_DT":"20220804"}]}}


#### JSON 파싱 (문자열 -> JSON)

In [59]:
if response.status_code == 200:
    data = response.json()  # JSON 파싱
else:
    print("오류:", response.status_code)
    
sub=response.json()

print(sub)

{'CardSubwayStatsNew': {'list_total_count': 605, 'RESULT': {'CODE': 'INFO-000', 'MESSAGE': '정상 처리되었습니다'}, 'row': [{'USE_DT': '20220801', 'LINE_NUM': '1호선', 'SUB_STA_NM': '서울역', 'RIDE_PASGR_NUM': 47711.0, 'ALIGHT_PASGR_NUM': 45432.0, 'WORK_DT': '20220804'}, {'USE_DT': '20220801', 'LINE_NUM': '1호선', 'SUB_STA_NM': '시청', 'RIDE_PASGR_NUM': 21830.0, 'ALIGHT_PASGR_NUM': 22357.0, 'WORK_DT': '20220804'}, {'USE_DT': '20220801', 'LINE_NUM': '1호선', 'SUB_STA_NM': '종각', 'RIDE_PASGR_NUM': 35945.0, 'ALIGHT_PASGR_NUM': 35727.0, 'WORK_DT': '20220804'}, {'USE_DT': '20220801', 'LINE_NUM': '1호선', 'SUB_STA_NM': '종로3가', 'RIDE_PASGR_NUM': 19983.0, 'ALIGHT_PASGR_NUM': 18539.0, 'WORK_DT': '20220804'}, {'USE_DT': '20220801', 'LINE_NUM': '1호선', 'SUB_STA_NM': '종로5가', 'RIDE_PASGR_NUM': 18002.0, 'ALIGHT_PASGR_NUM': 18063.0, 'WORK_DT': '20220804'}]}}


#### 데이터 추출

In [60]:
rows = sub['CardSubwayStatsNew']['row']

In [61]:
for row in rows:
    print(row.get('SUB_STA_NM'))
    print(row['SUB_STA_NM'])

서울역
서울역
시청
시청
종각
종각
종로3가
종로3가
종로5가
종로5가


In [62]:
for row in rows:
    sum_ridenum = float(row['RIDE_PASGR_NUM'] + row['RIDE_PASGR_NUM'])
    print ("{0:5s}\t{1:10s}\t{2:9.1f}\t{3:9.1f}\t{4:9.1f}"
           .format(row['LINE_NUM'],row['SUB_STA_NM'],row['RIDE_PASGR_NUM'],row['ALIGHT_PASGR_NUM'], sum_ridenum))


1호선  	서울역       	  47711.0	  45432.0	  95422.0
1호선  	시청        	  21830.0	  22357.0	  43660.0
1호선  	종각        	  35945.0	  35727.0	  71890.0
1호선  	종로3가      	  19983.0	  18539.0	  39966.0
1호선  	종로5가      	  18002.0	  18063.0	  36004.0


### 문제 5: 서울시 지하철 호선별 역별 시간대별 승하차 인원 정보

In [63]:
import os
import src.mylib as mylib

keyPath = os.path.join(os.getcwd(), 'src', 'key.properties')
key = mylib.getKey(keyPath)
# key

#### XML

In [64]:
_url='http://openAPI.seoul.go.kr:8088'
_key=str(key['dataseoul'])
_type='xml'
_service='CardSubwayTime'
_start_index=1
_end_index=5
_use_mon='202106'

In [65]:
import requests

#여기서는 INDEX를 통해 1회에 5건씩, iter를 통해 1회 반복해서 총 5건의 데이터를 가져오고 있다. 
#iter를 크게 또는 작게 해서 데이터 수집량을 조정할 수 있다. 각 레코드는 시간대별 승하차인원을 포함하고 있다.

_maxIter=2 #몇페이지까지?
_iter=0
while _iter<_maxIter:
    _api="/".join([_url,_key,_type,_service,str(_start_index),str(_end_index),_use_mon])
    #print _api
    response = requests.get(_api).text
    print(response)
    _start_index+=5
    _end_index+=5
    _iter+=1

<?xml version="1.0" encoding="UTF-8"?>
<CardSubwayTime>
<list_total_count>608</list_total_count>
<RESULT>
<CODE>INFO-000</CODE>
<MESSAGE>정상 처리되었습니다</MESSAGE>
</RESULT>
<row>
<USE_MON>202106</USE_MON>
<LINE_NUM>1호선</LINE_NUM>
<SUB_STA_NM>서울역</SUB_STA_NM>
<FOUR_RIDE_NUM>654</FOUR_RIDE_NUM>
<FOUR_ALIGHT_NUM>17</FOUR_ALIGHT_NUM>
<FIVE_RIDE_NUM>9008</FIVE_RIDE_NUM>
<FIVE_ALIGHT_NUM>6400</FIVE_ALIGHT_NUM>
<SIX_RIDE_NUM>12474</SIX_RIDE_NUM>
<SIX_ALIGHT_NUM>37203</SIX_ALIGHT_NUM>
<SEVEN_RIDE_NUM>37253</SEVEN_RIDE_NUM>
<SEVEN_ALIGHT_NUM>91875</SEVEN_ALIGHT_NUM>
<EIGHT_RIDE_NUM>59876</EIGHT_RIDE_NUM>
<EIGHT_ALIGHT_NUM>187805</EIGHT_ALIGHT_NUM>
<NINE_RIDE_NUM>44619</NINE_RIDE_NUM>
<NINE_ALIGHT_NUM>118679</NINE_ALIGHT_NUM>
<TEN_RIDE_NUM>42611</TEN_RIDE_NUM>
<TEN_ALIGHT_NUM>57710</TEN_ALIGHT_NUM>
<ELEVEN_RIDE_NUM>49533</ELEVEN_RIDE_NUM>
<ELEVEN_ALIGHT_NUM>50003</ELEVEN_ALIGHT_NUM>
<TWELVE_RIDE_NUM>59357</TWELVE_RIDE_NUM>
<TWELVE_ALIGHT_NUM>53317</TWELVE_ALIGHT_NUM>
<THIRTEEN_RIDE_NUM>61171</THIRTEE

<?xml version="1.0" encoding="UTF-8"?>
<CardSubwayTime>
<list_total_count>608</list_total_count>
<RESULT>
<CODE>INFO-000</CODE>
<MESSAGE>정상 처리되었습니다</MESSAGE>
</RESULT>
<row>
<USE_MON>202106</USE_MON>
<LINE_NUM>1호선</LINE_NUM>
<SUB_STA_NM>종로5가</SUB_STA_NM>
<FOUR_RIDE_NUM>71</FOUR_RIDE_NUM>
<FOUR_ALIGHT_NUM>0</FOUR_ALIGHT_NUM>
<FIVE_RIDE_NUM>1526</FIVE_RIDE_NUM>
<FIVE_ALIGHT_NUM>3779</FIVE_ALIGHT_NUM>
<SIX_RIDE_NUM>2574</SIX_RIDE_NUM>
<SIX_ALIGHT_NUM>16992</SIX_ALIGHT_NUM>
<SEVEN_RIDE_NUM>4781</SEVEN_RIDE_NUM>
<SEVEN_ALIGHT_NUM>39423</SEVEN_ALIGHT_NUM>
<EIGHT_RIDE_NUM>7414</EIGHT_RIDE_NUM>
<EIGHT_ALIGHT_NUM>103060</EIGHT_ALIGHT_NUM>
<NINE_RIDE_NUM>11525</NINE_RIDE_NUM>
<NINE_ALIGHT_NUM>62717</NINE_ALIGHT_NUM>
<TEN_RIDE_NUM>18431</TEN_RIDE_NUM>
<TEN_ALIGHT_NUM>50987</TEN_ALIGHT_NUM>
<ELEVEN_RIDE_NUM>27735</ELEVEN_RIDE_NUM>
<ELEVEN_ALIGHT_NUM>48857</ELEVEN_ALIGHT_NUM>
<TWELVE_RIDE_NUM>34387</TWELVE_RIDE_NUM>
<TWELVE_ALIGHT_NUM>47295</TWELVE_ALIGHT_NUM>
<THIRTEEN_RIDE_NUM>40094</THIRTEEN_RID

#### JSON

In [66]:
_type='json'
_api="/".join([_url,_key,_type,_service,str(_start_index),str(_end_index),_use_mon])
data=requests.get(_api).text

In [67]:
# JSON Parsing

import json
jsubwayTime = json.loads(data)

In [68]:
j0=jsubwayTime['CardSubwayTime']['row'][0]

In [69]:
j0.keys() #Dic Keys

dict_keys(['USE_MON', 'LINE_NUM', 'SUB_STA_NM', 'FOUR_RIDE_NUM', 'FOUR_ALIGHT_NUM', 'FIVE_RIDE_NUM', 'FIVE_ALIGHT_NUM', 'SIX_RIDE_NUM', 'SIX_ALIGHT_NUM', 'SEVEN_RIDE_NUM', 'SEVEN_ALIGHT_NUM', 'EIGHT_RIDE_NUM', 'EIGHT_ALIGHT_NUM', 'NINE_RIDE_NUM', 'NINE_ALIGHT_NUM', 'TEN_RIDE_NUM', 'TEN_ALIGHT_NUM', 'ELEVEN_RIDE_NUM', 'ELEVEN_ALIGHT_NUM', 'TWELVE_RIDE_NUM', 'TWELVE_ALIGHT_NUM', 'THIRTEEN_RIDE_NUM', 'THIRTEEN_ALIGHT_NUM', 'FOURTEEN_RIDE_NUM', 'FOURTEEN_ALIGHT_NUM', 'FIFTEEN_RIDE_NUM', 'FIFTEEN_ALIGHT_NUM', 'SIXTEEN_RIDE_NUM', 'SIXTEEN_ALIGHT_NUM', 'SEVENTEEN_RIDE_NUM', 'SEVENTEEN_ALIGHT_NUM', 'EIGHTEEN_RIDE_NUM', 'EIGHTEEN_ALIGHT_NUM', 'NINETEEN_RIDE_NUM', 'NINETEEN_ALIGHT_NUM', 'TWENTY_RIDE_NUM', 'TWENTY_ALIGHT_NUM', 'TWENTY_ONE_RIDE_NUM', 'TWENTY_ONE_ALIGHT_NUM', 'TWENTY_TWO_RIDE_NUM', 'TWENTY_TWO_ALIGHT_NUM', 'TWENTY_THREE_RIDE_NUM', 'TWENTY_THREE_ALIGHT_NUM', 'MIDNIGHT_RIDE_NUM', 'MIDNIGHT_ALIGHT_NUM', 'ONE_RIDE_NUM', 'ONE_ALIGHT_NUM', 'TWO_RIDE_NUM', 'TWO_ALIGHT_NUM', 'THREE_RIDE_NU

In [70]:
for item in jsubwayTime['CardSubwayTime']['row']:
    print(item.values()) #Dic Values

dict_values(['202106', '2호선', '시청', 14.0, 1.0, 842.0, 1763.0, 1860.0, 14975.0, 4613.0, 64814.0, 6633.0, 200414.0, 8802.0, 77430.0, 11360.0, 22797.0, 16922.0, 19360.0, 17020.0, 18897.0, 21198.0, 20182.0, 25371.0, 17294.0, 30010.0, 15601.0, 35093.0, 13175.0, 75625.0, 14559.0, 161483.0, 15107.0, 49943.0, 7260.0, 34005.0, 3595.0, 34984.0, 3606.0, 25529.0, 3849.0, 3207.0, 1524.0, 1.0, 59.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, '20210703'])
dict_values(['202106', '2호선', '을지로입구', 64.0, 3.0, 1704.0, 2012.0, 2899.0, 28884.0, 7675.0, 117998.0, 13246.0, 291410.0, 13992.0, 135503.0, 16743.0, 58638.0, 25033.0, 49654.0, 30391.0, 41996.0, 40023.0, 41126.0, 45420.0, 35619.0, 54603.0, 31053.0, 66615.0, 31408.0, 137169.0, 34407.0, 239134.0, 39521.0, 92855.0, 19653.0, 80662.0, 10659.0, 56892.0, 10478.0, 45993.0, 11575.0, 6249.0, 3586.0, 4.0, 1586.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, '20210703'])
dict_values(['202106', '2호선', '을지로3가', 9.0, 1.0, 911.0, 1683.0, 1978.0, 18284.0, 5137.0, 64623.0, 8881.0, 153068.0, 9900.

### 문제 6: 지하철역 승하차 인원 수집, 저장

앞에서 상호작용 방식으로 작성했던 코드를 일괄 실행하고, 검색결과를 저장해보자.

지난 코드에서는 화면출력만 해보았으나, 파일이나 mongodb로 저장해보자.

In [71]:
%%writefile src/ds_open_subwayTime.py
# coding: utf-8
import os
import requests
import json
from pymongo import MongoClient
import mylib

# JSON 파일 저장하기
def saveJson(_fname,_data):
    import io
    with io.open(_fname, 'a', encoding='utf8') as json_file:
        #_j=json.dumps(_data, json_file, ensure_ascii=False, encoding='utf8') # python2
        _j=json.dump(_data, json_file, ensure_ascii=False)
        json_file.write(str(_j)+"\n")

def readJson(_fname):
    for line in open(_fname, 'r').readlines():
        _j=json.loads(line)
        #print _j['id'],_j['text']
        print (_j['id'])

# 데이터베이스에 저장하기        
def saveDB(_table, _data):
    _table.insert_one(_data)

# Update Required: tweet -> ds_open_subwayPassengersDb
def readDB(_table):
    for tweet in _table.find():
        print (tweet['id'], tweet['text'])

def saveFile(_fname,_data):
    fp=open(_fname,'a')
    fp.write(_data+"\n")
    fp.close()

def doIt():
    # 1. Key.properties 가져오기
    keyPath=os.path.join(os.getcwd(), 'src', 'key.properties')
    key=mylib.getKey(keyPath)
    
    # 2. mongodb connect
    mongo_pwd = key['mongo']
    uriCloud = f'mongodb+srv://sjh9708:{mongo_pwd}@mycluster.tfiobq6.mongodb.net/?retryWrites=true&w=majority'
    Client = MongoClient(uriCloud)
    _db=Client['ds_open_subwayPassengersDb'] #db created by mongo. You do not have to create this.
    _table=_db['db_open_subwayTable'] #collection
    
    # 3. 저장할 JSON 파일명
    _jfname='src/ds_open_subwayTime.json'

    # 4. OpenAPI dataseoul에서 수집
    _key=key['dataseoul'] #KEY='73725.....'
    _url='http://openAPI.seoul.go.kr:8088'
    _type='json'
    _service='CardSubwayTime'
    _start_index=1
    _end_index=5
    _use_mon='202106'
    _maxIter=2   #20
    _iter=0
    while _iter<_maxIter:
        # 5. API URL 작성
        _api="/".join([_url,_key,_type,_service,str(_start_index),str(_end_index),_use_mon])

        # 6. API 요청
        r=requests.get(_api)
        
        # 7. JSON Parsing
        _json=r.json()
        print (_json)
        
        # 8. JSON 파일 / MongoDB에 JSON 저장
        saveJson(_jfname,_json)
        saveDB(_table, _json)
        _start_index+=5
        _end_index+=5
        _iter+=1

if __name__ == "__main__":
    doIt()

Overwriting src/ds_open_subwayTime.py


In [72]:
!python3 src/ds_open_subwayTime.py

{'CardSubwayTime': {'list_total_count': 608, 'RESULT': {'CODE': 'INFO-000', 'MESSAGE': '정상 처리되었습니다'}, 'row': [{'USE_MON': '202106', 'LINE_NUM': '1호선', 'SUB_STA_NM': '서울역', 'FOUR_RIDE_NUM': 654.0, 'FOUR_ALIGHT_NUM': 17.0, 'FIVE_RIDE_NUM': 9008.0, 'FIVE_ALIGHT_NUM': 6400.0, 'SIX_RIDE_NUM': 12474.0, 'SIX_ALIGHT_NUM': 37203.0, 'SEVEN_RIDE_NUM': 37253.0, 'SEVEN_ALIGHT_NUM': 91875.0, 'EIGHT_RIDE_NUM': 59876.0, 'EIGHT_ALIGHT_NUM': 187805.0, 'NINE_RIDE_NUM': 44619.0, 'NINE_ALIGHT_NUM': 118679.0, 'TEN_RIDE_NUM': 42611.0, 'TEN_ALIGHT_NUM': 57710.0, 'ELEVEN_RIDE_NUM': 49533.0, 'ELEVEN_ALIGHT_NUM': 50003.0, 'TWELVE_RIDE_NUM': 59357.0, 'TWELVE_ALIGHT_NUM': 53317.0, 'THIRTEEN_RIDE_NUM': 61171.0, 'THIRTEEN_ALIGHT_NUM': 53687.0, 'FOURTEEN_RIDE_NUM': 53310.0, 'FOURTEEN_ALIGHT_NUM': 49094.0, 'FIFTEEN_RIDE_NUM': 65767.0, 'FIFTEEN_ALIGHT_NUM': 52788.0, 'SIXTEEN_RIDE_NUM': 76249.0, 'SIXTEEN_ALIGHT_NUM': 53969.0, 'SEVENTEEN_RIDE_NUM': 122928.0, 'SEVENTEEN_ALIGHT_NUM': 64693.0, 'EIGHTEEN_RIDE_NUM': 184907.0,

In [73]:
!ls -l src/ds_open_subwayTime*

-rw-r--r--  1 sojaehwi  staff  96834 Nov 27 05:51 src/ds_open_subwayTime.json
-rw-r--r--  1 sojaehwi  staff   2267 Nov 27 05:51 src/ds_open_subwayTime.py


### 데이터베이스 확인

In [74]:
db=Client.ds_open_subwayPassengersDb

In [75]:
db.list_collection_names()

['db_open_subwayTable']

In [76]:
db.db_open_subwayTable.count_documents({})

12

In [77]:
db.db_open_subwayTable.find_one() # 한 건 출력

{'_id': ObjectId('6528f7cfd614ea37caa2ad28'),
 'CardSubwayTime': {'list_total_count': 608,
  'RESULT': {'CODE': 'INFO-000', 'MESSAGE': '정상 처리되었습니다'},
  'row': [{'USE_MON': '202106',
    'LINE_NUM': '1호선',
    'SUB_STA_NM': '서울역',
    'FOUR_RIDE_NUM': 654.0,
    'FOUR_ALIGHT_NUM': 17.0,
    'FIVE_RIDE_NUM': 9008.0,
    'FIVE_ALIGHT_NUM': 6400.0,
    'SIX_RIDE_NUM': 12474.0,
    'SIX_ALIGHT_NUM': 37203.0,
    'SEVEN_RIDE_NUM': 37253.0,
    'SEVEN_ALIGHT_NUM': 91875.0,
    'EIGHT_RIDE_NUM': 59876.0,
    'EIGHT_ALIGHT_NUM': 187805.0,
    'NINE_RIDE_NUM': 44619.0,
    'NINE_ALIGHT_NUM': 118679.0,
    'TEN_RIDE_NUM': 42611.0,
    'TEN_ALIGHT_NUM': 57710.0,
    'ELEVEN_RIDE_NUM': 49533.0,
    'ELEVEN_ALIGHT_NUM': 50003.0,
    'TWELVE_RIDE_NUM': 59357.0,
    'TWELVE_ALIGHT_NUM': 53317.0,
    'THIRTEEN_RIDE_NUM': 61171.0,
    'THIRTEEN_ALIGHT_NUM': 53687.0,
    'FOURTEEN_RIDE_NUM': 53310.0,
    'FOURTEEN_ALIGHT_NUM': 49094.0,
    'FIFTEEN_RIDE_NUM': 65767.0,
    'FIFTEEN_ALIGHT_NUM': 52788.0,
 

In [78]:
db.db_open_subwayTable.find_one({"CardSubwayTime.row.SUB_STA_NM":"서울역"}) # Where

{'_id': ObjectId('6528f7cfd614ea37caa2ad28'),
 'CardSubwayTime': {'list_total_count': 608,
  'RESULT': {'CODE': 'INFO-000', 'MESSAGE': '정상 처리되었습니다'},
  'row': [{'USE_MON': '202106',
    'LINE_NUM': '1호선',
    'SUB_STA_NM': '서울역',
    'FOUR_RIDE_NUM': 654.0,
    'FOUR_ALIGHT_NUM': 17.0,
    'FIVE_RIDE_NUM': 9008.0,
    'FIVE_ALIGHT_NUM': 6400.0,
    'SIX_RIDE_NUM': 12474.0,
    'SIX_ALIGHT_NUM': 37203.0,
    'SEVEN_RIDE_NUM': 37253.0,
    'SEVEN_ALIGHT_NUM': 91875.0,
    'EIGHT_RIDE_NUM': 59876.0,
    'EIGHT_ALIGHT_NUM': 187805.0,
    'NINE_RIDE_NUM': 44619.0,
    'NINE_ALIGHT_NUM': 118679.0,
    'TEN_RIDE_NUM': 42611.0,
    'TEN_ALIGHT_NUM': 57710.0,
    'ELEVEN_RIDE_NUM': 49533.0,
    'ELEVEN_ALIGHT_NUM': 50003.0,
    'TWELVE_RIDE_NUM': 59357.0,
    'TWELVE_ALIGHT_NUM': 53317.0,
    'THIRTEEN_RIDE_NUM': 61171.0,
    'THIRTEEN_ALIGHT_NUM': 53687.0,
    'FOURTEEN_RIDE_NUM': 53310.0,
    'FOURTEEN_ALIGHT_NUM': 49094.0,
    'FIFTEEN_RIDE_NUM': 65767.0,
    'FIFTEEN_ALIGHT_NUM': 52788.0,
 

### 문제 9: 공공데이터포털 서울시 구별 교통사고 사망자 수

In [79]:
import os
from src import mylib
keyPath=os.path.join(os.getcwd(), 'src', 'key.properties')
key=mylib.getKey(keyPath)

In [80]:
import os
SERVICE='AccidentDeath'
OPERATION_NAME='getRestTrafficAccidentDeath'
#params1=os.path.join(SERVICE,OPERATION_NAME)
params1="/".join([SERVICE,OPERATION_NAME])
print (params1)

AccidentDeath/getRestTrafficAccidentDeath


In [81]:
import urllib
_d=dict()
_d['searchYear']='2020'
_d['siDo']='1100' #서울특별시
_d['guGun']='1116'
_d['numOfRows']='20'
_d['pageNo']='1'
#params2=urllib.urlencode(_d)
#params2=urllib.parse.urlencode(_d).encode("utf-8")
params2=urllib.parse.urlencode(_d)
print (params2)

searchYear=2020&siDo=1100&guGun=1116&numOfRows=20&pageNo=1


In [82]:
params=params1+'?'+'serviceKey='+key['gokr']+'&'+params2
#print params|

In [83]:
import urllib
_url='http://apis.data.go.kr/B552061/'
url=urllib.parse.urljoin(_url,params)
print (url)

http://apis.data.go.kr/B552061/AccidentDeath/getRestTrafficAccidentDeath?serviceKey=5aRT%2Fpyk40PbDisBEqd6aYnrQ%2F9QUiCsRteDOzXaacgZ2e%2BuW4ROwo2o07N5wpdQppVtlJ%2FTV79t%2BLIwkp9I0A%3D%3D&searchYear=2020&siDo=1100&guGun=1116&numOfRows=20&pageNo=1


In [84]:
import requests

data=requests.get(url).text

print(data)

<response>
    <header>
        <resultCode>00</resultCode>
        <resultMsg>NORMAL_CODE</resultMsg>
    </header>
    <body>
        <items>
            <item>
                <acc_year>2020</acc_year>
                <occrrnc_dt>2020010105</occrrnc_dt>
                <dght_cd>2</dght_cd>
                <occrrnc_day_cd>4</occrrnc_day_cd>
                <dth_dnv_cnt>1</dth_dnv_cnt>
                <injpsn_cnt>1</injpsn_cnt>
                <se_dnv_cnt>0</se_dnv_cnt>
                <sl_dnv_cnt>0</sl_dnv_cnt>
                <wnd_dnv_cnt>0</wnd_dnv_cnt>
                <occrrnc_lc_sido_cd>1100</occrrnc_lc_sido_cd>
                <occrrnc_lc_sgg_cd>1116</occrrnc_lc_sgg_cd>
                <acc_ty_lclas_cd>01</acc_ty_lclas_cd>
                <acc_ty_mlsfc_cd>12</acc_ty_mlsfc_cd>
                <acc_ty_cd>02</acc_ty_cd>
                <aslt_vtr_cd>05 </aslt_vtr_cd>
                <road_frm_lclas_cd>01</road_frm_lclas_cd>
                <road_frm_cd>05</ro