### Import

In [3]:
import requests
import pymysql
import json
import pandas as pd
import datetime as dt
import pickle
import time
from tqdm import tqdm
from bs4 import BeautifulSoup as bs

In [4]:
import urllib
from urllib.request import Request, urlopen

### DB 사용 예시

In [44]:
# Example: open a connection, run a single INSERT, and always release the connection.
con = pymysql.connect(host='localhost', user='ssafy', password='ssafy', 
                      db='access_db', charset='utf8mb4', autocommit=True)
try:
  cur = con.cursor()

  sql = "INSERT INTO table_name (value1, value2) VALUES (123, '이름');"
  cur.execute(sql)

  # The connection is opened with autocommit=True; the explicit commit is kept
  # so the example still works if that flag is ever turned off.
  con.commit()
finally:
  # Close the connection even when execute() raises, so it is not left open.
  con.close()


### 환율

#### api 함수

In [57]:
def exchange_rate_api(date):
  """Request the exchange-rate JSON (data=AP01) from koreaexim.go.kr for one date.

  Args:
    date: value sent as the ``searchdate`` query parameter (the collection
      cell in this notebook passes a ``YYYY-MM-DD`` string).

  Returns:
    The ``requests.Response`` object, or ``None`` when the request itself
    failed (the error is printed).
  """
  import os

  API_HOST = "https://www.koreaexim.go.kr/site/program/financial/exchangeJSON"
  # NOTE(review): the API key is embedded in the source. It can be overridden
  # through the KOREAEXIM_AUTHKEY environment variable; the embedded value is
  # only kept as a backward-compatible fallback.
  params = {
    "authkey": os.environ.get("KOREAEXIM_AUTHKEY", "b60MyMtLUDVQyyiDjZRdFGbKxoJBypkl"),
    "searchdate": date,
    "data": "AP01",
  }
  headers = {"Content-Type" : "application/json", "charset" : "UTF-8", "Accept":"*/*"}

  try:
    # Let requests build and encode the query string; the timeout keeps a
    # stalled server from blocking the whole collection loop forever.
    return requests.get(API_HOST, params=params, headers=headers, timeout=10)
  except requests.RequestException as ex:
    print(ex)
    return None
  

#### 데이터 수집

In [58]:
# Build one SQL value tuple per (date, currency) for USD, JPY and EUR on every
# weekday in the range; the tuples are joined into a bulk INSERT later.
date_range = pd.date_range(start='10/29/2021', end='12/31/2022')

result_list = []
for date in tqdm(date_range):
  # weekday() 5 and 6 are Saturday and Sunday; no request is made for them.
  if date.weekday() >= 5:
    continue
  str_date = date.strftime("%Y-%m-%d")

  r = json.loads(exchange_rate_api(str_date).text)

  for data in r:
    # Result code 4 stops the whole collection, not just this date
    # (presumably an API-side stop condition — TODO confirm against the API docs).
    if data["result"] == 4:
      break
    cur_nm = data["cur_nm"]
    if cur_nm not in ('미국 달러', '일본 옌', '유로'):
      continue
    # The yen row is stored under the label '일본 100엔'.
    label = '일본 100엔' if cur_nm == '일본 옌' else cur_nm
    result_list.append("('{}', '{}', {})".format(str_date, label, data["deal_bas_r"].replace(",", "")))
  else:
    # Inner loop finished without hitting code 4: move on to the next date.
    continue
  break

100%|██████████| 429/429 [00:30<00:00, 14.24it/s]


#### 저장 및 불러오기

In [59]:
# Persist the collected exchange-rate tuples so they can be reloaded without re-querying.
with open("data(cur_20221231).p", mode="wb") as dump_file:
  pickle.dump(result_list, dump_file)

In [5]:
# Reload previously pickled tuples into result_list.
# NOTE(review): this reads "data.p", not the "data(cur_20221231).p" file written
# by the save cell — confirm which file is intended. Only unpickle files you created.
with open("data.p", mode="rb") as dump_file:
  result_list = pickle.load(dump_file)

#### DB 저장

In [60]:
# Open the MySQL connection (autocommit on) and a cursor for the currency insert.
con = pymysql.connect(
  host='localhost',
  user='ssafy',
  password='ssafy',
  db='access_db',
  charset='utf8mb4',
  autocommit=True,
)
cur = con.cursor()

In [61]:
# Insert every collected exchange-rate tuple with one multi-row INSERT.
try:
  if result_list:
    sql = "INSERT INTO currency(cur_date, cur_name, cur_rate) VALUES " + ",".join(result_list)
    cur.execute(sql)
  else:
    # An empty list would yield "... VALUES " with no tuples, which is invalid SQL.
    print("result_list is empty; nothing to insert")
finally:
  # Release the connection even when the INSERT fails.
  con.close()

### 원자재(석유, 금)

- API는 2020년 이후 데이터만 제공해서 사용 불가 (수집 대상 기간은 2011년부터).
- Naver를 통해 크롤링해야할 듯.
  - 주식, 코스닥 코스피도 동일
- 석유 : 경유, 휘발유, 등유 3가지 평균값 활용
  - 네이버 증권에는 등유가 없어서 경유, 휘발유만 사용?
- 금 : 국내금, 국제금 2가지 각각 활용

#### api 함수

##### 오일
- 휘발유 : OIL_GSL
- 경유 : OIL_LO
- 예시 : https://finance.naver.com/marketindex/oilDailyQuote.naver?marketindexCd=OIL_GSL&page=640

In [None]:
def oil_api(oil_type):
  """Fetch the Naver oil daily-quote pages for one product and parse them.

  Pages 641 down to 15 are requested in descending order, with a 0.5 s pause
  after each request, and every page is handed to ``oil_bs``.

  Args:
    oil_type: value for the ``marketindexCd`` query parameter. "OIL_GSL" is
      labelled "휘발유"; any other value is labelled "경유".

  Returns:
    List of SQL value-tuple strings from all pages, or ``None`` if any
    exception occurred (the error is printed).
  """
  first_page, last_page = 641, 15
  if oil_type == "OIL_GSL":
    oil = "휘발유"
  else:
    oil = "경유"
  base_url = "https://finance.naver.com/marketindex/oilDailyQuote.naver?marketindexCd={}&page=".format(oil_type)

  headers = {"Content-Type" : "application/json", "charset" : "UTF-8", "Accept":"*/*"}

  try:
    rows = []
    pages = tqdm(
      range(first_page, last_page - 1, -1),
      total=first_page - last_page + 1,  # total number of steps
      desc='Desc',                       # text shown before the bar
      ncols=80,                          # width of the progress output
      leave=True,                        # keep the bar after the loop ends
    )
    for page_no in pages:
      response = requests.get(base_url + str(page_no), headers=headers)
      rows.extend(oil_bs(response, oil))

      time.sleep(0.5)

    return rows
  except Exception as ex:
    print("oil_api 오류 발생")
    print(ex)

def oil_bs(response, oil_type):
  """Parse one oil daily-quote page into SQL value-tuple strings.

  Rows are read from the page's ``tbody > tr`` elements in reverse document
  order. Only dates in [2011-01-01, 2023-01-01) are kept.

  Args:
    response: object whose ``.text`` holds the page HTML.
    oil_type: label written into the second field of every tuple.

  Returns:
    List of "('date', 'label', 'state', price, change, change_rate)" strings.
    If parsing raises, the error is printed and the rows gathered so far are
    returned.
  """
  window_start = dt.datetime.strptime("2011-01-01", "%Y-%m-%d")
  window_end = dt.datetime.strptime("2023-01-01", "%Y-%m-%d")

  rows = bs(response.text, "html.parser").select("tbody > tr")

  collected = []
  try:
    for row in reversed(rows):
      # The first CSS class of the row is stored as the state value.
      state = row.get("class")[0]
      cells = row.select("td")
      quote_date = cells[0].getText().strip().replace(".", "-")
      price = cells[1].getText().strip()
      change = cells[2].getText().strip()
      # Drop the first and last character of the rate cell, then re-trim.
      change_rate = cells[3].getText().strip()[1:-1].strip()

      parsed_date = dt.datetime.strptime(quote_date, "%Y-%m-%d")
      if not (window_start <= parsed_date < window_end):
        continue

      collected.append("('{}', '{}', '{}', {}, {}, {})".format(quote_date, oil_type, state, price.replace(",", ""), change, change_rate))
  except Exception as ex:
    print("oil_bs 오류 발생")
    print(ex)

  return collected

##### 금
- 국제금
  - 예시 : https://finance.naver.com/marketindex/worldDailyQuote.naver?marketindexCd=CMDT_GC&fdtc=2&page=446
- 국내금
  - 예시 : https://finance.naver.com/marketindex/goldDailyQuote.naver?&page=305



In [93]:
def gold_api(gold_type):
  """Fetch the Naver gold daily-quote pages for one series and parse them.

  Args:
    gold_type: "national" selects the worldDailyQuote (CMDT_GC) pages 446..10,
      labelled "국제금(달러/트로이온스)"; any other value selects the
      goldDailyQuote pages 305..8, labelled "국내금(원/g)".

  Returns:
    List of SQL value-tuple strings from all pages, or ``None`` if any
    exception occurred (the error is printed).
  """
  if gold_type == "national":
    base_url, first_page, last_page, gold = (
      "https://finance.naver.com/marketindex/worldDailyQuote.naver?marketindexCd=CMDT_GC&fdtc=2&page=",
      446,
      10,
      "국제금(달러/트로이온스)",
    )
  else:
    base_url, first_page, last_page, gold = (
      "https://finance.naver.com/marketindex/goldDailyQuote.naver?&page=",
      305,
      8,
      "국내금(원/g)",
    )

  headers = {"Content-Type" : "application/json", "charset" : "UTF-8", "Accept":"*/*"}

  try:
    rows = []
    pages = tqdm(
      range(first_page, last_page - 1, -1),
      total=first_page - last_page + 1,  # total number of steps
      desc='Desc',                       # text shown before the bar
      ncols=80,                          # width of the progress output
      leave=True,                        # keep the bar after the loop ends
    )
    for page_no in pages:
      response = requests.get(base_url + str(page_no), headers=headers)
      rows.extend(gold_bs(response, gold))

      time.sleep(0.5)

    return rows
  except Exception as ex:
    print("gold_api 오류 발생")
    print(ex)

def gold_bs(response, gold_type):
  """Parse one gold daily-quote page into SQL value-tuple strings.

  Rows are read from ``tbody > tr`` in reverse document order and only dates
  in [2011-01-01, 2023-01-01) are kept.

  Args:
    response: object whose ``.text`` holds the page HTML.
    gold_type: label written into every tuple. For "국제금(달러/트로이온스)"
      the change rate is read from the fourth cell; for any other label it is
      computed from the price and the change.

  Returns:
    List of "('date', 'label', 'state', price, change, change_rate)" strings.
    If parsing raises, the error is printed and the rows gathered so far are
    returned.
  """
  window_start = dt.datetime.strptime("2011-01-01", "%Y-%m-%d")
  window_end = dt.datetime.strptime("2023-01-01", "%Y-%m-%d")

  rows = bs(response.text, "html.parser").select("tbody > tr")

  collected = []
  try:
    for row in reversed(rows):
      # The first CSS class of the row is stored as the state value.
      state = row.get("class")[0]
      cells = row.select("td")

      quote_date = cells[0].getText().strip().replace(".", "-")
      price = cells[1].getText().strip()
      change = cells[2].getText().strip()

      if gold_type == "국제금(달러/트로이온스)":
        # Drop the first and last character of the rate cell, then re-trim.
        change_rate = cells[3].getText().strip()[1:-1].strip()
      else:
        price_value = float(price.replace(",", ""))
        change_value = float(change.replace(",", ""))
        # Rebuild the previous price: add the change back for a 'down' row,
        # subtract it otherwise.
        if state == 'down':
          previous = price_value + change_value
        else:
          previous = price_value - change_value
        change_rate = str(round(change_value / previous * 100, 2))

      parsed_date = dt.datetime.strptime(quote_date, "%Y-%m-%d")
      if not (window_start <= parsed_date < window_end):
        continue

      collected.append("('{}', '{}', '{}', {}, {}, {})".format(quote_date, gold_type, state, price.replace(",", ""), change.replace(",", ""), change_rate))
  except Exception as ex:
    print("gold_bs 오류 발생")
    print(ex)

  return collected

#### 데이터 수집

##### 금

In [50]:
# "national" selects the worldDailyQuote (CMDT_GC) series inside gold_api.
result_list = gold_api(gold_type="national")

Desc: 100%|███████████████████████████████████| 437/437 [04:29<00:00,  1.62it/s]


In [94]:
# Any value other than "national" selects the goldDailyQuote series; this overwrites result_list.
result_list = gold_api(gold_type="daily")

Desc: 100%|███████████████████████████████████| 298/298 [03:10<00:00,  1.57it/s]


#### 저장 및 불러오기

In [95]:
# Persist the collected gold tuples so they can be reloaded without re-crawling.
with open("data(domestic_gold_230414).p", mode="wb") as dump_file:
  pickle.dump(result_list, dump_file)

In [67]:
# Reload the pickled gold tuples into result_list (only unpickle files you created).
with open("data(domestic_gold_230414).p", mode="rb") as dump_file:
  result_list = pickle.load(dump_file)

#### DB 저장

In [96]:
# Open the MySQL connection (autocommit on) and a cursor for the material insert.
con = pymysql.connect(
  host='localhost',
  user='ssafy',
  password='ssafy',
  db='access_db',
  charset='utf8mb4',
  autocommit=True,
)
cur = con.cursor()

In [97]:
# Insert every collected material tuple with one multi-row INSERT.
try:
  if result_list:
    sql = "INSERT INTO material(material_date, material_name, material_state, material_rate, material_change, material_change_rate) VALUES " + ",".join(result_list)
    cur.execute(sql)
  else:
    # An empty list would yield "... VALUES " with no tuples, which is invalid SQL.
    print("result_list is empty; nothing to insert")
finally:
  # Release the connection even when the INSERT fails.
  con.close()

### 주식 시세!

- 수집 대상 주식 종목
  - 전기 : 삼성전자(005930), 
  - 화학 : 
  - 생명 : 
  - IT : Naver(035420), 넥슨()?
  - 엔터 : 

#### api 함수

- 수집 대상 주식 종목의 일별 시세
- 예시 : https://finance.naver.com/item/sise_day.naver?code={}&page=

In [49]:
def stock_api(stock_type):
  """Fetch the Naver daily price pages for one stock code and parse them.

  Pages 303 down to 8 are requested in descending order, with a 0.5 s pause
  after each request, and every page is handed to ``stock_bs``.

  Args:
    stock_type: stock code used in the ``code`` query parameter. Only codes
      with a known display name ("035420" -> "G IT") are supported.

  Returns:
    List of SQL value-tuple strings from all pages, or ``None`` when the
    code is unsupported or any exception occurred (the error is printed).
  """
  page = 303
  end = 8

  # Display name stored next to the code. Resolving it up front avoids an
  # unbound-variable failure after the first request for an unknown code.
  stock_names = {"035420": "G IT"}
  stock = stock_names.get(stock_type)
  if stock is None:
    print("stock_api 오류 발생")
    print("unsupported stock_type: {}".format(stock_type))
    return None

  HOST = "https://finance.naver.com/item/sise_day.naver?code={}&page=".format(stock_type)

  headers = {"Content-Type" : "application/json", "charset" : "UTF-8", "Accept":"*/*", 'User-Agent': 'Mozilla/5.0'}

  try:
    result_list = []
    for p in tqdm(range(page, end-1, -1), 
              total = page-end+1,  # total number of steps
              desc = 'Desc',       # text shown before the bar
              ncols = 80,          # width of the progress output
              leave = True,        # keep the bar after the loop ends
            ):
      req = Request(HOST + str(p), headers=headers)
      # The timeout keeps a stalled connection from blocking the crawl forever.
      with urlopen(req, timeout=10) as response:
        # stock_bs takes (response, origin, after). Mirroring news_api, the
        # value given by the caller is the origin and the display name is `after`.
        # NOTE(review): confirm the stock code is what stock_name_origin should hold.
        result_list.extend(stock_bs(response, stock_type, stock))

      time.sleep(0.5)

    return result_list
  except Exception as ex:
    print("stock_api 오류 발생")
    print(ex)
    return None

def stock_bs(response, origin, after):
  """Parse one Naver daily stock-price page into SQL value-tuple strings.

  Rows are the ``tr`` elements carrying ``onmouseover="mouseOver(this)"``,
  read in reverse document order. Only dates in [2011-01-01, 2023-01-01) are
  kept.

  Args:
    response: HTML source (string or file-like object) given to BeautifulSoup.
    origin: value written to the third tuple field (stock_name_origin in the
      notebook's INSERT statement).
    after: value written to the second tuple field (stock_name).

  Returns:
    List of "('date', 'after', 'origin', 'state', rate, change, low, high,
    volume, change_rate)" strings. If parsing raises, the error is printed and
    the rows gathered so far are returned.
  """
  start = dt.datetime.strptime("2011-01-01", "%Y-%m-%d")
  end = dt.datetime.strptime("2023-01-01", "%Y-%m-%d")

  soup = bs(response, "html.parser")
  result = soup.select('table > tr[onmouseover="mouseOver(this)"]')

  # Direction icon alt text -> stored state value.
  states = {"상승": "up", "하락": "down"}

  result_list = []
  try:
    for p in result[::-1]:
      td_list = p.select("td")

      # A change cell without an icon means the price did not move.
      img = td_list[2].select_one("img")
      if img is None:
        state = "same"
      else:
        state = states.get(img["alt"].strip())
        if state is None:
          # Previously an unmapped alt left `state` unbound (first row) or
          # silently reused the previous row's value. Skip the row instead.
          # NOTE(review): other alt texts (e.g. limit-up/limit-down markers)
          # are not mapped — confirm how they should be stored.
          print("stock_bs 오류 발생")
          print("unknown state icon: {}".format(img["alt"]))
          continue

      stock_date = td_list[0].getText().strip().replace(".", "-")
      rate = td_list[1].getText().strip().replace(",", "")
      change = td_list[2].getText().strip().replace(",", "")
      # Named after the INSERT column order (stock_low, stock_high, stock_volume).
      # NOTE(review): confirm cells 4/5/6 really hold low/high/volume on the page.
      low = td_list[4].getText().strip().replace(",", "")
      high = td_list[5].getText().strip().replace(",", "")
      volume = td_list[6].getText().strip().replace(",", "")

      if state != "same":
        # Rebuild the previous price (add the change back for a drop, subtract
        # it for a rise) and express the change as a percentage of it. The
        # former index-based formula was shifted by one field and ended up
        # calling int() on the state string.
        if state == "down":
          previous = int(rate) + int(change)
        else:
          previous = int(rate) - int(change)
        change_rate = round(int(change) / previous * 100, 1)
      else:
        change_rate = 0

      cur_date = dt.datetime.strptime(stock_date, "%Y-%m-%d")
      if cur_date < start or cur_date >= end:
        continue

      result_list.append("('{}', '{}', '{}', '{}', {}, {}, {}, {}, {}, {})".format(stock_date, after, origin, state, rate, change, low, high, volume, change_rate))
  except Exception as ex:
    print("stock_bs 오류 발생")
    print(ex)

  return result_list

#### 데이터 수집

In [50]:
# Crawl the daily prices for stock code 035420.
result_list = stock_api(stock_type="035420")

Desc: 100%|███████████████████████████████████| 296/296 [02:58<00:00,  1.66it/s]


#### 저장 및 불러오기

In [51]:
# Persist the collected stock tuples so they can be reloaded without re-crawling.
with open("data(stock_GIT_230416).p", mode="wb") as dump_file:
  pickle.dump(result_list, dump_file)

In [None]:
# Reload pickled stock tuples into result_list (only unpickle files you created).
# NOTE(review): this reads "data(stock_temp_230414).p", not the file written by
# the save cell — confirm which file is intended.
with open("data(stock_temp_230414).p", mode="rb") as dump_file:
  result_list = pickle.load(dump_file)

#### DB 저장

In [55]:
# Open the MySQL connection (autocommit on) and a cursor for the stock insert.
con = pymysql.connect(
  host='localhost',
  user='ssafy',
  password='ssafy',
  db='access_db',
  charset='utf8mb4',
  autocommit=True,
)
cur = con.cursor()

In [56]:
# Insert every collected stock tuple with one multi-row INSERT.
try:
  if result_list:
    sql = "INSERT INTO stock(stock_date, stock_name, stock_name_origin, stock_state, stock_rate, stock_change, stock_low, stock_high, stock_volume, stock_change_rate) VALUES " + ",".join(result_list)
    cur.execute(sql)
  else:
    # An empty list would yield "... VALUES " with no tuples, which is invalid SQL.
    print("result_list is empty; nothing to insert")
finally:
  # Release the connection even when the INSERT fails.
  con.close()

### 주식 뉴스!

- 수집 대상 주식 종목
  - 전기 : 삼성전자(005930), 
  - 화학 : 
  - 생명 : 
  - IT : Naver(035420), 넥슨()?
  - 엔터 : 

#### api 함수

- 예시 : https://search.naver.com/search.naver?where=news&sm=tab_pge&query={keyword}&sort=2&photo=3&field=0&pd=3&ds={yyyy.mm.dd}&de={yyyy.mm.dd}&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:from{yyyymmdd}to{yyyymmdd},a:all&start={1->11->21->...}

In [9]:
def news_api(news_type):
  """Crawl Naver news search results for one keyword from 2011 through 2022.

  Each year is queried in six two-month windows, and within a window the
  result pages are followed (``start`` = 1, 11, 21, ...) until ``news_bs``
  reports that there is no next page. Progress and elapsed time are printed.

  Args:
    news_type: search keyword. Only keywords with a known display name
      ("네이버" -> "G IT") are supported.

  Returns:
    List of SQL value-tuple strings produced by ``news_bs``; an empty list
    when the keyword is unsupported.
  """
  # Resolve the display name up front; previously an unknown keyword left
  # `stock` unbound and every window failed after issuing a request.
  stock_names = {"네이버": "G IT"}
  stock = stock_names.get(news_type)
  if stock is None:
    print("news_api 오류 발생")
    print("unsupported news_type: {}".format(news_type))
    return []

  headers = {"Content-Type" : "application/json", "charset" : "UTF-8", "Accept":"*/*", 'User-Agent': 'Mozilla/5.0'}
  # The encoded keyword does not change between windows, so build it once.
  keyword = urllib.parse.quote(news_type)

  result_list = []
  # Per the original note, the search shows at most 4000 results, so each
  # year is split into smaller date windows.
  st = dt.datetime.now()
  print("시작 시간 : {}".format(st))
  for year in range(2011, 2023):
    print("{}년도".format(year), end=" ")
    # Track the deepest page offset over the whole year; it used to be reset
    # in every window, so the yearly summary only showed the last window.
    max_page = 1
    for month in range(1, 12, 2):
      startDate = '%d.%02d.%02d' %(year, month, 1)
      # Last day of the two-month window = day before the next window starts.
      nextDate = (dt.datetime(year, month+2, 1) if month < 11 else dt.datetime(year+1, 1, 1))  - dt.timedelta(days=1)  
      endDate = nextDate.strftime("%Y.%m.%d")
      HOST = "https://search.naver.com/search.naver?where=news&sm=tab_pge&query={}&sort=2&photo=3&field=0&pd=3&ds={}&de={}&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:from{}to{},a:all&start=".format(keyword, startDate, endDate, startDate.replace(".", ""), endDate.replace(".", ""))

      page = 1
      try:
        while True:
          req = Request(HOST + str(page), headers=headers)
          # The timeout keeps a stalled connection from blocking the crawl.
          with urlopen(req, timeout=10) as response:
            new_result_list, is_next = news_bs(response, news_type, stock)
          result_list.extend(new_result_list)

          # Keep going only while a next page exists.
          if not is_next:
            break
          page = page + 10
          time.sleep(0.1)
      except Exception as ex:
        print("news_api 오류 발생")
        print(ex)
      max_page = max(max_page, page)
      print("{} ~ {}".format(month, month+1), end=" ")
    print("완료! : {}, 최대 페이지 : {}".format(dt.datetime.now(), max_page))

  et = dt.datetime.now()
  print("종료 시간 : {}".format(et))

  # Elapsed time in whole seconds, split into hours / minutes / seconds.
  diff = int((et-st).total_seconds())
  total_hour, remainder = divmod(diff, 3600)
  total_minute, total_second = divmod(remainder, 60)
  print("소요 시간 : {}시간 {}분 {}초".format(total_hour, total_minute, total_second))

  return result_list

def news_bs(response, origin, after):
  """Parse one Naver news search page into SQL value-tuple strings.

  Args:
    response: HTML source (string or file-like object) given to BeautifulSoup.
    origin: keyword that must appear in an article title for it to be kept;
      written to the second tuple field.
    after: value written to the third tuple field.

  Returns:
    ``(result_list, is_next)``: the "('date', 'origin', 'after', 'title')"
    strings for the kept articles, and whether the page has a next-page link
    with an ``href``.
  """
  soup = bs(response, "lxml")
  result = soup.select("ul.list_news > li.bx")  # list items that hold the article titles
  btn_next = soup.select_one("a.btn_next")  # next-page button

  # A page with no next-page button, or one without an href, is treated as
  # the last page. The missing-button case used to raise AttributeError here.
  is_next = btn_next is not None and btn_next.get_attribute_list("href")[0] is not None

  result_list = []  # collected tuples
  for p in result:
    # Handle failures per article so one malformed item no longer drops the
    # remaining articles on the page.
    try:
      cur_date = p.select(".info")[2].get_text().strip()[:-1].replace(".", "-")  # date
      cur_title = p.select_one(".news_tit").get_attribute_list("title")[0]  # title
      # Replace backticks and single quotes so the title cannot close the
      # single-quoted SQL literal it is embedded in.
      cur_title = cur_title.replace("`", '"').replace("'", '"')

      if origin not in cur_title:
        continue

      result_list.append("('{}', '{}', '{}', '{}')".format(cur_date, origin, after, cur_title))
    except (AttributeError, IndexError, TypeError) as ex:
      print("news_bs 오류 발생")
      print(ex)

  return result_list, is_next

#### 데이터 수집

In [None]:
# Crawl news titles for the keyword "네이버".
result_list = news_api(news_type="네이버")

#### 저장 및 불러오기

In [187]:
# Persist the collected news tuples so they can be reloaded without re-crawling.
with open("data(news_naver_230417)_2.p", mode="wb") as dump_file:
  pickle.dump(result_list, dump_file)

In [6]:
# Reload the pickled news tuples into result_list (only unpickle files you created).
with open("data(news_naver_230417)_2.p", mode="rb") as dump_file:
  result_list = pickle.load(dump_file)

#### DB 저장

In [7]:
# Open the MySQL connection (autocommit on) and a cursor for the news insert.
con = pymysql.connect(
  host='localhost',
  user='ssafy',
  password='ssafy',
  db='access_db',
  charset='utf8mb4',
  autocommit=True,
)
cur = con.cursor()

In [8]:
# Insert every collected news tuple with one multi-row INSERT.
try:
  if result_list:
    sql = "INSERT INTO news(news_date, news_name_origin, news_name, news_content) VALUES " + ",".join(result_list)
    cur.execute(sql)
  else:
    # An empty list would yield "... VALUES " with no tuples, which is invalid SQL.
    print("result_list is empty; nothing to insert")
finally:
  # Release the connection even when the INSERT fails.
  con.close()