In [1]:
import datetime
import os
from urllib.parse import urlencode

import FinanceDataReader as fdr
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from requests.adapters import HTTPAdapter

In [2]:
def loadStockData(symbol, startDate, endDate):
    """Load closing prices for ``symbol`` and derive the row-over-row change.

    Args:
        symbol: Ticker/code forwarded unchanged to ``fdr.DataReader``.
        startDate: First requested day; must provide ``isoformat()``
            (e.g. ``datetime.date``).
        endDate: Last requested day; must provide ``isoformat()``.

    Returns:
        A new DataFrame with two columns: ``Close`` as delivered by the reader
        and ``Fluctuation``, the percentage change of ``Close`` relative to the
        previous row. The first row has no predecessor, so its ``Fluctuation``
        is NaN.
    """
    df_raw = fdr.DataReader(symbol, startDate.isoformat(), endDate.isoformat())
    # .assign returns a fresh frame, so the derived column is never written
    # into a column subset of df_raw (which raises SettingWithCopyWarning).
    return df_raw[['Close']].assign(
        Fluctuation=lambda df: (df['Close'] / df['Close'].shift(1) - 1) * 100
    )

In [3]:
def _stripSourceSuffix(title, source):
    """Return ``title`` without a trailing `` - <source>`` tag, if it has one."""
    if source:
        suffix = f' - {source}'
        if title.endswith(suffix):
            return title[:-len(suffix)]
    return title


def aggregateTitles(companyName, url, timeout=30, maxRetries=3):
    """Fetch an RSS feed and join the headlines that mention ``companyName``.

    Args:
        companyName: Substring a headline must contain to be kept.
        url: Address of the RSS document to download.
        timeout: Seconds passed to the HTTP request as its timeout, so a
            stalled connection raises instead of blocking forever.
        maxRetries: Value handed to ``HTTPAdapter(max_retries=...)`` for the
            session that performs the request.

    Returns:
        The kept headlines joined by a single space; an empty string when no
        ``item`` matches. A headline that ends with `` - <source>`` (the text of
        the item's ``source`` element) is returned without that suffix.

    Raises:
        Whatever the ``requests`` session raises for a failed request.
    """
    with requests.Session() as session:
        adapter = HTTPAdapter(max_retries=maxRetries)
        session.mount('https://', adapter)
        session.mount('http://', adapter)
        resp = session.get(url, timeout=timeout)

    titles = []
    for item in bs(resp.text, 'xml').find_all('item'):
        # Either element may be absent or have no single string child.
        title = item.title.string if item.title is not None else None
        if not title or companyName not in title:
            continue
        source = item.source.string if item.source is not None else None  # publisher
        # Only cut when the headline really ends with the publisher tag; a
        # position computed from str.find() is wrong when the publisher is
        # missing from the headline or also appears earlier in it.
        titles.append(_stripSourceSuffix(title, source))

    return ' '.join(titles)

In [4]:
def classifyFluctuation(fluctuation):
    """Map a percentage change onto one of four ordinal labels.

    Labels:
        0 -- below -2.5
        1 -- from -2.5 (inclusive) up to 0 (exclusive)
        2 -- from 0 (inclusive) up to 2.5 (exclusive)
        3 -- everything else (2.5 and above, and any value for which every
             ``<`` comparison is false, such as NaN)
    """
    upperBounds = (-2.5, 0, 2.5)
    # The label is the position of the first bound the value falls below.
    for label, bound in enumerate(upperBounds):
        if fluctuation < bound:
            return label
    return len(upperBounds)

In [7]:
def _buildNewsUrl(companyName, afterDate, beforeDate, country):
    """Build the Google News RSS search URL for one company and date window."""
    language, region = country
    # urlencode escapes characters such as '&' in the company name and keeps
    # the query free of stray whitespace.
    query = urlencode({
        'q': f'{companyName} after:{afterDate} before:{beforeDate}',
        'hl': language,
        'gl': region,
        'ceid': f'{region}:{language}',
    })
    return f'https://news.google.com/rss/search?{query}'


def crawl(companyName, startDate, endDate, isKor=True, outputDir='./exp_my_3'):
    """Collect news headlines per price row and file them by price-move label.

    For every row of the loaded price data except the first, headlines dated
    between the previous row's date and that row's date are fetched through
    ``aggregateTitles``. Non-empty results are written to
    ``<outputDir>/<label>/<companyName>_<date>.txt``, where ``label`` is
    ``classifyFluctuation`` applied to that row's ``Fluctuation``.

    Args:
        companyName: Name looked up in the ``Name`` column of the listing and
            used as the search term.
        startDate: First day of interest; prices are requested from one
            calendar day earlier.
        endDate: Last day requested.
        isKor: ``True`` looks the symbol up in the global ``df_kospi`` and
            searches with ``hl=ko``/``gl=KR``; ``False`` uses the global
            ``df_snp`` and ``hl=en``/``gl=US``.
        outputDir: Root directory of the per-label folders; created on demand.

    Raises:
        IndexError: ``companyName`` has no row in the selected listing.

    Note:
        ``df_kospi`` / ``df_snp`` are module-level names that must exist before
        this function is called.
    """
    if isKor:
        country = ('ko', 'KR')
        symbol = str(df_kospi.loc[df_kospi['Name'] == companyName]['Symbol'].values[0])
        print(f'Start crawling for {companyName} in Google News Korea')
    else:
        country = ('en', 'US')
        symbol = df_snp.loc[df_snp['Name'] == companyName]['Symbol'].values[0]
        print(f'Start crawling for {companyName} in Google News US')

    df_stock = loadStockData(symbol, startDate - datetime.timedelta(days=1), endDate)
    print(f'Loaded {companyName} price info, from {startDate.isoformat()} to {endDate.isoformat()}!')

    dateList = [day.strftime('%Y-%m-%d') for day in df_stock.index]
    fluctuationList = df_stock.loc[:, 'Fluctuation'].values

    # Row 0 only supplies the lower bound of the first date window.
    for idx in range(1, len(dateList)):
        url = _buildNewsUrl(companyName, dateList[idx - 1], dateList[idx], country)
        aggTitle = aggregateTitles(companyName, url)
        if not aggTitle:
            continue

        labelDir = os.path.join(outputDir, str(classifyFluctuation(fluctuationList[idx])))
        # The label folders may not exist yet on a fresh checkout.
        os.makedirs(labelDir, exist_ok=True)
        filePath = os.path.join(labelDir, f'{companyName}_{dateList[idx]}.txt')
        with open(filePath, 'w', encoding='UTF-8') as file:
            file.write(aggTitle)

In [None]:
if __name__=="__main__":
    # Listings used by crawl() to translate a company name into its symbol.
    df_kospi = fdr.StockListing('KOSPI')
    df_snp = fdr.StockListing('S&P500')
    # May replace w/ fixed dictionary

    startDate = datetime.date(2018, 1, 1) # inclusive
    endDate = datetime.date(2018, 12, 31) # inclusive
    # Top 50 KOSPI stocks by market capitalisation, holding companies excluded.
    companyListK = ['삼성전자', 'SK하이닉스', 'NAVER', '삼성바이오로직스', '카카오', 'LG화학', '삼성SDI', 
                    '현대차', '기아', '셀트리온', '카카오뱅크', '크래프톤', 'POSCO', 'KB금융', '현대모비스', 
                    '카카오페이', '삼성물산', 'SK이노베이션', 'LG전자', '신한지주', 'LG생활건강', 'SK바이오사이언스', 
                    '하이브', '엔씨소프트', '한국전력', '삼성생명', '두산중공업', '하나금융지주', 'HMM', '삼성전기', 
                    '삼성에스디에스', 'SK아이이테크놀로지', 'KT&G', '넷마블', '포스코케미칼', '아모레퍼시픽', '삼성화재', 
                    '대한항공', 'S-Oil', '우리금융지주', '현대중공업', '고려아연', '기업은행', 'KT', 'SK바이오팜', 'LG디스플레이', '한온시스템']
    # A previous run aborted while collecting the "우리금융지주" entry with
    # urllib3.exceptions.ProtocolError: ('Connection aborted.',
    # RemoteDisconnected('Remote end closed connection without response')).
    # Request failures are therefore contained per company so the remaining
    # companies are still crawled; the failures are reported at the end.

    failedCompanies = []
    for companyName in companyListK:
        try:
            crawl(companyName, startDate, endDate)
        except requests.exceptions.RequestException as err:
            failedCompanies.append(companyName)
            print(f'Failed crawling for {companyName}: {err!r}')

    if failedCompanies:
        print(f'Crawl incomplete for: {", ".join(failedCompanies)}')

    # companyListUS = ['Apple', 'IBM', 'Delta Air Lines']
    # for companyName in companyListUS:
    #     crawl(companyName, startDate, endDate, False)

Start crawling for 삼성전자 in Google News Korea
Loaded 삼성전자 price info, from 2018-01-01 to 2018-12-31!
Start crawling for SK하이닉스 in Google News Korea
Loaded SK하이닉스 price info, from 2018-01-01 to 2018-12-31!
Start crawling for NAVER in Google News Korea
Loaded NAVER price info, from 2018-01-01 to 2018-12-31!
Start crawling for 삼성바이오로직스 in Google News Korea
Loaded 삼성바이오로직스 price info, from 2018-01-01 to 2018-12-31!
Start crawling for 카카오 in Google News Korea
Loaded 카카오 price info, from 2018-01-01 to 2018-12-31!
Start crawling for LG화학 in Google News Korea
Loaded LG화학 price info, from 2018-01-01 to 2018-12-31!
Start crawling for 삼성SDI in Google News Korea
Loaded 삼성SDI price info, from 2018-01-01 to 2018-12-31!
Start crawling for 현대차 in Google News Korea
Loaded 현대차 price info, from 2018-01-01 to 2018-12-31!
Start crawling for 기아 in Google News Korea
Loaded 기아 price info, from 2018-01-01 to 2018-12-31!
Start crawling for 셀트리온 in Google News Korea
Loaded 셀트리온 price info, from 2018-01-01 to 2018