In [None]:
!pip install bardapi
!pip install requests
!pip install elasticsearch==7.10
!pip install beautifulsoup4

In [2]:
from elasticsearch import Elasticsearch
# Default Elasticsearch index settings and mappings.
# Change them to fit your own data.

# Module-level Elasticsearch client shared by create_index() and insert();
# it targets the node at http://elasticsearch:9200.
es = Elasticsearch('http://elasticsearch:9200')

# Index definition passed to create_index(): a Korean (Nori) analysis chain
# under "settings" and the document schema under "mappings".
mappings = {
    "settings": {
        "analysis": {
            "tokenizer": {
                "korean_nori_tokenizer": {
                    "type": "nori_tokenizer",
                    "decompound_mode": "mixed",
                },
            },
            # Custom analyzer: Nori tokenizer followed by the POS filter below.
            "analyzer": {
                "nori_analyzer": {
                    "type": "custom",
                    "tokenizer": "korean_nori_tokenizer",
                    "filter": ["nori_posfilter"],
                },
            },
            "filter": {
                # Part-of-speech filter; "stoptags" lists the tags it is
                # configured with.
                "nori_posfilter": {
                    "type": "nori_part_of_speech",
                    "stoptags": [
                        "E", "IC", "J", "MAG", "MM", "NA", "NR", "SC", "SE", "SF",
                        "SH", "SL", "SN", "SP", "SSC", "SSO", "SY", "UNA", "UNKNOWN", "VA",
                        "VCN", "VCP", "VSV", "VV", "VX", "XPN", "XR", "XSA", "XSN", "XSV",
                    ],
                },
            },
        },
    },
    "mappings": {
        "properties": {
            # Free-text fields analyzed with the Nori analyzer defined above.
            "title": {"type": "text", "analyzer": "nori_analyzer"},
            "content": {"type": "text", "analyzer": "nori_analyzer"},
            # Exact-match fields.
            "office": {"type": "keyword"},
            "timestamp": {
                "type": "date",
                "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd'T'HH:mm:ss.SSSZ||epoch_millis",
            },
            "url": {"type": "keyword"},
            "section": {"type": "keyword"},
        },
    },
}


In [17]:
from bs4 import BeautifulSoup
import requests
import json
from datetime import datetime, timedelta
import time
# Previous day's date as a YYYYMMDD string, derived from the local clock at the
# time this cell runs; it selects the news list to crawl and names the index.
delta = timedelta(days=1)
yesterday = (datetime.now() - delta).strftime("%Y%m%d")

def create_index(body=None):
    """Create the ``sports_news_<yesterday>`` index unless it already exists.

    Args:
        body: Index definition (settings/mappings) forwarded unchanged to
            ``es.indices.create``.

    Returns:
        Whatever ``es.indices.create`` returns when the index is created, or
        ``None`` when the index was already present and nothing was done.
    """
    index_name = f'sports_news_{yesterday}'
    already_there = es.indices.exists(index=index_name)
    if already_there:
        return None
    return es.indices.create(index=index_name, body=body)

def insert():
    """Crawl yesterday's Naver world-football news list and index each article.

    List pages are requested in increasing page order.  Crawling stops when a
    page contains no articles, or when the first article of a page was already
    the first article of an earlier page (the stop condition the crawl relies
    on once it runs past the last page).  For every listed article the article
    page is downloaded, its body text is extracted, and one document is written
    to the ``sports_news_<yesterday>`` index through the module-level ``es``
    client, with a 0.5 s pause between articles.

    Returns:
        None.

    Raises:
        requests.exceptions.RequestException: on a network failure or timeout.
        ValueError: when the list response is not valid JSON, or an article's
            ``datetime`` does not match ``%Y.%m.%d %H:%M``.
        IndexError: when an article page has no ``.news_end`` element or that
            element's text has no second line.
    """
    INDEX = f'sports_news_{yesterday}'
    datetime_format = "%Y.%m.%d %H:%M"
    request_timeout = 10  # seconds; without it a stalled connection hangs forever
    page_no = 1
    seen_first_articles = set()

    while True:
        page = requests.get(
            f"https://sports.news.naver.com/wfootball/news/list?isphoto=N&date={yesterday}&page={page_no}",
            timeout=request_timeout,
        )
        # The list endpoint answers with JSON; decode it once and reuse it.
        articles = json.loads(page.content).get('list')

        # No (or empty) article list: nothing left to crawl.
        if not articles:
            break

        # A page whose first article has already led an earlier page means the
        # listing is repeating itself, so every page has been visited.
        first_key = (articles[0].get('oid'), articles[0].get('aid'))
        if first_key in seen_first_articles:
            break
        seen_first_articles.add(first_key)

        for article in articles:
            article_url = f"https://sports.news.naver.com/news?oid={article.get('oid')}&aid={article.get('aid')}"
            res = requests.get(article_url, timeout=request_timeout)
            res_soup = BeautifulSoup(res.text, 'html.parser')
            published = datetime.strptime(article.get('datetime'), datetime_format)
            data = {
                'title' : article.get('title'),
                # Body is taken as the second line of the `.news_end` element's text.
                'content' : res_soup.select('.news_end')[0].text.split('\n')[1],
                'office' : article.get('officeName'),
                'timestamp' : published.strftime("%Y-%m-%d %H:%M:%S"),
                'url' : article_url,
                'section' : article.get('sectionName')
            }

            es.index(index=INDEX, doc_type="_doc", body=data)
            # Pause between article requests.
            time.sleep(0.5)
        page_no += 1

# Create yesterday's index with the mappings defined above (skipped when the
# index already exists), then crawl and index yesterday's articles.
create_index(mappings)
insert()