# URL 수집

In [14]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException
import csv
import time

def crawl_all_pages_selenium(search_query, start_date, end_date, total_pages=100,
                             output_filename="finalplz.csv"):
    """Crawl Naver news search result pages and save each hit to a CSV file.

    For every result page the function loads the search URL in Firefox,
    reads each ``div.news_area`` item and writes one CSV row containing the
    page number, the headline, the article URL and the date text.

    Args:
        search_query: Raw (not yet URL-encoded) search keyword. It is
            percent-encoded here before being placed in the URL.
        start_date: Value for the ``ds`` URL parameter, e.g. "2023.01.01".
        end_date: Value for the ``de`` URL parameter, e.g. "2024.04.26".
        total_pages: Number of result pages to visit. The ``start`` URL
            parameter advances by 10 per page (1, 11, 21, ...).
        output_filename: Path of the CSV file to (over)write.
    """
    # Local import so this block works without touching the file's import list.
    from urllib.parse import quote

    # WebDriver configuration.
    options = Options()
    # options.add_argument("--headless")  # headless mode intentionally left off
    options.add_argument("--disable-gpu")  # Optional: disable GPU hardware acceleration
    options.add_argument("--no-sandbox")  # Optional: disable the sandbox for service workers

    # Firefox specific settings
    options.set_preference("dom.webdriver.enabled", False)
    options.set_preference('useAutomationExtension', False)
    options.set_preference("privacy.trackingprotection.enabled", True)

    # Encode once: spaces, '&', '#', '+' etc. in the keyword must not be
    # interpreted as URL syntax.
    encoded_query = quote(search_query, safe='')

    link_selector = 'a.news_tit'  # carries both the headline (title attr) and the href
    # NOTE(review): find_element returns the first matching span; confirm that
    # the first `span.info` of an item is always the date.
    date_selector = 'div.info_group span.info'

    driver = webdriver.Firefox(options=options)
    try:
        with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['page', 'title', 'url', 'date']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for current_page in range(1, total_pages + 1):
                first_result = (current_page - 1) * 10 + 1
                page_url = (
                    'https://search.naver.com/search.naver?where=news'
                    f'&query={encoded_query}&sm=tab_pge&sort=0&pd=3'
                    f'&ds={start_date}&de={end_date}&start={first_result}'
                )
                driver.get(page_url)

                # Extract the data from the loaded page.
                for news_item in driver.find_elements(By.CSS_SELECTOR, 'div.news_area'):
                    try:
                        link = news_item.find_element(By.CSS_SELECTOR, link_selector)
                        title = link.get_attribute('title')
                        article_url = link.get_attribute('href')
                        date = news_item.find_element(By.CSS_SELECTOR, date_selector).text
                    except NoSuchElementException:
                        # Skip the item if any of the elements is not found.
                        continue

                    # Write to CSV with page information.
                    writer.writerow({
                        'page': current_page,
                        'title': title,
                        'url': article_url,
                        'date': date,
                    })

                # Pause between page loads (1 second; adjust as needed).
                time.sleep(1)
    finally:
        # Always close the browser, even if a page load or a write fails.
        driver.quit()

if __name__ == "__main__":
    # Parameters for this run: the keyword and the date window ("YYYY.MM.DD" strings).
    search_query, start_date, end_date = "전동 킥보드", "2023.01.01", "2024.04.26"

    # total_pages is left at the function's default.
    crawl_all_pages_selenium(
        search_query=search_query,
        start_date=start_date,
        end_date=end_date,
    )
    print("데이터가 finalplz.csv 파일로 저장되었습니다.")


데이터가 finalplz.csv 파일로 저장되었습니다.


# URL에 대한 기사 수집

In [19]:
!pip install newspaper3k



Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.2-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... done
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.4/7.4 MB 4.0 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... done

In [20]:
import os
import csv
from newspaper import Article

def extract_news_content(url):
    """Download and parse the article at *url* with newspaper's ``Article``.

    Returns:
        A ``(text, publish_date)`` tuple taken from the parsed article's
        ``text`` and ``publish_date`` attributes.

    Any exception raised by ``download()`` or ``parse()`` propagates to the
    caller.
    """
    parsed = Article(url)
    parsed.download()
    parsed.parse()

    body_text = parsed.text
    published = parsed.publish_date
    return body_text, published

def process_csv_with_newspaper(input_csv_filename, output_csv_filename):
    """Copy a CSV of article URLs to a new CSV, adding each article's text.

    Every input row must have 'page', 'title' and 'url' columns; a 'date'
    column is used when present. For each row the article is fetched through
    ``extract_news_content`` and one output row is written with the columns
    'page', 'title', 'url', 'date' and 'content'.

    A failed extraction does not stop the run: the error is printed and the
    row is written with empty content.

    The 'date' value is the publish date reported by the extraction; when
    that is unavailable (extraction failed or returned ``None``) the date
    already present in the input row is kept instead of being dropped.

    Args:
        input_csv_filename: Path of the CSV to read (UTF-8).
        output_csv_filename: Path of the CSV to (over)write (UTF-8).
    """
    fieldnames = ['page', 'title', 'url', 'date', 'content']

    # The input is opened first, so a missing input file does not create
    # (or truncate) the output file.
    with open(input_csv_filename, 'r', newline='', encoding='utf-8') as csvfile, \
            open(output_csv_filename, 'w', newline='', encoding='utf-8') as output_csvfile:
        reader = csv.DictReader(csvfile)
        writer = csv.DictWriter(output_csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            url = row['url']

            # Extract news content and date using newspaper3k.
            try:
                content, publish_date = extract_news_content(url)
            except Exception as e:
                # Best-effort boundary: one bad URL must not abort the whole run.
                print(f"뉴스 내용 추출 중 오류 발생: {e}")
                content, publish_date = '', None

            # Fall back to the date collected with the URL so the row
            # does not end up with an empty date.
            if publish_date is None:
                publish_date = row.get('date', '')

            # Write to the output CSV file.
            writer.writerow({
                'page': row['page'],
                'title': row['title'],
                'url': url,
                'date': publish_date,
                'content': content,
            })

if __name__ == "__main__":
    # Update file paths based on your directory structure.
    input_csv_filename, output_csv_filename = "finalplz.csv", "1000news_data.csv"

    process_csv_with_newspaper(
        input_csv_filename=input_csv_filename,
        output_csv_filename=output_csv_filename,
    )
    print(f"데이터가 {output_csv_filename} 파일로 저장되었습니다.")

뉴스 내용 추출 중 오류 발생: Article `download()` failed with 403 Client Error: Forbidden for url: http://news.maxmovie.com/437606 on URL http://news.maxmovie.com/437606
뉴스 내용 추출 중 오류 발생: Article `download()` failed with HTTPConnectionPool(host='news.mbccb.co.kr', port=80): Read timed out. on URL http://news.mbccb.co.kr/home/sub.php?menukey=61&mod=view&RECEIVE_DATE=20240123&SEQUENCE=4626
데이터가 1000news_data.csv 파일로 저장되었습니다.
