In [1]:
import os
import time
from typing import List, Optional
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

In [44]:
# Common prefix of every blog-post detail page.
BASE_URL = 'https://www.nogizaka46.com/s/n46/diary/detail/'

# Post the crawl starts from; older posts are reached by following the
# "previous article" link of each page.
NEWEST_URL = f'{BASE_URL}102848?ima=3554&cd=MEMBER'

# Request headers sent with page fetches (a desktop Chrome User-Agent).
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/77.0.3865.120 Safari/537.36"
    ),
}

In [45]:
def fetch_member_page(member_url: str, timeout: float = 30.0) -> str:
    """
    Fetch the HTML of a given member's blog page.

    Args:
        member_url (str): URL of the member's blog page.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` may block indefinitely on a
            stalled connection, which would hang the whole crawl.

    Returns:
        str: HTML content of the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(member_url, headers=HEADERS, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parsers.
    response.raise_for_status()
    return response.text

In [46]:
def parse_title(html_content: str) -> str:
    """
    Extract the post title from the page's ``og:title`` meta tag.

    Args:
        html_content (str): HTML content of the page.

    Returns:
        str: Value of the ``content`` attribute of the ``og:title`` meta tag.

    Raises:
        ValueError: If the page has no ``og:title`` meta tag, or the tag has
            no ``content`` attribute.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    title_tag = soup.find('meta', property='og:title')
    title = title_tag.get('content') if title_tag is not None else None
    if title is None:
        # Report the actual problem rather than failing on a None subscript.
        raise ValueError("og:title meta tag with a content attribute not found")
    return title

In [63]:
def parse_date(html_content: str) -> str:
    """
    Extract the post date text and normalise it for use in a directory name.

    The text of the ``<p class="bd--hd__date a--tx js-tdi">`` element is
    stripped, then ``:`` is replaced by the full-width colon ``：``, ``.`` by
    ``-`` and spaces by ``_``.

    Args:
        html_content (str): HTML content of the page.

    Returns:
        str: Normalised date text.

    Raises:
        ValueError: If the date element is not present in the page.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    date_tag = soup.find('p', class_='bd--hd__date a--tx js-tdi')
    if date_tag is None:
        # Report the actual problem rather than failing on None.text.
        raise ValueError("date element (p.bd--hd__date) not found")
    # The three replacements touch disjoint characters, so their order is
    # irrelevant.
    return (
        date_tag.text.strip()
        .replace(':', '：')
        .replace('.', '-')
        .replace(' ', '_')
    )

In [64]:
def parse_images(html_content: str) -> List[str]:
    """
    Collect the absolute URLs of all ``<img>`` elements in the page.

    Each ``src`` is resolved against the site root, so root-relative paths
    gain the host while sources that are already absolute are kept as they
    are. ``<img>`` tags without a usable ``src`` and non-HTTP sources (for
    example ``data:`` URIs) are skipped.

    Args:
        html_content (str): HTML content of the page.

    Returns:
        List[str]: List of absolute image URLs, in document order.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    images = []
    for img_tag in soup.find_all('img'):
        img_src = img_tag.get('src')
        if not img_src:
            # No src attribute (or an empty one): nothing to download.
            continue
        # urljoin leaves absolute URLs untouched instead of blindly
        # prefixing the host onto them.
        img_url = urljoin('https://www.nogizaka46.com/', img_src)
        if img_url.startswith(('http://', 'https://')):
            images.append(img_url)
    return images

In [65]:
def save_images(image_urls: List[str], save_dir: str, timeout: float = 30.0) -> None:
    """
    Download the given images into ``nogizaka/<save_dir>``.

    The directory is created if needed. Each file is named after the last
    path segment of its URL (query string and fragment excluded). An image
    that cannot be downloaded is reported and skipped so that one broken
    link does not abort the whole crawl.

    Args:
        image_urls (List[str]): List of image URLs.
        save_dir (str): Directory, relative to ``nogizaka/``, to save into.
        timeout (float): Seconds to wait for each download before giving up.
    """
    dir_name = f"nogizaka/{save_dir}"
    os.makedirs(dir_name, exist_ok=True)

    for image_url in image_urls:
        # Name the file from the URL path only; a query string would put
        # characters such as '?' into the file name.
        file_name = os.path.basename(urlsplit(image_url).path)
        if not file_name:
            # URL ends with '/': there is no file name to save under.
            continue
        try:
            response = requests.get(image_url, headers=HEADERS, timeout=timeout)
            # Never write an HTTP error page to disk as if it were an image.
            response.raise_for_status()
        except requests.RequestException as error:
            print(f"Skipping image {image_url}: {error}")
            continue
        with open(os.path.join(dir_name, file_name), 'wb') as image_file:
            image_file.write(response.content)

In [66]:
def save_texts(html_content: str, save_dir: str) -> None:
    """
    Write the text of the blog post to ``nogizaka/<save_dir>/blog.txt``.

    The text of every ``<div>`` found inside the ``<div class="bd--edit">``
    container is written, one per line.

    Args:
        html_content (str): HTML content of the page.
        save_dir (str): Directory, relative to ``nogizaka/``, to save into.

    Raises:
        ValueError: If the ``bd--edit`` container is not present in the page.
    """
    dir_name = f"nogizaka/{save_dir}"
    soup = BeautifulSoup(html_content, 'html.parser')
    body = soup.find('div', class_='bd--edit')
    if body is None:
        raise ValueError("blog body element (div.bd--edit) not found")

    # NOTE(review): only text inside <div> descendants is collected; text
    # placed directly in the container or in other tags is not written.
    # Confirm this matches the site's markup.
    text = ''.join(div.get_text() + '\n' for div in body.find_all('div'))

    # Do not depend on another function having created the directory first.
    os.makedirs(dir_name, exist_ok=True)
    text_name = os.path.join(dir_name, 'blog.txt')
    # Explicit UTF-8: the platform default encoding may be unable to
    # represent every character of the post.
    with open(text_name, 'w', encoding='utf-8') as text_file:
        text_file.write(text)
    

In [67]:
def parse_previous_url(html_content: str) -> Optional[str]:
    """
    Find the URL of the previous post linked from the page.

    Every ``<a class="bd--hn__a hv--op">`` navigation link is inspected and
    the one labelled ``前の記事`` ("previous article") is used, so the result
    does not depend on the order in which the links appear.

    Args:
        html_content (str): HTML content of the page.

    Returns:
        Optional[str]: Absolute URL of the previous post, or ``None`` when
        the page has no "previous article" link.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    for link in soup.find_all('a', class_='bd--hn__a hv--op'):
        label = link.find('p', class_='bd--hn__tx f--head')
        href = link.get('href')
        if label is None or not href:
            continue
        if label.text.strip() == '前の記事':
            # Resolve against the site root; an href that is already
            # absolute is left unchanged.
            return urljoin('https://www.nogizaka46.com/', href)
    return None

In [68]:
# Path separators, characters rejected in Windows file names, and control
# characters; each is replaced by '_' when building directory names.
_UNSAFE_PATH_CHARS = str.maketrans(
    {char: '_' for char in '\\/:*?"<>|' + ''.join(map(chr, range(32)))}
)


def _safe_path_component(name: str) -> str:
    """Return ``name`` made safe to use as a single directory name."""
    # Trailing spaces and dots are also stripped: Windows drops them silently.
    cleaned = name.translate(_UNSAFE_PATH_CHARS).strip().rstrip('.')
    return cleaned or '_'


def main() -> None:
    """
    Crawl a member's blog from the newest post back to the oldest.

    Starting at ``NEWEST_URL``, each post's images and text are saved under
    ``nogizaka/<member name>/【<date>】<title>`` and the "previous article"
    link is followed until a page has none. Progress is printed on a single,
    continuously overwritten line.

    Raises:
        ValueError: If the member name element is missing from the first page.
    """
    url = NEWEST_URL
    member_name = None
    count = 0

    while url is not None:
        html = fetch_member_page(url)

        if member_name is None:
            # Read the member name from the first page fetched instead of
            # downloading that page a second time just for the name.
            soup = BeautifulSoup(html, 'html.parser')
            name_tag = soup.find('p', class_='bd--prof__name f--head')
            if name_tag is None:
                raise ValueError(
                    "member name element (p.bd--prof__name) not found"
                )
            member_name = _safe_path_component(
                name_tag.text.strip().replace(' ', '')
            )

        title = parse_title(html)
        date = parse_date(html)
        images = parse_images(html)
        # A title may contain '/' or other characters that are not valid in
        # a directory name; sanitise the whole leaf component.
        save_dir = member_name + '/' + _safe_path_component('【' + date + '】' + title)
        save_images(images, save_dir)
        save_texts(html, save_dir)

        count += 1
        print(f'取り込み数: {count} {url}', end='\r', flush=True)

        url = parse_previous_url(html)
        if url is not None:
            # Small pause between page fetches to go easy on the server.
            time.sleep(0.1)

In [69]:
# Start the crawl only when this file is run directly, not when it is imported.
if __name__ == "__main__":
    main()



取り込み数: 112 https://www.nogizaka46.com/s/n46/diary/detail/43327?ima=2754&cd=MEMBER