In [7]:
import json
# Load the scraping input from disk into `all_data`.
# NOTE(review): presumably a dict mapping maker name -> list of catalog URLs,
# since that is how `all_data` is iterated later in this file — confirm.
with open(r"D:\GitHub_Repository\Research\ma_thesis\py_code\maker_urls.json", "r", encoding="utf-8") as f:
    all_data = json.load(f)

In [None]:
import time
import random
import json
import os
from urllib.parse import urlparse

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc

BASE_URL = "https://carview.yahoo.co.jp"

# Paths disallowed by robots.txt (never fetched).
# NOTE: the chiebukuro entry contains a `*` wildcard.
ROBOTS_DISALLOW = [
    "/news/detail/",
    "/article/detail/",
    "/article/comment",
    "/ncar/catalog/*/chiebukuro/",
    "/article/countUp/"
]

import requests  # ← 追加（Slack・LINE通知に必要）

# ===============================
# 通知関連の設定
# ===============================
# Slack incoming-webhook URL. The original line was a bare name (no assignment)
# and raised NameError when the cell ran. The URL is read from the environment
# so the secret is not stored in the notebook; an empty string disables
# notifications (send_slack_message returns early on a falsy URL).
SLACK_WEBHOOK_URL = os.environ.get("SLACK_WEBHOOK_URL", "")

def send_slack_message(text, timeout=10):
    """Post ``text`` to Slack through the configured incoming webhook.

    This is a best-effort notification: every failure is reported with a
    ``[WARN]`` line and never propagated to the caller.

    Args:
        text: Message body sent as the ``text`` field of the JSON payload.
        timeout: Seconds to wait for Slack before giving up. Without a
            timeout ``requests.post`` may block indefinitely and stall the
            whole scraping run.

    Returns:
        None.
    """
    try:
        # An empty/unset webhook URL means notifications are disabled.
        if not SLACK_WEBHOOK_URL:
            return
        payload = {"text": text}
        r = requests.post(SLACK_WEBHOOK_URL, json=payload, timeout=timeout)
        if r.status_code == 200:
            print("[NOTIFY] Slack通知送信成功")
        else:
            print(f"[WARN] Slack通知失敗: {r.status_code}")
    except Exception as e:
        # Deliberately broad: a notification problem must not abort scraping.
        print(f"[WARN] Slack通知送信中にエラー: {e}")

# ===============================
# Selenium 初期化
# ===============================
def create_driver():
    """Build an undetected-chromedriver Chrome instance for scraping.

    The browser is started with a fixed set of command-line flags and a
    desktop Chrome user-agent string, and its page-load timeout is set to
    30 seconds.

    Returns:
        The configured ``uc.Chrome`` driver.
    """
    user_agent_flag = (
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
    chrome_flags = (
        "--disable-gpu",
        "--no-sandbox",
        "--disable-blink-features=AutomationControlled",
        "--disable-dev-shm-usage",
        user_agent_flag,
    )

    chrome_options = uc.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    browser = uc.Chrome(options=chrome_options)
    browser.set_page_load_timeout(30)
    return browser


# ===============================
# robots.txt の禁止URLかどうか判定
# ===============================
def is_disallowed(url, patterns=None):
    """Return True when the path of ``url`` matches a robots.txt disallow rule.

    A rule matches when it occurs anywhere in the URL path. ``*`` inside a
    rule matches any run of characters, so ``/ncar/catalog/*/chiebukuro/``
    matches ``/ncar/catalog/lexus/chiebukuro/``. Plain substring matching
    could never match such a rule because the literal ``*`` does not appear
    in real paths.

    Args:
        url: Absolute or relative URL to check.
        patterns: Optional iterable of disallow rules; defaults to the
            module-level ``ROBOTS_DISALLOW``.

    Returns:
        bool: True if the URL must not be fetched.
    """
    if patterns is None:
        patterns = ROBOTS_DISALLOW

    path = urlparse(url).path
    for pattern in patterns:
        # The literal pieces between wildcards must appear in order.
        pos = 0
        for piece in pattern.split("*"):
            idx = path.find(piece, pos)
            if idx < 0:
                break
            pos = idx + len(piece)
        else:
            return True
    return False


# ===============================
# 安全なページ取得
# ===============================
def safe_get(driver, url, retries=5, wait_minutes=8):
    """Load ``url`` in ``driver`` with robots.txt, block and error handling.

    Args:
        driver: Selenium-style driver exposing ``get`` and ``page_source``.
        url: Page to load.
        retries: Maximum number of load attempts.
        wait_minutes: Minutes to pause when the Yahoo! access-restriction
            page is detected, before the same URL is tried again.

    Returns:
        bool: True when a normal page was loaded; False when the URL is
        disallowed by robots.txt or every attempt failed or was blocked.
    """
    if is_disallowed(url):
        print(f"[ROBOTS] disallow対象のためスキップ: {url}")
        return False

    for i in range(retries):
        try:
            driver.get(url)
            # Randomised pause after each load to keep the request rate low.
            time.sleep(random.uniform(3.5, 5.5))
            html = driver.page_source

            # Access-restriction page detection. The former extra <title>
            # check was redundant: that title contains both substrings below.
            blocked = "現在表示できません" in html and "Yahoo! JAPAN" in html

            if blocked:
                # Restricted: pause, then retry the same URL.
                timestamp = time.strftime('%H:%M:%S')
                print(f"\n[BLOCKED] Yahoo! によりアクセス制限されました: {url}")
                print(f"[INFO] {timestamp} にブロック検知。 {wait_minutes} 分待機します…\n")

                # Sleep in one-minute steps so a countdown can be shown.
                sleep_seconds = wait_minutes * 60
                for remaining in range(sleep_seconds, 0, -60):
                    print(f"  - 再試行まであと {remaining//60} 分…", end="\r")
                    time.sleep(60)

                print("\n[INFO] 再試行します…\n")
                continue

            # Ordinary page.
            return True

        except Exception as e:
            print(f"[WARN] {url} の取得に失敗 ({i+1}/{retries}): {e}")

            # Linear backoff for ordinary load errors: 20 s, 40 s, ...
            # (previously `i*20`, which retried the first failure with no
            # pause at all).
            sleep_time = (i + 1) * 20
            print(f"[BACKOFF] {sleep_time}秒待ちます…")
            time.sleep(sleep_time)

    return False


# ===============================
# メーカー・車種名抽出
# ===============================
def parse_maker_car(url):
    """Return ``(maker, car)`` from the third and fourth path segments of ``url``.

    Example: ``.../ncar/catalog/lexus/es_hybrid/`` -> ``("lexus", "es_hybrid")``.

    Raises:
        IndexError: if the path has fewer than four segments.
    """
    segments = urlparse(url).path.strip("/").split("/")
    return segments[2], segments[3]


# ===============================
# 歴代モデル URL 取得
# ===============================
def get_historical_models(driver, wait, TOP_URL):
    """Collect the historical-model links shown on a car's top page.

    Args:
        driver: Browser driver used to load the page.
        wait: ``WebDriverWait`` bound to ``driver``.
        TOP_URL: Top catalog URL of the car.

    Returns:
        list[str]: hrefs found under ``#mdl_his`` whose last path segment
        starts with ``FMC`` (e.g. ``FMC001-MC004``); an empty list when the
        page cannot be loaded or the section does not appear.
    """
    if not safe_get(driver, TOP_URL):
        return []

    try:
        elems = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "#mdl_his a.p-list_landscape--item--link")
        ))
        urls = [e.get_attribute("href") for e in elems]
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl+C
        # (KeyboardInterrupt) still stops a long scraping run.
        print("[WARN] 歴代モデルが見つかりません")
        return []

    # Anchors without an href yield None; skip them instead of letting a
    # single bad anchor discard every link.
    return [
        url for url in urls
        if url and url.rstrip("/").split("/")[-1].startswith("FMC")
    ]


# ===============================
# マイナーチェンジ URL 取得
# ===============================
def get_minor_changes(driver, wait, model_url):
    """Collect the minor-change links listed on a model page.

    Args:
        driver: Browser driver used to load the page.
        wait: ``WebDriverWait`` bound to ``driver``.
        model_url: URL of a historical-model page.

    Returns:
        list: href values of the anchors under ``div#minr_his``; an empty
        list when the page cannot be loaded or the section does not appear.
    """
    if not safe_get(driver, model_url):
        return []

    try:
        elems = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "div#minr_his a.p-list_landscape--item--link")
        ))
        return [e.get_attribute("href") for e in elems]

    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl+C
        # (KeyboardInterrupt) is not swallowed during a long run.
        print(f"[WARN] マイナーチェンジが見つかりません: {model_url}")
        return []


# ===============================
# グレード URL 取得
# ===============================
def get_grade_links(driver, wait, minor_url):
    """Collect the grade links listed on a minor-change page.

    Args:
        driver: Browser driver used to load the page.
        wait: ``WebDriverWait`` bound to ``driver``.
        minor_url: URL of a minor-change page.

    Returns:
        list: href values of every anchor inside
        ``div.grade_inform_mod_list--table--wrap``; an empty list when the
        page cannot be loaded or no such anchor appears.
    """
    if not safe_get(driver, minor_url):
        return []

    try:
        # Wait for the first anchor, then collect all of them.
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "div.grade_inform_mod_list--table--wrap a")
        ))
        elems = driver.find_elements(By.CSS_SELECTOR, "div.grade_inform_mod_list--table--wrap a")
        return [e.get_attribute("href") for e in elems]

    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl+C
        # (KeyboardInterrupt) is not swallowed during a long run.
        print(f"[WARN] グレードが見つかりません: {minor_url}")
        return []


# ===============================
# 仕様情報取得
# ===============================
def scrape_grade_page(driver, wait, grade_url):
    """Scrape the specification tables of one grade page.

    Args:
        driver: Browser driver used to load the page.
        wait: ``WebDriverWait`` bound to ``driver``.
        grade_url: URL of the grade page.

    Returns:
        dict: section name -> {header text: cell text}. Empty when the page
        cannot be loaded; possibly partial when scraping fails midway.
    """
    t0 = time.time()
    if not safe_get(driver, grade_url):
        return {}

    # Section names are paired with the tables purely by position.
    # NOTE(review): if a page omits one of these sections, the names after
    # it would be shifted — confirm against real pages.
    section_names = ["基本情報", "寸法・重量", "エンジン・性能", "バッテリー・モーター性能", "タイヤ・足回り",
                     "安全装備", "快適装備", "エクステリア", "インテリア"]

    spec = {}
    try:
        wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "div.model_explanation--tbl table")
        ))
        blocks = driver.find_elements(By.CSS_SELECTOR, "div.model_explanation--tbl")

        for name, block in zip(section_names, blocks):
            entries = {}
            for tr in block.find_elements(By.TAG_NAME, "tr"):
                headers = tr.find_elements(By.TAG_NAME, "th")
                cells = tr.find_elements(By.TAG_NAME, "td")
                # Pair each header cell with the data cell at the same index.
                for header, cell in zip(headers, cells):
                    entries[header.text.strip()] = cell.text.strip()
            spec[name] = entries

    except Exception as err:
        print(f"[WARN] 仕様情報取得失敗 {grade_url}: {err}")

    print(f"[TIME] {grade_url}: {time.time()-t0:.2f}秒")
    return spec


# ===============================
# JSON 保存
# ===============================
def save_car_json(maker, car, new_data):
    """Merge ``new_data`` into ``car_data/{maker}_{car}.json`` and save it.

    An existing file is loaded and deep-merged: nested dicts are merged
    key by key, any other value is overwritten by the new one. An existing
    file that is unreadable, is not valid JSON, or does not hold a JSON
    object is treated as empty.

    Args:
        maker: Maker name used in the file name.
        car: Car name used in the file name.
        new_data: Dict of freshly scraped data.
    """
    os.makedirs("car_data", exist_ok=True)
    file_path = f"car_data/{maker}_{car}.json"

    # Load the existing JSON, if any.
    old_data = {}
    if os.path.exists(file_path):
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and decoding errors.
            loaded = None
        # Only a JSON object can be merged into; a list or scalar would
        # make the merge below fail.
        if isinstance(loaded, dict):
            old_data = loaded

    def deep_merge(old, new):
        """Recursively merge ``new`` into ``old`` in place and return ``old``."""
        for k, v in new.items():
            if (
                k in old and
                isinstance(old[k], dict) and
                isinstance(v, dict)
            ):
                deep_merge(old[k], v)
            else:
                old[k] = v
        return old

    merged = deep_merge(old_data, new_data)

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)

    print(f"[SAVE] {file_path}（マージ保存）")


# ===============================
# メインスクレイピング処理
# ===============================
def scrape_car(driver, wait, TOP_URL, visited_minor_urls):
    """Scrape every model, minor change and grade of one car and save it.

    Args:
        driver: Browser driver used to load pages.
        wait: ``WebDriverWait`` bound to ``driver``.
        TOP_URL: Top catalog URL of the car.
        visited_minor_urls: Set of minor-change URLs already scraped; it is
            updated in place and used to skip duplicates.

    Returns:
        dict: ``{"maker", "car", "historical_models"}`` where
        ``historical_models`` maps model URL -> ``{"minor_changes": {minor
        URL: {grade URL: scraped spec}}}``.
    """
    maker, car = parse_maker_car(TOP_URL)
    print(f"[INFO] メーカー: {maker}, 車種: {car}")

    models = {}
    result = {
        "maker": maker,
        "car": car,
        "historical_models": models,
    }

    for model_url in get_historical_models(driver, wait, TOP_URL):
        minors = {}
        models[model_url] = {"minor_changes": minors}

        for minor_url in get_minor_changes(driver, wait, model_url):
            if minor_url in visited_minor_urls:
                print(f"[SKIP] 重複マイナー: {minor_url}")
                continue

            grade_specs = {}
            minors[minor_url] = grade_specs

            for grade_url in get_grade_links(driver, wait, minor_url):
                grade_specs[grade_url] = scrape_grade_page(driver, wait, grade_url)

            # Marked as visited only after all of its grades were processed.
            visited_minor_urls.add(minor_url)

    save_car_json(maker, car, result)
    return result


# ===============================
# 実行
# ===============================
if __name__ == "__main__":
    # Entry point: scrape every URL in `all_data` with one shared browser.

    driver = create_driver()
    wait = WebDriverWait(driver, 20)

    visited_minor_urls = set()
    all_results = {}

    try:
        for maker, urls in all_data.items():
            for t in urls:
                print("\n============================")
                print(f"[START] {t}")
                print("============================")

                try:
                    all_results[t] = scrape_car(driver, wait, t, visited_minor_urls)
                except Exception as e:
                    # One failing car must not stop the whole run.
                    print(f"[ERROR] {t}: {e}")
    finally:
        # Always close the browser, including on Ctrl+C, so that no Chrome
        # process is left running.
        driver.quit()

In [None]:
'''
import json
import re

# JSONファイル読み込み
with open("car_url.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# FMC-MC パターン（例: FMC001-MC005）
pattern = re.compile(r'/FMC\d{3}-MC\d{3}/?$')

cleaned_data = {}

for maker, urls in data.items():
    cleaned_urls = []
    
    for url in urls:
        # FMCxxx-MCxxx を削除
        new_url = re.sub(pattern, '/', url)
        cleaned_urls.append(new_url)
    
    # 重複削除（順序維持）
    unique_urls = list(dict.fromkeys(cleaned_urls))
    
    cleaned_data[maker] = unique_urls

# 保存（上書きしたくない場合は別名に）
with open("car_url_cleaned.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f, ensure_ascii=False, indent=2)

print("処理が完了しました！")
'''

処理が完了しました！


----------------

In [5]:
# lexus
# Top catalog URLs of the Lexus models to scrape, keyed by maker name.
lexus_urls = {
    "lexus": [
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/es_hybrid/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/gx/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/is/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/is_hybrid/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/lbx/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/lc/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/lc_convertible/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/lc_hybrid/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/lm/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/ls/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/ls-hybrid/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/lx/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/nx/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/nx_phev/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/nx_hybrid/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/rc/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/rc_f/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/rc_hybrid/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/rx/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/rx_phev/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/rz/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/ux_ev/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/ux_hybrid/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/ct/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/gs/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/gs_f/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/gs_hybrid/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/hs_hybrid/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/is-c/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/is-f/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/lfa/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/rx_hybrid/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/sc/",
    "https://carview.yahoo.co.jp/ncar/catalog/lexus/ux/"]
}
import time
import random
import re
import json
import os
from urllib.parse import urlparse

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc

BASE_URL = "https://carview.yahoo.co.jp"

# Paths disallowed by robots.txt.
# NOTE: the chiebukuro entry contains a `*` wildcard.
ROBOTS_DISALLOW = [
    "/news/detail/",
    "/article/detail/",
    "/article/comment",
    "/ncar/catalog/*/chiebukuro/",
    "/article/countUp/"
]

# =========================================================
# Driver 作成
# =========================================================
def create_driver():
    """Build an undetected-chromedriver Chrome instance for scraping.

    The browser is started with a fixed set of command-line flags and a
    desktop Chrome user-agent string, and its page-load timeout is set to
    30 seconds.

    Returns:
        The configured ``uc.Chrome`` driver.
    """
    user_agent_flag = (
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
    chrome_flags = (
        "--disable-gpu",
        "--no-sandbox",
        "--disable-blink-features=AutomationControlled",
        "--disable-dev-shm-usage",
        user_agent_flag,
    )

    chrome_options = uc.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    browser = uc.Chrome(options=chrome_options)
    browser.set_page_load_timeout(30)
    return browser

# =========================================================
# robots.txt 判定
# =========================================================
def is_disallowed(url, patterns=None):
    """Return True when the path of ``url`` matches a robots.txt disallow rule.

    A rule matches when it occurs anywhere in the URL path. ``*`` inside a
    rule matches any run of characters, so ``/ncar/catalog/*/chiebukuro/``
    matches ``/ncar/catalog/lexus/chiebukuro/``. Plain substring matching
    could never match such a rule because the literal ``*`` does not appear
    in real paths.

    Args:
        url: Absolute or relative URL to check.
        patterns: Optional iterable of disallow rules; defaults to the
            module-level ``ROBOTS_DISALLOW``.

    Returns:
        bool: True if the URL must not be fetched.
    """
    if patterns is None:
        patterns = ROBOTS_DISALLOW

    path = urlparse(url).path
    for pattern in patterns:
        # The literal pieces between wildcards must appear in order.
        pos = 0
        for piece in pattern.split("*"):
            idx = path.find(piece, pos)
            if idx < 0:
                break
            pos = idx + len(piece)
        else:
            return True
    return False

# =========================================================
# 安全な GET
# =========================================================
def safe_get(driver, url, retries=5, wait_minutes=8):
    """Load ``url`` in ``driver`` with robots.txt, block and error handling.

    Args:
        driver: Selenium-style driver exposing ``get`` and ``page_source``.
        url: Page to load.
        retries: Maximum number of load attempts.
        wait_minutes: Minutes to pause when the Yahoo! access-restriction
            page is detected, before the same URL is tried again.

    Returns:
        bool: True when a normal page was loaded; False when the URL is
        disallowed by robots.txt or every attempt failed or was blocked.
    """
    if is_disallowed(url):
        print(f"[ROBOTS] disallow対象のためスキップ: {url}")
        return False

    for i in range(retries):
        try:
            driver.get(url)
            # Randomised pause after each load to keep the request rate low.
            time.sleep(random.uniform(3.5, 5.5))
            html = driver.page_source

            # Access-restriction page detection. The former extra <title>
            # check was redundant: that title contains both substrings below.
            blocked = "現在表示できません" in html and "Yahoo! JAPAN" in html

            if blocked:
                timestamp = time.strftime('%H:%M:%S')
                print(f"\n[BLOCKED] Yahoo! アクセス制限: {url}")
                print(f"[INFO] {timestamp} → {wait_minutes} 分待機\n")

                # Wait in one-second steps, then retry the same URL.
                for _ in range(wait_minutes * 60):
                    time.sleep(1)

                continue

            return True

        except Exception as e:
            print(f"[WARN] 取得失敗 ({i+1}/{retries}): {url} - {e}")
            # Linear backoff: 20 s, 40 s, ... (previously `i * 20`, which
            # retried the first failure with no pause at all).
            sleep_time = (i + 1) * 20
            print(f"[BACKOFF] {sleep_time} 秒待機")
            time.sleep(sleep_time)

    return False

# =========================================================
# URL から maker/car を抽出
# =========================================================
def parse_maker_car(url):
    """Return ``(maker, car)`` from the third and fourth path segments of ``url``.

    Example: ``.../ncar/catalog/lexus/es_hybrid/`` -> ``("lexus", "es_hybrid")``.

    Raises:
        IndexError: if the path has fewer than four segments.
    """
    segments = urlparse(url).path.strip("/").split("/")
    return segments[2], segments[3]

# =========================================================
# 歴代モデル URL 取得
# =========================================================
def get_historical_models(driver, wait, TOP_URL):
    """Collect the historical-model links shown on a car's top page.

    Args:
        driver: Browser driver used to load the page.
        wait: ``WebDriverWait`` bound to ``driver``.
        TOP_URL: Top catalog URL of the car.

    Returns:
        list[str]: hrefs found under ``#mdl_his`` whose last path segment
        starts with ``FMC``; an empty list when the page cannot be loaded
        or the section does not appear.
    """
    if not safe_get(driver, TOP_URL):
        return []

    try:
        elems = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "#mdl_his a.p-list_landscape--item--link")
        ))
        urls = [e.get_attribute("href") for e in elems]
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl+C
        # (KeyboardInterrupt) still stops a long scraping run.
        print("[WARN] 歴代モデルが見つかりません")
        return []

    # Anchors without an href yield None; skip them instead of letting a
    # single bad anchor discard every link.
    return [
        url for url in urls
        if url and url.rstrip("/").split("/")[-1].startswith("FMC")
    ]

# =========================================================
# マイナーチェンジ URL 取得
# =========================================================
def get_minor_changes(driver, wait, model_url):
    """Collect the minor-change links listed on a model page.

    Args:
        driver: Browser driver used to load the page.
        wait: ``WebDriverWait`` bound to ``driver``.
        model_url: URL of a historical-model page.

    Returns:
        list: href values of the anchors under ``div#minr_his``; an empty
        list when the page cannot be loaded or the section does not appear.
    """
    if not safe_get(driver, model_url):
        return []

    try:
        elems = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "div#minr_his a.p-list_landscape--item--link")
        ))
        return [e.get_attribute("href") for e in elems]

    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl+C
        # (KeyboardInterrupt) is not swallowed during a long run.
        print(f"[WARN] マイナーチェンジが見つかりません: {model_url}")
        return []

# =========================================================
# グレード名 & URL を取得（今回書き換え部分）
# =========================================================
def get_grade_links(driver, wait, minor_url):
    """Collect grade name, link and id for every grade on a minor-change page.

    Args:
        driver: Browser driver used to load the page.
        wait: ``WebDriverWait`` bound to ``driver``.
        minor_url: URL of a minor-change page.

    Returns:
        list[dict]: one ``{"grade_name", "href", "grade_id"}`` dict per
        matching anchor (``grade_id`` is None when the href has no
        ``/gradeid/<digits>`` part); an empty list when the page cannot be
        loaded or the lookup fails.
    """
    if not safe_get(driver, minor_url):
        return []

    try:
        # Grade anchors look like: <th><a href="...">300h(CVT_2.5)</a></th>
        anchors = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "th > a[href*='gradeid']")
        ))

        found = []
        for anchor in anchors:
            link = anchor.get_attribute("href")
            label = anchor.text.strip()
            id_match = re.search(r'/gradeid/(\d+)/?', link)
            found.append({
                "grade_name": label,
                "href": link,
                "grade_id": id_match.group(1) if id_match else None,
            })

        return found

    except Exception as err:
        print(f"[WARN] グレードが見つかりません: {minor_url} ({err})")
        return []

# =========================================================
# JSON 保存
# =========================================================
def save_car_json(maker, car, data):
    """Write ``data`` as pretty-printed UTF-8 JSON to ``scraped_data/{maker}_{car}.json``.

    The ``scraped_data`` directory is created when missing and an existing
    file is overwritten.
    """
    out_dir = "scraped_data"
    os.makedirs(out_dir, exist_ok=True)

    path = f"scraped_data/{maker}_{car}.json"
    with open(path, "w", encoding="utf-8") as out:
        json.dump(data, out, ensure_ascii=False, indent=2)

    print(f"[SAVE] {path}")

# =========================================================
# 車種スクレイピングのメイン処理
# =========================================================
def scrape_car(driver, wait, TOP_URL, visited_minor_urls):
    """Collect the grade list of every model and minor change of one car.

    Args:
        driver: Browser driver used to load pages.
        wait: ``WebDriverWait`` bound to ``driver``.
        TOP_URL: Top catalog URL of the car.
        visited_minor_urls: Set of minor-change URLs already handled; it is
            updated in place and used to skip duplicates.

    Returns:
        dict: ``{"maker", "car", "historical_models"}`` where
        ``historical_models`` maps model URL -> ``{"minor_changes": {minor
        URL: {"grades": [...]}}}``.
    """
    maker, car = parse_maker_car(TOP_URL)
    print(f"[INFO] メーカー: {maker}, 車種: {car}")

    models = {}
    for model_url in get_historical_models(driver, wait, TOP_URL):
        minors = {}
        models[model_url] = {"minor_changes": minors}

        for minor_url in get_minor_changes(driver, wait, model_url):
            if minor_url in visited_minor_urls:
                print(f"[SKIP] 重複マイナー: {minor_url}")
                continue

            # Marked as visited before the grades are fetched.
            visited_minor_urls.add(minor_url)
            minors[minor_url] = {"grades": get_grade_links(driver, wait, minor_url)}

    # NOTE: the per-car save_car_json(maker, car, result) call is disabled
    # here; the result is only returned.
    return {
        "maker": maker,
        "car": car,
        "historical_models": models,
    }

# =========================================================
# メイン
# =========================================================
if __name__ == "__main__":
    # Entry point: scrape every URL in `lexus_urls` with one shared browser.
    driver = create_driver()
    wait = WebDriverWait(driver, 20)
    visited_minor_urls = set()
    all_results = {}
    try:
        for maker, urls in lexus_urls.items():
            for t in urls:
                print("\n============================")
                print(f"[START] {t}")
                print("============================")
                try:
                    all_results[t] = scrape_car(driver, wait, t, visited_minor_urls)
                except Exception as e:
                    # One failing car must not stop the whole run.
                    print(f"[ERROR] {t}: {e}")
    finally:
        # Always close the browser, including on Ctrl+C, so that no Chrome
        # process is left running.
        driver.quit()


[START] https://carview.yahoo.co.jp/ncar/catalog/lexus/es_hybrid/
[INFO] メーカー: lexus, 車種: es_hybrid

[START] https://carview.yahoo.co.jp/ncar/catalog/lexus/gx/
[INFO] メーカー: lexus, 車種: gx

[START] https://carview.yahoo.co.jp/ncar/catalog/lexus/is/
[INFO] メーカー: lexus, 車種: is

[START] https://carview.yahoo.co.jp/ncar/catalog/lexus/is_hybrid/
[INFO] メーカー: lexus, 車種: is_hybrid

[START] https://carview.yahoo.co.jp/ncar/catalog/lexus/lbx/
[INFO] メーカー: lexus, 車種: lbx

[START] https://carview.yahoo.co.jp/ncar/catalog/lexus/lc/
[INFO] メーカー: lexus, 車種: lc

[START] https://carview.yahoo.co.jp/ncar/catalog/lexus/lc_convertible/
[INFO] メーカー: lexus, 車種: lc_convertible

[START] https://carview.yahoo.co.jp/ncar/catalog/lexus/lc_hybrid/
[INFO] メーカー: lexus, 車種: lc_hybrid

[START] https://carview.yahoo.co.jp/ncar/catalog/lexus/lm/
[INFO] メーカー: lexus, 車種: lm

[START] https://carview.yahoo.co.jp/ncar/catalog/lexus/ls/
[INFO] メーカー: lexus, 車種: ls

[START] https://carview.yahoo.co.jp/ncar/catalog/lexus/ls-hyb

In [6]:
import json

# Persist the combined scrape results held in `all_results`.
# The path previously read "...\py_codelexus.json": the separator between the
# "py_code" directory and the file name was missing, so the file was written
# into the parent folder under the wrong name.
with open(r"D:\GitHub_Repository\Research\ma_thesis\py_code\lexus.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2)

In [4]:
# phev
# Top catalog URLs of the Toyota models to scrape in this run, keyed by maker name.
phev_urls = {
    "toyota": [
    "https://carview.yahoo.co.jp/ncar/catalog/toyota/harrier_hybrid/",
    "https://carview.yahoo.co.jp/ncar/catalog/toyota/prius/",
    "https://carview.yahoo.co.jp/ncar/catalog/toyota/rav4_hybrid/",
    "https://carview.yahoo.co.jp/ncar/catalog/toyota/crown_crossover/",
    "https://carview.yahoo.co.jp/ncar/catalog/toyota/crown_sport/"
]
}
import time
import random
import re
import json
import os
from urllib.parse import urlparse

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc

BASE_URL = "https://carview.yahoo.co.jp"

# Paths disallowed by robots.txt.
# NOTE: the chiebukuro entry contains a `*` wildcard.
ROBOTS_DISALLOW = [
    "/news/detail/",
    "/article/detail/",
    "/article/comment",
    "/ncar/catalog/*/chiebukuro/",
    "/article/countUp/"
]

# =========================================================
# Driver 作成
# =========================================================
def create_driver():
    """Build an undetected-chromedriver Chrome instance for scraping.

    The browser is started with a fixed set of command-line flags and a
    desktop Chrome user-agent string, and its page-load timeout is set to
    30 seconds.

    Returns:
        The configured ``uc.Chrome`` driver.
    """
    user_agent_flag = (
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
    chrome_flags = (
        "--disable-gpu",
        "--no-sandbox",
        "--disable-blink-features=AutomationControlled",
        "--disable-dev-shm-usage",
        user_agent_flag,
    )

    chrome_options = uc.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    browser = uc.Chrome(options=chrome_options)
    browser.set_page_load_timeout(30)
    return browser

# =========================================================
# robots.txt 判定
# =========================================================
def is_disallowed(url, patterns=None):
    """Return True when the path of ``url`` matches a robots.txt disallow rule.

    A rule matches when it occurs anywhere in the URL path. ``*`` inside a
    rule matches any run of characters, so ``/ncar/catalog/*/chiebukuro/``
    matches ``/ncar/catalog/lexus/chiebukuro/``. Plain substring matching
    could never match such a rule because the literal ``*`` does not appear
    in real paths.

    Args:
        url: Absolute or relative URL to check.
        patterns: Optional iterable of disallow rules; defaults to the
            module-level ``ROBOTS_DISALLOW``.

    Returns:
        bool: True if the URL must not be fetched.
    """
    if patterns is None:
        patterns = ROBOTS_DISALLOW

    path = urlparse(url).path
    for pattern in patterns:
        # The literal pieces between wildcards must appear in order.
        pos = 0
        for piece in pattern.split("*"):
            idx = path.find(piece, pos)
            if idx < 0:
                break
            pos = idx + len(piece)
        else:
            return True
    return False

# =========================================================
# 安全な GET
# =========================================================
def safe_get(driver, url, retries=5, wait_minutes=8):
    """Load ``url`` in ``driver`` with robots.txt, block and error handling.

    Args:
        driver: Selenium-style driver exposing ``get`` and ``page_source``.
        url: Page to load.
        retries: Maximum number of load attempts.
        wait_minutes: Minutes to pause when the Yahoo! access-restriction
            page is detected, before the same URL is tried again.

    Returns:
        bool: True when a normal page was loaded; False when the URL is
        disallowed by robots.txt or every attempt failed or was blocked.
    """
    if is_disallowed(url):
        print(f"[ROBOTS] disallow対象のためスキップ: {url}")
        return False

    for i in range(retries):
        try:
            driver.get(url)
            # Randomised pause after each load to keep the request rate low.
            time.sleep(random.uniform(3.5, 5.5))
            html = driver.page_source

            # Access-restriction page detection. The former extra <title>
            # check was redundant: that title contains both substrings below.
            blocked = "現在表示できません" in html and "Yahoo! JAPAN" in html

            if blocked:
                timestamp = time.strftime('%H:%M:%S')
                print(f"\n[BLOCKED] Yahoo! アクセス制限: {url}")
                print(f"[INFO] {timestamp} → {wait_minutes} 分待機\n")

                # Wait in one-second steps, then retry the same URL.
                for _ in range(wait_minutes * 60):
                    time.sleep(1)

                continue

            return True

        except Exception as e:
            print(f"[WARN] 取得失敗 ({i+1}/{retries}): {url} - {e}")
            # Linear backoff: 20 s, 40 s, ... (previously `i * 20`, which
            # retried the first failure with no pause at all).
            sleep_time = (i + 1) * 20
            print(f"[BACKOFF] {sleep_time} 秒待機")
            time.sleep(sleep_time)

    return False

# =========================================================
# URL から maker/car を抽出
# =========================================================
def parse_maker_car(url):
    """Return ``(maker, car)`` from the third and fourth path segments of ``url``.

    Example: ``.../ncar/catalog/toyota/prius/`` -> ``("toyota", "prius")``.

    Raises:
        IndexError: if the path has fewer than four segments.
    """
    segments = urlparse(url).path.strip("/").split("/")
    return segments[2], segments[3]

# =========================================================
# 歴代モデル URL 取得
# =========================================================
def get_historical_models(driver, wait, TOP_URL):
    """Collect the historical-model links shown on a car's top page.

    Args:
        driver: Browser driver used to load the page.
        wait: ``WebDriverWait`` bound to ``driver``.
        TOP_URL: Top catalog URL of the car.

    Returns:
        list[str]: hrefs found under ``#mdl_his`` whose last path segment
        starts with ``FMC``; an empty list when the page cannot be loaded
        or the section does not appear.
    """
    if not safe_get(driver, TOP_URL):
        return []

    try:
        elems = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "#mdl_his a.p-list_landscape--item--link")
        ))
        urls = [e.get_attribute("href") for e in elems]
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl+C
        # (KeyboardInterrupt) still stops a long scraping run.
        print("[WARN] 歴代モデルが見つかりません")
        return []

    # Anchors without an href yield None; skip them instead of letting a
    # single bad anchor discard every link.
    return [
        url for url in urls
        if url and url.rstrip("/").split("/")[-1].startswith("FMC")
    ]

# =========================================================
# マイナーチェンジ URL 取得
# =========================================================
def get_minor_changes(driver, wait, model_url):
    """Collect the minor-change links listed on a model page.

    Args:
        driver: Browser driver used to load the page.
        wait: ``WebDriverWait`` bound to ``driver``.
        model_url: URL of a historical-model page.

    Returns:
        list: href values of the anchors under ``div#minr_his``; an empty
        list when the page cannot be loaded or the section does not appear.
    """
    if not safe_get(driver, model_url):
        return []

    try:
        elems = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "div#minr_his a.p-list_landscape--item--link")
        ))
        return [e.get_attribute("href") for e in elems]

    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl+C
        # (KeyboardInterrupt) is not swallowed during a long run.
        print(f"[WARN] マイナーチェンジが見つかりません: {model_url}")
        return []

# =========================================================
# グレード名 & URL を取得（今回書き換え部分）
# =========================================================
def get_grade_links(driver, wait, minor_url):
    """Collect grade name, link and id for every grade on a minor-change page.

    Args:
        driver: Browser driver used to load the page.
        wait: ``WebDriverWait`` bound to ``driver``.
        minor_url: URL of a minor-change page.

    Returns:
        list[dict]: one ``{"grade_name", "href", "grade_id"}`` dict per
        matching anchor (``grade_id`` is None when the href has no
        ``/gradeid/<digits>`` part); an empty list when the page cannot be
        loaded or the lookup fails.
    """
    if not safe_get(driver, minor_url):
        return []

    try:
        # Grade anchors look like: <th><a href="...">300h(CVT_2.5)</a></th>
        anchors = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "th > a[href*='gradeid']")
        ))

        found = []
        for anchor in anchors:
            link = anchor.get_attribute("href")
            label = anchor.text.strip()
            id_match = re.search(r'/gradeid/(\d+)/?', link)
            found.append({
                "grade_name": label,
                "href": link,
                "grade_id": id_match.group(1) if id_match else None,
            })

        return found

    except Exception as err:
        print(f"[WARN] グレードが見つかりません: {minor_url} ({err})")
        return []

# =========================================================
# JSON 保存
# =========================================================
def save_car_json(maker, car, data):
    """Write ``data`` as pretty-printed UTF-8 JSON to ``scraped_data/{maker}_{car}.json``.

    The ``scraped_data`` directory is created when missing and an existing
    file is overwritten.
    """
    out_dir = "scraped_data"
    os.makedirs(out_dir, exist_ok=True)

    path = f"scraped_data/{maker}_{car}.json"
    with open(path, "w", encoding="utf-8") as out:
        json.dump(data, out, ensure_ascii=False, indent=2)

    print(f"[SAVE] {path}")

# =========================================================
# 車種スクレイピングのメイン処理
# =========================================================
def scrape_car(driver, wait, TOP_URL, visited_minor_urls):
    """Collect the grade list of every model and minor change of one car.

    Args:
        driver: Browser driver used to load pages.
        wait: ``WebDriverWait`` bound to ``driver``.
        TOP_URL: Top catalog URL of the car.
        visited_minor_urls: Set of minor-change URLs already handled; it is
            updated in place and used to skip duplicates.

    Returns:
        dict: ``{"maker", "car", "historical_models"}`` where
        ``historical_models`` maps model URL -> ``{"minor_changes": {minor
        URL: {"grades": [...]}}}``.
    """
    maker, car = parse_maker_car(TOP_URL)
    print(f"[INFO] メーカー: {maker}, 車種: {car}")

    models = {}
    for model_url in get_historical_models(driver, wait, TOP_URL):
        minors = {}
        models[model_url] = {"minor_changes": minors}

        for minor_url in get_minor_changes(driver, wait, model_url):
            if minor_url in visited_minor_urls:
                print(f"[SKIP] 重複マイナー: {minor_url}")
                continue

            # Marked as visited before the grades are fetched.
            visited_minor_urls.add(minor_url)
            minors[minor_url] = {"grades": get_grade_links(driver, wait, minor_url)}

    # NOTE: the per-car save_car_json(maker, car, result) call is disabled
    # here; the result is only returned.
    return {
        "maker": maker,
        "car": car,
        "historical_models": models,
    }

# =========================================================
# メイン
# =========================================================
if __name__ == "__main__":
    # Entry point: scrape every URL in `phev_urls` with one shared browser.
    driver = create_driver()
    wait = WebDriverWait(driver, 20)
    visited_minor_urls = set()
    all_results = {}
    try:
        for maker, urls in phev_urls.items():
            for t in urls:
                print("\n============================")
                print(f"[START] {t}")
                print("============================")
                try:
                    all_results[t] = scrape_car(driver, wait, t, visited_minor_urls)
                except Exception as e:
                    # One failing car must not stop the whole run.
                    print(f"[ERROR] {t}: {e}")
    finally:
        # Always close the browser, including on Ctrl+C, so that no Chrome
        # process is left running.
        driver.quit()


[START] https://carview.yahoo.co.jp/ncar/catalog/toyota/harrier_hybrid/
[INFO] メーカー: toyota, 車種: harrier_hybrid

[START] https://carview.yahoo.co.jp/ncar/catalog/toyota/prius/
[INFO] メーカー: toyota, 車種: prius

[START] https://carview.yahoo.co.jp/ncar/catalog/toyota/rav4_hybrid/
[INFO] メーカー: toyota, 車種: rav4_hybrid

[START] https://carview.yahoo.co.jp/ncar/catalog/toyota/crown_crossover/
[INFO] メーカー: toyota, 車種: crown_crossover

[START] https://carview.yahoo.co.jp/ncar/catalog/toyota/crown_sport/
[INFO] メーカー: toyota, 車種: crown_sport


In [5]:
import json

# Persist the combined scrape results held in `all_results`.
# The path previously read "...\py_codephev.json": the separator between the
# "py_code" directory and the file name was missing, so the file was written
# into the parent folder under the wrong name.
with open(r"D:\GitHub_Repository\Research\ma_thesis\py_code\phev.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2)

In [None]:
# LOAD URL JSON FILE
import json
file_path = r"D:\GitHub_Repository\Research\ma_thesis\data\car_url_cleaned.json"

# Load the JSON file into a dict.
with open(file_path, "r", encoding="utf-8") as f:
    all_data = json.load(f)

# Japanese maker name -> English key used to rename the keys of `all_data`.
# NOTE(review): the English values look like the maker slugs used in the
# carview catalog URLs — confirm.
MAKER_MAP = {
    "トヨタ": "toyota", "レクサス": "lexus", "日産": "nissan", "ホンダ": "honda",
    "マツダ": "mazda", "スバル": "subaru", "スズキ": "suzuki", "ダイハツ": "daihatsu",
    "三菱": "mitsubishi", "いすゞ": "isuzu", "メルセデス・ベンツ": "mercedes-benz",
    "メルセデスAMG": "mercedes_amg", "BMW": "bmw", "ミニ": "mini",
    "フォルクスワーゲン": "volkswagen", "アウディ": "audi", "ポルシェ": "porsche",
    "スマート": "smart", "BMWアルピナ": "bmwalpina", "プジョー": "peugeot",
    "ルノー": "renault", "シトロエン": "citroen", "DSオートモビル": "ds_automobile",
    "アルピーヌ": "alpine", "ブガッティ": "bugatti", "フィアット": "fiat",
    "アバルト": "abarth", "アルファロメオ": "alfaromeo", "フェラーリ": "ferrari",
    "ランボルギーニ": "lamborghini", "マセラティ": "maserati", "ジャガー": "jaguar",
    "ランドローバー": "landrover", "アストンマーティン": "astonmartin",
    "ロールスロイス": "rolls-royce", "ベントレー": "bentley",
    "マクラーレン": "mclaren", "ロータス": "lotus", "ケータハム": "caterham",
    "モーガン": "morgan", "ジープ": "jeep", "キャデラック": "cadillac",
    "シボレー": "chevrolet", "テスラ": "tesla", "フォード": "ford",
    "リンカーン": "lincoln", "ハマー": "hummer", "ボルボ": "volvo",
    "ヒョンデ": "hyundai", "BYD": "byd",
}
# Rename the Japanese maker keys of `all_data` to their English equivalents.
for jp_key, en_key in MAKER_MAP.items():
    if jp_key not in all_data:
        continue

    jp_urls = all_data.pop(jp_key)
    if en_key in all_data:
        # The English key already exists: merge instead of overwriting
        # (the previous code replaced the existing entry and lost its URLs).
        # Order is kept and duplicates are dropped.
        # Assumes both values are lists of URL strings — TODO confirm.
        all_data[en_key] = list(dict.fromkeys([*all_data[en_key], *jp_urls]))
    else:
        all_data[en_key] = jp_urls

# Inspect what was loaded.
print(type(all_data))  # <class 'dict'>
print(list(all_data.keys())[:5])

In [None]:
# GET EVERY MODEL URL OF EVERY MAKER 
import time
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# =============================
# 除外するメーカーURL一覧
# =============================
# Maker catalog pages that must be skipped, built from the maker path segment.
EXCLUDE_URLS = {
    f"https://carview.yahoo.co.jp/ncar/catalog/{maker_slug}/"
    for maker_slug in (
        "mitsuoka",
        "mercedes_maybach",
        "opel",
        "amg",
        "maybach",
        "lancia",
        "rover",
        "mg",
        "daimler",
        "tvr",
        "saturn",
        "chrysler",
        "dodge",
        "saleen",
        "donkervoort",
        "saab",
    )
}


# =============================
# メーカーURL取得関数
# =============================
def get_all_manufacturer_urls():
    """Collect the catalog URL of every maker listed on the maker-list page.

    Opens https://carview.yahoo.co.jp/ncar/catalog/makerlist/ in a headless
    Chrome session, reads every maker link, and skips the URLs listed in
    EXCLUDE_URLS.

    Returns:
        list[tuple[str, str]]: ``(link text, absolute URL)`` for each maker
        that was kept, in the order the links were found.

    Raises:
        Exception: whatever the driver raises while loading or reading the
            page; the browser is always shut down before it propagates.
    """
    base_url = "https://carview.yahoo.co.jp"
    makerlist_url = f"{base_url}/ncar/catalog/makerlist/"

    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-blink-features=AutomationControlled")

    driver = uc.Chrome(options=options)
    try:
        driver.set_page_load_timeout(30)

        print(f"[INFO] メーカー一覧ページにアクセス中: {makerlist_url}")
        driver.get(makerlist_url)
        # Fixed pause before reading the links (presumably lets the list render).
        time.sleep(5)

        maker_elements = driver.find_elements(By.CSS_SELECTOR, "li.maker_country--list--item a")
        print(f"[INFO] 検出したメーカー数: {len(maker_elements)}")

        manufacturer_urls = []
        for elem in maker_elements:
            href = elem.get_attribute("href")
            name = elem.text.strip()
            if not href:
                continue
            # Normalise site-relative links to absolute URLs before comparing.
            if href.startswith("/"):
                href = base_url + href
            if href in EXCLUDE_URLS:
                print(f"[INFO] 除外メーカー: {name} ({href})")
                continue
            manufacturer_urls.append((name, href))
    finally:
        # Always release the browser, even when loading or scraping fails;
        # otherwise a Chrome process is leaked on every error.
        driver.quit()

    print(f"✅ 有効メーカー数: {len(manufacturer_urls)}")
    return manufacturer_urls

# =============================
# モデルURL取得関連関数
# =============================
def click_until_disappear_within(driver, container_elem, button_css, pause=1.0, max_loops=50):
    """Repeatedly click a button inside a container until it goes away.

    Each round looks up ``button_css`` inside ``container_elem``; the loop
    stops when the button's computed ``display`` is ``"none"``, when any
    step raises (e.g. the button can no longer be found), or after
    ``max_loops`` clicks.

    Args:
        driver: WebDriver used to run the JavaScript style check and click.
        container_elem: Element the button is searched within.
        button_css: CSS selector of the button.
        pause: Seconds to sleep after each click.
        max_loops: Upper bound on the number of clicks.

    Returns:
        int: Number of clicks that were performed.
    """
    clicks = 0
    while clicks < max_loops:
        try:
            button = container_elem.find_element(By.CSS_SELECTOR, button_css)
            shown_as = driver.execute_script(
                "return window.getComputedStyle(arguments[0]).display;", button
            )
            if shown_as == "none":
                break
            # Click through JavaScript rather than a native click.
            driver.execute_script("arguments[0].click();", button)
            time.sleep(pause)
        except Exception:
            # Best effort: any failure simply ends the clicking.
            break
        else:
            clicks += 1
    return clicks


def collect_urls_from_visible_container(driver, container_elems):
    """Gather model-link hrefs from the containers currently shown as blocks.

    A container is used only when its computed ``display`` is exactly
    ``"block"``; containers whose style check raises are skipped.

    Args:
        driver: WebDriver used to evaluate the computed style.
        container_elems: Candidate container elements.

    Returns:
        list[str]: Non-empty ``href`` values in encounter order (duplicates
        are not removed).
    """
    link_selector = (
        "ul.model_list--list a[href^='/ncar/catalog/'], "
        "ul.model_list--list a[href^='https://carview.yahoo.co.jp/ncar/catalog/']"
    )

    found = []
    for container in container_elems:
        try:
            display = driver.execute_script(
                "return window.getComputedStyle(arguments[0]).display;", container
            )
        except Exception:
            continue
        if display == "block":
            links = container.find_elements(By.CSS_SELECTOR, link_selector)
            hrefs = (link.get_attribute("href") for link in links)
            found.extend(href for href in hrefs if href)
    return found


def get_model_urls(manufacturer_url, headless=True):
    """Return every model URL listed on a maker page.

    Walks the first two tabs under ``#ctl_tab`` (on sale = 1, discontinued = 2),
    expands each visible list by clicking its "more" button until it goes
    away, and gathers the model links.

    Args:
        manufacturer_url: URL of the maker's catalog page.
        headless: Run Chrome with ``--headless=new`` when true.

    Returns:
        list[str]: Model URLs in first-seen order with duplicates removed.
    """
    chrome_opts = Options()
    if headless:
        chrome_opts.add_argument("--headless=new")
    for flag in (
        "--disable-gpu",
        "--no-sandbox",
        "--disable-blink-features=AutomationControlled",
        "--disable-dev-shm-usage",
    ):
        chrome_opts.add_argument(flag)

    driver = uc.Chrome(options=chrome_opts)
    driver.set_page_load_timeout(30)
    wait = WebDriverWait(driver, 20)

    try:
        print(f"[INFO] 開始: {manufacturer_url}")
        driver.get(manufacturer_url)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.model_list")))
        time.sleep(1.0)

        collected = []

        for tab_index in (1, 2):
            # Switching tabs is best effort: if the tab cannot be clicked,
            # whatever is currently visible is scraped instead.
            try:
                tab_link = driver.find_element(
                    By.CSS_SELECTOR, f"#ctl_tab li:nth-child({tab_index}) a"
                )
                driver.execute_script("arguments[0].click();", tab_link)
                time.sleep(1.0)
            except Exception:
                pass

            try:
                containers = driver.find_elements(By.CSS_SELECTOR, "div.model_list--body--inner")
            except Exception:
                containers = []

            # Expand every container that is currently displayed as a block.
            more_clicks = 0
            for container in containers:
                try:
                    display = driver.execute_script(
                        "return window.getComputedStyle(arguments[0]).display;", container
                    )
                except Exception:
                    continue
                if display == "block":
                    more_clicks += click_until_disappear_within(
                        driver, container, "a.model_list--btn--more", pause=1.0, max_loops=60
                    )

            tab_urls = collect_urls_from_visible_container(driver, containers)
            print(f"[INFO] タブ{tab_index}（販売中=1/販売終了=2）: {len(tab_urls)} 件 (more clicked {more_clicks} 回)")
            collected += tab_urls

        # dict keys keep insertion order, so this drops repeats while
        # preserving the first occurrence of each URL.
        unique = list(dict.fromkeys(collected))

        print(f"[INFO] 合計モデル数(重複除去): {len(unique)}")
        return unique

    finally:
        driver.quit()


# =============================
# メイン実行部
# =============================
if __name__ == "__main__":
    # This cell does not import json itself; import it here so the save step
    # does not depend on another cell having run first.
    import json

    manufacturers = get_all_manufacturer_urls()

    # Maker name (as shown on the page) -> list of model URLs.
    all_data = {}
    for name, url in manufacturers:
        try:
            all_data[name] = get_model_urls(url)
        except Exception as e:
            # One failing maker must not abort the whole crawl; it is simply
            # left out of the results.
            print(f"[WARN] {name} のモデル取得中にエラー: {e}")

    # Print a per-maker summary of what was collected.
    print("\n==================== 取得結果サマリ ====================")
    for name, urls in all_data.items():
        print(f"{name}: {len(urls)} モデル")
    total = sum(len(urls) for urls in all_data.values())
    print(f"========================================================\n総モデル数: {total}")

    # Save inside the guard so the file is only written from data this run
    # actually collected.
    output_file = "car_url.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)

    print(f"[INFO] JSONファイルに保存しました: {output_file}")