In [3]:
import json
import os
import re
import pandas as pd

# JSONファイルが入っているフォルダ
FOLDER = r"D:\GitHub_Repository\Research\ma_thesis\data\car_data"   # ← ここだけ書き換えてください

ev_urls = []

# 全 JSON をループ
for file in os.listdir(FOLDER):
    if not file.endswith(".json"):
        continue
    
    filepath = os.path.join(FOLDER, file)
    
    # JSON読み込み
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except:
        print(f"読み込み失敗: {filepath}")
        continue
    
    # historical_models を探索
    hist = data.get("historical_models", {})

    for model_url, minor_dict in hist.items():

        minor_changes = minor_dict.get("minor_changes", {})

        for _, grades in minor_changes.items():

            for grade_url, contents in grades.items():

                # 基本情報を取得
                basic_info = contents.get("基本情報", {})
                engine_type = basic_info.get("エンジン区分", "")

                # EV だけを抽出
                if engine_type == "EV":
                    ev_urls.append({
                        "file": file,
                        "model_url": model_url,
                        "grade_url": grade_url,
                    })

# 結果をCSVに保存
df = pd.DataFrame(ev_urls)

In [4]:
urls = df['model_url'].to_list()

cleaned = set()

for url in urls:
    # FMCxxx-MCxxx の部分を削除
    new_url = re.sub(r'FMC\d+-MC\d+/?$', '', url)
    cleaned.add(new_url)

cleaned_list = list(cleaned)

print(cleaned_list)

['https://carview.yahoo.co.jp/ncar/catalog/nissan/leaf/', 'https://carview.yahoo.co.jp/ncar/catalog/suzuki/e_vitara/', 'https://carview.yahoo.co.jp/ncar/catalog/peugeot/e-2008/', 'https://carview.yahoo.co.jp/ncar/catalog/audi/a6_sportback_e_tron/', 'https://carview.yahoo.co.jp/ncar/catalog/rolls-royce/spectre/', 'https://carview.yahoo.co.jp/ncar/catalog/hyundai/kona/', 'https://carview.yahoo.co.jp/ncar/catalog/mercedes-benz/eqa/', 'https://carview.yahoo.co.jp/ncar/catalog/porsche/taycan_cross_turismo/', 'https://carview.yahoo.co.jp/ncar/catalog/peugeot/e-208/', 'https://carview.yahoo.co.jp/ncar/catalog/volkswagen/id_buzz/', 'https://carview.yahoo.co.jp/ncar/catalog/bmw/i4/', 'https://carview.yahoo.co.jp/ncar/catalog/nissan/clipper_ev/', 'https://carview.yahoo.co.jp/ncar/catalog/volkswagen/e_up/', 'https://carview.yahoo.co.jp/ncar/catalog/bmw/ix2/', 'https://carview.yahoo.co.jp/ncar/catalog/abarth/abarth_595e_cabriolet/', 'https://carview.yahoo.co.jp/ncar/catalog/mercedes-benz/eqs/', 'h

In [5]:
cleaned_list

['https://carview.yahoo.co.jp/ncar/catalog/nissan/leaf/',
 'https://carview.yahoo.co.jp/ncar/catalog/suzuki/e_vitara/',
 'https://carview.yahoo.co.jp/ncar/catalog/peugeot/e-2008/',
 'https://carview.yahoo.co.jp/ncar/catalog/audi/a6_sportback_e_tron/',
 'https://carview.yahoo.co.jp/ncar/catalog/rolls-royce/spectre/',
 'https://carview.yahoo.co.jp/ncar/catalog/hyundai/kona/',
 'https://carview.yahoo.co.jp/ncar/catalog/mercedes-benz/eqa/',
 'https://carview.yahoo.co.jp/ncar/catalog/porsche/taycan_cross_turismo/',
 'https://carview.yahoo.co.jp/ncar/catalog/peugeot/e-208/',
 'https://carview.yahoo.co.jp/ncar/catalog/volkswagen/id_buzz/',
 'https://carview.yahoo.co.jp/ncar/catalog/bmw/i4/',
 'https://carview.yahoo.co.jp/ncar/catalog/nissan/clipper_ev/',
 'https://carview.yahoo.co.jp/ncar/catalog/volkswagen/e_up/',
 'https://carview.yahoo.co.jp/ncar/catalog/bmw/ix2/',
 'https://carview.yahoo.co.jp/ncar/catalog/abarth/abarth_595e_cabriolet/',
 'https://carview.yahoo.co.jp/ncar/catalog/mercedes

In [None]:
all_data = cleaned_list

In [12]:
import json
from urllib.parse import urlparse

urls = cleaned_list

result = {}

for url in urls:
    path = urlparse(url).path.split("/")  # ['', 'ncar', 'catalog', maker, model, '']
    
    if len(path) >= 5:
        maker = path[3]  # メーカー名
    else:
        continue

    # 辞書に追加
    if maker not in result:
        result[maker] = []

    result[maker].append(url)

# JSONファイルに保存 (必要な場合)
with open("maker_urls.json", "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=2)

print(result)


{'nissan': ['https://carview.yahoo.co.jp/ncar/catalog/nissan/leaf/', 'https://carview.yahoo.co.jp/ncar/catalog/nissan/clipper_ev/', 'https://carview.yahoo.co.jp/ncar/catalog/nissan/sakura/', 'https://carview.yahoo.co.jp/ncar/catalog/nissan/ariya/', 'https://carview.yahoo.co.jp/ncar/catalog/nissan/e_nv200_van/', 'https://carview.yahoo.co.jp/ncar/catalog/nissan/e_nv200/'], 'suzuki': ['https://carview.yahoo.co.jp/ncar/catalog/suzuki/e_vitara/'], 'peugeot': ['https://carview.yahoo.co.jp/ncar/catalog/peugeot/e-2008/', 'https://carview.yahoo.co.jp/ncar/catalog/peugeot/e-208/'], 'audi': ['https://carview.yahoo.co.jp/ncar/catalog/audi/a6_sportback_e_tron/', 'https://carview.yahoo.co.jp/ncar/catalog/audi/a6_avant_e_tron/', 'https://carview.yahoo.co.jp/ncar/catalog/audi/q4_sportback_e_tron/', 'https://carview.yahoo.co.jp/ncar/catalog/audi/s6_sportback_e_tron/', 'https://carview.yahoo.co.jp/ncar/catalog/audi/rs_e_tron/', 'https://carview.yahoo.co.jp/ncar/catalog/audi/sq6_e_tron/', 'https://carvie

In [8]:
import time
import random
import json
import os
from urllib.parse import urlparse

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc

BASE_URL = "https://carview.yahoo.co.jp"

ROBOTS_DISALLOW = [
    "/news/detail/",
    "/article/detail/",
    "/article/comment",
    "/ncar/catalog/*/chiebukuro/",
    "/article/countUp/"
]


# ===============================
# Selenium 初期化
# ===============================
def create_driver():
    options = uc.ChromeOptions()
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
    driver = uc.Chrome(options=options)
    driver.set_page_load_timeout(30)
    return driver


# ===============================
# robots.txt チェック
# ===============================
def is_disallowed(url):
    path = urlparse(url).path
    for dis in ROBOTS_DISALLOW:
        if dis in path:
            return True
    return False


# ===============================
# 安全取得
# ===============================
def safe_get(driver, url, retries=5, wait_minutes=8):
    if is_disallowed(url):
        print(f"[ROBOTS] スキップ: {url}")
        return False

    for i in range(retries):
        try:
            driver.get(url)
            time.sleep(random.uniform(3.5, 5.5))
            html = driver.page_source

            # ブロック判定
            blocked = (
                "現在表示できません" in html and "Yahoo! JAPAN" in html
            ) or (
                "<title>Yahoo! JAPAN - ご覧になろうとしているページは現在表示できません。" in html
            )

            if blocked:
                print(f"[BLOCKED] {url}")
                time.sleep(wait_minutes * 60)
                continue

            return True

        except Exception as e:
            print(f"[WARN] {url} ({i+1}/{retries}) 失敗: {e}")
            time.sleep(i * 20)

    return False


# ===============================
# maker / car 抽出
# ===============================
def parse_maker_car(url):
    parts = urlparse(url).path.strip("/").split("/")
    maker = parts[2]
    car = parts[3]
    return maker, car


# ===============================
# 歴代モデル
# ===============================
def get_historical_models(driver, wait, TOP_URL):
    if not safe_get(driver, TOP_URL):
        return []

    try:
        elems = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "#mdl_his a.p-list_landscape--item--link")
        ))
        urls = [e.get_attribute("href") for e in elems]

        return [
            url for url in urls
            if url.rstrip("/").split("/")[-1].startswith("FMC")
        ]

    except:
        print("[WARN] 歴代モデルなし")
        return []


# ===============================
# マイナー
# ===============================
def get_minor_changes(driver, wait, model_url):
    if not safe_get(driver, model_url):
        return []

    try:
        elems = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "div#minr_his a.p-list_landscape--item--link")
        ))
        return [e.get_attribute("href") for e in elems]

    except:
        print(f"[WARN] マイナーチェンジなし: {model_url}")
        return []


# ===============================
# グレード
# ===============================
def get_grade_links(driver, wait, minor_url):
    if not safe_get(driver, minor_url):
        return []

    try:
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "div.grade_inform_mod_list--table--wrap a")
        ))
        elems = driver.find_elements(
            By.CSS_SELECTOR, "div.grade_inform_mod_list--table--wrap a"
        )
        return [e.get_attribute("href") for e in elems]

    except:
        print(f"[WARN] グレードなし: {minor_url}")
        return []


# ===============================
# 仕様情報スクレイプ
# ===============================
def scrape_grade_page(driver, wait, grade_url):
    if not safe_get(driver, grade_url):
        return {}

    data = {}
    try:
        wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "div.model_explanation--tbl table")
        ))
        tables = driver.find_elements(By.CSS_SELECTOR, "div.model_explanation--tbl")

        sections = [
            "基本情報", "寸法・重量", "エンジン・性能", "バッテリー・モーター性能",
            "タイヤ・足回り", "安全装備", "快適装備",
            "エクステリア", "インテリア"
        ]

        for section, tbl in zip(sections, tables):
            rows = tbl.find_elements(By.TAG_NAME, "tr")
            sec = {}
            for row in rows:
                ths = row.find_elements(By.TAG_NAME, "th")
                tds = row.find_elements(By.TAG_NAME, "td")
                for th, td in zip(ths, tds):
                    sec[th.text.strip()] = td.text.strip()
            data[section] = sec

    except Exception as e:
        print(f"[WARN] 仕様取得失敗 {grade_url}: {e}")

    return data


# ===============================
# メインスクレイピング
# ===============================
def scrape_car(driver, wait, TOP_URL, visited_minor_urls):
    maker, car = parse_maker_car(TOP_URL)
    print(f"[INFO] {maker}/{car}")

    result = {"maker": maker, "car": car, "historical_models": {}}

    historical_models = get_historical_models(driver, wait, TOP_URL)
    for model_url in historical_models:
        result["historical_models"][model_url] = {"minor_changes": {}}

        minor_changes = get_minor_changes(driver, wait, model_url)
        for minor_url in minor_changes:

            if minor_url in visited_minor_urls:
                continue
            visited_minor_urls.add(minor_url)

            result["historical_models"][model_url]["minor_changes"][minor_url] = {}

            grade_links = get_grade_links(driver, wait, minor_url)
            for grade_url in grade_links:
                result["historical_models"][model_url]["minor_changes"][minor_url][grade_url] = (
                    scrape_grade_page(driver, wait, grade_url)
                )

    return result


# ===============================
# 実行（EV専用）
# ===============================
if __name__ == "__main__":

    driver = create_driver()
    wait = WebDriverWait(driver, 20)

    visited_minor_urls = set()

    # ★★★ あなたが先ほど抽出した EV車の model_url リストをここに入れる ★★★
    # all_data = ["https://carview.yahoo.co.jp/ncar/catalog/...."]
    # 例：メーカーごと → フラットリストにしてよい
    # all_data = ["url1", "url2", ...]

    ev_results = {}  # すべてのEVデータをここに格納

    for url in all_data:
        print("\n============================")
        print(f"[START] {url}")
        print("============================")

        try:
            ev_results[url] = scrape_car(driver, wait, url, visited_minor_urls)
        except Exception as e:
            print(f"[ERROR] {url}: {e}")

    driver.quit()

    # ===============================
    # EV車のデータを1つの JSON に保存
    # ===============================
    with open("ev_cars.json", "w", encoding="utf-8") as f:
        json.dump(ev_results, f, ensure_ascii=False, indent=2)

    print("[DONE] EV車の情報を ev_cars.json に保存しました")



[START] h
[ERROR] h: list index out of range

[START] t
[ERROR] t: list index out of range

[START] t
[ERROR] t: list index out of range

[START] p
[ERROR] p: list index out of range

[START] s
[ERROR] s: list index out of range

[START] :
[ERROR] :: list index out of range

[START] /
[ERROR] /: list index out of range

[START] /
[ERROR] /: list index out of range

[START] c
[ERROR] c: list index out of range

[START] a
[ERROR] a: list index out of range

[START] r
[ERROR] r: list index out of range

[START] v
[ERROR] v: list index out of range

[START] i
[ERROR] i: list index out of range

[START] e
[ERROR] e: list index out of range

[START] w
[ERROR] w: list index out of range

[START] .
[ERROR] .: list index out of range

[START] y
[ERROR] y: list index out of range

[START] a
[ERROR] a: list index out of range

[START] h
[ERROR] h: list index out of range

[START] o
[ERROR] o: list index out of range

[START] o
[ERROR] o: list index out of range

[START] .
[ERROR] .: list index o