In [5]:
# -*- coding: utf-8 -*-
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Scrape article links mentioning "Generative AI" / "AI" from three game-dev
# news sites and save them (keyword, title, link) to an Excel workbook.
import re  # local import: this cell's import block does not bring in `re`

# 1) Prepare the Chrome driver
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # uncomment to run without opening a browser window
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# 2) Crawl targets
#    mode "partial": locate <a> tags by partial link text, once per keyword
#    mode "css":     collect every element matching "selector", filter afterwards
sites = [
    {"url": "https://www.gamedeveloper.com/", "mode": "partial"},
    {"url": "https://venturebeat.com/category/game-development/", "mode": "css", "selector": "h2 a"},
    {"url": "https://www.developer-tech.com/categories/developer-gaming/", "mode": "partial"},
]

# 3) Keywords to look for, most specific first so that a "Generative AI"
#    headline is labelled with that keyword rather than plain "AI".
KEYWORDS = ["Generative AI", "AI"]

# Whole-word, case-insensitive patterns. A plain substring test on the
# lower-cased title would accept "said", "raises", "training", ... for "AI".
# NOTE: compound tokens such as "OpenAI" are intentionally not matched.
_keyword_patterns = [
    (kw, re.compile(rf"\b{re.escape(kw)}\b", re.IGNORECASE)) for kw in KEYWORDS
]

# 4) Result storage
seen = set()    # hrefs already recorded (de-duplication)
results = []    # final rows

try:
    for site in sites:
        print(f"▶️  열기: {site['url']}")
        driver.get(site["url"])
        time.sleep(5)  # give dynamically loaded content time to render

        if site["mode"] == "css":
            elems = driver.find_elements(By.CSS_SELECTOR, site["selector"])
        else:  # mode == "partial"
            elems = []
            for kw in KEYWORDS:
                elems += driver.find_elements(By.PARTIAL_LINK_TEXT, kw)

        print(f"   → 발견된 후보 링크: {len(elems)}개")

        for a in elems:
            title = a.text.strip()
            href = a.get_attribute("href") or ""
            if not href or href in seen:
                continue
            # Re-check the visible title; the first matching keyword wins.
            matched = next(
                (kw for kw, pattern in _keyword_patterns if pattern.search(title)),
                None,
            )
            if matched is None:
                continue
            seen.add(href)
            results.append({"keyword": matched, "title": title, "link": href})
finally:
    # Always close the browser, even if a page load or lookup raised.
    driver.quit()

print(f"✅ 수집된 기사 수: {len(results)}개")

# 5) Save to Excel
df = pd.DataFrame(results, columns=["keyword", "title", "link"])
out_file = "gdc_generative_ai_articles.xlsx"
df.to_excel(out_file, index=False)
print(f"✅ '{out_file}' 에 저장되었습니다.")


▶️  열기: https://www.gamedeveloper.com/
   → 발견된 후보 링크: 1개
▶️  열기: https://venturebeat.com/category/game-development/
   → 발견된 후보 링크: 37개
▶️  열기: https://www.developer-tech.com/categories/developer-gaming/
   → 발견된 후보 링크: 5개
✅ 수집된 기사 수: 13개
✅ 'gdc_generative_ai_articles.xlsx' 에 저장되었습니다.


In [1]:
# -*- coding: utf-8 -*-
"""
Merged script to:
1. Collect "Generative AI" and "AI" articles from:
   - gamedeveloper.com
   - venturebeat.com/category/game-development/
   - developer-tech.com/categories/developer-gaming/
   Saves results to gdc_generative_ai_articles.xlsx

2. Collect "AI" and "Game" articles from TechCrunch (https://techcrunch.com/)
   Saves results to techcrunch_ai_game_articles.xlsx
"""

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

import re  # local import: this cell's import block does not bring in `re`

# 1) Prepare ChromeDriver and its options
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # Uncomment to run without opening a browser window
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# ===== Part 1: GDC sites (Generative AI & AI keywords) =====
# mode "partial": locate <a> tags by partial link text, once per keyword
# mode "css":     collect every element matching "selector", filter afterwards
sites = [
    {"url": "https://www.gamedeveloper.com/", "mode": "partial"},
    {"url": "https://venturebeat.com/category/game-development/", "mode": "css", "selector": "h2.article__title a"},
    {"url": "https://www.developer-tech.com/categories/developer-gaming/", "mode": "partial"},
]
# Most specific keyword first so it wins when labelling a title.
KEYWORDS = ["Generative AI", "AI"]

# Whole-word, case-insensitive patterns. A plain substring test on the
# lower-cased title would accept "said", "raises", "training", ... for "AI".
# NOTE: compound tokens such as "OpenAI" are intentionally not matched.
_keyword_patterns = [
    (kw, re.compile(rf"\b{re.escape(kw)}\b", re.IGNORECASE)) for kw in KEYWORDS
]
_ai_word = re.compile(r"\bAI\b", re.IGNORECASE)

gdc_results = []
seen = set()  # hrefs already recorded for Part 1

try:
    for site in sites:
        print(f"▶️  Opening: {site['url']}")
        driver.get(site["url"])
        time.sleep(5)  # Wait for dynamic content to load

        if site["mode"] == "css":
            elements = driver.find_elements(By.CSS_SELECTOR, site["selector"])
        else:
            elements = []
            for kw in KEYWORDS:
                elements += driver.find_elements(By.PARTIAL_LINK_TEXT, kw)

        print(f"   Found candidates: {len(elements)}")
        for a in elements:
            title = a.text.strip()
            href = a.get_attribute("href") or ""
            if not href or href in seen:
                continue
            keyword = next(
                (kw for kw, pattern in _keyword_patterns if pattern.search(title)),
                None,
            )
            if keyword is None:
                continue
            seen.add(href)
            gdc_results.append({"keyword": keyword, "title": title, "link": href})

    print(f"✅ GDC articles collected: {len(gdc_results)}")
    df1 = pd.DataFrame(gdc_results, columns=["keyword", "title", "link"])
    file1 = "gdc_generative_ai_articles.xlsx"
    df1.to_excel(file1, index=False)
    print(f"✅ Saved to '{file1}'")

    # ===== Part 2: TechCrunch (AI & Game keywords) =====
    print("▶️  Opening: https://techcrunch.com/")
    driver.get("https://techcrunch.com/")
    time.sleep(5)

    article_links = driver.find_elements(By.CSS_SELECTOR, "h2.post-block__title a")
    tc_results = []
    tc_seen = set()  # hrefs already recorded for Part 2
    for a in article_links:
        title = a.text.strip()
        href = a.get_attribute("href") or ""
        # Skip anchors without a target and links already recorded, the same
        # way Part 1 does, so the sheet has no empty or duplicate rows.
        if not href or href in tc_seen:
            continue
        # "AI" must be a whole word; "game" stays a substring test so that
        # "games" / "gamers" still count.
        if _ai_word.search(title) and "game" in title.lower():
            tc_seen.add(href)
            tc_results.append({"title": title, "link": href})

    print(f"✅ TechCrunch articles collected: {len(tc_results)}")
    df2 = pd.DataFrame(tc_results, columns=["title", "link"])
    file2 = "techcrunch_ai_game_articles.xlsx"
    df2.to_excel(file2, index=False)
    print(f"✅ Saved to '{file2}'")
finally:
    # Always close the browser, even if a page load, lookup or save raised.
    driver.quit()


▶️  Opening: https://www.gamedeveloper.com/
   Found candidates: 1
▶️  Opening: https://venturebeat.com/category/game-development/
   Found candidates: 0
▶️  Opening: https://www.developer-tech.com/categories/developer-gaming/
   Found candidates: 5
✅ GDC articles collected: 6
✅ Saved to 'gdc_generative_ai_articles.xlsx'
▶️  Opening: https://techcrunch.com/
✅ TechCrunch articles collected: 0
✅ Saved to 'techcrunch_ai_game_articles.xlsx'


In [4]:
# -*- coding: utf-8 -*-
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from webdriver_manager.chrome import ChromeDriverManager

# 0) pip install selenium webdriver-manager pandas openpyxl

import re  # local import: this cell's import block does not bring in `re`

# 1) Prepare ChromeDriver and its options
options = webdriver.ChromeOptions()
# options.add_argument("--headless")
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# Explicit wait (up to 10 s) shared by every selector lookup below.
wait = WebDriverWait(driver, 10)

results = []
seen_links = set()  # hrefs already recorded, shared by Part 1 and Part 2

# ──────────────────────────────────────────────────────────
# Part 1: three GDC-related sites ("Generative AI" or "AI")
# ──────────────────────────────────────────────────────────
gdc_sites = [
    {"url": "https://www.gamedeveloper.com/",
     "mode": "partial", "selector": None},
    {"url": "https://venturebeat.com/category/game-development/",
     "mode": "css",
     "selector": "h2.article__title a"},
    {"url": "https://www.developer-tech.com/categories/developer-gaming/",
     "mode": "partial", "selector": None},
]
# Most specific keyword first so it wins when labelling a title.
gdc_keywords = ["Generative AI", "AI"]

# Whole-word, case-insensitive patterns. A plain substring test on the
# lower-cased title would accept "said", "raises", "training", ... for "AI",
# which matters most when the TechCrunch fallback scans every <a> on the page.
# NOTE: compound tokens such as "OpenAI" are intentionally not matched.
_keyword_patterns = [
    (kw, re.compile(rf"\b{re.escape(kw)}\b", re.IGNORECASE)) for kw in gdc_keywords
]
_ai_word = re.compile(r"\bAI\b", re.IGNORECASE)

tc_url = "https://techcrunch.com/"
tc_selector = "h2.post-block__title a"

try:
    for site in gdc_sites:
        print(f"▶️  Opening: {site['url']}")
        driver.get(site["url"])
        # Scroll to the bottom once to trigger lazily loaded content.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        elements = []
        if site["mode"] == "css":
            try:
                wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, site["selector"])))
                elements = driver.find_elements(By.CSS_SELECTOR, site["selector"])
            except TimeoutException:
                print(f"   ⚠️ Timeout for selector {site['selector']}, fallback to partial link text")
                # Fallback: partial link text lookup for each keyword.
                for kw in gdc_keywords:
                    elements += driver.find_elements(By.PARTIAL_LINK_TEXT, kw)
        else:
            # Direct partial link text lookup.
            for kw in gdc_keywords:
                elements += driver.find_elements(By.PARTIAL_LINK_TEXT, kw)

        print(f"   → Candidate links: {len(elements)}")
        for a in elements:
            title = a.text.strip()
            href = a.get_attribute("href") or ""
            if not href or href in seen_links:
                continue
            keyword = next(
                (kw for kw, pattern in _keyword_patterns if pattern.search(title)),
                None,
            )
            if keyword is None:
                continue
            seen_links.add(href)
            results.append({
                "source": site["url"],
                "keyword": keyword,
                "title": title,
                "link": href
            })

    # ──────────────────────────────────────────────────────────
    # Part 2: TechCrunch ("AI" & "Game")
    # ──────────────────────────────────────────────────────────
    print(f"▶️  Opening: {tc_url}")
    driver.get(tc_url)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)

    try:
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, tc_selector)))
        tc_links = driver.find_elements(By.CSS_SELECTOR, tc_selector)
    except TimeoutException:
        print("   ⚠️ Timeout loading TechCrunch selector, trying all links")
        tc_links = driver.find_elements(By.TAG_NAME, "a")

    print(f"   → TechCrunch candidate links: {len(tc_links)}")
    for a in tc_links:
        title = a.text.strip()
        href = a.get_attribute("href") or ""
        if not href or href in seen_links:
            continue
        # "AI" must be a whole word; "game" stays a substring test so that
        # "games" / "gamers" still count.
        if _ai_word.search(title) and "game" in title.lower():
            seen_links.add(href)
            results.append({
                "source": tc_url,
                "keyword": "AI & Game",
                "title": title,
                "link": href
            })
finally:
    # Always close the browser, even if a page load or lookup raised.
    driver.quit()

# ──────────────────────────────────────────────────────────
# Part 3: merge everything and save to Excel
# ──────────────────────────────────────────────────────────
df = pd.DataFrame(results, columns=["source", "keyword", "title", "link"])
out_file = "all_ai_game_articles.xlsx"
df.to_excel(out_file, index=False)
print(f"✅ Total {len(df)} articles saved to '{out_file}'")


▶️  Opening: https://www.gamedeveloper.com/
   → Candidate links: 1
▶️  Opening: https://venturebeat.com/category/game-development/
   ⚠️ Timeout for selector h2.article__title a, fallback to partial link text
   → Candidate links: 2
▶️  Opening: https://www.developer-tech.com/categories/developer-gaming/
   → Candidate links: 5
▶️  Opening: https://techcrunch.com/
   ⚠️ Timeout loading TechCrunch selector, trying all links
   → TechCrunch candidate links: 333
✅ Total 8 articles saved to 'all_ai_game_articles.xlsx'


In [6]:
# UPDATE 
# -*- coding: utf-8 -*-
"""
Unified scraper for:
1. GDC sites (gamedeveloper.com, venturebeat.com, developer-tech.com) collecting "Generative AI" and "AI" articles over first 3 pages.
2. TechCrunch RSS feed to collect articles containing both "AI" and "Game".
Saves all results to a single Excel file: all_ai_game_articles.xlsx
"""
import time
import re
import pandas as pd
import feedparser
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager

# 0) Ensure dependencies:
#    pip install selenium webdriver-manager pandas openpyxl feedparser

from urllib.parse import urlparse  # local import: not in this cell's import block

# 1) Setup Selenium WebDriver
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # Uncomment to run headless
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)
# Explicit wait (up to 10 s) used for the per-page selector check.
wait = WebDriverWait(driver, 10)

results = []
seen_links = set()  # links already recorded, shared by Part 1 and Part 2

# ──────────────────────────────────────
# Part 1: GDC sites pagination (pages 1-3)
# ──────────────────────────────────────
gdc_sites = [
    {"url": "https://www.gamedeveloper.com", "mode": "css", "selector": "h3.entry-title a"},
    {"url": "https://venturebeat.com/category/game-development", "mode": "css", "selector": "h2.article__title a"},
    {"url": "https://www.developer-tech.com/categories/developer-gaming", "mode": "css", "selector": "h3.node-title a"},
]
# Most specific keyword first so it wins when labelling a title.
gdc_keywords = ["Generative AI", "AI"]

# Whole-word, case-insensitive patterns. A plain substring test on the
# lower-cased title would accept "said", "raises", "training", ... for "AI".
# NOTE: compound tokens such as "OpenAI" are intentionally not matched.
_keyword_patterns = [
    (kw, re.compile(rf"\b{re.escape(kw)}\b", re.IGNORECASE)) for kw in gdc_keywords
]
_ai_word = re.compile(r"\bAI\b", re.IGNORECASE)

try:
    for site in gdc_sites:
        base = site["url"].rstrip("/")
        # Article URLs usually live outside the category path (the listing is
        # /category/..., the articles are elsewhere on the same host), so
        # "same site" is decided by host rather than by the full listing URL.
        site_host = urlparse(base).netloc
        selector = site["selector"]

        for page in range(1, 4):  # pages 1, 2, 3
            page_url = f"{base}/page/{page}/"
            print(f"▶️ Loading: {page_url}")
            driver.get(page_url)
            # Scroll several times to trigger lazily loaded content.
            for _ in range(3):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

            # Wait until at least 5 elements match the selector; otherwise
            # fall back to a partial link text lookup per keyword.
            elems = []
            try:
                wait.until(lambda d: len(d.find_elements(By.CSS_SELECTOR, selector)) >= 5)
                elems = driver.find_elements(By.CSS_SELECTOR, selector)
            except TimeoutException:
                print(f"   ⚠️ Timeout on {selector}, fallback to partial link text")
                for kw in gdc_keywords:
                    elems.extend(driver.find_elements(By.PARTIAL_LINK_TEXT, kw))

            print(f"   Found {len(elems)} candidates")
            for a in elems:
                title = a.text.strip()
                link = a.get_attribute("href") or ""
                # Keep only new links that point at the same host as the listing.
                if not link or link in seen_links:
                    continue
                if urlparse(link).netloc != site_host:
                    continue
                keyword = next(
                    (kw for kw, pattern in _keyword_patterns if pattern.search(title)),
                    None,
                )
                if keyword is None:
                    continue
                seen_links.add(link)
                results.append({
                    "source": base,
                    "keyword": keyword,
                    "title": title,
                    "link": link
                })
finally:
    # The browser is only needed for Part 1; close it even if a page failed.
    driver.quit()

# ──────────────────────────────────────
# Part 2: TechCrunch via RSS feed
# ──────────────────────────────────────
print("▶️ Parsing TechCrunch RSS...")
feed = feedparser.parse("https://techcrunch.com/feed/")
if getattr(feed, "bozo", False) and not feed.entries:
    # feedparser reports fetch/parse problems via `bozo` instead of raising.
    print(f"   ⚠️ RSS feed could not be parsed: {getattr(feed, 'bozo_exception', 'unknown error')}")
for entry in feed.entries:
    title = entry.get("title", "").strip()
    link = entry.get("link", "").strip()
    if not link or link in seen_links:
        continue
    # "AI" must be a whole word; "game" stays a substring test so that
    # "games" / "gamers" still count.
    if _ai_word.search(title) and "game" in title.lower():
        seen_links.add(link)
        results.append({
            "source": "TechCrunch",
            "keyword": "AI & Game",
            "title": title,
            "link": link
        })

# ──────────────────────────────────────
# Save to single Excel file
# ──────────────────────────────────────
df = pd.DataFrame(results, columns=["source", "keyword", "title", "link"])
out_file = "all_ai_game_articles.xlsx"
df.to_excel(out_file, index=False)
print(f"✅ Saved {len(df)} articles to '{out_file}'")


▶️ Loading: https://www.gamedeveloper.com/page/1/
   ⚠️ Timeout on h3.entry-title a, fallback to partial link text
   Found 0 candidates
▶️ Loading: https://www.gamedeveloper.com/page/2/
   ⚠️ Timeout on h3.entry-title a, fallback to partial link text
   Found 0 candidates
▶️ Loading: https://www.gamedeveloper.com/page/3/
   ⚠️ Timeout on h3.entry-title a, fallback to partial link text
   Found 0 candidates
▶️ Loading: https://venturebeat.com/category/game-development/page/1/
   ⚠️ Timeout on h2.article__title a, fallback to partial link text
   Found 2 candidates
▶️ Loading: https://venturebeat.com/category/game-development/page/2/
   ⚠️ Timeout on h2.article__title a, fallback to partial link text
   Found 3 candidates
▶️ Loading: https://venturebeat.com/category/game-development/page/3/
   ⚠️ Timeout on h2.article__title a, fallback to partial link text
   Found 9 candidates
▶️ Loading: https://www.developer-tech.com/categories/developer-gaming/page/1/
   ⚠️ Timeout on h3.node-title