In [2]:
# Cell 1/7 — Install (run once in the environment / notebook)
# In a Jupyter notebook run the pip cell. If running as a script, install via CLI.
%pip install -q pandas playwright
!python -m playwright install chromium


Note: you may need to restart the kernel to use updated packages.


In [10]:
import asyncio
import json
import time
from pathlib import Path
import pandas as pd
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

# ---------- CONFIG ----------
# Directory that receives the scraper's output files.
# NOTE(review): absolute path tied to one machine layout — adjust when running elsewhere.
DATA_DIR = Path("/work/pilot_project/data/raw")
DATA_DIR.mkdir(parents=True, exist_ok=True)  # created (with parents) when this cell runs

# Final CSV written at the end of a scraping run.
OUTPUT_FILE = DATA_DIR / "altinget_answers_playwright.csv"
# JSON checkpoint of the rows scraped so far; read at start-up so a run can resume.
TEMP_FILE = DATA_DIR / "temp_progress.json"

# Entry page of the candidate test; the municipality is chosen from a <select> on it.
BASE_URL = "https://www.altinget.dk/kandidattest/KV25/valgkort"
AARHUS_VALUE = "271"   # the <option> value for Aarhus
HEADLESS = True        # set False to debug with visible browser
NAV_TIMEOUT = 30_000   # navigation/selector timeouts in ms
# ----------------------------


In [11]:
def load_progress(path=None):
    """Load already-scraped rows (if any).

    Args:
        path: Progress file to read. Defaults to the module-level ``TEMP_FILE``.

    Returns:
        list[dict]: The row dicts stored in the file. An empty list is returned
        when the file does not exist, cannot be read, is not valid JSON, or
        does not contain a JSON list. Entries that are not dicts are dropped,
        because callers access rows with ``.get(...)``.
    """
    progress_file = Path(TEMP_FILE if path is None else path)
    if not progress_file.exists():
        return []

    try:
        with open(progress_file, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        # Report the problem instead of silently restarting from scratch.
        print(f"Warning: could not read progress file {progress_file}: {exc}")
        return []

    if not isinstance(loaded, list):
        print(f"Warning: progress file {progress_file} does not contain a list; ignoring it.")
        return []
    return [row for row in loaded if isinstance(row, dict)]

def save_progress(rows, path=None):
    """Save progress as human-readable JSON.

    The data is written to a sibling ``<name>.tmp`` file first and then renamed
    over the target, so an interruption while writing does not leave a
    truncated progress file behind.

    Args:
        rows: JSON-serialisable list of row dicts scraped so far.
        path: Destination file. Defaults to the module-level ``TEMP_FILE``.
    """
    progress_file = Path(TEMP_FILE if path is None else path)
    scratch_file = progress_file.with_name(progress_file.name + ".tmp")
    with open(scratch_file, "w", encoding="utf-8") as f:
        json.dump(rows, f, ensure_ascii=False, indent=2)
    # Path.replace overwrites an existing destination in a single rename step.
    scratch_file.replace(progress_file)

def ensure_q_columns(df, n_questions=26):
    """Return a frame with ``candidate_name``, ``party`` and Q1..Qn in that order.

    Columns missing from ``df`` (including the two identity columns) are
    created and filled with empty strings; columns not in the target layout
    are dropped. The input frame is not modified.

    Args:
        df: DataFrame of scraped rows.
        n_questions: Number of question columns to guarantee (default 26).

    Returns:
        pandas.DataFrame: A new frame with exactly the target columns.
    """
    target_columns = ["candidate_name", "party"] + [
        f"Q{i}" for i in range(1, n_questions + 1)
    ]
    # reindex builds a new frame; fill_value only applies to newly created columns.
    return df.reindex(columns=target_columns, fill_value="")


In [12]:
# JavaScript function source evaluated in the page to group candidate profile links by party.
# It walks all h1-h6 headings and profile anchors in document order and returns a list of
# {party: <heading text or 'Unknown'>, candidates: [{href, text}, ...]} objects.
# Every heading starts a new group, so headings that are not followed by any profile
# link yield groups with an empty `candidates` list.
# Anchors that appear before the first heading are collected under an 'Unknown' group.
PARTIES_EXTRACTION_JS = '''
() => {
  // Walk the DOM in order: collect headings and candidate anchors and group anchors under the nearest preceding heading.
  const items = Array.from(document.querySelectorAll('h1,h2,h3,h4,h5,h6,a[href*="/kandidattest/KV25/profil/"]'));
  const parties = [];
  let current = null;
  for (const node of items) {
    if (node.tagName && node.tagName.match(/^H[1-6]$/i)) {
      const partyName = (node.textContent || '').trim() || 'Unknown';
      current = { party: partyName, candidates: [] };
      parties.push(current);
    } else if (node.tagName && node.tagName.toLowerCase() === 'a') {
      const href = node.href || '';
      const text = (node.textContent || '').trim();
      if (!href || !href.includes('/kandidattest/KV25/profil/')) continue;
      if (!current) {
        current = { party: 'Unknown', candidates: [] };
        parties.push(current);
      }
      current.candidates.push({ href, text });
    }
  }
  return parties;
}
'''


In [13]:
# JavaScript function source evaluated on a candidate profile page.
# It always returns exactly 26 strings, each '' or one of '-2', '-1', '1', '2'.
# For every row under the "Sammenlign svar" heading that has a question-number span, it
# looks at up to four answer cells and reports the mapped value of the first cell that
# contains an <img>; rows with no image in those cells yield ''.
# Answers are appended in row order; the number shown inside the span is not read, so
# Q<n> is the n-th matching row (presumably question n — verify against the page).
# NOTE(review): both branches of the image-src keyword check assign the same value, so
# the keyword heuristic currently has no effect on the result.
ANSWERS_EXTRACTION_JS = '''
() => {
  // Find the "Sammenlign svar" H3
  const header = Array.from(document.querySelectorAll('h3')).find(h => (h.textContent||'').trim().includes('Sammenlign svar'));
  if (!header) return Array(26).fill('');

  // The answer block usually sits in the next sibling (a table or wrapper)
  let container = header.nextElementSibling || header.parentElement;
  if (!container) return Array(26).fill('');

  // Collect rows: prefer <tr>, otherwise fallback to blocks that contain the question-number span
  let rows = Array.from(container.querySelectorAll('tr'));
  if (rows.length === 0) {
    rows = Array.from(container.querySelectorAll('div, li, section')).filter(el => el.querySelector('span.h-7.w-7'));
  }

  const mapping = [-2, -1, 1, 2];
  const answers = [];

  for (const row of rows) {
    // Locate the question number span (class "h-7 w-7 ...")
    const qSpan = row.querySelector('span.h-7.w-7');
    if (!qSpan) continue;

    // Find candidate answer cells: td elements that include the centered flex container
    let tds = Array.from(row.querySelectorAll('td'));
    let answerCells = tds.filter(td => {
      // the exact inner div used by the site
      const inner = td.querySelector('div.flex.items-center.justify-center');
      return inner !== null;
    });

    // Fallback: if we didn't find the expected td subset, try the first 4 td elements
    if (!answerCells || answerCells.length < 4) {
      answerCells = tds.slice(0, 4);
    }

    // If still empty, push blank and continue
    if (!answerCells || answerCells.length === 0) {
      answers.push('');
      if (answers.length >= 26) break;
      continue;
    }

    // Inspect each of the four answer cells in order, mapping to [-2, -1, 1, 2]
    let chosen = '';
    for (let i = 0; i < Math.min(answerCells.length, 4); i++) {
      const td = answerCells[i];
      const img = td.querySelector('img');
      if (img) {
        // treat presence of an <img> as the candidate's avatar (site uses candidate image in that cell)
        // extra heuristics: if src contains candidate-like keywords, prefer it (but presence alone is enough)
        const src = (img.src || '').toLowerCase();
        if (src.includes('kandidat') || src.includes('kvadrat') || src.includes('kandidatdatabase') || src.includes('/images/') ) {
          chosen = String(mapping[i]);
          break;
        } else {
          // if heuristics don't detect, still accept presence of img as candidate
          chosen = String(mapping[i]);
          break;
        }
      }
    }

    answers.push(chosen);
    if (answers.length >= 26) break;
  }

  // pad to 26 answers
  while (answers.length < 26) answers.push('');
  return answers.slice(0, 26);
}
'''


In [14]:
# CSS selector matching links to individual candidate profile pages.
_PROFILE_LINK_SELECTOR = "a[href*='/kandidattest/KV25/profil/']"

# Fallback for setting the municipality <select> directly in the page. The option
# value arrives as an argument rather than being formatted into the JS source.
_SELECT_FALLBACK_JS = """
(value) => {
    const s = document.querySelector('select[name*="kommune"], select[id*="kommune"], select');
    if (s) {
        s.value = value;
        s.dispatchEvent(new Event('change', { bubbles: true }));
    }
}
"""

# Brings the "Sammenlign svar" heading into view before the answers are read.
_SCROLL_TO_ANSWERS_JS = """
() => {
    const h = Array.from(document.querySelectorAll('h3')).find(x => (x.textContent||'').trim().includes('Sammenlign svar'));
    if (h) h.scrollIntoView({behavior:'auto', block:'center'});
}
"""


def _build_row(candidate_name, party, answers):
    """Build one output record: the identity fields plus Q1..Qn from ``answers``."""
    row = {"candidate_name": candidate_name, "party": party}
    for i, value in enumerate(answers, start=1):
        row[f"Q{i}"] = value
    return row


async def _select_municipality(page, value):
    """Best-effort selection of the municipality option; failures only print a warning."""
    try:
        try:
            await page.select_option("select", value)
        except Exception:
            # Playwright could not drive the <select>; set it from inside the page.
            await page.evaluate(_SELECT_FALLBACK_JS, value)
        await page.wait_for_timeout(1400)
    except Exception as e:
        print("Warning selecting Aarhus:", e)


async def _scroll_until_stable(page, max_scrolls=60, stable_rounds=4):
    """Scroll down until the profile-link count stops changing; return the last count seen."""
    last_count = 0
    stable = 0
    for _ in range(max_scrolls):
        await page.evaluate("window.scrollBy(0, window.innerHeight)")
        await page.wait_for_timeout(400)
        try:
            curr = await page.evaluate(
                "document.querySelectorAll('a[href*=\"/kandidattest/KV25/profil/\"]').length"
            )
        except Exception:
            curr = 0
        if curr == last_count:
            stable += 1
        else:
            stable = 0
            last_count = curr
        if stable >= stable_rounds:
            break
    return last_count


async def _extract_answers(page, href):
    """Open one candidate profile and return its list of 26 answer strings."""
    await page.goto(href, timeout=NAV_TIMEOUT)
    await page.wait_for_load_state("networkidle", timeout=NAV_TIMEOUT)
    await page.wait_for_timeout(700)

    # Scrolling the section into view is only a nicety; extraction is attempted regardless.
    try:
        await page.evaluate(_SCROLL_TO_ANSWERS_JS)
        await page.wait_for_timeout(400)
    except Exception:
        pass

    answers = await page.evaluate(ANSWERS_EXTRACTION_JS)
    if not isinstance(answers, list) or len(answers) < 26:
        answers = [''] * 26
    return answers


async def scrape_altinget_playwright():
    """Scrape the candidate-test answers for the configured municipality into a CSV.

    Steps: open ``BASE_URL``, select the municipality, scroll until all profile
    links are loaded, group the links by party, visit each profile (up to three
    attempts) and record its 26 answers. Progress is saved after every
    candidate, and candidates already present in the progress file are skipped.
    Candidates whose page could not be scraped are stored with blank answers.
    The final table is written to ``OUTPUT_FILE``.

    Returns:
        None. Returns early, without writing the CSV, when no candidate links
        appear on the overview page.
    """
    rows = load_progress()
    completed_keys = {f"{r.get('candidate_name','')}|{r.get('party','')}" for r in rows}

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=HEADLESS)
        # try/finally guarantees the browser is closed on every exit path,
        # including exceptions and the early return below.
        try:
            context = await browser.new_context()
            page = await context.new_page()

            print("Opening main page:", BASE_URL)
            await page.goto(BASE_URL, timeout=NAV_TIMEOUT)
            await page.wait_for_timeout(1500)

            # 1) Select Aarhus
            await _select_municipality(page, AARHUS_VALUE)

            # 2) Scroll to load all candidates (lazy load)
            link_count = await _scroll_until_stable(page)
            print("Profile links found after scrolling:", link_count)

            # 3) Wait for at least one profile link
            try:
                await page.wait_for_selector(_PROFILE_LINK_SELECTOR, timeout=NAV_TIMEOUT)
            except PlaywrightTimeoutError:
                print("No candidate links found - aborting.")
                return

            # 4) Extract party groups + candidate profile links
            parties = await page.evaluate(PARTIES_EXTRACTION_JS)
            print(f"Found {len(parties)} party groups.")
            for i, pinfo in enumerate(parties, start=1):
                print(f"[{i}] {pinfo.get('party','')} -> {len(pinfo.get('candidates', []))} candidates")

            # 5) Iterate each candidate, visit the profile and extract the 26 answers
            for pinfo in parties:
                party_name = (pinfo.get('party') or '').strip()
                for cand in pinfo.get('candidates', []):
                    cand_text = (cand.get('text') or '').strip()
                    href = cand.get('href')
                    if not href:
                        continue
                    key = f"{cand_text}|{party_name}"
                    if key in completed_keys:
                        print("Skipping (already done):", key)
                        continue

                    print("Visiting:", cand_text, href)
                    answers = None
                    for attempt in range(1, 4):
                        try:
                            answers = await _extract_answers(page, href)
                            break
                        except Exception as e:
                            print(f" ❌ Attempt {attempt} failed for {cand_text}: {e}")
                            await asyncio.sleep(1.2)

                    success = answers is not None
                    if not success:
                        print("  - All attempts failed; saving blanks for:", cand_text)
                        answers = [''] * 26

                    # Record the row exactly once, outside the retry loop, so a
                    # failure while saving can never append the same row twice.
                    rows.append(_build_row(cand_text, party_name, answers))
                    completed_keys.add(key)
                    save_progress(rows)
                    if success:
                        print(" ✅ Saved:", cand_text)

            # 6) Final CSV
            df = pd.DataFrame(rows)
            df = ensure_q_columns(df)
            df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
            print("\nDone. CSV saved to:", OUTPUT_FILE)
        finally:
            await browser.close()


In [16]:
# Run the scraper. Notebook kernels allow top-level `await`, so the coroutine is
# awaited directly here instead of going through asyncio.run().
await scrape_altinget_playwright()

# If running as a plain Python script (no running event loop), instead use:
# import asyncio
# asyncio.run(scrape_altinget_playwright())



Opening main page: https://www.altinget.dk/kandidattest/KV25/valgkort
Profile links found after scrolling: 199
Found 23 party groups.
[1] Stemmeseddel -> 0 candidates
[2] Kommunalvalg 2025 -> 0 candidates
[3] Kommunalvalg 2025 -> 0 candidates
[4] A. Socialdemokratiet -> 22 candidates
[5] B. Radikale Venstre -> 18 candidates
[6] C. Det Konservative Folkeparti -> 17 candidates
[7] E. Borger-Initiativet -> 3 candidates
[8] F. SF - Socialistisk Folkeparti -> 25 candidates
[9] I. Liberal Alliance -> 12 candidates
[10] J. Aarhus Borgernes Stemme -> 1 candidates
[11] K. Kristendemokraterne -> 4 candidates
[12] M. Moderaterne -> 6 candidates
[13] N. Nej til privatisering af kommunale veje -> 9 candidates
[14] O. Dansk Folkeparti -> 7 candidates
[15] P. Troværdig Politik -> 1 candidates
[16] Q. Frie Grønne -> 1 candidates
[17] R. Kommunistisk Parti -> 5 candidates
[18] T. Borgertinget -> 4 candidates
[19] V. Venstre, Danmarks Liberale Parti -> 15 candidates
[20] Æ. Danmarksdemokraterne ‒ Inger 

In [26]:
# Set the git author identity (no --global flag, so it applies to this repository only).
!git config user.name "NadiaWojtowicz"
!git config user.email "202309081@post.au.dk"


In [28]:
# Check that the configuration was applied
!git config --list

core.repositoryformatversion=0
core.filemode=true
core.bare=false
core.logallrefupdates=true
remote.origin.url=https://github.com/NadiaWojtowicz/pilot_project.git
remote.origin.fetch=+refs/heads/*:refs/remotes/origin/*
branch.main.remote=origin
branch.main.merge=refs/heads/main
user.name=NadiaWojtowicz
user.email=202309081@post.au.dk


In [None]:
!git add scripts/Kandidattestfile.ipynb
!git commit -m "File containing "
!git push origin main
