In [26]:
from scraper_wiki import pobierz_soup
import os
import re
import time
import random
import pandas as pd
from openpyxl import Workbook, load_workbook

In [59]:
# Source workbook that is read with pandas (absolute, machine-specific path).
raw_path = r"D:\MyProjects_4Fun\projects\World of Warcraft\excel-mappingi\surowe\wowhead_id_kraina_dodatek.xlsx"
# Result workbook; created when missing, otherwise rows are appended to it.
out_path = r"D:\MyProjects_4Fun\projects\World of Warcraft\excel-mappingi\mapping_01.xlsx"

# Sheet inside raw_path that holds the input rows.
input_sheet = "prawie_gotowe_dane"
# Sheet inside out_path that receives the enriched rows.
output_sheet = "mapping_01"
# Column holding the page URL of each row; also used to detect rows that are already saved.
url_col = "MISJA_URL_WOWHEAD"

def wyciagnij_patch(soup):
    """Return the patch number found in the page's <script> tags.

    Each script tag's text is scanned in document order for an
    "Added in patch [acronym=...]X.Y.Z[/acronym]" fragment (quotes and the
    closing slash may be backslash-escaped).

    Args:
        soup: parsed document exposing ``find_all("script")``.

    Returns:
        The first "X.Y.Z" version string matched, or "" when no script
        contains a matching fragment.
    """
    marker = "Added in patch"
    pattern = r'Added in patch\s*\[acronym=\\?"[^"]*\\?"\]([0-9]+\.[0-9]+\.[0-9]+)\[\\?/acronym\]'

    # Lazy generator: tag text is extracted only as far as the scan needs to go.
    script_texts = (tag.get_text(" ", strip=True) for tag in soup.find_all("script"))
    for text in script_texts:
        # Cheap substring test first; the regex runs only on candidate scripts.
        hit = re.search(pattern, text) if marker in text else None
        if hit:
            return hit.group(1)
    return ""

def normalize_cell(v):
    """Map missing values to None and pass everything else through.

    Args:
        v: a single scalar cell value.

    Returns:
        None when ``v`` is None or ``pd.isna(v)`` is true (which already
        covers float NaN); otherwise ``v`` unchanged.
    """
    is_missing = v is None or pd.isna(v)
    return None if is_missing else v

# Load the raw rows from the configured input sheet.
df_raw = pd.read_excel(raw_path, sheet_name=input_sheet)
print(f"Odczytano {len(df_raw)} wierszy z: {raw_path} [{input_sheet}]")

# Fail fast: every later step keys off the URL column.
if url_col not in df_raw.columns:
    raise ValueError(f"Brak kolumny {url_col} w arkuszu {input_sheet}")

# Drop missing URLs BEFORE the string cast. On object columns astype(str)
# turns NaN into the literal text "nan", which would then pass both the
# notna() and the != "" checks and be treated as a real link.
df_raw = df_raw[df_raw[url_col].notna()].copy()
df_raw[url_col] = df_raw[url_col].astype(str).str.strip()

# Whitespace-only cells collapse to "" after strip(); remove those as well.
df_raw = df_raw[df_raw[url_col] != ""].copy()
print(f"Po odfiltrowaniu pustych URL: {len(df_raw)} wierszy")

# Column layout for new rows: every source column plus the two scraped fields.
headers = list(df_raw.columns) + ["storyline", "patch"]

if not os.path.exists(out_path):
    # No result file yet: create it with a single sheet holding only the header row.
    wb = Workbook()
    ws = wb.active
    ws.title = output_sheet
    ws.append(headers)
    wb.save(out_path)
    print(f"Utworzono plik wynikowy: {out_path} [{output_sheet}]")
else:
    wb = load_workbook(out_path)
    if output_sheet not in wb.sheetnames:
        # File exists but the target sheet does not: add it with the header row.
        ws = wb.create_sheet(output_sheet)
        ws.append(headers)
        wb.save(out_path)
        print(f"Dodano arkusz: {output_sheet} do {out_path}")
    else:
        ws = wb[output_sheet]
        first_row = [ws.cell(row=1, column=i + 1).value for i in range(ws.max_column)]

        # Trim only TRAILING blank header cells. Blank cells in the middle must
        # keep their position, otherwise every value to the right of the gap
        # would be written one column too far left.
        while first_row and first_row[-1] is None:
            first_row.pop()

        if not first_row:
            # Sheet exists but has no header row. Write the header explicitly
            # into row 1 (cell by cell, so the target row is unambiguous)
            # instead of adopting an empty layout that would drop all data.
            for col_no, header in enumerate(headers, start=1):
                ws.cell(row=1, column=col_no, value=header)
            wb.save(out_path)
            print(f"Arkusz {output_sheet} był pusty — dopisano nagłówki.")
        elif first_row[:len(headers)] != headers:
            print("Uwaga: Nagłówki w mapping_01 różnią się od źródła. Dopiszę wiersze wg bieżących nagłówków mapping_01.")
            # Follow the layout already in the sheet; a None entry marks a
            # blank header column and yields an empty cell in appended rows.
            headers = list(first_row)

# Read back what is already stored so that finished URLs are not fetched again.
df_out = pd.read_excel(out_path, sheet_name=output_sheet)

existing_urls = set()
if url_col in df_out.columns:
    # Same normalisation as the source column: drop blanks, cast to text, strip.
    saved_links = df_out[url_col].dropna().astype(str).str.strip()
    existing_urls.update(saved_links.tolist())

print(f"W mapping_01 jest już {len(existing_urls)} URL-i")

# Keep only the source rows whose URL is not in the result sheet yet.
already_saved = df_raw[url_col].isin(existing_urls)
df_new = df_raw[~already_saved].copy()
print(f"Nowych wierszy do pobrania: {len(df_new)}")

def _flush_rows(wb, ws, rows, label):
    """Append ``rows`` to ``ws``, save ``wb`` to ``out_path`` and report it.

    Args:
        wb: open openpyxl workbook that owns ``ws``.
        ws: worksheet receiving the rows.
        rows: list of row value lists; nothing is written when it is empty.
        label: wording of the batch used in the progress message.

    Returns:
        Number of rows written (0 when ``rows`` is empty).
    """
    if not rows:
        return 0
    start_row_excel = ws.max_row + 1
    for r in rows:
        ws.append(r)
    wb.save(out_path)
    print(f"Zapisano {label} {len(rows)} wierszy od wiersza {start_row_excel}")
    return len(rows)


if df_new.empty:
    print("Nic do roboty — wszystko już jest w mapping_01.")
else:
    wb = load_workbook(out_path)
    ws = wb[output_sheet]

    batch_size = 200  # rows buffered in memory between workbook saves
    bufor = []
    dopisane = 0
    bledy = 0
    total = len(df_new)
    columns = list(df_new.columns)

    try:
        # name=None yields plain tuples. Pairing them with the real column
        # labels avoids the namedtuple renaming (to _1, _2, ...) applied to
        # labels that are not valid identifiers, which made
        # row_dict.get(col) silently return None for such columns.
        for idx, values in enumerate(df_new.itertuples(index=False, name=None), start=1):
            row_dict = dict(zip(columns, values))
            link = str(row_dict.get(url_col, "")).strip()

            print(f"[{idx}/{total}] Scrapuję: {link}")
            soup = pobierz_soup(link, parser="lxml")
            element = soup.select_one(".quick-facts-storyline-title") if soup is not None else None

            if soup is None:
                bledy += 1
                print(f"[{idx}/{total}] Błąd pobierania: {link}")
            elif element is None:
                bledy += 1
                print(f"[{idx}/{total}] Brak storyline: {link}")
            else:
                storyline = element.get_text().strip()
                patch = wyciagnij_patch(soup)

                row_dict["storyline"] = storyline
                row_dict["patch"] = patch

                # Order the values by the output header layout; columns unknown
                # to this row become empty cells.
                bufor.append([normalize_cell(row_dict.get(col)) for col in headers])

                print(f"[{idx}/{total}] OK: storyline='{storyline}' | patch='{patch}'")

                if len(bufor) >= batch_size:
                    dopisane += _flush_rows(wb, ws, bufor, "paczkę")
                    bufor = []

            # Throttle after EVERY request, including failed ones, so a run of
            # errors does not turn into back-to-back requests. No wait is
            # needed once the last link has been processed.
            if idx < total:
                time.sleep(random.uniform(1.3, 1.9))
    finally:
        # Runs on normal completion and on an exception or interrupt alike, so
        # rows that are still only in memory are written before the cell stops.
        dopisane += _flush_rows(wb, ws, bufor, "ostatnią paczkę")
        bufor = []

    print(f"Koniec. Dopisano: {dopisane}, błędy: {bledy}, plik: {out_path}")


Odczytano 4 wierszy z: D:\MyProjects_4Fun\projects\World of Warcraft\excel-mappingi\surowe\wowhead_id_kraina_dodatek.xlsx [prawie_gotowe_dane]
Po odfiltrowaniu pustych URL: 4 wierszy
W mapping_01 jest już 2 URL-i
Nowych wierszy do pobrania: 2
[1/2] Scrapuję: https://www.wowhead.com/quest=86881
[1/2] OK: storyline='Call of the Goddess' | patch='12.0.0'
[2/2] Scrapuję: https://www.wowhead.com/quest=90777
[2/2] OK: storyline='Foothold' | patch='12.0.0'
Zapisano ostatnią paczkę 2 wierszy od wiersza 4
Koniec. Dopisano: 2, błędy: 0, plik: D:\MyProjects_4Fun\projects\World of Warcraft\excel-mappingi\mapping_01.xlsx
