# Place to Test and Develop Scripts

In [5]:
import time, io, re, os
from urllib.parse import urljoin, urlparse
import requests
import pandas as pd
from bs4 import BeautifulSoup

## Data Collection

In [6]:
url = "https://www.bowwwl.com/bowling-ball-database"

# Fetch page
response = requests.get(url)
if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")
    
    # Example: Extract table rows (depends on how the site structures data)
    tables = soup.find_all("table")
    if tables:
        df = pd.read_html(str(tables[0]))[0]  # convert first table into DataFrame
        print(df.head())
    else:
        print("No tables found.")
else:
    print(f"Request failed with status {response.status_code}")


                   Ball  Brand Release Date  \
0  RST Hyperdrive Pearl    NaN     Sep 2025   
1          Primal Ghost    NaN     Sep 2025   
2               Lock-On    NaN     Aug 2025   
3  Hyped Super Pearl II    NaN     Aug 2025   
4                Combat    NaN     Aug 2025   

                                         Coverstock  \
0                          U-R1 PearlPearl Reactive   
1  Coercion HFS (High Friction Solid)Solid Reactive   
2                            RX PearlPearl Reactive   
3                           VTC PearlPearl Reactive   
4         HK22C - Alpha Premier PearlPearl Reactive   

                                 Factory Finish                  Core     RG  \
0                                    Power Edge  RST + A.I.Asymmetric  2.510   
1                                      3000 LSS   Impulse V2Symmetric  2.550   
2                                    Power Edge       RAD-XAsymmetric  2.480   
3                                 1500 Polished        HypedSymmet

  df = pd.read_html(str(tables[0]))[0]  # convert first table into DataFrame


In [8]:
BASE = "https://www.bowwwl.com"
URL  = f"{BASE}/bowling-ball-database"

HEADERS = {"User-Agent": "Mozilla/5.0 (Mac) Safari/605.1.15 (learning project)"}

In [9]:
def get_soup(url: str) -> BeautifulSoup:
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

soup = get_soup(URL)


In [12]:
def get_main_table(soup: BeautifulSoup):
    tables = soup.find_all("table")
    return tables[0] if tables else None  # adjust if there are multiple

tbl = get_main_table(soup)
headers = [th.get_text(strip=True) for th in tbl.select("thead th")]
print(headers)

['Ball', 'Brand', 'Release Date', 'Coverstock', 'Factory Finish', 'Core', 'RG', 'Diff', 'MB Diff', 'US/Overseas Release', 'Discontinued?']


In [13]:
def parse_table(tbl) -> pd.DataFrame:
    headers = [th.get_text(strip=True) for th in tbl.select("thead th")]
    rows = []
    for tr in tbl.select("tbody tr"):
        tds = tr.find_all("td")
        if not tds or len(tds) != len(headers):  # defensive
            continue

        row = {}
        for col_name, td in zip(headers, tds):
            # base text
            row[col_name] = td.get_text(" ", strip=True)

            # enrich Ball cell with image + detail url
            if col_name.lower() == "ball":
                a = td.find("a")
                img = td.find("img")

                if a and a.get("href"):
                    row["detail_url"] = urljoin(BASE, a["href"])
                if a:
                    row["Ball"] = a.get_text(strip=True) or row["Ball"]

                if img:
                    src = img.get("src") or img.get("data-src")
                    if src:
                        row["image_url"] = urljoin(BASE, src)

        rows.append(row)

    return pd.DataFrame(rows)

df_page = parse_table(tbl)
df_page.head()


Unnamed: 0,Ball,detail_url,image_url,Brand,Release Date,Coverstock,Factory Finish,Core,RG,Diff,MB Diff,US/Overseas Release,Discontinued?
0,RST Hyperdrive Pearl,https://www.bowwwl.com/bowling-ball-database/r...,https://www.bowwwl.com/sites/default/files/sty...,,Sep 2025,U-R1 Pearl Pearl Reactive,Power Edge,RST + A.I. Asymmetric,2.51,0.055,0.016,,
1,Primal Ghost,https://www.bowwwl.com/bowling-ball-database/m...,https://www.bowwwl.com/sites/default/files/sty...,,Sep 2025,Coercion HFS (High Friction Solid) Solid Reactive,3000 LSS,Impulse V2 Symmetric,2.55,0.05,,,
2,Lock-On,https://www.bowwwl.com/bowling-ball-database/s...,https://www.bowwwl.com/sites/default/files/sty...,,Aug 2025,RX Pearl Pearl Reactive,Power Edge,RAD-X Asymmetric,2.48,0.054,0.018,,
3,Hyped Super Pearl II,https://www.bowwwl.com/bowling-ball-database/r...,https://www.bowwwl.com/sites/default/files/sty...,,Aug 2025,VTC Pearl Pearl Reactive,1500 Polished,Hyped Symmetric,2.52,0.036,,,
4,Combat,https://www.bowwwl.com/bowling-ball-database/b...,https://www.bowwwl.com/sites/default/files/sty...,,Aug 2025,HK22C - Alpha Premier Pearl Pearl Reactive,"500/1000/1500 Siaair, Crown Factory Compound",Rampart Asymmetric,2.502,0.051,0.019,,


In [15]:
def discover_pages(soup: BeautifulSoup) -> list[str]:
    pages = {URL}  # include first page
    for a in soup.select("a[href]"):
        href = a["href"]
        if "bowling-ball-database" in href:  # stays in the listing
            pages.add(urljoin(BASE, href))
    return sorted(pages)

candidate_pages = discover_pages(soup)
len(candidate_pages), candidate_pages[:5]


(101,
 ['https://www.bowwwl.com/bowling-ball-database',
  'https://www.bowwwl.com/bowling-ball-database/900-global/cove',
  'https://www.bowwwl.com/bowling-ball-database/900-global/duty-majesty',
  'https://www.bowwwl.com/bowling-ball-database/900-global/ember',
  'https://www.bowwwl.com/bowling-ball-database/900-global/honey-badger-blameless'])