In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

BASE_URL = "https://www.scrapethissite.com/pages/forms/"


In [2]:
# Fetch the landing page once to confirm the site is reachable.
# The timeout (seconds) stops the cell from hanging forever if the server
# never responds; requests applies no timeout by default.
resp = requests.get(BASE_URL, timeout=10)
resp.status_code


200

In [3]:
from bs4 import BeautifulSoup

# (dict key, CSS selector, converter, value used when the cell is blank/missing)
_STAT_COLUMNS = (
    ("wins", "td.wins", int, 0),
    ("losses", "td.losses", int, 0),
    ("ot_losses", "td.ot-losses", int, 0),
    ("pct", "td.pct", float, 0.0),
    ("gf", "td.gf", int, 0),
    ("ga", "td.ga", int, 0),
    ("diff", "td.diff", int, 0),
)


def _cell_text(row, selector):
    """Return the stripped text of the first element in `row` matching `selector`, or "" if none matches."""
    cell = row.select_one(selector)
    return cell.get_text(strip=True) if cell is not None else ""


def _cell_number(row, selector, cast, default):
    """Convert a cell's text with `cast`; a blank or missing cell yields `default`."""
    text = _cell_text(row, selector)
    return cast(text) if text else default


def parse_page(html):
    """Extract one record per team row from a results page.

    Args:
        html: HTML source of the page, as a string.

    Returns:
        A list of dicts, one per ``tr.team`` element, with the keys
        ``name``, ``year``, ``wins``, ``losses``, ``ot_losses``, ``pct``,
        ``gf``, ``ga`` and ``diff`` (in that order). Blank or missing
        statistic cells become 0 (0.0 for ``pct``); a missing name cell
        becomes "".

    Raises:
        ValueError: if the year cell is blank or missing, or if any numeric
            cell holds text that cannot be converted.
    """
    soup = BeautifulSoup(html, "html.parser")
    data = []
    for row in soup.select("tr.team"):
        record = {
            "name": _cell_text(row, "td.name"),
            # The year has no fallback on purpose: int("") raises ValueError
            # instead of silently recording a season of 0.
            "year": int(_cell_text(row, "td.year")),
        }
        for key, selector, cast, default in _STAT_COLUMNS:
            record[key] = _cell_number(row, selector, cast, default)
        data.append(record)
    return data

# Smoke-test the parser on the page fetched earlier: show how many rows were
# found together with the first parsed record.
test_rows = parse_page(html=resp.text)
(len(test_rows), test_rows[0])


(25,
 {'name': 'Boston Bruins',
  'year': 1990,
  'wins': 44,
  'losses': 24,
  'ot_losses': 0,
  'pct': 0.55,
  'gf': 299,
  'ga': 264,
  'diff': 35})

In [4]:
# Walk the paginated results and collect every parsed row.
MAX_PAGES = 24        # upper bound on pages to try (pages 1-24)
PER_PAGE = 100        # rows requested per page
REQUEST_TIMEOUT = 10  # seconds; requests would otherwise wait indefinitely

all_rows = []

for page in range(1, MAX_PAGES + 1):
    params = {"page_num": page, "per_page": PER_PAGE}
    r = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    rows = parse_page(r.text)
    print(f"Seite {page}: {len(rows)} Zeilen")
    if not rows:
        # A page without team rows means we are past the last page of data.
        break
    all_rows.extend(rows)

len(all_rows)


Seite 1: 100 Zeilen
Seite 2: 100 Zeilen
Seite 3: 100 Zeilen
Seite 4: 100 Zeilen
Seite 5: 100 Zeilen
Seite 6: 82 Zeilen
Seite 7: 0 Zeilen


582

In [5]:
# Turn the list of scraped records into a table, then preview the first rows
# alongside the (rows, columns) shape.
df = pd.DataFrame(data=all_rows)
(df.head(), df.shape)


(                 name  year  wins  losses  ot_losses    pct   gf   ga  diff
 0       Boston Bruins  1990    44      24          0  0.550  299  264    35
 1      Buffalo Sabres  1990    31      30          0  0.388  292  278    14
 2      Calgary Flames  1990    46      26          0  0.575  344  263    81
 3  Chicago Blackhawks  1990    49      23          0  0.613  284  211    73
 4   Detroit Red Wings  1990    34      38          0  0.425  273  298   -25,
 (582, 9))

In [6]:
# Persist the scraped table; index=False keeps the row index out of the file.
df.to_csv(path_or_buf="data.csv", index=False)



In [7]:
# Reload the saved CSV so the analysis below works from the file on disk.
df = pd.read_csv(filepath_or_buffer="data.csv")
df.head(n=5)


Unnamed: 0,name,year,wins,losses,ot_losses,pct,gf,ga,diff
0,Boston Bruins,1990,44,24,0,0.55,299,264,35
1,Buffalo Sabres,1990,31,30,0,0.388,292,278,14
2,Calgary Flames,1990,46,26,0,0.575,344,263,81
3,Chicago Blackhawks,1990,49,23,0,0.613,284,211,73
4,Detroit Red Wings,1990,34,38,0,0.425,273,298,-25


In [8]:
# Report the team with the most wins for each selected season.
for year in [1990, 2000, 2010]:
    sub = df[df["year"] == year]
    if sub.empty:
        # idxmax() raises ValueError on an empty selection, so report the
        # missing season instead of aborting the whole loop.
        print(f"{year}: keine Daten")
        continue
    # On ties, idxmax() returns the first row holding the maximum.
    idx = sub["wins"].idxmax()
    row = sub.loc[idx]
    print(f"{year}: {row['name']} mit {row['wins']} Siegen")



1990: Chicago Blackhawks mit 49 Siegen
2000: Colorado Avalanche mit 52 Siegen
2010: Vancouver Canucks mit 54 Siegen


In [9]:
# Count the distinct team names recorded for each selected season.
for year in (1991, 2001, 2011):
    is_season = df["year"] == year
    sub = df.loc[is_season]
    n = sub["name"].nunique(dropna=True)
    print(f"{year}: {n} Teams")


1991: 22 Teams
2001: 30 Teams
2011: 30 Teams
