## Import der Python-Bibliotheken

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

import pandas as pd

## Pagination

In [3]:
# Base URL of the NHL forms page (this is also the URL of page 1)
BASE_URL = "https://www.scrapethissite.com/pages/forms/"

def get_all_page_urls(base_url: str, max_page_num: int = 24):
    """
    Build the URLs of every page of the paginated table.

    The first page is ``base_url`` itself (no query string); every further
    page ``k`` in ``2..max_page_num`` is ``base_url`` followed by
    ``?page_num=k``.

    Args:
        base_url: URL of the first page.
        max_page_num: Number of the last page to include.

    Returns:
        List of URL strings, first page first.
    """
    following_pages = [
        f"{base_url}?page_num={number}"
        for number in range(2, max_page_num + 1)
    ]
    # Page 1 has no page_num parameter, so it is prepended as-is
    return [base_url, *following_pages]

# Build the URL list once and print it as a sanity check
page_urls = get_all_page_urls(BASE_URL)

print("Anzahl Seiten:", len(page_urls))
# One URL per line
print(*page_urls, sep="\n")


Anzahl Seiten: 24
https://www.scrapethissite.com/pages/forms/
https://www.scrapethissite.com/pages/forms/?page_num=2
https://www.scrapethissite.com/pages/forms/?page_num=3
https://www.scrapethissite.com/pages/forms/?page_num=4
https://www.scrapethissite.com/pages/forms/?page_num=5
https://www.scrapethissite.com/pages/forms/?page_num=6
https://www.scrapethissite.com/pages/forms/?page_num=7
https://www.scrapethissite.com/pages/forms/?page_num=8
https://www.scrapethissite.com/pages/forms/?page_num=9
https://www.scrapethissite.com/pages/forms/?page_num=10
https://www.scrapethissite.com/pages/forms/?page_num=11
https://www.scrapethissite.com/pages/forms/?page_num=12
https://www.scrapethissite.com/pages/forms/?page_num=13
https://www.scrapethissite.com/pages/forms/?page_num=14
https://www.scrapethissite.com/pages/forms/?page_num=15
https://www.scrapethissite.com/pages/forms/?page_num=16
https://www.scrapethissite.com/pages/forms/?page_num=17
https://www.scrapethissite.com/pages/forms/?page_n

## Webseiten scrapen

In [7]:
def scrape_page(url: str, timeout: float = 10.0):
    """
    Fetch one page of the NHL team table and extract every team row.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        A list with one dictionary per ``tr.team`` row. All values are the
        stripped cell texts (strings); a cell that is missing from a row
        yields an empty string.

    Raises:
        requests.HTTPError: If the response has an error status (e.g. 404).
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Output column name -> CSS selector of the cell inside a team row.
    # The dict order defines the key order of every returned record.
    column_selectors = {
        "Team Name": "td.name",
        "Year": "td.year",
        "Wins": "td.wins",
        "Losses": "td.losses",
        "OT Losses": "td.ot-losses",
        "Win %": "td.pct",
        "Goals For (GF)": "td.gf",
        "Goals Against (GA)": "td.ga",
        "+ / -": "td.diff",
    }

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # raise on HTTP error statuses such as 404

    soup = BeautifulSoup(response.text, "html.parser")

    def cell_text(row, selector):
        """Return the stripped text of the matching cell, or "" if absent."""
        cell = row.select_one(selector)
        return cell.get_text(strip=True) if cell else ""

    # Every data row on the page carries the class "team"
    return [
        {
            column: cell_text(row, selector)
            for column, selector in column_selectors.items()
        }
        for row in soup.select("tr.team")
    ]

# Smoke test: scrape only the first page (page_urls[0]) and show its
# row count plus the first three records
erste_seite = scrape_page(page_urls[0])
len(erste_seite), erste_seite[:3]



(25,
 [{'Team Name': 'Boston Bruins',
   'Year': '1990',
   'Wins': '44',
   'Losses': '24',
   'OT Losses': '',
   'Win %': '0.55',
   'Goals For (GF)': '299',
   'Goals Against (GA)': '264',
   '+ / -': '35'},
  {'Team Name': 'Buffalo Sabres',
   'Year': '1990',
   'Wins': '31',
   'Losses': '30',
   'OT Losses': '',
   'Win %': '0.388',
   'Goals For (GF)': '292',
   'Goals Against (GA)': '278',
   '+ / -': '14'},
  {'Team Name': 'Calgary Flames',
   'Year': '1990',
   'Wins': '46',
   'Losses': '26',
   'OT Losses': '',
   'Win %': '0.575',
   'Goals For (GF)': '344',
   'Goals Against (GA)': '263',
   '+ / -': '81'}])

In [None]:
# Page 9: first three teams (sanity check only)
neunte_seite = scrape_page(page_urls[8])
len(neunte_seite), neunte_seite[:3]


(25,
 [{'Team Name': 'Calgary Flames',
   'Year': '1998',
   'Wins': '30',
   'Losses': '40',
   'OT Losses': '',
   'Win %': '0.366',
   'Goals For (GF)': '211',
   'Goals Against (GA)': '234',
   '+ / -': '-23'},
  {'Team Name': 'Carolina Hurricanes',
   'Year': '1998',
   'Wins': '34',
   'Losses': '30',
   'OT Losses': '',
   'Win %': '0.415',
   'Goals For (GF)': '210',
   'Goals Against (GA)': '202',
   '+ / -': '8'},
  {'Team Name': 'Chicago Blackhawks',
   'Year': '1998',
   'Wins': '29',
   'Losses': '41',
   'OT Losses': '',
   'Win %': '0.354',
   'Goals For (GF)': '202',
   'Goals Against (GA)': '248',
   '+ / -': '-46'}])

In [9]:
# Collect the records of every page into one flat list
all_rows = []

for page_no, page_url in enumerate(page_urls, start=1):
    print(f"Scraping Seite {page_no}: {page_url}")
    all_rows += scrape_page(page_url)

# Total number of scraped team rows
len(all_rows)


Scraping Seite 1: https://www.scrapethissite.com/pages/forms/
Scraping Seite 2: https://www.scrapethissite.com/pages/forms/?page_num=2
Scraping Seite 3: https://www.scrapethissite.com/pages/forms/?page_num=3
Scraping Seite 4: https://www.scrapethissite.com/pages/forms/?page_num=4
Scraping Seite 5: https://www.scrapethissite.com/pages/forms/?page_num=5
Scraping Seite 6: https://www.scrapethissite.com/pages/forms/?page_num=6
Scraping Seite 7: https://www.scrapethissite.com/pages/forms/?page_num=7
Scraping Seite 8: https://www.scrapethissite.com/pages/forms/?page_num=8
Scraping Seite 9: https://www.scrapethissite.com/pages/forms/?page_num=9
Scraping Seite 10: https://www.scrapethissite.com/pages/forms/?page_num=10
Scraping Seite 11: https://www.scrapethissite.com/pages/forms/?page_num=11
Scraping Seite 12: https://www.scrapethissite.com/pages/forms/?page_num=12
Scraping Seite 13: https://www.scrapethissite.com/pages/forms/?page_num=13
Scraping Seite 14: https://www.scrapethissite.com/page

582

## DataFrame bauen und CSV-Datei speichern

In [10]:
# Columns whose scraped text should become real numeric columns
numeric_cols = [
    "Year",
    "Wins",
    "Losses",
    "OT Losses",
    "Win %",
    "Goals For (GF)",
    "Goals Against (GA)",
    "+ / -",
]

# Build the frame from the collected records, then convert the numeric
# columns; text that cannot be parsed becomes NaN (errors="coerce")
df = pd.DataFrame(all_rows)
df = df.assign(
    **{name: pd.to_numeric(df[name], errors="coerce") for name in numeric_cols}
)

# Quick look at the first rows and the resulting dtypes
df.head(), df.dtypes


(            Team Name  Year  Wins  Losses  OT Losses  Win %  Goals For (GF)  \
 0       Boston Bruins  1990    44      24        NaN  0.550             299   
 1      Buffalo Sabres  1990    31      30        NaN  0.388             292   
 2      Calgary Flames  1990    46      26        NaN  0.575             344   
 3  Chicago Blackhawks  1990    49      23        NaN  0.613             284   
 4   Detroit Red Wings  1990    34      38        NaN  0.425             273   
 
    Goals Against (GA)  + / -  
 0                 264     35  
 1                 278     14  
 2                 263     81  
 3                 211     73  
 4                 298    -25  ,
 Team Name              object
 Year                    int64
 Wins                    int64
 Losses                  int64
 OT Losses             float64
 Win %                 float64
 Goals For (GF)          int64
 Goals Against (GA)      int64
 + / -                   int64
 dtype: object)

In [11]:
# Persist the table to disk; index=False keeps the row index out of the file
df.to_csv("data.csv", index=False)


## Analyse des Datensatzes

In [12]:
# Columns that must be numeric for the analysis below
numeric_cols = [
    "Year",
    "Wins",
    "Losses",
    "OT Losses",
    "Win %",
    "Goals For (GF)",
    "Goals Against (GA)",
    "+ / -"
]

# Reload the saved data and re-apply the numeric conversion so the analysis
# does not depend on how the CSV reader inferred the column types
df = pd.read_csv("data.csv")
df = df.assign(
    **{name: pd.to_numeric(df[name], errors="coerce") for name in numeric_cols}
)

df.head()


Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -
0,Boston Bruins,1990,44,24,,0.55,299,264,35
1,Buffalo Sabres,1990,31,30,,0.388,292,278,14
2,Calgary Flames,1990,46,26,,0.575,344,263,81
3,Chicago Blackhawks,1990,49,23,,0.613,284,211,73
4,Detroit Red Wings,1990,34,38,,0.425,273,298,-25


## Welches Team hat die meisten Siege in den verschiedenen Jahren?

In [14]:
def most_wins_in_year(df, year):
    """
    Find the team with the most wins in a given year.

    Args:
        df: Table with at least the columns "Year", "Team Name" and "Wins".
        year: Value to match against the "Year" column.

    Returns:
        A dict with the keys "Year", "Team" and "Wins" (wins as a plain
        int), or None when no row matches ``year``. On a tie the first
        matching row in table order wins, as given by ``idxmax``.
    """
    season = df.loc[df["Year"] == year]
    if len(season) == 0:
        return None

    # idxmax yields the index label of the row holding the highest win count
    best = season.loc[season["Wins"].idxmax()]
    return dict(Year=year, Team=best["Team Name"], Wins=int(best["Wins"]))

# Seasons to inspect; one result record is produced per season
years_to_check = [1990, 2000, 2010]
results_most_wins = [
    most_wins_in_year(df, season_year) for season_year in years_to_check
]

pd.DataFrame(results_most_wins)


Unnamed: 0,Year,Team,Wins
0,1990,Chicago Blackhawks,49
1,2000,Colorado Avalanche,52
2,2010,Vancouver Canucks,54


## Bestimmen der Anzahl der Teams in den Jahren 1991, 2001, 2011

In [15]:
def number_of_teams(df, year):
    """
    Count the distinct team names recorded for a given year.

    Args:
        df: Table with at least the columns "Year" and "Team Name".
        year: Value to match against the "Year" column.

    Returns:
        A dict with the keys "Year" and "Teams"; "Teams" is 0 when no row
        matches ``year``.
    """
    in_year = df["Year"] == year
    team_count = df.loc[in_year, "Team Name"].nunique()
    return {"Year": year, "Teams": team_count}

# Seasons for which the number of distinct teams is reported
years_to_count = [1991, 2001, 2011]
results_team_counts = [
    number_of_teams(df, season_year) for season_year in years_to_count
]

pd.DataFrame(results_team_counts)


Unnamed: 0,Year,Teams
0,1991,22
1,2001,30
2,2011,30
