In [8]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
from tqdm import tqdm

In [28]:
def get_table_from_comments(soup, table_id):
    """
    Search the HTML comments of a parsed page for a table with the given id.

    Basketball-Reference often encloses tables within comments, so a plain
    ``soup.find("table", ...)`` does not see them. Each candidate comment is
    re-parsed as HTML and searched for the table.

    Parameters
    ----------
    soup : BeautifulSoup
        The parsed page whose comments should be searched.
    table_id : str
        Value of the ``id`` attribute of the wanted ``<table>``.

    Returns
    -------
    The first matching table tag, or ``None`` when no comment contains one.
    """
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        # A comment that does not even mention the id cannot hold the table,
        # so skip it instead of building a parser for it. This also avoids
        # feeding short, non-markup comments to BeautifulSoup. The check is
        # only applied to plain string ids; other id filters are passed
        # through to ``find`` unchanged.
        if isinstance(table_id, str) and table_id not in comment:
            continue
        comment_soup = BeautifulSoup(comment, "html.parser")
        table = comment_soup.find("table", id=table_id)
        if table:
            return table
    return None

def get_coaches_index(index_url, timeout=30):
    """
    Download the coaches index page and extract the coaches table.

    Parameters
    ----------
    index_url : str
        URL of the coaches index page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    tuple
        ``(df, soup)`` where ``df`` is the coaches table parsed by
        ``pandas.read_html`` and ``soup`` is the parsed page, which callers
        can use to look up the per-coach links.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so an error page is
        never mistaken for a page that merely lacks the table.
    ValueError
        If no coaches table can be found in the page.
    """
    response = requests.get(index_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Try to directly find the table.
    table = soup.find("table", id="coaches")
    if not table:
        # If not found, look inside comments for the same id first; the
        # previously used "NBA_stats" id is kept as a last resort.
        table = (
            get_table_from_comments(soup, "coaches")
            or get_table_from_comments(soup, "NBA_stats")
        )

    if not table:
        raise ValueError("Could not find the coaches table.")

    # header=1 uses the second row of the table as column names (the first
    # row is skipped). The markup is wrapped in StringIO because passing
    # literal HTML strings to read_html is deprecated.
    df = pd.read_html(StringIO(str(table)), header=1)[0]
    return df, soup

def get_coach_season_data(coach_url, coach_name, timeout=30):
    """
    For a given coach URL, scrape and extract season-by-season data.

    Parameters
    ----------
    coach_url : str
        Absolute URL of the coach's page.
    coach_name : str
        Name stored in the added ``Coach`` column of the returned frame.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame or None
        The season table with an extra ``Coach`` column, or ``None`` when the
        page contains no matching table. The frame is returned as parsed;
        callers should check for the columns they need (e.g. ``Season``,
        ``W/L%``).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Without this check an
        error page would be reported as "no table found".
    """
    response = requests.get(coach_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Check direct presence first, then the comments using the same id; the
    # previously used "coaches" id is kept as a last resort.
    table = soup.find("table", id="coach-stats")
    if not table:
        table = (
            get_table_from_comments(soup, "coach-stats")
            or get_table_from_comments(soup, "coaches")
        )

    if not table:
        return None

    # header=1 uses the second row of the table as column names. The markup
    # is wrapped in StringIO because passing literal HTML strings to
    # read_html is deprecated.
    df = pd.read_html(StringIO(str(table)), header=1)[0]
    df['Coach'] = coach_name
    return df

In [18]:
# Main script: download the coaches index once, keeping both the parsed
# table and the page soup returned by get_coaches_index.
index_url = "https://www.basketball-reference.com/coaches/NBA_stats.html"
index_page = get_coaches_index(index_url)
coaches_df = index_page[0]
index_soup = index_page[1]

Coaches Index Sample:


In [19]:
# Display the coaches index table returned by get_coaches_index.
# NOTE(review): in the rendered output the row index runs past the highest
# "Rk" value, so the frame presumably still contains non-coach rows (e.g.
# repeated header rows) — confirm before aggregating.
coaches_df

Unnamed: 0,Rk,Coach,From,To,Yrs,G,W,L,W/L%,W > .500,G.1,W.1,L.1,W/L%.1,Conf,Champ
0,1,Rick Adelman*,1989,2014,23,1791,1042,749,.582,146.5,157,79,78,.503,2,0
1,2,Richie Adubato,1980,1997,6,367,127,240,.346,-56.5,8,2,6,.250,0,0
2,3,Danny Ainge,1997,2000,4,226,136,90,.602,23.0,12,3,9,.250,0,0
3,4,Stan Albeck,1980,1986,7,574,307,267,.535,20.0,44,18,26,.409,0,0
4,5,Curly Armstrong,1949,1949,1,54,22,32,.407,-5.0,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384,351,Randy Wittman,2000,2016,10,684,278,406,.406,-64.0,21,12,9,.571,0,0
385,352,Dave Wohl,1986,1988,3,179,65,114,.363,-24.5,3,0,3,.000,0,0
386,353,Charles Wolf,1961,1965,5,330,143,187,.433,-22.0,16,7,9,.438,0,0
387,354,Mike Woodson,2005,2014,9,680,315,365,.463,-25.0,46,18,28,.391,0,0


In [44]:
# Per-coach season tables are collected in a list and concatenated once;
# concatenating inside the loop re-copies every accumulated row on each pass.
season_frames = []
whole_coaches_df = pd.DataFrame()

try:
    # Loop over every row of the coaches index. Each scraped coach costs about
    # three seconds because of the delay after the request.
    for idx, row in tqdm(coaches_df.iterrows(), total=len(coaches_df)):
        coach_name = row["Coach"]

        # Skip rows that do not describe a coach: missing names are not
        # strings, and a cell that merely repeats the column title is
        # presumably a header row repeated inside the table body. Neither
        # needs a request or a delay.
        if not isinstance(coach_name, str) or coach_name == "Coach":
            continue

        request_sent = False
        try:
            coach_name = coach_name.replace("*", "")
            # Locate the coach hyperlink from the index page's soup.
            # This assumes the text in the <a> tag exactly matches the coach's name.
            link_tag = index_soup.find("a", string=coach_name)
            if link_tag and link_tag.get("href"):
                coach_url = "https://www.basketball-reference.com" + link_tag["href"]
                request_sent = True
                season_df = get_coach_season_data(coach_url, coach_name=coach_name)
                if season_df is None:
                    print("No season data found for this coach.")
                else:
                    season_frames.append(season_df)
                    has_win_pct = "W/L%" in season_df.columns or "Win%" in season_df.columns
                    if not ("Season" in season_df.columns and has_win_pct):
                        print("Expected columns not found in the season table.")
            else:
                print(f"Link not found for coach: {coach_name}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next coach.
            print(e, coach_name)

        # Pause only after the site was actually contacted.
        if request_sent:
            time.sleep(3)
finally:
    # Build the combined frame even when the loop is interrupted, so the
    # rows scraped so far are not lost.
    if season_frames:
        whole_coaches_df = pd.concat(season_frames, ignore_index=True)

  comment_soup = BeautifulSoup(comment, "html.parser")


No season data found for this coach.


  0%|          | 1/389 [00:03<20:43,  3.21s/it]

No season data found for this coach.


  0%|          | 1/389 [00:05<35:46,  5.53s/it]


KeyboardInterrupt: 

In [45]:
# Display the season-by-season rows accumulated by the scraping loop; the
# frame is empty when no coach page yielded a table.
whole_coaches_df