# Web Scraping and Data Preparation

In [1]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup


In [7]:
# Column order of the combined table; scraped columns not listed here are
# appended after these.
_STANDINGS_COLUMNS = ["Year", "Team", "W", "L",
                      "W/L%", "GB", "PS/G", "PA/G",
                      "SRS", "Playoffs", "Losing_season"]

# Conference/division labels that appear among the <th> cells but are not teams.
_GROUP_LABELS = frozenset([
    "Eastern Conference", "Western Conference",
    "Atlantic Division", "Central Division", "Southeast Division",
    "Northwest Division", "Pacific Division", "Southwest Division",
    "Midwest Division",
])


def _parse_year_standings(soup, year):
    """Build one season's standings DataFrame from a parsed standings page.

    Args:
        soup: BeautifulSoup document of the standings page.
        year: Value stored in the "Year" column of every row.

    Returns:
        DataFrame with "Year", "Team", the stat columns found in the first
        table row (up to and including "SRS"), plus the derived "Playoffs"
        and "Losing_season" Y/N flags.

    Raises:
        IndexError: if the page contains no <tr> element.
        ValueError: if no "SRS" header cell is found, or a "W/L%" value
            cannot be converted to float.
    """
    th_texts = [th.get_text()
                for th in soup.find_all('tr', limit=2)[0].find_all('th')]
    srs_end = th_texts.index("SRS") + 1
    headers = th_texts[1:srs_end]      # stat column names, first cell skipped
    remaining = th_texts[srs_end:]     # candidate team names

    try:
        row_titles = remaining[:remaining.index("Eastern Conference")]
    except ValueError:
        # No "Eastern Conference" marker: treat every remaining cell as a
        # candidate row title.
        row_titles = remaining
    # Drop one occurrence of each stat header that leaked into the titles.
    for name in headers:
        if name in row_titles:
            row_titles.remove(name)
    row_titles = [title for title in row_titles if title not in _GROUP_LABELS]

    data_rows = soup.find_all('tr')[1:]
    team_stats = [[td.get_text() for td in row.find_all('td')]
                  for row in data_rows]
    team_stats = [cells for cells in team_stats if cells]  # skip rows w/o <td>
    team_stats = team_stats[:len(row_titles)]

    # team_stats is never longer than row_titles, so zip pairs every stats row.
    records = [[year, team, *cells]
               for team, cells in zip(row_titles, team_stats)]

    standings = pd.DataFrame(records, columns=["Year", "Team", *headers])
    # A "*" in the team name is used as the playoff marker; record it, then
    # strip it from the name.
    standings["Playoffs"] = ["Y" if "*" in team else "N"
                             for team in standings["Team"]]
    standings["Team"] = [team.replace('*', '') for team in standings["Team"]]
    standings["Losing_season"] = ["Y" if float(pct) < .5 else "N"
                                  for pct in standings["W/L%"]]
    return standings


def scrape_NBA_team_data(years=(2017, 2018)):
    """Scrape basketball-reference.com standings for each season in *years*.

    Downloads one standings page per year, parses it into a table, stacks the
    per-year tables, writes the result to "nba_team_data.csv" in the current
    working directory, and returns it.

    Args:
        years: Iterable of season years used to build the standings URL.

    Returns:
        DataFrame of all seasons with a fresh 0..n-1 index. Columns start
        with Year, Team, W, L, W/L%, GB, PS/G, PA/G, SRS, Playoffs,
        Losing_season; any other scraped columns follow.

    Raises:
        urllib.error.URLError: if a page cannot be fetched.
    """
    yearly_frames = []
    for y in years:
        url = f"https://www.basketball-reference.com/leagues/NBA_{y}_standings.html"
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(url) as response:
            soup = BeautifulSoup(response, "lxml")
        yearly_frames.append(_parse_year_standings(soup, y))

    if yearly_frames:
        # Concatenate once instead of growing a DataFrame inside the loop.
        final_df = pd.concat(yearly_frames, ignore_index=True)
        extra_columns = [col for col in final_df.columns
                         if col not in _STANDINGS_COLUMNS]
        final_df = final_df.reindex(columns=_STANDINGS_COLUMNS + extra_columns)
    else:
        final_df = pd.DataFrame(columns=_STANDINGS_COLUMNS)

    final_df.to_csv("nba_team_data.csv", index=False)
    return final_df


In [8]:
# Run the scraper for the 2017 and 2018 seasons and keep the returned table
nba_data = scrape_NBA_team_data(years=[2017, 2018])