# Importing Libraries

In [1]:
import pandas as pd
pd.set_option('display.max_columns',None)
import requests as rq 

The request headers below replicate the ones a browser sends when a user visits the NBA stats web page.

In [2]:
# Request headers sent with every call to stats.nba.com. They mirror what a
# desktop Chrome browser sends when the stats page is opened from www.nba.com.
# NOTE(review): advertising 'br' in Accept-Encoding presumably requires a brotli
# decoder to be installed alongside requests/urllib3 -- confirm in the target environment.
Headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'stats.nba.com',
    'Origin': 'https://www.nba.com',
    'Referer': 'https://www.nba.com/',
    'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    # Adjacent string literals are joined by the parser. A backslash continuation
    # inside a single literal would embed the next line's indentation in the value.
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'),
}

# Scraping the Data

After creating a dataframe DF, the desired season types and season years are specified and used in a for loop that fills the dataset. The correct column headers are then added, and a message is printed once the process is complete.

The dataset spans 71 years because the 1951-1952 season is the earliest season with available data.

In [6]:
# Season types requested from the API: (URL-encoded query value, readable label).
_SEASON_TYPES = [('Regular%20Season', 'Regular Season'), ('Playoffs', 'Playoffs')]

# League-leaders endpoint; the season and season type are filled in per request.
_LEADERS_URL = ('https://stats.nba.com/stats/leagueLeaders?LeagueID=00&PerMode=Totals'
                '&Scope=S&Season={season}&SeasonType={season_type}&StatCategory=PTS')

# Seconds to wait for the server before giving up on a single request.
_REQUEST_TIMEOUT = 30

# API column name -> column name written to the output file.
# 'Efficency' is misspelled but kept as-is: it is the column name already
# present in files produced by earlier runs.
_COLUMN_LABELS = {
    'RANK': 'Rank',
    'PLAYER_ID': 'Player ID',
    'TEAM_ID': 'Team ID',
    'PLAYER': 'Player',
    'TEAM': 'Team',
    'GP': 'Games Played',
    'MIN': 'Minutes Played',
    'FGM': 'FG Made',
    'FGA': 'FG Attempts',
    'FG_PCT': 'FG %',
    'FG3M': '3-Pt FG Made',
    'FG3A': '3-Pt FG Attempts',
    'FG3_PCT': '3-Pt FG %',
    'FTM': 'FT Made',
    'FTA': 'FT Attempts',
    'FT_PCT': 'FT %',
    'OREB': 'Offensive Rebounds',
    'DREB': 'Defensive Rebounds',
    'REB': 'Rebounds',
    'AST': 'Assists',
    'STL': 'Steals',
    'BLK': 'Blocks',
    'TOV': 'Turnovers',
    'PF': 'Personal Fouls',
    'PTS': 'Points Scored',
    'EFF': 'Efficency',
    'AST_TOV': 'AST/TOV',
    'STL_TOV': 'STL/TOV',
}

# Final column order of the saved dataset.
_ORDERED_COLS = ['Rank', 'Year', 'Season Start Year', 'Season Type', 'Player ID', 'Player', 'Team ID', 'Team',
                 'Games Played', 'Minutes Played', 'FG Made', 'FG Attempts', 'FG %',
                 '3-Pt FG Made', '3-Pt FG Attempts', '3-Pt FG %', 'FT Made',
                 'FT Attempts', 'FT %', 'Offensive Rebounds', 'Defensive Rebounds',
                 'Rebounds', 'Assists', 'Steals', 'Blocks', 'Turnovers',
                 'Personal Fouls', 'Points Scored', 'Efficency', 'AST/TOV', 'STL/TOV']

_RULE = '------------------------------------------------------------------'


def _banner(title_line):
    """Print the horizontal rule followed by an (already indented) stage title."""
    print(_RULE + '\n \n' + title_line)


def _season_label(start_year):
    """Return the API season label for a start year, e.g. 1999 -> '1999-00'."""
    return f"{start_year}-{(start_year + 1) % 100:02d}"


def _fetch_leaders(season, season_type):
    """Request one season / season type and return its rows as a dataframe."""
    url = _LEADERS_URL.format(season=season, season_type=season_type)
    response = rq.get(url=url, headers=Headers, timeout=_REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    result_set = response.json()['resultSet']
    return pd.DataFrame(result_set['rowSet'], columns=result_set['headers'])


def _scrape(first_season, last_season):
    """Download every season in the inclusive range and stack the results."""
    frames = []
    for start_year in range(first_season, last_season + 1):
        season = _season_label(start_year)
        for query_value, label in _SEASON_TYPES:
            leaders = _fetch_leaders(season, query_value)
            if not leaders.empty:
                leaders['Year'] = season
                leaders['Season Start Year'] = start_year
                leaders['Season Type'] = label
                frames.append(leaders)
            print(f"Succesfully finished scraping data for the {season} {label}.")

    if not frames:
        raise ValueError(
            f"No rows were returned for seasons {first_season} to {last_season}."
        )
    # One concat at the end: columns are aligned by name and the index is rebuilt.
    return pd.concat(frames, axis=0, ignore_index=True)


def _clean(stats_raw, drop_missing):
    """Rename and reorder columns, drop duplicate rows, and report missing values."""
    stats = stats_raw.rename(columns=_COLUMN_LABELS)[_ORDERED_COLS]

    if stats.duplicated().any():
        stats = stats.drop_duplicates()
        print("Duplicate rows removed.")
    else:
        print("No duplicate rows found.")

    rows_with_gaps = int(stats.isna().any(axis=1).sum())
    if rows_with_gaps == 0:
        print("No missing values found.")
    elif drop_missing:
        stats = stats.dropna()
        print("Missing values removed.")
    else:
        # Rows are kept by default, so seasons with unrecorded stats stay in the file.
        print(f"Missing values found in {rows_with_gaps} rows; rows kept "
              f"(pass drop_missing=True to remove them).")
    return stats


def Pipeline(first_season=1951, last_season=2022,
             filename='NBA_Stats_71_Years_DS.xlsx', drop_missing=False):
    """Scrape NBA league-leader totals, clean them, and save them to an Excel file.

    Parameters
    ----------
    first_season, last_season : int
        Start years of the first and last season to download (inclusive).
        The defaults cover '1951-52' through '2022-23'.
    filename : str
        Path of the Excel file to write.
    drop_missing : bool
        When True, rows containing any missing value are removed. When False
        (the default) they are kept and only their count is reported.

    Returns
    -------
    None
        The result is written to ``filename``; progress is printed.

    Raises
    ------
    ValueError
        If no rows at all were returned for the requested seasons.
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    stats_raw = _scrape(first_season, last_season)
    _banner('                      Scraping Process Complete!')

    stats_clean = _clean(stats_raw, drop_missing)
    _banner('                          Cleaning Process Complete!')

    print(f"\n \
                                     There are {stats_clean['Team'].nunique()} team names considered in the dataframe:\
    \n")
    # Missing team names are excluded so that sorting never compares str with NaN.
    print(sorted(stats_clean['Team'].dropna().unique()))

    # Write first, then announce: the message must not claim a save that failed.
    stats_clean.to_excel(filename, index=False)

    _banner('                                Process Complete!')
    print(f"Data saved to file:  {filename}.")

In [7]:
Pipeline()

Succesfully finished scraping data for the 1951-52 Regular Season.
Succesfully finished scraping data for the 1951-52 Playoffs.
Succesfully finished scraping data for the 1952-53 Regular Season.
Succesfully finished scraping data for the 1952-53 Playoffs.
Succesfully finished scraping data for the 1953-54 Regular Season.
Succesfully finished scraping data for the 1953-54 Playoffs.
Succesfully finished scraping data for the 1954-55 Regular Season.
Succesfully finished scraping data for the 1954-55 Playoffs.
Succesfully finished scraping data for the 1955-56 Regular Season.
Succesfully finished scraping data for the 1955-56 Playoffs.
Succesfully finished scraping data for the 1956-57 Regular Season.
Succesfully finished scraping data for the 1956-57 Playoffs.
Succesfully finished scraping data for the 1957-58 Regular Season.
Succesfully finished scraping data for the 1957-58 Playoffs.
Succesfully finished scraping data for the 1958-59 Regular Season.
Succesfully finished scraping data fo

Succesfully finished scraping data for the 2015-16 Playoffs.
Succesfully finished scraping data for the 2016-17 Regular Season.
Succesfully finished scraping data for the 2016-17 Playoffs.
Succesfully finished scraping data for the 2017-18 Regular Season.
Succesfully finished scraping data for the 2017-18 Playoffs.
Succesfully finished scraping data for the 2018-19 Regular Season.
Succesfully finished scraping data for the 2018-19 Playoffs.
Succesfully finished scraping data for the 2019-20 Regular Season.
Succesfully finished scraping data for the 2019-20 Playoffs.
Succesfully finished scraping data for the 2020-21 Regular Season.
Succesfully finished scraping data for the 2020-21 Playoffs.
Succesfully finished scraping data for the 2021-22 Regular Season.
Succesfully finished scraping data for the 2021-22 Playoffs.
Succesfully finished scraping data for the 2022-23 Regular Season.
Succesfully finished scraping data for the 2022-23 Playoffs.
-------------------------------------------

There are missing values (None, NaN) for some columns in the earlier seasons up until about 1979. This is likely to be due to those stats not having been recorded yet. This should be considered before simply setting them to 0.

# Cleaning

Afterwards, the dataframe is cleaned by renaming the columns and rewriting the "Regular Season" value of the "Season Type" column in a more readable way. Additionally, since the "Year" column has object-type values (e.g. 2008-09), a numerical equivalent column ("Season Start Year") is created for manipulation.

The columns are then reordered in a more appropriate manner and the resulting dataframe is previewed for confirmation.

# Checking Properties

The data types are specified and we then perform a quick search for duplicates.

Afterwards, we check for the unique values in the "Team" column. It is important to note that the number of NBA teams per season has steadily increased through the years (for a while there have been 30 per season). However, the number of team names considered is double that. This is likely due to multiple teams changing names (and location) through the years (and therefore appearing more than once) or simply ceasing to exist. It is crucial to consider this before making any analysis.

# Saving Dataset to File

Finally, the data is saved to an Excel file.