In [1]:

import pandas as pd
import requests
from datetime import datetime


In [2]:
def remove_multiindex(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with the outermost column level removed.

    Args:
        df: Frame whose columns may be a ``pd.MultiIndex``.

    Returns:
        A new DataFrame. When the columns are a ``MultiIndex`` the top
        level (level 0) is dropped; when they are already a flat index
        the frame is returned as an unchanged copy.
    """
    # droplevel() raises ValueError if asked to remove the only level of a
    # flat index, so tables that arrive without a header group are passed
    # through instead of crashing.
    if not isinstance(df.columns, pd.MultiIndex):
        return df.copy()
    return df.droplevel(level=0, axis=1)


In [3]:
# Download a page and return its raw HTML text
def getdata(url, timeout=30):
    """Download *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within *timeout* seconds.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of handing an error page to the
    # caller as though it were the requested document.
    r.raise_for_status()
    return r.text

In [5]:
import pandas as pd
import os
from datetime import datetime

# Build one DataFrame of per-player playoff statistics for every season from
# 1980 up to (but not including) the current calendar year. Each season is
# cached in 'playoffs/<year>_player_playoff_stats.csv'; a season is only
# scraped when its cache file is missing. The combined table is written to
# 'playoffs/all_player_playoff_stats.csv'.

# Extra imports used only by this cell.
import time
from io import StringIO

# Pause after every page request. Two pages are fetched per uncached season,
# and the site throttles clients that request pages too quickly.
REQUEST_DELAY_SECONDS = 3.5

# Columns to keep from the merged per-game + advanced table. 'G_x' and 'MP_x'
# carry the '_x' suffix that pd.merge gives left-hand columns whose names
# also occur in the right-hand table.
player_stats_list = ['Player', 'Pos', 'Age', 'Tm', 'G_x', 'GS', 'MP_x', 'FG', 'FGA',
                     'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
                     'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
                     'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%',
                     'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS',
                     'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']

# Final column names: identical to the list above with the merge suffixes
# stripped from the two clashing columns.
_suffix_fixes = {'G_x': 'G', 'MP_x': 'MP'}
player_stats_list_correct = [_suffix_fixes.get(col, col) for col in player_stats_list]

# to_csv() does not create missing directories, so make sure the cache
# directory exists before the first write.
os.makedirs('playoffs', exist_ok=True)

# Collect one frame per season and concatenate once after the loop; growing
# a DataFrame with pd.concat inside the loop re-copies all earlier rows on
# every iteration.
yearly_frames = []

# NOTE(review): range() stops before the current year, so the current
# season is never fetched - confirm that is intended.
for year in range(1980, datetime.now().year):
    # Construct the filename for the CSV file
    csv_filename = f'playoffs/{year}_player_playoff_stats.csv'

    if os.path.exists(csv_filename):
        # Cached season: load it and (re)stamp the season on every row
        year_data = pd.read_csv(csv_filename, index_col=0)
        year_data['Year'] = year

        yearly_frames.append(year_data)
        print(f'Read {year} data from existing file.')
        continue

    # No cache file: download the per-game and advanced tables
    per_game_html = getdata(f'https://www.basketball-reference.com/playoffs/NBA_{year}_per_game.html')
    time.sleep(REQUEST_DELAY_SECONDS)
    advanced_html = getdata(f'https://www.basketball-reference.com/playoffs/NBA_{year}_advanced.html')
    time.sleep(REQUEST_DELAY_SECONDS)

    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas releases. The first table on each page is used.
    df = pd.read_html(StringIO(per_game_html))[0]
    df_adv = pd.read_html(StringIO(advanced_html))[0]

    # Drop rows that merely repeat the table header inside the body
    df = df.drop(df[df['Player'] == 'Player'].index)
    df_adv = df_adv.drop(df_adv[df_adv['Player'] == 'Player'].index)

    # Join the two tables on the columns that identify a player row
    new = pd.merge(df, df_adv, on=['Player', 'Age', 'Tm', 'Pos'])

    # Keep the wanted columns and give them their final names. The explicit
    # copy makes the following assignments act on an independent frame
    # rather than on a slice of the merge result.
    new = new[player_stats_list].copy()
    new.columns = player_stats_list_correct

    # keep=False discards EVERY row of a player name that occurs more than
    # once; no copy is retained.
    # NOTE(review): confirm dropping such players entirely is intended.
    new = new.drop_duplicates(subset='Player', keep=False).reset_index(drop=True)

    # Fill NaN values with 0
    new = new.fillna(value=0)

    # Add a 'Year' column
    new['Year'] = year

    yearly_frames.append(new)

    # Save the processed DataFrame so later runs can skip the download
    new.to_csv(csv_filename)
    print(year, 'Complete')

# Combine all seasons; fall back to an empty frame because pd.concat raises
# on an empty list.
if yearly_frames:
    all_years_data = pd.concat(yearly_frames, ignore_index=True)
else:
    all_years_data = pd.DataFrame()

# After the loop, you can work with the 'all_years_data' DataFrame
all_years_data.to_csv('playoffs/all_player_playoff_stats.csv')

Read 1980 data from existing file.
Read 1981 data from existing file.
Read 1982 data from existing file.
Read 1983 data from existing file.
Read 1984 data from existing file.
Read 1985 data from existing file.
Read 1986 data from existing file.
Read 1987 data from existing file.
Read 1988 data from existing file.
Read 1989 data from existing file.
Read 1990 data from existing file.
Read 1991 data from existing file.
Read 1992 data from existing file.
Read 1993 data from existing file.
Read 1994 data from existing file.
Read 1995 data from existing file.
Read 1996 data from existing file.
Read 1997 data from existing file.
Read 1998 data from existing file.
Read 1999 data from existing file.
Read 2000 data from existing file.
Read 2001 data from existing file.
Read 2002 data from existing file.
Read 2003 data from existing file.
Read 2004 data from existing file.
Read 2005 data from existing file.
Read 2006 data from existing file.
Read 2007 data from existing file.
Read 2008 data from 