In [None]:
%config IPCompleter.greedy=True

# Import the dependencies.
import json
import re
import time

import numpy as np
import pandas as pd
import requests

# https://beautiful-soup-4.readthedocs.io/en/latest/
from bs4 import BeautifulSoup as bs

In [None]:
# Season to scrape; this value is substituted into the Basketball Reference
# league URL below.
season_id = 2020
url = f"https://www.basketball-reference.com/leagues/NBA_{season_id}_per_game.html"

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors (4xx/5xx) right here instead of
# letting an error page be parsed as if it were the stats table.
response = requests.get(url=url, timeout=30)
response.raise_for_status()
soup = bs(response.content, "html.parser")

In [None]:
# Peek at the first two table rows to see how the header row is laid out.
# find_all is the current Beautiful Soup name for the legacy findAll alias.
soup.find_all("tr", limit=2)

In [None]:
# The first <tr> of the page holds the header cells; collect their text.
# Only that one row is needed, so only one row is requested.
header_row = soup.find_all("tr", limit=1)[0]
column_headers = [th.getText() for th in header_row.find_all("th")]
print(column_headers)

# Drop the first header. Data values are read from <td> cells only, so
# presumably the first cell of each data row is a <th> with no scraped
# counterpart - TODO confirm against the page markup.
column_headers = column_headers[1:]

In [None]:
# Every <tr> after the first (header) row.
data_rows = soup.find_all("tr")[1:]

# Text of each <td> cell, one inner list per row.
player_data = [[td.getText() for td in row.find_all("td")] for row in data_rows]
print(len(player_data))

# Rows without any <td> cells produce empty lists; drop those.
player_data_filtered = [cells for cells in player_data if cells]
print(len(player_data_filtered))

# Values of the data-append-csv attribute, one (possibly empty) list per row.
player_ids = [
    [td.get("data-append-csv") for td in row.find_all("td", attrs={"data-append-csv": True})]
    for row in data_rows
]
print(len(player_ids))

# Flatten into a single list of ids, preserving table order.
flat_list = [player_id for row_ids in player_ids for player_id in row_ids]
print(len(flat_list))

In [None]:
# Assemble the scraped per-game rows into a table labelled with the header names.
general_2019_20_df = pd.DataFrame(data=player_data_filtered, columns=column_headers)
general_2019_20_df.head(5)

In [None]:
# Keep the rows whose Player cell is missing in na_reference_df so they can
# still be inspected after they are removed from the main frame.
player_is_missing = general_2019_20_df["Player"].isnull()
na_reference_df = general_2019_20_df.loc[player_is_missing]

# Drop exactly the rows selected above by negating the same null mask.
# Matching against [np.nan] with isin() is not guaranteed to treat None
# placeholders as missing, so it could disagree with the isnull() selection.
# copy() makes the result an independent frame before the column inserts
# performed in the following cells.
general_2019_20_df = general_2019_20_df.loc[~player_is_missing].copy()
general_2019_20_df

In [None]:
# Label every row with the season in the NBA Stats "YYYY-YY" format. The
# label is derived from season_id (the value used to build the scrape URL)
# so it cannot drift from the season that was requested: 2020 -> "2019-20".
season_label = f"{season_id - 1}-{season_id % 100:02d}"

# Guarded so that re-running this cell does not fail on an existing column.
if "season_id" not in general_2019_20_df.columns:
    general_2019_20_df.insert(0, "season_id", season_label)
general_2019_20_df.head()

In [None]:
# Attach the scraped player ids as the second column. flat_list is matched to
# the rows purely by position, so it must come from the same scrape as this
# frame; insert() raises if the lengths differ.
# Guarded so that re-running this cell does not fail on an existing column.
if "bbref_player_id" not in general_2019_20_df.columns:
    general_2019_20_df.insert(1, "bbref_player_id", flat_list)
general_2019_20_df.head()

In [None]:
# Old header -> new name for the per-game columns that have a counterpart
# (presumably in the NBA Stats naming - confirm against that dataset).
rename_these_columns = {
    # identity / playing time
    "Tm": "team_abbreviation", "Age": "age", "G": "gp", "MP": "min",
    # shooting
    "FG": "fgm", "FGA": "fga", "FG%": "fg_pct",
    "3P": "fg3m", "3PA": "fg3a", "3P%": "fg3_pct",
    "FT": "ftm", "FTA": "fta", "FT%": "ft_pct",
    # rebounding
    "ORB": "oreb", "DRB": "dreb", "TRB": "reb",
    # everything else
    "AST": "ast", "TOV": "tov", "STL": "stl", "BLK": "blk",
    "PF": "pf", "PTS": "pts",
}

# First apply the explicit mapping (raising if any listed header is absent),
# then lower-case every column name, renamed or not.
general_2019_20_df = (
    general_2019_20_df
    .rename(columns=rename_these_columns, errors="raise")
    .rename(columns=str.lower)
)
general_2019_20_df.head()

In [None]:
# Persist the renamed per-game table, without the row index, as an interim CSV.
general_csv_path = "../../data/interim/bbref-general-traditional-2019-20.csv"
general_2019_20_df.to_csv(general_csv_path, index=False)

In [None]:
# Advanced-stats page for the same season_id as the per-game scrape.
url_adv = f"https://www.basketball-reference.com/leagues/NBA_{season_id}_advanced.html"

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors (4xx/5xx) right here instead of
# letting an error page be parsed as if it were the stats table.
response = requests.get(url=url_adv, timeout=30)
response.raise_for_status()
soup = bs(response.content, "html.parser")

In [None]:
# Peek at the first two rows of the advanced page to check the header layout.
# find_all is the current Beautiful Soup name for the legacy findAll alias.
soup.find_all("tr", limit=2)

In [None]:
# The first <tr> of the page holds the header cells; collect their text.
# Only that one row is needed, so only one row is requested.
header_row = soup.find_all("tr", limit=1)[0]
column_headers = [th.getText() for th in header_row.find_all("th")]

# Drop the first header. Data values are read from <td> cells only, so
# presumably the first cell of each data row is a <th> with no scraped
# counterpart - TODO confirm against the page markup.
column_headers = column_headers[1:]

In [None]:
# Every <tr> after the first (header) row.
data_rows = soup.find_all("tr")[1:]

# Text of each <td> cell, one inner list per row.
player_data = [[td.getText() for td in row.find_all("td")] for row in data_rows]
print(len(player_data))

# Rows without any <td> cells produce empty lists; drop those.
player_data_filtered = [cells for cells in player_data if cells]
print(len(player_data_filtered))

# Values of the data-append-csv attribute, one (possibly empty) list per row.
player_ids = [
    [td.get("data-append-csv") for td in row.find_all("td", attrs={"data-append-csv": True})]
    for row in data_rows
]
print(len(player_ids))

# Flatten into a single list of ids, preserving table order.
flat_list = [player_id for row_ids in player_ids for player_id in row_ids]
print(len(flat_list))

In [None]:
# Assemble the scraped advanced-stat rows into a table labelled with the header names.
advanced_2019_20_df = pd.DataFrame(data=player_data_filtered, columns=column_headers)
advanced_2019_20_df.head(5)

In [None]:
# Keep the rows whose Player cell is missing in advanced_na_reference_df so
# they can still be inspected after they are removed from the main frame.
advanced_player_is_missing = advanced_2019_20_df["Player"].isnull()
advanced_na_reference_df = advanced_2019_20_df.loc[advanced_player_is_missing]

# Drop exactly the rows selected above by negating the same null mask.
# Matching against [np.nan] with isin() is not guaranteed to treat None
# placeholders as missing, so it could disagree with the isnull() selection.
# copy() makes the result an independent frame before the column inserts
# performed in the following cells.
advanced_2019_20_df = advanced_2019_20_df.loc[~advanced_player_is_missing].copy()
advanced_2019_20_df

In [None]:
# Label every row with the season in the NBA Stats "YYYY-YY" format. The
# label is derived from season_id (the value used to build the scrape URL)
# so it cannot drift from the season that was requested: 2020 -> "2019-20".
season_label = f"{season_id - 1}-{season_id % 100:02d}"

# Guarded so that re-running this cell does not fail on an existing column.
if "season_id" not in advanced_2019_20_df.columns:
    advanced_2019_20_df.insert(0, "season_id", season_label)
advanced_2019_20_df.head()

In [None]:
# Attach the scraped player ids as the second column. flat_list is matched to
# the rows purely by position, so it must come from the same scrape as this
# frame; insert() raises if the lengths differ.
# Guarded so that re-running this cell does not fail on an existing column.
if "bbref_player_id" not in advanced_2019_20_df.columns:
    advanced_2019_20_df.insert(1, "bbref_player_id", flat_list)
advanced_2019_20_df.head()

In [None]:
# List the current column names to decide which ones need renaming.
print(list(advanced_2019_20_df.columns))

In [None]:
# Old header -> new name for the advanced-table columns that are renamed.
rename_these_columns = {"Tm": "team_abbreviation", "G": "gp", "MP": "min"}

# First apply the explicit mapping (raising if any listed header is absent),
# then lower-case every column name, renamed or not.
advanced_2019_20_df = (
    advanced_2019_20_df
    .rename(columns=rename_these_columns, errors="raise")
    .rename(columns=str.lower)
)
advanced_2019_20_df.head()

In [None]:
# Persist the renamed advanced table, without the row index, as an interim CSV.
advanced_csv_path = "../../data/interim/bbref-general-advanced-2019-20.csv"
advanced_2019_20_df.to_csv(advanced_csv_path, index=False)

In [None]:
# Prepare the per-game table for merging, in three steps:
#   1. keep only rows that have a player name,
#   2. replace remaining missing values with 0,
#   3. keep the first row (in table order) for each player name.
# NOTE(review): cell values were scraped with getText() and are strings, so
# blank stats are probably '' rather than NaN and untouched by fillna - confirm.
general_2019_20_df = (
    general_2019_20_df
    .loc[general_2019_20_df["player"].notnull()]
    .fillna(0)
    .drop_duplicates(subset=["player"], keep="first")
)

In [None]:
# Prepare the advanced table for merging, in three steps:
#   1. keep only rows that have a player name,
#   2. replace remaining missing values with 0,
#   3. keep the first row (in table order) for each player name.
# NOTE(review): cell values were scraped with getText() and are strings, so
# blank stats are probably '' rather than NaN and untouched by fillna - confirm.
advanced_2019_20_df = (
    advanced_2019_20_df
    .loc[advanced_2019_20_df["player"].notnull()]
    .fillna(0)
    .drop_duplicates(subset=["player"], keep="first")
)

In [None]:
# Inner-join the per-game and advanced tables on the player name. Columns
# present in both frames other than "player" receive pandas' default
# _x / _y suffixes.
complete_2019_20_stats_df = pd.merge(
    left=general_2019_20_df,
    right=advanced_2019_20_df,
    how="inner",
    on="player",
)

In [None]:
# Display the merged per-game + advanced table to inspect the join result.
complete_2019_20_stats_df

In [None]:
# Persist the merged table, without the row index, as an interim CSV.
complete_csv_path = "../../data/interim/bbref-general-complete-2019-20.csv"
complete_2019_20_stats_df.to_csv(complete_csv_path, index=False)