# Modules To Import

In [7]:
import json
import re
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Crawler Class
> #### Note On Season Identifiers
> Season years are represented in the form "Season year-year+1" (e.g. Season 2021-2022). To reduce data complexity and improve efficiency, I chose "year+1" from each season written as "year-year+1" as the key for that specific season, because the website [basketball-reference.com](https://www.basketball-reference.com) does the same.
> 
>  **TL;DR**: The key "2024" points to the 2023-24 season.  

In [18]:
class Crawler:
    """Scraper for basketball-reference.com pages.

    Seasons are keyed by their ending year (the key "2024" is the 2023-24 season).
    """

    # Evaluated once, when the class body runs; used to back-date the experience of active players.
    current_year = datetime.now().year

    def __get_soup(self, url, retries=3, wait=5, timeout=30):
        """Download ``url`` and return it parsed with ``html.parser``.

        Args:
            url: Page to download.
            retries: Maximum number of download attempts.
            wait: Seconds to sleep between failed attempts.
            timeout: Seconds before a single request is abandoned.

        Returns:
            The parsed page, or an empty ``BeautifulSoup`` document when every
            attempt failed (or ``retries`` is not positive).
        """
        for attempt in range(1, retries + 1):
            try:
                page = requests.get(url, timeout=timeout)
                # Treat HTTP error statuses (e.g. 429, 404) as failures instead of parsing the error page
                page.raise_for_status()
            except requests.RequestException:
                if attempt == retries:
                    break
                print(f"Failed to Rerieve from {url}, {attempt} of {retries} retries. Waiting {wait} seconds...")
                time.sleep(wait)
            else:
                return BeautifulSoup(page.content, "html.parser")

        print(f"Failed to Retrieve from {url} After {retries} retries. Passing null")
        return BeautifulSoup("", "html.parser")

    def get_mvps(self):
        """Scrape the NBA MVP award table.

        Returns:
            pd.DataFrame with the string columns ``player_name``, ``player_id``,
            ``team`` and ``year``, one row per table row that links both a player
            and a team. The frame is empty when the table cannot be found.
        """
        columns = ["player_name", "player_id", "team", "year"]
        mvp_soup = self.__get_soup("https://www.basketball-reference.com/awards/mvp.html")
        mvp_table = mvp_soup.find("table", id="mvp_NBA")
        if mvp_table is None or mvp_table.tbody is None:
            print("MVP table not found. Returning an empty DataFrame")
            return pd.DataFrame(columns=columns)

        # Scrape each player row; collecting records first avoids growing the DataFrame row by row
        records = []
        for tr in mvp_table.tbody.find_all("tr"):
            player_cell = tr.find(attrs={"data-stat": "player"})
            team_cell = tr.find(attrs={"data-stat": "team_id"})
            if player_cell is None or team_cell is None:
                continue
            player_link, team_link = player_cell.a, team_cell.a
            if player_link is None or team_link is None:
                continue  # rows without links cannot be parsed

            # The team href is sliced and split on "/": element 1 is the team code, the last element the season year
            team_parts = team_link.get("href")[1:-5].split("/")
            records.append([
                player_link.string,
                player_link.get("href")[9:-5],
                team_parts[1],
                team_parts[-1],
            ])
        return pd.DataFrame(records, columns=columns)

    # TODO: def get_top_season_players
    # TODO: def get_top_season_teams
    # TODO: def get_team_players

    def get_player_data(self, player_series: pd.Series):
        """Return a copy of ``player_series`` enriched with data from the player's page.

        Taking and returning a Series lets the method be applied row-wise to a
        whole DataFrame (``df.apply(crawler.get_player_data, axis="columns")``).

        Args:
            player_series: Row holding at least ``player_id`` and ``year``
                (``year`` may be a string or an int).

        Returns:
            An object-dtype copy of the row with ``height_cm``, ``weight_kg``,
            ``position``, ``shooting_hand``, ``retired``, ``experience_total``,
            ``experience_at_year``, ``birthplace`` and ``birthdate`` added. When
            the page has no ``application/ld+json`` script (e.g. the download
            failed) the copy is returned without the extra fields.

        Note:
            For retired players ``experience_at_year`` is simply the career
            length; only for active players is it back-dated to ``year``.
        """
        # astype returns a new object-dtype Series, so the caller's row is never modified
        player_series = player_series.astype(object)
        player_soup = self.__get_soup(
            f"https://www.basketball-reference.com/players/{player_series['player_id']}.html"
        )

        json_tag = player_soup.find("script", type="application/ld+json")
        if json_tag is None:
            print(f"No player data found for {player_series['player_id']}. Returning the row unchanged")
            return player_series
        player_json = json.loads(json_tag.string)  # some player data is embedded as JSON

        # The text after the "...lb" span is expected to look like "(211cm,\xa0128kg)":
        # drop the parentheses, then split into [height, weight]
        metric_text = player_soup.find("span", string=re.compile(".*lb")).next_sibling.strip()
        player_weightheight = metric_text[1:-1].split(",\xa0")

        # Gets and cleans the player positions
        position_string = player_soup.find("strong", string=re.compile(".*Position:.*")).next_sibling
        positions_cleaned = re.sub(r"[^\w\s,]", "", position_string)
        # Split on commas and on the standalone word "and" (never inside another word)
        position_list = [
            pos.strip()
            for pos in re.split(r",|\band\b", positions_cleaned)
            if pos.strip()
        ]

        # The code treats a "Career Length:" label as the marker of a retired player
        retired = player_soup.find("strong", string=re.compile(".*Career Length:.*")) is not None

        # For active players this count is relative to the current year
        experience_label = player_soup.find(
            "strong", string=re.compile(".*Experience:.*|.*Career Length:.*")
        )
        experience_total = int(experience_label.next_sibling.strip().split()[0])

        if retired:
            experience_at_year = experience_total
        else:
            # ``year`` is scraped as text, so convert it before doing arithmetic
            experience_at_year = experience_total - (self.current_year - int(player_series["year"]))

        player_series["height_cm"] = player_weightheight[0][:-2]  # strip the "cm" suffix
        player_series["weight_kg"] = player_weightheight[1][:-2]  # strip the "kg" suffix
        player_series["position"] = position_list
        player_series["shooting_hand"] = player_soup.find("strong", string=re.compile(".*Shoots:.*")).next_sibling.strip()
        player_series["retired"] = retired
        player_series["experience_total"] = experience_total
        player_series["experience_at_year"] = experience_at_year
        player_series["birthplace"] = player_json.get("birthPlace").split(",")[-1].strip()
        player_series["birthdate"] = player_json.get("birthDate")

        return player_series

crawler = Crawler()

## Crawler Initialization
Creating the `Crawler` object and fetching some basic information, such as the list of MVPs.

In [9]:
# Fetch the MVP table through the crawler and keep it in `mvp` for the cells below.
mvp = crawler.get_mvps()
mvp

Unnamed: 0,player_name,player_id,team,year
0,Nikola Jokić,j/jokicni01,DEN,2024
1,Joel Embiid,e/embiijo01,PHI,2023
2,Nikola Jokić,j/jokicni01,DEN,2022
3,Nikola Jokić,j/jokicni01,DEN,2021
4,Giannis Antetokounmpo,a/antetgi01,MIL,2020
...,...,...,...,...
64,Wilt Chamberlain,c/chambwi01,PHW,1960
65,Bob Pettit,p/pettibo01,STL,1959
66,Bill Russell,r/russebi01,BOS,1958
67,Bob Cousy,c/cousybo01,BOS,1957


## Adding details
Adding player details by applying the `get_player_data` method to each row of the MVP list.

In [87]:
# Enrich every MVP row: the crawler method receives one row (a Series) at a time
# and returns it with the player details attached.
mvp_detailed = mvp.apply(crawler.get_player_data, axis=1)
mvp_detailed

Failed to Rerieve from https://www.basketball-reference.com/players/j/jamesle01.html, 1 of 3 retries. Waiting 5 seconds...
Failed to Rerieve from https://www.basketball-reference.com/players/n/nashst01.html, 1 of 3 retries. Waiting 5 seconds...


Unnamed: 0,player_name,player_id,team,year,height_cm,weight_kg,position,shooting_hand,experience,birthplace,birthdate
0,Nikola Jokić,j/jokicni01,DEN,2024,211,128,[Center],Right,9,Serbia,1995-02-19
1,Joel Embiid,e/embiijo01,PHI,2023,213,127,[Center],Right,8,Cameroon,1994-03-16
2,Nikola Jokić,j/jokicni01,DEN,2022,211,128,[Center],Right,9,Serbia,1995-02-19
3,Nikola Jokić,j/jokicni01,DEN,2021,211,128,[Center],Right,9,Serbia,1995-02-19
4,Giannis Antetokounmpo,a/antetgi01,MIL,2020,211,109,"[Power Forward, Small Forward, Point Guard, Sh...",Right,11,Greece,1994-12-06
...,...,...,...,...,...,...,...,...,...,...,...
64,Wilt Chamberlain,c/chambwi01,PHW,1960,216,124,[Center],Right,14,United States,1936-08-21
65,Bob Pettit,p/pettibo01,STL,1959,206,92,"[Power Forward, Center]",Right,11,United States,1932-12-12
66,Bill Russell,r/russebi01,BOS,1958,208,97,[Center],Left,13,United States,1934-02-12
67,Bob Cousy,c/cousybo01,BOS,1957,185,79,[Point Guard],Right,14,United States,1928-08-09


In [104]:
# Spot-check the enrichment on a single row (the first MVP entry).
crawler.get_player_data(mvp.iloc[0])

player_name      Nikola Jokić
player_id         j/jokicni01
team                      DEN
year                     2024
height_cm                 211
weight_kg                 128
position             [Center]
shooting_hand           Right
experience                  9
retired                 False
birthplace             Serbia
birthdate          1995-02-19
Name: 0, dtype: object

In [88]:
# Persist the enriched MVP table. `to_csv` does not create missing folders,
# so make sure the target directory exists before writing.
mvp_csv_path = Path("data/mvp.csv")
mvp_csv_path.parent.mkdir(parents=True, exist_ok=True)
mvp_detailed.to_csv(mvp_csv_path)

In [130]:
# <td class="left" csk="Dončić Luka-1" data-append-csv="doncilu01" data-stat="name_display">\
# Download the 2024 season totals and standings pages, then parse both.
a, a2 = (
    requests.get("https://www.basketball-reference.com/leagues/NBA_2024_totals.html"),
    requests.get("https://www.basketball-reference.com/leagues/NBA_2024_standings.html"),
)
s, s2 = (BeautifulSoup(response.content, "html.parser") for response in (a, a2))

In [201]:
# Players listed on the season totals page: one [name, href slice] pair per
# "name_display" cell, or None for a cell that has no link.
l = []
for td in s.find_all("td", attrs={"data-stat": "name_display"}):
    link = td.a
    l.append([link.string, link.get("href")[9:-5]] if link else None)

<!DOCTYPE html>
<html class="no-js" data-root="/home/bbr/build" data-version="klecko-" lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="ie=edge" http-equiv="x-ua-compatible"/>
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport">
   <link href="https://cdn.ssref.net/req/202501161" rel="dns-prefetch"/>
   <script>
    /* https://docs.osano.com/hc/en-us/articles/22469433444372-Google-Consent-Mode-v2  */
  window.dataLayer = window.dataLayer ||[];
      function gtag(){dataLayer.push(arguments);}
      gtag('consent','default',{
        'ad_storage':'denied',
        'analytics_storage':'denied',
        'ad_user_data':'denied',
        'ad_personalization':'denied',
        'personalization_storage':'denied',
        'functionality_storage':'granted',
        'security_storage':'granted',
        'wait_for_update': 500
      });
      gtag("set", "ads_data_redaction", true);
   </script>
   <script src="https://cmp.osano.com/16CGnCU8UtNh

In [200]:
# Team/season list: the text of the node two siblings after the "placeholder"
# element is itself parsed as HTML (presumably markup hidden in a comment - TODO confirm),
# and every "team_name" cell becomes a [name, href slice] pair (None when the cell has no link).
embedded_markup = s2.find(class_="placeholder").next_sibling.next_sibling.string
teamlistsoup = BeautifulSoup(embedded_markup, "html.parser").find_all(
    "td", attrs={"data-stat": "team_name"}
)
teamlist = [
    [td.a.string, td.a.get("href")[7:-5]] if td.a else None
    for td in teamlistsoup
]
teamlist

[['Boston Celtics', 'BOS/2024'],
 ['Denver Nuggets', 'DEN/2024'],
 ['Oklahoma City Thunder', 'OKC/2024'],
 ['Minnesota Timberwolves', 'MIN/2024'],
 ['Los Angeles Clippers', 'LAC/2024'],
 ['Dallas Mavericks', 'DAL/2024'],
 ['New York Knicks', 'NYK/2024'],
 ['Milwaukee Bucks', 'MIL/2024'],
 ['New Orleans Pelicans', 'NOP/2024'],
 ['Phoenix Suns', 'PHO/2024'],
 ['Cleveland Cavaliers', 'CLE/2024'],
 ['Indiana Pacers', 'IND/2024'],
 ['Los Angeles Lakers', 'LAL/2024'],
 ['Orlando Magic', 'ORL/2024'],
 ['Philadelphia 76ers', 'PHI/2024'],
 ['Miami Heat', 'MIA/2024'],
 ['Golden State Warriors', 'GSW/2024'],
 ['Sacramento Kings', 'SAC/2024'],
 ['Houston Rockets', 'HOU/2024'],
 ['Chicago Bulls', 'CHI/2024'],
 ['Atlanta Hawks', 'ATL/2024'],
 ['Brooklyn Nets', 'BRK/2024'],
 ['Utah Jazz', 'UTA/2024'],
 ['Memphis Grizzlies', 'MEM/2024'],
 ['Toronto Raptors', 'TOR/2024'],
 ['San Antonio Spurs', 'SAS/2024'],
 ['Charlotte Hornets', 'CHO/2024'],
 ['Portland Trail Blazers', 'POR/2024'],
 ['Washington Wizar

In [11]:
# Roster of one team/season (BOS 2024): collect a [name, href slice] pair
# for every "player" cell of the roster table.
a3 = requests.get("https://www.basketball-reference.com/teams/BOS/2024.html")
s3 = BeautifulSoup(a3.content, "html.parser")
roster_table = s3.find("table", id="roster")
playerlistsoup = roster_table.find_all("td", attrs={"data-stat": "player"})
playerlist = []
for td in playerlistsoup:
    anchor = td.a
    playerlist.append([anchor.string, anchor.get("href")[9:-5]])

In [12]:
# Display the scraped roster entries.
playerlist

[['Dalano Banton', 'b/bantoda01'],
 ['Oshae Brissett', 'b/brissos01'],
 ['Jaylen Brown', 'b/brownja02'],
 ['JD Davison', 'd/davisjd01'],
 ['Sam Hauser', 'h/hausesa01'],
 ['Jrue Holiday', 'h/holidjr01'],
 ['Al Horford', 'h/horfoal01'],
 ['Luke Kornet', 'k/kornelu01'],
 ['Svi Mykhailiuk', 'm/mykhasv01'],
 ['Drew Peterson', 'p/peterdr01'],
 ['Kristaps Porziņģis', 'p/porzikr01'],
 ['Payton Pritchard', 'p/pritcpa01'],
 ['Neemias Queta', 'q/quetane01'],
 ['Jaden Springer', 's/sprinja01'],
 ['Lamar Stevens', 's/stevela01'],
 ['Jayson Tatum', 't/tatumja01'],
 ['Xavier Tillman Sr.', 't/tillmxa01'],
 ['Jordan Walsh', 'w/walshjo01'],
 ['Derrick White', 'w/whitede01']]