# Modules To Import

In [8]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import json

# Crawler Class
> #### Note On Season Identifiers
> Season years are represented in the form "Season year-year+1" (e.g. Season 2021-2022). To reduce data complexity and improve efficiency, I chose "year+1" from each season in the form "year-year+1" as the key for that specific season, because the website [basketball-reference.com](https://www.basketball-reference.com) does the same.
> 
>  **TL;DR**: The key "2024" points to the 2023-24 season.  

In [60]:
class Crawler:
    """Scrape NBA MVP winners and per-player details from basketball-reference.com.

    All HTTP traffic goes through ``_fetch``, which applies a timeout, checks
    the HTTP status, waits between requests and retries transient failures.
    The class-level constants below can be overridden on an instance or a
    subclass to tune that behaviour.
    """

    BASE_URL = "https://www.basketball-reference.com"
    REQUEST_TIMEOUT = 30    # seconds before a single request is abandoned
    REQUEST_DELAY = 3.0     # seconds to wait before every request (politeness throttle)
    MAX_ATTEMPTS = 3        # total tries per URL before the error is re-raised
    RETRY_BACKOFF = 10.0    # extra seconds slept after the n-th failure, times n

    def _fetch(self, url: str) -> bytes:
        """Download ``url`` and return the raw response body.

        Raises:
            requests.exceptions.RequestException: if every attempt fails
                (connection error, timeout, or non-2xx status).
        """
        import time  # function-scope import: only this helper needs it

        attempts = max(1, self.MAX_ATTEMPTS)
        for attempt in range(1, attempts + 1):
            # Unthrottled crawling tends to get the connection reset by the
            # server, so pause before every request, not only before retries.
            time.sleep(self.REQUEST_DELAY)
            try:
                response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
            except requests.exceptions.RequestException:
                if attempt == attempts:
                    raise
                time.sleep(self.RETRY_BACKOFF * attempt)
            else:
                return response.content

    @staticmethod
    def _parse_positions(position_string: str) -> list:
        """Turn the raw text after "Position:" into a list of position names.

        Punctuation and decorative symbols are dropped, then the text is split
        on commas and on the standalone word "and".
        """
        cleaned = re.sub(r"[^\w\s,]", "", position_string)
        # \b keeps the split from firing inside a word that merely contains "and".
        parts = re.split(r",|\band\b", cleaned)
        return [part.strip() for part in parts if part.strip()]

    @staticmethod
    def _parse_height_weight(text: str) -> tuple:
        """Extract ``(height_cm, weight_kg)`` as digit strings from ``text``.

        ``text`` is expected to look like "(211cm, 128kg)"; the separator
        between the two values may be any whitespace, including a
        non-breaking space.

        Raises:
            ValueError: if either measurement cannot be found.
        """
        height = re.search(r"(\d+)\s*cm", text)
        weight = re.search(r"(\d+)\s*kg", text)
        if height is None or weight is None:
            raise ValueError(f"could not parse height/weight from {text!r}")
        return height.group(1), weight.group(1)

    @staticmethod
    def _parse_birthplace(place):
        """Return the last comma-separated component of ``place``.

        Returns ``None`` when ``place`` is missing or empty. Splitting on
        commas rather than whitespace keeps multi-word names intact
        ("United States" instead of "States").
        NOTE(review): assumes the final component is the country - confirm
        against the site's JSON-LD.
        """
        if not place:
            return None
        return place.split(",")[-1].strip()

    def get_mvps(self):
        """Return every NBA MVP as a DataFrame.

        Returns:
            pd.DataFrame with string columns ``player_name``, ``player_id``
            (the path fragment between "/players/" and ".html"), ``team`` and
            ``year``, one row per table row that carries a player link.
        """
        page = self._fetch(self.BASE_URL + "/awards/mvp.html")
        soup = BeautifulSoup(page, "html.parser")
        table_rows = soup.find("table", id="mvp_NBA").tbody.find_all("tr")

        records = []
        for tr in table_rows:
            player_cell = tr.find(attrs={"data-stat": "player"})
            if player_cell is None or player_cell.a is None:
                # Header or spacer rows carry no player link; nothing to record.
                continue
            player_link = player_cell.a
            team_href = tr.find(attrs={"data-stat": "team_id"}).a.get("href")
            # Drop the leading "/" and trailing ".html", then split the path;
            # element 1 is used as the team and the last element as the year.
            team_parts = team_href[1:-5].split("/")
            records.append([
                player_link.string,
                player_link.get("href")[9:-5],  # strip "/players/" and ".html"
                team_parts[1],
                team_parts[-1],
            ])

        # Build the frame once instead of growing it row by row.
        return pd.DataFrame(
            records, columns=["player_name", "player_id", "team", "year"]
        )

    def get_player_data(self, player_series: pd.Series):
        """Return a copy of ``player_series`` enriched with scraped player details.

        Takes a Series (rather than a bare id) so it can be passed directly to
        ``DataFrame.apply(..., axis="columns")``.

        Args:
            player_series: must contain ``player_id`` as produced by ``get_mvps``.

        Returns:
            A new Series with added keys ``height_cm`` and ``weight_kg`` (digit
            strings), ``position`` (list of str), ``shooting_hand``,
            ``experience`` (int), ``birthplace`` and ``birthdate``.
        """
        player_series = player_series.copy()
        page = self._fetch(
            self.BASE_URL + "/players/" + player_series["player_id"] + ".html"
        )
        player_soup = BeautifulSoup(page, "html.parser")

        # Structured metadata embedded in the page as JSON-LD.
        player_json = json.loads(
            player_soup.find("script", type="application/ld+json").string
        )

        # The metric measurements are the text node right after the "...lb" span.
        metric_text = player_soup.find("span", string=re.compile(".*lb")).next_sibling
        height_cm, weight_kg = self._parse_height_weight(metric_text)

        position_string = player_soup.find(
            "strong", string=re.compile(".*Position:.*")
        ).next_sibling
        shoots_label = player_soup.find("strong", string=re.compile(".*Shoots:.*"))
        experience_label = player_soup.find(
            "strong", string=re.compile(".*Experience:.*|.*Career Length:.*")
        )

        player_series["height_cm"] = height_cm
        player_series["weight_kg"] = weight_kg
        player_series["position"] = self._parse_positions(position_string)
        player_series["shooting_hand"] = shoots_label.next_sibling.strip()
        # The first whitespace-separated token of the text is the number.
        player_series["experience"] = int(
            experience_label.next_sibling.strip().split()[0]
        )
        player_series["birthplace"] = self._parse_birthplace(
            player_json.get("birthPlace")
        )
        player_series["birthdate"] = player_json.get("birthDate")

        return player_series
        
        

In [61]:
# Build the MVP table first, then enrich it one row at a time.
c = Crawler()
mvp = c.get_mvps()
# axis=1 hands each row to get_player_data as a Series.
mvp2 = mvp.apply(c.get_player_data, axis=1)

ConnectionError: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))