## Research Question
Can we calculate the probability that a player is selected MVP for a season based on their stats for the season? 

## Data Cleaning & Collection
The content below describes the process for collecting the data from the NBA site.



In [130]:
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import time

**Player Stat Data Collection** 

In this section after importing above, I collected the url and created a dataframe that collected all of the data from all of the seasons from the 2003-2004 season to the 2022-2023 season for the top player stats, ranking which players led during the season and what their stats were. I combined all of this data into one main data frame using a for loop.

**QUESTIONS**
Should we clean the data (i.e. get rid of the data we don't want to use?)

In [131]:
#Create a list of all of the seasons to collect data from
season_years = ["2003-04", "2004-05", "2005-06", "2006-07", "2007-08", "2008-09", "2009-10", "2010-11", "2012-13", "2013-14", "2014-15", "2015-16", "2016-17", "2017-18", "2018-19", "2019-20", "2020-21", "2021-22","2022-23"]

#Intialize empty list for dataframes to be added to
dataframes = []

#Loop through the season years and 
for season_year in season_years:

    #Add {season_year} into the api url to change each iteration
    url = f"https://stats.nba.com/stats/leagueLeaders?LeagueID=00&PerMode=PerGame&Scope=S&Season={season_year}&SeasonType=Regular%20Season&StatCategory=PTS"
    response = requests.get(url).json()

    table_headers = response['resultSet']['headers']
    season_data = pd.DataFrame(response['resultSet']['rowSet'], columns=table_headers)

    # Do we want this to be first or last?
    season_data['Year'] = season_year

    # Append the dataframe to the list
    dataframes.append(season_data)

# Concatenate all dataframes into one giant dataframe
player_stat_df = pd.concat(dataframes, ignore_index=True)

#Check to see that latest season is stored (2022-23)
player_stat_df.head()



Unnamed: 0,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,FGA,FG_PCT,...,OREB,DREB,REB,AST,STL,BLK,TOV,PTS,EFF,Year
0,1503,1,Tracy McGrady,1610612753,ORL,67,39.9,9.7,23.4,0.417,...,1.4,4.6,6.0,5.5,1.4,0.6,2.7,28.0,23.7,2003-04
1,978,2,Peja Stojakovic,1610612758,SAC,81,40.3,8.2,17.1,0.48,...,1.1,5.1,6.3,2.1,1.3,0.2,1.9,24.2,23.0,2003-04
2,708,3,Kevin Garnett,1610612750,MIN,82,39.4,9.8,19.6,0.499,...,3.0,10.9,13.9,5.0,1.5,2.2,2.6,24.2,33.1,2003-04
3,977,4,Kobe Bryant,1610612747,LAL,65,37.7,7.9,18.1,0.438,...,1.6,3.9,5.5,5.1,1.7,0.4,2.6,24.0,22.7,2003-04
4,1718,5,Paul Pierce,1610612738,BOS,80,38.8,7.5,18.7,0.402,...,0.9,5.7,6.5,5.1,1.6,0.7,3.8,23.0,20.5,2003-04


In [132]:
team_name_mapping = {
    'ATL': 'Atlanta Hawks',
    'BOS': 'Boston Celtics',
    'BKN': 'Brooklyn Nets',
    'CHA': 'Charlotte Hornets',
    'CHI': 'Chicago Bulls',
    'CLE': 'Cleveland Cavaliers',
    'DAL': 'Dallas Mavericks',
    'DEN': 'Denver Nuggets',
    'DET': 'Detroit Pistons',
    'GSW': 'Golden State Warriors',
    'HOU': 'Houston Rockets',
    'IND': 'Indiana Pacers',
    'LAC': 'LA Clippers',
    'LAL': 'Los Angeles Lakers',
    'MEM': 'Memphis Grizzlies',
    'MIA': 'Miami Heat',
    'MIL': 'Milwaukee Bucks',
    'MIN': 'Minnesota Timberwolves',
    'NOP': 'New Orleans Pelicans',
    'NYK': 'New York Knicks',
    'OKC': 'Oklahoma City Thunder',
    'ORL': 'Orlando Magic',
    'PHI': 'Philadelphia 76ers',
    'PHX': 'Phoenix Suns',
    'POR': 'Portland Trail Blazers',
    'SAC': 'Sacramento Kings',
    'SAS': 'San Antonio Spurs',
    'TOR': 'Toronto Raptors',
    'UTA': 'Utah Jazz',
    'WAS': 'Washington Wizards',
}

player_stat_df['Team Name'] = player_stat_df['TEAM'].map(team_name_mapping)



**Team Stat Data Collection**

To collect the team stats, I had to use a different site than the original ones found since they were locked or illegal to scrape. I collected the team name and average points per game for each team for the seasons from 2003-04 up until 2022-23. It is possible to collect more data about the team performance by combining this data with other pages on the same source site. I used a loop to collect the data and modified the Year column to match the other table (for easy joining in the future?). 


In [133]:
data=[]

#Collect this number of years
years = ['2004-06-16', '2005-06-16', '2006-06-16', '2007-06-16', '2008-06-16', '2009-06-16', '2010-06-16', '2011-06-16', '2012-06-16', '2013-06-16', '2014-06-16', '2015-06-16', '2016-06-16', '2017-06-16', '2018-06-16', '2019-06-16', '2020-06-16', '2021-06-16', '2022-06-16', '2023-06-16'] 

#Loop through the different years and collect the data about team name and average points per game
for year in years:
    url = f"https://www.teamrankings.com/nba/stat/points-per-game?date={year}"

    response = requests.get(url)

    if response.status_code != 200:
        print("Something went wrong:", response.status_code, response.reason)
        continue

    page = BeautifulSoup(response.content, 'html.parser')

    table_body = page.find('tbody')

    rows = table_body.find_all('tr')

    year_num = int(year[:4])
    season_str = str((year_num-1))+"-"+str(year_num)[2:4]

    for row in rows:
        team_name = row.find('td', {"class": "text-left nowrap"}).text
        PTG = row.find('td', {"class" :'text-right'}).text
        data.append([season_str, team_name, PTG])

team_stats_df = pd.DataFrame(data, columns=['Year', 'Team Name', 'PTG'])

In [134]:

#Mapping in order to standardize the team names across data frames
team_name_mapping = {
    'Atlanta': 'Atlanta Hawks',
    'Boston': 'Boston Celtics',
    'Brooklyn': 'Brooklyn Nets',
    'Charlotte': 'Charlotte Hornets',
    'Chicago': 'Chicago Bulls',
    'Cleveland': 'Cleveland Cavaliers',
    'Dallas': 'Dallas Mavericks',
    'Denver': 'Denver Nuggets',
    'Detroit': 'Detroit Pistons',
    'Golden State': 'Golden State Warriors',
    'Houston': 'Houston Rockets',
    'Indiana': 'Indiana Pacers',
    'LA Clippers': 'LA Clippers',
    'LA Lakers': 'LA Lakers',
    'Memphis': 'Memphis Grizzlies',
    'Miami': 'Miami Heat',
    'Milwaukee': 'Milwaukee Bucks',
    'Minnesota': 'Minnesota Timberwolves',
    'New Orleans': 'New Orleans Pelicans',
    'New York': 'New York Knicks',
    'Okla City': 'Oklahoma City Thunder',
    'Orlando': 'Orlando Magic',
    'Philadelphia': 'Philadelphia 76ers',
    'Phoenix': 'Phoenix Suns',
    'Portland': 'Portland Trail Blazers',
    'Sacramento': 'Sacramento Kings',
    'San Antonio': 'San Antonio Spurs',
    'Toronto': 'Toronto Raptors',
    'Utah': 'Utah Jazz',
    'Washington': 'Washington Wizards',
}

team_stats_df['Team Name'] = team_stats_df['Team Name'].map(team_name_mapping)

#Add rank starting at index 1 
team_stats_df.insert(0, 'RANK', range(1, len(team_stats_df) + 1))

team_stats_df.head()


Unnamed: 0,RANK,Year,Team Name,PTG
0,1,2003-04,Dallas Mavericks,104.9
1,2,2003-04,Sacramento Kings,101.9
2,3,2003-04,Milwaukee Bucks,97.3
3,4,2003-04,Oklahoma City Thunder,97.1
4,5,2003-04,Denver Nuggets,96.8


In [135]:
team_stats_df.tail()

Unnamed: 0,RANK,Year,Team Name,PTG
594,595,2022-23,Cleveland Cavaliers,111.2
595,596,2022-23,Charlotte Hornets,111.0
596,597,2022-23,Houston Rockets,110.7
597,598,2022-23,Detroit Pistons,110.3
598,599,2022-23,Miami Heat,109.1


**Collecting Data About Players Rank Within Their Team**
TO DO: Describe process here

In [151]:

#  Merge dataframes on 'Year' and 'TEAM' columns
merged_df = player_stat_df.merge(team_stats_df, on=['Year', 'Team Name'], how='left')

# Create a new column to calculate player's rank within their team for each season
#TO DO

# Sort the merged dataframe by 'Year', 'Team Name'
merged_df = merged_df.sort_values(by=['Year', 'Team Name'])

merged_df.head()


Unnamed: 0,PLAYER_ID,RANK_x,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,FGA,FG_PCT,...,AST,STL,BLK,TOV,PTS,EFF,Year,Team Name,RANK_y,PTG
24,1536,25,Stephen Jackson,1610612737,ATL,80,36.8,6.7,15.8,0.425,...,3.1,1.8,0.3,2.8,18.1,15.2,2003-04,Atlanta Hawks,15.0,92.8
34,1891,35,Jason Terry,1610612737,ATL,81,37.2,6.2,14.8,0.417,...,5.4,1.5,0.2,2.8,16.8,16.1,2003-04,Atlanta Hawks,15.0,92.8
125,682,126,Bob Sura,1610612737,ATL,80,20.8,2.5,6.1,0.416,...,2.9,0.8,0.2,1.3,7.5,9.9,2003-04,Atlanta Hawks,15.0,92.8
168,2564,169,Boris Diaw,1610612737,ATL,76,25.3,1.8,4.1,0.447,...,2.4,0.8,0.5,1.7,4.5,8.2,2003-04,Atlanta Hawks,15.0,92.8
174,1521,175,Jacque Vaughn,1610612737,ATL,71,18.0,1.5,3.9,0.386,...,2.7,0.6,0.0,1.2,3.8,5.0,2003-04,Atlanta Hawks,15.0,92.8
