**Process of Data Collection & Conversion to .csv File**

In this process, we outline the code steps taken to collect the data and convert it to .csv files. 

In [1]:
import requests
from bs4 import BeautifulSoup

import pandas as pd
import duckdb
import numpy as np
import time

These files were then read, cleaned, and manipulated in the phase 2 submission file. 

**Player Stats**
In this section, after the imports above, I built the API URL for each season from 2003-04 through 2022-23 and loaded that season's top player stats into a dataframe, showing which players led the league during the season and what their stats were. I then combined all of these per-season dataframes into one main dataframe using a for loop. Within each season, the data is originally ranked by each player's average points per game.

In [2]:
# Seasons to collect, written the way the `Season` query parameter of the
# league-leaders endpoint expects them (e.g. "2003-04").
season_years = [
    "2003-04", "2004-05", "2005-06", "2006-07", "2007-08", "2008-09",
    "2009-10", "2010-11", "2011-12", "2012-13", "2013-14", "2014-15",
    "2015-16", "2016-17", "2017-18", "2018-19", "2019-20", "2020-21",
    "2021-22", "2022-23",
]

# Initialize an empty list; one dataframe per season is appended to it.
dataframes = []

# Loop through the season years and download the league-leaders table for each.
for season_year in season_years:

    # Only the Season parameter changes between iterations.
    url = f"https://stats.nba.com/stats/leagueLeaders?LeagueID=00&PerMode=PerGame&Scope=S&Season={season_year}&SeasonType=Regular%20Season&StatCategory=PTS"

    # A timeout keeps one unresponsive request from hanging the whole loop,
    # and raise_for_status() reports an HTTP error right away instead of
    # letting it surface later as a confusing JSON/KeyError failure.
    http_response = requests.get(url, timeout=30)
    http_response.raise_for_status()
    response = http_response.json()

    # The payload carries the column names and the rows separately.
    table_headers = response['resultSet']['headers']
    season_data = pd.DataFrame(response['resultSet']['rowSet'], columns=table_headers)

    # Tag every row with its season so rows stay identifiable after the concat.
    # (Added as the last column.)
    season_data['Year'] = season_year

    # Append the dataframe to the list
    dataframes.append(season_data)

# Concatenate all dataframes into one giant dataframe
player_stat_df = pd.concat(dataframes, ignore_index=True)
player_stat_df
# Uncomment to write the combined table to disk:
#player_stat_df.to_csv('player_stats.csv')


Unnamed: 0,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,FGA,FG_PCT,...,OREB,DREB,REB,AST,STL,BLK,TOV,PTS,EFF,Year
0,1503,1,Tracy McGrady,1610612753,ORL,67,39.9,9.7,23.4,0.417,...,1.4,4.6,6.0,5.5,1.4,0.6,2.7,28.0,23.7,2003-04
1,978,2,Peja Stojakovic,1610612758,SAC,81,40.3,8.2,17.1,0.480,...,1.1,5.1,6.3,2.1,1.3,0.2,1.9,24.2,23.0,2003-04
2,708,3,Kevin Garnett,1610612750,MIN,82,39.4,9.8,19.6,0.499,...,3.0,10.9,13.9,5.0,1.5,2.2,2.6,24.2,33.1,2003-04
3,977,4,Kobe Bryant,1610612747,LAL,65,37.7,7.9,18.1,0.438,...,1.6,3.9,5.5,5.1,1.7,0.4,2.6,24.0,22.7,2003-04
4,1718,5,Paul Pierce,1610612738,BOS,80,38.8,7.5,18.7,0.402,...,0.9,5.7,6.5,5.1,1.6,0.7,3.8,23.0,20.5,2003-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4381,200782,241,P.J. Tucker,1610612755,PHI,75,25.6,1.3,3.0,0.427,...,1.3,2.7,3.9,0.8,0.5,0.2,0.6,3.5,6.6,2022-23
4382,1630540,242,Miles McBride,1610612752,NYK,64,11.9,1.2,3.4,0.358,...,0.2,0.6,0.8,1.1,0.6,0.1,0.4,3.5,3.3,2022-23
4383,1630264,243,Anthony Gill,1610612764,WAS,59,10.6,1.2,2.2,0.538,...,0.6,1.1,1.7,0.6,0.1,0.2,0.3,3.3,4.3,2022-23
4384,1631132,244,Christian Koloko,1610612761,TOR,58,13.8,1.2,2.6,0.480,...,1.4,1.5,2.9,0.5,0.4,1.0,0.3,3.1,5.9,2022-23


**Team Stat Data Collection**
This data was collected by importing .csv files from the site, which we then merged in the main file. Once the data had been collected and converted to a .csv file for each year, we looped through all of the years and collected the data into one main dataframe. We renamed columns to match the desired headers and made sure there were no string oddities, such as the '*' that indicated a playoff team on the original site. We removed this marker so that the team names are consistent with the other dataframe, which makes it easy to merge and compare the data properly.

In [3]:
# Starting calendar year of each season file: 2003 (2003-04) .. 2022 (2022-23).
years = range(2003, 2023)

# One dataframe per yearly CSV; concatenated after the loop.
team_stats_dfs = []

for year in years:
    file_path = f'nba_team_stats/{year}.csv'

    team_stats_df = pd.read_csv(file_path)

    # Strip the '*' playoff marker from team names. regex=False is explicit
    # because '*' is a regex metacharacter and the default value of `regex`
    # differs between pandas versions.
    team_stats_df['Team'] = team_stats_df['Team'].str.replace('*', '', regex=False)

    # Temporary column; it is dropped again before the CSV is written.
    team_stats_df['Year'] = year

    # Build the season label, e.g. 2003 -> "2003-04".
    year_end = str(year + 1)
    team_stats_df['Season'] = f'{year}-{year_end[2:4]}'

    team_stats_dfs.append(team_stats_df)

all_team_stats_df = pd.concat(team_stats_dfs, ignore_index=True)

# Drop the 'League Average' rows. .copy() makes the result an independent
# frame so the .loc assignments below do not trigger SettingWithCopyWarning.
all_team_stats_df = all_team_stats_df[
    all_team_stats_df['Team'] != 'League Average'
].copy()

#Accounting for any name discrepancies
all_team_stats_df.loc[all_team_stats_df['Team'] == 'Los Angeles Clippers',
                      "Team"] = 'LA Clippers'
all_team_stats_df.loc[all_team_stats_df['Team'] == 'Charlotte Hornets',
                      "Team"] = 'Charlotte Bobcats'

all_team_stats_df = all_team_stats_df.drop("Year", axis=1)

all_team_stats_df.head()

all_team_stats_df.to_csv('all_team_stats.csv')

**Historical MVP Data Collection**
This is the historical list of MVP winners for the seasons being analyzed. The code goes through the list of target seasons and splits the text on the page to create a dataframe. This had to be done from the page's text representation because the other sources could not be scraped.

In [4]:
url = "https://www.nba.com/news/history-mvp-award-winners"

# A timeout keeps the cell from hanging on an unresponsive server.
response = requests.get(url, timeout=30)

# Stop here on a failed request: parsing an error page would only produce
# an empty or incorrect MVP table further down.
if response.status_code != 200:
    raise RuntimeError(
        f"Something went wrong: {response.status_code} {response.reason}"
    )

page = BeautifulSoup(response.content, 'html.parser')


target_seasons = [
    "2003-04", "2004-05", "2005-06", "2006-07", "2007-08", "2008-09",
    "2009-10", "2010-11", "2011-12", "2012-13", "2013-14", "2014-15",
    "2015-16", "2016-17", "2017-18", "2018-19", "2019-20", "2020-21",
    "2021-22", "2022-23",
]

# Maps season label -> MVP name.
data = {}

# Extract the paragraph texts once instead of re-scanning the page for
# every season.
paragraph_texts = [p_tag.text for p_tag in page.find_all('p')]

# Loop through the target seasons
for target_season in target_seasons:
    for paragraph_text in paragraph_texts:
        if target_season in paragraph_text:
            # Keep the text after the first ' — ' separator, up to the first
            # comma, as the winner's name.
            split_data = paragraph_text.split(' — ')
            if len(split_data) > 1:
                winner = split_data[1].split(',')[0]
                # If several paragraphs mention the season, the last one wins.
                data[target_season] = winner

# Create a DataFrame from the data dictionary
mvp_df = pd.DataFrame(list(data.items()), columns=["Year", "MVP_Name"])
mvp_df
# Uncomment to write the table to disk:
#mvp_df.to_csv('mvp_historical.csv')

Unnamed: 0,Year,MVP_Name
0,2003-04,Kevin Garnett
1,2004-05,Steve Nash
2,2005-06,Steve Nash
3,2006-07,Dirk Nowitzki
4,2007-08,Kobe Bryant
5,2008-09,LeBron James
6,2009-10,LeBron James
7,2010-11,Derrick Rose
8,2011-12,LeBron James
9,2012-13,LeBron James
