# Web Scraping

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [2]:
#Create loop for all pages and convert table to dataframe
df = pd.DataFrame()
url_main = 'http://www.espn.com/nba/salaries/_/'
webpage = requests.get(url_main)
soup = BeautifulSoup(webpage.text,'html.parser')

#Find all YEAR options
options = soup.select('option')

#Iterate through all option tags and get url text
years = [option.get('value') for option in options]

for i in range(0, len(years)):
    years[i] = years[i].rstrip('/4').rstrip('/seasontype')
del years[:2]

current_year = int(years[0][-4:])+1

#Get page numbers
pages = soup.find(class_="page-numbers").get_text()
pages = int(pages[-2:].strip())

for page in range(1,pages):
    webpage = requests.get(url_main + 'page/' + str(page) + '/')
    soup = BeautifulSoup(webpage.text,'html.parser')
    table = soup.find_all('table')
    temp_df = pd.read_html(str(table))
    df = pd.concat([df, temp_df[0]])

In [3]:
df

Unnamed: 0,0,1,2,3
0,RK,NAME,TEAM,SALARY
1,1,"Stephen Curry, PG",Golden State Warriors,"$48,070,014"
2,2,"Russell Westbrook, PG",Los Angeles Lakers,"$47,063,478"
3,3,"LeBron James, SF",Los Angeles Lakers,"$44,474,988"
4,4,"Kevin Durant, PF",Brooklyn Nets,"$44,119,845"
...,...,...,...,...
39,476,"Max Christie, G",Los Angeles Lakers,"$1,017,781"
40,477,"Jaden Hardy, G",Dallas Mavericks,"$1,017,781"
41,478,"Tyrese Martin, F",Atlanta Hawks,"$1,017,781"
42,479,"Luke Kornet, F",Boston Celtics,"$565,986"


In [4]:
#Data cleaning
df = df.dropna()
df = df.drop_duplicates()
df.columns = ['Rank', 'Name', 'Team', str(current_year)]
df = df.drop(df.loc[df['Rank'].str.contains('RK')].index)

In [5]:
#Dataframe sort

#Currency to Float
df[str(current_year)] = df[str(current_year)].str.replace(',', '')
df[str(current_year)] = df[str(current_year)].str.replace('$', '')
df[str(current_year)] = df[str(current_year)].astype(float)

#Sort
df[['Name','Position']] = df['Name'].str.split(",",expand=True)
df['Position'] = df['Position'].str.strip()
df = df.reset_index().drop(columns = ['index', 'Rank'])
df = df[['Name', 'Position', 'Team', str(current_year)]]

  df[str(current_year)] = df[str(current_year)].str.replace('$', '')


In [6]:
#Create loop for all YEARS and merge dataframes
df_main = df

#Loop for all years
for year_url in years:
    url_loop = str(year_url)
    year_df = pd.DataFrame()
    year = year_url[-4:]    
    
    #Get page numbers
    webpage = requests.get('http:' + url_loop + '/')
    soup = BeautifulSoup(webpage.text,'html.parser')
    pages = soup.find(class_="page-numbers").get_text()
    pages = int(pages[-2:].strip())
    
    #Loop for all pages
    for page in range(1,pages):
        
        #Get data from all pages
        webpage = requests.get('http:' + url_loop + '/page/' + str(page) + '/')
        soup = BeautifulSoup(webpage.text,'html.parser')
        table = soup.find_all('table')
        temp_df = pd.read_html(str(table))
        year_df = pd.concat([year_df, temp_df[0]])
        
    #Data cleaning, sorting and formatting
    year_df = year_df.dropna()
    year_df = year_df.drop_duplicates()
    year_df.columns = ['Rank', 'Name', 'Team', str(year)]
    year_df = year_df.drop(year_df.loc[year_df['Rank'].str.contains('RK')].index)
    
    #Currency to Int
    year_df[str(year)] = year_df[str(year)].str.replace(',', '')
    year_df[str(year)] = year_df[str(year)].str.replace('$', '')
    year_df[str(year)] = year_df[str(year)].astype(float)
    
    #Keep only Name and Salary
    year_df[['Name','Position']] = year_df['Name'].str.split(",",expand=True)
    year_df = year_df.reset_index().drop(columns = ['index', 'Rank', 'Position', 'Team'])
    year_df = year_df[['Name', str(year)]]
        
    #Merge into main dataframe
    df_main = df_main.merge(year_df, how='left', on='Name')

  year_df[str(year)] = year_df[str(year)].str.replace('$', '')


In [7]:
#Cleaning main dataset
df_main = df_main.dropna(axis='columns', how='all')
df_main.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 480 entries, 0 to 479
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      480 non-null    object 
 1   Position  480 non-null    object 
 2   Team      480 non-null    object 
 3   2023      480 non-null    float64
 4   2022      423 non-null    float64
 5   2021      364 non-null    float64
 6   2020      303 non-null    float64
 7   2019      248 non-null    float64
 8   2018      190 non-null    float64
 9   2017      172 non-null    float64
 10  2016      144 non-null    float64
 11  2015      115 non-null    float64
 12  2014      90 non-null     float64
 13  2013      76 non-null     float64
 14  2012      56 non-null     float64
 15  2011      40 non-null     float64
 16  2010      35 non-null     float64
 17  2009      24 non-null     float64
 18  2008      12 non-null     float64
 19  2007      10 non-null     float64
 20  2006      3 non-null      float6

# Data Visualization

In [8]:
import dash
from dash import html
from dash import dcc
from dash.dependencies import Input, Output, State
import plotly.graph_objects as go
import plotly.express as px
from dash import no_update

In [9]:
# Create a dash application
app = dash.Dash(__name__)

# Clear the layout and do not display exception till callback gets executed
app.config.suppress_callback_exceptions = True

# Create Year Options
year_list = list(df_main.columns)
del year_list[0:3]

# Application layout
app.layout = html.Div(children=[ 
                                # Add title to the dashboard
                                html.H1('Active NBA Player Salary Summary',
                                        style={ 'textAlign': 'center',
                                                'color': '#503D36',
                                                'font-size': 24
                                                }
                                        ),
    
                                # Dropdown creation
                                # Create an outer division 
                                html.Div([
                                    # Add an division
                                    html.Div([
                                        # Create an division for adding dropdown helper text for report type
                                        html.Div(
                                            [
                                            html.H2('Report Type:', style={'margin-right': '2em'}),
                                            ]
                                        ),
                                        # Add a dropdown
                                          dcc.Dropdown(id='input-type', 
                                                            options=[
                                                                    {'label': 'Overall Report', 'value': 'OPT1'},
                                                                    {'label': 'Report by Positions', 'value': 'OPT2'}
                                                                    ],
                                                            placeholder='Select a report type',
                                                            style={ 'textAlign': 'center',
                                                                    'font-size': '20px',
                                                                    'padding': '3px',
                                                                    'width': '80%'
                                                                    }
                                                        )
                                        
                                    # Place them next to each other using the division style
                                    ], style={'display':'flex'}),
                                    
                                   # Add next division 
                                   html.Div([
                                       # Create an division for adding dropdown helper text for choosing year
                                        html.Div(
                                            [
                                            html.H2('Choose Year:', style={'margin-right': '2em'})
                                            ]
                                        ),
                                        dcc.Dropdown(id='input-year', 
                                                     # Update dropdown values using list comphrehension
                                                     options=[{'label': i, 'value': i} for i in year_list],
                                                     placeholder="Select a year",
                                                     style={'width':'80%',
                                                            'padding':'3px',
                                                            'font-size': '20px',
                                                            'text-align-last' : 'center'}
                                                    ),
                                            # Place them next to each other using the division style
                                            ], style={'display': 'flex'}),  
                                          ]),
                                
                                # Add Computed graphs
                                # Add an empty division and providing an id that will be updated during callback
                                html.Div([ ], id='plot1'),
    
                                html.Div([
                                        html.Div([ ], id='plot2'),
                                        html.Div([ ], id='plot3'),
                                        html.Div([ ], id='plot4'),
                                ], style={'display': 'flex'}),
    
                                ])

# Callback function definition
# Add 3 ouput components

@app.callback( [Output(component_id='plot1', component_property='children'),
                Output(component_id='plot2', component_property='children'),
                Output(component_id='plot3', component_property='children'),
                Output(component_id='plot4', component_property='children')
                ],

               [Input(component_id='input-type', component_property='value'),
                Input(component_id='input-year', component_property='value')],
               # Holding output state till user enters all the form information. In this case, it will be chart type and year
               [State("plot1", 'children'), State("plot2", "children"),
                State("plot3", "children"), State("plot4", "children")
               ])
# Add computation to callback function and return graph
def get_graph(chart, year, children1, children2, c3, c4):
        # Get data of the selected year
        df_data = df_main[['Name', 'Position', str(year)]].dropna()
        df_data = df_data.sort_values(str(year), ascending=False).reset_index().drop(columns = ['index'])
    
        if chart == 'OPT1':
            # Scatter Chart
            sca_fig = px.scatter(df_data, y=str(year),
                             title='Active NBA Player Salary by Rank',
                             labels={'index': 'Rank', str(year):'USD'})
            
            # Box Chart
            box_fig = px.box(df_data, y=str(year),
                             title='Active NBA Player Salary Distribution',
                             labels={'index': 'Rank', str(year):'USD'})
            
            # Percentage of Player Positions
            pie_data = df_data['Position'].value_counts().rename_axis('Position').reset_index(name='Counts')
            pie_fig = px.pie(pie_data, values = 'Counts', names = 'Position',
                             title='Percentage of Active Player Positions by Reporting Year')
            
            # Tree Map
            tree_fig = px.treemap(df_data, path=['Position', 'Name'],
                                    values=str(year),
                                    color=str(year),
                                    color_continuous_scale='RdBu',
                                    title='Top Paid Players by Position'
                                )
            # Return dcc.Graph component to the empty division
            return [dcc.Graph(figure=tree_fig), 
                    dcc.Graph(figure=pie_fig),
                    dcc.Graph(figure=sca_fig),
                    dcc.Graph(figure=box_fig)
                   ]
        else:
            # Create graph
            
            # Bar Chart
            total_data = df_data.groupby(['Position'])[str(year)].sum().rename_axis('Position').reset_index(name='Total Salary')
            Total_fig = px.bar(total_data, x='Total Salary', y='Position',
                               orientation='h',
                               title='Total Active NBA Player Salary by Position')
            
            # Scatter Charts
            G_fig = px.scatter(df_data.loc[df_data['Position'].str.contains('G')], y=str(year),
                             title='Active Guards Salary by Rank',
                             labels={'index': 'Rank', str(year):'USD'})
            
            F_fig = px.scatter(df_data.loc[df_data['Position'].str.contains('F')], y=str(year),
                             title='Active Forwards Salary by Rank',
                             labels={'index': 'Rank', str(year):'USD'})
            
            C_fig = px.scatter(df_data.loc[df_data['Position'].str.contains('C')], y=str(year),
                             title='Active Centers Salary by Rank',
                             labels={'index': 'Rank', str(year):'USD'})
            
            # Return dcc.Graph component to the empty division
            return[dcc.Graph(figure=Total_fig),
                   dcc.Graph(figure=G_fig), 
                   dcc.Graph(figure=F_fig), 
                   dcc.Graph(figure=C_fig)
                  ]


# Run the app
if __name__ == '__main__':
    app.run_server()

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:8050 (Press CTRL+C to quit)
127.0.0.1 - - [02/Aug/2022 19:14:14] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [02/Aug/2022 19:14:14] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [02/Aug/2022 19:14:14] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [02/Aug/2022 19:14:14] "GET /_favicon.ico?v=2.6.0 HTTP/1.1" 200 -
127.0.0.1 - - [02/Aug/2022 19:14:14] "GET /_dash-component-suites/dash/dcc/async-dropdown.js HTTP/1.1" 304 -


Exception on /_dash-update-component [POST]
Traceback (most recent call last):
  File "C:\Users\kinev\AppData\Local\Programs\Python\Python310\lib\site-packages\flask\app.py", line 2077, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\Users\kinev\AppData\Local\Programs\Python\Python310\lib\site-packages\flask\app.py", line 1525, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\Users\kinev\AppData\Local\Programs\Python\Python310\lib\site-packages\flask\app.py", line 1523, in full_dispatch_request
    rv = self.dispatch_request()
  File "C:\Users\kinev\AppData\Local\Programs\Python\Python310\lib\site-packages\flask\app.py", line 1509, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**req.view_args)
  File "C:\Users\kinev\AppData\Local\Programs\Python\Python310\lib\site-packages\dash\dash.py", line 1260, in dispatch
    ctx.run(
  File "C:\Users\kinev\AppData\Local\Programs\Python\Python310\lib\site-packages\d

127.0.0.1 - - [02/Aug/2022 19:14:14] "POST /_dash-update-component HTTP/1.1" 500 -


Exception on /_dash-update-component [POST]
Traceback (most recent call last):
  File "C:\Users\kinev\AppData\Local\Programs\Python\Python310\lib\site-packages\flask\app.py", line 2077, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\Users\kinev\AppData\Local\Programs\Python\Python310\lib\site-packages\flask\app.py", line 1525, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\Users\kinev\AppData\Local\Programs\Python\Python310\lib\site-packages\flask\app.py", line 1523, in full_dispatch_request
    rv = self.dispatch_request()
  File "C:\Users\kinev\AppData\Local\Programs\Python\Python310\lib\site-packages\flask\app.py", line 1509, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**req.view_args)
  File "C:\Users\kinev\AppData\Local\Programs\Python\Python310\lib\site-packages\dash\dash.py", line 1260, in dispatch
    ctx.run(
  File "C:\Users\kinev\AppData\Local\Programs\Python\Python310\lib\site-packages\d

127.0.0.1 - - [02/Aug/2022 19:14:17] "POST /_dash-update-component HTTP/1.1" 500 -
127.0.0.1 - - [02/Aug/2022 19:14:20] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [02/Aug/2022 19:14:20] "GET /_dash-component-suites/dash/dcc/async-graph.js HTTP/1.1" 304 -
127.0.0.1 - - [02/Aug/2022 19:14:20] "GET /_dash-component-suites/dash/dcc/async-plotlyjs.js HTTP/1.1" 304 -
127.0.0.1 - - [02/Aug/2022 19:14:30] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [02/Aug/2022 19:14:39] "POST /_dash-update-component HTTP/1.1" 200 -
