In [1]:
#importing required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
#link to the site I wish to scrape
url = "https://www.basketball-reference.com/playoffs/"

In [3]:
#creating a response object
response = requests.get(url)

In [4]:
response

<Response [200]>

In [5]:
data = response.text

In [6]:
#parsing the data
soup = BeautifulSoup(data,"html.parser")

In [7]:
#extract column headers into a list
headers = [th.getText() for th in soup.find_all("tr", limit=2)[1].find_all('th')]

In [8]:
headers

['Year',
 'Lg',
 'Champion',
 'Runner-Up',
 'Finals MVP',
 '\xa0',
 'Points',
 'Rebounds',
 'Assists',
 'Win Shares']

In [9]:
#get rows from table
rows = soup.find_all("tr")[2:]
rows_data = [[td.getText() for td in rows[i].find_all("td")] for i in range(len(rows))]

In [10]:
#slicing the first 41 rows from the table
rows_data = rows_data[:41]

In [11]:
rows_data

[['NBA',
  'Golden State Warriors',
  'Boston Celtics',
  'S. Curry',
  '',
  'J. Tatum (615)',
  'A. Horford (214)',
  'J. Tatum (148)',
  'J. Butler (3.8)'],
 ['NBA',
  'Milwaukee Bucks',
  'Phoenix Suns',
  'G. Antetokounmpo',
  '',
  'G. Antetokounmpo (634)',
  'G. Antetokounmpo (269)',
  'J. Holiday (199)',
  'G. Antetokounmpo (3.7)'],
 ['NBA',
  'Los Angeles Lakers',
  'Miami Heat',
  'L. James',
  '',
  'A. Davis (582)',
  'L. James (226)',
  'L. James (184)',
  'A. Davis (4.5)'],
 ['NBA',
  'Toronto Raptors',
  'Golden State Warriors',
  'K. Leonard',
  '',
  'K. Leonard (732)',
  'D. Green (223)',
  'D. Green (187)',
  'K. Leonard (4.9)'],
 ['NBA',
  'Golden State Warriors',
  'Cleveland Cavaliers',
  'K. Durant',
  '',
  'L. James (748)',
  'D. Green (222)',
  'L. James (198)',
  'L. James (5.2)'],
 ['NBA',
  'Golden State Warriors',
  'Cleveland Cavaliers',
  'K. Durant',
  '',
  'L. James (591)',
  'K. Love (191)',
  'L. James (141)',
  'L. James (4.3)'],
 ['NBA',
  'Clevel

In [12]:
#adding years into rows_data
last_year = 2022
for i in range(0, len(rows_data)):
    rows_data[i].insert(0, last_year)
    last_year -= 1

In [13]:
rows_data

[[2022,
  'NBA',
  'Golden State Warriors',
  'Boston Celtics',
  'S. Curry',
  '',
  'J. Tatum (615)',
  'A. Horford (214)',
  'J. Tatum (148)',
  'J. Butler (3.8)'],
 [2021,
  'NBA',
  'Milwaukee Bucks',
  'Phoenix Suns',
  'G. Antetokounmpo',
  '',
  'G. Antetokounmpo (634)',
  'G. Antetokounmpo (269)',
  'J. Holiday (199)',
  'G. Antetokounmpo (3.7)'],
 [2020,
  'NBA',
  'Los Angeles Lakers',
  'Miami Heat',
  'L. James',
  '',
  'A. Davis (582)',
  'L. James (226)',
  'L. James (184)',
  'A. Davis (4.5)'],
 [2019,
  'NBA',
  'Toronto Raptors',
  'Golden State Warriors',
  'K. Leonard',
  '',
  'K. Leonard (732)',
  'D. Green (223)',
  'D. Green (187)',
  'K. Leonard (4.9)'],
 [2018,
  'NBA',
  'Golden State Warriors',
  'Cleveland Cavaliers',
  'K. Durant',
  '',
  'L. James (748)',
  'D. Green (222)',
  'L. James (198)',
  'L. James (5.2)'],
 [2017,
  'NBA',
  'Golden State Warriors',
  'Cleveland Cavaliers',
  'K. Durant',
  '',
  'L. James (591)',
  'K. Love (191)',
  'L. James

In [14]:
#creating the dataframe
nba_finals = pd.DataFrame(rows_data, columns=headers)
nba_finals.head()

Unnamed: 0,Year,Lg,Champion,Runner-Up,Finals MVP,Unnamed: 6,Points,Rebounds,Assists,Win Shares
0,2022,NBA,Golden State Warriors,Boston Celtics,S. Curry,,J. Tatum (615),A. Horford (214),J. Tatum (148),J. Butler (3.8)
1,2021,NBA,Milwaukee Bucks,Phoenix Suns,G. Antetokounmpo,,G. Antetokounmpo (634),G. Antetokounmpo (269),J. Holiday (199),G. Antetokounmpo (3.7)
2,2020,NBA,Los Angeles Lakers,Miami Heat,L. James,,A. Davis (582),L. James (226),L. James (184),A. Davis (4.5)
3,2019,NBA,Toronto Raptors,Golden State Warriors,K. Leonard,,K. Leonard (732),D. Green (223),D. Green (187),K. Leonard (4.9)
4,2018,NBA,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (748),D. Green (222),L. James (198),L. James (5.2)


In [15]:
nba_finals.columns

Index(['Year', 'Lg', 'Champion', 'Runner-Up', 'Finals MVP', ' ', 'Points',
       'Rebounds', 'Assists', 'Win Shares'],
      dtype='object')

In [16]:
#dropping the whitespace column
nba_finals.drop('\xa0',axis=1,inplace=True)

In [17]:
nba_finals.columns

Index(['Year', 'Lg', 'Champion', 'Runner-Up', 'Finals MVP', 'Points',
       'Rebounds', 'Assists', 'Win Shares'],
      dtype='object')

In [18]:
nba_finals.head()

Unnamed: 0,Year,Lg,Champion,Runner-Up,Finals MVP,Points,Rebounds,Assists,Win Shares
0,2022,NBA,Golden State Warriors,Boston Celtics,S. Curry,J. Tatum (615),A. Horford (214),J. Tatum (148),J. Butler (3.8)
1,2021,NBA,Milwaukee Bucks,Phoenix Suns,G. Antetokounmpo,G. Antetokounmpo (634),G. Antetokounmpo (269),J. Holiday (199),G. Antetokounmpo (3.7)
2,2020,NBA,Los Angeles Lakers,Miami Heat,L. James,A. Davis (582),L. James (226),L. James (184),A. Davis (4.5)
3,2019,NBA,Toronto Raptors,Golden State Warriors,K. Leonard,K. Leonard (732),D. Green (223),D. Green (187),K. Leonard (4.9)
4,2018,NBA,Golden State Warriors,Cleveland Cavaliers,K. Durant,L. James (748),D. Green (222),L. James (198),L. James (5.2)


In [19]:
nba_finals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Year        41 non-null     int64 
 1   Lg          41 non-null     object
 2   Champion    40 non-null     object
 3   Runner-Up   40 non-null     object
 4   Finals MVP  40 non-null     object
 5   Points      40 non-null     object
 6   Rebounds    40 non-null     object
 7   Assists     40 non-null     object
 8   Win Shares  40 non-null     object
dtypes: int64(1), object(8)
memory usage: 3.0+ KB


In [20]:
#searching for missing values
nba_finals.isnull()

Unnamed: 0,Year,Lg,Champion,Runner-Up,Finals MVP,Points,Rebounds,Assists,Win Shares
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False


#### Only the 20th row is missing data

In [21]:
nba_finals.iloc[20]

Year          2002
Lg                
Champion      None
Runner-Up     None
Finals MVP    None
Points        None
Rebounds      None
Assists       None
Win Shares    None
Name: 20, dtype: object

In [22]:
#filling in the missing values
nba_finals.fillna({"Champion":"Los Angeles Lakers","Runner-Up":"New Jersey Nets","Finals MVP":"S. O'Neal","Points":"S. O'Neal (541)", "Rebounds":"S. O'Neal (239)","Assists":"J. Kidd (182)","Win Shares":"S. O'Neal (3.8)"},inplace=True)

In [23]:
nba_finals["Lg"]

0     NBA
1     NBA
2     NBA
3     NBA
4     NBA
5     NBA
6     NBA
7     NBA
8     NBA
9     NBA
10    NBA
11    NBA
12    NBA
13    NBA
14    NBA
15    NBA
16    NBA
17    NBA
18    NBA
19    NBA
20       
21    NBA
22    NBA
23    NBA
24    NBA
25    NBA
26    NBA
27    NBA
28    NBA
29    NBA
30    NBA
31    NBA
32    NBA
33    NBA
34    NBA
35    NBA
36    NBA
37    NBA
38    NBA
39    NBA
40    NBA
Name: Lg, dtype: object

In [24]:
nba_finals.at[20, "Lg"] = 'NBA'

In [25]:
nba_finals["Lg"]

0     NBA
1     NBA
2     NBA
3     NBA
4     NBA
5     NBA
6     NBA
7     NBA
8     NBA
9     NBA
10    NBA
11    NBA
12    NBA
13    NBA
14    NBA
15    NBA
16    NBA
17    NBA
18    NBA
19    NBA
20    NBA
21    NBA
22    NBA
23    NBA
24    NBA
25    NBA
26    NBA
27    NBA
28    NBA
29    NBA
30    NBA
31    NBA
32    NBA
33    NBA
34    NBA
35    NBA
36    NBA
37    NBA
38    NBA
39    NBA
40    NBA
Name: Lg, dtype: object

In [26]:
nba_finals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Year        41 non-null     int64 
 1   Lg          41 non-null     object
 2   Champion    41 non-null     object
 3   Runner-Up   41 non-null     object
 4   Finals MVP  41 non-null     object
 5   Points      41 non-null     object
 6   Rebounds    41 non-null     object
 7   Assists     41 non-null     object
 8   Win Shares  41 non-null     object
dtypes: int64(1), object(8)
memory usage: 3.0+ KB


In [27]:
nba_finals.describe()

Unnamed: 0,Year
count,41.0
mean,2002.0
std,11.979149
min,1982.0
25%,1992.0
50%,2002.0
75%,2012.0
max,2022.0


In [28]:
nba_finals.rename(columns = {"Lg":"League"},inplace=True)

### Data Discovery

#### Champions refers to the team that won that year's NBA playoffs (every league is NBA).

#### Runner-up is the team that lost the finals of that year's playoffs thus taking second place in the tournament (a.k.a Runner-up)

#### Finals MVP is the most valuable player in that year's playoffs final game.

#### Points refers to the player that scored the most points in that year playoffs, with the number of points scored in brackets. The NBA playoffs is the postseason tournament of the National Basketball Association (NBA) held to determine the league's champion. An annual best-of-seven elimination tournment, the NBA playoffs are held after the league's regular season and its preliminary postseason tournament, the NBA Play-in Tournament.

#### The playoffs are comprised of 16 teams in total- the eight best from each of the NBA's two conferences (East and West). The teams in each conference are ranked by their win-loss record from the regular season. The six teams with the most wins from each conference progress immediately to the playoffs. The NBA Play-In tournament determines the final two spots (the seventh and eighth seeds) of the main playoff brackets, using a four-team, three-game format. Within each conference, teams that finished ranked seventh-tenth play each other as follows.

#### Rebounds refers to the player with the highest number of rebounds in that playoff. A rebound is when a player gets the ball after it misses the hoop, hits off the backboard, or bounces off the rim. It occurs when a player gains possession of the basketball after a missed field goal, three-point field goal, or free throw attempt.

#### Win shares refers to the player that contributed the most to their team wins in that year's playoff.

### Data Manipulation

In [29]:
nba_finals.head()

Unnamed: 0,Year,League,Champion,Runner-Up,Finals MVP,Points,Rebounds,Assists,Win Shares
0,2022,NBA,Golden State Warriors,Boston Celtics,S. Curry,J. Tatum (615),A. Horford (214),J. Tatum (148),J. Butler (3.8)
1,2021,NBA,Milwaukee Bucks,Phoenix Suns,G. Antetokounmpo,G. Antetokounmpo (634),G. Antetokounmpo (269),J. Holiday (199),G. Antetokounmpo (3.7)
2,2020,NBA,Los Angeles Lakers,Miami Heat,L. James,A. Davis (582),L. James (226),L. James (184),A. Davis (4.5)
3,2019,NBA,Toronto Raptors,Golden State Warriors,K. Leonard,K. Leonard (732),D. Green (223),D. Green (187),K. Leonard (4.9)
4,2018,NBA,Golden State Warriors,Cleveland Cavaliers,K. Durant,L. James (748),D. Green (222),L. James (198),L. James (5.2)


#### Splitting the Points, Rebounds, Assists, Win Shares columns to create four additional columns.

In [30]:
nba_finals["Highest Total Points"] = pd.to_numeric(nba_finals["Points"].str.split('(').str[1].str.strip(')'))

In [31]:
nba_finals["Scored by"] = nba_finals["Points"].str.split('(').str[0].str.strip()

In [32]:
nba_finals["Highest Rebounds"] = pd.to_numeric(nba_finals["Rebounds"].str.split('(').str[1].str.strip(')'))

In [33]:
nba_finals["Caught by"] = nba_finals["Rebounds"].str.split('(').str[0].str.strip()

In [34]:
nba_finals["Highest Assists"] = pd.to_numeric(nba_finals["Assists"].str.split('(').str[1].str.strip(')'))

In [35]:
nba_finals["Made by"] = nba_finals["Assists"].str.split('(').str[0].str.strip()

In [36]:
nba_finals["Highest Win Shares"] = pd.to_numeric(nba_finals["Win Shares"].str.split('(').str[1].str.strip(')'))

In [37]:
nba_finals["Player with HWS"] = nba_finals["Win Shares"].str.split('(').str[0].str.strip()

In [38]:
nba_finals.drop(["Points","Rebounds","Assists","Win Shares"], axis=1,inplace=True)

In [39]:
nba_finals

Unnamed: 0,Year,League,Champion,Runner-Up,Finals MVP,Highest Total Points,Scored by,Highest Rebounds,Caught by,Highest Assists,Made by,Highest Win Shares,Player with HWS
0,2022,NBA,Golden State Warriors,Boston Celtics,S. Curry,615,J. Tatum,214,A. Horford,148,J. Tatum,3.8,J. Butler
1,2021,NBA,Milwaukee Bucks,Phoenix Suns,G. Antetokounmpo,634,G. Antetokounmpo,269,G. Antetokounmpo,199,J. Holiday,3.7,G. Antetokounmpo
2,2020,NBA,Los Angeles Lakers,Miami Heat,L. James,582,A. Davis,226,L. James,184,L. James,4.5,A. Davis
3,2019,NBA,Toronto Raptors,Golden State Warriors,K. Leonard,732,K. Leonard,223,D. Green,187,D. Green,4.9,K. Leonard
4,2018,NBA,Golden State Warriors,Cleveland Cavaliers,K. Durant,748,L. James,222,D. Green,198,L. James,5.2,L. James
5,2017,NBA,Golden State Warriors,Cleveland Cavaliers,K. Durant,591,L. James,191,K. Love,141,L. James,4.3,L. James
6,2016,NBA,Cleveland Cavaliers,Golden State Warriors,L. James,582,K. Thompson,228,D. Green,198,R. Westbrook,4.7,L. James
7,2015,NBA,Golden State Warriors,Cleveland Cavaliers,A. Iguodala,601,L. James,238,D. Howard,169,L. James,3.9,S. Curry
8,2014,NBA,San Antonio Spurs,Miami Heat,K. Leonard,563,K. Durant,211,T. Duncan,153,R. Westbrook,4.3,L. James
9,2013,NBA,Miami Heat,San Antonio Spurs,L. James,596,L. James,214,T. Duncan,152,L. James,5.2,L. James


In [40]:
nba_finals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Year                  41 non-null     int64  
 1   League                41 non-null     object 
 2   Champion              41 non-null     object 
 3   Runner-Up             41 non-null     object 
 4   Finals MVP            41 non-null     object 
 5   Highest Total Points  41 non-null     int64  
 6   Scored by             41 non-null     object 
 7   Highest Rebounds      41 non-null     int64  
 8   Caught by             41 non-null     object 
 9   Highest Assists       41 non-null     int64  
 10  Made by               41 non-null     object 
 11  Highest Win Shares    41 non-null     float64
 12  Player with HWS       41 non-null     object 
dtypes: float64(1), int64(4), object(8)
memory usage: 4.3+ KB


### EXPLORATORY DATA ANALYSIS

In [41]:
nba_finals.corr()

Unnamed: 0,Year,Highest Total Points,Highest Rebounds,Highest Assists,Highest Win Shares
Year,1.0,0.257277,0.037175,-0.418909,0.323496
Highest Total Points,0.257277,1.0,0.321165,-0.04831,0.390516
Highest Rebounds,0.037175,0.321165,1.0,-0.225206,0.284492
Highest Assists,-0.418909,-0.04831,-0.225206,1.0,0.020659
Highest Win Shares,0.323496,0.390516,0.284492,0.020659,1.0


#### I will be using tableau to visualize the data.

In [42]:
nba_finals.to_csv("nba.csv")