# Analyze NBA Matchup

## Module Imports and Settings

In [5]:
from bs4 import BeautifulSoup as Soup
from bs4 import Comment
from sys import exit
from os import path
import requests
from pandas import DataFrame
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

In [6]:
pd.options.display.max_columns = None
InteractiveShell.ast_node_interactivity = "all"

## Global Variables

In [7]:
DATA_DIR = 'C:\\Users\\Harry\\Documents\\LTCWFF\\ltcwff_files\\data'

teams = {'ATL': 'Atlanta Hawks', 'BOS': 'Boston Celtics', 'BRK': 'Brooklyn Nets', 'CHO': 'Charlotte Hornets', 'CHI': 'Chicago Bulls', 'CLE': 'Cleveland Cavaliers', 'DAL': 'Dallas Mavericks', 'DEN': 'Denver Nuggets', 'DET': 'Detroit Pistons', 'GSW': 'Golden State Warriors', 'HOU': 'Houston Rockets', 'IND': 'Indiana Pacers', 'LAC': 'Los Angeles Clippers', 'LAL': 'Los Angeles Lakers', 'MEM': 'Memphis Grizzlies', 'MIA': 'Miami Heat', 'MIL': 'Milwaukee Bucks', 'MIN': 'Minnesota Timberwolves', 'NOP': 'New Orleans Pelicans', 'NYK': 'New York Knicks', 'OKC': 'Oklahoma City Thunder', 'ORL': 'Orlando Magic', 'PHI': 'Philadelphia 76ers', 'PHO': 'Phoenix Suns', 'POR': 'Portland Trail Blazers', 'SAC': 'Sacramento Kings', 'SAS': 'San Antonio Spurs', 'TOR': 'Toronto Raptors', 'UTA': 'Utah Jazz', 'WAS': 'Washington Wizards'}

## Helper Functions

In [8]:
def get_url_from_team(team, year, games = '', prefix = 'https://www.basketball-reference.com/teams'):
    return f'{prefix}/{team}/{year}{games}.html'

In [9]:
def get_soup(team, year, games = ''):
    url = get_url_from_team(team, year, games)
    print(url)
    response = requests.get(url)
    if not 200 <= response.status_code < 300:
        exit('Invalid Team')
    return Soup(response.content, 'html.parser')

In [10]:
def parse_row(row):
    result = [ x.string for x in row.find_all('td') ]
    return result

In [11]:
def table_to_df(table, overheader = 0):
    cols = table.find('thead').find_all('tr')[overheader].find_all('th')
    cols = [ col.string for col in cols ]
    cols
    
    stat_table = table.find('tbody')
    stat_table
        
    rows = stat_table.find_all('tr')
    rows
    
    headers = [ row.find('th').string for row in rows ]
    headers = [ header for header in headers if header != 'G' ]
    
    list_of_parsed_rows = [ parse_row(row) for row in rows[0:len(rows)] ]
    list_of_parsed_rows = [ row for row in list_of_parsed_rows if row != [] ]
    list_of_parsed_rows
    
    df = DataFrame(list_of_parsed_rows)
    df.insert(0, '', headers)
    df.columns = cols
    
    return df

In [12]:
def convert_df_to_int(df):
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col])
        except:
            continue
    return df

## Team Statistics

In [13]:
def get_team_misc(soup, team, year):    
    div = soup.find('div', {'id': 'all_team_misc'})
    comment = div.find(string = lambda text: isinstance(text, Comment))
    table = Soup(comment, 'html.parser')
    table

    return table_to_df(table, 1)

In [14]:
def get_team_stats(soup, team, year, prompt = 'Team to research: '):    
    div = soup.find('div', {'id': 'all_team_and_opponent'})
    comment = div.find(string = lambda text: isinstance(text, Comment))
    table = Soup(comment, 'html.parser')
    
    return table_to_df(table, 0)

In [15]:
def get_team_dfs(team, year = 2021):
    soup = get_soup(team, year)
    
    main_df = get_team_stats(soup, team, year)
    
    misc_df = get_team_misc(soup, team, year)
    
    return [main_df, misc_df]

## Game Statistics

In [16]:
def get_all_games(team, year = 2021):
    soup = get_soup(team, year, '_games')
    
    table = soup.find('table', {'id': 'games'})
    table
    
    tds = table.find_all('td', {'data-stat': 'box_score_text'})
    hrefs = [ f"https://www.basketball-reference.com{td.find('a')['href']}" for td in tds ]
        
    df = table_to_df(table)
    
    df['url'] = hrefs
    
    return df

In [17]:
def get_game_stats(url):
    response = requests.get(url)
    if not 200 <= response.status_code < 300:
        exit('Invalid Game')
    soup = Soup(response.content, 'html.parser')
    
    line_div = soup.find('div', {'id': 'all_line_score'})
    line_comment = line_div.find(string = lambda text: isinstance(text, Comment))
    line_table = Soup(line_comment, 'html.parser')
    
    factors_div = soup.find('div', {'id': 'all_four_factors'})
    factors_comment = factors_div.find(string = lambda text: isinstance(text, Comment))
    factors_table = Soup(factors_comment, 'html.parser')
    
    line_df = table_to_df(line_table, 1)
    line_df = line_df.set_index(line_df.columns[0])
    factors_df = table_to_df(factors_table, 1)
    factors_df = factors_df.set_index(factors_df.columns[0])
    
    df = pd.concat([line_df, factors_df], axis = 1)
    
    return df

## Analyze Matchup

In [18]:
team_1 = input('Team 1: ')
team_2 = input('Team 2: ')

Team 1:  BOS
Team 2:  LAL


In [19]:
team_1_dfs = get_team_dfs(team_1)
team_1_dfs[0]
team_1_dfs[1]

https://www.basketball-reference.com/teams/BOS/2021.html


Unnamed: 0,Unnamed: 1,G,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Team,12.0,2880,496,1060,0.468,146,399,0.366,350,661,0.53,191,256,0.746,136,404,540,272,108,72,186,254,1329
1,Team/G,,240.0,41.3,88.3,0.468,12.2,33.3,0.366,29.2,55.1,0.53,15.9,21.3,0.746,11.3,33.7,45.0,22.7,9.0,6.0,15.5,21.2,110.8
2,Lg Rank,,29,12,14,12.0,20,22,15.0,11,12,15.0,20,17,19.0,4,24,12,25,6,5,23,26,16
3,Year/Year,,-0.9%,0.2%,-1.4%,0.007,-3.2%,-3.7%,0.002,1.6%,0.1%,0.008,-14.2%,-7.9%,-0.055,6.2%,-4.9%,-2.3%,-1.4%,8.9%,6.4%,12.2%,-1.9%,-2.6%
4,Opponent,12.0,2880,480,1038,0.462,140,387,0.362,340,651,0.522,214,273,0.784,112,398,510,267,94,63,180,241,1314
5,Opponent/G,,240.0,40.0,86.5,0.462,11.7,32.3,0.362,28.3,54.3,0.522,17.8,22.8,0.784,9.3,33.2,42.5,22.3,7.8,5.3,15.0,20.1,109.5
6,Lg Rank,,29,11,9,18.0,5,4,12.0,17,16,16.0,23,20,24.0,8,7,6,1,14,16,14,15,12
7,Year/Year,,-0.9%,4.2%,-0.6%,0.021,-1.3%,-7.3%,0.022,6.6%,3.8%,0.014,-4.9%,-7.0%,0.017,-9.6%,-2.5%,-4.1%,-0.7%,9.7%,-4.5%,-1.5%,-2.8%,2.0%


Unnamed: 0,Unnamed: 1,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,Pace,FTr,3PAr,eFG%,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Arena,Attendance
0,Team,8,4,6,6,1.25,-0.16,1.09,111.1,109.9,99.7,0.242,0.376,0.537,13.7,25.5,0.18,0.53,13.5,78.3,0.206,TD Garden,0
1,Lg Rank,7,27,16,9,12.0,20.0,14.0,12.0,17.0,20.0,19.0,20.0,15.0,22.0,4.0,20.0,16.0,11.0,15.0,23.0,,8


In [20]:
team_2_dfs = get_team_dfs(team_2)
team_2_dfs[0]
team_2_dfs[1]

https://www.basketball-reference.com/teams/LAL/2021.html


Unnamed: 0,Unnamed: 1,G,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Team,15.0,3600,644,1321,0.488,187,478,0.391,457,843,0.542,254,337,0.754,157,572,729,383,104,97,228,279,1729
1,Team/G,,240.0,42.9,88.1,0.488,12.5,31.9,0.391,30.5,56.2,0.542,16.9,22.5,0.754,10.5,38.1,48.6,25.5,6.9,6.5,15.2,18.6,115.3
2,Lg Rank,,2,5,17,2.0,18,23,5.0,5,10,10.0,15,14,15.0,14,2,3,13,23,2,19,6,6
3,Year/Year,,-0.3%,1.4%,-0.3%,0.008,13.2%,0.9%,0.042,-2.7%,-0.9%,-0.01,-4.6%,-7.7%,0.025,-1.8%,8.7%,6.3%,0.5%,-19.6%,-1.9%,0.2%,-10.2%,1.6%
4,Opponent,15.0,3600,603,1354,0.445,172,509,0.338,431,845,0.51,199,261,0.762,134,485,619,363,124,70,205,309,1577
5,Opponent/G,,240.0,40.2,90.3,0.445,11.5,33.9,0.338,28.7,56.3,0.51,13.3,17.4,0.762,8.9,32.3,41.3,24.2,8.3,4.7,13.7,20.6,105.1
6,Lg Rank,,2,15,21,5.0,4,13,4.0,21,23,12.0,1,1,15.0,7,4,4,14,22,11,23,11,3
7,Year/Year,,-0.3%,2.9%,3.6%,-0.003,-1.6%,1.5%,-0.011,4.8%,4.9%,-0.0,-25.8%,-24.0%,-0.019,-5.1%,-1.6%,-2.4%,3.4%,0.3%,26.0%,-14.1%,-5.3%,-2.3%


Unnamed: 0,Unnamed: 1,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,Pace,FTr,3PAr,eFG%,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Arena,Attendance
0,Team,11,4,12,3,10.13,-0.66,9.47,114.6,104.5,100.6,0.255,0.362,0.558,13.4,24.5,0.192,0.509,12.2,81.0,0.147,STAPLES Center,0
1,Lg Rank,1,27,1,1,1.0,24.0,1.0,5.0,1.0,17.0,14.0,23.0,5.0,19.0,5.0,14.0,5.0,24.0,2.0,2.0,,8


In [21]:
def dfs_to_composite_df(team, dfs):
    df_basic = dfs[0].iloc[[1], :]
    df_basic.iloc[0, 0] = f'{team}/G'
    df_basic = df_basic.set_index(df_basic.columns[0])
    df_basic_lg = dfs[0].iloc[[2], :]
    df_basic_lg.iloc[0, 0] = f'{team}/LG'
    df_basic_lg = df_basic_lg.set_index(df_basic_lg.columns[0])
    #print(df_basic_lg)
    df_advanced = dfs[1].iloc[[0], :]
    df_advanced.iloc[0, 0] = f'{team}/G'
    df_advanced = df_advanced.set_index(df_advanced.columns[0])
    df_advanced_lg = dfs[1].iloc[[1], :]
    df_advanced_lg.iloc[0, 0] = f'{team}/LG'
    df_advanced_lg = df_advanced_lg.set_index(df_advanced_lg.columns[0])
    #print(df_advanced_lg)
    row_1 = pd.concat([df_basic, df_advanced], axis = 1)
    row_2 = pd.concat([df_basic_lg, df_advanced_lg], axis = 1)
    #print(row_1)
    #print(row_2)
    composite_df = row_1.append(row_2)
    #print(type(list(composite_df.columns[0:38])))
    #print(list(composite_df.columns[0:38]) + [ f'OPP {col}' for col in composite_df.columns[38:] ])
    composite_df.columns = list(composite_df.columns[0:39]) + [ f'OPP {col}' for col in composite_df.columns[39:] ]
    return composite_df

In [22]:
composite_df = dfs_to_composite_df(team_1, team_1_dfs).append(dfs_to_composite_df(team_2, team_2_dfs))
composite_df = composite_df.drop(labels = ['G', 'OPP Arena', 'OPP Attendance'], axis = 1)
#composite_df = composite_df.iloc[:, 0:38]
composite_df.iloc[[0, 2], :] = convert_df_to_int(composite_df.iloc[[0, 2], :])

composite_df = composite_df.reindex([composite_df.index[0], composite_df.index[2], composite_df.index[1], composite_df.index[3]])
composite_df
#tm_2, lg_1 = composite_df.iloc[2].copy(), composite_df.iloc[1].copy()
#composite_df.iloc[1], composite_df.iloc[2] = tm_2, lg_1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_basic.iloc[0, 0] = f'{team}/G'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_basic_lg.iloc[0, 0] = f'{team}/LG'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_advanced.iloc[0, 0] = f'{team}/G'
A value is tryin

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,Pace,FTr,3PAr,eFG%,TOV%,ORB%,FT/FGA,OPP eFG%,OPP TOV%,OPP DRB%,OPP FT/FGA
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
BOS/G,240.0,41.3,88.3,0.468,12.2,33.3,0.366,29.2,55.1,0.53,15.9,21.3,0.746,11.3,33.7,45.0,22.7,9.0,6.0,15.5,21.2,110.8,8.0,4.0,6.0,6.0,1.25,-0.16,1.09,111.1,109.9,99.7,0.242,0.376,0.537,13.7,25.5,0.18,0.53,13.5,78.3,0.206
LAL/G,240.0,42.9,88.1,0.488,12.5,31.9,0.391,30.5,56.2,0.542,16.9,22.5,0.754,10.5,38.1,48.6,25.5,6.9,6.5,15.2,18.6,115.3,11.0,4.0,12.0,3.0,10.13,-0.66,9.47,114.6,104.5,100.6,0.255,0.362,0.558,13.4,24.5,0.192,0.509,12.2,81.0,0.147
BOS/LG,29.0,12.0,14.0,12.0,20.0,22.0,15.0,11.0,12.0,15.0,20.0,17.0,19.0,4.0,24.0,12.0,25.0,6.0,5.0,23.0,26.0,16.0,7.0,27.0,16.0,9.0,12.0,20.0,14.0,12.0,17.0,20.0,19.0,20.0,15.0,22.0,4.0,20.0,16.0,11.0,15.0,23.0
LAL/LG,2.0,5.0,17.0,2.0,18.0,23.0,5.0,5.0,10.0,10.0,15.0,14.0,15.0,14.0,2.0,3.0,13.0,23.0,2.0,19.0,6.0,6.0,1.0,27.0,1.0,1.0,1.0,24.0,1.0,5.0,1.0,17.0,14.0,23.0,5.0,19.0,5.0,14.0,5.0,24.0,2.0,2.0


In [23]:
means = composite_df.iloc[[0, 1], :].mean()
#means = composite_df.iloc[[0, 2], :].mean()
means.name = 'Mean'
composite_df.append(means)

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,Pace,FTr,3PAr,eFG%,TOV%,ORB%,FT/FGA,OPP eFG%,OPP TOV%,OPP DRB%,OPP FT/FGA
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
BOS/G,240.0,41.3,88.3,0.468,12.2,33.3,0.366,29.2,55.1,0.53,15.9,21.3,0.746,11.3,33.7,45.0,22.7,9.0,6.0,15.5,21.2,110.8,8.0,4.0,6.0,6.0,1.25,-0.16,1.09,111.1,109.9,99.7,0.242,0.376,0.537,13.7,25.5,0.18,0.53,13.5,78.3,0.206
LAL/G,240.0,42.9,88.1,0.488,12.5,31.9,0.391,30.5,56.2,0.542,16.9,22.5,0.754,10.5,38.1,48.6,25.5,6.9,6.5,15.2,18.6,115.3,11.0,4.0,12.0,3.0,10.13,-0.66,9.47,114.6,104.5,100.6,0.255,0.362,0.558,13.4,24.5,0.192,0.509,12.2,81.0,0.147
BOS/LG,29.0,12.0,14.0,12.0,20.0,22.0,15.0,11.0,12.0,15.0,20.0,17.0,19.0,4.0,24.0,12.0,25.0,6.0,5.0,23.0,26.0,16.0,7.0,27.0,16.0,9.0,12.0,20.0,14.0,12.0,17.0,20.0,19.0,20.0,15.0,22.0,4.0,20.0,16.0,11.0,15.0,23.0
LAL/LG,2.0,5.0,17.0,2.0,18.0,23.0,5.0,5.0,10.0,10.0,15.0,14.0,15.0,14.0,2.0,3.0,13.0,23.0,2.0,19.0,6.0,6.0,1.0,27.0,1.0,1.0,1.0,24.0,1.0,5.0,1.0,17.0,14.0,23.0,5.0,19.0,5.0,14.0,5.0,24.0,2.0,2.0
Mean,240.0,42.1,88.2,0.478,12.35,32.6,0.3785,29.85,55.65,0.536,16.4,21.9,0.75,10.9,35.9,46.8,24.1,7.95,6.25,15.35,19.9,113.05,9.5,4.0,9.0,4.5,5.69,-0.41,5.28,112.85,107.2,100.15,0.2485,0.369,0.5475,13.55,25.0,0.186,0.5195,12.85,79.65,0.1765


In [24]:
games = get_all_games(team_1)
#head_to_heads = [ games.loc[ind, :] for ind in games.index if games.loc[ind, 'Opponent'] == teams[team_2] ]
head_to_heads = games.loc[games['Opponent'] == teams[team_2], :]
head_to_heads

https://www.basketball-reference.com/teams/BOS/2021_games.html


Unnamed: 0,G,Date,Start (ET),Unnamed: 4,Unnamed: 5,Unnamed: 6,Opponent,Unnamed: 8,Unnamed: 9,Tm,Opp,W,L,Streak,Notes,url
17,18,"Sat, Jan 30, 2021",8:30p,,,,Los Angeles Lakers,,,,,,,,,https://www.basketball-reference.com/boxscores...


In [25]:
box_scores = [ get_game_stats(url) for url in head_to_heads['url'] ]
for box_score in box_scores:
    box_score

SystemExit: Invalid Game

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
