In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment 
import pandas as pd
import html5lib
import time 
import requests
import re

In [2]:
# URL for team stats 
url_template_team = "http://www.basketball-reference.com/teams/{team}/{year}/gamelog/"

In [3]:
# teams
teams = ["LAL", "BOS", "DEN", "DAL", "DET", "SAC", "WAS", "WSB", "GSW", "SAS", "POR", "PHI", "NJN", "BRK", "UTA", "OKC", 
         "SEA", "LAC", "ATL", "HOU", "CHA", "CHH", "CHO", "MIL", "NYK", "CHI", "PHO", "MIA", "CLE", "IND", "ORL", "MIN", 
         "MEM", "VAN", "TOR", "NOH", "NOK", "NOP"]

In [4]:
# function to get column headers (pass in the url and the row at which the column headers begin)
def get_columns(url_column, row):
    
    # get the HTML from the url passed in 
    html = urlopen(url_column)
    
    # make a BS object
    soup = BeautifulSoup(html, "lxml")
    
    # find all the tr tags for the column header of interest 
    tr_tag_list = soup.findAll('tr')[row-1] 
    
    # extract table header cell elements from the tag object
    cell_el_list = tr_tag_list.findAll('th')

    # create an empty list to hold all the elements in the column header 
    column_headers = []  

    # for each cell element
    for th in cell_el_list:  
        col_element = th.getText()
        # append each cell element to the column_header list
        column_headers.append(col_element)
    return column_headers

In [5]:
column_teams = get_columns("http://www.basketball-reference.com/teams/ATL/1975/gamelog/", 2)

In [6]:
# function to run through multiple years and multiple teams and get the regular season game logs 
def get_game_logs(url_curr, column_headers, teams, start, end):
    
    # create an empty DataFrame to store all the game logs for all the team's seasons 
    team_stats_df = pd.DataFrame()
    
    for team in teams:
        for year in range(start, end):
            # get the url
            url = url_curr.format(team=team, year=year)
        
            # check if there are game logs for the current team and the current season
            try:
                html = urlopen(url)
            except:
                print("No data for " + team + " in the " + str(year) + " season" + " in the regular season")
                continue
                
            # get the html
            html = urlopen(url)
            # create the beautiful soup object 
            soup = BeautifulSoup(html, "lxml")
            
            ### PART 1: REGULAR SEASON DATA 
    
            # get regular season data (starts on second row)
            data_rows = soup.findAll('tr')[1:]
            
            # check if data_rows actually contains data
            if len(data_rows)<1:
                print("No data for " + team + " in the " + str(year) + " season" + " in the regular season")
                continue 

            # create an empty list to hold all the regular season stats for the current season and the current team 
            season_data = []  

            for i in range(len(data_rows)):
                # create an empty list for each game 
                game_row = []
            
                # first value. Must be handled separately since it goes by th tag instead of td tag 
                game_row.append(data_rows[i].findAll('th')[0].getText())

                # for each table data element from each table row
                for td in data_rows[i].findAll('td'):        
                    # get the text content and append to the game_row 
                    game_row.append(td.getText())        

                # then append each game to the season_data matrix
                season_data.append(game_row)
            
            # Turn season data into a DatFrame
            season_df = pd.DataFrame(season_data, columns=column_headers)
            # Add game_type column
            season_df.insert(0, 'GameType', 'RegularSeason')
                
            ### PART 2: PLAYOFF DATA
            
            # get all the comments on the page
            comments=soup.find_all(string=lambda text:isinstance(text,Comment))
            
            # create an empty list to hold all the playoff stats for the current season and the current team 
            season_data_2 = []
            
            for comment in comments:
                comment_soup = BeautifulSoup(comment, "lxml")
                # print(len(comment_soup.find_all("table")))
                # check if the team has playoffs data
                if(len(comment_soup.find_all("table"))>0):
                    for table in comment_soup.find_all("table"):
                        data_rows_2 = table.findAll('tr')[2:]
                        
                        for i in range(len(data_rows_2)):
                            game_row_2 = []
                            game_row_2.append(data_rows_2[i].findAll('th')[0].getText())
                            for td_2 in data_rows_2[i].findAll('td'):
                                game_row_2.append(td_2.getText())
            
                            season_data_2.append(game_row_2)
            
                        season_2_df = pd.DataFrame(season_data_2, columns=column_teams)
                        season_2_df.insert(0, 'GameType', 'Playoffs')
                    # exit the loop after it finds the playoff data 
                    break
                else:
                    # no playoff data
                    season_2_df = pd.DataFrame()
                    continue 
        
            # combine the regular season data frame with the playoffs data frame
            all_season_df = season_df.append(season_2_df)
            
            # create and insert the Season and Team column
            all_season_df.insert(1, 'Season', year)
            all_season_df.insert(2, 'Team', team)
        
            # Append to the big dataframe
            team_stats_df = team_stats_df.append(all_season_df, ignore_index=True)

    return team_stats_df

In [7]:
start_time = time.time()

# call function for game logs 
team_game_logs_df = get_game_logs(url_template_team, column_teams, teams, 1987, 2018)

print("%f seconds" % (time.time() - start_time))

No data for WAS in the 1987 season in the regular season
No data for WAS in the 1988 season in the regular season
No data for WAS in the 1989 season in the regular season
No data for WAS in the 1990 season in the regular season
No data for WAS in the 1991 season in the regular season
No data for WAS in the 1992 season in the regular season
No data for WAS in the 1993 season in the regular season
No data for WAS in the 1994 season in the regular season
No data for WAS in the 1995 season in the regular season
No data for WAS in the 1996 season in the regular season
No data for WAS in the 1997 season in the regular season
No data for WSB in the 1998 season in the regular season
No data for WSB in the 1999 season in the regular season
No data for WSB in the 2000 season in the regular season
No data for WSB in the 2001 season in the regular season
No data for WSB in the 2002 season in the regular season
No data for WSB in the 2003 season in the regular season
No data for WSB in the 2004 sea

In [8]:
# look at number of rows
len(team_game_logs_df)

83724

In [9]:
# look at first few rows 
team_game_logs_df.head()

Unnamed: 0,GameType,Season,Team,Rk,G,Date,Unnamed: 7,Opp,W/L,Tm,...,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
0,RegularSeason,1987,LAL,Rk,,,,,,,...,,,,,,,,,,
1,RegularSeason,1987,LAL,1,1.0,1986-11-01,@,HOU,L,102.0,...,21.0,34.0,0.618,13.0,35.0,27.0,7.0,6.0,11.0,22.0
2,RegularSeason,1987,LAL,2,2.0,1986-11-04,@,SEA,W,110.0,...,20.0,25.0,0.8,19.0,44.0,24.0,12.0,4.0,19.0,26.0
3,RegularSeason,1987,LAL,3,3.0,1986-11-07,,DEN,W,138.0,...,23.0,28.0,0.821,5.0,33.0,33.0,5.0,1.0,19.0,25.0
4,RegularSeason,1987,LAL,4,4.0,1986-11-09,,NYK,W,111.0,...,20.0,23.0,0.87,8.0,39.0,20.0,8.0,7.0,22.0,22.0


In [10]:
# look at last few rows 
team_game_logs_df.tail()

Unnamed: 0,GameType,Season,Team,Rk,G,Date,Unnamed: 7,Opp,W/L,Tm,...,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
83719,RegularSeason,2017,NOP,80,80.0,2017-04-08,@,GSW,L,101.0,...,15.0,16.0,0.938,14.0,53.0,37.0,2.0,4.0,9.0,15.0
83720,RegularSeason,2017,NOP,,,,,,,,...,,,,,,,,,,
83721,RegularSeason,2017,NOP,Rk,,,,,,,...,,,,,,,,,,
83722,RegularSeason,2017,NOP,81,81.0,2017-04-11,@,LAL,L,96.0,...,16.0,23.0,0.696,8.0,47.0,24.0,15.0,3.0,11.0,18.0
83723,RegularSeason,2017,NOP,82,82.0,2017-04-12,@,POR,W,103.0,...,5.0,13.0,0.385,14.0,53.0,22.0,4.0,2.0,23.0,18.0


In [11]:
# function to clean data
def clean_df(df):
    # Convert data to proper data types
    df = df.convert_objects(convert_numeric=True)

    # Get rid of the rows full of null values
    df = df[df.Season.notnull()]

    # Replace NaNs with 0s
    df = df.fillna(0)
    
    # Change % symbol
    df.columns = df.columns.str.replace('%', '_Perc')
    
    return df

In [12]:
# clean advanced player stats df
team_game_logs_df = clean_df(team_game_logs_df)



In [13]:
team_game_logs_df.columns

Index(['GameType', 'Season', 'Team', 'Rk', 'G', 'Date', ' ', 'Opp', 'W/L',
       'Tm', 'Opp', 'FG', 'FGA', 'FG_Perc', '3P', '3PA', '3P_Perc', 'FT',
       'FTA', 'FT_Perc', 'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', ' ',
       'FG', 'FGA', 'FG_Perc', '3P', '3PA', '3P_Perc', 'FT', 'FTA', 'FT_Perc',
       'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF'],
      dtype='object')

In [14]:
# Check if missing values in the DataFrame
team_game_logs_df.isnull().sum() 

GameType    0
Season      0
Team        0
Rk          0
G           0
Date        0
            0
Opp         0
W/L         0
Tm          0
Opp         0
FG          0
FGA         0
FG_Perc     0
3P          0
3PA         0
3P_Perc     0
FT          0
FTA         0
FT_Perc     0
ORB         0
TRB         0
AST         0
STL         0
BLK         0
TOV         0
PF          0
            0
FG          0
FGA         0
FG_Perc     0
3P          0
3PA         0
3P_Perc     0
FT          0
FTA         0
FT_Perc     0
ORB         0
TRB         0
AST         0
STL         0
BLK         0
TOV         0
PF          0
dtype: int64

In [15]:
# write data frame to CSV
team_game_logs_df.to_csv("team_game_logs_basic_1987_2017.csv")