Skip to content

Commit

Permalink
Fixed issue with changed eventTypeId in json. Also fixed espn as they…
Browse files Browse the repository at this point in the history
… changed the layout
  • Loading branch information
HarryShomer committed Jul 17, 2019
1 parent c17f9bc commit 6d45ca9
Show file tree
Hide file tree
Showing 8 changed files with 58 additions and 37 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,8 @@ v1.31

* Added functionality to automatically create docs_dir
* Added folder to store csv files

v1.34
-----
* Fixed bug with nhl changing contents of eventTypeId
* Updated ESPN scraping after they changed the layout of the pages
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@

# General information about the project.
project = 'hockey_scraper'
copyright = '2018, Harry Shomer'
copyright = '2019, Harry Shomer'
author = 'Harry Shomer'

# The version info for the project you're documenting, acts as replacement for
Expand Down
31 changes: 26 additions & 5 deletions hockey_scraper/nhl/pbp/espn_pbp.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,29 +36,50 @@ def get_game_ids(response):
"""
soup = BeautifulSoup(response, 'lxml')

divs = soup.findAll('div', {'class': "game-header"})
regex = re.compile(r'id="(\d+)')
game_ids = [regex.findall(str(div))[0] for div in divs]
sections = soup.findAll("section", {"class": "Scoreboard bg-clr-white flex flex-auto justify-between"})
game_ids = [section['id'] for section in sections]

return game_ids


def get_teams(response):
"""
Extract Teams for date from doc
ul-> class = ScoreCell__Competitors
div -> class = ScoreCell__TeamName ScoreCell__TeamName--shortDisplayName truncate db
:param response: doc
:return: list of teams
"""
teams = []
soup = BeautifulSoup(response, 'lxml')

td = soup.findAll('td', {'class': "team"})
teams = [shared.get_team(t.get_text().upper()) for t in td if t.get_text() != '']
uls = soup.findAll('div', {'class': "ScoreCell__Team"})

for ul in uls:
actual_tm = None
tm = ul.find('div', {'class': "ScoreCell__TeamName ScoreCell__TeamName--shortDisplayName truncate db"}).text

# ESPN stores the name and not the city
for real_tm in list(shared.TEAMS.keys()):
if tm.upper() in real_tm:
actual_tm = shared.TEAMS[real_tm]

        # If not found we'll let the user know...this may happen
if actual_tm is None:
shared.print_warning("The team {} in the espn pbp is unknown. We use the supplied team name".format(tm))
actual_tm = tm

teams.append(actual_tm)

# Make a list of both teams for each game
games = [teams[i:i + 2] for i in range(0, len(teams), 2)]

print(games)

return games


Expand Down
37 changes: 16 additions & 21 deletions hockey_scraper/nhl/pbp/json_pbp.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,27 +55,27 @@ def change_event_name(event):
:return: fixed event type
"""
event_types = {
'PERIOD_START': 'PSTR',
'PERIOD START': 'PSTR',
'FACEOFF': 'FAC',
'BLOCKED_SHOT': 'BLOCK',
'GAME_END': 'GEND',
'BLOCKED SHOT': 'BLOCK',
'GAME END': 'GEND',
'GIVEAWAY': 'GIVE',
'GOAL': 'GOAL',
'HIT': 'HIT',
'MISSED_SHOT': 'MISS',
'PERIOD_END': 'PEND',
'MISSED SHOT': 'MISS',
'PERIOD END': 'PEND',
'SHOT': 'SHOT',
'STOP': 'STOP',
'STOPPAGE': 'STOP',
'TAKEAWAY': 'TAKE',
'PENALTY': 'PENL',
'EARLY_INT_START': 'EISTR',
'EARLY_INT_END': 'EIEND',
'SHOOTOUT_COMPLETE': 'SOC',
'EARLY INT START': 'EISTR',
'EARLY INT END': 'EIEND',
'SHOOTOUT COMPLETE': 'SOC',
'CHALLENGE': 'CHL',
'EMERGENCY_GOALTENDER': 'EGPID'
'EMERGENCY GOALTENDER': 'EGPID'
}

return event_types.get(event, event)
return event_types.get(event.upper(), event)


def parse_event(event):
Expand All @@ -90,7 +90,7 @@ def parse_event(event):

play['event_id'] = event['about']['eventIdx']
play['period'] = event['about']['period']
play['event'] = str(change_event_name(event['result']['eventTypeId']))
play['event'] = str(change_event_name(event['result']['event']))
play['seconds_elapsed'] = shared.convert_to_seconds(event['about']['periodTime'])

# If there's a players key that means an event occurred on the play.
Expand All @@ -103,13 +103,8 @@ def parse_event(event):
play['p{}_name'.format(i + 1)] = shared.fix_name(event['players'][i]['player']['fullName'].upper())
play['p{}_ID'.format(i + 1)] = event['players'][i]['player']['id']

# Coordinates aren't always there
try:
play['xC'] = event['coordinates']['x']
play['yC'] = event['coordinates']['y']
except KeyError:
play['xC'] = ''
play['yC'] = ''
play['xC'] = event['coordinates'].get('x')
play['yC'] = event['coordinates'].get('y')

return play

Expand All @@ -126,11 +121,11 @@ def parse_json(game_json, game_id):
columns = ['period', 'event', 'seconds_elapsed', 'p1_name', 'p1_ID', 'p2_name', 'p2_ID', 'p3_name', 'p3_ID', 'xC', 'yC']

# 'PERIOD READY' & 'PERIOD OFFICIAL'..etc aren't found in html...so get rid of them
events_to_ignore = ['PERIOD_READY', 'PERIOD_OFFICIAL', 'GAME_READY', 'GAME_OFFICIAL', 'GAME_SCHEDULED']
events_to_ignore = ['PERIOD READY', 'PERIOD OFFICIAL', 'GAME READY', 'GAME OFFICIAL', 'GAME SCHEDULED']

try:
plays = game_json['liveData']['plays']['allPlays']
events = [parse_event(play) for play in plays if play['result']['eventTypeId'] not in events_to_ignore]
events = [parse_event(play) for play in plays if play['result']['event'].upper() not in events_to_ignore]
except Exception as e:
shared.print_warning('Error parsing Json pbp for game {} {}'.format(game_id, e))
return None
Expand Down
7 changes: 4 additions & 3 deletions hockey_scraper/utils/save_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,11 @@ def check_file_exists(file_info):
:return: Boolean - True if it exists
"""
# Create the docs subdir if it doesn't exist
# Create the docs and csvs subdir if it doesn't exist
if not os.path.isdir(os.path.join(file_info['dir'], 'docs')):
os.mkdir("docs")
os.mkdir("csvs")
os.mkdir(os.path.join(file_info['dir'], 'docs'))
if not os.path.isdir(os.path.join(file_info['dir'], 'csvs')):
os.mkdir(os.path.join(file_info['dir'], 'csvs'))

# Check if the folder for the season for the given game was created yet...if not create it
if not os.path.isdir(os.path.join(file_info['dir'], '/'.join(['docs', str(file_info['season'])]))):
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def read():

setup(
name='hockey_scraper',
version='1.32.3',
version='1.33',
description="""This package is designed to allow one to scrape the raw data for both the National Hockey League
(NHL) and the National Women's Hockey League (NWHL) off of their respective API and websites.""",
long_description=read(),
Expand Down
10 changes: 4 additions & 6 deletions tests/test_espn_pbp.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,16 @@ def test_get_teams(date_response):
""" Check to make sure we get a list of both teams for every game that day"""

# Games for that date
date_games = [
['ANA', 'MIN'], ['TOR', 'MTL'], ['NYR', 'PHI'], ['PIT', 'NSH'], ['T.B', 'CHI'], ['DET', 'VAN'],
['N.J', 'BUF'], ['ARI', 'OTT'], ['NYI', 'STL'], ['FLA', 'DAL'], ['CBJ', 'COL'], ['CAR', 'S.J']
]
date_games = [['ANA', 'MIN'], ['N.J', 'BUF'], ['TOR', 'MTL'], ['PHX', 'OTT'], ['NYR', 'PHI'], ['NYI', 'STL'],
['PIT', 'NSH'], ['FLA', 'DAL'], ['T.B', 'CHI'], ['CBJ', 'COL'], ['DET', 'VAN'], ['CAR', 'S.J']]

assert espn_pbp.get_teams(date_response) == date_games


def test_get_game_ids(date_response):
""" Check to see that all the espn game id's for that day are correct"""
game_ids = ['400814970', '400814972', '400814974', '400814976', '400814978', '400814980', '400814971', '400814973',
'400814975', '400814977', '400814979', '400814981']
game_ids = ['400814970', '400814971', '400814972', '400814973', '400814974', '400814975', '400814976',
'400814977', '400814978', '400814979', '400814980', '400814981']

assert espn_pbp.get_game_ids(date_response) == game_ids

Expand Down
1 change: 1 addition & 0 deletions tests/test_shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,5 @@ def test_get_file(file_info):
# Some cleanup....remove stuff created from the file directory and move back
os.chdir(os.path.dirname(os.path.realpath(__file__)))
shutil.rmtree("docs")
shutil.rmtree("csvs")
os.chdir(original_path)

0 comments on commit 6d45ca9

Please sign in to comment.