Skip to content

Commit

Permalink
Fixed issue with duplicate rows in pbp
Browse files Browse the repository at this point in the history
  • Loading branch information
HarryShomer committed Feb 11, 2018
1 parent e82e03c commit 781bbc3
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 16 deletions.
6 changes: 3 additions & 3 deletions hockey_scraper/espn_pbp.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ def parse_event(event):
if fields[4] == '5':
return None

info['xC'] = fields[0]
info['yC'] = fields[1]
info['xC'] = float(fields[0])
info['yC'] = float(fields[1])
info['time_elapsed'] = shared.convert_to_seconds(fields[3])
info['period'] = fields[4]
info['event'] = event_type(fields[8].upper())
Expand All @@ -149,7 +149,7 @@ def parse_espn(espn_xml):
text = espn_xml.text
# Occasionally we get malformed XML because of the presence of \x13 characters
# Let's just replace them with dashes
text = text.replace(u'\x13','-')
text = text.replace(u'\x13', '-')

try:
tree = etree.fromstring(text)
Expand Down
11 changes: 7 additions & 4 deletions hockey_scraper/game_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,10 +163,13 @@ def combine_espn_html_pbp(html_df, espn_df, game_id, date, away_team, home_team)
if espn_df is not None:
try:
espn_df.period = espn_df.period.astype(int)
df = pd.merge(html_df, espn_df, left_on=['Period', 'Seconds_Elapsed', 'Event'],
right_on=['period', 'time_elapsed', 'event'], how='left')
game_df = pd.merge(html_df, espn_df, left_on=['Period', 'Seconds_Elapsed', 'Event'],
right_on=['period', 'time_elapsed', 'event'], how='left')

df = df.drop(['period', 'time_elapsed', 'event'], axis=1)
# The merge can occasionally produce accidental duplicate rows
game_df = game_df.drop_duplicates(subset=['Period', 'Event', 'Description', 'Seconds_Elapsed'])

df = game_df.drop(['period', 'time_elapsed', 'event'], axis=1)
except Exception as e:
print('Error for combining espn and html pbp for game {}'.format(game_id), e)
return None
Expand Down Expand Up @@ -216,7 +219,7 @@ def scrape_pbp(game_id, date, roster, game_json, players, teams):
game_df = combine_espn_html_pbp(html_df, espn_df, str(game_id), date, teams['Away'], teams['Home'])

# Sometimes espn is corrupted so can't get coordinates
if espn_df.empty:
if espn_df is None or espn_df.empty:
missing_coords.extend([[game_id, date]])

# Because every game b4 2010 uses ESPN so no point in adding it in there
Expand Down
27 changes: 22 additions & 5 deletions hockey_scraper/html_pbp.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,17 +161,19 @@ def add_zone(event_dict, play_description):
event_dict['Ev_Zone'] = 'Def'


def add_type(event_dict, event, players, home_team):
    """
    Add "type" for event -> either a penalty or a shot type

    :param event_dict: dict of event info
    :param event: list with parsed event info
    :param players: dict of home and away players in game
    :param home_team: home team for game

    :return: None
    """
    if 'PENL' in event[4]:
        # Penalty events carry the infraction name in the description field;
        # players/home_team are passed through for player-name resolution
        event_dict['Type'] = get_penalty(event[5], players, home_team)
    else:
        event_dict['Type'] = shot_type(event[5]).upper()

Expand Down Expand Up @@ -269,19 +271,34 @@ def add_score(event_dict, event, current_score, home_team):
event_dict['score_diff'] = current_score['Home'] - current_score['Away']


# TODO: Fix penalty name parsing -> breaks on multi-word surnames like "Del Zotto"
# Note: Remember master list player name != html name!
def get_penalty(play_description, players, home_team):
    """
    Get the penalty info from the play description.

    The penalty name is everything after "<TEAM> #<NUM> <SURNAME>" up to
    (and including) the closing parenthesis, e.g. "Hooking(2 min)".

    :param play_description: description of play field
    :param players: all players with info (currently unused; kept for the
                    planned fix that resolves the HTML surname against the
                    master player list -- see TODO above)
    :param home_team: home team for game (currently unused; see above)

    :return: penalty description string, or '' when nothing matches
    """
    regex = re.compile(r'.{3}\s+#\d+\s+\w+\s+(.*)\)')
    penalty = regex.findall(play_description)

    if penalty:
        # The capture group stops before the trailing ')', so restore it
        return penalty[0] + ')'
    else:
        return ''

Expand Down Expand Up @@ -558,7 +575,7 @@ def parse_event(event, players, home_team, if_plays_in_json, current_score):
add_score(event_dict, event, current_score, home_team)
populate_players(event_dict, players, away_players, home_players)
add_strength(event_dict, home_players, away_players)
add_type(event_dict, event)
add_type(event_dict, event, players, home_team)
add_zone(event_dict, event[5])
add_home_zone(event_dict, home_team)

Expand Down
6 changes: 5 additions & 1 deletion hockey_scraper/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,12 @@ def fix_name(name):
:param name: name in pbp
:return: Either the given parameter or the fixed name
if name == "SEBASTIAN AHO" and team == "CAR":
name = "SEBASTIAN ANTERO AHO"
"""
return Names.get(name,name).upper()
return Names.get(name, name).upper()


def convert_to_seconds(minutes):
"""
Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
BeautifulSoup4==4.5.3
requests==2.12.4
requests==2.14.2
lxml==3.7.2
html5lib==0.999999999
pandas==0.19.2
pandas==0.20.3
sphinx==1.5.1
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def read():

setup(
name='hockey_scraper',
version='1.2',
version='1.2.1',
description="""This package is designed to allow people to scrape Play by Play and Shift data off of the National
Hockey League (NHL) API and website for all preseason, regular season and playoff games since the
2007-2008 season""",
Expand Down

0 comments on commit 781bbc3

Please sign in to comment.