Skip to content

Commit

Permalink
Fixed issue with changed eventTypeId in json. Also fixed espn as they…
Browse files Browse the repository at this point in the history
… changed the layout
  • Loading branch information
HarryShomer committed Jul 17, 2019
1 parent c17f9bc commit 6d45ca9
Show file tree
Hide file tree
Showing 8 changed files with 58 additions and 37 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,8 @@ v1.31

* Added functionality to automatically create docs_dir
* Added folder to store csv files

v1.34
-----
* Fixed bug with nhl changing contents of eventTypeId
* Updated ESPN scraping after they changed the layout of the pages
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@

# General information about the project.
project = 'hockey_scraper'
copyright = '2018, Harry Shomer'
copyright = '2019, Harry Shomer'
author = 'Harry Shomer'

# The version info for the project you're documenting, acts as replacement for
Expand Down
31 changes: 26 additions & 5 deletions hockey_scraper/nhl/pbp/espn_pbp.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,29 +36,50 @@ def get_game_ids(response):
"""
soup = BeautifulSoup(response, 'lxml')

divs = soup.findAll('div', {'class': "game-header"})
regex = re.compile(r'id="(\d+)')
game_ids = [regex.findall(str(div))[0] for div in divs]
sections = soup.findAll("section", {"class": "Scoreboard bg-clr-white flex flex-auto justify-between"})
game_ids = [section['id'] for section in sections]

return game_ids


def get_teams(response):
"""
Extract Teams for date from doc
ul-> class = ScoreCell__Competitors
div -> class = ScoreCell__TeamName ScoreCell__TeamName--shortDisplayName truncate db
:param response: doc
:return: list of teams
"""
teams = []
soup = BeautifulSoup(response, 'lxml')

td = soup.findAll('td', {'class': "team"})
teams = [shared.get_team(t.get_text().upper()) for t in td if t.get_text() != '']
uls = soup.findAll('div', {'class': "ScoreCell__Team"})

for ul in uls:
actual_tm = None
tm = ul.find('div', {'class': "ScoreCell__TeamName ScoreCell__TeamName--shortDisplayName truncate db"}).text

# ESPN stores the name and not the city
for real_tm in list(shared.TEAMS.keys()):
if tm.upper() in real_tm:
actual_tm = shared.TEAMS[real_tm]

        # If not found we'll let the user know...this may happen
if actual_tm is None:
shared.print_warning("The team {} in the espn pbp is unknown. We use the supplied team name".format(tm))
actual_tm = tm

teams.append(actual_tm)

# Make a list of both teams for each game
games = [teams[i:i + 2] for i in range(0, len(teams), 2)]

print(games)

return games


Expand Down
37 changes: 16 additions & 21 deletions hockey_scraper/nhl/pbp/json_pbp.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,27 +55,27 @@ def change_event_name(event):
:return: fixed event type
"""
event_types = {
'PERIOD_START': 'PSTR',
'PERIOD START': 'PSTR',
'FACEOFF': 'FAC',
'BLOCKED_SHOT': 'BLOCK',
'GAME_END': 'GEND',
'BLOCKED SHOT': 'BLOCK',
'GAME END': 'GEND',
'GIVEAWAY': 'GIVE',
'GOAL': 'GOAL',
'HIT': 'HIT',
'MISSED_SHOT': 'MISS',
'PERIOD_END': 'PEND',
'MISSED SHOT': 'MISS',
'PERIOD END': 'PEND',
'SHOT': 'SHOT',
'STOP': 'STOP',
'STOPPAGE': 'STOP',
'TAKEAWAY': 'TAKE',
'PENALTY': 'PENL',
'EARLY_INT_START': 'EISTR',
'EARLY_INT_END': 'EIEND',
'SHOOTOUT_COMPLETE': 'SOC',
'EARLY INT START': 'EISTR',
'EARLY INT END': 'EIEND',
'SHOOTOUT COMPLETE': 'SOC',
'CHALLENGE': 'CHL',
'EMERGENCY_GOALTENDER': 'EGPID'
'EMERGENCY GOALTENDER': 'EGPID'
}

return event_types.get(event, event)
return event_types.get(event.upper(), event)


def parse_event(event):
Expand All @@ -90,7 +90,7 @@ def parse_event(event):

play['event_id'] = event['about']['eventIdx']
play['period'] = event['about']['period']
play['event'] = str(change_event_name(event['result']['eventTypeId']))
play['event'] = str(change_event_name(event['result']['event']))
play['seconds_elapsed'] = shared.convert_to_seconds(event['about']['periodTime'])

# If there's a players key that means an event occurred on the play.
Expand All @@ -103,13 +103,8 @@ def parse_event(event):
play['p{}_name'.format(i + 1)] = shared.fix_name(event['players'][i]['player']['fullName'].upper())
play['p{}_ID'.format(i + 1)] = event['players'][i]['player']['id']

# Coordinates aren't always there
try:
play['xC'] = event['coordinates']['x']
play['yC'] = event['coordinates']['y']
except KeyError:
play['xC'] = ''
play['yC'] = ''
play['xC'] = event['coordinates'].get('x')
play['yC'] = event['coordinates'].get('y')

return play

Expand All @@ -126,11 +121,11 @@ def parse_json(game_json, game_id):
columns = ['period', 'event', 'seconds_elapsed', 'p1_name', 'p1_ID', 'p2_name', 'p2_ID', 'p3_name', 'p3_ID', 'xC', 'yC']

# 'PERIOD READY' & 'PERIOD OFFICIAL'..etc aren't found in html...so get rid of them
events_to_ignore = ['PERIOD_READY', 'PERIOD_OFFICIAL', 'GAME_READY', 'GAME_OFFICIAL', 'GAME_SCHEDULED']
events_to_ignore = ['PERIOD READY', 'PERIOD OFFICIAL', 'GAME READY', 'GAME OFFICIAL', 'GAME SCHEDULED']

try:
plays = game_json['liveData']['plays']['allPlays']
events = [parse_event(play) for play in plays if play['result']['eventTypeId'] not in events_to_ignore]
events = [parse_event(play) for play in plays if play['result']['event'].upper() not in events_to_ignore]
except Exception as e:
shared.print_warning('Error parsing Json pbp for game {} {}'.format(game_id, e))
return None
Expand Down
7 changes: 4 additions & 3 deletions hockey_scraper/utils/save_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,11 @@ def check_file_exists(file_info):
:return: Boolean - True if it exists
"""
# Create the docs subdir if it doesn't exist
# Create the docs and csvs subdir if it doesn't exist
if not os.path.isdir(os.path.join(file_info['dir'], 'docs')):
os.mkdir("docs")
os.mkdir("csvs")
os.mkdir(os.path.join(file_info['dir'], 'docs'))
if not os.path.isdir(os.path.join(file_info['dir'], 'csvs')):
os.mkdir(os.path.join(file_info['dir'], 'csvs'))

# Check if the folder for the season for the given game was created yet...if not create it
if not os.path.isdir(os.path.join(file_info['dir'], '/'.join(['docs', str(file_info['season'])]))):
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def read():

setup(
name='hockey_scraper',
version='1.32.3',
version='1.33',
description="""This package is designed to allow one to scrape the raw data for both the National Hockey League
(NHL) and the National Women's Hockey League (NWHL) off of their respective API and websites.""",
long_description=read(),
Expand Down
10 changes: 4 additions & 6 deletions tests/test_espn_pbp.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,16 @@ def test_get_teams(date_response):
""" Check to make sure we get a list of both teams for every game that day"""

# Games for that date
date_games = [
['ANA', 'MIN'], ['TOR', 'MTL'], ['NYR', 'PHI'], ['PIT', 'NSH'], ['T.B', 'CHI'], ['DET', 'VAN'],
['N.J', 'BUF'], ['ARI', 'OTT'], ['NYI', 'STL'], ['FLA', 'DAL'], ['CBJ', 'COL'], ['CAR', 'S.J']
]
date_games = [['ANA', 'MIN'], ['N.J', 'BUF'], ['TOR', 'MTL'], ['PHX', 'OTT'], ['NYR', 'PHI'], ['NYI', 'STL'],
['PIT', 'NSH'], ['FLA', 'DAL'], ['T.B', 'CHI'], ['CBJ', 'COL'], ['DET', 'VAN'], ['CAR', 'S.J']]

assert espn_pbp.get_teams(date_response) == date_games


def test_get_game_ids(date_response):
""" Check to see that all the espn game id's for that day are correct"""
game_ids = ['400814970', '400814972', '400814974', '400814976', '400814978', '400814980', '400814971', '400814973',
'400814975', '400814977', '400814979', '400814981']
game_ids = ['400814970', '400814971', '400814972', '400814973', '400814974', '400814975', '400814976',
'400814977', '400814978', '400814979', '400814980', '400814981']

assert espn_pbp.get_game_ids(date_response) == game_ids

Expand Down
1 change: 1 addition & 0 deletions tests/test_shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,5 @@ def test_get_file(file_info):
# Some cleanup....remove stuff created from the file directory and move back
os.chdir(os.path.dirname(os.path.realpath(__file__)))
shutil.rmtree("docs")
shutil.rmtree("csvs")
os.chdir(original_path)

0 comments on commit 6d45ca9

Please sign in to comment.