update nhl schedule to always rescrape the page. See the docstring for nhl.json_schedule.get_schedule for more details

HarryShomer committed Jan 25, 2021
1 parent eff06f3 commit fb14534

Showing 8 changed files with 31 additions and 19 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst

@@ -47,3 +47,4 @@ v1.37
 * Now saves scraped pages in docs_dir as a GZIP
 * Only print full error summary when the number of games scraped is >= 25
 * Remove hardcoded exception for Sebastian Aho. Updated process to work without it.
+* Always rescrape schedule pages
5 changes: 3 additions & 2 deletions hockey_scraper/__init__.py

@@ -1,6 +1,7 @@
 #from .nwhl import scrape_functions as nwhl
 from .nhl.live_scrape import ScrapeLiveGames, LiveGame
 from .nhl.scrape_functions import scrape_games, scrape_date_range, scrape_seasons, scrape_schedule
 from .nhl import live_scrape
 from .utils import shared
-from . import utils
+from . import utils
+
+#from .nwhl import scrape_schedule as nwhl_scrape_schedule
6 changes: 5 additions & 1 deletion hockey_scraper/nhl/json_schedule.py

@@ -8,6 +8,9 @@
 import hockey_scraper.utils.shared as shared


+# TODO: Currently rescraping page each time since the status of some games may have changed
+# (e.g. Scraped on 2020-01-20 and game on 2020-01-21 was not Final...when use old page again will still think not Final)
+# Need to find a more elegant way of doing this
 def get_schedule(date_from, date_to):
     """
     Scrapes games in date range
@@ -25,7 +28,7 @@ def get_schedule(date_from, date_to):
         "season": shared.get_season(date_from),
     }

-    return json.loads(shared.get_file(page_info))
+    return json.loads(shared.get_file(page_info, force=True))


 def chunk_schedule_calls(from_date, to_date):
@@ -129,4 +132,5 @@ def scrape_schedule(date_from, date_to, preseason=False, not_over=False):
                 "status": game["status"]["abstractGameState"]
             })

+
     return schedule
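
The TODO above is the heart of this commit: a schedule page cached before its games finished will keep reporting them as not Final on every later run, so get_schedule now opts out of the cache entirely. Below is a minimal sketch of the cache-bypass idea; the helper names fetch_url and CACHE_DIR are hypothetical stand-ins for the library's scrape_page and shared.docs_dir, and the real code additionally gzips saved pages.

import os
import urllib.request

CACHE_DIR = "docs"  # stand-in for shared.docs_dir

def fetch_url(url):
    # Stand-in for the library's scrape_page(): a plain HTTP GET
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode("utf-8")

def get_file(url, name, force=False):
    # Serve the cached copy only when one exists AND the caller didn't force a rescrape
    path = os.path.join(CACHE_DIR, name + ".txt")
    if not force and os.path.isfile(path):
        with open(path, encoding="utf-8") as f:
            return f.read()
    page = fetch_url(url)
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(page)
    return page

Game pages are effectively immutable once a game is Final, so they keep the cache-first behavior; only the volatile schedule pages pay for a network round trip on every run.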
4 changes: 2 additions & 2 deletions hockey_scraper/nhl/scrape_functions.py

@@ -194,11 +194,11 @@ def scrape_seasons(seasons, if_scrape_shifts, data_format='csv', preseason=False
         if data_format.lower() == 'csv':
             shared.to_csv(str(season) + str(season + 1), pbp_df, "nhl", "pbp")
             shared.to_csv(str(season) + str(season + 1), shifts_df, "nhl", "shifts")
-        else:
+        elif pbp_df is not None:
             master_pbps.append(pbp_df)
             master_shifts.append(shifts_df)

-    if data_format.lower() == 'pandas':
+    if data_format.lower() == 'pandas' and master_pbps:
         if if_scrape_shifts:
             return {"pbp": pd.concat(master_pbps), "shifts": pd.concat(master_shifts)}
         else:
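
Two failure modes evidently motivate this pair of guards: pbp_df can come back as None when a season fails to scrape, and pd.concat raises ValueError ("No objects to concatenate") when every season failed and the list is empty. A standalone sketch of the pattern, with an illustrative season_results list:

import pandas as pd

master_pbps = []
season_results = [pd.DataFrame({"event": ["GOAL"]}), None]  # None = a season that failed to scrape

for pbp_df in season_results:
    if pbp_df is not None:   # mirrors the new elif: never append a failed season
        master_pbps.append(pbp_df)

if master_pbps:              # mirrors the new 'and master_pbps': pd.concat([]) raises ValueError
    print(pd.concat(master_pbps))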
2 changes: 1 addition & 1 deletion hockey_scraper/nwhl/scrape_functions.py

@@ -4,7 +4,7 @@
 import random
 import pandas as pd

-from . import html_schedule, json_pbp
+#from . import html_schedule, json_pbp
 import hockey_scraper.utils.shared as shared

 # All columns for the pbp
21 changes: 13 additions & 8 deletions hockey_scraper/nwhl/scrape_schedule.py

@@ -39,7 +39,7 @@ def scrape_dynamic(url):
     return pg


-def get_schedule(url, name):#
+def get_schedule(url, name):
     """
     Given a url it returns the raw html
@@ -50,7 +50,7 @@ def get_schedule(url, name):#
     """
     file_info = {
         "url": url,
-        "name": str(name),
+        "name": str(name) + "_schedule",
         "type": "html_schedule_nwhl",
         "season": "nwhl",
         'dir': shared.docs_dir
@@ -70,7 +70,7 @@ def get_season_codes():
     """
     They use fucked up codes instead of actual years to represent seasons in the url.
-    e.g. For 2015 - 'https://www.nwhl.zone/stats#/100/schedule?season_id=246
+    e.g. For 2019 - https://www.nwhl.zone/stats#/100/schedule?all&season_id=1950

     Instead of hardcoding it I just ping the base page and get the codes
@@ -96,7 +96,9 @@
 def parse_game(game, season):
     """
-    Given a soup object for a given game parse out the info
+    Given a soup object for a given game parse out the info.
+    Skip over all-star game
+
     :param games: Soup object
     :param season: nwhl season
@@ -107,6 +109,10 @@ def parse_game(game, season):
     # Team info
     teams = game.find_all("span", {"class": "team-inline"})
+
+    if "All-Star" in teams[0].text:
+        return parsed_game
+
     parsed_game['away_team'] = teams[0].find("span").text
     parsed_game['home_team'] = teams[1].find("span").text

@@ -155,10 +161,9 @@ def get_season_games(season, season_code):
     games = sched.find_all("tr", {"class": re.compile("^ng-scope")})

     for game in games:
-        # For 2015 below each game is an additionak <tr> with the attendance
-        # We filter out by checking if the following attribute exists
-        if game.get('ng-if') is None:
-            parsed_games.append(parse_game(game, season))
+        g = parse_game(game, season)
+        if g:
+            parsed_games.append(g)

     return parsed_games
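
The All-Star skip works through a sentinel: parse_game bails out with the still-empty parsed_game dict, and because an empty dict is falsy in Python, the new if g: check in get_season_games silently drops it. A self-contained sketch of the pattern, using made-up games in place of the soup objects:

def parse_game(game):
    parsed = {}
    if "All-Star" in game["teams"][0]:
        return parsed            # empty dict -> falsy -> caller skips it
    parsed["away"], parsed["home"] = game["teams"]
    return parsed

games = [
    {"teams": ["Boston Pride", "Minnesota Whitecaps"]},
    {"teams": ["All-Star Team A", "All-Star Team B"]},
]

schedule = [g for g in (parse_game(game) for game in games) if g]
print(schedule)  # only the regular game survives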
6 changes: 3 additions & 3 deletions hockey_scraper/utils/save_pages.py

@@ -17,9 +17,9 @@ def create_base_file_path(file_info):
     :return: path
     """
     # Shitty fix for when you already have it saved but don't have nwhl folders
-    # if 'nwhl' in file_info['type']:
-    #     if not os.path.isdir(os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type'])):
-    #         os.mkdir(os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type']))
+    if 'nwhl' in file_info['type']:
+        if not os.path.isdir(os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type'])):
+            os.mkdir(os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type']))

     return os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type'], file_info['name'] + ".txt")
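
Design note: the isdir/mkdir pair does the job but can race if two scrapers start at once, and os.mkdir fails when a parent directory is missing. A sketch of the usual one-liner alternative, assuming the same file_info layout as above:

import os

def ensure_type_dir(file_info):
    # exist_ok=True makes this idempotent; makedirs also creates missing parents
    path = os.path.join(file_info['dir'], 'docs', str(file_info['season']), file_info['type'])
    os.makedirs(path, exist_ok=True)
    return path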
5 changes: 3 additions & 2 deletions hockey_scraper/utils/shared.py

@@ -271,7 +271,7 @@ def add_dir(user_dir):
                     "deposited in or recheck the directory you typed in and start again.\n")


-def get_file(file_info):
+def get_file(file_info, force=False):
     """
     Get the specified file.
@@ -280,13 +280,14 @@ def get_file(file_info):
     :param file_info: Dictionary containing the info for the file.
                       Contains the url, name, type, and season
+    :param force: Force a rescrape. Default is False

     :return: page
     """
     file_info['dir'] = docs_dir

     # If everything checks out we'll retrieve it, otherwise we scrape it
-    if docs_dir and sp.check_file_exists(file_info) and not re_scrape:
+    if docs_dir and sp.check_file_exists(file_info) and not re_scrape and not force:
         page = sp.get_page(file_info)
     else:
         page = scrape_page(file_info['url'])
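
For callers, the new flag composes with the existing docs_dir and re_scrape machinery: the cached copy is used only when all four conditions hold, and force=True short-circuits that for schedule pages alone. A hedged usage sketch; only the keys url/name/type/season come from the docstring above, and the concrete values (including the statsapi URL) are illustrative:

import hockey_scraper.utils.shared as shared

# Schedule page: statuses may have flipped to Final since the last run -> always refetch
sched_info = {
    "url": "https://statsapi.web.nhl.com/api/v1/schedule?startDate=2021-01-20&endDate=2021-01-25",
    "name": "2021-01-20_2021-01-25",
    "type": "json_schedule",
    "season": 2020,
}
schedule_page = shared.get_file(sched_info, force=True)

# Any other page type keeps the default force=False and reuses the cached copy when present
# pbp_page = shared.get_file(pbp_info)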
