
Commit

Fix issue when no players listed in html description of block. Also moved some stuff to a new config file
HarryShomer committed Mar 6, 2021
1 parent e0fbdb6 commit b8c4acc
Showing 11 changed files with 145 additions and 99 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -5,6 +5,7 @@ tests.py
notes.txt
build/
.pytest_cache
update_season_data.py

# Byte-compiled / optimized / DLL files
__pycache__/
Binary file removed hockey_scraper/.DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion hockey_scraper/nhl/game_scraper.py
@@ -165,7 +165,7 @@ def combine_html_json_pbp(json_df, html_df, game_id, date):
# This is always done - because merge doesn't work well with shootouts
game_df = game_df.drop_duplicates(subset=['Period', 'Event', 'Description', 'Seconds_Elapsed'])
except Exception as e:
shared.print_error('Problem combining Html Json pbp for game {}'.format(game_id, e))
shared.print_error('Problem combining Html Json pbp for game {}'.format(game_id))
return

game_df['Game_Id'] = game_id[-5:]
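
A note on the game_scraper.py tweak above: str.format() silently ignores surplus positional arguments, so the old call never actually printed the exception text, and the new call simply drops the unused argument. A quick illustration (the game id is made up):

'Problem combining Html Json pbp for game {}'.format(2020020001, ValueError("boom"))
# -> 'Problem combining Html Json pbp for game 2020020001'  (the extra argument is silently ignored)
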
2 changes: 1 addition & 1 deletion hockey_scraper/nhl/pbp/espn_pbp.py
@@ -195,7 +195,7 @@ def parse_espn(espn_xml):

try:
tree = etree.fromstring(espn_xml)
except etree.ParseError:
except etree.ParseError as e:
shared.print_error("Espn pbp isn't valid xml, therefore coordinates can't be obtained for this game")
return pd.DataFrame([], columns=columns)

21 changes: 12 additions & 9 deletions hockey_scraper/nhl/pbp/html_pbp.py
@@ -538,14 +538,17 @@ def parse_block(description, players, home_team):
regex = re.compile(r'(.{3})\s+#(\d+)')
desc = regex.findall(description) # [[Team, num], [Team, num]]

p1 = get_player_name(desc[len(desc) - 1][1], players, desc[len(desc) - 1][0], home_team)
event_info['p1_name'] = p1['name']
event_info['p1_ID'] = p1['id']
if len(desc) == 0:
event_info['p1_name'] = event_info['p2_name'] = event_info['p1_ID'] = event_info['p2_ID'] = None
else:
p1 = get_player_name(desc[len(desc) - 1][1], players, desc[len(desc) - 1][0], home_team)
event_info['p1_name'] = p1['name']
event_info['p1_ID'] = p1['id']

if len(desc) > 1:
p2 = get_player_name(desc[0][1], players, desc[0][0], home_team)
event_info['p2_name'] = p2['name']
event_info['p2_ID'] = p2['id']
if len(desc) > 1:
p2 = get_player_name(desc[0][1], players, desc[0][0], home_team)
event_info['p2_name'] = p2['name']
event_info['p2_ID'] = p2['id']

return event_info

@@ -805,8 +808,8 @@ def scrape_pbp(game_html, game_id, players, teams):
try:
game_df = parse_html(cleaned_html, players, teams)
except Exception as e:
shared.print_error('Error parsing Html pbp for game {} {}'.format(game_id, e))
return None
shared.print_error('Error parsing Html pbp for game {} {}'.format(game_id, e))
return None

# These sometimes end up as objects
game_df.Period = game_df.Period.astype(int)
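
The parse_block change above is the core of the fix: the block description is scanned with re.compile(r'(.{3})\s+#(\d+)'), and when no players are listed findall() returns an empty list, so the old desc[len(desc) - 1] lookup raised an IndexError. The new branch fills the player fields with None instead. A minimal sketch of both cases (the sample descriptions are hypothetical):

import re

regex = re.compile(r'(.{3})\s+#(\d+)')

regex.findall("MTL #6 BLOCKED BY TOR #44")   # [('MTL', '6'), ('TOR', '44')] -> players looked up as before
regex.findall("BLOCKED BY OTHER")            # [] -> p1/p2 name and ID now default to None instead of raising
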
14 changes: 14 additions & 0 deletions hockey_scraper/utils/config.py
@@ -0,0 +1,14 @@
"""
Basic configurations
"""

# Directory where to save pages
# When True assumes ~/hockey_scraper_data
# Otherwise can take str to `existing` directory
DOCS_DIR = False

# Boolean that tells us whether or not we should re-scrape a given page if it's already saved
RESCRAPE = False

# Whether to log verbose errors to log file
LOG = False
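
The new config.py is just a module-level settings holder whose values are read at run time by shared.py below (config.DOCS_DIR, config.RESCRAPE, config.LOG). A rough sketch of how that looks, assuming the usual package layout; in practice DOCS_DIR and RESCRAPE are set through shared.add_dir() and shared.if_rescrape() rather than directly:

from hockey_scraper.utils import config

config.DOCS_DIR = True    # save scraped pages under ~/hockey_scraper_data
config.RESCRAPE = False   # reuse already-saved pages instead of re-downloading them
config.LOG = True         # also write errors to a log file (see log_error further down)
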
3 changes: 2 additions & 1 deletion hockey_scraper/utils/player_name_fixes.json
@@ -274,6 +274,7 @@
"ALEXANDER NYLANDER": "ALEX NYLANDER",
"CHRISTOPHER WAGNER": "CHRIS WAGNER",
"EGOR SHARANGOVICH": "Yegor Sharangovich",
"ALEXIS LAFRENI?RE": "Alexis Lafrenière"
"ALEXIS LAFRENI?RE": "Alexis Lafrenière",
"CALLAN FOOTE": "Cal Foote"
}
}
2 changes: 2 additions & 0 deletions hockey_scraper/utils/save_pages.py
@@ -41,6 +41,8 @@ def create_dir_structure(dir_name):
Create the basic directory structure for docs_dir if not done yet.
Creates the docs and csvs subdir if it doesn't exist
:param dir_name: Name of dir to create
:return None
"""
if not os.path.isdir(os.path.join(dir_name, 'docs')):
166 changes: 96 additions & 70 deletions hockey_scraper/utils/shared.py
@@ -1,33 +1,26 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This file is a bunch of the shared functions or just general stuff used by the different scrapers in the package.
"""

import os
import time
import json
import logging
import warnings
import requests
from datetime import datetime, timedelta
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from . import save_pages as sp

# Directory where to save pages
docs_dir = False

# Boolean that tells us whether or not we should re-scrape a given page if it's already saved
re_scrape = False
from . import config
import inspect

# Directory where this file lives
FILE_DIR = os.path.dirname(os.path.realpath(__file__))


# Name and Team fixes used
with open(os.path.join(FILE_DIR, "player_name_fixes.json"), "r") as f:
Names = json.load(f)['fixes']


with open(os.path.join(FILE_DIR, "team_tri_codes.json"), "r") as f:
TEAMS = json.load(f)['teams']

@@ -63,41 +56,75 @@ def custom_formatwarning(msg, *args, **kwargs):

def print_error(msg):
"""
Implement own custom error using warning module.
Reason why i still use warning for errors is so i can set to ignore them if i want to (e.g. live_scrape line 200).
shared.print_error relies on this function.
Implement own custom error using warning module. Prints in red
See here for more on ANSI escape codes - https://en.wikipedia.org/wiki/ANSI_escape_code
Reason why i still use warning for errors is so i can set to ignore them if i want to (e.g. live_scrape line 200).
:param msg: Str to print
:return: str
:return: None
"""
ansi_red_code = '\033[0;31m'
warning_msg = "{}Error: {}".format(ansi_red_code, msg)

# if config.LOG:
# caller_file = os.path.basename(inspect.stack()[1].filename)
# get_logger(caller_file).error(msg + " " + verbose)

warnings.warn(warning_msg)


def print_warning(msg):
"""
Implement own custom warning using warning module.
shared.print_error relies on this function.
See here for more on ANSI escape codes - https://en.wikipedia.org/wiki/ANSI_escape_code
Implement own custom warning using warning module. Prints in Orange.
:param msg: Str to print
:return: str
:return: None
"""
ansi_yellow_code = '\033[0;33m'
warning_msg = "{}Warning: {}".format(ansi_yellow_code, msg)

warnings.warn(warning_msg)


def get_logger(python_file):
"""
Create a basic logger to a log file
:param python_file: File that instantiates the logger instance
:return: logger
"""
base_py_file = os.path.basename(python_file)

# If already exists we don't try to recreate it
if base_py_file in logging.Logger.manager.loggerDict.keys():
return logging.getLogger(base_py_file)

logger = logging.getLogger(base_py_file)
logger.setLevel(logging.INFO)

fh = logging.FileHandler("hockey_scraper_errors_{}.log".format(datetime.now().strftime("%Y-%m-%dT%H:%M:%S")))
fh.setFormatter(logging.Formatter('%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s', datefmt='%Y-%m-%d %I:%M:%S'))
logger.addHandler(fh)

return logger


def log_error(err, py_file):
"""
Log error when Logging is specified
:param err: Error to log
:param python_file: File that instantiates the logger instance
:return: None
"""
if config.LOG:
get_logger(py_file).error(err)


def get_season(date):
"""
Get Season based on from_date
@@ -181,39 +208,6 @@ def convert_to_seconds(minutes):
return timedelta(hours=x.tm_hour, minutes=x.tm_min, seconds=x.tm_sec).total_seconds()


def scrape_page(url):
"""
Scrape a given url
:param url: url for page
:return: response object
"""
response = requests.Session()
retries = Retry(total=10, backoff_factor=.1)
response.mount('http://', HTTPAdapter(max_retries=retries))

try:
response = response.get(url, timeout=5)
response.raise_for_status()
page = response.text
except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError):
page = None
except requests.exceptions.ReadTimeout:
# If it times out and it's the schedule print an error message...otherwise just make the page = None
if "schedule" in url:
raise Exception("Timeout Error: The NHL API took too long to respond to our request. "
"Please Try Again (you may need to try a few times before it works). ")
else:
print_error("Timeout Error: The server took too long to respond to our request.")
page = None

# Pause for 1 second - make it more if you want
time.sleep(1)

return page


def if_rescrape(user_rescrape):
"""
If you want to re_scrape. If someone is a dumbass and feeds it a non-boolean it terminates the program
@@ -224,10 +218,8 @@ def if_rescrape(user_rescrape):
:return: None
"""
global re_scrape

if isinstance(user_rescrape, bool):
re_scrape = user_rescrape
config.RESCRAPE = user_rescrape
else:
raise ValueError("Error: 'if_rescrape' must be a boolean. Not a {}".format(type(user_rescrape)))

@@ -242,35 +234,67 @@ def add_dir(user_dir):
:return: None
"""
global docs_dir

# False so they don't want it
if not user_dir:
docs_dir = False
config.DOCS_DIR = False
return

# Something was given
# Either True or string to directory
# If boolean refer to the home directory
if isinstance(user_dir, bool):
docs_dir = os.path.join(os.path.expanduser('~'), "hockey_scraper_data")
config.DOCS_DIR = os.path.join(os.path.expanduser('~'), "hockey_scraper_data")
# Create if needed
if not os.path.isdir(docs_dir):
if not os.path.isdir(config.DOCS_DIR):
print_warning("Creating the hockey_scraper_data directory in the home directory")
os.mkdir(docs_dir)
os.mkdir(config.DOCS_DIR)
elif isinstance(user_dir, str) and os.path.isdir(user_dir):
docs_dir = user_dir
config.DOCS_DIR = user_dir
elif not (isinstance(user_dir, str) and isinstance(user_dir, bool)):
docs_dir = False
config.DOCS_DIR = False
print_error("The docs_dir argument provided is invalid")
else:
docs_dir = False
config.DOCS_DIR = False
print_error("The directory specified for the saving of scraped docs doesn't exist. Therefore:"
"\n1. All specified games will be scraped from their appropriate sources (NHL or ESPN)."
"\n2. All scraped files will NOT be saved at all. Please either create the directory you want them to be "
"deposited in or recheck the directory you typed in and start again.\n")
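
For reference, the reworked add_dir() accepts either a boolean or a path string, and if_rescrape() only takes a boolean. A hedged sketch of the accepted shapes (import path and directory name are placeholders):

from hockey_scraper.utils import shared

shared.add_dir(True)              # ~/hockey_scraper_data, created if it doesn't exist yet
shared.add_dir("/existing/dir")   # used as-is; an error is printed if the directory doesn't exist
shared.add_dir(False)             # scraped pages are not saved at all
shared.if_rescrape(True)          # always re-download, even when a saved copy exists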


def scrape_page(url):
"""
Scrape a given url
:param url: url for page
:return: response object
"""
response = requests.Session()
retries = Retry(total=10, backoff_factor=.1)
response.mount('http://', HTTPAdapter(max_retries=retries))

try:
response = response.get(url, timeout=5)
response.raise_for_status()
page = response.text
except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError):
page = None
except requests.exceptions.ReadTimeout:
# If it times out and it's the schedule print an error message...otherwise just make the page = None
if "schedule" in url:
raise Exception("Timeout Error: The NHL API took too long to respond to our request. "
"Please Try Again (you may need to try a few times before it works). ")
else:
print_error("Timeout Error: The server took too long to respond to our request.")
page = None

# Pause for 1 second - make it more if you want
time.sleep(1)

return page



def get_file(file_info, force=False):
"""
Get the specified file.
@@ -284,10 +308,10 @@ def get_file(file_info, force=False):
:return: page
"""
file_info['dir'] = docs_dir
file_info['dir'] = config.DOCS_DIR

# If everything checks out we'll retrieve it, otherwise we scrape it
if docs_dir and sp.check_file_exists(file_info) and not re_scrape and not force:
if file_info['dir'] and sp.check_file_exists(file_info) and not config.RESCRAPE and not force:
page = sp.get_page(file_info)
else:
page = scrape_page(file_info['url'])
@@ -338,6 +362,8 @@ def to_csv(base_file_name, df, league, file_type):
:return: None
"""
docs_dir = config.DOCS_DIR

# This was a late addition so we add support here
if isinstance(docs_dir, str) and not os.path.isdir(os.path.join(docs_dir, "csvs")):
os.mkdir(os.path.join(docs_dir, "csvs"))
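
The other addition to shared.py is the pair of logging helpers: get_logger() lazily creates one FileHandler-backed logger per calling file, and log_error() only writes when config.LOG is True (the commented-out block in print_error hints at the same idea). A hedged usage sketch, with the import path, game id, and message made up:

from hockey_scraper.utils import shared, config

config.LOG = True
shared.log_error("Error parsing Html pbp for game 2020020001", __file__)
# appends a tab-separated line to hockey_scraper_errors_<timestamp>.log in the working directory
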
2 changes: 1 addition & 1 deletion setup.py
@@ -7,7 +7,7 @@ def read():

setup(
name='hockey_scraper',
version='1.37.5',
version='1.37.6',
description="""Python Package for scraping NHL Play-by-Play and Shift data.""",
long_description=read(),
classifiers=[
