
Commit

Fix issue when no players listed in html description of block. Also moved some stuff to a new config file
HarryShomer committed Mar 6, 2021
1 parent e0fbdb6 commit b8c4acc
Showing 11 changed files with 145 additions and 99 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -5,6 +5,7 @@ tests.py
notes.txt
build/
.pytest_cache
update_season_data.py

# Byte-compiled / optimized / DLL files
__pycache__/
Binary file removed hockey_scraper/.DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion hockey_scraper/nhl/game_scraper.py
@@ -165,7 +165,7 @@ def combine_html_json_pbp(json_df, html_df, game_id, date):
# This is always done - because merge doesn't work well with shootouts
game_df = game_df.drop_duplicates(subset=['Period', 'Event', 'Description', 'Seconds_Elapsed'])
except Exception as e:
shared.print_error('Problem combining Html Json pbp for game {}'.format(game_id, e))
shared.print_error('Problem combining Html Json pbp for game {}'.format(game_id))
return

game_df['Game_Id'] = game_id[-5:]
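
A note on the game_scraper.py tweak above: str.format() silently ignores surplus positional arguments, so the old call never actually printed the exception text, and the new call simply drops the unused argument. A quick illustration (the game id is made up):

'Problem combining Html Json pbp for game {}'.format(2020020001, ValueError("boom"))
# -> 'Problem combining Html Json pbp for game 2020020001'  (the extra argument is silently ignored)
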
2 changes: 1 addition & 1 deletion hockey_scraper/nhl/pbp/espn_pbp.py
@@ -195,7 +195,7 @@ def parse_espn(espn_xml):

try:
tree = etree.fromstring(espn_xml)
except etree.ParseError:
except etree.ParseError as e:
shared.print_error("Espn pbp isn't valid xml, therefore coordinates can't be obtained for this game")
return pd.DataFrame([], columns=columns)

21 changes: 12 additions & 9 deletions hockey_scraper/nhl/pbp/html_pbp.py
@@ -538,14 +538,17 @@ def parse_block(description, players, home_team):
regex = re.compile(r'(.{3})\s+#(\d+)')
desc = regex.findall(description) # [[Team, num], [Team, num]]

p1 = get_player_name(desc[len(desc) - 1][1], players, desc[len(desc) - 1][0], home_team)
event_info['p1_name'] = p1['name']
event_info['p1_ID'] = p1['id']
if len(desc) == 0:
event_info['p1_name'] = event_info['p2_name'] = event_info['p1_ID'] = event_info['p2_ID'] = None
else:
p1 = get_player_name(desc[len(desc) - 1][1], players, desc[len(desc) - 1][0], home_team)
event_info['p1_name'] = p1['name']
event_info['p1_ID'] = p1['id']

if len(desc) > 1:
p2 = get_player_name(desc[0][1], players, desc[0][0], home_team)
event_info['p2_name'] = p2['name']
event_info['p2_ID'] = p2['id']
if len(desc) > 1:
p2 = get_player_name(desc[0][1], players, desc[0][0], home_team)
event_info['p2_name'] = p2['name']
event_info['p2_ID'] = p2['id']

return event_info

@@ -805,8 +808,8 @@ def scrape_pbp(game_html, game_id, players, teams):
try:
game_df = parse_html(cleaned_html, players, teams)
except Exception as e:
shared.print_error('Error parsing Html pbp for game {} {}'.format(game_id, e))
return None
shared.print_error('Error parsing Html pbp for game {} {}'.format(game_id, e))
return None

# These sometimes end up as objects
game_df.Period = game_df.Period.astype(int)
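
The parse_block change above is the core of the fix: the block description is scanned with re.compile(r'(.{3})\s+#(\d+)'), and when no players are listed findall() returns an empty list, so the old desc[len(desc) - 1] lookup raised an IndexError. The new branch fills the player fields with None instead. A minimal sketch of both cases (the sample descriptions are hypothetical):

import re

regex = re.compile(r'(.{3})\s+#(\d+)')

regex.findall("MTL #6 BLOCKED BY TOR #44")   # [('MTL', '6'), ('TOR', '44')] -> players looked up as before
regex.findall("BLOCKED BY OTHER")            # [] -> p1/p2 name and ID now default to None instead of raising
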
14 changes: 14 additions & 0 deletions hockey_scraper/utils/config.py
@@ -0,0 +1,14 @@
"""
Basic configurations
"""

# Directory where to save pages
# When True assumes ~/hockey_scraper_data
# Otherwise can take str to `existing` directory
DOCS_DIR = False

# Boolean that tells us whether or not we should re-scrape a given page if it's already saved
RESCRAPE = False

# Whether to log verbose errors to log file
LOG = False
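
The new config.py is just a module-level settings holder whose values are read at run time by shared.py below (config.DOCS_DIR, config.RESCRAPE, config.LOG). A rough sketch of how that looks, assuming the usual package layout; in practice DOCS_DIR and RESCRAPE are set through shared.add_dir() and shared.if_rescrape() rather than directly:

from hockey_scraper.utils import config

config.DOCS_DIR = True    # save scraped pages under ~/hockey_scraper_data
config.RESCRAPE = False   # reuse already-saved pages instead of re-downloading them
config.LOG = True         # also write errors to a log file (see log_error further down)
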
3 changes: 2 additions & 1 deletion hockey_scraper/utils/player_name_fixes.json
@@ -274,6 +274,7 @@
"ALEXANDER NYLANDER": "ALEX NYLANDER",
"CHRISTOPHER WAGNER": "CHRIS WAGNER",
"EGOR SHARANGOVICH": "Yegor Sharangovich",
"ALEXIS LAFRENI?RE": "Alexis Lafrenière"
"ALEXIS LAFRENI?RE": "Alexis Lafrenière",
"CALLAN FOOTE": "Cal Foote"
}
}
2 changes: 2 additions & 0 deletions hockey_scraper/utils/save_pages.py
@@ -41,6 +41,8 @@ def create_dir_structure(dir_name):
Create the basic directory structure for docs_dir if not done yet.
Creates the docs and csvs subdir if it doesn't exist
:param dir_name: Name of dir to create
:return None
"""
if not os.path.isdir(os.path.join(dir_name, 'docs')):
166 changes: 96 additions & 70 deletions hockey_scraper/utils/shared.py
@@ -1,33 +1,26 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This file is a bunch of the shared functions or just general stuff used by the different scrapers in the package.
"""

import os
import time
import json
import logging
import warnings
import requests
from datetime import datetime, timedelta
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from . import save_pages as sp

# Directory where to save pages
docs_dir = False

# Boolean that tells us whether or not we should re-scrape a given page if it's already saved
re_scrape = False
from . import config
import inspect

# Directory where this file lives
FILE_DIR = os.path.dirname(os.path.realpath(__file__))


# Name and Team fixes used
with open(os.path.join(FILE_DIR, "player_name_fixes.json"), "r") as f:
Names = json.load(f)['fixes']


with open(os.path.join(FILE_DIR, "team_tri_codes.json"), "r") as f:
TEAMS = json.load(f)['teams']

@@ -63,41 +56,75 @@ def custom_formatwarning(msg, *args, **kwargs):

def print_error(msg):
"""
Implement own custom error using warning module.
Reason why i still use warning for errors is so i can set to ignore them if i want to (e.g. live_scrape line 200).
shared.print_error relies on this function.
Implement own custom error using warning module. Prints in red
See here for more on ANSI escape codes - https://en.wikipedia.org/wiki/ANSI_escape_code
Reason why i still use warning for errors is so i can set to ignore them if i want to (e.g. live_scrape line 200).
:param msg: Str to print
:return: str
:return: None
"""
ansi_red_code = '\033[0;31m'
warning_msg = "{}Error: {}".format(ansi_red_code, msg)

# if config.LOG:
# caller_file = os.path.basename(inspect.stack()[1].filename)
# get_logger(caller_file).error(msg + " " + verbose)

warnings.warn(warning_msg)


def print_warning(msg):
"""
Implement own custom warning using warning module.
shared.print_error relies on this function.
See here for more on ANSI escape codes - https://en.wikipedia.org/wiki/ANSI_escape_code
Implement own custom warning using warning module. Prints in Orange.
:param msg: Str to print
:return: str
:return: None
"""
ansi_yellow_code = '\033[0;33m'
warning_msg = "{}Warning: {}".format(ansi_yellow_code, msg)

warnings.warn(warning_msg)


def get_logger(python_file):
"""
Create a basic logger to a log file
:param python_file: File that instantiates the logger instance
:return: logger
"""
base_py_file = os.path.basename(python_file)

# If already exists we don't try to recreate it
if base_py_file in logging.Logger.manager.loggerDict.keys():
return logging.getLogger(base_py_file)

logger = logging.getLogger(base_py_file)
logger.setLevel(logging.INFO)

fh = logging.FileHandler("hockey_scraper_errors_{}.log".format(datetime.now().strftime("%Y-%m-%dT%H:%M:%S")))
fh.setFormatter(logging.Formatter('%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s', datefmt='%Y-%m-%d %I:%M:%S'))
logger.addHandler(fh)

return logger


def log_error(err, py_file):
"""
Log error when Logging is specified
:param err: Error to log
:param python_file: File that instantiates the logger instance
:return: None
"""
if config.LOG:
get_logger(py_file).error(err)


def get_season(date):
"""
Get Season based on from_date
@@ -181,39 +208,6 @@ def convert_to_seconds(minutes):
return timedelta(hours=x.tm_hour, minutes=x.tm_min, seconds=x.tm_sec).total_seconds()


def scrape_page(url):
"""
Scrape a given url
:param url: url for page
:return: response object
"""
response = requests.Session()
retries = Retry(total=10, backoff_factor=.1)
response.mount('http://', HTTPAdapter(max_retries=retries))

try:
response = response.get(url, timeout=5)
response.raise_for_status()
page = response.text
except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError):
page = None
except requests.exceptions.ReadTimeout:
# If it times out and it's the schedule print an error message...otherwise just make the page = None
if "schedule" in url:
raise Exception("Timeout Error: The NHL API took too long to respond to our request. "
"Please Try Again (you may need to try a few times before it works). ")
else:
print_error("Timeout Error: The server took too long to respond to our request.")
page = None

# Pause for 1 second - make it more if you want
time.sleep(1)

return page


def if_rescrape(user_rescrape):
"""
If you want to re_scrape. If someone is a dumbass and feeds it a non-boolean it terminates the program
@@ -224,10 +218,8 @@ def if_rescrape(user_rescrape):
:return: None
"""
global re_scrape

if isinstance(user_rescrape, bool):
re_scrape = user_rescrape
config.RESCRAPE = user_rescrape
else:
raise ValueError("Error: 'if_rescrape' must be a boolean. Not a {}".format(type(user_rescrape)))

@@ -242,35 +234,67 @@ def add_dir(user_dir):
:return: None
"""
global docs_dir

# False so they don't want it
if not user_dir:
docs_dir = False
config.DOCS_DIR = False
return

# Something was given
# Either True or string to directory
# If boolean refer to the home directory
if isinstance(user_dir, bool):
docs_dir = os.path.join(os.path.expanduser('~'), "hockey_scraper_data")
config.DOCS_DIR = os.path.join(os.path.expanduser('~'), "hockey_scraper_data")
# Create if needed
if not os.path.isdir(docs_dir):
if not os.path.isdir(config.DOCS_DIR):
print_warning("Creating the hockey_scraper_data directory in the home directory")
os.mkdir(docs_dir)
os.mkdir(config.DOCS_DIR)
elif isinstance(user_dir, str) and os.path.isdir(user_dir):
docs_dir = user_dir
config.DOCS_DIR = user_dir
elif not (isinstance(user_dir, str) and isinstance(user_dir, bool)):
docs_dir = False
config.DOCS_DIR = False
print_error("The docs_dir argument provided is invalid")
else:
docs_dir = False
config.DOCS_DIR = False
print_error("The directory specified for the saving of scraped docs doesn't exist. Therefore:"
"\n1. All specified games will be scraped from their appropriate sources (NHL or ESPN)."
"\n2. All scraped files will NOT be saved at all. Please either create the directory you want them to be "
"deposited in or recheck the directory you typed in and start again.\n")
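
For reference, the reworked add_dir() accepts either a boolean or a path string, and if_rescrape() only takes a boolean. A hedged sketch of the accepted shapes (import path and directory name are placeholders):

from hockey_scraper.utils import shared

shared.add_dir(True)              # ~/hockey_scraper_data, created if it doesn't exist yet
shared.add_dir("/existing/dir")   # used as-is; an error is printed if the directory doesn't exist
shared.add_dir(False)             # scraped pages are not saved at all
shared.if_rescrape(True)          # always re-download, even when a saved copy exists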


def scrape_page(url):
"""
Scrape a given url
:param url: url for page
:return: response object
"""
response = requests.Session()
retries = Retry(total=10, backoff_factor=.1)
response.mount('http://', HTTPAdapter(max_retries=retries))

try:
response = response.get(url, timeout=5)
response.raise_for_status()
page = response.text
except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError):
page = None
except requests.exceptions.ReadTimeout:
# If it times out and it's the schedule print an error message...otherwise just make the page = None
if "schedule" in url:
raise Exception("Timeout Error: The NHL API took too long to respond to our request. "
"Please Try Again (you may need to try a few times before it works). ")
else:
print_error("Timeout Error: The server took too long to respond to our request.")
page = None

# Pause for 1 second - make it more if you want
time.sleep(1)

return page



def get_file(file_info, force=False):
"""
Get the specified file.
@@ -284,10 +308,10 @@ def get_file(file_info, force=False):
:return: page
"""
file_info['dir'] = docs_dir
file_info['dir'] = config.DOCS_DIR

# If everything checks out we'll retrieve it, otherwise we scrape it
if docs_dir and sp.check_file_exists(file_info) and not re_scrape and not force:
if file_info['dir'] and sp.check_file_exists(file_info) and not config.RESCRAPE and not force:
page = sp.get_page(file_info)
else:
page = scrape_page(file_info['url'])
@@ -338,6 +362,8 @@ def to_csv(base_file_name, df, league, file_type):
:return: None
"""
docs_dir = config.DOCS_DIR

# This was a late addition so we add support here
if isinstance(docs_dir, str) and not os.path.isdir(os.path.join(docs_dir, "csvs")):
os.mkdir(os.path.join(docs_dir, "csvs"))
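
The other addition to shared.py is the pair of logging helpers: get_logger() lazily creates one FileHandler-backed logger per calling file, and log_error() only writes when config.LOG is True (the commented-out block in print_error hints at the same idea). A hedged usage sketch, with the import path, game id, and message made up:

from hockey_scraper.utils import shared, config

config.LOG = True
shared.log_error("Error parsing Html pbp for game 2020020001", __file__)
# appends a tab-separated line to hockey_scraper_errors_<timestamp>.log in the working directory
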
2 changes: 1 addition & 1 deletion setup.py
@@ -7,7 +7,7 @@ def read():

setup(
name='hockey_scraper',
version='1.37.5',
version='1.37.6',
description="""Python Package for scraping NHL Play-by-Play and Shift data.""",
long_description=read(),
classifiers=[
