# Notebook for data acquisition functions


#### The first function takes the target year and a file path as arguments, downloads the data for that season, and stores it under the given path

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
#Imports
import pandas as pd
import requests
import os
from tqdm.notebook import tqdm
import pickle
import sys

In [4]:

def get_season_data(year, file_path):
    """
    Download every game of one season from the Hockey API and cache the result as a pickle.

    The season is cached in ``<file_path>/PICKLE/<year>.pkl``. If that file already
    exists it is loaded and returned without any network access. Otherwise every
    candidate game ID is requested, the games answering with HTTP 200 are collected,
    and the resulting list is written to the cache file.

    :param year: starting year of the season (example 2017 will download Season 2017-2018)
    :type year: int
    :param file_path: directory under which the ``PICKLE`` cache folder is created
    :type file_path: str

    :rtype: list[dict]
    :return: one dict (the decoded JSON of the game's live feed) per game found in the season

    :raises requests.exceptions.RequestException: if a request fails or times out
    """
    # Exclusive upper bound on the game numbers probed for each season type.
    # NOTE(review): a season type with more than 1299 games would be truncated --
    # confirm this bound before using the function for other seasons.
    MAX_GAMES = 1300
    # Seconds to wait for the server before giving up on a single request.
    REQUEST_TIMEOUT = 30

    # os.path.join avoids the doubled separators of manual concatenation and keeps
    # an empty file_path relative instead of resolving to "/PICKLE/".
    directory = os.path.join(file_path, "PICKLE")
    path = os.path.join(directory, f"{year}.pkl")
    os.makedirs(directory, exist_ok=True)

    if os.path.isfile(path):
        # SECURITY: pickle.load can execute arbitrary code. Only load cache files
        # that were produced by this function, never files from an untrusted source.
        with open(path, 'rb') as f:
            return pickle.load(f)

    # Two-digit season-type codes embedded in the game ID.
    # Presumably preseason / regular season / playoffs / all-star -- confirm against the API docs.
    season_types = ["01", "02", "03", "04"]
    games_list = []

    # A single Session reuses the underlying connection across the thousands of requests.
    with requests.Session() as session:
        for season_type in season_types:
            for g in tqdm(range(1, MAX_GAMES), desc=f"{year} / type {season_type}"):
                game_number = str(g).zfill(4)
                game_id = f"{year}{season_type}{game_number}"
                r = session.get(
                    f"https://statsapi.web.nhl.com/api/v1/game/{game_id}/feed/live/",
                    timeout=REQUEST_TIMEOUT,
                )
                # IDs without a game do not answer 200 and are skipped; we do not stop
                # at the first miss because IDs are not guaranteed to be contiguous.
                if r.status_code == 200:
                    games_list.append(r.json())

    # Do not cache an empty result: it would be silently reused by every later call
    # even though it most likely means the API was unreachable.
    if games_list:
        # Write to a temporary file and rename it, so an interrupted dump can never
        # leave a truncated pickle at the real cache path.
        tmp_path = f"{path}.tmp"
        with open(tmp_path, 'wb') as f:
            pickle.dump(games_list, f)
        os.replace(tmp_path, path)

    return games_list
                    
                    
                    # with open(PATH, 'wb') as f:
                    #     f.write(r.content)  
            #else:
               # print("File already exists")

In [5]:
# Download each season from 2016-17 through 2020-21, or load it from the pickle
# cache when it is already on disk. One list of game dicts per season.
season2016, season2017, season2018, season2019, season2020 = [
    get_season_data(season_year, "../ift6758/data/")
    for season_year in (2016, 2017, 2018, 2019, 2020)
]


In [8]:
from ift6758.data.data_acquisition import Season

# Load the 2016 season again, this time through the packaged Season class.
Season2016 = Season(2016, "../ift6758/data/")
season2016_data = Season2016.get_season_data()

# Sanity check: the class must return the same data as the notebook function.
print(season2016 == season2016_data)

File already Exists, loading from ../ift6758/data//PICKLE//2016.pkl
Len of games_list in 2016 is 1441
True


In [16]:
# a=Season2016+Season2016

In [17]:
# len(a)

2882

## According to the API documentation, the maximum number of games per season is less than 1300. To be safe, we use 1300 as the upper bound on the game IDs to probe and keep every ID for which the API returns a game. We could have stopped at the first 404 response (for efficiency), but we assumed that some game IDs might be missing from the data. Since each season is downloaded only once, time efficiency is not a concern for now.