# Data tidying and extraction (question 2)

This notebook presents the functions implemented to tidy the json data and put them in a dataframe format.

In [1]:
import pandas as pd
import json
from os import listdir
from os.path import isfile, join

In [2]:
def get_generalData(data):
    '''
    this function take as entry :
        - data : a json file data (corresponding to one match)
    it returns the general metadata of the match:
        - [id_game,season,dateTime,endDateTime,abstractGameState,team_away_name,team_home_name]
    '''
    id_game = data['gameData']['game']['pk']
    season = data['gameData']['game']['season']
    try :
        dateTime =  data['gameData']['datetime']['dateTime']
    except :
        print(f"dateTime not found for match {id_game}")
        dateTime = None
    try :
        endDateTime = data['gameData']['datetime']['endDateTime']
    except :
        #print(f"endDateTime not found for match {id_game}")
        endDateTime = None
    
    abstractGameState = data['gameData']['status']['abstractGameState']
    team_away_name = data['gameData']['teams']['away']['name']
    team_home_name = data['gameData']['teams']['home']['name']
    return [id_game,season,dateTime,endDateTime,abstractGameState,team_away_name,team_home_name]

In [3]:
def get_file_event_rows_data(data,type_season):
    ''' 
    this function take as entry :
        - data : a json file data (corresponding to one match of season defined by type_season)
        - type_season : the season in format 20XX20XX (for instance 20162017)
    It ouputs the a list of list, in which each row contains metadata of one event of the match (only shots and goals are considered)
    the metadata the following :
        -  columns_name = [
            "event_Idx","period", "periodTime","id_team_that_shot","name_team_that_shot","result_event","x_coord","y_coord",\
                "rinkSide_of_the_team_that_shot","goalie_name",\
                "shooter_name","shot_type","empty_net","strength",\
                "type_season","id_game","season","dateTime","endDateTime","abstractGameState","team_away_name","team_home_name"]
    '''
    match_data = get_generalData(data)
    team_away_name,team_home_name = match_data[-2],match_data[-1]
    match_events_list = []
    for item in data['liveData']['plays']['allPlays']:
        if item['result']['event'] not in ["Goal", "Shot"]:
            continue
        else :
            # event Idx
            event_Idx = item['about']['eventIdx']

            # period
            period = item['about']['period']

            # periodTime
            periodTime = item['about']['periodTime']
            
            # team information (which team shot)
            id_team_that_shot = item['team']['id'] 
            name_team_that_shot = item['team']['name'] 

            # indicator if its a shot or a goal
            result_event = item['result']['event']
            
            # the on ice coordinates
            try :
                x_coord = item['coordinates']['x'] 
                y_coord = item['coordinates']['y']
            except :
                #print(f"coordinates not found for match {match_data[-7]} and event {event_Idx}")
                x_coord = None
                y_coord = None

            # the rinkside of the the_team_that_shot
            try : 
                # sometimes we don't have the info on the rinksidee apparently
                if team_away_name == name_team_that_shot :
                    if int(period)%2==0:
                        # in case of prolongation even (knowing that the max prolongation in the history is 6)
                        rinkSide_of_the_team_that_shot = data['liveData']['linescore']['periods'][1]['away']['rinkSide']
                    else :
                         # in case of prolongation odd
                        rinkSide_of_the_team_that_shot = data['liveData']['linescore']['periods'][0]['away']['rinkSide']
                else :
                    if int(period)%2==0:
                        rinkSide_of_the_team_that_shot = data['liveData']['linescore']['periods'][1]['home']['rinkSide']
                    else : 
                        rinkSide_of_the_team_that_shot = data['liveData']['linescore']['periods'][0]['home']['rinkSide']
            except Exception as e :
                #print(e)
                #print(f"period not defined for match {match_data[-7]} and event {event_Idx} and period:{period}")
                rinkSide_of_the_team_that_shot = None


            # the shooter and goalie name
            goalie_name = None
            shooter_name = None
            for item_bis in item['players']:
                if item_bis['playerType']=="Goalie":
                    goalie_name = item_bis['player']["fullName"]
                elif item_bis['playerType'] in ["Shooter", "Scorer"]:
                    shooter_name = item_bis['player']["fullName"]
                else:
                    continue
            '''
            if goalie_name == None :
                print(f"goalie_name not found for match {match_data[-7]} and event {event_Idx}")
            if shooter_name == None :
                print(f"shooter_name  not found for match {match_data[-7]} and event {event_Idx}")
            '''
            # shot type
            try :
                shot_type = item['result']['secondaryType']
            except :
                # sometimes the secondary Type is not defined
                #print(f"shot_type not found for match {match_data[-7]} and event {event_Idx}")
                shot_type = None

            try : 
                # empty net
                empty_net = item['result']['emptyNet']
            except :
                empty_net = None

            
            # strength
            try :
                strength = item['result']['name']
            except :
                strength = None
            
            all_data = [event_Idx, period, periodTime, id_team_that_shot,name_team_that_shot,result_event,x_coord,y_coord,rinkSide_of_the_team_that_shot,goalie_name,shooter_name,shot_type,empty_net,strength]
            match_events_list.append(all_data+[type_season]+match_data)
    return match_events_list


 

In [4]:
# list all the json files extracted in datasets
dir_year = [ join("../datasets/raw/", d_y) for d_y in listdir("../datasets/raw/")]
dir_pl_reg = [(join(path, d_pl_reg),d_pl_reg)  for path in dir_year for d_pl_reg in listdir(path)  ]
fichiers = [(join(dir_path, f),d_pl_reg) \
    for (dir_path, d_pl_reg) in dir_pl_reg
    for f in listdir(dir_path) if isfile(join(dir_path, f))]

In [None]:
# extract all metedata 
all_list_data = []
for (file_name_path, type_season) in fichiers :
    with open(file_name_path,'r') as f:
        data = json.loads(f.read())
        
        if 'messageNumber' in data and data['messageNumber'] == 2:
            continue
            
        all_list_data = all_list_data + get_file_event_rows_data(data,type_season)

In [6]:
# transform into a dataframe
columns_name = [
"event_Idx","period", "periodTime","id_team_that_shot","name_team_that_shot","result_event","x_coord","y_coord","rinkSide_of_the_team_that_shot","goalie_name",\
        "shooter_name","shot_type","empty_net","strength",\
        "type_season","id_game","season","dateTime","endDateTime","abstractGameState","team_away_name","team_home_name"]
df = pd.DataFrame(all_list_data, columns=columns_name)

In [None]:
# save in datasets directory
df.to_csv("../datasets/tidy_data.csv", index=False)

In [7]:
df[df["rinkSide_of_the_team_that_shot"].isna()]

Unnamed: 0,event_Idx,period,periodTime,id_team_that_shot,name_team_that_shot,result_event,x_coord,y_coord,rinkSide_of_the_team_that_shot,goalie_name,...,empty_net,strength,type_season,id_game,season,dateTime,endDateTime,abstractGameState,team_away_name,team_home_name
82612,8,1,01:01,54,Vegas Golden Knights,Shot,-35.0,26.0,,Jonathan Quick,...,,,playoffs,2017030171,20172018,2018-04-12T02:00:00Z,2018-04-12T04:42:56Z,Final,Los Angeles Kings,Vegas Golden Knights
82613,19,1,03:11,54,Vegas Golden Knights,Shot,-19.0,-15.0,,Jonathan Quick,...,,,playoffs,2017030171,20172018,2018-04-12T02:00:00Z,2018-04-12T04:42:56Z,Final,Los Angeles Kings,Vegas Golden Knights
82614,21,1,03:23,54,Vegas Golden Knights,Goal,-34.0,-4.0,,Jonathan Quick,...,False,,playoffs,2017030171,20172018,2018-04-12T02:00:00Z,2018-04-12T04:42:56Z,Final,Los Angeles Kings,Vegas Golden Knights
82615,26,1,04:04,26,Los Angeles Kings,Shot,-40.0,-8.0,,Marc-Andre Fleury,...,,,playoffs,2017030171,20172018,2018-04-12T02:00:00Z,2018-04-12T04:42:56Z,Final,Los Angeles Kings,Vegas Golden Knights
82616,32,1,05:39,26,Los Angeles Kings,Shot,35.0,21.0,,Marc-Andre Fleury,...,,,playoffs,2017030171,20172018,2018-04-12T02:00:00Z,2018-04-12T04:42:56Z,Final,Los Angeles Kings,Vegas Golden Knights
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384943,315,3,15:06,19,St. Louis Blues,Shot,84.0,19.0,,Marc-Andre Fleury,...,,,regular_season,2020020866,20202021,2021-05-09T02:00:00Z,2021-05-09T04:42:37Z,Final,St. Louis Blues,Vegas Golden Knights
384944,319,3,15:19,19,St. Louis Blues,Shot,62.0,-10.0,,Marc-Andre Fleury,...,,,regular_season,2020020866,20202021,2021-05-09T02:00:00Z,2021-05-09T04:42:37Z,Final,St. Louis Blues,Vegas Golden Knights
384945,323,3,16:35,54,Vegas Golden Knights,Shot,85.0,7.0,,Ville Husso,...,,,regular_season,2020020866,20202021,2021-05-09T02:00:00Z,2021-05-09T04:42:37Z,Final,St. Louis Blues,Vegas Golden Knights
384946,324,3,16:55,19,St. Louis Blues,Shot,55.0,-27.0,,Marc-Andre Fleury,...,,,regular_season,2020020866,20202021,2021-05-09T02:00:00Z,2021-05-09T04:42:37Z,Final,St. Louis Blues,Vegas Golden Knights


In [8]:
df[df.goalie_name.isna()]

Unnamed: 0,event_Idx,period,periodTime,id_team_that_shot,name_team_that_shot,result_event,x_coord,y_coord,rinkSide_of_the_team_that_shot,goalie_name,...,empty_net,strength,type_season,id_game,season,dateTime,endDateTime,abstractGameState,team_away_name,team_home_name
60,383,3,18:50,3,New York Rangers,Goal,69.0,-35.0,left,,...,True,,playoffs,2016030111,20162017,2017-04-12T23:00:00Z,2017-04-13T01:41:46Z,Final,New York Rangers,Montréal Canadiens
386,351,3,19:42,3,New York Rangers,Goal,69.0,1.0,right,,...,True,,playoffs,2016030116,20162017,2017-04-23T00:00:00Z,2017-04-23T02:47:39Z,Final,Montréal Canadiens,New York Rangers
1295,343,3,19:14,5,Pittsburgh Penguins,Goal,76.0,0.0,left,,...,True,,playoffs,2016030142,20162017,2017-04-14T23:00:00Z,2017-04-15T01:43:08Z,Final,Columbus Blue Jackets,Pittsburgh Penguins
1776,301,3,18:12,18,Nashville Predators,Goal,78.0,-1.0,left,,...,True,,playoffs,2016030154,20162017,2017-04-21T00:00:00Z,2017-04-21T02:41:51Z,Final,Chicago Blackhawks,Nashville Predators
1973,346,3,18:49,19,St. Louis Blues,Goal,20.0,13.0,left,,...,True,,playoffs,2016030163,20162017,2017-04-16T19:00:00Z,2017-04-16T22:05:56Z,Final,Minnesota Wild,St. Louis Blues
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384189,279,3,19:59,53,Arizona Coyotes,Goal,-65.0,-5.0,right,,...,True,,regular_season,2020020853,20202021,2021-05-08T02:30:00Z,2021-05-08T05:12:20Z,Final,Arizona Coyotes,San Jose Sharks
384684,264,3,18:57,18,Nashville Predators,Goal,-51.0,1.0,left,,...,True,,regular_season,2020020861,20202021,2021-05-09T00:00:00Z,2021-05-09T02:30:20Z,Final,Carolina Hurricanes,Nashville Predators
384786,283,3,19:54,9,Ottawa Senators,Goal,79.0,0.0,left,,...,True,,regular_season,2020020863,20202021,2021-05-08T23:00:00Z,2021-05-09T01:31:18Z,Final,Ottawa Senators,Winnipeg Jets
384850,271,3,19:41,23,Vancouver Canucks,Goal,54.0,12.0,right,,...,True,,regular_season,2020020864,20202021,2021-05-18T20:00:00Z,2021-05-18T22:39:15Z,Final,Calgary Flames,Vancouver Canucks
