# 1. Imports

In [None]:
import pandas as pd
import json
import os
import copy
import time
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# 2. Loading data

## 2.1 Identify IDs for Liverpool games

In [None]:
# path = 'C:\\Users\\johnl\\Documents\\GitHub\\opendata\\data\\matches.json'

# with open(path) as f:
#     data = json.load(f)

# # print(json.dumps(data, indent=4, sort_keys=True))

# # Manually found that the Liverpool games have the ids 4039 and 2440

In [None]:
# Local folder holding one sub-folder per match.
path = 'C:\\Users\\johnl\\Documents\\GitHub\\opendata\\data\\matches'

# Sub-folder suffixes (leading backslash included) of the two matches used here.
id1, id2 = '\\4039', '\\2440'

## 2.2 Create player dict

In [None]:
# Load the match metadata. JSON text is UTF-8, so the encoding is stated
# explicitly: the platform default is locale-dependent (e.g. cp1252 on
# Windows) and can mis-decode non-ASCII characters in player names.
with open(path + id1 + '\\match_data.json', encoding='utf-8') as f:
    data = json.load(f)

# name_dict: trackable_object id -> "First Last" for every player in the match.
# lfc_dict:  the same mapping restricted to Liverpool players.
# lfc_list:  the Liverpool trackable_object ids, in file order.
name_dict = {}
lfc_dict = {}
lfc_list = []

# Liverpool is only identified (as the away team) for match 4039; for any
# other match the Liverpool containers are left empty.
is_lfc_away_match = data['id'] == 4039
if is_lfc_away_match:
    lfc_id = data['away_team']['id']

# Single pass over the players fills all three containers.
for player in data['players']:
    obj_id = player['trackable_object']
    full_name = player['first_name'] + ' ' + player['last_name']
    name_dict[obj_id] = full_name
    if is_lfc_away_match and player['team_id'] == lfc_id:
        lfc_dict[obj_id] = full_name
        lfc_list.append(obj_id)

# name_dict
lfc_dict
# lfc_list

## 2.3 Load and explore tracking data 

In [None]:
# Read the per-frame tracking records of the first match into `data`
# (this replaces the match metadata previously held under the same name).
tracking_file = path + id1 + '\\structured_data.json'
with open(tracking_file) as f:
    data = json.load(f)

In [None]:
# Time step between consecutive tracking frames, in seconds.
# Determined manually by inspecting the frame timestamps:
#     for frame in data:
#         print(frame['time'])
dt = 0.1

In [None]:
# Trackable-object ids of all players, in the insertion order of name_dict.
players = list(name_dict)
# cols = ['frame'] + players

In [None]:
#data[1000]['data'][0]

In [None]:
# Empty frame with one column per player, plus a quick size report.
df = pd.DataFrame(columns=players)
print(
    f'Number of frames: {len(data)}',
    f'Number of columns: {len(players)}',
    sep='\n',
)

In [None]:
# Turn the per-frame tracking records into a DataFrame with one row per frame
# and one column per trackable object; each cell is an (x, y) tuple, or None
# when a known player has no record in that frame.

# Template row: every known player starts out without a position.
new_frame = dict.fromkeys(players)

t0 = time.time()

li = []
for frame in data:
    # The template only holds None values, so a shallow copy is enough.
    curr_frame = dict(new_frame)
    for obj in frame['data']:
        x = obj['x']
        y = obj['y']
        try:
            obj_id = obj['trackable_object']
        except KeyError:
            # Record carries no 'trackable_object' id, so it cannot be
            # assigned to a column; skip it.
            continue
        curr_frame[obj_id] = (x, y)
    li.append(curr_frame)

# Build the frame once from the list of row dicts (much cheaper than
# appending row by row).
df = pd.DataFrame(li)
print(time.time()-t0)

# df.to_pickle(id1 +'_df')
# df = pd.read_pickle(file_name)

In [None]:
# col_list = []
# p_list = []
# for col in df.columns:
#     player_df = df[col]
#     p = len([i for i in player_df if i!= None])/len(player_df)
#     # print(col, p)
#     col_list.append(str(col))
#     p_list.append(p)

# plt.bar(x=col_list, height=p_list)

In [None]:
# Keep only the Liverpool columns, then discard every player who has no
# recorded position in any frame.
lfc_df = df[lfc_list]

n_frames = len(lfc_df)
never_seen = [
    col
    for col in lfc_df.columns
    if sum(pos is not None for pos in lfc_df[col]) / n_frames == 0.0
]
lfc_df = lfc_df.drop(columns=never_seen)

# Refresh the id list so it matches the remaining columns.
lfc_list = list(lfc_df.columns)

# lfc_df

In [None]:
# Per-player share of frames that contain a position, shown as a bar chart.
col_list = [str(col) for col in lfc_df.columns]
p_list = [
    sum(pos is not None for pos in lfc_df[col]) / len(lfc_df[col])
    for col in lfc_df.columns
]

plt.bar(x=col_list, height=p_list)

Clearly, for every player, more than half of the game is not covered by the broadcast data. This is a clear limitation. However, the most important parts of the game (i.e. those when a player is close to the ball) are likely to be the ones visible in the broadcast, so this may not be a huge problem. Players far from the ball are unlikely to run as quickly as those close to the ball.

**To investigate:**
* Ways to interpolate player positions

# 3. Computing velocities and accelerations

## 3.1 Compute velocities 

In [None]:
# Split every player's (x, y) tuples into two numeric columns named
# '<id>_x' and '<id>_y'; frames without a position become NaN in both.
coord_columns = {}
for col in lfc_df.columns:
    xs, ys = [], []
    for pos in lfc_df[col]:
        if pos is None:
            px = py = np.nan
        else:
            px, py = pos[0], pos[1]
        xs.append(px)
        ys.append(py)
    coord_columns[str(col)+'_x'] = xs
    coord_columns[str(col)+'_y'] = ys

lfc_cords = pd.DataFrame(coord_columns)

lfc_cords

In [None]:
# Row-to-row change of every coordinate column (difference to the previous
# row); the first row, and any row next to a missing value, is NaN.
lfc_cords_diff = lfc_cords.diff()

lfc_cords_diff

In [None]:
# Speed per player and frame: length of the frame-to-frame displacement
# divided by the time step dt.
# Speeds above this limit are treated as tracking errors and replaced by NaN.
# The value is Usain Bolt's top speed (presumably m/s, i.e. assumes the
# coordinates are in metres - TODO confirm against the data provider's docs).
MAX_PLAUSIBLE_SPEED = 12.5

vel_lfc_df = pd.DataFrame()

for i in lfc_list:
    dx = lfc_cords_diff[str(i)+'_x']
    dy = lfc_cords_diff[str(i)+'_y']
    v = np.sqrt(dx**2 + dy**2)/dt

    # Vectorised outlier removal. NaN compares False against the limit, so
    # frames without a speed simply stay NaN.
    vel_lfc_df[i] = v.mask(v > MAX_PLAUSIBLE_SPEED)

vel_lfc_df

In [None]:
# Coverage check repeated on the velocities, i.e. after outlier speeds have
# been dropped: share of frames with a non-missing speed per player.
coverage_after = vel_lfc_df.count()/len(vel_lfc_df)

p_list2 = list(coverage_after)
col_list2 = [str(name) for name in vel_lfc_df.columns]

plt.bar(x=col_list2, height=p_list2)

In [None]:
# Print, per player, how much coverage is lost between the first coverage
# figures (p_list) and the ones after removing outlier velocities (p_list2).
assert len(p_list) == len(p_list2)

for before, after in zip(p_list, p_list2):
    print(before-after)

## 3.2 Smoothing of velocities 

In [None]:
# TBD

## 3.3 Compute accelerations and decelerations 

In [None]:
# TBD

# 4. Decompose into speed and acceleration/deceleration zones

In [None]:
# TBD

# 5. Acceleration/Deceleration profiles

In [None]:
# TBD

# 6. Sequences of peak intensity

In [None]:
# TBD

# 7. Metabolic power

In [None]:
# TBD