In [1]:
#packages
import pandas as pd
import numpy as np
import json
import math

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#import data
# Load the shot table used by the cells below; they read its 'location',
# 'freeze_frame', 'body_part' and 'type' columns.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
data=pd.read_pickle('data/clean_df.pickle')

In [3]:
#euclidean distance between two points
def distance(x, y):
    """Return the Euclidean distance between two 2-D points.

    Parameters
    ----------
    x, y : indexable
        Points whose first two elements are read as the two coordinates.

    Returns
    -------
    numpy floating scalar
        Length of the straight segment joining ``x`` and ``y``.
    """
    delta_first = y[0] - x[0]
    delta_second = y[1] - x[1]
    return np.sqrt(delta_first**2 + delta_second**2)

In [4]:
#shot angle: angle at the shooting location between the two straight lines connecting that location to each goal post
def shotAngle(x):
    """Return the angle (in radians) under which the goal mouth is seen from ``x``.

    The angle is the one located at the shot position in the triangle formed
    by that position and the two goal posts. It is obtained with the law of
    cosines (Al-Kashi theorem).

    Parameters
    ----------
    x : indexable
        Shot location; ``x[0]`` and ``x[1]`` are read as the pitch coordinates
        (Statsbomb dimensions, in yards).

    Returns
    -------
    float
        Angle in radians, in the range [0, pi]. ``0.0`` is returned when the
        shot location coincides with one of the posts, where the angle is
        undefined.
    """
    near_post_coor = (120, 36)  # 1st post coordinates based on Statsbomb dimensions (in yards)
    far_post_coor = (120, 44)   # 2nd post coordinates based on Statsbomb dimensions (in yards)
    goal_width = far_post_coor[1] - near_post_coor[1]  # length of the goal line between the posts

    # Distances from the shot location to each post, computed locally so the
    # function does not depend on a helper defined in another cell.
    near_post_dist = math.hypot(near_post_coor[0] - x[0], near_post_coor[1] - x[1])
    far_post_dist = math.hypot(far_post_coor[0] - x[0], far_post_coor[1] - x[1])

    # Standing exactly on a post makes the triangle degenerate and would
    # divide by zero below.
    if near_post_dist == 0 or far_post_dist == 0:
        return 0.0

    cos_angle = (near_post_dist**2 + far_post_dist**2 - goal_width**2) / (2 * near_post_dist * far_post_dist)

    # Rounding can push the cosine marginally outside [-1, 1]; clamp it so
    # that acos stays defined. A cosine of -1 (shot taken on the goal line
    # between the posts) correctly maps to pi, and +1 maps to 0.
    cos_angle = max(-1.0, min(1.0, cos_angle))
    return math.acos(cos_angle)  # shot angle in radians

In [5]:
#determine whether a player is inside the shooting angle
#A player is inside the shooting angle if the sum of the areas of the 3 triangles
#formed by the position of the player and each pair of vertices (shooting point, 1st post, 2nd post) is equal to
#the area of the triangle formed by these same three vertices
def isInTriangle(p, a, b, c):
    """Tell whether point ``p`` lies inside (or on the border of) triangle ``abc``.

    ``p`` is considered inside when the areas of the three sub-triangles
    ``abp``, ``acp`` and ``bcp`` add up to the area of ``abc``, within an
    absolute tolerance of 0.01.

    Parameters
    ----------
    p : indexable
        Point to test; elements 0 and 1 are read as its coordinates.
    a, b, c : indexable
        Triangle vertices, read the same way.

    Returns
    -------
    bool
        ``True`` when the area test succeeds, ``False`` otherwise.
    """
    def _area(u, v, w):
        # Triangle area from the cross product of two edge vectors. Unlike
        # Heron's formula this needs no square root, so rounding cannot make
        # the result NaN for flat (near-degenerate) triangles.
        return abs((v[0] - u[0]) * (w[1] - u[1]) - (w[0] - u[0]) * (v[1] - u[1])) / 2

    s_abc = _area(a, b, c)
    s_abp = _area(a, b, p)
    s_acp = _area(a, c, p)
    s_bcp = _area(b, c, p)

    # Always hand back a real boolean (never None), whatever numeric type the
    # coordinates have.
    return bool(abs(s_abc - (s_abp + s_acp + s_bcp)) < 0.01)

In [6]:
#define number of players in the shooting angle
def nbPlayersTriangle(location, player_list):
    """Count the players standing in the triangle shot location / two posts.

    Parameters
    ----------
    location : iterable
        Shot location; converted to a tuple before being passed to
        ``isInTriangle``.
    player_list : iterable of mappings or None
        Each entry must expose a ``'location'`` key. ``None`` means that no
        player information is available.

    Returns
    -------
    int
        Number of entries for which ``isInTriangle`` returns ``True`` with the
        posts at (120, 36) and (120, 44); 0 when ``player_list`` is ``None``.
    """
    if player_list is None:
        return 0

    return sum(
        1
        for entry in player_list
        if isInTriangle(entry['location'], tuple(location), (120,36), (120,44)) == True
    )

In [7]:
#distance between the shooting point and the middle of the goal line
def shotDistance(x, middle_pitch):
    """Return the straight-line distance from ``x`` to ``middle_pitch``.

    Parameters
    ----------
    x : indexable
        Shot location; elements 0 and 1 are read as its coordinates.
    middle_pitch : indexable
        Target point, read the same way. Despite its name it can be any
        reference point (``shotAngle`` passes the post coordinates).

    Returns
    -------
    numpy floating scalar
        Euclidean distance between the two points.
    """
    gap_along = middle_pitch[0] - x[0]
    gap_across = middle_pitch[1] - x[1]
    squared_length = gap_along**2 + gap_across**2
    return np.sqrt(squared_length)

In [8]:
# Middle of the goal line, used as the reference point for the shot distance
middle_pitch = (120, 40)

# --- New features -------------------------------------------------------
# distance from the shot location to the middle of the goal line
data['distance'] = data['location'].apply(
    lambda shot_loc: shotDistance(shot_loc, middle_pitch)
)
# angle returned by shotAngle for the shot location
data['angle'] = data['location'].apply(shotAngle)
# number of freeze-frame players counted by nbPlayersTriangle for each shot
data['nb_players'] = data.apply(
    lambda row: nbPlayersTriangle(row['location'], row['freeze_frame']),
    axis=1,
)
# 'Right Foot' and 'Left Foot' are merged into 'Foot'; other values are kept as they are
data['body_part'] = data['body_part'].apply(
    lambda part: 'Foot' if (part == 'Right Foot') or (part == 'Left Foot') else part
)

# The raw columns are no longer needed once the features above are computed
data = data.drop(columns=['location', 'freeze_frame'])



In [9]:
# Show how many shots there are per type, then discard the 'Corner' rows
# (the counts printed below show a single such row).
print(data['type'].value_counts())
mask = data['type'].ne('Corner')
data = data.loc[mask]

Open Play    10252
Free Kick      858
Penalty        140
Corner           1
Name: type, dtype: int64


In [10]:
# Preview the first five rows of the feature table
data.head(n=5)

Unnamed: 0,under_pressure,xg,first_time,open_goal,technique,body_part,type,outcome,distance,angle,nb_players
333,True,0.481119,True,False,Normal,Foot,Open Play,False,10.231813,0.708496,1
337,False,0.053685,True,False,Volley,Foot,Open Play,False,9.881295,0.516821,2
376,False,0.080306,False,False,Normal,Foot,Free Kick,False,26.057245,0.289665,4
424,False,0.025205,False,False,Normal,Foot,Open Play,False,27.30293,0.181144,1
428,False,0.039715,False,False,Normal,Foot,Free Kick,False,29.343653,0.233293,4


In [11]:
#save data
# Write the table, with the features computed in this notebook, to a pickle
# file under data/.
data.to_pickle('data/feature_eng__df.pickle')