In [2]:
import pandas as pd
import numpy as np
import math
import statsmodels.formula.api as smf
from os import path

# Folder that holds the CSV inputs
DATA_DIR = '../resources/code-soccer-files-main/data'

shots_csv = path.join(DATA_DIR, 'shots.csv')
players_csv = path.join(DATA_DIR, 'players.csv')
dfs = pd.read_csv(shots_csv)
dfp = pd.read_csv(players_csv)

# Derived columns used by the regressions below:
# integer outcome, header indicator, and squared distance.
dfs['goal'] = dfs['goal'].astype(int)
dfs['header'] = dfs['foot'].eq('head/body')
dfs['dist_m_sq'] = dfs['dist_m'].pow(2)

In [3]:
# holding things constant
# Baseline: regress goal on the header indicator alone, with no controls yet.
model = smf.ols(formula=
        """
        goal ~ header
        """, data=dfs)
results = model.fit()
# Only the last expression of a cell is rendered, so the summary has to be
# printed explicitly; otherwise it is computed and silently discarded.
print(results.summary2())

# Mean shot distance for headers vs. other shots - the variable the next
# model adds as a control.
dfs.groupby('header')['dist_m'].mean()

header
False    19.595073
True     10.639813
Name: dist_m, dtype: float64

In [4]:
# adding distance as a control alongside the header indicator
formula_with_distance = """
        goal ~ header + dist_m
        """
model = smf.ols(formula=formula_with_distance, data=dfs)
results = model.fit()
results.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.05
Dependent Variable:,goal,AIC:,392.6176
Date:,2023-09-19 12:46,BIC:,408.2765
No. Observations:,1366,Log-Likelihood:,-193.31
Df Model:,2,F-statistic:,37.12
Df Residuals:,1363,Prob (F-statistic):,2.01e-16
R-squared:,0.052,Scale:,0.077875

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,0.2208,0.0191,11.5668,0.0000,0.1834,0.2583
header[T.True],-0.0069,0.0218,-0.3176,0.7509,-0.0497,0.0359
dist_m,-0.0072,0.0009,-8.1456,0.0000,-0.0089,-0.0054

0,1,2,3
Omnibus:,721.248,Durbin-Watson:,1.985
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3281.097
Skew:,2.644,Prob(JB):,0.0
Kurtosis:,8.449,Condition No.:,68.0


In [6]:
# Fixed effects
# Preview of the one-column-per-category dummy encoding of `foot`.
# Printed explicitly: only the last expression of a cell is rendered, so a
# bare expression here would be computed and thrown away.
print(pd.get_dummies(dfs['foot']).head())

# C(foot) tells the formula interface to treat `foot` as categorical.
model = smf.ols(formula="goal ~ C(foot) + dist_m + dist_m_sq", data=dfs)
results = model.fit()
print(results.summary2())

# Number of shots in each `foot` category
dfs['foot'].value_counts()

                 Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.069   
Dependent Variable: goal             AIC:                367.9501
Date:               2023-09-19 12:50 BIC:                394.0483
No. Observations:   1366             Log-Likelihood:     -178.98 
Df Model:           4                F-statistic:        26.14   
Df Residuals:       1361             Prob (F-statistic): 6.61e-21
R-squared:          0.071            Scale:              0.076370
-----------------------------------------------------------------
                   Coef.  Std.Err.    t    P>|t|   [0.025  0.975]
-----------------------------------------------------------------
Intercept          0.2840   0.0244 11.6350 0.0000  0.2361  0.3319
C(foot)[T.left]    0.0413   0.0247  1.6695 0.0952 -0.0072  0.0899
C(foot)[T.right]   0.0360   0.0233  1.5488 0.1217 -0.0096  0.0817
dist_m            -0.0161   0.0019 -8.5633 0.0000 -0.0198 -0.0124
dist_m_sq          0.0002  

right        716
left         425
head/body    225
Name: foot, dtype: int64

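Including a dummy for every shot type would be redundant (the dummies would sum to the intercept), so statsmodels automatically drops one category and uses it as the baseline - headers (`head/body`) in my case.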

In [7]:
# making right-footed shots the omitted (reference) category
right_foot_baseline = "goal ~ C(foot, Treatment(reference='right')) + dist_m + dist_m_sq"
model = smf.ols(formula=right_foot_baseline, data=dfs)
results = model.fit()
results.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.069
Dependent Variable:,goal,AIC:,367.9501
Date:,2023-09-19 12:55,BIC:,394.0483
No. Observations:,1366,Log-Likelihood:,-178.98
Df Model:,4,F-statistic:,26.14
Df Residuals:,1361,Prob (F-statistic):,6.610000000000001e-21
R-squared:,0.071,Scale:,0.07637

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,0.3200,0.0274,11.6610,0.0000,0.2662,0.3739
"C(foot, Treatment(reference='right'))[T.head/body]",-0.0360,0.0233,-1.5488,0.1217,-0.0817,0.0096
"C(foot, Treatment(reference='right'))[T.left]",0.0053,0.0169,0.3122,0.7549,-0.0279,0.0385
dist_m,-0.0161,0.0019,-8.5633,0.0000,-0.0198,-0.0124
dist_m_sq,0.0002,0.0000,5.3680,0.0000,0.0001,0.0002

0,1,2,3
Omnibus:,698.222,Durbin-Watson:,2.005
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3077.477
Skew:,2.553,Prob(JB):,0.0
Kurtosis:,8.292,Condition No.:,2993.0


Fixed effects coefficients can be interpreted as the probability added relative to a right-footed shot, holding distance constant. In this case a header lowers the scoring probability by 0.0360 (coef = -0.0360), and shooting with the left foot is about the same as shooting with the right (coef = 0.0053).

In [8]:
# squaring variables: add a quadratic distance term to the model
dfs['dist2'] = dfs['dist_m'].pow(2)
quadratic_formula = "goal ~ dist_m + dist2"
model = smf.ols(formula=quadratic_formula, data=dfs)
results = model.fit()
results.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.068
Dependent Variable:,goal,AIC:,366.9415
Date:,2023-09-19 12:59,BIC:,382.6004
No. Observations:,1366,Log-Likelihood:,-180.47
Df Model:,2,F-statistic:,50.75
Df Residuals:,1363,Prob (F-statistic):,5.51e-22
R-squared:,0.069,Scale:,0.076425

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,0.2985,0.0229,13.0560,0.0000,0.2537,0.3434
dist_m,-0.0148,0.0017,-8.5768,0.0000,-0.0182,-0.0114
dist2,0.0001,0.0000,5.0956,0.0000,0.0001,0.0002

0,1,2,3
Omnibus:,699.924,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3093.164
Skew:,2.559,Prob(JB):,0.0
Kurtosis:,8.305,Condition No.:,2122.0


In [9]:
# natural log of shot distance as the only regressor
shot_dist = dfs['dist_m']
dfs['ln_dist'] = np.log(shot_dist)

log_formula = 'goal ~ ln_dist'
model = smf.ols(formula=log_formula, data=dfs)
results = model.fit()
results.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.082
Dependent Variable:,goal,AIC:,344.8149
Date:,2023-09-19 13:03,BIC:,355.2542
No. Observations:,1366,Log-Likelihood:,-170.41
Df Model:,1,F-statistic:,123.3
Df Residuals:,1364,Prob (F-statistic):,1.71e-27
R-squared:,0.083,Scale:,0.075252

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,0.5336,0.0406,13.1350,0.0000,0.4539,0.6133
ln_dist,-0.1600,0.0144,-11.1055,0.0000,-0.1883,-0.1317

0,1,2,3
Omnibus:,692.218,Durbin-Watson:,1.989
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3049.133
Skew:,2.524,Prob(JB):,0.0
Kurtosis:,8.3,Condition No.:,17.0


In [10]:
# Interactions: let the distance slope differ for headers
dfs['is_header'] = dfs['foot'].eq('head/body')

interaction_formula = """
        goal ~ dist_m + dist_m:is_header
        """
model = smf.ols(formula=interaction_formula, data=dfs)
results = model.fit()
results.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.05
Dependent Variable:,goal,AIC:,392.6703
Date:,2023-09-19 13:05,BIC:,408.3293
No. Observations:,1366,Log-Likelihood:,-193.34
Df Model:,2,F-statistic:,37.09
Df Residuals:,1363,Prob (F-statistic):,2.07e-16
R-squared:,0.052,Scale:,0.077878

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,0.2189,0.0173,12.6256,0.0000,0.1849,0.2530
dist_m,-0.0071,0.0008,-8.5607,0.0000,-0.0087,-0.0055
dist_m:is_header[T.True],-0.0004,0.0016,-0.2196,0.8262,-0.0035,0.0028

0,1,2,3
Omnibus:,721.388,Durbin-Watson:,1.984
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3282.767
Skew:,2.644,Prob(JB):,0.0
Kurtosis:,8.451,Condition No.:,47.0


In [11]:
# Logistic regression with the same terms as the interaction model
logit_formula = """
        goal ~ dist_m + dist_m:is_header
        """
model = smf.logit(formula=logit_formula, data=dfs)
logit_results = model.fit()
logit_results.summary2()


Optimization terminated successfully.
         Current function value: 0.263986
         Iterations 8


0,1,2,3
Model:,Logit,Method:,MLE
Dependent Variable:,goal,Pseudo R-squared:,0.128
Date:,2023-09-19 13:12,AIC:,727.2096
No. Observations:,1366,BIC:,742.8686
Df Model:,2,Log-Likelihood:,-360.60
Df Residuals:,1363,LL-Null:,-413.41
Converged:,1.0000,LLR p-value:,1.1721e-23
No. Iterations:,8.0000,Scale:,1.0000

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.0859,0.2601,0.3301,0.7413,-0.4239,0.5956
dist_m,-0.1593,0.0187,-8.5188,0.0000,-0.1959,-0.1226
dist_m:is_header[T.True],-0.0446,0.0245,-1.8179,0.0691,-0.0926,0.0035


In [12]:
# calculating the probability of scoring based on distance
def prob_goal_logit(dist, is_header):
    """Return the fitted logit model's predicted probability of a goal.

    Reads the three coefficients from the module-level ``logit_results`` and
    applies the logistic function to ``b0 + b1*dist + b2*dist*is_header``.

    Parameters
    ----------
    dist : number
        Value plugged in for the model's ``dist_m`` term.
    is_header : bool or int
        1/True switches the ``dist_m:is_header`` interaction term on,
        0/False switches it off.

    Returns
    -------
    float
        Probability between 0 and 1.
    """
    # Positional unpacking: relies on params being ordered as
    # intercept, dist_m, dist_m:is_header (the order of the fitted formula).
    b0, b1, b2 = logit_results.params
    value = (b0 + b1*dist + b2*dist*is_header)
    try:
        return 1/(1 + math.exp(-value))
    except OverflowError:
        # math.exp overflows once -value exceeds ~709; the logistic function
        # is indistinguishable from 0 there, so return the limit instead of
        # crashing.
        return 0.0

# Example predictions: (distance, is_header) for a 20 m footed shot,
# a 14 m header, and a 14 m footed shot.
for shot_dist, headed in ((20, 0), (14, 1), (14, 0)):
    print(prob_goal_logit(shot_dist, headed))


0.0431088596283838
0.05907488159031021
0.10487284058049676
