In [1]:
import pandas as pd
import statsmodels.formula.api as smf
from os import path


# Folder that holds the CSV files used in this notebook
DATA_DIR = '../resources/code-soccer-files-main/data'

# Build the regression table in one pass: read the shots file, add the
# squared distance as an extra input column, and cast the output variable
# `goal` to int so it can be used as a numeric response.
df = (
    pd.read_csv(path.join(DATA_DIR, 'shots.csv'))
    .assign(
        dist_m_sq=lambda shots: shots['dist_m'] ** 2,
        goal=lambda shots: shots['goal'].astype(int),
    )
)

# Preview only the columns that enter the model
df[['goal', 'dist_m', 'dist_m_sq']].head()

Unnamed: 0,goal,dist_m,dist_m_sq
0,0,12.987566,168.67686
1,0,16.559476,274.216235
2,0,17.013624,289.463402
3,1,8.506812,72.36585
4,0,15.975528,255.217498


In [3]:
# Ordinary Least Squares (OLS) is the basic form of linear regression.
# The formula reads  output ~ input1 + input2 : here `goal` is regressed on
# the shot distance and its square.
model = smf.ols(
    data=df,
    formula='goal ~ dist_m + dist_m_sq',
)
results = model.fit()

# Last expression of the cell: render the fit summary
results.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.068
Dependent Variable:,goal,AIC:,366.9415
Date:,2023-09-18 15:56,BIC:,382.6004
No. Observations:,1366,Log-Likelihood:,-180.47
Df Model:,2,F-statistic:,50.75
Df Residuals:,1363,Prob (F-statistic):,5.51e-22
R-squared:,0.069,Scale:,0.076425

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,0.2985,0.0229,13.0560,0.0000,0.2537,0.3434
dist_m,-0.0148,0.0017,-8.5768,0.0000,-0.0182,-0.0114
dist_m_sq,0.0001,0.0000,5.0956,0.0000,0.0001,0.0002

0,1,2,3
Omnibus:,699.924,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3093.164
Skew:,2.559,Prob(JB):,0.0
Kurtosis:,8.305,Condition No.:,2122.0


In [4]:
# calculating probability
def prob_of_goal(meters, params=None):
    """Return the fitted goal probability for a shot taken `meters` away.

    Evaluates the quadratic ``b0 + b1*meters + b2*meters**2`` where the
    coefficients are looked up by term name rather than by position, so the
    result does not depend on how many terms the fit has or on their order.

    Parameters
    ----------
    meters : number (or anything supporting ``*`` and ``**``)
        Shot distance; the same quantity as the ``dist_m`` model input.
    params : mapping or pandas Series, optional
        Coefficients indexed by ``'Intercept'``, ``'dist_m'`` and
        ``'dist_m_sq'``. When omitted, ``results.params`` of the notebook's
        current fit is used.

    Returns
    -------
    The value of the fitted quadratic at `meters`. This is a linear-model
    prediction and is not clipped to the range [0, 1].

    Raises
    ------
    KeyError
        If one of the three required terms is missing from `params`.

    Notes
    -----
    Any other terms in `params` (for example the ``C(period)[...]`` dummies
    of a later fit) are ignored, which amounts to evaluating them at zero.
    """
    if params is None:
        # Fall back to whatever fit the notebook currently holds in `results`.
        params = results.params

    # Name-based lookup: positional unpacking breaks as soon as the fit has
    # more than three coefficients or lists them in a different order.
    b0 = params['Intercept']
    b1 = params['dist_m']
    b2 = params['dist_m_sq']
    return (b0 + b1*meters + b2*(meters**2))

In [5]:
# Spot-check the fitted curve at a few sample distances
for meters in (2, 10, 20):
    print(prob_of_goal(meters))

0.26943497157930496
0.16473737830448498
0.06020057110997604


In [6]:
# Store the model's prediction for every row of the shots table
predicted = results.predict(df)
df['goal_hat'] = predicted

# Compare the observed outcome with the prediction for the first five shots
df[['goal', 'goal_hat']].head(5)


Unnamed: 0,goal,goal_hat
0,0,0.130441
1,0,0.092865
2,0,0.088355
3,1,0.182858
4,0,0.098753


In [7]:
# Extend the regression with the match period, entered as a categorical
# input via C(...). Note: this rebinds `model` and `results`, so from here on
# they refer to the fit that includes period.
model = smf.ols(
    data=df,
    formula='goal ~ dist_m + dist_m_sq + C(period)',
)
results = model.fit()

# Last expression of the cell: render the fit summary
results.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.07
Dependent Variable:,goal,AIC:,367.5479
Date:,2023-09-19 12:32,BIC:,398.8658
No. Observations:,1366,Log-Likelihood:,-177.77
Df Model:,5,F-statistic:,21.41
Df Residuals:,1360,Prob (F-statistic):,1.13e-20
R-squared:,0.073,Scale:,0.076292

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,0.2792,0.0245,11.4147,0.0000,0.2312,0.3272
C(period)[T.2H],0.0348,0.0153,2.2762,0.0230,0.0048,0.0648
C(period)[T.E1],-0.0080,0.0661,-0.1207,0.9039,-0.1376,0.1217
C(period)[T.E2],0.0096,0.0575,0.1664,0.8679,-0.1033,0.1224
dist_m,-0.0149,0.0017,-8.5919,0.0000,-0.0183,-0.0115
dist_m_sq,0.0001,0.0000,5.1737,0.0000,0.0001,0.0002

0,1,2,3
Omnibus:,696.024,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3054.676
Skew:,2.545,Prob(JB):,0.0
Kurtosis:,8.268,Condition No.:,6142.0
