In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [8]:
ptb_data = pd.read_csv('ptb_data.csv')

In [8]:
sample_df = ptb_data.sample(frac=0.1)

In [204]:
# DATA SETUP

ptb_data['currMargin_gameTime_interaction'] = ptb_data['CurrentMargin'] * ptb_data['GameTime']

df_encoded = ptb_data.drop(columns = ['PTB Defence', 'Anonymize 1PlayerId', 'ZonePhysical', 'EventName', 'MatchId', 'Tackle', 'PositionId',
                                     'OppScore', 'Away Score', 'Home Score', 'PTB Tackle Result', 'TotalPossessionSecs', 'ElapsedTime',
                                     'currMargin_gameTime_interaction'])


df_encoded = pd.get_dummies(df_encoded, columns=['WeatherConditionName', 'Club Id', 'Opposition Id', 'Half', 'PTB Contest', 'PTB Ultimate Outcome', 
                                                 'OfficialId', 'Player Id', 'ZonePossession'], drop_first=True)


df_encoded = df_encoded.dropna()

df_encoded[df_encoded.select_dtypes(include=['bool']).columns] = df_encoded.select_dtypes(include=['bool']).astype(int)


df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 111990 entries, 0 to 112026
Columns: 821 entries, DurationSecs to ZonePossession_79
dtypes: float64(8), int32(806), int64(7)
memory usage: 358.0 MB


In [206]:
# RUNNING OLS WITH TRAIN TEST SPLIT

import statsmodels.api as sm

Y = df_encoded['DurationSecs']
X = df_encoded.drop(['DurationSecs'],axis = 1)
X = sm.add_constant(X)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=50)

model = sm.OLS(Y, X).fit()

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}" )

r_squared = r2_score(y_test, y_pred)
print(f"R-squared: {r_squared}")
n = X_test.shape[0]  # Number of observations
p = X_test.shape[1]  # Number of features (predictors)
adjusted_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - p - 1)
print(f"Adjusted_r_squared: {adjusted_r_squared}")

print(model.summary())

Mean Absolute Error: 0.7067192778958771
Mean Squared Error: 0.911976785024076
R-squared: 0.2302748234809724
Adjusted_r_squared: 0.20098559610230526
                            OLS Regression Results                            
Dep. Variable:           DurationSecs   R-squared:                       0.234
Model:                            OLS   Adj. R-squared:                  0.228
Method:                 Least Squares   F-statistic:                     41.38
Date:                Thu, 07 Nov 2024   Prob (F-statistic):               0.00
Time:                        00:08:32   Log-Likelihood:            -1.5297e+05
No. Observations:              111990   AIC:                         3.076e+05
Df Residuals:                  111169   BIC:                         3.155e+05
Df Model:                         820                                         
Covariance Type:            nonrobust                                         
                                                        coef  

In [138]:
# RUNNING OLS WITH CROSS-VALIDATION

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error
import statsmodels.api as sm

Y = df_encoded['DurationSecs']
X = df_encoded.drop(['DurationSecs'],axis = 1)
X = sm.add_constant(X)

model = LinearRegression()

# Define the negative MSE scoring function
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Perform cross-validation and get MSE scores
cv_scores = -1 * cross_val_score(model, X, y, scoring=mse_scorer, cv=5)  # 5-fold cross-validation

cv_mse = cv_scores.mean()

print("Cross-Validation MSE:", cv_mse)

Cross-Validation MSE: 0.9873940239095017


In [190]:
# JOINT F TESTS FOR CATEGORICAL FEATURES:

# Add a constant to the independent variables
X_full = sm.add_constant(X)
X_restricted = sm.add_constant(X.loc[:, ~X.columns.str.startswith('OfficialId')])

# Fit the modelshow c
model_full = sm.OLS(df_encoded['DurationSecs'], X_full).fit()
model_restricted = sm.OLS(df_encoded['DurationSecs'], X_restricted).fit()

# Perform the F-test to compare the models
f_test = model_full.compare_f_test(model_restricted)

# Results
print("F-statistic:", f_test[0])
print("p-value:", f_test[1])

F-statistic: 5.47439280158252
p-value: 2.890888194038193e-11


In [120]:
from sklearn.linear_model import LassoCV, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y = df_encoded['DurationSecs']
X = df_encoded.drop(['DurationSecs'],axis = 1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=99)

lasso_cv = LassoCV(cv=5, max_iter=100000).fit(X_train, y_train)
print("Optimal alpha:", lasso_cv.alpha_)

lasso_best = Lasso(alpha=lasso_cv.alpha_)
lasso_best.fit(X_train, y_train)


y_pred = lasso_best.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")


mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}" )


r_squared = r2_score(y_test, y_pred)
print(f"R-squared: {r_squared}")
n = X_test.shape[0]  # Number of observations
p = X_test.shape[1]  # Number of features (predictors)
adjusted_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - p - 1)
print(f"Adjusted_r_squared: {adjusted_r_squared}")

pd.set_option('display.max_rows', None)

lasso_coefficients = pd.Series(lasso_best.coef_, index=X.columns)
print(lasso_coefficients)



Optimal alpha: 0.0003143152902245518
Mean Absolute Error: 0.7156416804058044
Mean Squared Error: 0.9196891223360028
R-squared: 0.19223661681387516
Adjusted_r_squared: 0.18919569339758713
OppPossessionSecs                                    0.122882
Player Id                                            0.004062
PositionId                                          -0.048661
PossessionSecs                                       0.175134
Set Type                                             0.084324
Total Involved Tacklers                              0.034675
Raw Tackle Number                                   -0.138175
RoundId                                             -0.015485
RunOn                                                0.019533
Score                                                0.028286
SeasonId                                             0.060310
SeqNumber                                           -0.274618
Set                                                 -0.024785
ZonePos

  model = cd_fast.enet_coordinate_descent(
