In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [83]:
ptb_data = pd.read_csv('ptb_data.csv')

In [8]:
sample_df = ptb_data.sample(frac=0.1)

In [111]:
# DATA SETUP

# ptb_data['currMargin_gameTime_interaction'] = ptb_data['CurrentMargin'] * ptb_data['GameTime']

df_encoded = ptb_data.drop(columns = ['PTB Defence', 'Anonymize 1PlayerId', 'Player Id', 'ZonePhysical', 'EventName', 'MatchId', 'Tackle', 'PositionId',
                                     'OppScore', 'Away Score', 'Home Score', 'PTB Tackle Result', 'TotalPossessionSecs', 'ElapsedTime', 'GameTime', 
                                      'Set', 'SeqNumber'])


df_encoded = pd.get_dummies(df_encoded, columns=['WeatherConditionName', 'Club Id', 'SeasonId', 'Opposition Id', 'Half', 'PTB Contest', 
                                                 'PTB Ultimate Outcome', 'OfficialId', 'Raw Tackle Number'], drop_first=True)


df_encoded = df_encoded.dropna()

df_encoded[df_encoded.select_dtypes(include=['bool']).columns] = df_encoded.select_dtypes(include=['bool']).astype(int)

df_encoded.info()


<class 'pandas.core.frame.DataFrame'>
Index: 111991 entries, 0 to 112026
Data columns (total 86 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   DurationSecs                                       111991 non-null  float64
 1   OppPossessionSecs                                  111991 non-null  float64
 2   PossessionSecs                                     111991 non-null  float64
 3   Set Type                                           111991 non-null  int64  
 4   Total Involved Tacklers                            111991 non-null  float64
 5   RoundId                                            111991 non-null  int64  
 6   RunOn                                              111991 non-null  int64  
 7   Score                                              111991 non-null  float64
 8   ZonePossession                                     111991 non-null  int64  
 9 

In [113]:
# RUNNING OLS WITH TRAIN TEST SPLIT
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score

Y = df_encoded['DurationSecs']
X = df_encoded.drop(['DurationSecs'],axis = 1)
X = sm.add_constant(X)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=37)

OLS_model = sm.OLS(Y, X).fit()

y_pred = OLS_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}" )

r_squared = r2_score(y_test, y_pred)
print(f"R-squared: {r_squared}")
n = X_test.shape[0]  # Number of observations
p = X_test.shape[1]  # Number of features (predictors)
adjusted_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - p - 1)
print(f"Adjusted_r_squared: {adjusted_r_squared}")

print(OLS_model.summary())

Mean Squared Error: 0.9379024914496598
R-squared: 0.19354869243959338
Adjusted_r_squared: 0.19044028385003653
                            OLS Regression Results                            
Dep. Variable:           DurationSecs   R-squared:                       0.192
Model:                            OLS   Adj. R-squared:                  0.191
Method:                 Least Squares   F-statistic:                     312.6
Date:                Mon, 11 Nov 2024   Prob (F-statistic):               0.00
Time:                        15:53:14   Log-Likelihood:            -1.5596e+05
No. Observations:              111991   AIC:                         3.121e+05
Df Residuals:                  111905   BIC:                         3.129e+05
Df Model:                          85                                         
Covariance Type:            nonrobust                                         
                                                        coef    std err          t      P>|t|      [

In [105]:
# RUNNING OLS WITH CROSS-VALIDATION

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error
import statsmodels.api as sm

Y = df_encoded['DurationSecs']
X = df_encoded.drop(['DurationSecs'],axis = 1)
X = sm.add_constant(X)

ols_model = LinearRegression()

# Perform cross-validation and get MSE scores
cv_mses = -1 * cross_val_score(ols_model, X, Y, cv=5, scoring='neg_mean_squared_error')  # 5-fold cross-validation

cv_mse = cv_mses.mean()

print("Cross-Validation MSE:", cv_mse)

Cross-Validation MSE: 0.9755223651529871


In [190]:
# JOINT F TESTS FOR CATEGORICAL FEATURES:

# Add a constant to the independent variables
X_full = sm.add_constant(X)
X_restricted = sm.add_constant(X.loc[:, ~X.columns.str.startswith('OfficialId')])

# Fit the modelshow c
model_full = sm.OLS(df_encoded['DurationSecs'], X_full).fit()
model_restricted = sm.OLS(df_encoded['DurationSecs'], X_restricted).fit()

# Perform the F-test to compare the models
f_test = model_full.compare_f_test(model_restricted)

# Results
print("F-statistic:", f_test[0])
print("p-value:", f_test[1])

F-statistic: 5.47439280158252
p-value: 2.890888194038193e-11


In [120]:
from sklearn.linear_model import LassoCV, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y = df_encoded['DurationSecs']
X = df_encoded.drop(['DurationSecs'],axis = 1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=37)

lasso_cv = LassoCV(cv=5, max_iter=100000).fit(X_train, y_train)
print("Optimal alpha:", lasso_cv.alpha_)

lasso_best = Lasso(alpha=lasso_cv.alpha_)
lasso_best.fit(X_train, y_train)


y_pred = lasso_best.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")


mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}" )


r_squared = r2_score(y_test, y_pred)
print(f"R-squared: {r_squared}")
n = X_test.shape[0]  # Number of observations
p = X_test.shape[1]  # Number of features (predictors)
adjusted_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - p - 1)
print(f"Adjusted_r_squared: {adjusted_r_squared}")

pd.set_option('display.max_rows', None)

lasso_coefficients = pd.Series(lasso_best.coef_, index=X.columns)
print(lasso_coefficients)



Optimal alpha: 0.0003143152902245518
Mean Absolute Error: 0.7156416804058044
Mean Squared Error: 0.9196891223360028
R-squared: 0.19223661681387516
Adjusted_r_squared: 0.18919569339758713
OppPossessionSecs                                    0.122882
Player Id                                            0.004062
PositionId                                          -0.048661
PossessionSecs                                       0.175134
Set Type                                             0.084324
Total Involved Tacklers                              0.034675
Raw Tackle Number                                   -0.138175
RoundId                                             -0.015485
RunOn                                                0.019533
Score                                                0.028286
SeasonId                                             0.060310
SeqNumber                                           -0.274618
Set                                                 -0.024785
ZonePos

  model = cd_fast.enet_coordinate_descent(
