In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [6]:
ptb_data = pd.read_csv('ptb_data.csv')

In [8]:
sample_df = ptb_data.sample(frac=0.1)

In [77]:
# df_encoded = ptb_data.drop(columns = ['PTB Defence','Anonymize 1PlayerId', 'EventName', 'Club Id', 'Opposition Id', 'PTB Contest', 
#                                         'PTB Ultimate Outcome', 'WeatherConditionName'])


df_encoded = ptb_data.drop(columns = ['PTB Defence', 'Anonymize 1PlayerId', 'Player Id', 'ZonePhysical'])

df_encoded = pd.get_dummies(df_encoded, columns=['WeatherConditionName', 'Club Id', 'Opposition Id', 'PTB Contest', 'EventName', 'PTB Ultimate Outcome', 'OfficialId'], drop_first=True)


df_encoded = df_encoded.dropna()

df_encoded[df_encoded.select_dtypes(include=['bool']).columns] = df_encoded.select_dtypes(include=['bool']).astype(int)

df_encoded.info()

df_encoded.to_csv('encoded_ptb_data.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 111990 entries, 0 to 112026
Data columns (total 91 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   Away Score                                         111990 non-null  int64  
 1   DurationSecs                                       111990 non-null  float64
 2   Half                                               111990 non-null  int64  
 3   Home Score                                         111990 non-null  int64  
 4   MatchId                                            111990 non-null  int64  
 5   OppPossessionSecs                                  111990 non-null  float64
 6   OppScore                                           111990 non-null  float64
 7   PositionId                                         111990 non-null  float64
 8   PossessionSecs                                     111990 non-null  float64
 9 

In [121]:
import statsmodels.api as sm

Y = df_encoded['DurationSecs']
X = df_encoded.drop(['DurationSecs'],axis = 1)
X = sm.add_constant(X)

In [123]:
model = sm.OLS(Y, X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:           DurationSecs   R-squared:                       0.173
Model:                            OLS   Adj. R-squared:                  0.173
Method:                 Least Squares   F-statistic:                     809.6
Date:                Sun, 03 Nov 2024   Prob (F-statistic):               0.00
Time:                        11:15:48   Log-Likelihood:            -1.5723e+05
No. Observations:              111990   AIC:                         3.145e+05
Df Residuals:                  111960   BIC:                         3.148e+05
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const           

In [79]:
from sklearn.linear_model import LassoCV, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y = df_encoded['DurationSecs']
X = df_encoded.drop(['DurationSecs'],axis = 1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=99)

lasso_cv = LassoCV(cv=5, max_iter=100000).fit(X_train, y_train)
print("Optimal alpha:", lasso_cv.alpha_)

lasso_best = Lasso(alpha=lasso_cv.alpha_)
lasso_best.fit(X_train, y_train)


y_pred = lasso_best.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")


mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}" )


r_squared = r2_score(y_test, y_pred)
print(f"R-squared: {r_squared}")
n = X_test.shape[0]  # Number of observations
p = X_test.shape[1]  # Number of features (predictors)
adjusted_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - p - 1)
print(f"Adjusted_r_squared: {adjusted_r_squared}")

pd.set_option('display.max_rows', None)

lasso_coefficients = pd.Series(lasso_best.coef_, index=X.columns)
print(lasso_coefficients)



Optimal alpha: 0.0006610074504539635
Mean Absolute Error: 0.5450026483202601
Mean Squared Error: 0.46016071926547153
R-squared: 0.5958406266031282
Adjusted_r_squared: 0.5942100019738317
Away Score                                           0.006216
Half                                                 0.000000
Home Score                                           0.012281
MatchId                                              0.021525
OppPossessionSecs                                    0.000000
OppScore                                            -0.000000
PositionId                                          -0.021226
PossessionSecs                                       0.031061
PTB Tackle Result                                   -0.215594
Set Type                                             0.043432
Total Involved Tacklers                              0.041862
Raw Tackle Number                                   -0.050469
RoundId                                             -0.010002
RunOn   