In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import Ridge, Lasso

In [2]:
# Load the per-game statistics table from the CSV file next to the notebook
games_df = pd.read_csv(filepath_or_buffer='game_data.csv')

In [3]:
# Build the feature matrix and the target: FG3M is the value being predicted,
# every other column of games_df is used as an input feature.
# NOTE(review): the features still include other shooting stats from the table
# (e.g. FG3A, FG3_PCT); confirm these are known before the game, otherwise they
# leak the target.
columns_to_drop = ['FG3M']
X = games_df.drop(columns=columns_to_drop)
y = games_df['FG3M']

# Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Fit a random forest on every candidate feature. This estimator is used ONLY
# to rank features by importance for the selector below.
selector_model = RandomForestRegressor(random_state=42)
selector_model.fit(X_train, y_train)

# Feature selection based on the fitted forest's importances (default threshold).
# With prefit=True the selector keeps a reference to the estimator it is given,
# so that estimator must not be refit afterwards on a different set of columns;
# doing so would silently change what get_support()/transform() return.
feature_selector = SelectFromModel(selector_model, prefit=True)

# Names of the columns the selector keeps
selected_features = X_train.columns[feature_selector.get_support()]

# Reduce both splits to the selected columns
X_train_selected = feature_selector.transform(X_train)
X_test_selected = feature_selector.transform(X_test)

# Print the selected features
print(f'Selected Features: {list(selected_features)}')

# Train and evaluate a separate forest on the selected features only.
# Same hyper-parameters and seed as before, so the results are unchanged.
model = RandomForestRegressor(random_state=42)
model.fit(X_train_selected, y_train)
y_pred = model.predict(X_test_selected)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error on Test Set: {mse}')

# Perform 5-fold cross-validation on the selected features.
# NOTE(review): the features were chosen using the whole training split, so
# these fold scores are likely optimistic; wrapping selection and model in a
# Pipeline would select features inside each fold instead.
cv_mse_scores = cross_val_score(model, X_train_selected, y_train, cv=5, scoring='neg_mean_squared_error')
# The scorer returns negative MSE, so flip the sign before averaging
average_cv_mse = np.mean(-cv_mse_scores)
print(f'Average Mean Squared Error (Cross-validation): {average_cv_mse}')



Selected Features: ['FG3A', 'FG3_PCT', 'FT_PCT', 'last_5_FT_PCT', 'TOR_assistPercentage']
Mean Squared Error on Test Set: 2.4012000000000002
Average Mean Squared Error (Cross-validation): 2.066912545454546


In [5]:
# Ridge regression model; alpha controls the regularization strength
ridge_model = Ridge(alpha=1.0)

# Shuffled, seeded K-fold splitter used for cross-validation
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# The scorer reports negative MSE, so negate it to get one positive MSE per fold
cross_val_results = -cross_val_score(
    ridge_model,
    X_train,
    y_train,
    cv=kf,
    scoring='neg_mean_squared_error',
)

# Summarize the per-fold errors
print(f'Cross-Validation Mean MSE: {cross_val_results.mean()}')
print(f'Cross-Validation Standard Deviation: {cross_val_results.std()}')

# Fit on the full training split so the coefficients can be inspected
ridge_model.fit(X_train, y_train)

# Pair every training column with its fitted coefficient
feature_names = X_train.columns
coefficients = ridge_model.coef_
ridge_coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Show the full (unfiltered) coefficient table
print(ridge_coefficients)

Cross-Validation Mean MSE: 0.42163528483473733
Cross-Validation Standard Deviation: 0.27363542698269566
              Feature  Coefficient
0                 FGM    -1.885448
1                 FGA     0.045345
2              FG_PCT    -0.022442
3                FG3A     0.017253
4             FG3_PCT     0.164501
..                ...          ...
70  OPP_estimatedPace     0.037840
71           OPP_pace    -0.022336
72      OPP_pacePer40    -0.018727
73    OPP_possessions     0.019000
74            OPP_PIE     0.009679

[75 rows x 2 columns]


In [7]:
# Score the fitted ridge model on the held-out test split
y_pred = ridge_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error on Test Set: {mse}')

Mean Squared Error on Test Set: 0.04922507832659151


In [None]:
# Lasso regression model; alpha controls the regularization strength
lasso_model = Lasso(alpha=1.0)

# Shuffled, seeded K-fold splitter used for cross-validation
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# The scorer reports negative MSE, so negate it to get one positive MSE per fold
cross_val_results = -cross_val_score(
    lasso_model,
    X_train,
    y_train,
    cv=kf,
    scoring='neg_mean_squared_error',
)

# Summarize the per-fold errors
print(f'Cross-Validation Mean MSE: {cross_val_results.mean()}')
print(f'Cross-Validation Standard Deviation: {cross_val_results.std()}')

# Fit on the full training split so the coefficients can be inspected
lasso_model.fit(X_train, y_train)

# Pair every training column with its fitted coefficient
feature_names = X_train.columns
lasso_coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficient': lasso_model.coef_})

# Keep only the features whose coefficient was not shrunk to exactly zero
nonzero_mask = lasso_coefficients['Coefficient'].ne(0)
lasso_coefficients_nonzero = lasso_coefficients.loc[nonzero_mask]

# Show the filtered coefficient table
print(lasso_coefficients_nonzero)


Cross-Validation Mean MSE: 4.8737178021910434
Cross-Validation Standard Deviation: 3.086620108294803
                         Feature  Coefficient
0                            FGM    -0.417016
3                           FG3A     0.254717
5                            FTM    -0.071769
6                            FTA    -0.106363
8                           OREB    -0.095149
10                           REB    -0.055531
16                           PTS     0.247991
29  TOR_estimatedOffensiveRating     0.051978
33        TOR_estimatedNetRating     0.030845
54  OPP_estimatedDefensiveRating     0.002248


In [None]:
# Score the fitted lasso model on the held-out test split
y_pred = lasso_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error on Test Set: {mse}')