In [10]:
import pandas as pd
import gdown
from google.colab import drive
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import numpy as np
import sklearn.model_selection as KFold

In [11]:
import pandas as pd
import gdown
# Mount Google Drive
drive.mount('/content/drive')

# Specify the path to the CSV file
file_path = '/content/drive/My Drive/FIT/Work_Space_Shot_Quality_Metric/Machine_Learning/gmm_clustering_with_play_styles.csv'
df=pd.read_csv(file_path)
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,player,forehand_winner_per,backhand_winner_per,forehand_unforced_error_per,backhand_unforced_error_per,net_per,net_point_direct_win_per,net_point_winning_per,net_point_error,passing_per,...,pts_won_Ite_3_shots_per,shots_in_pts_won_per,shots_in_pts_lost_per,shots_in_won_vs_lost_ratio,inside_in_per,inside_out_per,Counter Puncher,Attacking Baseliner,All-Court Player,Solid Baseliner
0,Aaron Krickstein,4.99,3.69,7.72,8.9,9.03,31.96,5.21,13.4,23.53,...,25.73,43.51,56.49,0.78,2.1,8.72,94.38394,5.615032,0.0008901479,0.0001384078
1,Adam Pavlasek,10.28,3.74,7.48,6.54,29.91,33.33,19.63,21.88,12.5,...,30.82,60.0,40.0,1.5,2.15,12.37,2.41806e-43,7.033928e-08,99.99832,0.001681317
2,Adrian Mannarino,4.95,3.48,9.87,7.11,11.77,46.86,7.72,25.2,13.57,...,34.81,52.36,47.64,1.16,0.48,5.45,7.563223e-15,0.2228162,4.078528e-05,99.77714
3,Adriano Panatta,8.19,5.17,9.05,7.76,46.98,34.0,21.55,14.68,22.02,...,24.89,40.0,60.0,0.67,0.0,6.03,1.075557e-08,9.272471e-45,100.0,6.321417e-116
4,Agustin Calleri,10.8,6.82,11.36,15.91,27.84,60.71,15.91,20.41,18.37,...,21.24,38.64,61.36,0.63,1.23,11.55,100.0,3.784055e-06,1.4981010000000002e-17,5.774794e-16


In [12]:
# Selecting feature columns and target columns based on the provided instructions
feature_columns = [
    'forehand_winner_per', 'backhand_winner_per', 'net_per', 'net_point_direct_win_per', 'net_point_winning_per',
    'net_point_error', 'passing_per', 'winner_per', 'err_per',
    'pts_won_Ite_3_shots_per', 'shots_in_pts_won_per', 'shots_in_pts_lost_per',
    'shots_in_won_vs_lost_ratio', 'inside_in_per', 'inside_out_per'
]

target_columns = ['Counter Puncher', 'Attacking Baseliner', 'All-Court Player', 'Solid Baseliner']

# Splitting the data into features and targets
X = df[feature_columns]
y = df[target_columns]

In [13]:
# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing and fitting the Random Forest Regressor
#overwrite the random forest regressor
rf_regressor=RandomForestRegressor(
    n_estimators=100,
    criterion='squared_error',
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42
    )

rf_regressor.fit(X_train, y_train)

# Making predictions on the test set
y_pred = rf_regressor.predict(X_test)

# Evaluating the model performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 684.5245143940706
R² Score: 0.4997177707837125


In [14]:
# Save the trained model to a file
joblib.dump(rf_regressor, '/content/drive/My Drive/FIT/Work_Space_Shot_Quality_Metric/Machine_Learning/rf_regressor_model.joblib')

['/content/drive/My Drive/FIT/Work_Space_Shot_Quality_Metric/Machine_Learning/rf_regressor_model.joblib']

In [15]:
nsplits=5
kf=KFold.KFold(n_splits=nsplits, shuffle=True, random_state=42)
mse_scores=[]
r2_scores=[]
for train_index, test_index in kf.split(X):
  x_train_fold, x_test_fold=X.iloc[train_index], X.iloc[test_index]
  y_train_fold, y_test_fold=y.iloc[train_index], y.iloc[test_index]
  rf_regresssor=RandomForestRegressor(random_state=42)
  rf_regressor.fit(x_train_fold, y_train_fold)
  y_pred_fold=rf_regressor.predict(x_test_fold)
  mse_scores.append(mean_squared_error(y_test_fold, y_pred_fold))
  r2_scores.append(r2_score(y_test_fold, y_pred_fold))
print(f"MSE scores: ", mse_scores)
print(f"R^2 scores: ", r2_scores)
print(f"Mean Squared Error (MSE):", np.mean(mse_scores))
print(f"Average R² Score:", np.mean(r2_scores))

MSE scores:  [645.9426844530905, 555.6672045783209, 626.0221582906927, 765.1003947566168, 694.672459458808]
R^2 scores:  [0.527053383555369, 0.6372430389467459, 0.5770042845594909, 0.5134878982654114, 0.5788793221405439]
Mean Squared Error (MSE): 657.4809803075058
Average R² Score: 0.5667335854935123


In [16]:
#this cell operates leave-one-out CV for the dataset
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
mse_scores = []
r2_scores = []
for train_index, test_index in loo.split(X):
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    rf_regressor = RandomForestRegressor(random_state=42)
    rf_regressor.fit(x_train, y_train)
    y_pred = rf_regressor.predict(x_test)
    mse_scores.append(mean_squared_error(y_test, y_pred))
    r2_scores.append(r2_score(y_test, y_pred))
avg_mse = np.mean(mse_scores)
avg_r2 = np.mean(r2_scores)
print(f"MSE scores: {mse_scores}")
print(f"R^2 scores: {r2_scores}")
print(f"Average MSE: {avg_mse}")
print(f"Average R^2: {avg_r2}")



KeyboardInterrupt: 

In [None]:
#overwrite the random forest regressor
rf_regressor=RandomForestRegressor(
    n_estimators=100,
    criterion='squared_error',
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42
    )
