 # Preparing and Cleaning the Data

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

nba_df = pd.read_csv('player_totals.csv', encoding = 'unicode_escape', engine ='python')

print(f"Original data set length: {len(nba_df)} rows")

season2024_condition = nba_df['Season'] > 2023
nba_df.drop(index=nba_df[season2024_condition].index, inplace=True)

# season2005_condition = nba_df['Season'] <= 2005
# nba_df.drop(index=nba_df[season2005_condition].index, inplace=True)

# targeting modern game

print(f"Length of data set after dropping the 2024 season: {len(nba_df)} rows")

gp_condition = nba_df["GP"] <= 10
nba_df.drop(index=nba_df[gp_condition].index, inplace=True)

print(f"Length of data set after dropping all rows with less than 10 games played: {len(nba_df)} rows")

# nba_df = nba_df.dropna()

# print(f"Length of data set after dropping all rows with NA values: {len(nba_df)} rows")

missing_data = nba_df.isnull().sum()

print("Columns with missing data:")
print(missing_data[missing_data > 0])
print("\n")

columns_to_fill = ['3P%', '2P%', 'FT%', 'Age', 'GS', 'MP', '3P', '3PA', 'ORB', 'DRB', 'TRB', 'STL', 'BLK', 'TOV']

nba_df[columns_to_fill] = nba_df[columns_to_fill].fillna(0)

missing_data_updated = nba_df[columns_to_fill].isnull().sum()

print('Columns with missing data after filling:')
print(missing_data_updated)


Original data set length: 31550 rows
Length of data set after dropping the 2024 season: 31135 rows
Length of data set after dropping all rows with less than 10 games played: 27308 rows
Columns with missing data:
Age      12
GS     7706
MP      939
3P     5725
3PA    5725
3P%    8346
2P%       1
FT%      85
ORB    4153
DRB    4153
TRB     769
STL    4969
BLK    4968
TOV    5068
dtype: int64


Columns with missing data after filling:
3P%    0
2P%    0
FT%    0
Age    0
GS     0
MP     0
3P     0
3PA    0
ORB    0
DRB    0
TRB    0
STL    0
BLK    0
TOV    0
dtype: int64


# Training with Linear Regression

In [4]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Extract features and target variable
X = nba_df[["GP", "MP", "FG%", "FG"]]
y_pts = nba_df["PTS"]

# Create and fit the scaler on the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create linear regression model
model = LinearRegression()

# Create KFold cross-validator
kf = KFold(n_splits=10, shuffle=True, random_state=13)

# Initialize lists to store min and max values for each fold
min_y_pts_values = []
max_y_pts_values = []

# Perform cross-validation
for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y_pts.iloc[train_index], y_pts.iloc[test_index]

    # Fit the model
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate min and max values for this fold
    min_y_pts_values.append(min(y_test))
    max_y_pts_values.append(max(y_test))

# Calculate mean values of metrics
mse_pts_mean = -np.mean(cross_val_score(model, X_scaled, y_pts, cv=kf, scoring='neg_mean_squared_error'))
mae_pts_mean = -np.mean(cross_val_score(model, X_scaled, y_pts, cv=kf, scoring='neg_mean_absolute_error'))
r2_pts_mean = np.mean(cross_val_score(model, X_scaled, y_pts, cv=kf, scoring='r2'))

# Calculate adjusted R-squared
n = len(y_pts)
k = X.shape[1]
adjusted_r2_pts = 1 - ((1 - r2_pts_mean) * (n - 1) / (n - k - 1))

print("Metrics for Points (PTS) with Linear Regression and Cross-Validation:")
print(f"MSE: {mse_pts_mean}")
print(f"MAE: {mae_pts_mean}")
print(f"R-squared: {r2_pts_mean}")
print(f"Adjusted R-squared: {adjusted_r2_pts}")

# Print min and max values for each fold
for i, (min_val, max_val) in enumerate(zip(min_y_pts_values, max_y_pts_values), 1):
    print(f"Fold {i}: Min PTS: {min_val}, Max PTS: {max_val}")


Metrics for Points (PTS) with Linear Regression and Cross-Validation:
MSE: 3318.2778205637574
MAE: 37.399940900823395
R-squared: 0.9858163503124384
Adjusted R-squared: 0.9858142723503555
Fold 1: Min PTS: 6, Max PTS: 2822
Fold 2: Min PTS: 2, Max PTS: 3033
Fold 3: Min PTS: 4, Max PTS: 2719
Fold 4: Min PTS: 8, Max PTS: 2633
Fold 5: Min PTS: 2, Max PTS: 2719
Fold 6: Min PTS: 1, Max PTS: 4029
Fold 7: Min PTS: 3, Max PTS: 3586
Fold 8: Min PTS: 2, Max PTS: 2948
Fold 9: Min PTS: 5, Max PTS: 3041
Fold 10: Min PTS: 4, Max PTS: 2495


# Justification
The choice to use linear regression was due to the fact that the relationship between the features and the target variable was linear. So any changes to one of them would affect the outcome of points.

# Random Forest Regression

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np

# Create and fit the scaler on the entire data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create the Random Forest regression model
rf_model = RandomForestRegressor(random_state=42)

# Perform 10-fold cross-validation with parallel processing
cv_mse_pts = -cross_val_score(rf_model, X_scaled, y_pts, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
cv_mae_pts = -cross_val_score(rf_model, X_scaled, y_pts, cv=10, scoring='neg_mean_absolute_error', n_jobs=-1)
cv_r2_pts = cross_val_score(rf_model, X_scaled, y_pts, cv=10, scoring='r2', n_jobs=-1)

# Calculate mean values of metrics
mse_pts_mean = np.mean(cv_mse_pts)
mae_pts_mean = np.mean(cv_mae_pts)
r2_pts_mean = np.mean(cv_r2_pts)

# Calculate adjusted R-squared
n = len(y_pts)
k = X_scaled.shape[1]
adjusted_r2_pts = 1 - ((1 - r2_pts_mean) * (n - 1) / (n - k - 1))

print("Metrics for Points (PTS) with Untuned Random Forest Model and 10-fold Cross-Validation:")
print(f"MSE: {mse_pts_mean}")
print(f"MAE: {mae_pts_mean}")
print(f"R-squared: {r2_pts_mean}")
print(f"Adjusted R-squared: {adjusted_r2_pts}")


Metrics for Points (PTS) with Untuned Random Forest Model and 10-fold Cross-Validation:
MSE: 3653.0698892261585
MAE: 37.092705480288046
R-squared: 0.98399982360813
Adjusted R-squared: 0.983997479517533


# Random Forest Retraining with Grid Search


In [6]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import time
import psutil

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['log2', 'sqrt', None],
    'bootstrap': [True, False]
}


# Define a custom scorer for adjusted R-squared
def adjusted_r2_scorer(estimator, X, y):
    n = len(y)
    k = X.shape[1]
    r2 = r2_score(y, estimator.predict(X))
    return 1 - (1 - r2) * (n - 1) / (n - k - 1)

# Create the Random Forest regression model
rf_model = RandomForestRegressor(random_state=42)

# Create the GridSearchCV object with cross-validation
grid_search = GridSearchCV(
    rf_model,
    param_grid=param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring={'neg_mean_squared_error': 'neg_mean_squared_error', 'adjusted_r2': make_scorer(adjusted_r2_scorer)},
    refit='neg_mean_squared_error',  # Refit using neg_mean_squared_error for predictions
    return_train_score=True,
    n_jobs=-1
)

# Create and fit the scaler on the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Record the start time
start_time = time.time()

# Monitor CPU and memory usage during grid search
cpu_percentages = []
memory_percentages = []

# Fit the model to the data and perform cross-validation
for i, (train_index, test_index) in enumerate(grid_search.cv.split(X_scaled, y_pts)):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y_pts.iloc[train_index], y_pts.iloc[test_index]
    
    grid_search.fit(X_train, y_train)
    
    # Monitor CPU and memory usage
    cpu_percentages.append(psutil.cpu_percent())
    memory_percentages.append(psutil.virtual_memory().percent)

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Round the elapsed time to two decimal places
elapsed_time_rounded = round(elapsed_time, 2)

# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_

# Create a new Random Forest model with the best hyperparameters
best_rf_model = RandomForestRegressor(random_state=13, **best_params).fit(X_scaled, y_pts)

# Calculate adjusted R-squared for the best model
best_rf_model_r2 = r2_score(y_pts, best_rf_model.predict(X_scaled))
adjusted_r2 = 1 - (1 - best_rf_model_r2) * (len(y_pts) - 1) / (len(y_pts) - X_scaled.shape[1] - 1)

# Predictions
y_pts_pred_tuned_rf = best_rf_model.predict(X_scaled)

# Evaluation for Points (PTS) with tuned Random Forest model
mse_pts_tuned_rf = mean_squared_error(y_pts, y_pts_pred_tuned_rf)
mae_pts_tuned_rf = mean_absolute_error(y_pts, y_pts_pred_tuned_rf)
r2_pts_tuned_rf = best_rf_model_r2

print("Metrics for Points (PTS) with Grid Search Random Forest Regression Model (Cross-Validation):")
print(f"MSE: {mse_pts_tuned_rf}")
print(f"MAE: {mae_pts_tuned_rf}")
print(f"R-squared: {r2_pts_tuned_rf}")
print(f"Adjusted R-squared: {adjusted_r2}")
print(f"Grid Search with Cross-Validation took {elapsed_time_rounded} seconds.")

# Print average CPU and memory usage during grid search
print(f"Average CPU Usage: {np.mean(cpu_percentages)}%")
print(f"Average Memory Usage: {np.mean(memory_percentages)}%")

# Print the best Hyperparameters
print("Best Hyperparameters:")
print(best_params)


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan n

Metrics for Points (PTS) with Grid Search Random Forest Regression Model (Cross-Validation):
MSE: 2121.730875379217
MAE: 29.681374821723534
R-squared: 0.9909330036905352
Adjusted R-squared: 0.9909316753388802
Grid Search with Cross-Validation took 1734.83 seconds.
Average CPU Usage: 84.38%
Average Memory Usage: 61.339999999999996%
Best Hyperparameters:
{'bootstrap': True, 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


# Random Forest Retraining with Randomized Search

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import time
import psutil

# Assuming X and y are your features and target variable
X = nba_df[["GP", "MP", "FG%", "FG"]]
y_pts = nba_df["PTS"]

# Create and fit the scaler on the entire data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define the hyperparameter grid for Random Forest
param_dist = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['log2', 'sqrt', None],
    'bootstrap': [True, False]
}


# Create the Random Forest regression model
rf_model = RandomForestRegressor(random_state=13)

# Create the RandomizedSearchCV object with 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=13)
randomized_search = RandomizedSearchCV(
    rf_model, param_distributions=param_dist, cv=kf, scoring='neg_mean_squared_error', n_jobs=-1
)

# Record the start time
start_time = time.time()

# Monitor CPU and memory usage during randomized search
cpu_percentages = []
memory_percentages = []

# Fit the model to the data
randomized_search.fit(X_scaled, y_pts)

# Monitor CPU and memory usage
cpu_percentages.append(psutil.cpu_percent())
memory_percentages.append(psutil.virtual_memory().percent)

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Round the elapsed time to two decimal places
elapsed_time_rounded = round(elapsed_time, 2)

# Get the best hyperparameters
best_params = randomized_search.best_params_

# Create a new Random Forest model with the best hyperparameters
best_rf_model = RandomForestRegressor(random_state=13, **best_params).fit(X_scaled, y_pts)

# Predictions
y_pts_pred_tuned_rf = best_rf_model.predict(X_scaled)

# Evaluation for Points (PTS) with tuned Random Forest model
mse_pts_tuned_rf = mean_squared_error(y_pts, y_pts_pred_tuned_rf)
mae_pts_tuned_rf = mean_absolute_error(y_pts, y_pts_pred_tuned_rf)
r2_pts_tuned_rf = r2_score(y_pts, y_pts_pred_tuned_rf)
adj_r2_pts_tuned_rf = 1 - (1 - r2_pts_tuned_rf) * (len(y_pts) - 1) / (len(y_pts) - X.shape[1] - 1)

print("Metrics for Points (PTS) with Randomized Search Random Forest Regression Model:")
print(f"MSE: {mse_pts_tuned_rf}")
print(f"MAE: {mae_pts_tuned_rf}")
print(f"R-squared: {r2_pts_tuned_rf}")
print(f"Adjusted R-squared: {adj_r2_pts_tuned_rf}")
print(f"Randomized Search took {elapsed_time_rounded} seconds.")

# Print average CPU and memory usage during randomized search
print(f"Average CPU Usage: {np.mean(cpu_percentages)}%")
print(f"Average Memory Usage: {np.mean(memory_percentages)}%")

# Print the best Hyperparameters
print("Best Hyperparameters:")
print(best_params)

Metrics for Points (PTS) with Randomized Search Random Forest Regression Model:
MSE: 2121.730875379217
MAE: 29.681374821723534
R-squared: 0.9909330036905352
Adjusted R-squared: 0.9909316753388802
Randomized Search took 71.32 seconds.
Average CPU Usage: 39.8%
Average Memory Usage: 59.2%
Best Hyperparameters:
{'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': True}


# kNN Regression

In [8]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Assuming X and y are your features and target variable

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert y_pts to a NumPy array
y_pts_array = y_pts.values  # No need to reshape for 1D array

# Create kNN regression model
knn_model = KNeighborsRegressor(n_neighbors=5)  # You can adjust the number of neighbors

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform cross-validation
cv_mse = -cross_val_score(knn_model, X_scaled, y_pts_array, cv=kf, scoring='neg_mean_squared_error')
cv_mae = -cross_val_score(knn_model, X_scaled, y_pts_array, cv=kf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(knn_model, X_scaled, y_pts_array, cv=kf, scoring='r2')

# Calculate mean values of metrics
mse_mean = np.mean(cv_mse)
mae_mean = np.mean(cv_mae)
r2_mean = np.mean(cv_r2)

print("Metrics for Points (PTS) with k-Nearest Neighbors Regression and 10-fold Cross-Validation:")
print(f"MSE: {mse_mean}")
print(f"MAE: {mae_mean}")
print(f"R-squared: {r2_mean}")



Metrics for Points (PTS) with k-Nearest Neighbors Regression and 10-fold Cross-Validation:
MSE: 3888.570610106457
MAE: 40.634687603864464
R-squared: 0.9833804999395147


# kNN Retraining with Grid Search

In [10]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.preprocessing import StandardScaler
import time
import psutil
import numpy as np

# Assuming X and y are your features and target variable
X = nba_df[["GP", "MP", "FG%", "FG"]]
y_pts = nba_df["PTS"]

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
    'leaf_size': [20, 30, 40],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Create kNN regression model
knn_model = KNeighborsRegressor()

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Record memory and CPU usage
cpu_percentages = []
memory_percentages = []

def get_resource_usage():
    cpu_percent = psutil.cpu_percent(interval=1)
    memory_percent = psutil.virtual_memory().percent
    return cpu_percent, memory_percent

# Create a custom scorer for explained_variance to use in cross_val_score
explained_variance_scorer = make_scorer(lambda y, y_pred: r2_score(y, y_pred), greater_is_better=True)

grid_search = GridSearchCV(knn_model, param_grid=param_grid, cv=kf, scoring='neg_mean_squared_error', n_jobs=-1)

# Record the start time
start_time = time.time()

# Fit the model to the data
grid_search.fit(X_scaled, y_pts)

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Round the elapsed time to two decimal places
elapsed_time_rounded = round(elapsed_time, 2)

# Record memory and CPU usage during grid search
cpu_percentages.append(psutil.cpu_percent())
memory_percentages.append(psutil.virtual_memory().percent)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Create a new KNN model with the best hyperparameters
best_knn_model = KNeighborsRegressor(**best_params)

# Perform cross-validation
cv_mse = -cross_val_score(best_knn_model, X_scaled, y_pts, cv=kf, scoring='neg_mean_squared_error')
cv_mae = -cross_val_score(best_knn_model, X_scaled, y_pts, cv=kf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(best_knn_model, X_scaled, y_pts, cv=kf, scoring='r2')
cv_adj_r2 = cross_val_score(best_knn_model, X_scaled, y_pts, cv=kf, scoring=explained_variance_scorer)

# Calculate mean values of metrics
mse_mean = np.mean(cv_mse)
mae_mean = np.mean(cv_mae)
r2_mean = np.mean(cv_r2)
adj_r2_mean = np.mean(cv_adj_r2)

print("Metrics for Points (PTS) with Tuned k-Nearest Neighbors Regression and 10-fold Cross-Validation:")
print(f"MSE: {mse_mean}")
print(f"MAE: {mae_mean}")
print(f"R-squared: {r2_mean}")
print(f"Grid Search took {elapsed_time_rounded} seconds.")

# Print average CPU and memory usage during grid search
print(f"Average CPU Usage: {np.mean(cpu_percentages)}%")
print(f"Average Memory Usage: {np.mean(memory_percentages)}%")

# Print the best Hyperparameters
print("Best Hyperparameters:")
print(best_params)


Metrics for Points (PTS) with Tuned k-Nearest Neighbors Regression and 10-fold Cross-Validation:
MSE: 3607.4618754743597
MAE: 39.178379436308916
R-squared: 0.9845863350330074
Grid Search took 43.31 seconds.
Average CPU Usage: 9.2%
Average Memory Usage: 64.5%
Best Hyperparameters:
{'algorithm': 'ball_tree', 'leaf_size': 20, 'n_neighbors': 9, 'p': 2, 'weights': 'distance'}


# kNN Retraining with Randomized Search

In [11]:
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
import time
import numpy as np
import psutil

# Assuming X and y are your features and target variable
X = nba_df[["GP", "MP", "FG%", "FG"]]
y_pts = nba_df["PTS"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_pts, test_size=0.4, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

param_dist = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
    'leaf_size': [20, 30, 40],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Create kNN regression model
knn_model = KNeighborsRegressor()

# Create KFold for cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Record the start time
start_time = time.time()

# Monitor CPU and memory usage during randomized search
cpu_percentages = []
memory_percentages = []

# Create RandomizedSearchCV object
random_search = RandomizedSearchCV(
    knn_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='neg_mean_squared_error', n_jobs=1, random_state=42
)

# Fit the model to the data
random_search.fit(X_train_scaled, y_train)

# Monitor CPU and memory usage
cpu_percentages.append(psutil.cpu_percent())
memory_percentages.append(psutil.virtual_memory().percent)

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Round the elapsed time to two decimal places
elapsed_time_rounded = round(elapsed_time, 2)

# Get the best hyperparameters
best_params = random_search.best_params_

# Create a new KNN model with the best hyperparameters
best_knn_model = KNeighborsRegressor(**best_params).fit(X_train_scaled, y_train)

# Perform cross-validation for adjusted R-squared
cv_r2 = cross_val_score(best_knn_model, X_train_scaled, y_train, cv=kf, scoring='r2')
cv_adj_r2 = cross_val_score(best_knn_model, X_train_scaled, y_train, cv=kf, scoring='explained_variance')

# Calculate mean values of metrics
r2_mean = np.mean(cv_r2)
adj_r2_mean = np.mean(cv_adj_r2)

# Predictions
y_pts_pred_tuned_knn = best_knn_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pts_pred_tuned_knn)
mae = mean_absolute_error(y_test, y_pts_pred_tuned_knn)
r2 = r2_score(y_test, y_pts_pred_tuned_knn)

print("Metrics for Points (PTS) with Tuned k-Nearest Neighbors Regression (Randomized Search)")
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R-squared (CV): {r2_mean}")
print(f"Adjusted R-squared (CV): {adj_r2_mean}")
print(f"Randomized Search took {elapsed_time_rounded} seconds.")

# Print average CPU and memory usage during randomized search
print(f"Average CPU Usage: {np.mean(cpu_percentages)}%")
print(f"Average Memory Usage: {np.mean(memory_percentages)}%")

# Print the best Hyperparameters
print("Best Hyperparameters:")
print(best_params)

Metrics for Points (PTS) with Tuned k-Nearest Neighbors Regression (Randomized Search)
MSE: 3889.8392584749695
MAE: 40.347583659548185
R-squared (CV): 0.9840094521432474
Adjusted R-squared (CV): 0.9840346933017091
Randomized Search took 3.34 seconds.
Average CPU Usage: 6.8%
Average Memory Usage: 64.9%
Best Hyperparameters:
{'weights': 'distance', 'p': 2, 'n_neighbors': 9, 'leaf_size': 40, 'algorithm': 'kd_tree'}
