In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset
df = pd.read_csv('stats.csv')

# Split the "last_name, first_name" column into two separate columns "last_name" and "first_name"
df[['last_name','first_name']] = df['last_name, first_name'].str.split(', ', expand=True)

# Drop the original "last_name, first_name" column
df.drop(columns=['last_name, first_name'], inplace=True)

# Print dataset information
print(f'There are {len(df)} observations made on {len(df.columns)} features considered in this dataset.\n')

print('Those features are:\n')
print('\n'.join(df.columns))

print('\nFeature of interest: on_base_plus_slg')


There are 538 observations made on 24 features considered in this dataset.

Those features are:

player_id
year
pa
hit
single
double
triple
home_run
k_percent
bb_percent
on_base_plus_slg
exit_velocity_avg
sweet_spot_percent
barrel_batted_rate
solidcontact_percent
hard_hit_percent
avg_best_speed
avg_hyper_speed
whiff_percent
swing_percent
groundballs_percent
flyballs_percent
last_name
first_name

Feature of interest: on_base_plus_slg


In [3]:
# Data Cleaning and Preprocessing
print('Checking for missing (Na) values:')
print(df.isna().sum())
print('\nChecking for duplicated values:')
print(df.duplicated().sum())

Checking for missing (Na) values:
player_id               0
year                    0
pa                      0
hit                     0
single                  0
double                  0
triple                  0
home_run                0
k_percent               0
bb_percent              0
on_base_plus_slg        0
exit_velocity_avg       0
sweet_spot_percent      0
barrel_batted_rate      0
solidcontact_percent    0
hard_hit_percent        0
avg_best_speed          0
avg_hyper_speed         0
whiff_percent           0
swing_percent           0
groundballs_percent     0
flyballs_percent        0
last_name               0
first_name              0
dtype: int64

Checking for duplicated values:
0


In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Correcting the training and validation dataset definition.
train_df = df[(df['year'] == 2021) | (df['year'] == 2022)]
val_df = df[df.year == 2022]  # This could include an unwanted overlap.
test_df = df[df.year == 2023]

# Feature Importance with RandomForestRegressor
feature_rf = RandomForestRegressor()
feature_rf.fit(train_df.drop(columns=['on_base_plus_slg']), train_df['on_base_plus_slg'])
feature_importances = feature_rf.feature_importances_

# Identify top 5 features
indices = np.argsort(feature_importances)[-5:]
top_features = [train_df.drop(columns=['on_base_plus_slg']).columns[i] for i in indices]
print(f"Top 5 features: {top_features}")

# Preparing the datasets using top 5 features

X_train = train_df[top_features]
y_train = train_df['on_base_plus_slg']

X_val = val_df[top_features]
y_val = val_df['on_base_plus_slg']

X_test = test_df[top_features]
y_test = test_df['on_base_plus_slg']

models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'K-Nearest Neighbors': KNeighborsRegressor()
}

best_model = None
best_r2 = float('-inf')

for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    r2 = r2_score(y_val, predictions)
    if r2 > best_r2:
        best_r2 = r2
        best_model = model

# Use the best model to predict on test data
test_predictions = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, test_predictions)
test_r2 = r2_score(y_test, test_predictions)

print(f'Best model: {best_model.__class__.__name__}')
print(f'Test MSE: {test_mse}, Test R2: {test_r2}')

ValueError: could not convert string to float: 'Cabrera'

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0f33139c-97ab-427b-815d-ae67021990b6' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>