### Model Training

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from sklearn.linear_model import LinearRegression

In [2]:
# Read the batting CSV and discard its 'Unnamed: 0' column
# (presumably an index written out when the file was saved - confirm).
# Chaining keeps the cell free of in-place mutation.
batting_df = pd.read_csv('../data/batting_df.csv').drop(columns=['Unnamed: 0'])
batting_df.head()

Unnamed: 0,Player,Season,BA,SLG,launch_angle,sweet_spot_percentage,max_ev,average_ev,fly_ball_line_drive_ev,ground_ball_ev,max_distance,average_distance,average_homerun,hard_hit_95mph+,hard_hit_percentage,hard_hit_swing_percentage,total_barrels,barrels_batted_balls_percentage,barrels_plate_appearance_percentage
0,"Stanton, Giancarlo",2017,0.281,0.631,11.2,31.6,122.2,92.0,100.0,86.8,468,183,418.0,199,45.6,17.0,76,17.4,11.0
1,"Alonso, Pete",2019,0.26,0.583,14.8,36.9,118.3,90.7,96.6,85.9,489,189,414.0,176,42.7,13.7,66,15.8,9.5
2,"Judge, Aaron",2017,0.284,0.627,15.8,38.2,121.1,95.0,100.5,88.6,496,216,413.0,185,54.7,15.1,87,25.7,12.8
3,"Suárez, Eugenio",2019,0.271,0.572,17.7,38.0,112.3,89.3,93.1,86.1,457,195,400.0,157,40.8,13.0,54,13.8,8.2
4,"Davis, Khris",2018,0.247,0.549,18.1,37.7,112.8,92.5,97.2,87.7,438,211,405.0,195,48.1,15.7,70,17.2,10.7


In [3]:
# Plot barrel percentage (x) against slugging percentage (y).
# The title names the two plotted columns so the chart is labelled
# correctly when viewed on its own or in the saved composite.
scatter = batting_df.hvplot.scatter(
    x="barrels_batted_balls_percentage",
    y="SLG",
    title="Barrel Pct vs. SLG"
)
scatter

In [4]:
# Feature matrix: the single predictor column as a 2-D (n_samples, 1) array,
# which is the shape scikit-learn estimators expect for X.
X = batting_df[['barrels_batted_balls_percentage']].to_numpy()
# Target: slugging percentage, kept as a pandas Series.
y = batting_df.loc[:, 'SLG']

In [5]:
# Instantiate an ordinary least-squares regressor from scikit-learn;
# the intercept is fitted (the library default, spelled out here).
model = LinearRegression(fit_intercept=True)

In [6]:
# Estimate the intercept and slope from the feature matrix and target
model.fit(X=X, y=y)

In [7]:
# Show the fitted line as "y = intercept + slope * X"
intercept, slope = model.intercept_, model.coef_[0]
print(f"Model's formula: y = {intercept} + {slope}X")

Model's formula: y = 0.3618155175079031 + 0.011410132849573545X


In [8]:
# Predict SLG for the same observations the model was fitted on
predicted_y_values = model.predict(X=X)

In [9]:
# Build a new frame holding the original data plus a 'Predicted SLG' column;
# `assign` returns a copy, so `batting_df` is left untouched.
predictions_df = batting_df.assign(**{'Predicted SLG': predicted_y_values})

In [10]:
# Draw the fitted values as a red line over the barrel-percentage axis
line_options = {
    'x': 'barrels_batted_balls_percentage',
    'y': 'Predicted SLG',
    'color': 'red',
}
best_fit_line = predictions_df.hvplot.line(**line_options)

# Overlay the regression line on the original scatter plot and display it
composite = scatter * best_fit_line
composite

In [11]:
# Write the composite chart to an HTML file under ../graphs
output_path = '../graphs/barrels_vs_slg.html'
hvplot.save(composite, output_path)

### Model Assessment

In [12]:
# Import libraries
from sklearn.metrics import mean_squared_error, r2_score

In [13]:
# Evaluate the fitted model on the data it was trained on.
# `score` (from the estimator) and `r2` (from sklearn.metrics) are both the
# coefficient of determination, so the two values are expected to match.
score = model.score(X, y)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
# Spread of the observed target, for comparison against the RMSE
std = np.std(y)

# Report each metric on its own line.
for label, value in (
    ("score", score),
    ("r2", r2),
    ("mean squared error", mse),
    ("root mean squared error", rmse),
    ("standard deviation", std),
):
    print(f"The {label} is {value}.")

The score is 0.44203190533586045.
The r2 is 0.44203190533586045.
The mean squared error is 0.0025985976419209207.
The root mean squared error is 0.05097644202885212.
The standard deviation is 0.06824405698504646.
