# Data Cleaning

In [1]:
# Import libraries
import pandas as pd
import psycopg2
import sqlite3

In [2]:
# Load MLB batting statistics
batter_stats = pd.read_csv('data/batter_stats.csv')

batter_stats.head()

Unnamed: 0,Rk,Player,HR,AB,Season,Age,Team,Lg,G,PA,...,SLG,OPS,OPS+,TB,GIDP,HBP,SH,SF,IBB,Pos
0,1,Giancarlo Stanton,59,597,2017,27,MIA,NL,159,692,...,0.631,1.007,169,377,13,7,0,3,13,*9/HD
1,2,Pete Alonso,53,597,2019,24,NYM,NL,161,693,...,0.583,0.941,147,348,13,21,0,3,6,*3/HD
2,3,Aaron Judge,52,542,2017,25,NYY,AL,155,678,...,0.627,1.049,171,340,15,5,0,4,11,*9D/H
3,4,Eugenio Suárez,49,575,2019,27,CIN,NL,159,662,...,0.572,0.93,131,329,12,11,0,6,4,*5/HD
4,5,Khris Davis,48,576,2018,30,OAK,AL,151,654,...,0.549,0.874,137,316,16,12,0,7,5,*D7/H


In [3]:
# Drop unnecessary columns
batter_stats = batter_stats[['Player', 'Season', 'BA', 'SLG']]

batter_stats.head()

Unnamed: 0,Player,Season,BA,SLG
0,Giancarlo Stanton,2017,0.281,0.631
1,Pete Alonso,2019,0.26,0.583
2,Aaron Judge,2017,0.284,0.627
3,Eugenio Suárez,2019,0.271,0.572
4,Khris Davis,2018,0.247,0.549


In [4]:
# Function to reformat player names
def reorder_name(name):
  name_parts = name.split()
  if len(name_parts) >= 2:
    last_name = name_parts[-1]
    first_name = " ".join(name_parts[:-1])
    return f"{last_name}, {first_name}"
  else:
    return name

In [5]:
batter_stats['Player'] = batter_stats['Player'].apply(reorder_name)

batter_stats.head()

Unnamed: 0,Player,Season,BA,SLG
0,"Stanton, Giancarlo",2017,0.281,0.631
1,"Alonso, Pete",2019,0.26,0.583
2,"Judge, Aaron",2017,0.284,0.627
3,"Suárez, Eugenio",2019,0.271,0.572
4,"Davis, Khris",2018,0.247,0.549


In [6]:
# Load StatCast statistics
statcast = pd.read_csv('data/statcast.csv')

statcast.head()

Unnamed: 0,id,rank,year,player,batted_ball_events,launch_angle,sweet_spot_percentage,max_ev,average_ev,fly_ball_line_drive_ev,ground_ball_ev,max_distance,average_distance,average_homerun,hard_hit_95mph+,hard_hit_percentage,hard_hit_swing_percentage,total_barrels,barrels_batted_balls_percentage,barrels_plate_appearance_percentage
0,1,1,2022,"Judge, Aaron",341,14.6,37.8,118.4,95.8,100.2,89.1,465,205,413.0,210,61.6,20.3,91,26.7,15.8
1,2,2,2022,"Alvarez, Yordan",304,12.1,40.1,117.4,95.5,98.2,92.7,469,193,403.0,186,61.2,23.7,59,19.4,12.7
2,3,3,2022,"Trout, Mike",234,24.7,37.2,114.4,91.7,94.9,87.7,472,218,407.0,120,51.3,16.5,46,19.7,11.6
3,4,4,2022,"Schwarber, Kyle",312,18.9,34.6,114.8,93.3,99.7,87.2,468,197,415.0,170,54.5,17.6,64,20.5,11.5
4,5,5,2022,"Stanton, Giancarlo",228,10.2,26.3,119.8,94.6,98.2,94.1,445,160,400.0,117,51.3,17.8,42,18.4,11.3


In [7]:
# Drop unnecessary columns
statcast = statcast[['year', 'player', 'total_barrels', 'barrels_batted_balls_percentage', 'barrels_plate_appearance_percentage']]

# Rename columns
statcast.rename(columns={
    'year': 'Season',
    'player': 'Player'
}, inplace=True)

statcast.head()

Unnamed: 0,Season,Player,total_barrels,barrels_batted_balls_percentage,barrels_plate_appearance_percentage
0,2022,"Judge, Aaron",91,26.7,15.8
1,2022,"Alvarez, Yordan",59,19.4,12.7
2,2022,"Trout, Mike",46,19.7,11.6
3,2022,"Schwarber, Kyle",64,20.5,11.5
4,2022,"Stanton, Giancarlo",42,18.4,11.3


In [8]:
# Strip whitespace
statcast = statcast.applymap(lambda x: x.strip() if isinstance(x, str) else x)

statcast.head()

  statcast = statcast.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Unnamed: 0,Season,Player,total_barrels,barrels_batted_balls_percentage,barrels_plate_appearance_percentage
0,2022,"Judge, Aaron",91,26.7,15.8
1,2022,"Alvarez, Yordan",59,19.4,12.7
2,2022,"Trout, Mike",46,19.7,11.6
3,2022,"Schwarber, Kyle",64,20.5,11.5
4,2022,"Stanton, Giancarlo",42,18.4,11.3


In [9]:
# Merge our DataFrames
_batting_df = batter_stats.merge(statcast, how="inner", on=['Player', 'Season'])

_batting_df.head()

Unnamed: 0,Player,Season,BA,SLG,total_barrels,barrels_batted_balls_percentage,barrels_plate_appearance_percentage
0,"Stanton, Giancarlo",2017,0.281,0.631,76,17.4,11.0
1,"Alonso, Pete",2019,0.26,0.583,66,15.8,9.5
2,"Judge, Aaron",2017,0.284,0.627,87,25.7,12.8
3,"Suárez, Eugenio",2019,0.271,0.572,54,13.8,8.2
4,"Davis, Khris",2018,0.247,0.549,70,17.2,10.7


In [10]:
# Exporting cleaned data to csv
_batting_df.to_csv('data/batting_cleaned.csv', index=False)

# Storing cleaned data in sqlite 

In [11]:
# Set up a database using sqlite
conn = sqlite3.connect('Statcast_DB')

In [12]:
# Create a table for the cleaned batting data
_batting_df.to_sql('batting_cleaned', conn, if_exists='replace')

723

In [13]:
# Import sqlite table back to df
conn = sqlite3.connect('Statcast_DB')
batting_df = pd.read_sql('SELECT * FROM batting_cleaned', conn)

# Random Forest Model

### Preparing data for Random Forest Model

In [14]:
# Replace all above-average BAs with 1 and below-average BAs with 0
batting_df.loc[batting_df['BA'] >= batting_df['BA'].quantile(0.5), 'BA'] = 1
batting_df.loc[batting_df['BA'] < batting_df['BA'].quantile(0.5), 'BA'] = 0

# Replace all above-average SLGs with 1 and below-average SLGs with 0
batting_df.loc[batting_df['SLG'] >= batting_df['SLG'].quantile(0.5), 'SLG'] = 1
batting_df.loc[batting_df['SLG'] < batting_df['SLG'].quantile(0.5), 'SLG'] = 0

# Rename columns
batting_df.rename(columns={
    'BA': 'BA > AVG',
    'SLG': 'SLG > AVG'
}, inplace=True)

batting_df.head()

Unnamed: 0,index,Player,Season,BA > AVG,SLG > AVG,total_barrels,barrels_batted_balls_percentage,barrels_plate_appearance_percentage
0,0,"Stanton, Giancarlo",2017,1.0,1.0,76,17.4,11.0
1,1,"Alonso, Pete",2019,0.0,1.0,66,15.8,9.5
2,2,"Judge, Aaron",2017,1.0,1.0,87,25.7,12.8
3,3,"Suárez, Eugenio",2019,1.0,1.0,54,13.8,8.2
4,4,"Davis, Khris",2018,0.0,1.0,70,17.2,10.7


### Training the Random Forest Model

In [15]:
# Import libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [16]:
# Define features set
X = batting_df.copy()
X.drop(['Player', 'Season', 'BA > AVG', 'SLG > AVG'], axis=1, inplace=True)
X.head()

Unnamed: 0,index,total_barrels,barrels_batted_balls_percentage,barrels_plate_appearance_percentage
0,0,76,17.4,11.0
1,1,66,15.8,9.5
2,2,87,25.7,12.8
3,3,54,13.8,8.2
4,4,70,17.2,10.7


In [17]:
# Define target vector
y = batting_df['SLG > AVG'].ravel()
y[:5]

array([1., 1., 1., 1., 1.])

In [18]:
# Splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [19]:
# Create StandardScaler
scaler = StandardScaler()

In [20]:
# Fitting standard scaler
X_scaler = scaler.fit(X_train)

In [21]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [22]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=1000, random_state=1)

In [23]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [24]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [25]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [26]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,77,12
Actual 1,20,72


Accuracy Score : 0.8232044198895028
Classification Report
              precision    recall  f1-score   support

         0.0       0.79      0.87      0.83        89
         1.0       0.86      0.78      0.82        92

    accuracy                           0.82       181
   macro avg       0.83      0.82      0.82       181
weighted avg       0.83      0.82      0.82       181



# Optimization Attemp 1