# Import libraries

In [13]:
# Load necessary libraries
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Load raw Data for Pitchers from 2017 till 2022

In [34]:
# Load data from CSV files
pitch_2017 = pd.read_csv("../Data/Raw_Data/pitch_arsenals_2017.csv")
pitch_2018 = pd.read_csv("../Data/Raw_Data/pitch_arsenals_2018.csv")
pitch_2019 = pd.read_csv("../Data/Raw_Data/pitch_arsenals_2019.csv")
pitch_2020 = pd.read_csv("../Data/Raw_Data/pitch_arsenals_2020.csv")
pitch_2021 = pd.read_csv("../Data/Raw_Data/pitch_arsenals_2021.csv")
pitch_2022 = pd.read_csv("../Data/Raw_Data/pitch_arsenals_2022.csv")

In [35]:
# Adding a year column to each DataFrame
pitch_2017.insert(0,'Year',2017)
pitch_2018.insert(0,'Year',2018)
pitch_2019.insert(0,'Year',2019)
pitch_2020.insert(0,'Year',2020)
pitch_2021.insert(0,'Year',2021)
pitch_2022.insert(0,'Year',2022)

# Preparing the Data

In [37]:
# Concatinate 6 DataFrames in one 
pitch_all=pd.concat([pitch_2017, pitch_2018, pitch_2019, pitch_2020, pitch_2021, pitch_2022], axis=0)
pitch_all

Unnamed: 0,Year,last_name,first_name,pitcher,ff_avg_speed,si_avg_speed,fc_avg_speed,sl_avg_speed,ch_avg_speed,cu_avg_speed,fs_avg_speed,kn_avg_speed
0,2017,Verlander,Justin,434378,95.3,95.6,92.1,88.2,87.6,80.4,,
1,2017,Sale,Chris,519242,94.8,93.6,,79.8,86.8,,,
2,2017,Archer,Chris,502042,95.5,,,88.9,85.9,,,
3,2017,Porcello,Rick,519144,92.1,89.7,,85.6,80.3,74.7,,
4,2017,González,Gio,461829,90.0,89.3,,,83.0,74.8,,
...,...,...,...,...,...,...,...,...,...,...,...,...
597,2022,Castro,Anthony,621593,94.8,95.5,,84.4,87.3,,,
598,2022,Cruz,Fernando,518585,94.4,93.4,,86.7,,,81.7,
599,2022,Scott,Tayler,605463,94.1,93.5,,83.9,89.2,,,
600,2022,Nelson,Ryne,669194,94.8,,,82.5,81.8,76.9,,


In [38]:
# Number of rows and column in the pop time dataset
print("The number of rows in the pitch dataset are:", pitch_all.shape[0])
print("The number of columns in pitch dataset are:", pitch_all.shape[1])

The number of rows in the pitch dataset are: 3393
The number of columns in pitch dataset are: 12


In [42]:
# Check DataFrame info
pitch_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3393 entries, 0 to 601
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Year          3393 non-null   int64  
 1   last_name     3393 non-null   object 
 2    first_name   3393 non-null   object 
 3   pitcher       3393 non-null   int64  
 4   ff_avg_speed  3243 non-null   float64
 5   si_avg_speed  2445 non-null   float64
 6   fc_avg_speed  1007 non-null   float64
 7   sl_avg_speed  2666 non-null   float64
 8   ch_avg_speed  2688 non-null   float64
 9   cu_avg_speed  2216 non-null   float64
 10  fs_avg_speed  276 non-null    float64
 11  kn_avg_speed  3 non-null      float64
dtypes: float64(8), int64(2), object(2)
memory usage: 344.6+ KB


In [43]:
# Drop pitcher ID column 
pitch_all=pitch_all.drop(['pitcher'], axis = 1, inplace = False)
pitch_all

Unnamed: 0,Year,last_name,first_name,ff_avg_speed,si_avg_speed,fc_avg_speed,sl_avg_speed,ch_avg_speed,cu_avg_speed,fs_avg_speed,kn_avg_speed
0,2017,Verlander,Justin,95.3,95.6,92.1,88.2,87.6,80.4,,
1,2017,Sale,Chris,94.8,93.6,,79.8,86.8,,,
2,2017,Archer,Chris,95.5,,,88.9,85.9,,,
3,2017,Porcello,Rick,92.1,89.7,,85.6,80.3,74.7,,
4,2017,González,Gio,90.0,89.3,,,83.0,74.8,,
...,...,...,...,...,...,...,...,...,...,...,...
597,2022,Castro,Anthony,94.8,95.5,,84.4,87.3,,,
598,2022,Cruz,Fernando,94.4,93.4,,86.7,,,81.7,
599,2022,Scott,Tayler,94.1,93.5,,83.9,89.2,,,
600,2022,Nelson,Ryne,94.8,,,82.5,81.8,76.9,,


In [45]:
# Rename Columns 
renamed_pitch = pitch_all.rename(columns={
    "Year":"Year",
    "last_name":"Last_Name",
    "first_name": "First_Name",
    "ff_avg_speed":"4_Seamer",
    "si_avg_speed":"Sinker",
    "fc_avg_speed":"Cutter",
    "sl_avg_speed":"Slider",
    "ch_avg_speed": "Changeup",
    "cu_avg_speed": "Curve",
    "fs_avg_speed": "Splitter",
    "kn_avg_speed":"Knuckle" })
renamed_pitch.head()

Unnamed: 0,Year,Last_Name,first_name,4_Seamer,Sinker,Cutter,Slider,Changeup,Curve,Splitter,Knuckle
0,2017,Verlander,Justin,95.3,95.6,92.1,88.2,87.6,80.4,,
1,2017,Sale,Chris,94.8,93.6,,79.8,86.8,,,
2,2017,Archer,Chris,95.5,,,88.9,85.9,,,
3,2017,Porcello,Rick,92.1,89.7,,85.6,80.3,74.7,,
4,2017,González,Gio,90.0,89.3,,,83.0,74.8,,


In [46]:
# Check null values
renamed_pitch.isnull().sum()

Year              0
Last_Name         0
 first_name       0
4_Seamer        150
Sinker          948
Cutter         2386
Slider          727
Changeup        705
Curve          1177
Splitter       3117
Knuckle        3390
dtype: int64

There's only 3 knuckball pitching cases. Knuckball is a very rare pitch and the pitchers who throw it during games tend to use it almost exclusively. The goal of a knuckleball is to eliminate almost all of the spin on the baseball, causing it to flutter unpredictably on its way to the plate. Therefore : 