# Import libraries

In [1]:
# Load necessary libraries
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Load raw Data for Pitchers from 2017 till 2022

In [2]:
# Load data from CSV files
pitch_2017 = pd.read_csv("../Data/Raw_Data/pitch_arsenals_2017.csv")
pitch_2018 = pd.read_csv("../Data/Raw_Data/pitch_arsenals_2018.csv")
pitch_2019 = pd.read_csv("../Data/Raw_Data/pitch_arsenals_2019.csv")
pitch_2020 = pd.read_csv("../Data/Raw_Data/pitch_arsenals_2020.csv")
pitch_2021 = pd.read_csv("../Data/Raw_Data/pitch_arsenals_2021.csv")
pitch_2022 = pd.read_csv("../Data/Raw_Data/pitch_arsenals_2022.csv")

In [3]:
# Adding a year column to each DataFrame
pitch_2017.insert(0,'Year',2017)
pitch_2018.insert(0,'Year',2018)
pitch_2019.insert(0,'Year',2019)
pitch_2020.insert(0,'Year',2020)
pitch_2021.insert(0,'Year',2021)
pitch_2022.insert(0,'Year',2022)

# Preparing the Data

In [4]:
# Concatinate 6 DataFrames in one 
pitch_all=pd.concat([pitch_2022, pitch_2021, pitch_2020, pitch_2019, pitch_2018, pitch_2017], axis=0)
pitch_all

Unnamed: 0,Year,last_name,first_name,pitcher,ff_avg_speed,si_avg_speed,fc_avg_speed,sl_avg_speed,ch_avg_speed,cu_avg_speed,fs_avg_speed,kn_avg_speed
0,2022,Burnes,Corbin,669203,96.1,96.3,95.0,88.2,90.3,81.6,,
1,2022,Cole,Gerrit,543037,97.8,,92.0,88.7,89.7,83.0,,
2,2022,Alcantara,Sandy,645261,98.0,97.8,,90.0,91.8,86.2,,
3,2022,Mikolas,Miles,571945,93.5,93.0,,87.7,82.7,76.2,,
4,2022,Cease,Dylan,656302,96.8,96.5,,87.4,77.9,81.1,,
...,...,...,...,...,...,...,...,...,...,...,...,...
566,2017,Goldberg,Brad,643329,96.5,96.3,,88.0,90.8,,,
567,2017,Latos,Mat,502009,91.6,90.6,,86.3,81.3,75.7,,
568,2017,Alvarez III,Henderson,506693,91.5,91.5,,85.5,87.0,79.5,,
569,2017,Germán,Domingo,593334,96.3,96.5,,,88.3,81.8,,


In [5]:
# Rename Columns 
renamed_pitch = pitch_all.rename(columns={
    "Year":"year",
    "last_name":"last_name",
    "first_name": "first_Name",
    "pitcher":"Pitcher_ID",
    "ff_avg_speed":"4_Seamer",
    "si_avg_speed":"Sinker",
    "fc_avg_speed":"Cutter",
    "sl_avg_speed":"Slider",
    "ch_avg_speed": "Changeup",
    "cu_avg_speed": "Curve",
    "fs_avg_speed": "Splitter",
    "kn_avg_speed":"Knuckle" })
renamed_pitch.head()

Unnamed: 0,year,last_name,first_name,Pitcher_ID,4_Seamer,Sinker,Cutter,Slider,Changeup,Curve,Splitter,Knuckle
0,2022,Burnes,Corbin,669203,96.1,96.3,95.0,88.2,90.3,81.6,,
1,2022,Cole,Gerrit,543037,97.8,,92.0,88.7,89.7,83.0,,
2,2022,Alcantara,Sandy,645261,98.0,97.8,,90.0,91.8,86.2,,
3,2022,Mikolas,Miles,571945,93.5,93.0,,87.7,82.7,76.2,,
4,2022,Cease,Dylan,656302,96.8,96.5,,87.4,77.9,81.1,,


In [6]:
# Drop pitcher ID column 
pitch_df=renamed_pitch.drop(['Pitcher_ID'], axis = 1, inplace = False)
pitch_df

Unnamed: 0,year,last_name,first_name,4_Seamer,Sinker,Cutter,Slider,Changeup,Curve,Splitter,Knuckle
0,2022,Burnes,Corbin,96.1,96.3,95.0,88.2,90.3,81.6,,
1,2022,Cole,Gerrit,97.8,,92.0,88.7,89.7,83.0,,
2,2022,Alcantara,Sandy,98.0,97.8,,90.0,91.8,86.2,,
3,2022,Mikolas,Miles,93.5,93.0,,87.7,82.7,76.2,,
4,2022,Cease,Dylan,96.8,96.5,,87.4,77.9,81.1,,
...,...,...,...,...,...,...,...,...,...,...,...
566,2017,Goldberg,Brad,96.5,96.3,,88.0,90.8,,,
567,2017,Latos,Mat,91.6,90.6,,86.3,81.3,75.7,,
568,2017,Alvarez III,Henderson,91.5,91.5,,85.5,87.0,79.5,,
569,2017,Germán,Domingo,96.3,96.5,,,88.3,81.8,,


In [7]:
# Check DataFrame info
pitch_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3393 entries, 0 to 570
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   year         3393 non-null   int64  
 1   last_name    3393 non-null   object 
 2    first_name  3393 non-null   object 
 3   4_Seamer     3243 non-null   float64
 4   Sinker       2445 non-null   float64
 5   Cutter       1007 non-null   float64
 6   Slider       2666 non-null   float64
 7   Changeup     2688 non-null   float64
 8   Curve        2216 non-null   float64
 9   Splitter     276 non-null    float64
 10  Knuckle      3 non-null      float64
dtypes: float64(8), int64(1), object(2)
memory usage: 318.1+ KB


In [8]:
# Check null values
pitch_df.isnull().sum()

year              0
last_name         0
 first_name       0
4_Seamer        150
Sinker          948
Cutter         2386
Slider          727
Changeup        705
Curve          1177
Splitter       3117
Knuckle        3390
dtype: int64

There's only 3 knuckball pitching cases. Knuckball is a very rare pitch and the pitchers who throw it during games tend to use it almost exclusively. The goal of a knuckleball is to eliminate almost all of the spin on the baseball, causing it to flutter unpredictably on its way to the plate. Therefore, it's important to keep all of these data. 

In [9]:
# Number of rows and column in the pop time dataset
print("The number of rows in the pitch dataset are:", pitch_df.shape[0])
print("The number of columns in pitch dataset are:", pitch_df.shape[1])

The number of rows in the pitch dataset are: 3393
The number of columns in pitch dataset are: 11


# Calculate the time 

In [10]:
# Distance between pitcher & home base is 78 inches 
d=78/63360

In [11]:
# Calculate the time of picthing between the pitcher & the home base
pitch_df['4_Seamer_T']= (d/pitch_df['4_Seamer'])*3600
pitch_df['Sinker_T']= (d/pitch_df['Sinker'])*3600
pitch_df['Cutter_T']= (d/pitch_df['Cutter'])*3600
pitch_df['Slider_T']= (d/pitch_df['Slider'])*3600
pitch_df['Changeup_T']= (d/pitch_df['Changeup'])*3600
pitch_df['Curve_T']= (d/pitch_df['Curve'])*3600
pitch_df['Splitter_T']= (d/pitch_df['Splitter'])*3600
pitch_df['Knuckle_T']= (d/pitch_df['Knuckle'])*3600
pitch_df.head()

Unnamed: 0,year,last_name,first_name,4_Seamer,Sinker,Cutter,Slider,Changeup,Curve,Splitter,Knuckle,4_Seamer_T,Sinker_T,Cutter_T,Slider_T,Changeup_T,Curve_T,Splitter_T,Knuckle_T
0,2022,Burnes,Corbin,96.1,96.3,95.0,88.2,90.3,81.6,,,0.046117,0.046021,0.046651,0.050247,0.049079,0.054311,,
1,2022,Cole,Gerrit,97.8,,92.0,88.7,89.7,83.0,,,0.045315,,0.048172,0.049964,0.049407,0.053395,,
2,2022,Alcantara,Sandy,98.0,97.8,,90.0,91.8,86.2,,,0.045223,0.045315,,0.049242,0.048277,0.051413,,
3,2022,Mikolas,Miles,93.5,93.0,,87.7,82.7,76.2,,,0.047399,0.047654,,0.050534,0.053589,0.05816,,
4,2022,Cease,Dylan,96.8,96.5,,87.4,77.9,81.1,,,0.045783,0.045926,,0.050707,0.056891,0.054646,,


In [12]:
# Create a DataFrame for the picthing time 
pitch_time_df=pitch_df.drop(['4_Seamer_T','Sinker_T','Cutter_T','Slider_T','Changeup_T','Curve_T','Splitter_T','Knuckle_T'], axis = 1, inplace = False)
pitch_time_df.head()

Unnamed: 0,year,last_name,first_name,4_Seamer,Sinker,Cutter,Slider,Changeup,Curve,Splitter,Knuckle
0,2022,Burnes,Corbin,96.1,96.3,95.0,88.2,90.3,81.6,,
1,2022,Cole,Gerrit,97.8,,92.0,88.7,89.7,83.0,,
2,2022,Alcantara,Sandy,98.0,97.8,,90.0,91.8,86.2,,
3,2022,Mikolas,Miles,93.5,93.0,,87.7,82.7,76.2,,
4,2022,Cease,Dylan,96.8,96.5,,87.4,77.9,81.1,,


In [15]:
# Export DataFrame to csv
pitch_time_df.to_csv('../Data/Clean_Data/pitch_time_clean.csv', index=False)