In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
pd.set_option('display.max_columns', None)

In [2]:
df=pd.read_csv('all_df.csv')

In [3]:
### will divide df into different dataframes by season

In [4]:
#convert the game id column to a string
dfy = df.g_id.astype('str')

In [6]:
#Since the year is the first four characters, we will isolate those characters
dfy = dfy.str[:4]

In [8]:
dfy.head()

0    2015
1    2015
2    2015
3    2015
4    2015
Name: g_id, dtype: object

In [9]:
#make new columnfor year
df['year'] = dfy.values

In [10]:
df.rename(columns= {'Pitchers Name': 'Pitcher_name', 'Batters Name': 'Batter_name'}, inplace= True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2867154 entries, 0 to 2867153
Data columns (total 72 columns):
Unnamed: 0          int64
px                  float64
pz                  float64
start_speed         float64
end_speed           float64
spin_rate           float64
spin_dir            float64
break_angle         float64
break_length        float64
break_y             float64
ax                  float64
ay                  float64
az                  float64
sz_bot              float64
sz_top              float64
type_confidence     float64
vx0                 float64
vy0                 float64
vz0                 float64
x                   float64
x0                  float64
y                   float64
y0                  float64
z0                  float64
pfx_x               float64
pfx_z               float64
nasty               float64
zone                float64
code                object
type                object
pitch_type          object
event_num           int6

##### will get rid of unneccesary data for a new Dataframe, divide data by seasons since there is almost 3 million data points

In [12]:
main= pd.DataFrame(df[['pitch_type', 'Pitcher_name', 'Batter_name', 'b_score', 's_count', 'outs', 'pitch_num', \
                            'on_1b', 'on_2b', 'on_3b', 'batter_id', 'inning', 'p_score', 'p_throws',\
                             'pitcher_id', 'stand', 'top', 'b_count', 'weather', 'year']])

In [15]:
main.head()

Unnamed: 0,pitch_type,Pitcher_name,Batter_name,b_score,s_count,outs,pitch_num,on_1b,on_2b,on_3b,batter_id,inning,p_score,p_throws,pitcher_id,stand,top,b_count,weather,year
0,FF,Jon Lester,Matt Carpenter,0.0,0.0,0.0,1.0,0.0,0.0,0.0,572761,1,0,L,452657,L,True,0.0,"44 degrees, clear",2015
1,FF,Jon Lester,Matt Carpenter,0.0,1.0,0.0,2.0,0.0,0.0,0.0,572761,1,0,L,452657,L,True,0.0,"44 degrees, clear",2015
2,FF,Jon Lester,Matt Carpenter,0.0,2.0,0.0,3.0,0.0,0.0,0.0,572761,1,0,L,452657,L,True,0.0,"44 degrees, clear",2015
3,FF,Jon Lester,Matt Carpenter,0.0,2.0,0.0,4.0,0.0,0.0,0.0,572761,1,0,L,452657,L,True,0.0,"44 degrees, clear",2015
4,CU,Jon Lester,Matt Carpenter,0.0,2.0,0.0,5.0,0.0,0.0,0.0,572761,1,0,L,452657,L,True,1.0,"44 degrees, clear",2015


In [31]:
main.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2852965 entries, 0 to 2867153
Data columns (total 20 columns):
pitch_type      object
Pitcher_name    object
Batter_name     object
b_score         float64
s_count         float64
outs            float64
pitch_num       float64
on_1b           float64
on_2b           float64
on_3b           float64
batter_id       int64
inning          int64
p_score         int64
p_throws        object
pitcher_id      int64
stand           object
top             bool
b_count         float64
weather         object
year            object
dtypes: bool(1), float64(8), int64(4), object(7)
memory usage: 438.0+ MB


In [25]:
#checking for null values
main.isnull().sum()

pitch_type      14189
Pitcher_name        0
Batter_name         0
b_score             0
s_count             0
outs                0
pitch_num           0
on_1b               0
on_2b               0
on_3b               0
batter_id           0
inning              0
p_score             0
p_throws            0
pitcher_id          0
stand               0
top                 0
b_count             0
weather             0
year                0
dtype: int64

#### Since <1% of the total observations contain null values (under pitch_type)we can drop the null values from the dataset 

In [26]:
#dropping null values
main = main.dropna()

In [28]:
main.isnull().sum()

pitch_type      0
Pitcher_name    0
Batter_name     0
b_score         0
s_count         0
outs            0
pitch_num       0
on_1b           0
on_2b           0
on_3b           0
batter_id       0
inning          0
p_score         0
p_throws        0
pitcher_id      0
stand           0
top             0
b_count         0
weather         0
year            0
dtype: int64

In [33]:
main.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2852965 entries, 0 to 2867153
Data columns (total 20 columns):
pitch_type      object
Pitcher_name    object
Batter_name     object
b_score         float64
s_count         float64
outs            float64
pitch_num       float64
on_1b           float64
on_2b           float64
on_3b           float64
batter_id       int64
inning          int64
p_score         int64
p_throws        object
pitcher_id      int64
stand           object
top             bool
b_count         float64
weather         object
year            object
dtypes: bool(1), float64(8), int64(4), object(7)
memory usage: 438.0+ MB


In [21]:
#isolating the data that only contains the 2015 season
df15 = main[main['year'] == '2015']
df15.to_csv('df15.csv')

In [22]:
#isolating the data that only contains the 2016 season
df16 = main[main['year'] == '2016']
df16.to_csv('df16.csv')

In [23]:
#isolating the data that only contains the 2017 season
df17 = main[main['year'] == '2017']
df17.to_csv('df17.csv')

In [24]:
#isolating the data that only contains the 2018 season
df18 = main[main['year'] == '2018']
df18.to_csv('df18.csv')

In [34]:
df18.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 724444 entries, 2142710 to 2867153
Data columns (total 20 columns):
pitch_type      718322 non-null object
Pitcher_name    724444 non-null object
Batter_name     724444 non-null object
b_score         724444 non-null float64
s_count         724444 non-null float64
outs            724444 non-null float64
pitch_num       724444 non-null float64
on_1b           724444 non-null float64
on_2b           724444 non-null float64
on_3b           724444 non-null float64
batter_id       724444 non-null int64
inning          724444 non-null int64
p_score         724444 non-null int64
p_throws        724444 non-null object
pitcher_id      724444 non-null int64
stand           724444 non-null object
top             724444 non-null bool
b_count         724444 non-null float64
weather         724444 non-null object
year            724444 non-null object
dtypes: bool(1), float64(8), int64(4), object(7)
memory usage: 111.2+ MB


In [None]:
df15.describe()