In [105]:
# Import required libraries
from path import Path
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pybaseball import batting_stats
from pybaseball import batting_stats_range
from pybaseball import pitching_stats_range
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Analysing *batting_stats()* with different paramters

In [106]:
all_data_2019 = batting_stats(2019)
players_with_more_than_50 = batting_stats(2019, qual=50)
all_data_since_2015 = batting_stats(2015, 2019)
aggregated_data = batting_stats(2010, 2016, ind=0)

print(all_data_2019.head())
print(players_with_more_than_50.head())
print(all_data_since_2015.head())
print(aggregated_data.head())

    Season              Name       Team   Age      G     AB     PA      H  \
20  2019.0        Mike Trout     Angels  27.0  134.0  470.0  600.0  137.0   
28  2019.0      Alex Bregman     Astros  25.0  156.0  554.0  690.0  164.0   
19  2019.0  Christian Yelich    Brewers  27.0  130.0  489.0  580.0  161.0   
32  2019.0    Cody Bellinger    Dodgers  23.0  156.0  558.0  660.0  170.0   
83  2019.0     Marcus Semien  Athletics  28.0  162.0  657.0  747.0  187.0   

       1B    2B  ...  wSL/C (pi)  wXX/C (pi)  O-Swing% (pi)  Z-Swing% (pi)  \
20   63.0  27.0  ...        2.44       -3.09          0.191          0.559   
28   84.0  37.0  ...        1.28         NaN          0.169          0.552   
19   85.0  29.0  ...        2.47         NaN          0.290          0.670   
32   86.0  34.0  ...        1.00         NaN          0.262          0.654   
83  104.0  43.0  ...       -1.01        3.31          0.218          0.628   

    Swing% (pi)  O-Contact% (pi)  Z-Contact% (pi)  Contact% (pi)  Zo

# Conclusion for batting_stats()
Very versitile function that can take one or two years as parameters (single season or multiple seasons). It would return a dataframe with 287 columns with all the batting statistics for every single player for the whole season/seasons. A lot of statistics!

### Analysing *schedule_and_record()*

In [107]:
from pybaseball import schedule_and_record

Astros_record = schedule_and_record(2019, "HOU")
Astros_record.head(20)

Unnamed: 0,Date,Tm,Home_Away,Opp,W/L,R,RA,Inn,W-L,Rank,GB,Win,Loss,Save,Time,D/N,Attendance,Streak,Orig. Scheduled
1,"Thursday, Mar 28",HOU,@,TBR,W,5.0,1.0,9.0,1-0,2.0,1.0,Verlander,Snell,,2:38,D,25025.0,1,
2,"Friday, Mar 29",HOU,@,TBR,L,2.0,4.0,9.0,1-1,2.0,1.0,Morton,Cole,Alvarado,2:38,N,13059.0,-1,
3,"Saturday, Mar 30",HOU,@,TBR,L,1.0,3.0,9.0,1-2,3.0,2.0,Glasnow,McHugh,Alvarado,2:40,N,16010.0,-2,
4,"Sunday, Mar 31",HOU,@,TBR,L,1.0,3.0,9.0,1-3,4.0,3.0,Chirinos,Miley,Castillo,2:12,D,18473.0,-3,
5,"Monday, Apr 1",HOU,@,TEX,W,2.0,1.0,9.0,2-3,4.0,3.0,Peacock,Sampson,Osuna,2:39,N,18056.0,1,
6,"Tuesday, Apr 2",HOU,@,TEX,L,4.0,6.0,9.0,2-4,4.0,4.0,Kelley,Valdez,Leclerc,3:23,N,17907.0,-1,
7,"Wednesday, Apr 3",HOU,@,TEX,L,0.0,4.0,9.0,2-5,4.0,4.5,Minor,Cole,,2:39,N,22265.0,-2,
8,"Friday, Apr 5",HOU,Home,OAK,W,3.0,2.0,9.0,3-5,4.0,3.5,McHugh,Montas,Osuna,3:03,N,43165.0,1,
9,"Saturday, Apr 6",HOU,Home,OAK,W,6.0,0.0,9.0,4-5,4.0,3.5,Miley,Brooks,,2:47,N,34487.0,2,
10,"Sunday, Apr 7",HOU,Home,OAK,W-wo,9.0,8.0,9.0,5-5,2.0,3.5,Osuna,Treinen,,3:39,D,34902.0,3,


In [108]:
Astros_record.columns

Index(['Date', 'Tm', 'Home_Away', 'Opp', 'W/L', 'R', 'RA', 'Inn', 'W-L',
       'Rank', 'GB', 'Win', 'Loss', 'Save', 'Time', 'D/N', 'Attendance',
       'Streak', 'Orig. Scheduled'],
      dtype='object')

### Conclusion for schedule_and_record()
Very useful function that can get the players who played in each game, and also all the dates in which each a team played in a season, among others. It only has 19 columns, but this will be used as a *link* table between the others.

## Analysing *batting_stats_range()*

In [109]:
data_range = batting_stats_range("2019-09-01","2019-09-30")
data_range

Unnamed: 0,Name,Age,#days,Lev,Tm,G,PA,AB,R,H,...,HBP,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS
1,Jose Abreu,32,42,MLB-AL,Chicago,25,114,98,17,28,...,4,0,3,4,0,0,0.286,0.360,0.520,0.880
2,Ronald Acuna Jr.,21,47,MLB-NL,Atlanta,20,86,73,20,17,...,0,0,0,1,6,1,0.233,0.349,0.507,0.856
3,Cristhian Adames,27,42,MLB-NL,San Francisco,10,24,22,1,7,...,0,0,0,0,0,0,0.318,0.375,0.364,0.739
4,Willy Adames,23,42,MLB-AL,Tampa Bay,24,83,74,11,20,...,0,0,1,1,0,1,0.270,0.337,0.459,0.797
5,Matt Adams,30,42,MLB-NL,Washington,10,30,29,1,4,...,0,0,0,1,0,0,0.138,0.167,0.138,0.305
6,Ehire Adrianza,29,59,MLB-AL,Minnesota,5,17,15,1,3,...,0,1,1,0,0,0,0.200,0.188,0.467,0.654
7,Dario Agrazal,24,46,MLB-NL,Pittsburgh,4,7,7,0,2,...,0,0,0,0,0,0,0.286,0.286,0.286,0.571
8,Jesus Aguilar,29,42,MLB-AL,Tampa Bay,14,30,24,2,5,...,0,0,2,1,0,0,0.208,0.300,0.458,0.758
9,Nick Ahmed,29,42,MLB-NL,Arizona,24,96,86,8,18,...,2,0,1,3,1,1,0.209,0.281,0.372,0.653
10,R.J. Alaniz,28,43,MLB-NL,Cincinnati,1,1,1,0,1,...,0,0,0,0,0,0,1.000,1.000,1.000,2.000


In [110]:
print(data_range.columns)
print(all_data_2019.columns)

Index(['Name', 'Age', '#days', 'Lev', 'Tm', 'G', 'PA', 'AB', 'R', 'H', '2B',
       '3B', 'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SH', 'SF', 'GDP', 'SB',
       'CS', 'BA', 'OBP', 'SLG', 'OPS'],
      dtype='object')
Index(['Season', 'Name', 'Team', 'Age', 'G', 'AB', 'PA', 'H', '1B', '2B',
       ...
       'wSL/C (pi)', 'wXX/C (pi)', 'O-Swing% (pi)', 'Z-Swing% (pi)',
       'Swing% (pi)', 'O-Contact% (pi)', 'Z-Contact% (pi)', 'Contact% (pi)',
       'Zone% (pi)', 'Pace (pi)'],
      dtype='object', length=287)


### Conclusion for *batting_stats_range()*

this functions returns a much smaller dataframe with only 27 columns. This will be useful to get the most recent information for each player right before each game.

# Let's look at all the columns for batting_stats() and batting_stats_range()

In [111]:
print(all_data_2019.columns[:95])

Index(['Season', 'Name', 'Team', 'Age', 'G', 'AB', 'PA', 'H', '1B', '2B', '3B',
       'HR', 'R', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SF', 'SH', 'GDP', 'SB',
       'CS', 'AVG', 'GB', 'FB', 'LD', 'IFFB', 'Pitches', 'Balls', 'Strikes',
       'IFH', 'BU', 'BUH', 'BB%', 'K%', 'BB/K', 'OBP', 'SLG', 'OPS', 'ISO',
       'BABIP', 'GB/FB', 'LD%', 'GB%', 'FB%', 'IFFB%', 'HR/FB', 'IFH%', 'BUH%',
       'wOBA', 'wRAA', 'wRC', 'Bat', 'Fld', 'Rep', 'Pos', 'RAR', 'WAR', 'Dol',
       'Spd', 'wRC+', 'WPA', '-WPA', '+WPA', 'RE24', 'REW', 'pLI', 'phLI',
       'PH', 'WPA/LI', 'Clutch', 'FB% (Pitch)', 'FBv', 'SL%', 'SLv', 'CT%',
       'CTv', 'CB%', 'CBv', 'CH%', 'CHv', 'SF%', 'SFv', 'KN%', 'KNv', 'XX%',
       'PO%', 'wFB', 'wSL', 'wCT', 'wCB', 'wCH', 'wSF', 'wKN'],
      dtype='object')


In [112]:
print(all_data_2019.columns[95:190])

Index(['wFB/C', 'wSL/C', 'wCT/C', 'wCB/C', 'wCH/C', 'wSF/C', 'wKN/C',
       'O-Swing%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%',
       'Contact%', 'Zone%', 'F-Strike%', 'SwStr%', 'BsR', 'FA% (pfx)',
       'FT% (pfx)', 'FC% (pfx)', 'FS% (pfx)', 'FO% (pfx)', 'SI% (pfx)',
       'SL% (pfx)', 'CU% (pfx)', 'KC% (pfx)', 'EP% (pfx)', 'CH% (pfx)',
       'SC% (pfx)', 'KN% (pfx)', 'UN% (pfx)', 'vFA (pfx)', 'vFT (pfx)',
       'vFC (pfx)', 'vFS (pfx)', 'vFO (pfx)', 'vSI (pfx)', 'vSL (pfx)',
       'vCU (pfx)', 'vKC (pfx)', 'vEP (pfx)', 'vCH (pfx)', 'vSC (pfx)',
       'vKN (pfx)', 'FA-X (pfx)', 'FT-X (pfx)', 'FC-X (pfx)', 'FS-X (pfx)',
       'FO-X (pfx)', 'SI-X (pfx)', 'SL-X (pfx)', 'CU-X (pfx)', 'KC-X (pfx)',
       'EP-X (pfx)', 'CH-X (pfx)', 'SC-X (pfx)', 'KN-X (pfx)', 'FA-Z (pfx)',
       'FT-Z (pfx)', 'FC-Z (pfx)', 'FS-Z (pfx)', 'FO-Z (pfx)', 'SI-Z (pfx)',
       'SL-Z (pfx)', 'CU-Z (pfx)', 'KC-Z (pfx)', 'EP-Z (pfx)', 'CH-Z (pfx)',
       'SC-Z (pfx)', 'KN-Z (pfx)', 'wFA (pfx)'

In [113]:
print(all_data_2019.columns[190:])

Index(['wKN/C (pfx)', 'O-Swing% (pfx)', 'Z-Swing% (pfx)', 'Swing% (pfx)',
       'O-Contact% (pfx)', 'Z-Contact% (pfx)', 'Contact% (pfx)', 'Zone% (pfx)',
       'Pace', 'Def', 'wSB', 'UBR', 'Age Rng', 'Off', 'Lg', 'wGDP', 'Pull%',
       'Cent%', 'Oppo%', 'Soft%', 'Med%', 'Hard%', 'TTO%', 'CH% (pi)',
       'CS% (pi)', 'CU% (pi)', 'FA% (pi)', 'FC% (pi)', 'FS% (pi)', 'KN% (pi)',
       'SB% (pi)', 'SI% (pi)', 'SL% (pi)', 'XX% (pi)', 'vCH (pi)', 'vCS (pi)',
       'vCU (pi)', 'vFA (pi)', 'vFC (pi)', 'vFS (pi)', 'vKN (pi)', 'vSB (pi)',
       'vSI (pi)', 'vSL (pi)', 'vXX (pi)', 'CH-X (pi)', 'CS-X (pi)',
       'CU-X (pi)', 'FA-X (pi)', 'FC-X (pi)', 'FS-X (pi)', 'KN-X (pi)',
       'SB-X (pi)', 'SI-X (pi)', 'SL-X (pi)', 'XX-X (pi)', 'CH-Z (pi)',
       'CS-Z (pi)', 'CU-Z (pi)', 'FA-Z (pi)', 'FC-Z (pi)', 'FS-Z (pi)',
       'KN-Z (pi)', 'SB-Z (pi)', 'SI-Z (pi)', 'SL-Z (pi)', 'XX-Z (pi)',
       'wCH (pi)', 'wCS (pi)', 'wCU (pi)', 'wFA (pi)', 'wFC (pi)', 'wFS (pi)',
       'wKN (pi)', 'wSB (

In [114]:
len(all_data_2019.columns)

287

In [115]:
len(data_range.columns)

27

### Let's see what columns in the smaller dataframe are also in the larger df

In [116]:
shared_columns = list(data_range.columns & all_data_2019.columns)

In [117]:
shared_columns

['Name',
 'Age',
 'G',
 'PA',
 'AB',
 'R',
 'H',
 '2B',
 '3B',
 'HR',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SH',
 'SF',
 'GDP',
 'SB',
 'CS',
 'OBP',
 'SLG',
 'OPS']

In [118]:
len(shared_columns)

23

### basically all of them. Only 4 are not included. Let's see which one are not in the larger df.

In [120]:
range_columns_not_in_all_data = list(data_range.columns.difference(all_data_2019.columns))
range_columns_not_in_all_data

['#days', 'BA', 'Lev', 'Tm']

# So #days is a useless column, BA is the same as AVG, Lev is useless too, and Tm is the same as Team. As a conclusion, all the columns are included.

In [121]:
Astros_2019stats  =  all_data_2019[all_data_2019["Team"]=="Astros"]

In [122]:
Astros_2019stats.count()

Season             20
Name               20
Team               20
Age                20
G                  20
AB                 20
PA                 20
H                  20
1B                 20
2B                 20
3B                 20
HR                 20
R                  20
RBI                20
BB                 20
IBB                20
SO                 20
HBP                20
SF                 20
SH                 20
GDP                20
SB                 20
CS                 20
AVG                20
GB                 20
FB                 20
LD                 20
IFFB               20
Pitches            20
Balls              20
                   ..
wCH (pi)           17
wCS (pi)            0
wCU (pi)           18
wFA (pi)           20
wFC (pi)           17
wFS (pi)           14
wKN (pi)            1
wSB (pi)            0
wSI (pi)           18
wSL (pi)           18
wXX (pi)            2
wCH/C (pi)         17
wCS/C (pi)          0
wCU/C (pi)         18
wFA/C (pi)

In [123]:
#Thursday, Mar 28
#"2019-03-28"
game_march_28_2019 = batting_stats_range("2019-03-28",)


In [124]:
game_march_28_2019["Tm"].unique

<bound method Series.unique of 1            Chicago
2            Atlanta
3          Tampa Bay
4         Washington
5          Milwaukee
6            Arizona
7          Baltimore
8            Atlanta
9              Miami
10         Cleveland
11           Chicago
12          New York
13           Chicago
14           Houston
15             Miami
16           Chicago
17             Texas
18          New York
19         Milwaukee
20          Colorado
21         St. Louis
22           Chicago
23       Los Angeles
24        Cincinnati
25         Cleveland
27           Seattle
28        Pittsburgh
29       Los Angeles
30     San Francisco
31            Boston
           ...      
290         Colorado
291        San Diego
292      Los Angeles
293          Atlanta
294          Toronto
295        Milwaukee
296         New York
297      Los Angeles
298         New York
299      Los Angeles
300       Washington
301            Miami
302          Arizona
303           Boston
304      Los Angeles
305

In [125]:
game_march_28_2019[game_march_28_2019["Tm"]=="Houston"]

Unnamed: 0,Name,Age,#days,Lev,Date,Tm,Unnamed: 7,Opp,G,PA,...,HBP,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS
14,Jose Altuve,29,227,MLB-AL,"Mar 28, 2019",Houston,@,Tampa Bay,1,4,...,0,0,0,0,0,0,0.5,0.5,1.5,2.0
40,Michael Brantley,32,227,MLB-AL,"Mar 28, 2019",Houston,@,Tampa Bay,1,4,...,0,0,0,0,0,1,0.5,0.5,1.25,1.75
42,Alex Bregman,25,227,MLB-AL,"Mar 28, 2019",Houston,@,Tampa Bay,1,4,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
67,Robinson Chirinos,35,227,MLB-AL,"Mar 28, 2019",Houston,@,Tampa Bay,1,4,...,0,0,0,0,0,0,0.333,0.5,0.667,1.167
87,Aledmys Diaz,28,227,MLB-AL,"Mar 28, 2019",Houston,@,Tampa Bay,1,4,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
125,Yuli Gurriel,35,227,MLB-AL,"Mar 28, 2019",Houston,@,Tampa Bay,1,4,...,0,0,0,0,0,1,0.5,0.5,0.75,1.25
173,Jake Marisnick,28,227,MLB-AL,"Mar 28, 2019",Houston,@,Tampa Bay,1,4,...,0,0,0,0,0,0,0.333,0.5,0.333,0.833
279,George Springer,29,227,MLB-AL,"Mar 28, 2019",Houston,@,Tampa Bay,1,5,...,0,0,0,0,0,0,0.2,0.2,0.8,1.0
311,Tyler White,28,227,MLB-AL,"Mar 28, 2019",Houston,@,Tampa Bay,1,4,...,0,0,0,0,0,0,0.25,0.25,0.25,0.5


## ...We can see who played in every game ^

In [126]:
game_march_28_2019.columns

Index(['Name', 'Age', '#days', 'Lev', 'Date', 'Tm', ' ', 'Opp', 'G', 'PA',
       'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SH',
       'SF', 'GDP', 'SB', 'CS', 'BA', 'OBP', 'SLG', 'OPS'],
      dtype='object')

In [127]:
game_march_28_2019.iloc[:,6]

1      @
2      @
3       
4       
5       
6      @
7      @
8      @
9       
10     @
11     @
12     @
13     @
14     @
15      
16     @
17      
18      
19      
20     @
21     @
22     @
23      
24      
25     @
27      
28     @
29      
30     @
31     @
      ..
290    @
291     
292     
293    @
294     
295     
296     
297    @
298     
299     
300     
301     
302    @
303    @
304     
305    @
306     
307     
308    @
309     
310     
311    @
312     
313     
314    @
315     
316    @
317     
318    @
319     
Name:  , Length: 308, dtype: object

### *"@"* means they're playing as *visitors*.

# Startegy:
### General Strategy:
Retrieve data for each player in each game played. Feed the model with data from past year for each player and past month. Build a dataframe where each row represents the features to train the model and to make predictions out of the model. 

Each row would have the full 283 features from past season plus the 23 from past month for each player that played the particular game plus 2 team features. 

### Specific Tasks

Usually there are 9 players, so there would be 9 times 283 features only to account for past season statistics of 1 team. So this number would be later mulitply also by 2 since each game involves 2 teams. We will use **schedule_and_record()** function to get the players who played in a game, so we can later use **batting_stats()** function to get those 283 features from last season for each player that played a particular game.

There will be also recent statistics from last month for each player as well, which is 22 features. Again this number would be multiply by 9 and then by 2. We will use **schedule_and_record()** function to get the players who played in that game, so we can later retrieve the past month statistics for each one of those players from **batting_stats_range()**.

There will be also statistics from the team to be included in each record such as streak, and GB, also gotten from **schedule_and_record()**.

### Conclusion
The training DataFrame will contain a record for each game of a single team and its opponent. This leads us to  **DataFrame with dimensions of 5494 columns by *Games-Played-By-The-Team* rows**. This is only batting statistics so far.

## Analysing *pitching_stats()*

In [128]:
from pybaseball import pitching_stats

pitching_stats_2019 = pitching_stats("2019")
pitching_stats_2019.head(15)

Unnamed: 0,Season,Name,Team,Age,W,L,ERA,WAR,G,GS,...,wSL/C (pi),wXX/C (pi),O-Swing% (pi),Z-Swing% (pi),Swing% (pi),O-Contact% (pi),Z-Contact% (pi),Contact% (pi),Zone% (pi),Pace (pi)
94,2019.0,Gerrit Cole,Astros,28.0,20.0,5.0,2.5,7.4,33.0,33.0,...,1.88,,0.34,0.646,0.498,0.481,0.751,0.662,0.516,22.9
85,2019.0,Jacob deGrom,Mets,31.0,11.0,8.0,2.43,7.0,32.0,32.0,...,2.24,,0.365,0.696,0.529,0.536,0.791,0.702,0.495,24.2
214,2019.0,Lance Lynn,Rangers,32.0,16.0,11.0,3.67,6.8,33.0,33.0,...,,,0.283,0.674,0.486,0.607,0.8,0.746,0.518,25.1
124,2019.0,Max Scherzer,Nationals,34.0,11.0,7.0,2.92,6.5,27.0,27.0,...,3.99,,0.36,0.675,0.522,0.495,0.779,0.684,0.516,25.0
96,2019.0,Justin Verlander,Astros,36.0,21.0,6.0,2.58,6.4,34.0,34.0,...,3.38,,0.371,0.66,0.519,0.528,0.774,0.688,0.51,25.9
142,2019.0,Charlie Morton,Rays,35.0,16.0,6.0,3.05,6.1,33.0,33.0,...,1.6,,0.322,0.614,0.473,0.523,0.825,0.726,0.517,22.9
164,2019.0,Stephen Strasburg,Nationals,30.0,18.0,6.0,3.32,5.7,33.0,33.0,...,-28.78,,0.375,0.592,0.472,0.558,0.84,0.716,0.447,26.3
160,2019.0,Shane Bieber,Indians,24.0,15.0,8.0,3.28,5.6,34.0,33.0,...,1.29,,0.352,0.625,0.477,0.48,0.854,0.705,0.459,22.7
127,2019.0,Zack Greinke,- - -,35.0,18.0,5.0,2.93,5.4,33.0,33.0,...,-0.49,-2.87,0.346,0.603,0.465,0.647,0.857,0.772,0.461,25.5
179,2019.0,Lucas Giolito,White Sox,24.0,14.0,9.0,3.41,5.1,29.0,29.0,...,0.24,,0.291,0.684,0.493,0.545,0.763,0.7,0.514,25.3


In [129]:
pitching_stats_2019.columns

Index(['Season', 'Name', 'Team', 'Age', 'W', 'L', 'ERA', 'WAR', 'G', 'GS',
       ...
       'wSL/C (pi)', 'wXX/C (pi)', 'O-Swing% (pi)', 'Z-Swing% (pi)',
       'Swing% (pi)', 'O-Contact% (pi)', 'Z-Contact% (pi)', 'Contact% (pi)',
       'Zone% (pi)', 'Pace (pi)'],
      dtype='object', length=299)

In [130]:
pitching_stats_2019.columns[:100]

Index(['Season', 'Name', 'Team', 'Age', 'W', 'L', 'ERA', 'WAR', 'G', 'GS',
       'CG', 'ShO', 'SV', 'BS', 'IP', 'TBF', 'H', 'R', 'ER', 'HR', 'BB', 'IBB',
       'HBP', 'WP', 'BK', 'SO', 'GB', 'FB', 'LD', 'IFFB', 'Balls', 'Strikes',
       'Pitches', 'RS', 'IFH', 'BU', 'BUH', 'K/9', 'BB/9', 'K/BB', 'H/9',
       'HR/9', 'AVG', 'WHIP', 'BABIP', 'LOB%', 'FIP', 'GB/FB', 'LD%', 'GB%',
       'IFFB%', 'HR/FB', 'IFH%', 'BUH%', 'Starting', 'Start-IP', 'Relieving',
       'Relief-IP', 'RAR', 'Dollars', 'tERA', 'xFIP', 'WPA', '-WPA', '+WPA',
       'RE24', 'REW', 'pLI', 'inLI', 'gmLI', 'exLI', 'Pulls', 'WPA/LI',
       'Clutch', 'FB%', 'FBv', 'SL%', 'SLv', 'CT%', 'CTv', 'CB%', 'CBv', 'CH%',
       'CHv', 'SF%', 'SFv', 'KN%', 'KNv', 'XX%', 'PO%', 'wFB', 'wSL', 'wCT',
       'wCB', 'wCH', 'wSF', 'wKN', 'wFB/C', 'wSL/C', 'wCT/C'],
      dtype='object')

In [131]:
pitching_stats_2019.columns[100:200]

Index(['wCB/C', 'wCH/C', 'wSF/C', 'wKN/C', 'O-Swing%', 'Z-Swing%', 'Swing%',
       'O-Contact%', 'Z-Contact%', 'Contact%', 'Zone%', 'F-Strike%', 'SwStr%',
       'HLD', 'SD', 'MD', 'ERA-', 'FIP-', 'xFIP-', 'K%', 'BB%', 'SIERA',
       'RS/9', 'E-F', 'FA% (pfx)', 'FT% (pfx)', 'FC% (pfx)', 'FS% (pfx)',
       'FO% (pfx)', 'SI% (pfx)', 'SL% (pfx)', 'CU% (pfx)', 'KC% (pfx)',
       'EP% (pfx)', 'CH% (pfx)', 'SC% (pfx)', 'KN% (pfx)', 'UN% (pfx)',
       'vFA (pfx)', 'vFT (pfx)', 'vFC (pfx)', 'vFS (pfx)', 'vFO (pfx)',
       'vSI (pfx)', 'vSL (pfx)', 'vCU (pfx)', 'vKC (pfx)', 'vEP (pfx)',
       'vCH (pfx)', 'vSC (pfx)', 'vKN (pfx)', 'FA-X (pfx)', 'FT-X (pfx)',
       'FC-X (pfx)', 'FS-X (pfx)', 'FO-X (pfx)', 'SI-X (pfx)', 'SL-X (pfx)',
       'CU-X (pfx)', 'KC-X (pfx)', 'EP-X (pfx)', 'CH-X (pfx)', 'SC-X (pfx)',
       'KN-X (pfx)', 'FA-Z (pfx)', 'FT-Z (pfx)', 'FC-Z (pfx)', 'FS-Z (pfx)',
       'FO-Z (pfx)', 'SI-Z (pfx)', 'SL-Z (pfx)', 'CU-Z (pfx)', 'KC-Z (pfx)',
       'EP-Z (pfx)', 'CH-Z 

In [132]:
pitching_stats_2019.columns[200:]

Index(['wCH/C (pfx)', 'wSC/C (pfx)', 'wKN/C (pfx)', 'O-Swing% (pfx)',
       'Z-Swing% (pfx)', 'Swing% (pfx)', 'O-Contact% (pfx)',
       'Z-Contact% (pfx)', 'Contact% (pfx)', 'Zone% (pfx)', 'Pace', 'RA9-WAR',
       'BIP-Wins', 'LOB-Wins', 'FDP-Wins', 'Age Rng', 'K-BB%', 'Pull%',
       'Cent%', 'Oppo%', 'Soft%', 'Med%', 'Hard%', 'kwERA', 'TTO%', 'CH% (pi)',
       'CS% (pi)', 'CU% (pi)', 'FA% (pi)', 'FC% (pi)', 'FS% (pi)', 'KN% (pi)',
       'SB% (pi)', 'SI% (pi)', 'SL% (pi)', 'XX% (pi)', 'vCH (pi)', 'vCS (pi)',
       'vCU (pi)', 'vFA (pi)', 'vFC (pi)', 'vFS (pi)', 'vKN (pi)', 'vSB (pi)',
       'vSI (pi)', 'vSL (pi)', 'vXX (pi)', 'CH-X (pi)', 'CS-X (pi)',
       'CU-X (pi)', 'FA-X (pi)', 'FC-X (pi)', 'FS-X (pi)', 'KN-X (pi)',
       'SB-X (pi)', 'SI-X (pi)', 'SL-X (pi)', 'XX-X (pi)', 'CH-Z (pi)',
       'CS-Z (pi)', 'CU-Z (pi)', 'FA-Z (pi)', 'FC-Z (pi)', 'FS-Z (pi)',
       'KN-Z (pi)', 'SB-Z (pi)', 'SI-Z (pi)', 'SL-Z (pi)', 'XX-Z (pi)',
       'wCH (pi)', 'wCS (pi)', 'wCU (pi)', '

### Aparanetly, we should use only the *(pi)* variables. They are calculated using a new algorythm that eliminates what Brooks Baseball considers errors from Pitch FX (pfx).

link: https://www.reddit.com/r/Sabermetrics/comments/6qepoa/what_is_the_data_source_for_nonattributed_plate/

In [133]:
pfx_col = [x for x in pitching_stats_2019.columns if "(pfx)" in x ]
pfx_col

['FA% (pfx)',
 'FT% (pfx)',
 'FC% (pfx)',
 'FS% (pfx)',
 'FO% (pfx)',
 'SI% (pfx)',
 'SL% (pfx)',
 'CU% (pfx)',
 'KC% (pfx)',
 'EP% (pfx)',
 'CH% (pfx)',
 'SC% (pfx)',
 'KN% (pfx)',
 'UN% (pfx)',
 'vFA (pfx)',
 'vFT (pfx)',
 'vFC (pfx)',
 'vFS (pfx)',
 'vFO (pfx)',
 'vSI (pfx)',
 'vSL (pfx)',
 'vCU (pfx)',
 'vKC (pfx)',
 'vEP (pfx)',
 'vCH (pfx)',
 'vSC (pfx)',
 'vKN (pfx)',
 'FA-X (pfx)',
 'FT-X (pfx)',
 'FC-X (pfx)',
 'FS-X (pfx)',
 'FO-X (pfx)',
 'SI-X (pfx)',
 'SL-X (pfx)',
 'CU-X (pfx)',
 'KC-X (pfx)',
 'EP-X (pfx)',
 'CH-X (pfx)',
 'SC-X (pfx)',
 'KN-X (pfx)',
 'FA-Z (pfx)',
 'FT-Z (pfx)',
 'FC-Z (pfx)',
 'FS-Z (pfx)',
 'FO-Z (pfx)',
 'SI-Z (pfx)',
 'SL-Z (pfx)',
 'CU-Z (pfx)',
 'KC-Z (pfx)',
 'EP-Z (pfx)',
 'CH-Z (pfx)',
 'SC-Z (pfx)',
 'KN-Z (pfx)',
 'wFA (pfx)',
 'wFT (pfx)',
 'wFC (pfx)',
 'wFS (pfx)',
 'wFO (pfx)',
 'wSI (pfx)',
 'wSL (pfx)',
 'wCU (pfx)',
 'wKC (pfx)',
 'wEP (pfx)',
 'wCH (pfx)',
 'wSC (pfx)',
 'wKN (pfx)',
 'wFA/C (pfx)',
 'wFT/C (pfx)',
 'wFC/C (pfx)',
 '

In [134]:
pitching_stats_2019.drop( columns = pfx_col, inplace=True )
pitching_stats_2019.columns

Index(['Season', 'Name', 'Team', 'Age', 'W', 'L', 'ERA', 'WAR', 'G', 'GS',
       ...
       'wSL/C (pi)', 'wXX/C (pi)', 'O-Swing% (pi)', 'Z-Swing% (pi)',
       'Swing% (pi)', 'O-Contact% (pi)', 'Z-Contact% (pi)', 'Contact% (pi)',
       'Zone% (pi)', 'Pace (pi)'],
      dtype='object', length=213)

In [135]:
from pybaseball import pitching_stats_range

pitching_range_1day = pitching_stats_range("2019-03-28",)

In [136]:
pitching_range_1day.head()

Unnamed: 0,Name,Age,#days,Lev,Date,Tm,Unnamed: 7,Opp,G,GS,...,Str,StL,StS,GB/FB,LD,PU,WHIP,BAbip,SO9,SO/W
1,Victor Alcantara,26,227,MLB-AL,"Mar 28, 2019",Detroit,@,Toronto,1,0,...,0.64,0.0,0.09,0.33,0.33,0.33,1.0,0.333,9.0,
2,Nick Anderson,28,227,MLB-NL,"Mar 28, 2019",Miami,,Colorado,1,0,...,1.0,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,
3,Luke Bard,28,227,MLB-AL,"Mar 28, 2019",Los Angeles,@,Oakland,1,0,...,0.8,0.1,0.2,0.0,0.33,0.33,1.0,0.333,9.0,
4,Kyle Barraclough,29,227,MLB-NL,"Mar 28, 2019",Washington,,New York,1,0,...,0.64,0.14,0.07,0.33,0.67,0.0,1.0,0.333,0.0,
5,Cam Bedrosian,27,227,MLB-AL,"Mar 28, 2019",Los Angeles,@,Oakland,1,0,...,0.86,0.14,0.14,0.33,0.0,0.0,0.0,0.0,0.0,


In [137]:
pitchers_Astros_on_20190328 = pitching_range_1day[pitching_range_1day.Tm == "Houston"]
pitchers_Astros_on_20190328

Unnamed: 0,Name,Age,#days,Lev,Date,Tm,Unnamed: 7,Opp,G,GS,...,Str,StL,StS,GB/FB,LD,PU,WHIP,BAbip,SO9,SO/W
87,Roberto Osuna,24,227,MLB-AL,"Mar 28, 2019",Houston,@,Tampa Bay,1,0,...,0.56,0.22,0.0,0.0,0.0,0.0,0.0,0.0,9.0,
92,Ryan Pressly,30,227,MLB-AL,"Mar 28, 2019",Houston,@,Tampa Bay,1,0,...,0.85,0.38,0.08,0.5,0.5,0.0,1.0,0.5,18.0,
117,Justin Verlander,36,227,MLB-AL,"Mar 28, 2019",Houston,@,Tampa Bay,1,1,...,0.73,0.15,0.21,0.6,0.27,0.0,0.571,0.143,11.6,9.0


## So! we can know who pitched on a certain game ^


In [138]:
pitchers_Astros_on_20190328.columns

Index(['Name', 'Age', '#days', 'Lev', 'Date', 'Tm', ' ', 'Opp', 'G', 'GS', 'W',
       'L', 'SV', 'IP', 'H', 'R', 'ER', 'BB', 'SO', 'HR', 'HBP', 'ERA', 'GSc',
       'AB', '2B', '3B', 'IBB', 'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit',
       'Str', 'StL', 'StS', 'GB/FB', 'LD', 'PU', 'WHIP', 'BAbip', 'SO9',
       'SO/W'],
      dtype='object')

#### An idea is to keep columns that represent % of other columns. For example, keeping IFFB% but droping IFFB, since a percentage tells more of a story that an isolated scalar value. In this way, we can reduce redundancy in our features and, therefore, the total size of our features array.

In [139]:
bat_col_not_to_drop = [x for x in all_data_2019.columns if "%" in x and x[:-1] in all_data_2019.columns]
bat_col_not_to_drop

['BB%', 'LD%', 'GB%', 'FB%', 'IFFB%', 'IFH%', 'BUH%', 'SF%']

In [140]:
bat_col_to_drop = [x[:-1] for x in bat_col_not_to_drop]
bat_col_to_drop

['BB', 'LD', 'GB', 'FB', 'IFFB', 'IFH', 'BUH', 'SF']

In [141]:
pitch_col_not_to_drop = [x for x in pitching_stats_2019.columns if "%" in x and x[:-1] in pitching_stats_2019.columns]
pitch_col_not_to_drop

['LD%', 'GB%', 'IFFB%', 'IFH%', 'BUH%', 'FB%', 'BB%']

In [142]:
pitch_col_to_drop = [x[:-1] for x in pitch_col_not_to_drop]
pitch_col_to_drop

['LD', 'GB', 'IFFB', 'IFH', 'BUH', 'FB', 'BB']

In [143]:
all_data_2019.drop(columns=bat_col_to_drop, inplace=True)
pitching_stats_2019.drop(columns=pitch_col_to_drop, inplace=True)
print(f"Total columns in bat_statistics: {len(all_data_2019.columns)}\nTotal columns in pitch_statistics: {len(pitching_stats_2019.columns)} ")

Total columns in bat_statistics: 279
Total columns in pitch_statistics: 206 


### ...We still have loooots of columns.


##### We will have to get rid of some columns manually that we consider unnecesary such as "Age" or "Team", etc. However, some of these columns might be useful during concatenation, so we will handle this in within the function that creates the dataframe.

## Let's explore how many pitcher there are per game usually

In [144]:
# let's grab our datframe created through record_and_schedule() previously for The Astros season 2019:
Astros_record.head()

Unnamed: 0,Date,Tm,Home_Away,Opp,W/L,R,RA,Inn,W-L,Rank,GB,Win,Loss,Save,Time,D/N,Attendance,Streak,Orig. Scheduled
1,"Thursday, Mar 28",HOU,@,TBR,W,5.0,1.0,9.0,1-0,2.0,1.0,Verlander,Snell,,2:38,D,25025.0,1,
2,"Friday, Mar 29",HOU,@,TBR,L,2.0,4.0,9.0,1-1,2.0,1.0,Morton,Cole,Alvarado,2:38,N,13059.0,-1,
3,"Saturday, Mar 30",HOU,@,TBR,L,1.0,3.0,9.0,1-2,3.0,2.0,Glasnow,McHugh,Alvarado,2:40,N,16010.0,-2,
4,"Sunday, Mar 31",HOU,@,TBR,L,1.0,3.0,9.0,1-3,4.0,3.0,Chirinos,Miley,Castillo,2:12,D,18473.0,-3,
5,"Monday, Apr 1",HOU,@,TEX,W,2.0,1.0,9.0,2-3,4.0,3.0,Peacock,Sampson,Osuna,2:39,N,18056.0,1,


### We will have to convert the date format cuz these guys are assholes

In [216]:
date = Astros_record.Date[1]
#date = date.split(",")
date = re.findall(r"\w* [0-9][0-9]|[0-9]", date)
date = date[0]
print(date)
year = "2019"
date = year + " " + date
date

Mar 28


'2019 Mar 28'

In [147]:
import datetime as dt

In [156]:
date_formatted = dt.datetime.strptime(date,"%Y %b %d")
date_formatted

datetime.datetime(2019, 3, 28, 0, 0)

In [160]:
date_formatted.month

3

### it worked out, so now we know how

In [523]:
type(date_formatted)

datetime.datetime

# Building the Training DataFrame

In [695]:
from pybaseball import pitching_stats

def format_dates_to_dt(un_date="Monday, Dec 31", year=1999):
    date = re.findall(r"\W\w\w\w\s\d+", un_date)
    date = date[0]
    date = str(year) + date
    date_formatted = dt.datetime.strptime(date,"%Y %b %d")
    return date_formatted

In [475]:
def get_dates_played(df=None,year=None):
    
    dates_played = [format_dates_to_dt(date, year) for date in df.Date]
    return dates_played

In [721]:
def get_team_schedule(year=None, team = "HOU"):
    
    teams_df  = schedule_and_record(year, team)
    teams_df  = teams_df.iloc[ : , [0,1,2,3,10,17] ]
    teams_df["Date"] = teams_df.apply(lambda x: format_dates_to_dt(x["Date"],year), axis=1)
    teams_df.replace("@",1, inplace=True)
    teams_df.replace("Home",0, inplace=True)
    return teams_df


def get_players_per_game(year = 2019, team = "HOU"):
    
    schedule_df = get_team_schedule(year, team)
    
    bat_stat_path = Path(f"Data/Batting/Clean_Data/clean_batting_data_{year}.csv")
    all_bat_stats = pd.read_csv(bat_stat_path)
    
    players_df = pd.DataFrame()
    
    i = 0
    for date in schedule_df["Date"]:
        
        temp_dict = {"Date":date}
        #temp_dict.update({})
        
        #all_players_on_date = batting_stats_range(date.strftime("%Y-%m-%d"),)
        all_players_on_date = all_bat_stats[all_bat_stats["Date"]==date.strftime("%Y-%m-%d")] #[:9] to make sure we always have 9
        
        all_from_team = all_players_on_date[all_players_on_date.Tm == "Astros"][:9]
        
        count = 1   
        for player in all_from_team.Name: 
            temp_dict.update({f"player_{count:02}_{team}" : player})
            
            count+=1
            
        temp_df = pd.DataFrame(temp_dict, index =[i])
        
        players_df = pd.concat([players_df,temp_df], axis=0, sort = True )
        
        i+=1
        
    schedule_df.set_index("Date")
    players_df.set_index("Date")
    teams_df = pd.concat([schedule_df,players_df], axis=1, join="outer")
    
    return teams_df

def get_startingplayer_stats_by_game(players_df):
    
    batting_season_data = batting_stats(year)   
    pitching_season_data = pitching_stats(year)
    print(pitching_season_data.head()) 
    
    stats_players_start_lineup = pd.DataFrame()
    
    for row in range(0,len(players_df)):
               
        all_players_stats = pd.DataFrame()
        #all_players_stats["Date"] = players_df.loc[row,"Date"]
        #all_players_stats.reset_index(inplace=True)
        #print(all_players_stats)
               
        for player in players_df.iloc[row][6:]:
            
            if("00:00:00" in str(player)): continue #it gets the Date as first row by default. Skip it.
                
            try: player_bat_stats = batting_season_data[batting_season_data.Name == player].copy()
            except: 
                print("no name in batting list")
                continue
            """    
            try: 
                player_pitch_stats = pitching_season_data[pitching_season_data.Name == player].copy()
                print(f"SINGLE_PLAYER_STAT --------------\n{player_pitch_stats}")
            except: 
                print("no name in pitching list")
                continue
                """
                
            player_bat_stats.reset_index(inplace=True)
            #player_pitch_stats.reset_index(inplace=True)
            #print(f"ALL_PLAYER_pitcher_STAT ++++++++++++++\n{player_pitch_stats}")
            all_players_stats = pd.concat([ all_players_stats, player_bat_stats, 
                                           #player_pitch_stats
                                          ] , 
                                          axis=1)
            #print(f"SINGLE_PLAYER_STAT ##############\n{all_players_stats}")
            
        try: stats_players_start_lineup = stats_players_start_lineup.append(all_players_stats, ignore_index=True)
        except: 
            print("could not append ")
            continue
        
    
    return stats_players_start_lineup
    

In [722]:
def create_trining_df(year = (dt.datetime.today().year-1), team = "HOU" ):
                       
    
    players_df = get_players_per_game(year, team)
    #print(players_df.tail(10))    
    stats_players_start_lineup = get_startingplayer_stats_by_game(players_df)
    
    return pd.concat([players_df.iloc[:,[0,1,2,3,4,5]],stats_players_start_lineup], axis=1, join='inner')
       
    
    
    
    #dates_played = get_dates_played(last_season_games_played, year)   

In [723]:
#format_dates_to_dt(un_date = "Sunday, Apr 23", year= 2018)

In [724]:
stats_players_start_lineup = create_trining_df()
stats_players_start_lineup

     Season              Name       Team   Age     W    L   ERA  WAR     G  \
41   2018.0      Jacob deGrom       Mets  30.0  10.0  9.0  1.70  9.0  32.0   
92   2018.0      Max Scherzer  Nationals  33.0  18.0  7.0  2.53  7.5  33.0   
91   2018.0  Justin Verlander     Astros  35.0  16.0  9.0  2.52  6.6  34.0   
57   2018.0        Chris Sale    Red Sox  29.0  12.0  4.0  2.11  6.2  27.0   
133  2018.0       Gerrit Cole     Astros  27.0  15.0  5.0  2.88  6.0  32.0   

       GS  ...  wSL/C (pi)  wXX/C (pi)  O-Swing% (pi)  Z-Swing% (pi)  \
41   32.0  ...        2.25         NaN          0.367          0.661   
92   33.0  ...        1.85         NaN          0.355          0.666   
91   34.0  ...        0.48         NaN          0.336          0.656   
57   27.0  ...        2.20         NaN          0.363          0.602   
133  32.0  ...        0.53         NaN          0.315          0.650   

     Swing% (pi)  O-Contact% (pi)  Z-Contact% (pi)  Contact% (pi)  Zone% (pi)  \
41         0.518 

Unnamed: 0,Date,Tm,Home_Away,Opp,GB,Streak,index,Season,Name,Team,...,wSL/C (pi),wXX/C (pi),O-Swing% (pi),Z-Swing% (pi),Swing% (pi),O-Contact% (pi),Z-Contact% (pi),Contact% (pi),Zone% (pi),Pace (pi)
0,NaT,,,,,,84,2018.0,Jose Altuve,Astros,...,0.08,6.35,0.234,0.665,0.444,0.599,0.847,0.780,0.487,25.1
1,2018-03-29,HOU,1.0,TEX,Tied,1.0,84,2018.0,Jose Altuve,Astros,...,0.08,6.35,0.234,0.665,0.444,0.599,0.847,0.780,0.487,25.1
2,2018-03-30,HOU,1.0,TEX,0.5,-1.0,84,2018.0,Jose Altuve,Astros,...,-4.34,,0.272,0.563,0.410,0.681,0.866,0.802,0.476,23.9
3,2018-03-31,HOU,1.0,TEX,Tied,1.0,84,2018.0,Jose Altuve,Astros,...,0.08,6.35,0.234,0.665,0.444,0.599,0.847,0.780,0.487,25.1
4,2018-04-01,HOU,1.0,TEX,Tied,2.0,84,2018.0,Jose Altuve,Astros,...,0.72,,0.294,0.622,0.445,0.721,0.903,0.838,0.461,23.5
5,2018-04-02,HOU,0.0,BAL,up 1.0,3.0,84,2018.0,Jose Altuve,Astros,...,-4.34,,0.272,0.563,0.410,0.681,0.866,0.802,0.476,23.9
6,2018-04-03,HOU,0.0,BAL,up 1.0,4.0,84,2018.0,Jose Altuve,Astros,...,-0.65,,0.205,0.640,0.431,0.340,0.772,0.673,0.519,22.9
7,2018-04-04,HOU,0.0,BAL,up 1.0,5.0,84,2018.0,Jose Altuve,Astros,...,0.08,6.35,0.234,0.665,0.444,0.599,0.847,0.780,0.487,25.1
8,2018-04-06,HOU,0.0,SDP,Tied,-1.0,84,2018.0,Jose Altuve,Astros,...,0.72,,0.294,0.622,0.445,0.721,0.903,0.838,0.461,23.5
9,2018-04-07,HOU,0.0,SDP,up 1.0,1.0,84,2018.0,Jose Altuve,Astros,...,0.08,6.35,0.234,0.665,0.444,0.599,0.847,0.780,0.487,25.1


In [693]:
stats_players_start_lineup.loc[2]["Name"]

Name        Jose Altuve
Name       Alex Bregman
Name      Carlos Correa
Name         J.D. Davis
Name       Derek Fisher
Name        Evan Gattis
Name    Marwin Gonzalez
Name     Jake Marisnick
Name       Brian McCann
Name: 2, dtype: object

In [714]:
pitching_stats_2019[pitching_stats_2019.Name == "Jacob deGrom"]

Unnamed: 0,Season,Name,Team,Age,W,L,ERA,WAR,G,GS,...,wSL/C (pi),wXX/C (pi),O-Swing% (pi),Z-Swing% (pi),Swing% (pi),O-Contact% (pi),Z-Contact% (pi),Contact% (pi),Zone% (pi),Pace (pi)
85,2019.0,Jacob deGrom,Mets,31.0,11.0,8.0,2.43,7.0,32.0,32.0,...,2.24,,0.365,0.696,0.529,0.536,0.791,0.702,0.495,24.2


In [715]:
stats_players_start_lineup["W"]

Unnamed: 0,W,W.1,W.2,W.3,W.4,W.5,W.6,W.7,W.8
0,,,,,,,,,
1,,,,0.0,,,,,
2,,,,0.0,,,,,
3,,,,0.0,,,,,
4,,,,0.0,,,,,
5,,,,0.0,,,,,
6,,,0.0,,,,,,
7,,,,,,,,,
8,,,,,,,,,
9,,,,,,,,,
