In [None]:
# Import required libraries
from path import Path
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pybaseball import batting_stats
from pybaseball import batting_stats_range
from pybaseball import pitching_stats_range
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Analysing *batting_stats()* with different paramters

In [1108]:
all_data_2019 = batting_stats(2019)

In [None]:
all_data_2019 = batting_stats(2019)
players_with_more_than_50 = batting_stats(2019, qual=50)
all_data_since_2015 = batting_stats(2015, 2019)
aggregated_data = batting_stats(2010, 2016, ind=0)

print(all_data_2019.head())
print(players_with_more_than_50.head())
print(all_data_since_2015.head())
print(aggregated_data.head())

# Conclusion for batting_stats()
Very versitile function that can take one or two years as parameters (single season or multiple seasons). It would return a dataframe with 287 columns with all the batting statistics for every single player for the whole season/seasons. A lot of statistics!

### Analysing *schedule_and_record()*

In [None]:
from pybaseball import schedule_and_record

Astros_record = schedule_and_record(2019, "HOU")
Astros_record.head(20)

In [None]:
Astros_record.columns

### Conclusion for schedule_and_record()
Very useful function that can get the players who played in each game, and also all the dates in which each a team played in a season, among others. It only has 19 columns, but this will be used as a *link* table between the others.

## Analysing *batting_stats_range()*

In [None]:
data_range = batting_stats_range("2019-09-01","2019-09-30")
data_range

In [None]:
print(data_range.columns)
print(all_data_2019.columns)

### Conclusion for *batting_stats_range()*

this functions returns a much smaller dataframe with only 27 columns. This will be useful to get the most recent information for each player right before each game.

# Let's look at all the columns for batting_stats() and batting_stats_range()

In [None]:
print(all_data_2019.columns[:95])

In [None]:
print(all_data_2019.columns[95:190])

In [None]:
print(all_data_2019.columns[190:])

In [None]:
len(all_data_2019.columns)

In [None]:
len(data_range.columns)

### Let's see what columns in the smaller dataframe are also in the larger df

In [None]:
shared_columns = list(data_range.columns & all_data_2019.columns)

In [None]:
shared_columns

In [None]:
len(shared_columns)

### basically all of them. Only 4 are not included. Let's see which one are not in the larger df.

In [None]:
range_columns_not_in_all_data = list(data_range.columns.difference(all_data_2019.columns))
range_columns_not_in_all_data

# So #days is a useless column, BA is the same as AVG, Lev is useless too, and Tm is the same as Team. As a conclusion, all the columns are included.

In [None]:
Astros_2019stats  =  all_data_2019[all_data_2019["Team"]=="Astros"]

In [None]:
Astros_2019stats.count()

In [None]:
#Thursday, Mar 28
#"2019-03-28"
game_march_28_2019 = batting_stats_range("2019-03-28",)


In [None]:
game_march_28_2019["Tm"].unique

In [None]:
game_march_28_2019[game_march_28_2019["Tm"]=="Houston"]

## ...We can see who played in every game ^

In [None]:
game_march_28_2019.columns

In [None]:
game_march_28_2019.iloc[:,6]

### *"@"* means they're playing as *visitors*.

# Startegy:
### General Strategy:
Retrieve data for each player in each game played. Feed the model with data from past year for each player and past month. Build a dataframe where each row represents the features to train the model and to make predictions out of the model. 

Each row would have the full 283 features from past season plus the 23 from past month for each player that played the particular game plus 2 team features. 

### Specific Tasks

Usually there are 9 players, so there would be 9 times 283 features only to account for past season statistics of 1 team. So this number would be later mulitply also by 2 since each game involves 2 teams. We will use **schedule_and_record()** function to get the players who played in a game, so we can later use **batting_stats()** function to get those 283 features from last season for each player that played a particular game.

There will be also recent statistics from last month for each player as well, which is 22 features. Again this number would be multiply by 9 and then by 2. We will use **schedule_and_record()** function to get the players who played in that game, so we can later retrieve the past month statistics for each one of those players from **batting_stats_range()**.

There will be also statistics from the team to be included in each record such as streak, and GB, also gotten from **schedule_and_record()**.

### Conclusion
The training DataFrame will contain a record for each game of a single team and its opponent. This leads us to  **DataFrame with dimensions of 5494 columns by *Games-Played-By-The-Team* rows**. This is only batting statistics so far.

## Analysing *pitching_stats()*

In [None]:
from pybaseball import pitching_stats

pitching_stats_2019 = pitching_stats("2019")
pitching_stats_2019.head(15)

In [None]:
pitching_stats_2019.columns

In [None]:
pitching_stats_2019.columns[:100]

In [None]:
pitching_stats_2019.columns[100:200]

In [None]:
pitching_stats_2019.columns[200:]

### Aparanetly, we should use only the *(pi)* variables. They are calculated using a new algorythm that eliminates what Brooks Baseball considers errors from Pitch FX (pfx).

link: https://www.reddit.com/r/Sabermetrics/comments/6qepoa/what_is_the_data_source_for_nonattributed_plate/

In [None]:
pfx_col = [x for x in pitching_stats_2019.columns if "(pfx)" in x ]
pfx_col

In [None]:
pitching_stats_2019.drop( columns = pfx_col, inplace=True )
pitching_stats_2019.columns

In [None]:
from pybaseball import pitching_stats_range

pitching_range_1day = pitching_stats_range("2019-03-28",)

In [None]:
pitching_range_1day.head()

In [None]:
pitchers_Astros_on_20190328 = pitching_range_1day[pitching_range_1day.Tm == "Houston"]
pitchers_Astros_on_20190328

## So! we can know who pitched on a certain game ^


In [None]:
pitchers_Astros_on_20190328.columns

#### An idea is to keep columns that represent % of other columns. For example, keeping IFFB% but droping IFFB, since a percentage tells more of a story that an isolated scalar value. In this way, we can reduce redundancy in our features and, therefore, the total size of our features array.

In [None]:
bat_col_not_to_drop = [x for x in all_data_2019.columns if "%" in x and x[:-1] in all_data_2019.columns]
bat_col_not_to_drop

In [None]:
bat_col_to_drop = [x[:-1] for x in bat_col_not_to_drop]
bat_col_to_drop

In [None]:
pitch_col_not_to_drop = [x for x in pitching_stats_2019.columns if "%" in x and x[:-1] in pitching_stats_2019.columns]
pitch_col_not_to_drop

In [None]:
pitch_col_to_drop = [x[:-1] for x in pitch_col_not_to_drop]
pitch_col_to_drop

In [None]:
all_data_2019.drop(columns=bat_col_to_drop, inplace=True)
pitching_stats_2019.drop(columns=pitch_col_to_drop, inplace=True)
print(f"Total columns in bat_statistics: {len(all_data_2019.columns)}\nTotal columns in pitch_statistics: {len(pitching_stats_2019.columns)} ")

### ...We still have loooots of columns.


##### We will have to get rid of some columns manually that we consider unnecesary such as "Age" or "Team", etc. However, some of these columns might be useful during concatenation, so we will handle this in within the function that creates the dataframe.

## Let's explore how many pitcher there are per game usually

In [None]:
# let's grab our datframe created through record_and_schedule() previously for The Astros season 2019:
Astros_record.head()

### We will have to convert the date format cuz these guys are assholes

In [None]:
date = Astros_record.Date[1]
#date = date.split(",")
date = re.findall(r"\w* [0-9][0-9]|[0-9]", date)
date = date[0]
print(date)
year = "2019"
date = year + " " + date
date

In [None]:
import datetime as dt

In [None]:
date_formatted = dt.datetime.strptime(date,"%Y %b %d")
date_formatted

In [None]:
date_formatted.month

### it worked out, so now we know how

In [None]:
type(date_formatted)

# %%%%%%%    Building the Training DataFrame    %%%%%%%%

In [1094]:
# Import required libraries
from path import Path
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pybaseball import batting_stats
from pybaseball import batting_stats_range
from pybaseball import pitching_stats_range
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [1150]:
teams = set(all_data_2017["Team"])
teams

{'- - -',
 'Angels',
 'Astros',
 'Athletics',
 'Blue Jays',
 'Braves',
 'Brewers',
 'Cardinals',
 'Cubs',
 'Diamondbacks',
 'Dodgers',
 'Giants',
 'Indians',
 'Mariners',
 'Marlins',
 'Mets',
 'Nationals',
 'Orioles',
 'Padres',
 'Phillies',
 'Pirates',
 'Rangers',
 'Rays',
 'Red Sox',
 'Reds',
 'Rockies',
 'Royals',
 'Tigers',
 'Twins',
 'White Sox',
 'Yankees'}

In [1095]:
team_dict = {'Angels':'LAA',
            'Athletics': 'OAK',
            'Astros': 'HOU',
            'Braves': 'ATL',
            'Brewers': 'MIL',
            'Cards': 'STL',
            'Cubs': 'CUB',
            'Diamondbacks': 'ARI',
            'Dodgers': 'LOS',
            'Giants': 'SFG',
            'Indians': 'CLE',
            'Jays': 'TOR',
            'Mariners': 'SEA',
            'Marlins': 'MIA',
            'Mets': 'NYM',
            'Nats': 'WAS',
            'Orioles': 'BAL',
            'Padres': 'SDP',
            'Phillies': 'PHI',
            'Pirates': 'PIT',
            'Rangers': 'TEX',
            'Rays': 'TBR',
            'Red Sox': 'BOS',
            'Reds': 'CIN',
            'Rockies': 'COL',
            'Royals': 'KCR',
            'Tigers': 'DET',
            'Twins': 'MIN',
            'White_Sox': 'CHW',
            'Yankees': 'NYY'
             
           }
def get_key_from_dict(dictionary, val): 
    for key, value in dictionary.items(): 
         if val == value: 
            return key 

In [1096]:
from pybaseball import pitching_stats

def format_dates_to_dt(un_date="Monday, Dec 31", year=1999):
    date = re.findall(r"\W\w\w\w\s\d+", un_date)
    date = date[0]
    date = str(year) + date
    date_formatted = dt.datetime.strptime(date,"%Y %b %d")
    return date_formatted

In [1097]:
def modify_dates_from_lineups(date="1. Thu,3/29 at TEX W (4-1)#", year = 2018):
    date = re.findall(r"\d+/\d+", date)
    try: date = date[0]
    except: return
    date = str(year) +" " + date
    return date

In [1098]:
def modify_date_col_from_lineups(df, year):
    df["0"] = df.apply(lambda x: modify_dates_from_lineups(x["0"],year), axis=1)
    df["0"] = pd.to_datetime(df["0"])
    return df

In [1099]:
def clean_lineups(df):
    for row in range(0,len(df)):
        for column in range(2,11):
            original_name = df.iloc[row,column]
            name = original_name.split("-")
            name = name[0]
            df.replace(original_name,name, inplace=True)
            #WE CAN IMPROVE THIS LATER so we don't need to iterate all the dataframe since most of it
            #has the same names over and over. We just need to grab all the different names and replace them
    return df

In [1100]:

def format_lineups_df(df, year):
    df  =  modify_date_col_from_lineups(df, year)
    df = clean_lineups(df)
    df.columns = ["index","Date","1","2","3","4","5","6","7","8","9"]
    df.set_index("index", inplace=True, drop=True)
    return df
    
    

In [1101]:
def get_dates_played(df=None,year=None):
    
    dates_played = [format_dates_to_dt(date, year) for date in df.Date]
    return dates_played

In [1102]:
def get_team_schedule(year=None, team = "HOU"):
    
    teams_df  = schedule_and_record(year, team)
    teams_df  = teams_df.iloc[ : , [0,1,2,3,10,17] ]
    teams_df["Date"] = teams_df.apply(lambda x: format_dates_to_dt(x["Date"],year), axis=1)
    teams_df.replace("@",1, inplace=True)
    teams_df.replace("Home",0, inplace=True)
    return teams_df

In [1107]:
def get_players_per_game(year = None, team = None):
    
    schedule_df = get_team_schedule(year, team)
    opponents = set(schedule_df["Opp"])
    
    bat_stat_path = Path(f"Data/Batting/Clean_Data/clean_batting_data_{year}.csv")
    all_bat_stats = pd.read_csv(bat_stat_path)
    
    lineups_path = Path(f"Lineups/{team}_lineups_{year}.csv")
    all_lineups_season = pd.read_csv(lineups_path)
    all_lineups_season = format_lineups_df(all_lineups_season, year)
    
    opponents_lineups = {}
    for opponent in opponents:
        opp_lineups_path = Path(f"Lineups/{opponent}_lineups_{year}.csv")
        opp_all_lineups_season = pd.read_csv(opp_lineups_path)
        print(f"opp: {opponent}")
        opp_all_lineups_season = format_lineups_df(opp_all_lineups_season, year)
        opponents_lineups.update({opponent:opp_all_lineups_season})
    
    players_df = pd.DataFrame()
    
    i = 0
    for date in schedule_df["Date"]:
        
        
        adversary = schedule_df[schedule_df["Date"]==date.strftime("%Y-%m-%d")]["Opp"].values[0]

        temp_dict = {"Date":date}

        ##Line ups for the team
        try: all_players_on_date = all_lineups_season[all_lineups_season["Date"]==date.strftime("%Y-%m-%d")]
            
        except:
            print(f"No game on this date {date} for team")
            continue
            
        count = 1  
        all_players_team = all_players_on_date.iloc[0]       
        for player in all_players_team[1:]:            
            temp_dict.update({f"player_{count:02}" : player})           
            count+=1
            
        ##Line ups for the adversary
        try:
            opp_lineups_df = opponents_lineups[adversary]
            all_opponents_on_date = opp_lineups_df[opp_lineups_df["Date"]==date.strftime("%Y-%m-%d")]
            all_opponents_on_date.columns = ["Date","10","11","12","12","14","15","16","17","18"]
        except:
            print(f"No game on this date {date} for opponent")
            continue

        count = 10  
        all_players_opp = all_opponents_on_date.iloc[0]       
        for enemy in all_players_opp[1:]:            
            temp_dict.update({f"player_{count:02}" : enemy})           
            count+=1
         
              
        temp_df = pd.DataFrame(temp_dict, index =[i])
        players_df = pd.concat([players_df,temp_df], axis=0, sort = True )
        i+=1
        
    schedule_df.set_index("Date", inplace=True)
    players_df.set_index("Date", inplace=True)
    teams_df = pd.concat([schedule_df,players_df], axis=1, join="inner")
    teams_df.reset_index(inplace=True, drop=False)
    
    return teams_df

In [1106]:
players_per_game_for_HOU_2018 = get_players_per_game(year = 2018, team = "HOU")
players_per_game_for_HOU_2018.head(20)

opp: KCR
opp: DET
opp: TBR
opp: BOS
opp: OAK
opp: CHW
opp: LAD
opp: TOR
opp: MIN
opp: TEX
opp: CLE
opp: LAA
opp: SDP
opp: ARI
opp: SFG
opp: NYY
opp: COL
opp: BAL
opp: SEA
temporary dataframe:
         Date player_01 player_02 player_03 player_04 player_05 player_06  \
0 2018-03-29  Springer   Bregman    Altuve    Correa  Gonzalez   Reddick   

  player_07  player_08 player_09  player_10 player_11 player_12 player_13  \
0    Gattis  Marisnick    McCann  DeShields     Gallo    Andrus    Beltre   

  player_14 player_15 player_16 player_17 player_18  
0    Mazara      Choo  Chirinos      Odor       Rua  
players dataframe:
         Date player_01 player_02 player_03 player_04 player_05 player_06  \
0 2018-03-29  Springer   Bregman    Altuve    Correa  Gonzalez   Reddick   

  player_07  player_08 player_09  player_10 player_11 player_12 player_13  \
0    Gattis  Marisnick    McCann  DeShields     Gallo    Andrus    Beltre   

  player_14 player_15 player_16 player_17 player_18  
0    Maza

Unnamed: 0,Date,Tm,Home_Away,Opp,GB,Streak,player_01,player_02,player_03,player_04,...,player_09,player_10,player_11,player_12,player_13,player_14,player_15,player_16,player_17,player_18
0,2018-03-29,HOU,1,TEX,Tied,1,Springer,Bregman,Altuve,Correa,...,McCann,DeShields,Gallo,Andrus,Beltre,Mazara,Choo,Chirinos,Odor,Rua
1,2018-03-30,HOU,1,TEX,0.5,-1,Springer,Reddick,Altuve,Correa,...,Stassi,DeShields,Gallo,Andrus,Beltre,Mazara,Choo,Chirinos,Odor,Rua
2,2018-03-31,HOU,1,TEX,Tied,1,Springer,Bregman,Altuve,Correa,...,Marisnick,Andrus,Gallo,Beltre,Mazara,Choo,Odor,Centeno,Robinson,Tocci
3,2018-04-01,HOU,1,TEX,Tied,2,Springer,Bregman,Altuve,Correa,...,Fisher,Andrus,Gallo,Beltre,Mazara,Chirinos,Odor,Rua,Profar,Robinson
4,2018-04-02,HOU,0,BAL,up 1.0,3,Springer,Bregman,Altuve,Correa,...,Fisher,Davis,Machado,Schoop,Jones,Mancini,Rasmus,Beckham,Alvarez,Joseph
5,2018-04-03,HOU,0,BAL,up 1.0,4,Springer,Bregman,Altuve,Correa,...,Fisher,Davis,Machado,Schoop,Jones,Mancini,Rasmus,Beckham,Sisco,Santander
6,2018-04-04,HOU,0,BAL,up 1.0,5,Springer,Bregman,Altuve,Reddick,...,Fisher,Beckham,Machado,Schoop,Jones,Mancini,Valencia,Santander,Joseph,Gentry
7,2018-04-06,HOU,0,SDP,Tied,-1,Springer,Bregman,Altuve,Correa,...,Fisher,Margot,Pirela,Hosmer,Villanueva,Renfroe,Asuaje,Galvis,Spangenberg,Hedges
8,2018-04-07,HOU,0,SDP,up 1.0,1,Springer,Bregman,Altuve,Correa,...,Marisnick,Pirela,Galvis,Hosmer,Villanueva,Asuaje,Renfroe,Spangenberg,Ellis,Margot
9,2018-04-08,HOU,0,SDP,up 1.0,2,Springer,Bregman,Altuve,Correa,...,Fisher,Asuaje,Hosmer,Pirela,Villanueva,Headley,Renfroe,Galvis,Hedges,Margot


In [1120]:
x = players_per_game_for_HOU_2018.iloc[0][3]
x

'TEX'

In [1111]:
x[3]

'TEX'

In [1136]:
def get_stats_startingplayer_by_game(players_df, team, year):
    
    batting_season_data = batting_stats(year-1)   
    #pitching_season_data = pitching_stats(year)
    #print(pitching_season_data.head()) 
    
    stats_players_start_lineup = pd.DataFrame()
    
    for row in range(0,len(players_df)):
        print(row)       
        all_players_stats = pd.DataFrame()
        
        #for players of team      
        for player in players_df.iloc[row][6:15]:
            #print(f"players: {len(players_df.iloc[row][6:])}\n {players_df.iloc[row][6:]}")
            if("00:00:00" in str(player)): continue #it gets the Date as first row by default. Skip it.
                
            try: 
                all_team_bat_stats = batting_season_data[batting_season_data.Team == get_key_from_dict(team_dict,team)]
                #print(f"ALL TEAM STATS :\n{all_team_bat_stats.head()}")
                player_full_name = [x for x in all_team_bat_stats.Name if player in x]
                #print(player_full_name)
                player_bat_stats = all_team_bat_stats[all_team_bat_stats.Name == player_full_name[0]]
                #print(f"Stats for player {player_full_name} only: \n{player_bat_stats}")

            except: 
                print(f"{player} not in batting list")
                #create code to fill batting statistics for a new player full of NAs or 0.0
                continue
            
            player_bat_stats.reset_index(inplace=True)
            #print(f"SINGLE_PLAYER_STAT ++++++++++++++\n{player_bat_stats}")
            all_players_stats = pd.concat([ all_players_stats, player_bat_stats ] ,   axis=1)
            #print(f"ALL_PLAYER_STAT ##############\n{all_players_stats}")
            
         #for players of opponent     
        for enemy in players_df.iloc[row][15:24]:
            #print(f"players: {len(players_df.iloc[row][6:])}\n {players_df.iloc[row][6:]}")
            if("00:00:00" in str(enemy)): continue #it gets the Date as first row by default. Skip it.
                
            try: 
                print(f"row[3]: {players_df.iloc[row][3]}")
                all_opp_bat_stats = batting_season_data[batting_season_data.Team == 
                                                         get_key_from_dict(team_dict,players_df.iloc[row][3])]
                #print(f"ALL OPPONENT STATS :\n{all_opp_bat_stats.head()}")
                enemy_full_name = [x for x in all_opp_bat_stats.Name if enemy in x]
                #print(player_full_name)
                enemy_bat_stats = all_opp_bat_stats[all_opp_bat_stats.Name == enemy_full_name[0]]
                #print(f"Stats for player {player_full_name} only: \n{player_bat_stats}")

            except: 
                print(f"enemy {enemy} not in batting list")
                #create code to fill batting statistics for a new player full of NAs or 0.0
                continue
            
            enemy_bat_stats.reset_index(inplace=True)
            #print(f"SINGLE_PLAYER_STAT ++++++++++++++\n{player_bat_stats}")
            all_players_stats = pd.concat([ all_players_stats, enemy_bat_stats] ,   axis=1)
            #print(f"ALL_PLAYER_STAT ##############\n{all_players_stats}")
            
        try: 
            stats_players_start_lineup = stats_players_start_lineup.append(all_players_stats, ignore_index=True)
            #stats_players_start_lineup = pd.concat([stats_players_start_lineup,all_players_stats], join="outer",axis=0, sort = True )
        except: 
            print("could not append ")
            continue
        
    
    return stats_players_start_lineup

In [1137]:

test = get_stats_startingplayer_by_game(players_per_game_for_HOU_2018, "HOU", 2018)
test

0
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
1
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
2
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
enemy Centeno not in batting list
row[3]: TEX
row[3]: TEX
enemy Tocci not in batting list
could not append 
3
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
row[3]: TEX
4
row[3]: BAL
row[3]: BAL
row[3]: BAL
row[3]: BAL
row[3]: BAL
row[3]: BAL
enemy Rasmus not in batting list
row[3]: BAL
enemy Beckham not in batting list
row[3]: BAL
row[3]: BAL
could not append 
5
row[3]: BAL
row[3]: BAL
row[3]: BAL
row[3]: BAL
row[3]: BAL
row[3]: BAL
enemy Rasmus not in batting list
row[3]: BAL
enemy Beckham not in batting list
row[3]: BAL
row[3]: BAL
could not append 
6
row[3]: BAL
enemy Beckham not in batting list
row[3]: BAL
row[3]: BAL
row[3]: BAL
ro

Unnamed: 0,index,Season,Name,Team,Age,G,AB,PA,H,1B,...,wSL/C (pi),wXX/C (pi),O-Swing% (pi),Z-Swing% (pi),Swing% (pi),O-Contact% (pi),Z-Contact% (pi),Contact% (pi),Zone% (pi),Pace (pi)
0,79,2017.0,George Springer,Astros,27.0,140.0,548.0,629.0,155.0,92.0,...,-0.41,-5.21,0.346,0.608,0.461,0.509,0.752,0.65,0.44,22.3
1,79,2017.0,George Springer,Astros,27.0,140.0,548.0,629.0,155.0,92.0,...,-0.41,-5.21,0.346,0.608,0.461,0.509,0.752,0.65,0.44,22.3
2,79,2017.0,George Springer,Astros,27.0,140.0,548.0,629.0,155.0,92.0,...,0.33,,0.274,0.628,0.434,0.56,0.781,0.704,0.451,26.1


In [None]:
all_data_2017 = batting_stats(2017)

In [1149]:
team = get_key_from_dict(team_dict, "San Diego")
print(team)
teams = set(all_data_2017["Team"])
teams

None


{'- - -',
 'Angels',
 'Astros',
 'Athletics',
 'Blue Jays',
 'Braves',
 'Brewers',
 'Cardinals',
 'Cubs',
 'Diamondbacks',
 'Dodgers',
 'Giants',
 'Indians',
 'Mariners',
 'Marlins',
 'Mets',
 'Nationals',
 'Orioles',
 'Padres',
 'Phillies',
 'Pirates',
 'Rangers',
 'Rays',
 'Red Sox',
 'Reds',
 'Rockies',
 'Royals',
 'Tigers',
 'Twins',
 'White Sox',
 'Yankees'}

In [1129]:
def create_trining_df(year = (dt.datetime.today().year-1), team = "HOU" ):
                       
    
    players_df = get_players_per_game(year, team)
    #print(players_df.tail(10))    
    stats_players_start_lineup = get_stats_startingplayer_by_game(players_df, team, year)
    
    return pd.concat([players_df.iloc[:,[0,1,2,3,4,5]],stats_players_start_lineup], axis=1, join='inner')
       
    
    
    
    #dates_played = get_dates_played(last_season_games_played, year)   

In [1130]:
stats_players_start_lineup = create_trining_df()
stats_players_start_lineup

opp: KCR
opp: DET
opp: TBR
opp: BOS
opp: OAK


KeyboardInterrupt: 

In [None]:
stats_players_start_lineup.head()

In [None]:
names = stats_players_start_lineup.loc[:]["Name"].isna()

In [None]:
names.isnull().values.any()