In [1]:
# Import required libraries
import datetime as dt
import re
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from path import Path
from pybaseball import batting_stats
from pybaseball import batting_stats_range
from pybaseball import pitching_stats_range
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

KeyboardInterrupt: 

### Analysing *batting_stats()* with different parameters

In [None]:
all_data_2019 = batting_stats(2019)

In [None]:
all_data_2019 = batting_stats(2019)
players_with_more_than_50 = batting_stats(2019, qual=50)
all_data_since_2015 = batting_stats(2015, 2019)
aggregated_data = batting_stats(2010, 2016, ind=0)

print(all_data_2019.head())
print(players_with_more_than_50.head())
print(all_data_since_2015.head())
print(aggregated_data.head())

# Conclusion for batting_stats()
Very versatile function that can take one or two years as parameters (single season or multiple seasons). It returns a dataframe with 287 columns containing all the batting statistics for every single player for the whole season/seasons. A lot of statistics!

### Analysing *schedule_and_record()*

In [None]:
from pybaseball import schedule_and_record

Astros_record = schedule_and_record(2019, "HOU")
Astros_record.head(20)

In [None]:
Astros_record.columns

### Conclusion for schedule_and_record()
Very useful function that can get the players who played in each game, and also all the dates on which a team played in a season, among others. It only has 19 columns, but this will be used as a *link* table between the others.

## Analysing *batting_stats_range()*

In [None]:
data_range = batting_stats_range("2019-09-01","2019-09-30")
data_range

In [None]:
print(data_range.columns)
print(all_data_2019.columns)

### Conclusion for *batting_stats_range()*

This function returns a much smaller dataframe with only 27 columns. This will be useful to get the most recent information for each player right before each game.

# Let's look at all the columns for batting_stats() and batting_stats_range()

In [None]:
print(all_data_2019.columns[:95])

In [None]:
print(all_data_2019.columns[95:190])

In [None]:
print(all_data_2019.columns[190:])

In [None]:
len(all_data_2019.columns)

In [None]:
len(data_range.columns)

### Let's see what columns in the smaller dataframe are also in the larger df

In [None]:
# Columns present in both frames. Index.intersection is the explicit set
# operation; the `&` operator on Index objects is not a set intersection in
# current pandas releases.
shared_columns = list(data_range.columns.intersection(all_data_2019.columns))

In [None]:
shared_columns

In [None]:
len(shared_columns)

### Basically all of them. Only 4 are not included. Let's see which ones are not in the larger df.

In [None]:
range_columns_not_in_all_data = list(data_range.columns.difference(all_data_2019.columns))
range_columns_not_in_all_data

# So #days is a useless column, BA is the same as AVG, Lev is useless too, and Tm is the same as Team. As a conclusion, all the columns are included.

In [None]:
Astros_2019stats  =  all_data_2019[all_data_2019["Team"]=="Astros"]

In [None]:
Astros_2019stats.count()

In [None]:
#Thursday, Mar 28
#"2019-03-28"
game_march_28_2019 = batting_stats_range("2019-03-28",)


In [None]:
# Call unique() so the cell shows the distinct team values rather than the bound method.
game_march_28_2019["Tm"].unique()

In [None]:
game_march_28_2019[game_march_28_2019["Tm"]=="Houston"]

## ...We can see who played in every game ^

In [None]:
game_march_28_2019.columns

In [None]:
game_march_28_2019.iloc[:,6]

### *"@"* means they're playing as *visitors*.

# Strategy:
### General Strategy:
Retrieve data for each player in each game played. Feed the model with data from past year for each player and past month. Build a dataframe where each row represents the features to train the model and to make predictions out of the model. 

Each row would have the full 283 features from past season plus the 23 from past month for each player that played the particular game plus 2 team features. 

### Specific Tasks

Usually there are 9 players, so there would be 9 times 283 features just to account for the past-season statistics of 1 team. This number would later also be multiplied by 2, since each game involves 2 teams. We will use the **schedule_and_record()** function to get the players who played in a game, so we can later use the **batting_stats()** function to get those 283 features from last season for each player that played a particular game.

There will also be recent statistics from the last month for each player, which is 22 features. Again, this number would be multiplied by 9 and then by 2. We will use the **schedule_and_record()** function to get the players who played in that game, so we can later retrieve the past-month statistics for each one of those players from **batting_stats_range()**.

There will be also statistics from the team to be included in each record such as streak, and GB, also gotten from **schedule_and_record()**.

### Conclusion
The training DataFrame will contain a record for each game of a single team and its opponent. This leads us to  **DataFrame with dimensions of 5494 columns by *Games-Played-By-The-Team* rows**. This is only batting statistics so far.

## Analysing *pitching_stats()*

In [None]:
from pybaseball import pitching_stats

# Pass the season as an int, matching every other stats call in this notebook.
pitching_stats_2019 = pitching_stats(2019)
pitching_stats_2019.head(15)

In [None]:
pitching_stats_2019.columns

In [None]:
pitching_stats_2019.columns[:100]

In [None]:
pitching_stats_2019.columns[100:200]

In [None]:
pitching_stats_2019.columns[200:]

### Apparently, we should use only the *(pi)* variables. They are calculated using a new algorithm that eliminates what Brooks Baseball considers errors from Pitch FX (pfx).

link: https://www.reddit.com/r/Sabermetrics/comments/6qepoa/what_is_the_data_source_for_nonattributed_plate/

In [None]:
pfx_col = [x for x in pitching_stats_2019.columns if "(pfx)" in x ]
pfx_col

In [None]:
pitching_stats_2019.drop( columns = pfx_col, inplace=True )
pitching_stats_2019.columns

In [None]:
from pybaseball import pitching_stats_range

pitching_range_1day = pitching_stats_range("2019-03-28",)

In [None]:
pitching_range_1day.head()

In [None]:
pitchers_Astros_on_20190328 = pitching_range_1day[pitching_range_1day.Tm == "Houston"]
pitchers_Astros_on_20190328

## So! we can know who pitched on a certain game ^


In [None]:
pitchers_Astros_on_20190328.columns

#### An idea is to keep columns that represent a % of other columns. For example, keeping IFFB% but dropping IFFB, since a percentage tells more of a story than an isolated scalar value. In this way, we can reduce redundancy in our features and, therefore, the total size of our features array.

In [None]:
bat_col_not_to_drop = [x for x in all_data_2019.columns if "%" in x and x[:-1] in all_data_2019.columns]
bat_col_not_to_drop

In [None]:
bat_col_to_drop = [x[:-1] for x in bat_col_not_to_drop]
bat_col_to_drop

In [None]:
pitch_col_not_to_drop = [x for x in pitching_stats_2019.columns if "%" in x and x[:-1] in pitching_stats_2019.columns]
pitch_col_not_to_drop

In [None]:
pitch_col_to_drop = [x[:-1] for x in pitch_col_not_to_drop]
pitch_col_to_drop

In [None]:
all_data_2019.drop(columns=bat_col_to_drop, inplace=True)
pitching_stats_2019.drop(columns=pitch_col_to_drop, inplace=True)
print(f"Total columns in bat_statistics: {len(all_data_2019.columns)}\nTotal columns in pitch_statistics: {len(pitching_stats_2019.columns)} ")

### ...We still have loooots of columns.


##### We will have to manually get rid of some columns that we consider unnecessary, such as "Age" or "Team", etc. However, some of these columns might be useful during concatenation, so we will handle this within the function that creates the dataframe.

## Let's explore how many pitchers there usually are per game

In [None]:
# let's grab the dataframe created previously through schedule_and_record() for the Astros' 2019 season:
Astros_record.head()

### We will have to convert the date format, because the dates come as free text (e.g. "Thursday, Mar 28") rather than as datetime values

In [None]:
# Pull the "<Mon> <day>" part out of the schedule's free-text date.
date = Astros_record.Date[1]
# \d{1,2} accepts both one- and two-digit days, so the month name is always
# kept together with the day number.
date = re.findall(r"\w+ \d{1,2}", date)
date = date[0]
print(date)
year = "2019"
date = year + " " + date
date

In [None]:
import datetime as dt

In [None]:
date_formatted = dt.datetime.strptime(date,"%Y %b %d")
date_formatted

In [None]:
date_formatted.month

### it worked out, so now we know how

In [None]:
type(date_formatted)

# %%%%%%%    Building the Training DataFrame    %%%%%%%%

In [23]:
# Import required libraries
from path import Path
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pybaseball import batting_stats
from pybaseball import batting_stats_range
from pybaseball import pitching_stats_range
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from pybaseball import schedule_and_record
import re
from pybaseball import pitching_stats
import datetime as dt

In [4]:
# Maps a team nickname to a three-letter team code. get_key_from_dict() is used
# on this table for the reverse lookup (code -> nickname) when filtering the
# stats tables by their Team column.
# NOTE(review): 'White_Sox' is spelled with an underscore while 'Red Sox' and
# 'Blue Jays' use a space — confirm which spelling the Team column uses.
# NOTE(review): 'Cubs' maps to 'CUB', but acr_team_dict lists 'CHC' as an incoming
# code for that club — confirm a 'CHC' opponent can be resolved through this table.
team_dict = {'Angels':'LAA',
            'Athletics': 'OAK',
            'Astros': 'HOU',
            'Braves': 'ATL',
            'Brewers': 'MIL',
            'Cards': 'STL',
            'Cubs': 'CUB',
            'Diamondbacks': 'ARI',
            'Dodgers': 'LAD',
            'Giants': 'SFG',
            'Indians': 'CLE',
            'Blue Jays': 'TOR',
            'Mariners': 'SEA',
            'Marlins': 'MIA',
            'Mets': 'NYM',
            'Nats': 'WAS',
            'Orioles': 'BAL',
            'Padres': 'SDP',
            'Phillies': 'PHI',
            'Pirates': 'PIT',
            'Rangers': 'TEX',
            'Rays': 'TBR',
            'Red Sox': 'BOS',
            'Reds': 'CIN',
            'Rockies': 'COL',
            'Royals': 'KCR',
            'Tigers': 'DET',
            'Twins': 'MIN',
            'White_Sox': 'CHW',
            'Yankees': 'NYY'
             
           }
def get_key_from_dict(dictionary, val):
    """Return the first key of ``dictionary`` whose value equals ``val``.

    Entries are scanned in the mapping's iteration order. ``None`` is returned
    when no value matches.
    """
    matching_keys = (key for key, value in dictionary.items() if val == value)
    return next(matching_keys, None)
 

In [5]:
"""
Not necesary anymore.
"""
def switch_key_val_dict(dictionary):
    """Return a new dict in which every value of ``dictionary`` maps to its key.

    When several keys share one value, the key seen last in iteration order wins.
    The input mapping is left untouched.
    """
    return {value: key for key, value in dictionary.items()}
        

In [25]:
# Team codes that get_players_per_game() rewrites before it searches the
# starting-pitchers table: incoming code -> code used for the lookup.
acr_team_dict = {
    'CHC': 'CUB',
    'LAD': 'LOS',
    'SFG': 'SFO',
    'SDP': 'SDG',
    'TBR': 'TAM',
    'KCR': 'KAN',
    'CHW': 'CWS',
}
print(acr_team_dict.keys())

dict_keys(['CHC', 'LAD', 'SFG', 'SDP', 'TBR', 'KCR', 'CHW'])


In [7]:


def format_dates_to_dt(un_date="Monday, Dec 31", year=1999):
    """Turn a schedule date such as ``"Monday, Dec 31"`` into a ``datetime``.

    The first ``" <Mon> <day>"`` fragment found in ``un_date`` (three word
    characters, whitespace, digits, preceded by a non-word character) is glued
    onto ``year`` and parsed with ``"%Y %b %d"``.

    Raises
    ------
    IndexError
        If ``un_date`` contains no such fragment.
    """
    month_and_day = re.findall(r"\W\w\w\w\s\d+", un_date)[0]
    return dt.datetime.strptime(f"{year}{month_and_day}", "%Y %b %d")

In [8]:
def modify_dates_from_lineups(date="1. Thu,3/29 at TEX W (4-1)#", year = 2018):
    """Extract the ``month/day`` token from a lineup label and prefix it with ``year``.

    Parameters
    ----------
    date : str
        Raw label, e.g. ``"1. Thu,3/29 at TEX W (4-1)#"``. The first
        ``<digits>/<digits>`` run in the text is taken as the date.
    year : int or str
        Season placed in front of the extracted token.

    Returns
    -------
    str or None
        ``"<year> <month>/<day>"``; ``None`` when ``date`` is not a string
        (for example a missing cell) or contains no ``<digits>/<digits>`` run.
    """
    # Missing cells arrive as NaN floats; treat them like "no date" instead of
    # letting the regex call raise TypeError.
    if not isinstance(date, str):
        return None
    found = re.search(r"\d+/\d+", date)
    if found is None:
        return None
    return str(year) + " " + found.group(0)

In [9]:
def modify_date_col_from_lineups(df, year):
    """Convert the raw label column ``"0"`` of a lineups frame to datetimes, in place.

    Each label is reduced to ``"<year> <month>/<day>"`` by
    modify_dates_from_lineups() and then parsed. Labels without a date become
    ``NaT``.

    Parameters
    ----------
    df : pandas.DataFrame
        Lineups frame with a column named ``"0"`` holding the raw labels.
        The frame is modified in place.
    year : int
        Season the labels belong to.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # Only column "0" is needed, so map over that Series instead of applying a
    # function to every full row.
    df["0"] = df["0"].map(lambda label: modify_dates_from_lineups(label, year))
    # The helper always produces "<year> <month>/<day>", so state the format
    # rather than leaving pandas to guess it.
    df["0"] = pd.to_datetime(df["0"], format="%Y %m/%d")
    return df

In [10]:
def clean_lineups(df):
    """Keep only the text before the first ``-`` in the lineup columns, in place.

    The columns at positions 2 through 10 are rewritten. Every string cell is
    cut at its first hyphen; non-string cells (e.g. missing values) are left as
    they are. Frames with fewer than 11 columns are processed as far as their
    columns reach.

    Parameters
    ----------
    df : pandas.DataFrame
        Lineups frame; assumed to have unique column labels. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # One pass per column replaces the former per-cell, whole-frame replace().
    for column in df.columns[2:11]:
        df[column] = df[column].map(
            lambda cell: cell.split("-")[0] if isinstance(cell, str) else cell
        )
    return df

In [11]:

def format_lineups_df(df, year):
    """Normalise a raw lineups frame: parse dates, clean names, relabel, re-index.

    The frame goes through modify_date_col_from_lineups() and then
    clean_lineups(). Its eleven columns are renamed to ``"index"``, ``"Date"``
    and ``"1"`` .. ``"9"``, and ``"index"`` becomes the row index. The passed
    frame itself is modified and returned.
    """
    lineups = clean_lineups(modify_date_col_from_lineups(df, year))
    lineups.columns = ["index", "Date"] + [str(slot) for slot in range(1, 10)]
    lineups.set_index("index", inplace=True, drop=True)
    return lineups
    
    

In [12]:
def get_dates_played(df=None,year=None):
    """Return every entry of ``df.Date`` converted by format_dates_to_dt(), as a list."""
    dates_played = []
    for raw_date in df.Date:
        dates_played.append(format_dates_to_dt(raw_date, year))
    return dates_played

In [13]:
def get_team_schedule(year=None, team = "HOU"):
    """Download one team's season schedule and keep seven of its columns.

    Parameters
    ----------
    year : int
        Season passed to ``schedule_and_record``.
    team : str
        Team code passed to ``schedule_and_record``.

    Returns
    -------
    pandas.DataFrame
        Columns at positions 0-4, 10 and 17 of the downloaded table. In the run
        recorded in this notebook these were Date, Tm, Home_Away, Opp, W/L, GB
        and Streak. ``Date`` is converted with format_dates_to_dt(); every cell
        equal to ``"@"`` becomes 1 and every cell equal to ``"Home"`` becomes 0.
    """
    # NOTE(review): the selection is positional — confirm the positions still
    # match if the installed pybaseball version changes the table layout.
    # .copy() makes the edits below act on an independent frame rather than on
    # a slice of the downloaded table (avoids SettingWithCopyWarning).
    teams_df = schedule_and_record(year, team).iloc[:, [0,1,2,3,4,10,17]].copy()
    teams_df["Date"] = teams_df["Date"].map(lambda raw_date: format_dates_to_dt(raw_date, year))
    teams_df = teams_df.replace("@", 1).replace("Home", 0)
    return teams_df

In [14]:
def _pitcher_for(pitchers_on_date, team_code):
    """Return the first ``PITCHER`` listed for ``team_code``, or ``None`` if there is no row."""
    listed = pitchers_on_date[pitchers_on_date["Team"] == team_code]["PITCHER"].values
    return listed[0] if len(listed) else None


def get_players_per_game(year = None, team = None):
    """Build one row per game with the schedule data, both lineups and both starters.

    Reads ``Starting_Pitchers/Starting_Pitchers_{year}.csv`` and
    ``Lineups/{code}_lineups_{year}.csv`` for ``team`` and for every opponent in
    the schedule returned by get_team_schedule().

    Parameters
    ----------
    year : int
        Season to assemble.
    team : str
        Team code as used by the schedule and by the lineup file names.

    Returns
    -------
    pandas.DataFrame
        The schedule columns followed by ``pitcher_opp``, ``pitcher_team`` and
        ``player_01`` .. ``player_18`` (the team's lineup first, then the
        opponent's). Games for which either lineup is missing are reported and
        left out. A starter that cannot be found is stored as NaN.
    """
    schedule_df = get_team_schedule(year, team)
    opponents = set(schedule_df["Opp"])

    pitchers_path = Path(f"Starting_Pitchers/Starting_Pitchers_{year}.csv")
    starting_pitchers = pd.read_csv(pitchers_path)

    lineups_path = Path(f"Lineups/{team}_lineups_{year}.csv")
    all_lineups_season = format_lineups_df(pd.read_csv(lineups_path), year)

    opponents_lineups = {}
    for opponent in opponents:
        opp_lineups_path = Path(f"Lineups/{opponent}_lineups_{year}.csv")
        opponents_lineups[opponent] = format_lineups_df(pd.read_csv(opp_lineups_path), year)

    # The starting-pitchers table uses different codes for some clubs; apply the
    # same translation to the team itself that is applied to the opponent below.
    team_pitcher_code = acr_team_dict.get(team, team)

    records = []
    # Take the opponent from the same schedule row as the date, so two games on
    # one date each keep their own opponent.
    for date, adversary in zip(schedule_df["Date"], schedule_df["Opp"]):
        date_key = date.strftime("%Y-%m-%d")
        record = {"Date": date}

        ##Line ups for the team
        team_lineup = all_lineups_season[all_lineups_season["Date"] == date_key]
        # Filtering never raises; a missing game shows up as an empty selection.
        if team_lineup.empty:
            print(f"No game on this date {date} for team")
            continue
        for slot, player in enumerate(team_lineup.iloc[0][1:], start=1):
            record[f"player_{slot:02}"] = player

        ##Line ups for the adversary
        opp_lineups_df = opponents_lineups[adversary]
        opp_lineup = opp_lineups_df[opp_lineups_df["Date"] == date_key]
        if opp_lineup.empty:
            print(f"No game on this date {date} for opponent")
            continue
        for slot, enemy in enumerate(opp_lineup.iloc[0][1:], start=10):
            record[f"player_{slot:02}"] = enemy

        pitchers_on_date = starting_pitchers[starting_pitchers["Date"] == date_key]

        team_pitcher = _pitcher_for(pitchers_on_date, team_pitcher_code)
        record["pitcher_team"] = np.nan if team_pitcher is None else team_pitcher

        if adversary in acr_team_dict.keys():
            print("Adversary in list of conflicting acronyms")
            print(f"\n acronym received {adversary}")
            adversary = acr_team_dict[adversary]
            print(f"\n acronym after process {adversary}")
        print(f"adversary: {adversary}\n")

        opponent_pitcher = _pitcher_for(pitchers_on_date, adversary)
        if opponent_pitcher is None:
            print(f"No pitcher found for {adversary} on {date}")
            record["pitcher_opp"] = np.nan
        else:
            print(f"OPPONENT PITCHER: \n{opponent_pitcher}")
            record["pitcher_opp"] = opponent_pitcher

        records.append(record)

    # Build the frame once instead of concatenating row by row. Columns are
    # sorted by label because get_stats_startingplayer_by_game() reads them by
    # position (pitchers first, then player_01 .. player_18).
    players_df = pd.DataFrame(records) if records else pd.DataFrame(columns=["Date"])
    players_df = players_df.sort_index(axis=1)

    # NOTE(review): the join below aligns on Date; confirm how seasons with two
    # games on one date should be handled, since Date is then not unique.
    schedule_df = schedule_df.set_index("Date")
    players_df = players_df.set_index("Date")
    teams_df = pd.concat([schedule_df, players_df], axis=1, join="inner")
    teams_df = teams_df.reset_index(drop=False)

    return teams_df

In [56]:
players_per_game_for_HOU_2019 = get_players_per_game(year = 2019, team = "HOU")

Adversary in list of conflicting acronyms

 acronym received TBR

 acronym after process TAM
adversary: TAM

OPPONENT PITCHER: 
Snell
Adversary in list of conflicting acronyms

 acronym received TBR

 acronym after process TAM
adversary: TAM

OPPONENT PITCHER: 
Morton
Adversary in list of conflicting acronyms

 acronym received TBR

 acronym after process TAM
adversary: TAM

OPPONENT PITCHER: 
Glasnow
Adversary in list of conflicting acronyms

 acronym received TBR

 acronym after process TAM
adversary: TAM

OPPONENT PITCHER: 
Chirinos
adversary: TEX

OPPONENT PITCHER: 
Smyly
adversary: TEX

OPPONENT PITCHER: 
Miller
adversary: TEX

OPPONENT PITCHER: 
Minor
adversary: OAK

OPPONENT PITCHER: 
Montas
adversary: OAK

OPPONENT PITCHER: 
Brooks
adversary: OAK

OPPONENT PITCHER: 
Fiers
adversary: NYY

OPPONENT PITCHER: 
Tanaka
adversary: NYY

OPPONENT PITCHER: 
Loaisiga
adversary: NYY

OPPONENT PITCHER: 
Paxton
adversary: SEA

OPPONENT PITCHER: 
Leblanc
adversary: SEA

OPPONENT PITCHER: 
Her

In [57]:
players_per_game_for_HOU_2019.tail(20)

Unnamed: 0,Date,Tm,Home_Away,Opp,W/L,GB,Streak,pitcher_opp,pitcher_team,player_01,...,player_09,player_10,player_11,player_12,player_13,player_14,player_15,player_16,player_17,player_18
142,2019-09-07,HOU,0,SEA,W,up 9.5,3,Kikuchi,Verlander,Altuve,...,Reddick,Moore,Smith,Seager,Narvaez,Vogelbach,Murphy,Long,Fraley,Gordon
143,2019-09-08,HOU,0,SEA,W,up 9.5,4,Hernandez,Cole,Springer,...,Marisnick,Gordon,Long,Nola,Seager,Narvaez,Vogelbach,Lopes,Bishop,Broxton
144,2019-09-09,HOU,0,OAK,W,up10.5,5,Fiers,Greinke,Springer,...,Reddick,Semien,Grossman,Chapman,Olson,Canha,Brown,Davis,Profar,Phegley
145,2019-09-10,HOU,0,OAK,L,up 9.5,-1,Roark,Miley,Springer,...,Reddick,Semien,Chapman,Olson,Canha,Pinder,Davis,Profar,Murphy,Grossman
146,2019-09-11,HOU,0,OAK,L,up 8.5,-2,Anderson,Urquidy,Springer,...,Straw,Semien,Chapman,Olson,Canha,Brown,Davis,Laureano,Profar,Murphy
147,2019-09-12,HOU,0,OAK,L,up 7.5,-3,Bailey,Verlander,Springer,...,Reddick,Semien,Chapman,Olson,Canha,Brown,Davis,Profar,Grossman,Phegley
148,2019-09-13,HOU,1,KCR,W,up 7.5,1,Duffy,Cole,Springer,...,Marisnick,Merrifield,Mondesi,Soler,Dozier,Gordon,McBroom,O'Hearn,Starling,Viloria
149,2019-09-14,HOU,1,KCR,W,up 7.5,2,Montgomery,Greinke,Springer,...,Reddick,Merrifield,Mondesi,Soler,Dozier,Gordon,McBroom,O'Hearn,Viloria,Lopez
150,2019-09-15,HOU,1,KCR,W,up 7.5,3,Junis,Miley,Tucker,...,Maldonado,Merrifield,Mondesi,Soler,Dozier,McBroom,Cuthbert,Starling,Dini,Lopez
151,2019-09-17,HOU,0,TEX,W,up 8.0,4,Lynn,Verlander,Springer,...,Reddick,Choo,Andrus,Calhoun,Mazara,Santana,Solak,Odor,DeShields,Trevino


In [58]:
players_per_game_for_HOU_2019.columns

Index(['Date', 'Tm', 'Home_Away', 'Opp', 'W/L', 'GB', 'Streak', 'pitcher_opp',
       'pitcher_team', 'player_01', 'player_02', 'player_03', 'player_04',
       'player_05', 'player_06', 'player_07', 'player_08', 'player_09',
       'player_10', 'player_11', 'player_12', 'player_13', 'player_14',
       'player_15', 'player_16', 'player_17', 'player_18'],
      dtype='object')

In [59]:
players_per_game_for_HOU_2019[players_per_game_for_HOU_2019["pitcher_opp"].isna()]

Unnamed: 0,Date,Tm,Home_Away,Opp,W/L,GB,Streak,pitcher_opp,pitcher_team,player_01,...,player_09,player_10,player_11,player_12,player_13,player_14,player_15,player_16,player_17,player_18


In [60]:
def get_col_explanation(col1,col2):
    """Report which column names one collection has that the other lacks.

    Both collections are printed (first ten entries), reduced to bare column
    names, printed again, and then compared: the result is the set of names
    present in the longer collection but not in the shorter one.

    A bare name is whatever follows the first ``_`` (``"01_Name"`` ->
    ``"Name"``). Names without ``_`` and non-string labels are kept unchanged,
    and entries equal to ``"index"`` are dropped.

    Parameters
    ----------
    col1, col2 : sequence of str
        Column labels, e.g. lists or ``pandas.Index`` objects.

    Returns
    -------
    list
        The missing names, sorted by their string form.
    """
    def strip_prefix(names):
        bare = []
        for name in names:
            if not isinstance(name, str):
                bare.append(name)
                continue
            # Compare by value: identity (`is not`) is unreliable for strings.
            if name == "index":
                continue
            _, separator, remainder = name.partition("_")
            bare.append(remainder if separator else name)
        return bare

    print(f"column 1: {col1[:10]}\ncolumn2: {col2[:10]}")
    col1 = strip_prefix(col1)
    col2 = strip_prefix(col2)
    print(f"column 1: {col1[:10]}\ncolumn2: {col2[:10]}")
    if len(col1) > len(col2):
        longer, shorter = col1, col2
    else:
        longer, shorter = col2, col1
    return sorted(set(longer).difference(shorter), key=str)

In [61]:
get_col_explanation(["1_a","222_b","2_c"],["222_b","1_c"])

column 1: ['1_a', '222_b', '2_c']
column2: ['222_b', '1_c']
column 1: ['a', 'b', 'c']
column2: ['b', 'c']


['a']

In [62]:
# Inspect the frame that this notebook actually builds (the 2019 season).
type(players_per_game_for_HOU_2019.columns)

pandas.core.indexes.base.Index

In [63]:
def stats_single_game_x_team(players=None, team=None, stats_df=None, counter=1, pitching_stats = False):
    """Collect the stats rows of one team's players and lay them out side by side.

    Parameters
    ----------
    players : iterable
        Player names to look up. A name matches the first entry of the team's
        ``Name`` column that contains it as a substring. Entries whose string
        form contains ``"00:00:00"`` (dates) are skipped.
    team : str
        Team code; translated to a nickname through ``team_dict`` and matched
        against ``stats_df.Team``.
    stats_df : pandas.DataFrame
        Stats table with at least ``Team`` and ``Name`` columns.
    counter : int
        Number used to prefix the first player's columns (``"01_"``, ...). It
        increases by one per player.
    pitching_stats : bool
        Not used.

    Returns
    -------
    pandas.DataFrame
        For every player, that player's stats columns renamed to
        ``"{counter:02}_{column}"`` plus an unprefixed ``index`` column, all
        concatenated column-wise. Players that cannot be found contribute a row
        of NaN. An empty frame is returned when nothing was collected.
    """
    team_name = get_key_from_dict(team_dict, team)

    # The team filter is the same for every player, so compute it once.
    try:
        all_team_stats = stats_df[stats_df.Team == team_name]
    except Exception:
        all_team_stats = None

    frames = []
    for player in players:

        if "00:00:00" in str(player): continue #If it's a date, Skip it.

        # `except Exception` keeps the best-effort lookup but lets
        # KeyboardInterrupt stop a long run.
        try:
            player_full_name = [x for x in all_team_stats.Name if player in x]
            # .copy() so renaming the columns below does not touch a slice of stats_df.
            player_stats = all_team_stats[all_team_stats.Name == player_full_name[0]].copy()
        except Exception:
            print(f"{player} from {team_name}, {team} not in batting list")
            try:
                placeholder = np.full((1, len(stats_df.columns)), np.nan)
            except Exception as e:
                print(f"Not able to get columns from last player\n{e}")
                continue
            player_stats = pd.DataFrame(data = placeholder, columns = stats_df.columns)

        player_stats.columns = [f"{counter:02}_{col}" for col in player_stats.columns]
        # NOTE(review): reset_index() adds an unprefixed "index" column per
        # player, so the result carries repeated "index" labels — confirm that
        # is wanted before relying on label-based access downstream.
        player_stats = player_stats.reset_index()
        frames.append(player_stats)

        counter+=1

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=1)

In [64]:
def get_stats_startingplayer_by_game(players_df, team, year):
    """Assemble previous-season batting stats for both lineups of every game.

    Parameters
    ----------
    players_df : pandas.DataFrame
        Frame produced by get_players_per_game(). It is read by position:
        column 3 is the opponent, columns 9-17 the team's lineup and columns 18
        onwards the opponent's lineup.
    team : str
        Team code of the team whose games are listed in ``players_df``.
    year : int
        Season of the games; batting stats are taken from ``year - 1``.

    Returns
    -------
    pandas.DataFrame
        One row per game (index renumbered from 0) holding the columns built by
        stats_single_game_x_team() for the team (prefixes 01-09) and the
        opponent (prefixes from 10). Games whose column layout differs from the
        first collected game are reported and left out.
    """
    batting_season_data = batting_stats(year-1)

    game_frames = []
    expected_columns = None

    for row in range(0,len(players_df)):
        game = players_df.iloc[row]

        # .iloc keeps the access positional; plain [] with integers on a
        # label-indexed Series is deprecated.
        all_players_team_stats = stats_single_game_x_team(players = game.iloc[9:18],
                                                          team = team,
                                                          stats_df = batting_season_data)

        all_players_opp_stats = stats_single_game_x_team(players = game.iloc[18:],
                                                         team = game.iloc[3],
                                                         stats_df = batting_season_data,
                                                         counter = 10)
        try:
            all_players_stats = pd.concat([ all_players_team_stats, all_players_opp_stats] ,   axis=1)
        except Exception as e:
            print(f"could not concatenate team with opponent stats\n{e}")
            continue

        # Nothing was collected for this game.
        if all_players_stats.shape[1] == 0:
            continue

        # Rows can only be stacked when their column layout is identical,
        # because the per-player frames repeat the "index" label.
        if expected_columns is None:
            expected_columns = all_players_stats.columns
        elif not all_players_stats.columns.equals(expected_columns):
            print("could not append: column layout differs from earlier games")
            print(f"explanation: \n{get_col_explanation(expected_columns,all_players_stats.columns)}")
            continue

        game_frames.append(all_players_stats)

    # NOTE(review): rows are renumbered, so a skipped game shifts every later
    # row relative to players_df — confirm callers that join by position.
    if not game_frames:
        return pd.DataFrame()
    # DataFrame.append no longer exists in pandas 2.x; stack all games at once.
    return pd.concat(game_frames, ignore_index=True)

In [65]:

test = get_stats_startingplayer_by_game(players_per_game_for_HOU_2019, "HOU", 2019)
test

Brantley from Astros, HOU not in batting list
Chirinos from Astros, HOU not in batting list
Diaz from Astros, HOU not in batting list
Meadows from Rays, TBR not in batting list
Pham from Rays, TBR not in batting list
Choi from Rays, TBR not in batting list
Diaz from Rays, TBR not in batting list
Zunino from Rays, TBR not in batting list
Brantley from Astros, HOU not in batting list
Diaz from Astros, HOU not in batting list
Meadows from Rays, TBR not in batting list
Pham from Rays, TBR not in batting list
Choi from Rays, TBR not in batting list
Diaz from Rays, TBR not in batting list
Garcia from Rays, TBR not in batting list
Zunino from Rays, TBR not in batting list
Brantley from Astros, HOU not in batting list
Chirinos from Astros, HOU not in batting list
Meadows from Rays, TBR not in batting list
Pham from Rays, TBR not in batting list
Choi from Rays, TBR not in batting list
Diaz from Rays, TBR not in batting list
Garcia from Rays, TBR not in batting list
Diaz from Astros, HOU not in 

Unnamed: 0,index,01_Season,01_Name,01_Team,01_Age,01_G,01_AB,01_PA,01_H,01_1B,...,18_wSL/C (pi),18_wXX/C (pi),18_O-Swing% (pi),18_Z-Swing% (pi),18_Swing% (pi),18_O-Contact% (pi),18_Z-Contact% (pi),18_Contact% (pi),18_Zone% (pi),18_Pace (pi)
0,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,,,,,,,,,,
1,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,,,,,,,,,,
2,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,-4.51,,0.281,0.717,0.503,0.439,0.853,0.740,0.510,23.5
3,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,-1.37,,0.277,0.667,0.467,0.561,0.819,0.740,0.486,23.2
4,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,-2.35,,0.225,0.559,0.390,0.620,0.894,0.814,0.494,22.6
5,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,-2.35,,0.225,0.559,0.390,0.620,0.894,0.814,0.494,22.6
6,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,,,,,,,,,,
7,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,,,,,,,,,,
8,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,,,,,,,,,,
9,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,0.11,,0.343,0.592,0.461,0.643,0.853,0.771,0.474,25.7


In [66]:
#test.columns[-1]
col = "18_Name"
test[test[col].isna()]

Unnamed: 0,index,01_Season,01_Name,01_Team,01_Age,01_G,01_AB,01_PA,01_H,01_1B,...,18_wSL/C (pi),18_wXX/C (pi),18_O-Swing% (pi),18_Z-Swing% (pi),18_Swing% (pi),18_O-Contact% (pi),18_Z-Contact% (pi),18_Contact% (pi),18_Zone% (pi),18_Pace (pi)
0,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,,,,,,,,,,
1,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,,,,,,,,,,
6,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,,,,,,,,,,
7,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,,,,,,,,,,
8,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,,,,,,,,,,
10,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,,,,,,,,,,
15,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,,,,,,,,,,
16,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,,,,,,,,,,
26,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,,,,,,,,,,
27,168,2018.0,George Springer,Astros,28.0,140.0,544.0,620.0,144.0,96.0,...,,,,,,,,,,


In [69]:
batting_season_2018 = batting_stats(2018)

In [93]:
p_name = "Chirinos"
p_full_name = [x for x in batting_season_2018.Name if p_name in x]
p_full_name
players_st = [batting_season_2018[batting_season_2018["Name"]==x] for x in p_full_name]

In [94]:
# Stack every candidate's rows in one call (DataFrame.append no longer exists
# in pandas 2.x); fall back to an empty frame when nobody matched.
Possible_player = pd.concat(players_st) if players_st else pd.DataFrame()
Possible_player

Unnamed: 0,Season,Name,Team,Age,G,AB,PA,H,1B,2B,...,wSL/C (pi),wXX/C (pi),O-Swing% (pi),Z-Swing% (pi),Swing% (pi),O-Contact% (pi),Z-Contact% (pi),Contact% (pi),Zone% (pi),Pace (pi)
193,2018.0,Robinson Chirinos,Rangers,34.0,113.0,360.0,426.0,80.0,46.0,15.0,...,0.69,,0.276,0.628,0.441,0.538,0.757,0.684,0.47,26.2
957,2018.0,Yonny Chirinos,Rays,24.0,18.0,1.0,1.0,0.0,0.0,0.0,...,-1.83,,0.0,0.333,0.25,,0.0,0.0,0.75,16.3


In [None]:
def create_trining_df(year = (dt.datetime.today().year-1), team = "HOU" ):
    """Build the training frame for one team and season.

    The per-game frame from get_players_per_game() supplies its first seven
    columns; the per-game stats from get_stats_startingplayer_by_game() are
    placed next to them. Only row labels present in both frames are kept.
    """
    lineup_df = get_players_per_game(year, team)
    lineup_stats = get_stats_startingplayer_by_game(lineup_df, team, year)
    schedule_part = lineup_df.iloc[:, list(range(7))]
    return pd.concat([schedule_part, lineup_stats], axis=1, join='inner')
       
    
    
    
    #dates_played = get_dates_played(last_season_games_played, year)   

In [68]:
stats_players_start_lineup = create_trining_df()
stats_players_start_lineup

KeyboardInterrupt: 

In [None]:
stats_players_start_lineup.head()

In [None]:
# The stats columns carry a two-digit player prefix ("01_Name", "02_Name", ...),
# so select every "<NN>_Name" column rather than a bare "Name".
# NOTE(review): confirm that all player slots, not just one, should be checked.
names = stats_players_start_lineup.filter(regex=r"^\d+_Name$").isna()

In [None]:
# `names` is already a boolean missing-value mask; any() on it answers "is any name missing?".
names.values.any()