In [None]:
# Import required libraries
import re
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from path import Path
from pybaseball import batting_stats
from pybaseball import batting_stats_range
from pybaseball import pitching_stats_range
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

### Analysing *batting_stats()* with different parameters

In [None]:
all_data_2019 = batting_stats(2019)

In [None]:
print(str(int(all_data_2019["Season"][0])))

In [None]:
all_data_2019[["Name","Team"]]

In [None]:
all_data_2019 = batting_stats(2019)
players_with_more_than_50 = batting_stats(2019, qual=50)
all_data_since_2015 = batting_stats(2015, 2019)
aggregated_data = batting_stats(2010, 2016, ind=0)

print(all_data_2019.head())
print(players_with_more_than_50.head())
print(all_data_since_2015.head())
print(aggregated_data.head())

# Conclusion for batting_stats()
Very versatile function that can take one or two years as parameters (single season or multiple seasons). It returns a DataFrame with 287 columns containing all the batting statistics for every single player for the whole season/seasons. A lot of statistics!

### Analysing *schedule_and_record()*

In [None]:
from pybaseball import schedule_and_record

Astros_record = schedule_and_record(2019, "HOU")
Astros_record.head(20)

In [None]:
Astros_record.columns

### Conclusion for schedule_and_record()
Very useful function that can get the players who played in each game, and also all the dates on which a team played in a season, among others. It only has 19 columns, but this will be used as a *link* table between the others.

## Analysing *batting_stats_range()*

In [None]:
data_range = batting_stats_range("2019-09-01","2019-09-30")
data_range

In [None]:
print(data_range.columns)
print(all_data_2019.columns)

### Conclusion for *batting_stats_range()*

This function returns a much smaller DataFrame with only 27 columns. This will be useful to get the most recent information for each player right before each game.

# Let's look at all the columns for batting_stats() and batting_stats_range()

In [None]:
print(all_data_2019.columns[:95])

In [None]:
print(all_data_2019.columns[95:190])

In [None]:
print(all_data_2019.columns[190:])

In [None]:
len(all_data_2019.columns)

In [None]:
len(data_range.columns)

### Let's see what columns in the smaller dataframe are also in the larger df

In [None]:
# Index.intersection is the explicit set operation; `&` between two Index objects
# is no longer a set intersection in current pandas.
shared_columns = list(data_range.columns.intersection(all_data_2019.columns))

In [None]:
shared_columns

In [None]:
len(shared_columns)

### Basically all of them. Only 4 are not included. Let's see which ones are not in the larger df.

In [None]:
range_columns_not_in_all_data = list(data_range.columns.difference(all_data_2019.columns))
range_columns_not_in_all_data

# So #days is a useless column, BA is the same as AVG, Lev is useless too, and Tm is the same as Team. As a conclusion, all the columns are included.

In [None]:
Astros_2019stats  =  all_data_2019[all_data_2019["Team"]=="Astros"]

In [None]:
Astros_2019stats.count()

In [None]:
#Thursday, Mar 28
#"2019-03-28"
game_march_28_2019 = batting_stats_range("2019-03-28",)


In [None]:
# Call unique() so the cell displays the distinct team names instead of the bound method.
game_march_28_2019["Tm"].unique()

In [None]:
game_march_28_2019[game_march_28_2019["Tm"]=="Houston"]

## ...We can see who played in every game ^

In [None]:
game_march_28_2019.columns

In [None]:
game_march_28_2019.iloc[:,6]

### *"@"* means they're playing as *visitors*.

# Strategy:
### General Strategy:
Retrieve data for each player in each game played. Feed the model with data from past year for each player and past month. Build a dataframe where each row represents the features to train the model and to make predictions out of the model. 

Each row would have the full 283 features from past season plus the 23 from past month for each player that played the particular game plus 2 team features. 

### Specific Tasks

Usually there are 9 players, so there would be 9 times 283 features only to account for the past-season statistics of 1 team. This number would later also be multiplied by 2, since each game involves 2 teams. We will use the **schedule_and_record()** function to get the players who played in a game, so we can later use the **batting_stats()** function to get those 283 features from last season for each player that played a particular game.

There will also be recent statistics from last month for each player, which is 22 features. Again, this number would be multiplied by 9 and then by 2. We will use the **schedule_and_record()** function to get the players who played in that game, so we can later retrieve the past-month statistics for each one of those players from **batting_stats_range()**.

There will be also statistics from the team to be included in each record such as streak, and GB, also gotten from **schedule_and_record()**.

### Conclusion
The training DataFrame will contain a record for each game of a single team and its opponent. This leads us to  **DataFrame with dimensions of 5494 columns by *Games-Played-By-The-Team* rows**. This is only batting statistics so far.

## Analysing *pitching_stats()*

In [None]:
from pybaseball import pitching_stats

pitching_stats_2019 = pitching_stats("2019")
pitching_stats_2019.head(15)

In [None]:
pitching_stats_2019.columns

In [None]:
pitching_stats_2019.columns[:100]

In [None]:
pitching_stats_2019.columns[100:200]

In [None]:
pitching_stats_2019.columns[200:]

### Apparently, we should use only the *(pi)* variables. They are calculated using a new algorithm that eliminates what Brooks Baseball considers errors from Pitch FX (pfx).

link: https://www.reddit.com/r/Sabermetrics/comments/6qepoa/what_is_the_data_source_for_nonattributed_plate/

In [None]:
pfx_col = [x for x in pitching_stats_2019.columns if "(pfx)" in x ]
pfx_col

In [None]:
pitching_stats_2019.drop( columns = pfx_col, inplace=True )
pitching_stats_2019.columns

In [None]:
from pybaseball import pitching_stats_range

pitching_range_1day = pitching_stats_range("2019-03-28",)

In [None]:
pitching_range_1day.head()

In [None]:
pitchers_Astros_on_20190328 = pitching_range_1day[pitching_range_1day.Tm == "Houston"]
pitchers_Astros_on_20190328

## So! we can know who pitched on a certain game ^


In [None]:
pitchers_Astros_on_20190328.columns

#### An idea is to keep columns that represent a % of other columns. For example, keeping IFFB% but dropping IFFB, since a percentage tells more of a story than an isolated scalar value. In this way, we can reduce redundancy in our features and, therefore, the total size of our features array.

In [None]:
bat_col_not_to_drop = [x for x in all_data_2019.columns if "%" in x and x[:-1] in all_data_2019.columns]
bat_col_not_to_drop

In [None]:
bat_col_to_drop = [x[:-1] for x in bat_col_not_to_drop]
bat_col_to_drop

In [None]:
pitch_col_not_to_drop = [x for x in pitching_stats_2019.columns if "%" in x and x[:-1] in pitching_stats_2019.columns]
pitch_col_not_to_drop

In [None]:
pitch_col_to_drop = [x[:-1] for x in pitch_col_not_to_drop]
pitch_col_to_drop

In [None]:
all_data_2019.drop(columns=bat_col_to_drop, inplace=True)
pitching_stats_2019.drop(columns=pitch_col_to_drop, inplace=True)
print(f"Total columns in bat_statistics: {len(all_data_2019.columns)}\nTotal columns in pitch_statistics: {len(pitching_stats_2019.columns)} ")

### ...We still have loooots of columns.


##### We will have to manually get rid of some columns that we consider unnecessary, such as "Age" or "Team", etc. However, some of these columns might be useful during concatenation, so we will handle this within the function that creates the dataframe.

## Let's explore how many pitchers there usually are per game

In [None]:
# let's grab our datframe created through record_and_schedule() previously for The Astros season 2019:
Astros_record.head()

### We will have to convert the date format, because the dates come as strings such as "Thursday, Mar 28" with no year

In [None]:
# Dates come as strings such as "Thursday, Mar 28": no year, and the day may be one digit.
raw_date = Astros_record.Date[1]
# Match "<month abbreviation> <day>". The previous pattern `\w* [0-9][0-9]|[0-9]`
# parsed as "(word + two digits) OR (a single digit)", so single-digit days
# produced just the digit and lost the month.
date = re.findall(r"\w+ \d{1,2}", raw_date)[0]
print(date)
year = "2019"
date = year + " " + date
date

In [None]:
import datetime as dt

In [None]:
date_formatted = dt.datetime.strptime(date,"%Y %b %d")
date_formatted

In [None]:
date_formatted.month

### it worked out, so now we know how

In [None]:
type(date_formatted)

# %%%%%%%    Building the Training DataFrame    %%%%%%%%

In [1]:
# Import required libraries
from path import Path
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pybaseball import batting_stats
from pybaseball import batting_stats_range
from pybaseball import pitching_stats_range
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from pybaseball import schedule_and_record
import re
from pybaseball import pitching_stats
import datetime as dt

In [66]:
# Maps a team nickname (the form compared against the `Team` column of the season
# stats frames) to its three-letter schedule code; get_key_from_dict() performs the
# reverse lookup (code -> nickname).
# NOTE(review): 'White_Sox' uses an underscore while 'Red Sox' and 'Blue Jays' use a
# space — confirm which spelling the stats data actually contains.
team_dict = {'Angels':'LAA',
            'Athletics': 'OAK',
            'Astros': 'HOU',
            'Braves': 'ATL',
            'Brewers': 'MIL',
            'Cards': 'STL',
            'Cubs': 'CHC',
            'Diamondbacks': 'ARI',
            'Dodgers': 'LAD',
            'Giants': 'SFG',
            'Indians': 'CLE',
            'Blue Jays': 'TOR',
            'Mariners': 'SEA',
            'Marlins': 'MIA',
            'Mets': 'NYM',
            'Nats': 'WSN',
            'Orioles': 'BAL',
            'Padres': 'SDP',
            'Phillies': 'PHI',
            'Pirates': 'PIT',
            'Rangers': 'TEX',
            'Rays': 'TBR',
            'Red Sox': 'BOS',
            'Reds': 'CIN',
            'Rockies': 'COL',
            'Royals': 'KCR',
            'Tigers': 'DET',
            'Twins': 'MIN',
            'White_Sox': 'CHW',
            'Yankees': 'NYY'
             
           }
def get_key_from_dict(dictionary, val):
    """Return the first key of ``dictionary`` whose value equals ``val``.

    Keys are scanned in the dictionary's iteration order. ``None`` is returned
    when no value matches.
    """
    matching_keys = (key for key, value in dictionary.items() if val == value)
    return next(matching_keys, None)
 

In [3]:
"""
Not necesary anymore.
"""
def switch_key_val_dict(dictionary):
    """Return a new dict with the keys and values of ``dictionary`` swapped.

    When several keys share a value, the key seen last in iteration order wins.
    """
    return {value: key for key, value in dictionary.items()}
        

In [68]:
# Alternate three-letter codes for the teams whose code in `team_dict` differs from
# the one used when matching the `Team` column of the pitcher/name tables.
# Codes that are identical in both places (e.g. 'LAD') are intentionally absent.
acr_team_dict = {
    'CHC': 'CUB',
    'SFG': 'SFO',
    'SDP': 'SDG',
    'TBR': 'TAM',
    'KCR': 'KAN',
    'CHW': 'CWS',
}
print(acr_team_dict.keys())

dict_keys(['CHC', 'SFG', 'SDP', 'TBR', 'KCR', 'CHW'])


In [5]:


def format_dates_to_dt(un_date="Monday, Dec 31", year=1999):
    """Turn a schedule date such as ``"Monday, Dec 31"`` into a ``datetime``.

    The first "<separator><3-letter month> <day>" fragment found in ``un_date``
    is appended to ``year`` and parsed with the ``"%Y %b %d"`` format.
    Raises ``IndexError`` when ``un_date`` contains no such fragment.
    """
    month_and_day = re.findall(r"\W\w\w\w\s\d+", un_date)[0]
    # The fragment starts with its own separator character, so no extra space is added.
    return dt.datetime.strptime(f"{year}{month_and_day}", "%Y %b %d")

In [6]:
def modify_dates_from_lineups(date="1. Thu,3/29 at TEX W (4-1)#", year = 2018):
    """Extract the first ``month/day`` fragment of a line-up label and prefix the year.

    Returns a string such as ``"2018 3/29"``, or ``None`` when the label holds
    no ``digits/digits`` fragment.
    """
    month_day_matches = re.findall(r"\d+/\d+", date)
    if not month_day_matches:
        return
    return f"{year} {month_day_matches[0]}"

In [7]:
def modify_date_col_from_lineups(df, year):
    """Convert the raw line-up labels in column ``"0"`` to datetimes, in place.

    Each label goes through ``modify_dates_from_lineups`` (yielding
    ``"<year> <month>/<day>"`` or ``None``) and is then parsed. Labels without
    a date become ``NaT``.

    Parameters
    ----------
    df : DataFrame with a column named ``"0"`` holding the raw labels.
    year : season year prefixed to every extracted ``month/day``.

    Returns
    -------
    The same ``df`` object, with column ``"0"`` replaced by datetimes.
    """
    year_month_day = df["0"].map(lambda label: modify_dates_from_lineups(label, year))
    # Pin the format: the strings are always "<year> <month>/<day>", so there is
    # no reason to rely on pandas' per-value format inference.
    df["0"] = pd.to_datetime(year_month_day, format="%Y %m/%d")
    return df

In [8]:
def clean_lineups(df):
    """Strip the ``-<suffix>`` part from every player cell of a line-up frame.

    The player cells are the columns at positions 2 through 10. Each string
    cell is cut at its first ``"-"``, e.g. ``"Altuve-2B"`` -> ``"Altuve"``.
    Non-string cells (such as NaN for a missing slot) are left untouched
    instead of raising.

    The previous implementation ran a whole-frame ``DataFrame.replace`` once
    per cell. This version maps each player column once and only writes to the
    player columns.

    Parameters
    ----------
    df : DataFrame whose columns at positions 2..10 hold the player labels.

    Returns
    -------
    The same ``df`` object, modified in place.
    """
    def strip_suffix(cell):
        return cell.split("-")[0] if isinstance(cell, str) else cell

    for column_label in df.columns[2:11]:
        df[column_label] = df[column_label].map(strip_suffix)
    return df

In [9]:

def format_lineups_df(df, year):
    """Normalise a raw line-up frame: parse dates, clean names, relabel, re-index.

    After the call the frame is indexed by its first column (renamed
    ``"index"``) and carries the columns ``"Date"`` and ``"1"`` .. ``"9"``.
    The input frame is modified in place and also returned.
    """
    lineups = clean_lineups(modify_date_col_from_lineups(df, year))
    batting_slots = [str(slot) for slot in range(1, 10)]
    lineups.columns = ["index", "Date"] + batting_slots
    lineups.set_index("index", drop=True, inplace=True)
    return lineups
    
    

In [10]:
def get_dates_played(df=None,year=None):
    """Return the ``Date`` column of ``df`` as a list of datetimes for ``year``.

    Every entry is converted with ``format_dates_to_dt``.
    """
    dates_played = []
    for raw_date in df.Date:
        dates_played.append(format_dates_to_dt(raw_date, year))
    return dates_played

In [11]:
def get_team_schedule(year=None, team = "HOU"):
    """Fetch a team's season schedule and reduce it to the columns used downstream.

    Parameters
    ----------
    year : season to fetch.
    team : three-letter team code passed to ``schedule_and_record``.

    Returns
    -------
    DataFrame with the columns at positions 0, 1, 2, 3, 4, 10 and 17 of the
    ``schedule_and_record`` result, with ``Date`` converted to datetimes and
    the markers ``"@"`` / ``"Home"`` replaced by ``1`` / ``0``.
    If the schedule cannot be fetched, the error message is printed and
    returned as a string instead, so callers must check for ``str``.
    """
    try:
        teams_df = schedule_and_record(year, team)
    except Exception:
        # Was `message: "..."`, an annotation that never bound the name, so the
        # error path itself died with a NameError.
        message = "Not able to get_team_schedule()"
        print(message)
        return message

    # Copy so the edits below act on an independent frame, not a view of the download.
    teams_df = teams_df.iloc[:, [0, 1, 2, 3, 4, 10, 17]].copy()
    teams_df["Date"] = teams_df["Date"].map(lambda raw_date: format_dates_to_dt(raw_date, year))
    teams_df = teams_df.replace("@", 1).replace("Home", 0)
    return teams_df

In [20]:
def get_players_per_game(year = None, team = None):
    """List, for every scheduled game of ``team``, both starting line-ups and pitchers.

    Reads ``Starting_Pitchers/Starting_Pitchers_{year}.csv`` and one
    ``Lineups/{code}_lineups_{year}.csv`` file for the team and for each of its
    opponents.

    Parameters
    ----------
    year : season to process.
    team : three-letter team code as used by ``get_team_schedule``.

    Returns
    -------
    DataFrame with the schedule columns followed, in alphabetical order, by
    ``pitcher_opp``, ``pitcher_team`` and ``player_01`` .. ``player_18``
    (01-09 are the team's line-up, 10-18 the opponent's). Games with a missing
    line-up on either side are skipped. A pitcher that cannot be found leaves
    its cell empty (NaN).
    If the schedule could not be fetched, the error-message string from
    ``get_team_schedule`` is returned unchanged.

    Changes from the previous version: the string check uses ``isinstance``
    (``type(x) == "str"`` is never true); a missing line-up is detected with
    ``.empty`` (filtering never raises, so the old ``try`` could not catch it and
    ``.iloc[0]`` crashed instead); the unused batting CSV is no longer read; the
    rows are collected in a list and turned into a frame once; ``team`` is no
    longer rebound inside the loop.
    """
    schedule_df = get_team_schedule(year, team)
    if isinstance(schedule_df, str):
        return schedule_df

    starting_pitchers = pd.read_csv(Path(f"Starting_Pitchers/Starting_Pitchers_{year}.csv"))

    all_lineups_season = format_lineups_df(
        pd.read_csv(Path(f"Lineups/{team}_lineups_{year}.csv")), year
    )

    opponents_lineups = {}
    for opponent in set(schedule_df["Opp"]):
        opp_lineups = pd.read_csv(Path(f"Lineups/{opponent}_lineups_{year}.csv"))
        opponents_lineups[opponent] = format_lineups_df(opp_lineups, year)

    # The pitcher table may use a different code for some teams.
    team_pitcher_code = acr_team_dict.get(team, team)

    records = []
    for date, adversary in zip(schedule_df["Date"], schedule_df["Opp"]):
        date_key = date.strftime("%Y-%m-%d")
        record = {"Date": date}

        # Line-up of the team (first matching row; column 0 is the date).
        team_lineup = all_lineups_season[all_lineups_season["Date"] == date_key]
        if team_lineup.empty:
            print(f"No game on this date {date} for team")
            continue
        for slot, player in enumerate(team_lineup.iloc[0, 1:], start=1):
            record[f"player_{slot:02}"] = player

        # Line-up of the adversary.
        opp_lineups_df = opponents_lineups.get(adversary)
        if opp_lineups_df is None:
            print(f"No game on this date {date} for opponent")
            continue
        opp_lineup = opp_lineups_df[opp_lineups_df["Date"] == date_key]
        if opp_lineup.empty:
            print(f"No game on this date {date} for opponent")
            continue
        for slot, enemy in enumerate(opp_lineup.iloc[0, 1:], start=10):
            record[f"player_{slot:02}"] = enemy

        # Starting pitchers of both sides.
        pitchers_on_date = starting_pitchers[starting_pitchers["Date"] == date_key]
        pitcher_lookups = (
            ("pitcher_team", team_pitcher_code),
            ("pitcher_opp", acr_team_dict.get(adversary, adversary)),
        )
        for column, team_code in pitcher_lookups:
            names = pitchers_on_date.loc[pitchers_on_date["Team"] == team_code, "PITCHER"]
            if names.empty:
                print(f"No pitcher found for {team_code} on {date}")
            else:
                record[column] = names.values[0]

        records.append(record)

    players_df = pd.DataFrame(records)
    # Downstream code addresses these columns by position, so keep the
    # alphabetical order the previous concat(sort=True) produced.
    players_df = players_df[sorted(players_df.columns)]

    schedule_df = schedule_df.set_index("Date")
    players_df = players_df.set_index("Date")
    teams_df = pd.concat([schedule_df, players_df], axis=1, join="inner")
    return teams_df.reset_index(drop=False)

In [21]:
#players_per_game_for_HOU_2019 = get_players_per_game(year = 2019, team = "HOU")

In [22]:
#players_per_game_for_HOU_2019.iloc[24]

In [23]:
#players_per_game_for_HOU_2019.columns

In [24]:
#players_per_game_for_HOU_2019[players_per_game_for_HOU_2019["pitcher_opp"].isna()]

In [25]:
#players_per_game_for_HOU_2019.rows

In [26]:
def get_col_explanation(col1,col2):
    """Report which un-prefixed column names appear in only the longer label list.

    Every label except ``"index"`` is reduced to the token after its first
    ``"_"`` (``"01_Name"`` -> ``"Name"``). If a list cannot be reduced, because
    a label has no ``"_"`` or is not a string, that list is used unchanged.
    The first ten labels of both lists are printed before and after the
    reduction.

    Returns
    -------
    list of the names present in the longer list but not in the shorter one.
    On equal lengths ``col2`` is treated as the longer one.
    """
    def strip_prefixes(labels):
        try:
            # `!=` compares by value; the previous `is not "index"` compared
            # object identity, which is not reliable for strings.
            return [label.split("_")[1] for label in labels if label != "index"]
        except (AttributeError, IndexError):
            return labels

    print(f"column 1: {col1[:10]}\ncolumn2: {col2[:10]}")
    col1 = strip_prefixes(col1)
    col2 = strip_prefixes(col2)
    print(f"column 1: {col1[:10]}\ncolumn2: {col2[:10]}")

    if len(col1) > len(col2):
        longer, shorter = col1, col2
    else:
        longer, shorter = col2, col1
    return list(set(longer).difference(set(shorter)))

In [27]:
#get_col_explanation(["1_a","222_b","2_c"],["222_b","1_c"])

In [28]:
#type(players_per_game_for_HOU_2018.columns)

In [29]:
def stats_single_game_x_team(players=None, names_df = None, team=None, stats_df=None, counter=1, pitching_stats = False, opponent = False, warnings=False):
    """Collect season stats for a group of players and lay them out side by side.

    For every entry in ``players`` the function resolves a full name, looks up
    the matching rows in ``stats_df``, prefixes the column names and
    concatenates the result column-wise (``axis=1``) onto the returned frame.
    A player that cannot be resolved contributes one row of NaN with the same
    (prefixed) columns as ``stats_df``.

    Parameters
    ----------
    players : sequence of player identifiers; ``players[0]`` is inspected first
        and, when it stringifies to ``"nan"``, the whole group is treated as
        missing. Entries are compared to the second space-separated token of
        ``names_df.Name`` — presumably last names; TODO confirm.
    names_df : frame with ``Team`` and ``Name`` columns used to resolve names.
    team : team code; translated through ``acr_team_dict`` when listed there.
    stats_df : frame with ``Name`` and ``Team`` columns holding the statistics.
    counter : number used for the first column prefix (``f"{counter:02}_"``);
        incremented after every processed player.
    pitching_stats : when true, columns are prefixed ``team_pitcher_`` (or
        ``opponet_pitcher_`` if ``opponent`` is true) instead of the counter.
    opponent : selects the opponent prefix; only read when ``pitching_stats``.
    warnings : when true, print diagnostics about failed or ambiguous lookups.

    Returns
    -------
    DataFrame with all prefixed stats; ``None`` if the NaN placeholder cannot
    be built in the "missing group" branch.
    """
    
    all_players_team_stats = pd.DataFrame()
    
    # From here on `team` may hold the alternate code from acr_team_dict.
    if team in acr_team_dict.keys(): team = acr_team_dict[team]
        
    if str(players[0]) != "nan":
        for player in players:

                if("00:00:00" in str(player)): continue #If it's a date, Skip it.
                # Drops the last three characters whenever "jr." occurs anywhere in the entry.
                # NOTE(review): this runs outside the try below, so a non-string entry raises here.
                if "jr." in player: player = player[:-3]

                try: 
                    
                    # Candidates: names of this team whose second token equals the player, case-insensitively.
                    current_team_players = names_df[names_df.Team==team]
                    player_full_name = [x for x in current_team_players.Name if player.lower() == x.split(" ")[1].lower() ]
                    if len(player_full_name)>1: print(f"More than one player with this laste name \n{player_full_name}")
                    if len(player_full_name)==0: 
                    # If we couldn't find a player in current season, let's try finding him in stats from last year
                    # NOTE(review): the right-hand operand of `&` is a DataFrame, not a per-name
                    # boolean, so this condition looks like it cannot evaluate to a plain bool —
                    # confirm the intent (probably: restrict candidates to this team's rows).
                    # Also, `team` may already be an acr_team_dict code, which team_dict does not contain.
                        player_full_name = [x for x in stats_df.Name if (player.lower() in x.lower()) &
                                           (stats_df[stats_df.Team==get_key_from_dict( team_dict ,team )])]
                        if warnings:
                            if len(player_full_name)==0: print(f"We couldn't find player {player}")
                            else: print(f"We found {player} in last year's stats")
                    #print(player_full_name)
                    # An empty candidate list raises IndexError here and is handled by the except below.
                    player_stats = stats_df[stats_df.Name==player_full_name[0]]
                    if len(player_stats)==0: 
                        if warnings: print(f"couldn't find statistics for {player_full_name[0]}")
                    # If we couldn't find statistics for the player, let's iterate the list of possible players
                    # Running past the end of the candidate list raises IndexError (handled below).
                        i=1
                        while (len(player_stats)==0):
                            player_stats = stats_df[stats_df.Name==player_full_name[i]] 
                            if len(player_stats)==1: 
                                if warnings: print(f"We found stats for {player_full_name[i]} instead.")
                            i+=1
                                
                        # This `else` belongs to the `while`: the loop has no `break`, so it runs
                        # every time the loop finishes, regardless of `warnings`.
                        else: print(f"We found {player} in last year's stats")
                    # Several rows for the same name: keep only the rows of this team.
                    if len(player_stats["Name"])>1 : 
                        if warnings: print("More than one player with same name")
                        print(player_stats)
                        player_stats = player_stats[player_stats.Team == get_key_from_dict( team_dict ,team )]
                   
                                        

                except Exception as e: 
                    if warnings: print(f"{player} from {get_key_from_dict( team_dict ,team )}, {team} not in list Pitcher? :{pitching_stats}. \nError: {e}")

                    # Any lookup failure falls back to a single all-NaN row shaped like stats_df.
                    try:
                        #nanlist = np.empty((1,len(player_stats.columns)))
                        nanlist = np.empty((1,len(stats_df.columns)))
                    except Exception as e:
                        if warnings: print(f"Not able to get columns from last player\n{e}")
                        continue
                    nanlist.fill(np.nan)
                    fake_columns = stats_df.columns
                    player_stats = pd.DataFrame(data = nanlist, columns = fake_columns)
                    #player_stats.drop(columns=["index"], inplace=True)

                # Prefix every column so each player's stats stay distinguishable after the concat.
                new_col = []
                for col in player_stats.columns:
                    if pitching_stats: 
                        if opponent: new_col.append(f"opponet_pitcher_{col}")
                        else: new_col.append(f"team_pitcher_{col}")
                    else: new_col.append(f"{counter:02}_{col}")
                player_stats.columns = new_col
                player_stats.reset_index(inplace=True, drop=True)
                all_players_team_stats = pd.concat([ all_players_team_stats, player_stats ] ,   axis=1)

                counter+=1
                
    else:
        # The first entry is NaN: emit a single all-NaN placeholder for the whole group.
        print(f"Received not a string as a player ({type(players)})\n{players}")
        try:
            nanlist = np.empty((1,len(stats_df.columns)))
        except Exception as e:
            print(f"Not able to get columns from last player\n{e}")
            return 
            
        nanlist.fill(np.nan)
        fake_columns = stats_df.columns
        player_stats = pd.DataFrame(data = nanlist, columns = fake_columns)
        
        new_col = []
        for col in player_stats.columns:
            if pitching_stats: 
                if opponent: new_col.append(f"opponet_pitcher_{col}")
                else: new_col.append(f"team_pitcher_{col}")
            else: new_col.append(f"{counter:02}_{col}")
        player_stats.columns = new_col
        # No drop=True here (unlike the loop branch), so the old index becomes an "index" column.
        player_stats.reset_index(inplace=True)
        all_players_team_stats = pd.concat([ all_players_team_stats, player_stats ] ,   axis=1)

        counter+=1

            
    return all_players_team_stats

In [30]:
"""
c=np.NaN
print(type(c))
if str(c) != "nan":
    print("not float")
    """

'\nc=np.NaN\nprint(type(c))\nif str(c) != "nan":\n    print("not float")\n    '

In [31]:
def get_stats_startingplayer_by_game(players_df=None, team=None, batting_season_data=None,pitching_season_data=None, year=None):
    """Assemble, game by game, the season stats of both starting line-ups.

    For each row of ``players_df`` the stats of the team's batters, the team's
    pitcher, the opponent's batters and the opponent's pitcher are fetched
    with ``stats_single_game_x_team`` and joined into one wide row.

    ``players_df`` is addressed by position: column 0 is the game date,
    column 3 the opponent code, column 7 the opponent's pitcher, column 8 the
    team's pitcher, columns 9-17 the team's batters and columns 18 onwards the
    opponent's batters.

    Parameters
    ----------
    players_df : per-game frame laid out as described above.
    team : team code of the team being processed.
    batting_season_data, pitching_season_data : season stats to draw from.
        When ``None`` or empty they are downloaded for ``year - 1``.
    year : season of the games; also selects the
        ``Data/Batting|Pitching/Clean_Data/clean_*_data_{year}.csv`` files used
        to resolve player names per date.

    Returns
    -------
    DataFrame with one row per game that could be assembled. Games whose
    pieces cannot be concatenated or appended are reported and skipped.

    Changes from the previous version: ``None`` inputs no longer crash on
    ``.empty``, and the removed ``DataFrame.append`` is replaced by ``pd.concat``.
    """
    # `None` is the declared default, so test for it before touching `.empty`.
    if batting_season_data is None or batting_season_data.empty:
        batting_season_data = batting_stats(year-1)
    if pitching_season_data is None or pitching_season_data.empty:
        pitching_season_data = pitching_stats(year-1)
    print("stats loaded")

    all_bat_names = pd.read_csv(Path(f"Data/Batting/Clean_Data/clean_batting_data_{year}.csv"))
    all_pitch_names = pd.read_csv(Path(f"Data/Pitching/Clean_Data/clean_pitching_data_{year}.csv"))
    print("names loaded")

    # rename() builds a new frame, so no column assignment happens on a slice.
    names_teams_current_season = all_bat_names[["Date","Name","Tm"]].rename(columns={"Tm": "Team"})
    pitchers_teams_current_season = all_pitch_names[["Date","Name","Tm"]].rename(columns={"Tm": "Team"})

    stats_players_start_lineup = pd.DataFrame()
    print("starting concatenating df..")
    for row in range(len(players_df)):
        game = players_df.iloc[row]
        game_date = game.iloc[0].strftime("%Y-%m-%d")
        opponent_team = game.iloc[3]

        # Name tables restricted to this game's date, shared by the four lookups.
        batters_on_date = names_teams_current_season[names_teams_current_season["Date"] == game_date]
        pitchers_on_date = pitchers_teams_current_season[pitchers_teams_current_season["Date"] == game_date]

        all_players_team_stats = stats_single_game_x_team(players=game.iloc[9:18],
                                                          names_df=batters_on_date,
                                                          team=team,
                                                          stats_df=batting_season_data)

        pitcher_team = stats_single_game_x_team(players=[game.iloc[8]],
                                                names_df=pitchers_on_date,
                                                team=team,
                                                stats_df=pitching_season_data,
                                                pitching_stats=True)

        all_players_opp_stats = stats_single_game_x_team(players=game.iloc[18:],
                                                         names_df=batters_on_date,
                                                         team=opponent_team,
                                                         stats_df=batting_season_data,
                                                         counter=10)

        pitcher_opp = stats_single_game_x_team(players=[game.iloc[7]],
                                               names_df=pitchers_on_date,
                                               team=opponent_team,
                                               stats_df=pitching_season_data,
                                               pitching_stats=True,
                                               opponent=True)

        try:
            all_players_stats = pd.concat([all_players_team_stats,
                                           pitcher_team,
                                           all_players_opp_stats,
                                           pitcher_opp],
                                          axis=1)
        except Exception as e:
            print(f"could not concatenate stats\n{e}")
            continue

        try:
            # Row-wise concat replaces DataFrame.append (removed in pandas 2.0).
            stats_players_start_lineup = pd.concat([stats_players_start_lineup, all_players_stats],
                                                   ignore_index=True)
        except Exception as e:
            print(f"could not append {e}")
            print(f"explanation: \n{get_col_explanation(stats_players_start_lineup.columns,all_players_stats.columns)}")
            continue

    return stats_players_start_lineup

In [32]:
def get_col_contains(name=None, columns=None, split_criteria = "_"):
    """Return the column labels that contain ``name`` as a substring.

    Parameters
    ----------
    name : substring to look for.
    columns : iterable of column labels; non-string labels are skipped.
    split_criteria : unused; kept so existing calls keep working.

    Returns
    -------
    list of matching labels in their original order. An empty list is
    returned when ``name`` or ``columns`` is ``None``. The previous version
    swallowed the error and then failed on an unbound local in that case.
    """
    if name is None or columns is None:
        return []
    return [label for label in columns if isinstance(label, str) and name in label]
    

In [33]:
"""
test = get_stats_startingplayer_by_game(players_per_game_for_HOU_2019, "HOU", 2019)
test
"""

'\ntest = get_stats_startingplayer_by_game(players_per_game_for_HOU_2019, "HOU", 2019)\ntest\n'

In [34]:
#test.columns[-1]
"""
col = "Name"
cols = get_col_contains(name=col, columns=test.columns)
names_dataf = test[cols]
narows=[index for index, row in names_dataf.iterrows() if names_dataf.iloc[index].isna().values.any()]
print(f"NA Rows (total: {len(narows)}): \n{narows}\ntotal: {len(narows)}")
names_dataf.iloc[narows]
#names_dataf[names_dataf.notnull()]
"""

'\ncol = "Name"\ncols = get_col_contains(name=col, columns=test.columns)\nnames_dataf = test[cols]\nnarows=[index for index, row in names_dataf.iterrows() if names_dataf.iloc[index].isna().values.any()]\nprint(f"NA Rows (total: {len(narows)}): \n{narows}\ntotal: {len(narows)}")\nnames_dataf.iloc[narows]\n#names_dataf[names_dataf.notnull()]\n'

In [35]:
#test["opponet_pitcher_Name"].isnull().sum()

In [36]:
#test

In [37]:
batting_season_2018 = batting_stats(2018)

In [38]:
pitch_stats_18_19 = pitching_stats(2018,2019)

In [39]:

pi_name = "Martin"
# Case-insensitive substring match over every pitcher name (duplicates are kept).
p_full_name = [x for x in pitch_stats_18_19.Name if pi_name.lower() in x.lower()]
print(p_full_name)
pitchs_st = [pitch_stats_18_19[pitch_stats_18_19["Name"]==x] for x in p_full_name]
# One concat replaces the DataFrame.append loop (append was removed in pandas 2.0).
# concat() rejects an empty list, so fall back to an empty frame.
Possible_pitchers = pd.concat(pitchs_st) if pitchs_st else pd.DataFrame()
Possible_pitchers


['Carlos Martinez', 'Martin Perez', 'Carlos Martinez', 'Chris Martin', 'Brett Martin', 'Chris Martin', 'Russell Martin', 'Nick Martini', 'Martin Perez', 'Sean Gilmartin', 'Sean Gilmartin', 'Corbin Martin']


Unnamed: 0,Season,Name,Team,Age,W,L,ERA,WAR,G,GS,...,wSL/C (pi),wXX/C (pi),O-Swing% (pi),Z-Swing% (pi),Swing% (pi),O-Contact% (pi),Z-Contact% (pi),Contact% (pi),Zone% (pi),Pace (pi)
306,2018.0,Carlos Martinez,Cardinals,26.0,8.0,6.0,3.11,2.1,33.0,18.0,...,0.17,,0.296,0.637,0.467,0.603,0.853,0.774,0.503,22.6
322,2019.0,Carlos Martinez,Cardinals,27.0,4.0,2.0,3.17,1.2,48.0,0.0,...,3.64,,0.345,0.653,0.496,0.496,0.869,0.737,0.49,24.9
977,2019.0,Martin Perez,Twins,28.0,10.0,7.0,5.12,1.9,32.0,29.0,...,,,0.337,0.672,0.485,0.681,0.868,0.796,0.443,25.3
1224,2018.0,Martin Perez,Rangers,27.0,2.0,7.0,6.22,-0.1,22.0,15.0,...,-6.61,-4.37,0.31,0.639,0.457,0.733,0.913,0.846,0.448,23.8
306,2018.0,Carlos Martinez,Cardinals,26.0,8.0,6.0,3.11,2.1,33.0,18.0,...,0.17,,0.296,0.637,0.467,0.603,0.853,0.774,0.503,22.6
322,2019.0,Carlos Martinez,Cardinals,27.0,4.0,2.0,3.17,1.2,48.0,0.0,...,3.64,,0.345,0.653,0.496,0.496,0.869,0.737,0.49,24.9
390,2019.0,Chris Martin,- - -,33.0,1.0,3.0,3.4,1.0,58.0,0.0,...,-2.44,,0.367,0.684,0.542,0.629,0.833,0.771,0.553,28.4
801,2018.0,Chris Martin,Rangers,32.0,1.0,5.0,4.54,0.6,46.0,0.0,...,-0.49,,0.309,0.667,0.494,0.629,0.876,0.801,0.516,28.6
881,2019.0,Brett Martin,Rangers,24.0,2.0,3.0,4.76,0.9,51.0,2.0,...,1.39,,0.347,0.688,0.511,0.511,0.87,0.743,0.48,25.5
390,2019.0,Chris Martin,- - -,33.0,1.0,3.0,3.4,1.0,58.0,0.0,...,-2.44,,0.367,0.684,0.542,0.629,0.833,0.771,0.553,28.4


In [40]:
"""
p_name = "Chirinos"
p_full_name = [x for x in batting_season_2018.Name if p_name in x]
p_full_name
players_st = [batting_season_2018[batting_season_2018["Name"]==x] for x in p_full_name]
"""

'\np_name = "Chirinos"\np_full_name = [x for x in batting_season_2018.Name if p_name in x]\np_full_name\nplayers_st = [batting_season_2018[batting_season_2018["Name"]==x] for x in p_full_name]\n'

In [41]:
"""
Possible_player = pd.DataFrame()
for p in players_st:
    Possible_player = Possible_player.append(p)
Possible_player
"""

'\nPossible_player = pd.DataFrame()\nfor p in players_st:\n    Possible_player = Possible_player.append(p)\nPossible_player\n'

In [42]:
def create_trining_df(year = (dt.datetime.today().year-1), team = "HOU" , batting_season_data=None,pitching_season_data=None):
    """Build the per-game training frame for one team and one season.

    Parameters
    ----------
    year : int
        Season to build. The default (last calendar year) is evaluated once,
        when this ``def`` executes, not on every call.
        NOTE(review): ``dt`` is not imported in the visible part of the
        notebook - presumably ``import datetime as dt``; confirm.
    team : str
        Team code forwarded to ``get_players_per_game`` and
        ``get_stats_startingplayer_by_game``.
    batting_season_data, pitching_season_data
        Season-level stat tables forwarded unchanged to
        ``get_stats_startingplayer_by_game``.

    Returns
    -------
    pandas.DataFrame or str
        The players frame (first seven columns, reordered) joined column-wise
        with the starting-lineup stats on their shared index. If
        ``get_players_per_game`` returns a string, that string is returned
        as-is so the caller can report it and skip the team.
    """
    players_df = get_players_per_game(year, team)
    # `type(x) == "str"` compares a type object with a string and is always
    # False, so the early return never fired; isinstance is the working check.
    if isinstance(players_df, str):
        return players_df

    stats_players_start_lineup = get_stats_startingplayer_by_game(
        players_df=players_df,
        batting_season_data=batting_season_data,
        pitching_season_data=pitching_season_data,
        team=team,
        year=year,
    )

    # Move positional column 4 to the front and keep columns 0-6 only.
    # NOTE(review): which column sits at position 4 depends on
    # get_players_per_game - presumably the date; confirm.
    leading_cols = players_df.iloc[:, [4, 0, 1, 2, 3, 5, 6]]
    return pd.concat([leading_cols, stats_players_start_lineup], axis=1, join='inner')
        

In [43]:
"""
stats_players_start_lineup = create_trining_df(year=2019)
stats_players_start_lineup
"""

'\nstats_players_start_lineup = create_trining_df(year=2019)\nstats_players_start_lineup\n'

In [78]:
def check_values_in_cols(val=np.nan,col_contains=None, df=None):
    """Return the rows of ``df`` that hold ``val`` in any of the selected columns.

    Parameters
    ----------
    val : scalar
        Value to look for. A missing value (``np.nan``, ``None``, ...) selects
        rows that contain at least one NaN in the selected columns.
    col_contains : str
        Passed as ``name`` to ``get_col_contains`` to choose the columns.
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        The selected columns only, restricted to the matching rows.
    """
    cols = get_col_contains(name=col_contains, columns=df.columns)
    names_dataf = df[cols]

    # NaN never compares equal to itself, so `val == np.nan` was always False
    # and the default call could never match anything; test for missingness
    # explicitly instead.
    if pd.api.types.is_scalar(val) and pd.isna(val):
        row_mask = names_dataf.isna().any(axis=1)
    else:
        row_mask = (names_dataf == val).any(axis=1)

    # Index with the raw boolean array so duplicated index labels (e.g. several
    # games on the same date) cannot cause alignment surprises.
    return names_dataf[row_mask.values]

In [77]:
"""
col = "Name"
check_nas_in_cols(col_contains=col, df=stats_players_start_lineup)
"""

'\ncol = "Name"\ncheck_nas_in_cols(col_contains=col, df=stats_players_start_lineup)\n'

In [87]:
def clean_baseball_stats_df(data_frame=None):
    """Return a cleaned copy of a training frame; the input is not modified.

    Steps, in order:

    1. Use the ``Date`` column as the index when that column exists.
    2. Replace NaN with 0.
    3. Collapse the ``"W-wo"`` / ``"L-wo"`` labels into ``"W"`` / ``"L"``
       everywhere, then encode the ``W/L`` column as 1 (``"W"``) / 0 (``"L"``).
    4. Drop:
       * columns whose name contains ``"(pfx)"``;
       * any column ``X`` for which a column containing ``"%"`` and equal to
         ``X`` plus one trailing character also exists (the scalar twin of a
         percentage column);
       * the columns ``get_col_contains`` selects for ``"Name"``, ``"index"``
         and ``"Season"``;
       * ``"Tm"``, ``"Opp"`` and ``"GB"``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
    """
    df = data_frame.copy()

    # set date as index; only a missing "Date" column is tolerated here
    try:
        df = df.set_index("Date", drop=True)
    except KeyError:
        print("Could not set Date as index. Maybe it already is?")

    df = df.replace(np.nan, 0)

    # drop col with (pfx), as discovered earlier in the notebook
    pfx_col = [x for x in df.columns if "(pfx)" in x]

    # drop the scalar column when the same stat also exists as a percentage
    col_not_to_drop = [x for x in df.columns if "%" in x and x[:-1] in df.columns]
    not_perc_col = [x[:-1] for x in col_not_to_drop]

    # Drop columns that have Name, index, Season
    name_cols = get_col_contains(name="Name", columns=df.columns)
    index_cols = get_col_contains(name="index", columns=df.columns)
    Season_cols = get_col_contains(name="Season", columns=df.columns)

    # Drop columns Team, Opp
    team_col = ["Tm"]
    opp_col = ["Opp"]

    cols_to_drop = [*pfx_col, *not_perc_col, *name_cols, *index_cols,
                    *Season_cols, *team_col, *opp_col, "GB"]

    # Normalise walk-off results first so they are encoded too. The previous
    # code ran the encoding first and on `df[["W/L","GB"]]` - a temporary
    # copy - with inplace=True, so the result was discarded
    # (SettingWithCopyWarning) and W/L was never encoded.
    df = df.replace(["W-wo", "L-wo"], ["W", "L"])
    if "W/L" in df.columns:
        df["W/L"] = df["W/L"].replace({"W": 1, "L": 0})

    # errors="ignore": a frame that lacks some of the fixed columns
    # ("Tm", "Opp", "GB") is still cleaned instead of raising KeyError.
    return df.drop(columns=cols_to_drop, errors="ignore")

In [48]:
#clean_stats_players_start_lineup = clean_baseball_stats_df(stats_players_start_lineup)

In [49]:
#clean_stats_players_start_lineup.drop(columns = ["GB"], inplace = True)
#clean_stats_players_start_lineup

In [50]:
#set(clean_stats_players_start_lineup["W/L"].values)


In [51]:
#check_values_in_cols(val=0,col_contains=col, df=clean_stats_players_start_lineup)

In [85]:
def create_global_train_df(year=2019):
    """Build and clean the training frame for every team in ``team_dict``.

    Batting and pitching season tables are fetched once for ``year - 1`` and
    shared by all teams. Each team's frame comes from ``create_trining_df``;
    a team for which that function returns a message string is reported and
    skipped.

    Parameters
    ----------
    year : int
        Season whose games are collected.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The cleaned frame with columns sorted by name, twice.
        NOTE(review): both elements are the same object - possibly the raw
        frame was meant to be the first element; kept as-is so existing
        callers that unpack two values keep working.
    """
    batting_season_data = batting_stats(year-1)
    pitching_season_data = pitching_stats(year-1)

    team_frames = []
    for val in team_dict.values():
        print(val)
        training_df_team = create_trining_df(year=year, team=val,
                                             batting_season_data=batting_season_data,
                                             pitching_season_data=pitching_season_data)
        # `type(x) == "str"` is always False; without a working check a
        # message string would be handed to pd.concat below.
        if isinstance(training_df_team, str):
            print(training_df_team)
            continue
        team_frames.append(training_df_team)

    # Concatenate once instead of growing a frame inside the loop.
    if team_frames:
        global_training_df = pd.concat(team_frames, axis=0, sort=True)
    else:
        global_training_df = pd.DataFrame()

    clean_global_df = clean_baseball_stats_df(global_training_df).sort_index(axis=1)
    return clean_global_df, clean_global_df

In [69]:
# create_global_train_df returns a 2-tuple (the cleaned frame, twice); keep the
# frame itself so the cells below can index it by column name.
global_training_df = create_global_train_df()[0]

LAA
stats loaded
names loaded
starting concatenating df..
OAK
stats loaded
names loaded
starting concatenating df..
More than one player with this laste name 
['Harold Castro', 'Willi Castro']
More than one player with this laste name 
['Harold Castro', 'Willi Castro']
More than one player with this laste name 
['Harold Castro', 'Willi Castro']
More than one player with this laste name 
['Harold Castro', 'Willi Castro']
HOU
No pitcher found for HOU on 2019-05-10 00:00:00
stats loaded
names loaded
starting concatenating df..
Received not a string as a player (<class 'list'>)
[nan]
ATL
No pitcher found for WSN on 2019-05-28 00:00:00
No pitcher found for WSN on 2019-05-29 00:00:00
No pitcher found for WSN on 2019-06-21 00:00:00
No pitcher found for WSN on 2019-06-22 00:00:00
No pitcher found for WSN on 2019-06-23 00:00:00
No pitcher found for WSN on 2019-07-18 00:00:00
No pitcher found for WSN on 2019-07-19 00:00:00
No pitcher found for WSN on 2019-07-20 00:00:00
No pitcher found for WSN 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


In [71]:
# Show the Home_Away column; the saved output below lists 0/1 values indexed by Date.
global_training_df["Home_Away"]

Date
2019-03-20    0
2019-03-20    1
2019-03-21    0
2019-03-21    1
2019-03-28    0
2019-03-28    1
2019-03-28    1
2019-03-28    0
2019-03-28    0
2019-03-28    1
2019-03-28    1
2019-03-28    0
2019-03-28    1
2019-03-28    1
2019-03-28    0
2019-03-28    1
2019-03-28    1
2019-03-28    1
2019-03-28    0
2019-03-28    0
2019-03-28    0
2019-03-28    1
2019-03-28    1
2019-03-28    1
2019-03-28    0
2019-03-28    0
2019-03-28    0
2019-03-28    0
2019-03-28    1
2019-03-28    0
             ..
2019-09-28    1
2019-09-28    1
2019-09-29    1
2019-09-29    1
2019-09-29    0
2019-09-29    0
2019-09-29    1
2019-09-29    1
2019-09-29    1
2019-09-29    0
2019-09-29    1
2019-09-29    1
2019-09-29    0
2019-09-29    1
2019-09-29    1
2019-09-29    0
2019-09-29    0
2019-09-29    0
2019-09-29    1
2019-09-29    1
2019-09-29    0
2019-09-29    0
2019-09-29    0
2019-09-29    0
2019-09-29    0
2019-09-29    1
2019-09-29    0
2019-09-29    1
2019-09-29    1
2019-09-29    1
Name: Home_Away, Le

In [81]:
# Look for literal 0.0 entries in the columns get_col_contains selects for "Name".
# clean_baseball_stats_df drops those columns, so on a cleaned frame there is
# nothing left to inspect (the next cell's saved output is an empty list).
check_values_in_cols(val=0.0,col_contains="Name", df=global_training_df)

In [84]:
# List the columns get_col_contains selects for "Name"; the saved output is [],
# i.e. none remain in global_training_df.
name_col =get_col_contains(name="Name", columns=global_training_df.columns)
name_col

[]