In [80]:
import pandas as pd 
from datetime import timedelta

### Read in odds, batting stats, and pitching stats.  Change your relative path to match where the files are located on your local repo.

In [81]:
odds_df = pd.read_csv('./Betting_Odds/mlb_odds_2019.csv', index_col = 'Date', infer_datetime_format = True, parse_dates = True)
batting_data_df = pd.read_csv('./Data/Hitting_Data/clean_batting_data_2019.csv', index_col = 'Date', infer_datetime_format = True, parse_dates = True)
pitching_data_df = pd.read_csv('./Data/Pitching_Data/clean_pitching_data_2019.csv', index_col = 'Date', infer_datetime_format = True, parse_dates = True)

In [82]:
odds_df.head()

Unnamed: 0_level_0,VH,Team,Pitcher,Open,Close,Final
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-03-20,N,SEA,MGONZALES-L,107,120,9
2019-03-20,N,OAK,MFIERS-R,-127,-130,7
2019-03-21,N,SEA,YKIKUCHI-L,109,-125,5
2019-03-21,N,OAK,MESTRADA-R,-129,115,4
2019-03-28,V,NYM,JDEGROM-R,110,115,2


In [83]:
batting_data_df.head()


Unnamed: 0_level_0,Name,Tm,VH,Opp,G,PA,AB,R,H,2B,...,RBI,BB,IBB,SO,HBP,SH,SF,GDP,SB,CS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-03-28,Jose Abreu,CWS,1,Kansas City,1,4,4,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2019-03-28,Ronald Acuna Jr.,ATL,1,Philadelphia,1,4,2,1,1,0,...,1,2,0,0,0,0,0,0,1,0
2019-03-28,Willy Adames,TAM,0,Houston,1,4,4,0,0,0,...,0,0,0,3,0,0,0,0,0,0
2019-03-28,Matt Adams,WAS,0,New York,1,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2019-03-28,Jesus Aguilar,MIL,0,St. Louis,1,4,4,0,0,0,...,0,0,0,2,0,0,0,0,0,0


In [84]:
pitching_data_df.head()

Unnamed: 0_level_0,Name,Tm,VH,Opp,G,GS,IP,H,R,ER,...,Str,StL,StS,GB/FB,LD,PU,WHIP,BAbip,SO9,SO/W
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-03-28,Victor Alcantara,DET,1,Toronto,1,0,1.0,1,0,0,...,0.64,0.0,0.09,0.33,0.33,0.33,1.0,0.333,9.0,
2019-03-28,Nick Anderson,MIA,0,Colorado,1,0,0.1,0,0,0,...,1.0,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,
2019-03-28,Luke Bard,LAA,1,Oakland,1,0,1.0,1,0,0,...,0.8,0.1,0.2,0.0,0.33,0.33,1.0,0.333,9.0,
2019-03-28,Kyle Barraclough,WAS,0,New York,1,0,1.0,1,0,0,...,0.64,0.14,0.07,0.33,0.67,0.0,1.0,0.333,0.0,
2019-03-28,Cam Bedrosian,LAA,1,Oakland,1,0,1.0,0,0,0,...,0.86,0.14,0.14,0.33,0.0,0.0,0.0,0.0,0.0,


## Helper function that you don't have to call.  It will be used inside of the df_for_feature_selection() function

In [85]:
def make_schedule_with_odds(odds_df):
    """ Takes a dataframe of gambling odds that have each team in a game on a seperate row.  Will return a dataframe
        with both teams in a game on the same row with their Open Moneyline Odds, Close Moneyline Odds,
        who won and who lost, and Starting Pitchers. """
    new_df = pd.DataFrame()
    t = odds_df.iterrows()
    date = []
    home = []
    visitor = []
    home_pitcher = []
    visitor_pitcher = []
    home_open_odds = []
    visitor_open_odds = []
    home_close_odds = []
    visitor_close_odds = []
    home_win_loss = []
    visitor_win_loss = []
    for (i, row1), (j, row2) in zip(t, t):
        date.append(i)
        home.append(row2['Team'])
        visitor.append(row1['Team'])
        home_pitcher.append(row2['Pitcher'])
        visitor_pitcher.append(row1['Pitcher'])
        home_open_odds.append(row2['Open'])
        visitor_open_odds.append(row1['Open'])
        home_close_odds.append(row2['Close'])
        visitor_close_odds.append(row1['Close'])
        if row2['Final'] > row1['Final']:
            home_win_loss.append(1)
            visitor_win_loss.append(0)
        else:
            home_win_loss.append(0)
            visitor_win_loss.append(1)
    schedule_odds_df = pd.DataFrame(list(zip(home, visitor, home_pitcher, visitor_pitcher, home_open_odds, visitor_open_odds, home_close_odds, 
              visitor_close_odds, home_win_loss, visitor_win_loss)), columns=['home','visitor', 'home_pitcher', 'visitor_pitcher', 
              'home_open_odds', 'visitor_open_odds', 'home_close_odds', 'visitor_close_odds', 'home_win_loss', 'visitor_win_loss'], 
              index = date)
    return schedule_odds_df
        
         
        
        
            
            
    
    
    
    

## Helper function that you don't have to call.  It will be used inside of the df_for_feature_selection() function

In [86]:
def stats_for_game_day(schedule_odds_df, batting_df, pitching_df, look_back):
    """ This function takes an odds_df that has been scrubbed through_schedule_with_odds, a batting_df that has been cleaned from PyBaseball,
        a pitching_df that has been cleaned from PyBaseball, and accepts an integer for a lookback period.  The lookback period is the time 
        previous to the game being played that you want to calculate teams stats for.  This function will return a tuple with the cumulative
        hitting stats for each team during the lookback period as the first value of the tuple.  The second value of the tuple will be the 
        cumulative pitching stats for each team for the lookback period."""
    
    hitting_day_list = []
    pitching_day_list = []
    for index, row in schedule_odds_df.iterrows():
        hitting_day = batting_df.loc[index - timedelta(look_back): index - timedelta(1)].groupby('Tm').sum()
        pitching_day = pitching_df.loc[index - timedelta(look_back): index - timedelta(1)].groupby('Tm').sum()
        hitting_day_list.append(hitting_day)
        pitching_day_list.append(pitching_day)
    return hitting_day_list, pitching_day_list
    
    
            

### Main function.  Call this to build dataframe

In [90]:
def df_for_feature_selection(odds_df, batting_df, pitching_df, look_back):
    """ This is the main function for this library.  Every other function is a helper function.This function takes an odds_df that has been scrubbed 
        through_schedule_with_odds, a batting_df that has been cleaned from PyBaseball, a pitching_df that has been cleaned from PyBaseball, and accepts an 
        integer for a lookback period that all stats are calculated for.  This function will return a dataframe with all the odds info, cumulative home team hitting stats, 
        cumulative home team pitching stats, cumulative visitor team hitting stats, cumulative visitor team pitching stats, and who won the game.  Each row of the 
        dataframe represents one game with the cumulative stats, odds, and winner between the 2 teams."""
    
    # Use helper function to get schedule with odds
    odds_df_with_lookback = odds_df[odds_df.index[0] + timedelta(look_back): batting_data_df.index[-1]]
    schedule_odds_df = make_schedule_with_odds(odds_df_with_lookback)
    
    # Use helper function to get all stats for lookback period by team
    hitting_day, pitching_day = stats_for_game_day(schedule_odds_df, batting_df, pitching_df, look_back)

    # Create one dataframe that houses all the odds, stats, winners and losers for each game played 
    total_df = pd.DataFrame()
    for i in range(len(schedule_odds_df)):
        hitting_day_df = pd.DataFrame(hitting_day[i])
        pitching_day_df = pd.DataFrame(pitching_day[i])
        hitting_games_home = pd.DataFrame(hitting_day_df.loc[schedule_odds_df['home'][i]]).T.reset_index().drop(columns = ['index', 'VH'])
        hitting_games_home['Date'] = schedule_odds_df.index[i]
        hitting_games_home = hitting_games_home.set_index('Date')
        hitting_games_home = hitting_games_home.add_prefix('Home_Hitting')
        hitting_games_visitor = pd.DataFrame(hitting_day_df.loc[schedule_odds_df['visitor'][i]]).T.reset_index().drop(columns = ['index', 'VH'])
        hitting_games_visitor['Date'] = schedule_odds_df.index[i]
        hitting_games_visitor = hitting_games_visitor.set_index('Date')
        hitting_games_visitor = hitting_games_visitor.add_prefix('Visitor_Hitting')
        pitching_games_home = pd.DataFrame(pitching_day_df.loc[schedule_odds_df['home'][i]]).T.reset_index().drop(columns = ['index', 'VH'])
        pitching_games_home['Date'] = schedule_odds_df.index[i]
        pitching_games_home = pitching_games_home.set_index('Date')
        pitching_games_home = pitching_games_home.add_prefix('Home_Pitching')
        pitching_games_visitor = pd.DataFrame(pitching_day_df.loc[schedule_odds_df['visitor'][i]]).T.reset_index().drop(columns = ['index', 'VH'])
        pitching_games_visitor['Date'] = schedule_odds_df.index[i]
        pitching_games_visitor = pitching_games_visitor.set_index('Date')
        pitching_games_visitor = pitching_games_visitor.add_prefix('Visitor_Pitching')
        total_line = pd.concat([hitting_games_home,hitting_games_visitor, pitching_games_home, pitching_games_visitor], axis = 1)
        total_df = total_df.append(total_line)
    stats_odds_df = pd.concat([schedule_odds_df,total_df], axis = 1 )
    return stats_odds_df
    
        
        
        

### Input your odds_df, batting_df, pitching_df, and look_back period

In [91]:
combined_df = df_for_feature_selection(odds_df, batting_data_df, pitching_data_df, look_back = 10)


In [92]:
combined_df.head()

Unnamed: 0,home,visitor,home_pitcher,visitor_pitcher,home_open_odds,visitor_open_odds,home_close_odds,visitor_close_odds,home_win_loss,visitor_win_loss,...,Visitor_PitchingStr,Visitor_PitchingStL,Visitor_PitchingStS,Visitor_PitchingGB/FB,Visitor_PitchingLD,Visitor_PitchingPU,Visitor_PitchingWHIP,Visitor_PitchingBAbip,Visitor_PitchingSO9,Visitor_PitchingSO/W
2019-03-30,WAS,NYM,SSTRASBURG-R,NSYNDERGAARD-R,-130,110,-112,102,0,1,...,2.59,0.62,0.8,0.66,1.75,0.08,1.0,0.417,51.0,10.0
2019-03-30,PHI,ATL,NPIVETTA-R,BWILSON-R,-145,125,-145,135,1,0,...,3.03,0.53,0.78,2.29,0.0,0.0,9.7,0.523,53.1,4.0
2019-03-30,MIA,COL,PLOPEZ-R,TANDERSON-L,125,-145,118,-128,1,0,...,4.78,1.36,0.77,3.94,0.53,0.05,3.262,0.156,50.7,7.33
2019-03-30,MIL,STL,BWOODRUFF-R,DHUDSON-R,-125,105,-132,122,1,0,...,5.74,1.49,1.04,4.59,2.11,0.13,7.646,1.238,60.5,9.0
2019-03-30,SDG,SFO,NMARGEVICIUS-L,DRODRIGUEZ-R,-125,105,-130,120,0,1,...,4.94,1.21,0.84,3.35,1.86,0.07,7.357,2.052,51.7,12.5


### The dataframe has alot of unneeded columns that are ratios that have been summed over the look back period.  They have no value in our feature selection. Select any of the counting stats 
### and construct any ratios like slugging percentage or on base you want to try.  The target labels are the home_win_loss or visitor_win_loss, whichever one you prefer to use. 1 is for a win, 0 for a loss.

In [93]:
combined_df.columns.values

array(['home', 'visitor', 'home_pitcher', 'visitor_pitcher',
       'home_open_odds', 'visitor_open_odds', 'home_close_odds',
       'visitor_close_odds', 'home_win_loss', 'visitor_win_loss',
       'Home_HittingG', 'Home_HittingPA', 'Home_HittingAB',
       'Home_HittingR', 'Home_HittingH', 'Home_Hitting2B',
       'Home_Hitting3B', 'Home_HittingHR', 'Home_HittingRBI',
       'Home_HittingBB', 'Home_HittingIBB', 'Home_HittingSO',
       'Home_HittingHBP', 'Home_HittingSH', 'Home_HittingSF',
       'Home_HittingGDP', 'Home_HittingSB', 'Home_HittingCS',
       'Visitor_HittingG', 'Visitor_HittingPA', 'Visitor_HittingAB',
       'Visitor_HittingR', 'Visitor_HittingH', 'Visitor_Hitting2B',
       'Visitor_Hitting3B', 'Visitor_HittingHR', 'Visitor_HittingRBI',
       'Visitor_HittingBB', 'Visitor_HittingIBB', 'Visitor_HittingSO',
       'Visitor_HittingHBP', 'Visitor_HittingSH', 'Visitor_HittingSF',
       'Visitor_HittingGDP', 'Visitor_HittingSB', 'Visitor_HittingCS',
       'Home_Pitchin

In [94]:
new_df = combined_df[['home', 'visitor', 'home_open_odds', 'visitor_open_odds', 'home_close_odds',
       'visitor_close_odds', 'home_win_loss', 'visitor_win_loss', 'Home_HittingPA', 'Home_HittingAB',
       'Home_HittingR', 'Home_HittingH', 'Home_Hitting2B',
       'Home_Hitting3B', 'Home_HittingHR', 'Home_HittingBB', 'Home_HittingIBB', 'Home_HittingSO',
       'Home_HittingHBP', 'Home_HittingSH', 'Home_HittingSF', 'Visitor_HittingPA', 'Visitor_HittingAB',
       'Visitor_HittingR', 'Visitor_HittingH', 'Visitor_Hitting2B',
       'Visitor_Hitting3B', 'Visitor_HittingHR', 'Visitor_HittingBB', 'Visitor_HittingIBB', 'Visitor_HittingSO',
       'Visitor_HittingHBP', 'Visitor_HittingSH', 'Visitor_HittingSF', 'Home_PitchingIP',
       'Home_PitchingH', 'Home_PitchingR', 'Home_PitchingER',
       'Home_PitchingBB', 'Home_PitchingSO', 'Home_PitchingHR',
       'Home_PitchingHBP', 'Home_PitchingAB',
       'Home_Pitching2B', 'Home_Pitching3B', 'Home_PitchingIBB',
       'Home_PitchingGDP', 'Home_PitchingSF', 'Home_PitchingBF', 'Visitor_PitchingIP', 'Visitor_PitchingH',
       'Visitor_PitchingR', 'Visitor_PitchingER', 'Visitor_PitchingBB',
       'Visitor_PitchingSO', 'Visitor_PitchingHR', 'Visitor_PitchingHBP',
       'Visitor_PitchingERA', 'Visitor_PitchingAB', 'Visitor_Pitching2B',
       'Visitor_Pitching3B', 'Visitor_PitchingIBB', 'Visitor_PitchingSF', 'Visitor_PitchingBF'  ]]

In [95]:
new_df.columns.values

array(['home', 'visitor', 'home_open_odds', 'visitor_open_odds',
       'home_close_odds', 'visitor_close_odds', 'home_win_loss',
       'visitor_win_loss', 'Home_HittingPA', 'Home_HittingAB',
       'Home_HittingR', 'Home_HittingH', 'Home_Hitting2B',
       'Home_Hitting3B', 'Home_HittingHR', 'Home_HittingBB',
       'Home_HittingIBB', 'Home_HittingSO', 'Home_HittingHBP',
       'Home_HittingSH', 'Home_HittingSF', 'Visitor_HittingPA',
       'Visitor_HittingAB', 'Visitor_HittingR', 'Visitor_HittingH',
       'Visitor_Hitting2B', 'Visitor_Hitting3B', 'Visitor_HittingHR',
       'Visitor_HittingBB', 'Visitor_HittingIBB', 'Visitor_HittingSO',
       'Visitor_HittingHBP', 'Visitor_HittingSH', 'Visitor_HittingSF',
       'Home_PitchingIP', 'Home_PitchingH', 'Home_PitchingR',
       'Home_PitchingER', 'Home_PitchingBB', 'Home_PitchingSO',
       'Home_PitchingHR', 'Home_PitchingHBP', 'Home_PitchingAB',
       'Home_Pitching2B', 'Home_Pitching3B', 'Home_PitchingIBB',
       'Home_PitchingGDP

In [96]:
def baseball_stats_calculator_hitting(df) : 
    home_and_visitor = ['Home_Hitting', 'Visitor_Hitting']
    
    for i in range(2):
        df[home_and_visitor[i] + 'K%'] = df[home_and_visitor[i] + 'SO'] / df[home_and_visitor[i] + 'PA']
        df[home_and_visitor[i] + 'BB%'] = df[home_and_visitor[i] + 'BB'] / df[home_and_visitor[i] + 'PA']
        df[home_and_visitor[i] + 'OBP_num'] = df[home_and_visitor[i] + 'H'] + df[home_and_visitor[i] + 'BB']+ df[home_and_visitor[i] + 'HBP']                                   
        df[home_and_visitor[i] + 'OBP_den'] = df[home_and_visitor[i] + 'AB'] + df[home_and_visitor[i] + 'BB'] + df[home_and_visitor[i] + 'HBP'] + df[home_and_visitor[i] + 'SF']
        df[home_and_visitor[i] + 'OBP'] = df[home_and_visitor[i] + 'OBP_num'] / df[home_and_visitor[i] + 'OBP_den']
        df[home_and_visitor[i] + '1B'] = df[home_and_visitor[i] + 'H'] - (df[home_and_visitor[i] + '2B'] + df[home_and_visitor[i] + '3B'] + df[home_and_visitor[i] + 'HR'])
        df[home_and_visitor[i] + 'SLG%_num'] = df[home_and_visitor[i] + '1B'] + 2 * df[home_and_visitor[i] + '2B'] + 3 * df[home_and_visitor[i] + '3B'] + 4 * df[home_and_visitor[i] + 'HR']
        df[home_and_visitor[i] + 'SLG%_den'] = df[home_and_visitor[i] + 'AB']
        df[home_and_visitor[i] + 'SLG%'] = df[home_and_visitor[i] + 'SLG%_num'] / df[home_and_visitor[i] + 'SLG%_den']
    return df

In [97]:
def baseball_stats_calculator_pitching(df) : 
    home_and_visitor = ['Home_Pitching', 'Visitor_Pitching']

    for i in range(2):
        df[home_and_visitor[i] + 'K%'] = df[home_and_visitor[i] + 'SO'] / df[home_and_visitor[i] + 'BF']
        df[home_and_visitor[i] + 'BB%'] = df[home_and_visitor[i] + 'BB'] / df[home_and_visitor[i] + 'BF']
        df[home_and_visitor[i] + 'OBP_num'] = df[home_and_visitor[i] + 'H'] + df[home_and_visitor[i] + 'BB']+ df[home_and_visitor[i] + 'HBP']                                   
        df[home_and_visitor[i] + 'OBP_den'] = df[home_and_visitor[i] + 'AB'] + df[home_and_visitor[i] + 'BB'] + df[home_and_visitor[i] + 'HBP'] + df[home_and_visitor[i] + 'SF']
        df[home_and_visitor[i] + 'OBP_allowed'] = df[home_and_visitor[i] + 'OBP_num'] / df[home_and_visitor[i] + 'OBP_den']
        df[home_and_visitor[i] + '1B'] = df[home_and_visitor[i] + 'H'] - (df[home_and_visitor[i] + '2B'] + df[home_and_visitor[i] + '3B'] + df[home_and_visitor[i] + 'HR'])
        df[home_and_visitor[i] + 'SLG%_num'] = df[home_and_visitor[i] + '1B'] + 2 * df[home_and_visitor[i] + '2B'] + 3 * df[home_and_visitor[i] + '3B'] + 4 * df[home_and_visitor[i] + 'HR']
        df[home_and_visitor[i] + 'SLG%_den'] = df[home_and_visitor[i] + 'AB']
        df[home_and_visitor[i] + 'SLG%_allowed'] = df[home_and_visitor[i] + 'SLG%_num'] / df[home_and_visitor[i] + 'SLG%_den']
    return df

In [98]:
p_df = baseball_stats_calculator_pitching(new_df)
q_df = baseball_stats_calculator_hitting(p_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/panda

In [99]:
q_df

Unnamed: 0,home,visitor,home_open_odds,visitor_open_odds,home_close_odds,visitor_close_odds,home_win_loss,visitor_win_loss,Home_HittingPA,Home_HittingAB,...,Home_HittingSLG%,Visitor_HittingK%,Visitor_HittingBB%,Visitor_HittingOBP_num,Visitor_HittingOBP_den,Visitor_HittingOBP,Visitor_Hitting1B,Visitor_HittingSLG%_num,Visitor_HittingSLG%_den,Visitor_HittingSLG%
2019-03-30,WAS,NYM,-130,110,-112,102,0,1,33,31,...,0.193548,0.352941,0.088235,8,34,0.235294,4,8,31,0.258065
2019-03-30,PHI,ATL,-145,125,-145,135,1,0,37,31,...,0.516129,0.250000,0.166667,13,36,0.361111,5,11,30,0.366667
2019-03-30,MIA,COL,125,-145,118,-128,1,0,65,59,...,0.305085,0.148148,0.086420,27,81,0.333333,10,32,74,0.432432
2019-03-30,MIL,STL,-125,105,-132,122,1,0,67,61,...,0.508197,0.253012,0.108434,29,82,0.353659,10,40,72,0.555556
2019-03-30,SDG,SFO,-125,105,-130,120,0,1,62,55,...,0.418182,0.250000,0.058824,16,68,0.235294,7,17,63,0.269841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-09-29,LAA,HOU,317,-400,340,-410,0,1,331,305,...,0.331148,0.174051,0.107595,105,316,0.332278,32,135,275,0.490909
2019-09-29,SEA,OAK,151,-175,130,-140,1,0,333,311,...,0.260450,0.225397,0.117460,111,314,0.353503,45,119,273,0.435897
2019-09-29,CWS,DET,-135,115,-150,140,1,0,318,296,...,0.523649,0.261006,0.031447,84,318,0.264151,51,110,306,0.359477
2019-09-29,KAN,MIN,163,-190,125,-135,1,0,308,270,...,0.433333,0.176301,0.092486,120,346,0.346821,49,159,311,0.511254


In [100]:
q_df.columns.values

array(['home', 'visitor', 'home_open_odds', 'visitor_open_odds',
       'home_close_odds', 'visitor_close_odds', 'home_win_loss',
       'visitor_win_loss', 'Home_HittingPA', 'Home_HittingAB',
       'Home_HittingR', 'Home_HittingH', 'Home_Hitting2B',
       'Home_Hitting3B', 'Home_HittingHR', 'Home_HittingBB',
       'Home_HittingIBB', 'Home_HittingSO', 'Home_HittingHBP',
       'Home_HittingSH', 'Home_HittingSF', 'Visitor_HittingPA',
       'Visitor_HittingAB', 'Visitor_HittingR', 'Visitor_HittingH',
       'Visitor_Hitting2B', 'Visitor_Hitting3B', 'Visitor_HittingHR',
       'Visitor_HittingBB', 'Visitor_HittingIBB', 'Visitor_HittingSO',
       'Visitor_HittingHBP', 'Visitor_HittingSH', 'Visitor_HittingSF',
       'Home_PitchingIP', 'Home_PitchingH', 'Home_PitchingR',
       'Home_PitchingER', 'Home_PitchingBB', 'Home_PitchingSO',
       'Home_PitchingHR', 'Home_PitchingHBP', 'Home_PitchingAB',
       'Home_Pitching2B', 'Home_Pitching3B', 'Home_PitchingIBB',
       'Home_PitchingGDP

In [101]:
training_df = p_df[['home', 'visitor', 'home_open_odds', 'visitor_open_odds','home_close_odds', 'visitor_close_odds', 'home_win_loss',
       'visitor_win_loss', 'Home_PitchingK%', 'Home_PitchingBB%', 'Home_PitchingOBP_allowed','Home_PitchingSLG%_allowed', 
       'Visitor_PitchingK%', 'Visitor_PitchingBB%', 'Visitor_PitchingOBP_allowed','Visitor_PitchingSLG%_allowed', 'Home_HittingK%',
       'Home_HittingBB%', 'Home_HittingOBP','Home_HittingSLG%', 'Visitor_HittingK%','Visitor_HittingBB%', 'Visitor_HittingOBP',
       'Visitor_HittingSLG%']]

In [102]:
training_df.head()

Unnamed: 0,home,visitor,home_open_odds,visitor_open_odds,home_close_odds,visitor_close_odds,home_win_loss,visitor_win_loss,Home_PitchingK%,Home_PitchingBB%,...,Visitor_PitchingOBP_allowed,Visitor_PitchingSLG%_allowed,Home_HittingK%,Home_HittingBB%,Home_HittingOBP,Home_HittingSLG%,Visitor_HittingK%,Visitor_HittingBB%,Visitor_HittingOBP,Visitor_HittingSLG%
2019-03-30,WAS,NYM,-130,110,-112,102,0,1,0.352941,0.088235,...,0.212121,0.193548,0.424242,0.030303,0.212121,0.193548,0.352941,0.088235,0.235294,0.258065
2019-03-30,PHI,ATL,-145,125,-145,135,1,0,0.25,0.166667,...,0.351351,0.516129,0.243243,0.162162,0.351351,0.516129,0.25,0.166667,0.361111,0.366667
2019-03-30,MIA,COL,125,-145,118,-128,1,0,0.148148,0.08642,...,0.184615,0.305085,0.246154,0.061538,0.184615,0.305085,0.148148,0.08642,0.333333,0.432432
2019-03-30,MIL,STL,-125,105,-132,122,1,0,0.253012,0.108434,...,0.298507,0.508197,0.19403,0.059701,0.298507,0.508197,0.253012,0.108434,0.353659,0.555556
2019-03-30,SDG,SFO,-125,105,-130,120,0,1,0.25,0.058824,...,0.322581,0.418182,0.290323,0.080645,0.322581,0.418182,0.25,0.058824,0.235294,0.269841


In [103]:
training_df.columns.values

array(['home', 'visitor', 'home_open_odds', 'visitor_open_odds',
       'home_close_odds', 'visitor_close_odds', 'home_win_loss',
       'visitor_win_loss', 'Home_PitchingK%', 'Home_PitchingBB%',
       'Home_PitchingOBP_allowed', 'Home_PitchingSLG%_allowed',
       'Visitor_PitchingK%', 'Visitor_PitchingBB%',
       'Visitor_PitchingOBP_allowed', 'Visitor_PitchingSLG%_allowed',
       'Home_HittingK%', 'Home_HittingBB%', 'Home_HittingOBP',
       'Home_HittingSLG%', 'Visitor_HittingK%', 'Visitor_HittingBB%',
       'Visitor_HittingOBP', 'Visitor_HittingSLG%'], dtype=object)

In [104]:
training_df.index = training_df.index.rename('Date')

In [105]:
training_df.to_csv('./Training_Files/2019_10_day.csv')

In [23]:
def home_visitor_pitcher_hitter_diff(df):
    df['Home_Pitcher_K%_diff'] = df['Home_PitchingK%'] - df['Visitor_HittingK%']
    df['Home_PitchingBB%_diff'] = df['Home_PitchingBB%'] - df['Visitor_HittingBB%']
    df['Home_PitchingOBP_allowed_diff'] = df['Home_PitchingOBP_allowed'] - df['Visitor_HittingOBP']
    df['Home_PitchingSLG%_allowed_diff'] = df['Home_PitchingSLG%_allowed'] - df['Visitor_HittingSLG%']
    df['Visitor_PitchingK%_diff'] = df['Visitor_PitchingK%'] - df['Home_HittingK%']
    df['Visitor_PitchingBB%_diff'] = df['Visitor_PitchingBB%'] - df['Home_HittingBB%']
    df['Visitor_PitchingOBP_allowed_diff'] = df['Visitor_PitchingOBP_allowed'] - df['Home_HittingOBP']
    df['Visitor_PitchingSLG%_allowed_diff'] = df['Visitor_PitchingSLG%_allowed'] - df['Visitor_HittingSLG%']
    return df

In [24]:
diff_df = home_visitor_pitcher_hitter_diff(training_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [25]:
diff_df.columns.values

array(['home', 'visitor', 'home_open_odds', 'visitor_open_odds',
       'home_close_odds', 'visitor_close_odds', 'home_win_loss',
       'visitor_win_loss', 'Home_PitchingK%', 'Home_PitchingBB%',
       'Home_PitchingOBP_allowed', 'Home_PitchingSLG%_allowed',
       'Visitor_PitchingK%', 'Visitor_PitchingBB%',
       'Visitor_PitchingOBP_allowed', 'Visitor_PitchingSLG%_allowed',
       'Home_HittingK%', 'Home_HittingBB%', 'Home_HittingOBP',
       'Home_HittingSLG%', 'Visitor_HittingK%', 'Visitor_HittingBB%',
       'Visitor_HittingOBP', 'Visitor_HittingSLG%',
       'Home_Pitcher_K%_diff', 'Home_PitchingBB%_diff',
       'Home_PitchingOBP_allowed_diff', 'Home_PitchingSLG%_allowed_diff',
       'Visitor_PitchingK%_diff', 'Visitor_PitchingBB%_diff',
       'Visitor_PitchingOBP_allowed_diff',
       'Visitor_PitchingSLG%_allowed_diff'], dtype=object)

In [26]:
diff_df.index = diff_df.index.rename('Date')

In [27]:
diff_df.to_csv('./Training_files/2016_12day_diff.csv')