In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [23]:
import os

# Both CSV outputs live in a "data" folder under the current working directory.
data_dir = os.path.join(os.getcwd(), "data")
df_before_cleanup_path = os.path.join(data_dir, "df_before_cleanup.csv")
df_w_features_path = os.path.join(data_dir, "df_w_features.csv")

# Create the folder first: opening a file for writing raises FileNotFoundError
# when its parent directory is missing (e.g. on a fresh checkout).
os.makedirs(data_dir, exist_ok=True)

# Touch an empty placeholder for each output that does not exist yet.
# Existing files are left untouched; the to_csv calls later overwrite them.
for csv_path in (df_before_cleanup_path, df_w_features_path):
    if not os.path.exists(csv_path):
        with open(csv_path, 'w'):
            pass

In [24]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
import features_in_code_folder as features
import dataframe_utilities_in_code_folder as util

# Load Data

In [26]:
# Fetch the four source tables through the project's features module.
# The getters run in the order listed; what each table contains is defined
# in that module, not here.
games, batting, pitching, pitchers = (
    features.get_games(),
    features.get_batting(),
    features.get_pitching(),
    features.get_pitchers(),
)

# Features

In [27]:
# Base frame that every later cell extends with feature columns.
# Later cells read its 'date' and 'game_id' columns.
df = features.get_game_df()

## Add Small Features

In [28]:
# Apply the two feature helpers back to back: the inner call
# (add_trueskill_ratings) runs first and its result is handed to
# add_rest_durations.
df = features.add_rest_durations(
    features.add_trueskill_ratings(df)
)
# Show the frame's dimensions after both helpers have run.
df.shape

(3960, 17)

In [29]:
# Calendar features derived from the 'date' column.
date = pd.to_datetime(df['date'])
df['season'] = date.dt.year
df['month'] = date.dt.month
# Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0;
# dt.isocalendar().week returns the same ISO-8601 week number. It comes back
# as UInt32, so cast to int64 to keep the dtype the old accessor produced.
df['week_num'] = date.dt.isocalendar().week.astype('int64')
# weekday: Monday=0 ... Sunday=6.
df['dow'] = date.dt.weekday.astype('int')

In [30]:
# Numeric value of the last character of game_id; anything that is not a
# number becomes NaN because of errors='coerce'.
# NOTE(review): presumably the doubleheader game number - confirm against the
# game_id format.
df['dh_game_no'] = pd.to_numeric(df['game_id'].str[-1:], errors='coerce')

# Replace 'date' with whole seconds since the Unix epoch.
# The dtype guard makes this cell safe to re-run: once the column holds
# integers, pd.to_datetime would read them as nanoseconds and the conversion
# would silently collapse every value to about 0-1 seconds.
if not pd.api.types.is_numeric_dtype(df['date']):
    df['date'] = (pd.to_datetime(df['date']) - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s') #epoch time

## Add Stats

### Rolling Stats

In [31]:
# Batting columns handed to features.add_10RA_rolling.
b_stats = [
    'batting_avg',
    'leverage_index_avg',
    'onbase_perc',
    'onbase_plus_slugging',
]
# NOTE(review): the meaning of the positional True flag and of the 'batting'
# label is defined in features.add_10RA_rolling - confirm there.
df = features.add_10RA_rolling(batting, df, b_stats, True, 'batting')

In [32]:
# Add three ratio columns to the pitching table in place: SO, H and BB, each
# divided by batters_faced (new columns are named '<stat>_batters_faced').
for stat in ('SO', 'H', 'BB'):
    pitching[stat + '_batters_faced'] = pitching[stat] / pitching['batters_faced']

# Pitching columns handed to features.add_10RA_rolling.
b_stats = [
    'earned_run_avg',
    'SO_batters_faced',
    'H_batters_faced',
    'BB_batters_faced',
]
df = features.add_10RA_rolling(pitching, df, b_stats, True, 'team_pitching')

In [33]:
# Add three ratio columns to the pitchers table in place: SO, H and BB, each
# divided by batters_faced (new columns are named '<stat>_batters_faced').
for stat in ('SO', 'H', 'BB'):
    pitchers[stat + '_batters_faced'] = pitchers[stat] / pitchers['batters_faced']

# Same four columns as the team-pitching pass. This call passes False and the
# 'pitcher' label instead of True and 'team_pitching'.
b_stats = [
    'earned_run_avg',
    'SO_batters_faced',
    'H_batters_faced',
    'BB_batters_faced',
]
df = features.add_10RA_rolling(pitchers, df, b_stats, False, 'pitcher')

### Game Stats

In [34]:
# Pass the games table and the current frame through features.game_stats and
# keep the result as df.
df = features.game_stats(games,df)
# Show dimensions; the recorded output below reports 81 columns at this point.
df.shape

(3960, 81)

### Season Stats

In [35]:
# Batting columns handed to features.add_season_rolling (20 names).
batting_stats = [
    'A', 'AB', 'BB', 'H', 'PA', 'PO', 'R', 'RBI', 'SO',
    'batting_avg', 'leverage_index_avg',
    'onbase_perc', 'onbase_plus_slugging',
    'pitches', 're24_bat', 'slugging_perc', 'strikes_total',
    'wpa_bat', 'wpa_bat_neg', 'wpa_bat_pos',
]
df = features.add_season_rolling(batting, df, batting_stats, True, 'batting')
# Show dimensions after the season-level batting columns are added.
df.shape

(3960, 221)

In [36]:
# Pitching columns handed to features.add_season_rolling (27 names).
# This list is reused by the pitcher-level cell that follows.
pitching_stats = [
    'BB', 'ER', 'H', 'HR', 'IP', 'R', 'SO',
    'batters_faced', 'earned_run_avg', 'game_score',
    'inherited_runners', 'inherited_score',
    'inplay_fb_total', 'inplay_gb_total', 'inplay_ld', 'inplay_unk',
    'leverage_index_avg', 'pitches', 're24_def',
    'strikes_contact', 'strikes_looking', 'strikes_swinging', 'strikes_total',
    'wpa_def',
    # ratio columns created in the rolling-stats cells
    'SO_batters_faced', 'H_batters_faced', 'BB_batters_faced',
]
df = features.add_season_rolling(pitching, df, pitching_stats, True, 'team_pitching')
# Show dimensions after the season-level team-pitching columns are added.
df.shape

(3960, 410)

In [37]:
# Season-rolling pass on the pitchers table. Reuses the pitching_stats list
# defined in the team-pitching cell, but passes False and the 'pitcher' label.
df = features.add_season_rolling(pitchers, df, pitching_stats, False,'pitcher')
# Show dimensions after the pitcher-level columns are added.
df.shape

(3960, 599)

In [38]:
# Snapshot the feature frame before the cleanup step below runs.
# index=False keeps the row index out of the CSV.
df.to_csv(df_before_cleanup_path, index=False)

# Cleanup

In [39]:
# Fix NA values via the project's util.fix_na helper.
# NOTE(review): the meaning of the positional False flag is defined in
# util.fix_na - confirm there.
df = util.fix_na(df, False)

In [40]:
# Check the dimensions after NA handling; the recorded output matches the
# pre-cleanup shape of (3960, 599).
df.shape

(3960, 599)

In [41]:
# Write the cleaned feature frame to its CSV; index=False keeps the row index
# out of the file.
df.to_csv(df_w_features_path, index=False)