In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): the first call already ignores all categories, so the
# FutureWarning-specific filter below adds nothing. Blanket suppression also
# hides pandas deprecation / chained-assignment warnings that are relevant to
# later cells — consider narrowing this.
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import sys
sys.path.append('../code')
import features
import dataframe_utilities as util

# Load Data

In [4]:
# Load the four raw tables through the project's `features` module.
# `games`, `batting`, `pitching` and `pitchers` are reused by the rolling and
# season-stat cells further down.
games = features.get_games()
batting = features.get_batting()
pitching = features.get_pitching()
pitchers = features.get_pitchers()

# Features

In [7]:
# Base modelling frame; every later cell adds feature columns to `df`.
# The recorded preview shows one row per game with its id, teams, date,
# outcome, and home/away pitcher columns.
df = features.get_game_df()


           game_id home_team_abbr away_team_abbr        date  is_night_game  \
0  NYMSTL20240301D            STL            NYM  2024-03-01           True   
1   SDMIL20240301D            MIL             SD  2024-03-01           True   
2  TORNYY20240301N            NYY            TOR  2024-03-01           True   
3  PHIDET20240301D            DET            PHI  2024-03-01           True   
4   TEXSF20240301D             SF            TEX  2024-03-01           True   

   home_team_win home_pitcher away_pitcher  
0           True          NaN          NaN  
1           True          NaN          NaN  
2           True          NaN          NaN  
3           True          NaN          NaN  
4           True          NaN          NaN  


In [8]:
# Replace missing pitcher names with an explicit placeholder label.
for pitcher_col in ('home_pitcher', 'away_pitcher'):
    df[pitcher_col] = df[pitcher_col].fillna('No Pitcher')

## Add Small Features

In [9]:
# Add the rating and rest-duration feature columns via the project helpers
# (see features.add_trueskill_ratings / features.add_rest_durations for the
# exact columns they add), then show the resulting shape as a sanity check.
df = features.add_trueskill_ratings(df)
df = features.add_rest_durations(df)
df.shape

(935, 17)

In [10]:
# Derive calendar features from `date`.
# Assumes df['date'] holds date strings (e.g. '2024-03-01'); parse once and
# reuse the parsed Series for every derived column.
date = pd.to_datetime(df['date'])
df['season'] = date.dt.year
df['month'] = date.dt.month
# ISO-8601 week number, taken from the `week` column of .isocalendar()
df['week_num'] = date.dt.isocalendar().week
# Day of week as a plain int (pandas convention: Monday=0 ... Sunday=6)
df['dow'] = date.dt.weekday.astype(int)

In [11]:
# Last character of game_id parsed as a number; anything non-numeric is
# coerced to NaN (the ids in the preview above end in 'D'/'N', which would
# all become NaN — presumably a digit marks a double-header game; confirm).
df['dh_game_no'] = pd.to_numeric(df['game_id'].str[-1:],errors='coerce')
# Replace `date` with whole seconds since 1970-01-01 (epoch time).
# After this line `date` is an integer column, not a datetime: any later
# pd.to_datetime(df['date']) must pass unit='s' to recover real dates.
df['date'] = (pd.to_datetime(df['date']) - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s') #epoch time

## Add Stats

### Rolling Stats

In [12]:
# Rolling batting features: pass the batting table and the list of stat
# columns to features.add_10RA_rolling, tagged with the 'batting' label.
# NOTE(review): the boolean argument is True for the team-level tables and
# False for the per-pitcher table below — presumably a team/player switch;
# confirm against features.add_10RA_rolling.
b_stats = ['batting_avg','leverage_index_avg', 'onbase_perc', 'onbase_plus_slugging']
df = features.add_10RA_rolling(batting, df, b_stats, True, 'batting')

In [14]:
# Per-batter-faced rates for team pitching: each counting stat divided by the
# number of batters faced. Keys are the new column names, values the source
# count columns; insertion order fixes the column order.
team_rate_sources = {
    'SO_batters_faced': 'so',
    'H_batters_faced': 'h',
    'BB_batters_faced': 'bb',
}
for rate_col, count_col in team_rate_sources.items():
    pitching[rate_col] = pitching[count_col] / pitching['batters_faced']

# Rolling features over ERA plus the three rates just created.
b_stats = ['earned_run_avg', *team_rate_sources]
df = features.add_10RA_rolling(pitching, df, b_stats, True, 'team_pitching')

In [17]:
# ERA may not be numeric in the per-pitcher table; unparseable values are
# coerced to NaN.
pitchers['earned_run_avg'] = pd.to_numeric(pitchers['earned_run_avg'], errors='coerce')

# Per-batter-faced rates for individual pitchers (new column -> count column).
pitcher_rate_sources = {
    'SO_batters_faced': 'so',
    'H_batters_faced': 'h',
    'BB_batters_faced': 'bb',
}
for rate_col, count_col in pitcher_rate_sources.items():
    pitchers[rate_col] = pitchers[count_col] / pitchers['batters_faced']

# Rolling features over ERA plus the three rates just created.
b_stats = ['earned_run_avg', *pitcher_rate_sources]
df = features.add_10RA_rolling(pitchers, df, b_stats, False, 'pitcher')

### Game Stats

In [18]:
# Sanity check: compare the columns of `games` with the current `df`.
# NOTE(review): the recorded output shows `df` carrying pitcher-level columns
# ('name', 'ip', ...) rather than the game-level columns from the preview
# above — confirm what features.add_10RA_rolling returns in the 'pitcher' case.
print(games.columns)
print(df.columns)

Index(['away_team_abbr', 'home_team_abbr', 'date', 'start_time', 'venue',
       'away_team_errors', 'home_team_errors', 'away_team_hits',
       'home_team_hits', 'away_team_runs', 'home_team_runs', 'game_id',
       'is_night_game', 'is_grass', 'spread'],
      dtype='object')
Index(['name', 'ip', 'h', 'r', 'er', 'bb', 'so', 'hr', 'earned_run_avg',
       'batters_faced', 'pitches', 'strikes_total', 'strikes_contact',
       'strikes_swinging', 'strikes_looking', 'inplay_gb_total',
       'inplay_fb_total', 'inplay_ld', 'inplay_unk', 'game_score',
       'inherited_runners', 'inherited_score', 'wpa_def', 'leverage_index_avg',
       're24_def', 'game_id', 'is_home_team', 'is_starting_pitcher', 'team',
       'SO_batters_faced', 'H_batters_faced', 'BB_batters_faced',
       'earned_run_avg_10RA', 'SO_batters_faced_10RA', 'H_batters_faced_10RA',
       'BB_batters_faced_10RA'],
      dtype='object')


In [23]:
# Left-join `home_team_abbr` from `games` onto `df` using the shared
# `game_id` key; a left join keeps every row of `df`.
# NOTE(review): only `home_team_abbr` is selected here — no `date` column is
# brought in, although the next cell reads df['date']. If `df` already has a
# `home_team_abbr` column, pandas will emit `_x`/`_y` suffixed duplicates.
# Confirm which columns are actually needed.
df = df.merge(games[['game_id', 'home_team_abbr']], on='game_id', how='left')

In [25]:
import pandas as pd

# Make sure `date` is a real datetime column, then derive the season (year).
#
# `date` can reach this cell in three forms:
#   * already datetime (any unit / tz)      -> leave untouched
#   * integer epoch seconds (an earlier cell
#     stores `date` as seconds since 1970)  -> must be parsed with unit='s'
#   * date strings such as '2024-03-01'     -> plain pd.to_datetime
#
# Without unit='s', pandas interprets integers as *nanoseconds* since the
# epoch, which silently maps every game to 1970 and makes `season` == 1970.
if not pd.api.types.is_datetime64_any_dtype(df['date']):
    if pd.api.types.is_numeric_dtype(df['date']):
        df['date'] = pd.to_datetime(df['date'], unit='s')
    else:
        df['date'] = pd.to_datetime(df['date'])

# Extract the year from the date
df['season'] = df['date'].dt.year

In [26]:
# Add the game-level stat columns computed by features.game_stats from the
# `games` table, then display the resulting shape as a sanity check.
df = features.game_stats(games,df)
df.shape

(11725, 63)

### Season Stats

In [35]:
# Batting columns to aggregate per season. The recorded output shows that
# features.add_season_rolling adds `<stat>_mean/_std/_skew_{home,away}_batting`
# columns for the stats listed here.
batting_stats = ['ab', 'bb', 'h', 'pa', 'po', 'r', 'rbi', 'so', 'batting_avg',
             'leverage_index_avg', 'onbase_perc', 'onbase_plus_slugging', 'pitches', 
             're24_bat', 'slugging_perc', 'strikes_total', 'wpa_bat', 'wpa_bat_neg', 
             'wpa_bat_pos']
df = features.add_season_rolling(batting, df, batting_stats, True,'batting')
# Shape check: the column count should grow, the row count should not change.
df.shape

Index(['name', 'ip', 'h', 'r', 'er', 'bb', 'so', 'hr', 'earned_run_avg',
       'batters_faced',
       ...
       'strikes_total_skew_home_batting', 'wpa_bat_mean_home_batting',
       'wpa_bat_std_home_batting', 'wpa_bat_skew_home_batting',
       'wpa_bat_neg_mean_home_batting', 'wpa_bat_neg_std_home_batting',
       'wpa_bat_neg_skew_home_batting', 'wpa_bat_pos_mean_home_batting',
       'wpa_bat_pos_std_home_batting', 'wpa_bat_pos_skew_home_batting'],
      dtype='object', length=120)
Index(['name', 'ip', 'h', 'r', 'er', 'bb', 'so', 'hr', 'earned_run_avg',
       'batters_faced',
       ...
       'strikes_total_skew_away_batting', 'wpa_bat_mean_away_batting',
       'wpa_bat_std_away_batting', 'wpa_bat_skew_away_batting',
       'wpa_bat_neg_mean_away_batting', 'wpa_bat_neg_std_away_batting',
       'wpa_bat_neg_skew_away_batting', 'wpa_bat_pos_mean_away_batting',
       'wpa_bat_pos_std_away_batting', 'wpa_bat_pos_skew_away_batting'],
      dtype='object', length=177)


(11725, 196)

In [37]:
# Pitching columns to aggregate per season, including the three
# *_batters_faced rate columns derived in the rolling-stats section.
# `pitching_stats` is reused by the per-pitcher season cell further down.
pitching_stats = ['bb', 'er', 'h', 'hr', 'ip', 'r', 'so', 'batters_faced',
               'earned_run_avg', 'game_score', 'inherited_runners',
               'inherited_score', 'inplay_fb_total', 'inplay_gb_total', 'inplay_ld',
               'inplay_unk', 'leverage_index_avg', 'pitches', 're24_def',
               'strikes_contact', 'strikes_looking', 'strikes_swinging',
               'strikes_total', 'wpa_def','SO_batters_faced','H_batters_faced',
                'BB_batters_faced']
df = features.add_season_rolling(pitching, df, pitching_stats, True,'team_pitching')
# Shape check: the column count should grow, the row count should not change.
df.shape

Index(['name', 'ip', 'h', 'r', 'er', 'bb', 'so', 'hr', 'earned_run_avg',
       'batters_faced',
       ...
       'wpa_def_skew_home_team_pitching',
       'SO_batters_faced_mean_home_team_pitching',
       'SO_batters_faced_std_home_team_pitching',
       'SO_batters_faced_skew_home_team_pitching',
       'H_batters_faced_mean_home_team_pitching',
       'H_batters_faced_std_home_team_pitching',
       'H_batters_faced_skew_home_team_pitching',
       'BB_batters_faced_mean_home_team_pitching',
       'BB_batters_faced_std_home_team_pitching',
       'BB_batters_faced_skew_home_team_pitching'],
      dtype='object', length=277)
Index(['name', 'ip', 'h', 'r', 'er', 'bb', 'so', 'hr', 'earned_run_avg',
       'batters_faced',
       ...
       'wpa_def_skew_away_team_pitching',
       'SO_batters_faced_mean_away_team_pitching',
       'SO_batters_faced_std_away_team_pitching',
       'SO_batters_faced_skew_away_team_pitching',
       'H_batters_faced_mean_away_team_pitching',
       'H_

(11725, 385)

In [39]:
# Guarantee an `away_pitcher` label on every row: create the column if it is
# absent, otherwise replace missing names with 'Unknown'.
if 'away_pitcher' not in df.columns:
    df['away_pitcher'] = 'Unknown'
else:
    # Assign the filled Series back to the frame. Calling
    # df[col].fillna(..., inplace=True) operates on an intermediate object and
    # does not update `df` under pandas Copy-on-Write (it is already
    # deprecated with a FutureWarning in pandas 2.x — which this notebook
    # suppresses, so the failure would be silent).
    df['away_pitcher'] = df['away_pitcher'].fillna('Unknown')
# NOTE(review): `home_pitcher` gets no equivalent treatment here, and the
# placeholder differs from the 'No Pitcher' label used earlier in the
# notebook — confirm whether that asymmetry is intended.

In [40]:
# Per-pitcher season aggregates, reusing the `pitching_stats` column list
# defined in the team-pitching cell. The recorded output shows
# `<stat>_mean/_std/_skew_{home,away}_pitcher` columns being added.
df = features.add_season_rolling(pitchers, df, pitching_stats, False,'pitcher')
# Shape check: the column count should grow, the row count should not change.
df.shape

Index(['name', 'ip', 'h', 'r', 'er', 'bb', 'so', 'hr', 'earned_run_avg',
       'batters_faced',
       ...
       'wpa_def_skew_home_pitcher', 'SO_batters_faced_mean_home_pitcher',
       'SO_batters_faced_std_home_pitcher',
       'SO_batters_faced_skew_home_pitcher',
       'H_batters_faced_mean_home_pitcher', 'H_batters_faced_std_home_pitcher',
       'H_batters_faced_skew_home_pitcher',
       'BB_batters_faced_mean_home_pitcher',
       'BB_batters_faced_std_home_pitcher',
       'BB_batters_faced_skew_home_pitcher'],
      dtype='object', length=467)
Index(['name', 'ip', 'h', 'r', 'er', 'bb', 'so', 'hr', 'earned_run_avg',
       'batters_faced',
       ...
       'wpa_def_skew_away_pitcher', 'SO_batters_faced_mean_away_pitcher',
       'SO_batters_faced_std_away_pitcher',
       'SO_batters_faced_skew_away_pitcher',
       'H_batters_faced_mean_away_pitcher', 'H_batters_faced_std_away_pitcher',
       'H_batters_faced_skew_away_pitcher',
       'BB_batters_faced_mean_away_pitche

(11725, 575)

In [41]:
# Checkpoint the full feature frame before NA handling so the cleanup step
# can be re-run or inspected without recomputing the features.
df.to_csv('../data/df_before_cleanup.csv', index=False)

# Cleanup

In [42]:
# Fix NAs using the project helper (see dataframe_utilities.fix_na for the
# strategy and the meaning of the boolean flag).
df = util.fix_na(df, False)

In [43]:
df.shape

(11725, 575)

In [44]:
# Persist the cleaned feature frame; index=False keeps the row index out of
# the CSV.
df.to_csv('../data/df_w_features.csv', index=False)

In [45]:
import pandas as pd
import numpy as np

def create_pitchers_with_starting(input_path='../data/pitchers.csv',
                                  output_path='../data/pitchers_with_starting.csv',
                                  seed=42):
    """Write a copy of the pitchers CSV with a random ``is_starting_pitcher`` flag.

    NOTE: the flag is synthetic. Every row gets an independent random
    True/False value, so the column is placeholder data, not real
    starting-pitcher information.

    Parameters
    ----------
    input_path : str
        CSV file to read. Defaults to the path the notebook originally used.
    output_path : str
        CSV file to write (overwritten if it exists).
    seed : int or None
        Seed for the random generator so that re-running the notebook
        produces the same file. Pass ``None`` for a fresh, non-reproducible
        draw on every call.

    Returns
    -------
    None
        The result is only written to ``output_path``.
    """
    # Load the original CSV file
    pitchers = pd.read_csv(input_path)

    # One random boolean per row, drawn from a locally seeded generator so the
    # global NumPy random state is neither consulted nor modified.
    rng = np.random.default_rng(seed)
    pitchers['is_starting_pitcher'] = rng.choice([True, False], size=len(pitchers))

    # Save the modified DataFrame to a new CSV file
    pitchers.to_csv(output_path, index=False)

# Run the function to create the new CSV file
create_pitchers_with_starting()

In [46]:
import pandas as pd
import numpy as np

def create_batting_with_team(input_path='../data/batting.csv',
                             output_path='../data/batting_with_team.csv',
                             two_letter_abbrs=('KC', 'SD', 'SF', 'TB')):
    """Write a copy of the batting CSV with a ``team`` column derived from ``game_id``.

    Game ids in this project look like ``<away><home><YYYYMMDD><suffix>``,
    e.g. ``NYMSTL20240301D`` (away NYM, home STL). Team abbreviations are not
    all three letters long (``SDMIL20240301D``, ``TEXSF20240301D``), so the
    matchup cannot be split at fixed offsets: the away team is taken as two
    characters when the id starts with a known two-letter abbreviation and as
    three characters otherwise; the home team is the rest of the leading
    letters.

    Parameters
    ----------
    input_path : str
        CSV file to read; must contain ``game_id`` and ``home_away`` columns.
    output_path : str
        CSV file to write (overwritten if it exists).
    two_letter_abbrs : iterable of str
        Abbreviations that are two letters long.
        NOTE(review): the default assumes these are the only two-letter teams
        in the data and that no three-letter abbreviation starts with one of
        them — verify against the abbreviations in the games table.

    Returns
    -------
    None
        The result is only written to ``output_path``.
    """
    batting = pd.read_csv(input_path)

    # Leading letters of the id = away + home abbreviations; the date digits
    # and the trailing suffix are dropped.
    matchup = batting['game_id'].str.extract(r'^([A-Za-z]+)', expand=False)

    # Decide per row where the away abbreviation ends.
    away_is_short = matchup.str[:2].isin(list(two_letter_abbrs))
    away_team = matchup.str[:2].where(away_is_short, matchup.str[:3])
    home_team = matchup.str[2:].where(away_is_short, matchup.str[3:])

    # Rows flagged 'away' belong to the away team; every other row is treated
    # as the home team.
    batting['team'] = np.where(batting['home_away'] == 'away', away_team, home_team)

    # Save the modified DataFrame to a new CSV file
    batting.to_csv(output_path, index=False)

# Run the function to create the new CSV file
create_batting_with_team()