# Splitting Data into Test-Train-Validation Sets

In [1]:
import csv
import argparse
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import OneHotEncoder

In [2]:
# Location of the raw batting statistics CSV.
batting_data_path = 'data/Batting.csv'

# INFO (author's notes on the dataset):
#   - 101,332 players with up to 20 features each
#     (excluding year, including team).
#   - A metric that was not reported for a player is replaced by 0 below.

# The first CSV column becomes the row index; every missing value is then
# filled with 0 as part of the same chained expression.
df = pd.read_csv(batting_data_path, index_col=0).fillna(0)

In [3]:
#  NOTE: Potentially create a one-hot encoding of teams...
## It is left out for now because it may introduce
## data leakage into the model.

# Keep only the rows whose yearID is 1975 or later
# (about 48k rows according to the author's note).
df_recent_players = df.loc[df["yearID"].ge(1975)]

# Distinct team identifiers that occur in the filtered rows.
team_set = set(df_recent_players["teamID"])

df_recent_players


Unnamed: 0_level_0,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaronha01,1975,1,ML4,AL,137,465.0,45.0,109.0,16.0,2.0,...,60.0,0.0,1.0,70.0,51.0,3.0,1.0,1.0,6.0,15.0
abbotgl01,1975,1,OAK,AL,30,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
acostcy01,1975,1,PHI,NL,6,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
adamsgl01,1975,1,SFN,NL,61,90.0,10.0,27.0,2.0,1.0,...,15.0,1.0,0.0,11.0,25.0,0.0,1.0,0.0,1.0,1.0
alburvi01,1975,1,MIN,AL,33,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
alexado01,1975,1,BAL,AL,32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
alexaga01,1975,1,SFN,NL,3,3.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0
alexama01,1975,1,OAK,AL,63,10.0,16.0,1.0,0.0,0.0,...,0.0,17.0,10.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
allendi01,1975,1,PHI,NL,119,416.0,54.0,97.0,21.0,3.0,...,62.0,11.0,2.0,58.0,109.0,4.0,2.0,1.0,4.0,19.0
allenll01,1975,1,CHA,AL,3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
#  Randomly shuffle the row order of the post-1975 records.
## NOTE: This may be an inaccurate assumption to make.
## We're implicitly assuming that training on players from 1975
## won't impact prediction accuracy for modern day players.

## If this is an incorrect assumption, consider data augmentation,
## taking into account differences between current-day MLB players

# sample(frac=1) returns every row in a uniformly random order (it does not
# group or order by year). The fixed random_state makes the shuffle -- and
# any split derived from it -- identical after Restart Kernel -> Run All.
df_recent_players = df_recent_players.sample(frac=1, random_state=42)

# playerID is the row index (the CSV is read with index_col=0), so the ids are
# taken from the index of the filtered frame rather than from a column slice
# of the unfiltered `df`. The same id may label several rows (one per
# yearID/stint), hence unique(); order follows the shuffled frame.
player_ids = df_recent_players.index.unique().tolist()  # list of all player ids

# NOTE(review): the previously saved output of this cell contains pre-1975
# rows, i.e. it was produced by out-of-order execution; re-run top to bottom.
df_recent_players.head(10)

Unnamed: 0_level_0,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bartedi01,1936,1,NY1,NL,145,510.0,71.0,152.0,31.0,3.0,...,42.0,6.0,0.0,40.0,36.0,0.0,5.0,18.0,0.0,9.0
musseje01,1989,2,NYN,NL,20,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
hawksbl01,2009,1,SLN,NL,30,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
stearec01,1880,1,BFN,NL,28,104.0,8.0,19.0,6.0,1.0,...,13.0,0.0,0.0,3.0,23.0,0.0,0.0,0.0,0.0,0.0
lombavi01,1949,1,PIT,NL,43,49.0,8.0,17.0,0.0,1.0,...,10.0,1.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0
worreti01,1999,1,OAK,AL,53,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
werlebi01,1952,2,SLN,NL,19,9.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
noriejo01,1969,1,CIN,NL,5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
harreke01,1969,2,CLE,AL,149,519.0,83.0,115.0,13.0,4.0,...,84.0,17.0,8.0,95.0,96.0,6.0,2.0,0.0,5.0,18.0
donnech01,1993,1,HOU,NL,88,179.0,18.0,46.0,14.0,2.0,...,24.0,2.0,0.0,19.0,33.0,0.0,0.0,0.0,1.0,6.0


In [51]:
x_data = df_recent_players[df_recent_players.columns[6:]]