# Splitting Data into Test-Train-Validation Sets

In [16]:
import csv
import argparse
import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

In [17]:
batting_data_path = 'bsb_ref.csv'
# INFO:
# 101,332 Players with up to 20 features each (exluding year, including team)
# if metric not reported for player, set to 0.0 by default

df = pd.read_csv(batting_data_path, index_col = 0)
df = df.fillna(0)

In [18]:
df_recent_players = df[df.year_ID >= 1975] #48k players
team_set = set(df_recent_players.team_ID)
df_recent_players

#  NOTE: Potentially create one-hot encoding of teams...
## I chose to leave it off for now because I think it may
## create Data leakage in our model


Unnamed: 0_level_0,age,mlb_ID,player_ID,year_ID,team_ID,stint_ID,lg_ID,PA,G,Inn,...,oppRpG_rep,pyth_exponent,pyth_exponent_rep,waa_win_perc,waa_win_perc_off,waa_win_perc_def,waa_win_perc_rep,OPS_plus,TOB_lg,TB_lg
name_common,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
David Aardsma,22.0,430911.0,aardsda01,2004,SFG,1,NL,0.0,11,10.7,...,4.67092,1.890,1.890,0.5000,0.5000,0.5000,0.5000,0.0000,0.000,0.000
David Aardsma,24.0,430911.0,aardsda01,2006,CHC,1,NL,3.0,43,53.0,...,4.86457,1.912,1.913,0.4990,0.4990,0.5000,0.4998,-100.0000,0.694,0.896
David Aardsma,25.0,430911.0,aardsda01,2007,CHW,1,AL,0.0,2,32.3,...,4.85895,1.912,1.912,0.5000,0.5000,0.5000,0.5000,0.0000,0.000,0.000
David Aardsma,26.0,430911.0,aardsda01,2008,BOS,1,AL,1.0,5,48.7,...,4.69650,1.893,1.894,0.4970,0.4970,0.5000,0.4992,-100.0000,0.345,0.434
David Aardsma,27.0,430911.0,aardsda01,2009,SEA,1,AL,0.0,3,71.3,...,4.79788,1.905,1.905,0.5000,0.5000,0.5000,0.5000,0.0000,0.000,0.000
David Aardsma,28.0,430911.0,aardsda01,2010,SEA,1,AL,0.0,4,49.7,...,4.44684,1.864,1.864,0.5000,0.5000,0.5000,0.5000,0.0000,0.000,0.000
David Aardsma,30.0,430911.0,aardsda01,2012,NYY,1,AL,0.0,0,1.0,...,0.00000,0.000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.000,0.000
David Aardsma,31.0,430911.0,aardsda01,2013,NYM,1,NL,0.0,41,39.7,...,4.02801,1.812,1.812,0.5000,0.5000,0.5000,0.5000,0.0000,0.000,0.000
David Aardsma,33.0,430911.0,aardsda01,2015,ATL,1,NL,1.0,30,30.7,...,4.22114,1.837,1.837,0.4996,0.4996,0.5000,0.4999,-100.0000,0.320,0.404
Hank Aaron,41.0,110001.0,aaronha01,1975,MIL,1,AL,543.0,137,1164.0,...,4.15661,1.839,1.838,0.4861,0.4868,0.4899,0.4846,94.9126,177.668,176.282


In [19]:
#  Randomly shuffle by player year
## NOTE: This may be an inaccurate assumption to make.
## We're implicitly assuming that training on players from 1975
## won't impact prediction accuracy for modern day players.

## If this is an incorrect assumption, consider data augmentation,
## taking into account differences between current-day MLB players

df_recent_players = df_recent_players.sample(frac=1) #randomly permute by year
player_ids = [df.iloc[:,0:0]]
df_recent_players
# player_ids = set(df_recent_players.playerID) #List of all player ids

# len(player_ids)

Unnamed: 0_level_0,age,mlb_ID,player_ID,year_ID,team_ID,stint_ID,lg_ID,PA,G,Inn,...,oppRpG_rep,pyth_exponent,pyth_exponent_rep,waa_win_perc,waa_win_perc_off,waa_win_perc_def,waa_win_perc_rep,OPS_plus,TOB_lg,TB_lg
name_common,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
David Murphy,29.0,461815.0,murphda07,2011,TEX,1,AL,440.0,120,976.7,...,4.29912,1.858,1.855,0.4935,0.4909,0.4982,0.4854,92.4950,146.889,171.498
Jesse Hahn,25.0,534910.0,hahnje01,2015,OAK,1,AL,3.0,1,96.7,...,4.19651,1.824,1.840,0.4591,0.4591,0.5011,0.4884,-100.0000,0.944,1.223
Steve Kline,29.0,117157.0,klinest02,2002,STL,1,NL,1.0,64,58.3,...,4.48052,1.868,1.868,0.4998,0.4998,0.5000,0.4999,-100.0000,0.337,0.419
Domingo Ramos,25.0,120908.0,ramosdo01,1983,SEA,1,AL,136.0,53,316.0,...,4.41000,1.869,1.865,0.5040,0.4983,0.5096,0.4908,86.8281,44.914,51.765
Brian McRae,25.0,118871.0,mcraebr01,1993,KCR,1,AL,685.0,153,1345.3,...,4.57187,1.899,1.888,0.5058,0.5018,0.5064,0.4837,92.8677,231.629,262.274
Bert Blyleven,25.0,111126.0,blylebe01,1976,MIN,1,AL,0.0,0,95.3,...,0.00000,0.000,0.000,0.0000,0.0000,0.0000,0.0000,0.0000,0.000,0.000
Matt Grott,27.0,115199.0,grottma01,1995,CIN,1,NL,0.0,2,1.7,...,4.63299,1.886,1.886,0.5000,0.5000,0.5000,0.5000,0.0000,0.000,0.000
Johnnie LeMaster,30.0,117665.0,lemasjo01,1984,SFG,1,NL,493.0,132,1055.3,...,3.93455,1.810,1.809,0.4779,0.4873,0.4971,0.4856,56.3834,157.610,170.388
Rick Aguilera,28.0,110090.0,aguilri01,1990,MIN,1,AL,0.0,1,65.3,...,4.32902,1.850,1.850,0.5000,0.5000,0.5000,0.5000,0.0000,0.000,0.000
Rich Hinton,29.0,115975.0,hintori01,1976,CIN,1,NL,1.0,12,17.7,...,3.97231,1.805,1.805,0.4987,0.4987,0.5000,0.4997,-100.0000,0.332,0.378


In [25]:
x_data = df_recent_players[df_recent_players.columns[7:9]]
y_data = df_recent_players['OPS_plus']
print(x_data)

                        PA    G
name_common                    
David Murphy         440.0  120
Jesse Hahn             3.0    1
Steve Kline            1.0   64
Domingo Ramos        136.0   53
Brian McRae          685.0  153
Bert Blyleven          0.0    0
Matt Grott             0.0    2
Johnnie LeMaster     493.0  132
Rick Aguilera          0.0    1
Rich Hinton            1.0   12
Ryan McGuire         222.0   84
Mark Gubicza           0.0    0
Ken Landreaux        536.0  129
Jeff Pierce            0.0    0
Jose Gonzalez         12.0   23
Max Stassi           250.0   88
Tom Burgmeier          0.0    3
Kevin Orie           230.0   64
Ben Oglivie          660.0  156
Pete Dalena            7.0    5
Shane Andrews        559.0  150
Dusty Baker          604.0  153
Steve Fireovid         0.0    6
Chris George           3.0    1
Ryan Perry             0.0    1
Benny Ayala           94.0   42
Joe Wieland            2.0    1
Wilson Betemit       198.0   87
Scott Proctor          2.0   28
Mike Sch

In [26]:
reg = LinearRegression().fit(x_data, y_data)
print(reg.score(x_data, y_data))

0.2725445040216048
