In [4]:
import pandas as pd
import numpy as np
import glob

In [5]:
csv_files = glob.glob("data/lahman_1871-2024_csv/*.csv")
csv_files

['data/lahman_1871-2024_csv\\AllstarFull.csv',
 'data/lahman_1871-2024_csv\\Appearances.csv',
 'data/lahman_1871-2024_csv\\AwardsManagers.csv',
 'data/lahman_1871-2024_csv\\AwardsPlayers.csv',
 'data/lahman_1871-2024_csv\\AwardsShareManagers.csv',
 'data/lahman_1871-2024_csv\\AwardsSharePlayers.csv',
 'data/lahman_1871-2024_csv\\Batting.csv',
 'data/lahman_1871-2024_csv\\BattingPost.csv',
 'data/lahman_1871-2024_csv\\CollegePlaying.csv',
 'data/lahman_1871-2024_csv\\Fielding.csv',
 'data/lahman_1871-2024_csv\\FieldingOF.csv',
 'data/lahman_1871-2024_csv\\FieldingOFsplit.csv',
 'data/lahman_1871-2024_csv\\FieldingPost.csv',
 'data/lahman_1871-2024_csv\\HallOfFame.csv',
 'data/lahman_1871-2024_csv\\HomeGames.csv',
 'data/lahman_1871-2024_csv\\Managers.csv',
 'data/lahman_1871-2024_csv\\ManagersHalf.csv',
 'data/lahman_1871-2024_csv\\Parks.csv',
 'data/lahman_1871-2024_csv\\People.csv',
 'data/lahman_1871-2024_csv\\Pitching.csv',
 'data/lahman_1871-2024_csv\\PitchingPost.csv',
 'data/lahm

In [6]:
salaries = pd.read_csv("data/lahman_1871-2024_csv/Salaries.csv")
salaries.head()

latest_year = salaries['yearID'].max()
print("Latest year with salary data:", latest_year)

Latest year with salary data: 2016


In [7]:
years = list(range(latest_year - 10, latest_year + 1))
years

[2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016]

In [8]:
salaries_11yr = salaries[salaries['yearID'].isin(years)].copy()

In [9]:
pitching = pd.read_csv("data/lahman_1871-2024_csv/Pitching.csv")
fielding = pd.read_csv("data/lahman_1871-2024_csv/Fielding.csv")

pitching_agg = pitching.groupby(['playerID', 'yearID'], as_index=False).sum(numeric_only=True)
fielding_agg = fielding.groupby(['playerID', 'yearID'], as_index=False).sum(numeric_only=True)

In [10]:
pitcher_salaries = salaries_11yr.merge(pitching_agg, on=['playerID', 'yearID'], how='inner')
combined = pitcher_salaries.merge(fielding_agg, on=['playerID', 'yearID'], how='left', suffixes=('_pitch', '_field'))

In [11]:
combined.rename(columns={'salary': 'Salary'}, inplace=True)

In [12]:
combined['TrainVal'] = np.where(combined['yearID'] == latest_year, 'Validation', 'Training')

In [13]:
cols_to_drop = ['teamID', 'lgID', 'stint']
combined = combined.drop(columns=[c for c in cols_to_drop if c in combined.columns], errors='ignore')

In [19]:
combined.to_csv("data/Pitcher_Salary_Data.csv", index=False)

In [20]:
final_data = pd.read_csv("data/Pitcher_Salary_Data.csv")
print("Preview of Saved Pitcher_Salary_Data.csv ")
display(final_data.head(43))

Preview of Saved Pitcher_Salary_Data.csv 


Unnamed: 0,yearID,playerID,Salary,stint_pitch,W,L,G_pitch,GS_pitch,CG,SHO,...,PO,A,E,DP,PB,WP_field,SB,CS,ZR,TrainVal
0,2007,aardsda01,387500,1,2,1,25,0,0,0,...,2,4,1.0,0,0.0,0.0,0.0,0.0,0.0,Training
1,2008,aardsda01,403250,1,4,2,47,0,0,0,...,3,6,0.0,0,0.0,0.0,0.0,0.0,0.0,Training
2,2009,aardsda01,419000,1,3,6,73,0,0,0,...,2,5,0.0,1,0.0,0.0,0.0,0.0,0.0,Training
3,2010,aardsda01,2750000,1,0,6,53,0,0,0,...,2,3,1.0,0,0.0,0.0,0.0,0.0,0.0,Training
4,2012,aardsda01,500000,1,0,0,1,0,0,0,...,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,Training
5,2011,abadfe01,418000,1,1,4,29,0,0,0,...,1,2,2.0,0,0.0,0.0,0.0,0.0,0.0,Training
6,2012,abadfe01,485000,1,0,6,37,6,0,0,...,3,6,0.0,0,0.0,0.0,0.0,0.0,0.0,Training
7,2014,abadfe01,525900,1,2,4,69,0,0,0,...,0,8,0.0,0,0.0,0.0,0.0,0.0,0.0,Training
8,2015,abadfe01,1087500,1,2,2,62,0,0,0,...,2,4,0.0,0,0.0,0.0,0.0,0.0,0.0,Training
9,2016,abadfe01,1250000,3,1,6,57,0,0,0,...,0,4,0.0,1,0.0,0.0,0.0,0.0,0.0,Validation
