# Assessing Our Salary Data

In [6]:
import pandas as pd
import os

In [7]:
# Import all Y data from 2007-08 to 2023-24
Y_file_path = '../../Data/Lake/1B_Output_Salary/dataframe'

# Y_file_path contains csv files for each year from 2007-08 to 2023-24
salary_data = pd.DataFrame()
for year in range(2008, 2025):
    this_year = pd.read_csv(Y_file_path + '/season=' + str(year) + '.csv')
    salary_data = pd.concat([salary_data, this_year])

salary_data.shape

(22085, 24)

In [8]:
salary_data.columns.tolist()

['PLAYER',
 'TEAM',
 'AGE',
 'DATE OF BIRTH',
 'COUNTRY',
 'POS',
 'HANDED',
 'DRAFTED',
 'SIGNING TEAM',
 'TYPE',
 'SIGNING AGE',
 'CLAUSE',
 'LENGTH',
 'EXPIRY',
 'EXP. YEAR',
 'CAP HIT',
 'AAV',
 'SALARY',
 'BASE SALARY',
 'S.BONUS',
 'P.BONUS',
 'season',
 'Y_Salary_Cap',
 'Y_Salary_Cap_Percentage']

In [9]:
# View players that show up twice in one year in our salary data
indices = salary_data[salary_data.duplicated(subset=['PLAYER', 'season'], keep=False)].sort_values(by=['PLAYER', 'POS', 'season']).index

In [10]:
# drop all records based on indices
salary_data.drop(index=indices, inplace=True)

In [11]:
salary_data.describe()

Unnamed: 0,SIGNING AGE,LENGTH,EXP. YEAR,Y_Salary_Cap,Y_Salary_Cap_Percentage
count,20719.0,20719.0,20719.0,20719.0,20719.0
mean,23.839423,2.908731,2018.207925,72317790.0,0.025804
std,4.440089,1.72488,4.718979,9612829.0,0.026796
min,0.0,1.0,2008.0,50300000.0,0.00785
25%,20.0,2.0,2015.0,64300000.0,0.009576
50%,23.0,3.0,2019.0,73000000.0,0.011597
75%,27.0,3.0,2022.0,81500000.0,0.033444
max,48.0,15.0,2031.0,83500000.0,0.279082


In [14]:
# Save salary data to the warehouse
outdir = '../../Data/Warehouse/SalaryData/'
outfile = 'salary_data.csv' 
if not os.path.exists(outdir):
    os.mkdir(outdir)
salary_data.to_csv(outdir + '/' + outfile, index=False)

Unnamed: 0,SIGNING AGE,LENGTH,EXP. YEAR,Y_Salary_Cap,Y_Salary_Cap_Percentage
count,20718.0,20718.0,20718.0,20718.0,20718.0
mean,23.839463,2.908823,2018.208225,72318170.0,0.025792
std,4.440192,1.72487,4.718896,9612899.0,0.026739
min,0.0,1.0,2008.0,50300000.0,0.00785
25%,20.0,2.0,2015.0,64300000.0,0.009576
50%,23.0,3.0,2019.0,73000000.0,0.011596
75%,27.0,3.0,2022.0,81500000.0,0.033437
max,48.0,15.0,2031.0,83500000.0,0.168227
