In [1]:
# Import module/s
import pandas as pd
import numpy as np

# NBA Data Cleaning

In [2]:
# Read in NBA csv for player salary data
# Set encoding to ISO-8859-1
nba_csv = '../data/raw/NBA_PlayerByYear.csv'
nba_data = pd.read_csv(nba_csv, sep=';', encoding='ISO-8859-1')

# Create DataFrame
nba_df = pd.DataFrame(nba_data)
nba_df

Unnamed: 0,Player,2017,2016,2015,2014,2013,2012,2011,2010,2009,...,2000,1999,1998,1997,1996,1995,1994,1993,1992,1991
0,A.C. Green,,,,,,,,,,...,1700000.0,5125088,5095088.0,4851000.0,6473000,6472600.0,1885000.0,1750000.0,1750000.0,1750000.0
1,A.J. Bramlett,,,,,,,,,,...,118974.0,,,,,,,,,
2,A.J. English,,,,,,,,,,...,,,,,,,150000.0,406000.0,325000.0,275000.0
3,A.J. Guyton,,,,,,,,,,...,,,,,,,,,,
4,A.J. Price,,,281484.0,947907.0,885120.0,854389.0,762195.0,457588.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2199,Zeljko Rebraca,,,,,,,,,,...,,,,,,,,,,
2200,Zendon Hamilton,,,,,,,,,,...,,,,,,,,,,
2201,Zoran Dragic,,1756500.0,1756500.0,,,,,,,...,,,,,,,,,,
2202,Zoran Planinic,,,,,,,,,,...,,,,,,,,,,


In [3]:
# Get info on NBA data
nba_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2204 entries, 0 to 2203
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  2204 non-null   object 
 1   2017    499 non-null    float64
 2   2016    500 non-null    float64
 3   2015    513 non-null    float64
 4   2014    501 non-null    float64
 5   2013    481 non-null    float64
 6   2012    464 non-null    float64
 7   2011    460 non-null    float64
 8   2010    457 non-null    float64
 9   2009    461 non-null    float64
 10  2008    470 non-null    float64
 11  2007    496 non-null    float64
 12  2006    480 non-null    float64
 13  2005    471 non-null    float64
 14  2004    455 non-null    float64
 15  2003    452 non-null    float64
 16  2002    451 non-null    float64
 17  2001    456 non-null    float64
 18  2000    517 non-null    float64
 19  1999    447 non-null    object 
 20  1998    445 non-null    float64
 21  1997    416 non-null    float64
 22  

In [4]:
# Find max values for each year
nba_df.max()

  nba_df.max()


Player    Zydrunas Ilgauskas
2017              30963450.0
2016              25000000.0
2015              23500000.0
2014              30453805.0
2013              27849149.0
2012              25244493.0
2011              24806250.0
2010              23239562.0
2009              24751934.0
2008              23750000.0
2007              21000000.0
2006              20000000.0
2005              27696430.0
2004              28000000.0
2003              25200000.0
2002              22400000.0
2001              19610000.0
2000              17142858.0
1998              33140000.0
1997              30140000.0
1995              14660000.0
1994               5740000.0
1993               5720000.0
1992               7070000.0
1991               4250000.0
dtype: object

In [5]:
# Check for 'Unknown' values in the entire DataFrame
unknown_values_mask = nba_df.applymap(lambda x: x == 'Unknown')

# Get the rows and columns where 'Unknown' values are present
unknown_values_locations = (unknown_values_mask).any(axis=1)

# Display rows where 'Unknown' values are present
rows_with_unknown_values = nba_df[unknown_values_locations]
# print(rows_with_unknown_values)

# Display quantity of 'Unknown' values 
total_unknown_values = rows_with_unknown_values.count().sum()
print("Total 'Unknown' values:", total_unknown_values)


Total 'Unknown' values: 283


### NOTE:
above cell must be executed again after executing below cell to show proper removal of 'Unknown' values

In [6]:
# Replace 'Unknown's with NaN to match other non-numeric (NaN) values
nba_df = nba_df.replace(to_replace='Unknown', value=np.nan)

# Re-Check 'Unknown' values
print("Total 'Unknown' values:", total_unknown_values)

Total 'Unknown' values: 283


In [7]:
# Convert the object type columns to floats
nba_df['1999'] = nba_df['1999'].astype(float)
nba_df['1996'] = nba_df['1996'].astype(float)

# Re-check data
nba_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2204 entries, 0 to 2203
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  2204 non-null   object 
 1   2017    499 non-null    float64
 2   2016    500 non-null    float64
 3   2015    513 non-null    float64
 4   2014    501 non-null    float64
 5   2013    481 non-null    float64
 6   2012    464 non-null    float64
 7   2011    460 non-null    float64
 8   2010    457 non-null    float64
 9   2009    461 non-null    float64
 10  2008    470 non-null    float64
 11  2007    496 non-null    float64
 12  2006    480 non-null    float64
 13  2005    471 non-null    float64
 14  2004    455 non-null    float64
 15  2003    452 non-null    float64
 16  2002    451 non-null    float64
 17  2001    456 non-null    float64
 18  2000    517 non-null    float64
 19  1999    427 non-null    float64
 20  1998    445 non-null    float64
 21  1997    416 non-null    float64
 22  

### Rounding Salary Values
Final desired format for all salary values will be whole numbers with no decimals

In [8]:
# Identify numeric columns (excluding 'Player' column)
numeric_columns = nba_df.select_dtypes(include=['number']).columns.difference(['Player'])

# Round and convert only the numeric columns (excluding 'Player' column)
nba_df[numeric_columns] = nba_df[numeric_columns].apply(lambda x: round(x).astype('Int64'))

# Print the modified DataFrame
print(nba_df)

                  Player  2017     2016     2015    2014    2013    2012  \
0             A.C. Green  <NA>     <NA>     <NA>    <NA>    <NA>    <NA>   
1          A.J. Bramlett  <NA>     <NA>     <NA>    <NA>    <NA>    <NA>   
2           A.J. English  <NA>     <NA>     <NA>    <NA>    <NA>    <NA>   
3            A.J. Guyton  <NA>     <NA>     <NA>    <NA>    <NA>    <NA>   
4             A.J. Price  <NA>     <NA>   281484  947907  885120  854389   
...                  ...   ...      ...      ...     ...     ...     ...   
2199      Zeljko Rebraca  <NA>     <NA>     <NA>    <NA>    <NA>    <NA>   
2200     Zendon Hamilton  <NA>     <NA>     <NA>    <NA>    <NA>    <NA>   
2201        Zoran Dragic  <NA>  1756500  1756500    <NA>    <NA>    <NA>   
2202      Zoran Planinic  <NA>     <NA>     <NA>    <NA>    <NA>    <NA>   
2203  Zydrunas Ilgauskas  <NA>     <NA>     <NA>    <NA>    <NA>    <NA>   

         2011      2010      2009  ...     2000     1999     1998     1997  \
0        

In [9]:
# Look deeper into data
nba_df.head(15)

Unnamed: 0,Player,2017,2016,2015,2014,2013,2012,2011,2010,2009,...,2000,1999,1998,1997,1996,1995,1994,1993,1992,1991
0,A.C. Green,,,,,,,,,,...,1700000.0,5125088.0,5095088.0,4851000.0,6473000.0,6472600.0,1885000.0,1750000.0,1750000.0,1750000.0
1,A.J. Bramlett,,,,,,,,,,...,118974.0,,,,,,,,,
2,A.J. English,,,,,,,,,,...,,,,,,,150000.0,406000.0,325000.0,275000.0
3,A.J. Guyton,,,,,,,,,,...,,,,,,,,,,
4,A.J. Price,,,281484.0,947907.0,885120.0,854389.0,762195.0,457588.0,,...,,,,,,,,,,
5,A.J. Wynder,,,,,,,,,,...,,,,,,,,140000.0,130000.0,30000.0
6,Aaron Brooks,2700000.0,2250000.0,915243.0,947907.0,3250000.0,,2016692.0,1118520.0,1045560.0,...,,,,,,,,,,
7,Aaron Gordon,4351320.0,4171680.0,3992040.0,,,,,,,...,,,,,,,,,,
8,Aaron Gray,,1356146.0,1227985.0,2690875.0,2575000.0,2500000.0,1028840.0,1000497.0,711517.0,...,,,,,,,,,,
9,Aaron Harrison,874636.0,525093.0,,,,,,,,...,,,,,,,,,,


### NOTE:
`NaN` values are automatically converted to `<NA>`values when applying rounding and conversion operations to columns containing missing values in Pandas. <br>
`<NA>` is used specifically for missing values in nullable integer data types, providing better support for integer-specific operations.

In [10]:
# Add 'League' column, for purposes of the final merged DataFrame showing the league of each player

# Define desired column position
insert_pos = 0

# Create and insert 'League'column
nba_df.insert(insert_pos, 'League', 'NBA')

# Display DF to verify column addition
nba_df.head()

Unnamed: 0,League,Player,2017,2016,2015,2014,2013,2012,2011,2010,...,2000,1999,1998,1997,1996,1995,1994,1993,1992,1991
0,NBA,A.C. Green,,,,,,,,,...,1700000.0,5125088.0,5095088.0,4851000.0,6473000.0,6472600.0,1885000.0,1750000.0,1750000.0,1750000.0
1,NBA,A.J. Bramlett,,,,,,,,,...,118974.0,,,,,,,,,
2,NBA,A.J. English,,,,,,,,,...,,,,,,,150000.0,406000.0,325000.0,275000.0
3,NBA,A.J. Guyton,,,,,,,,,...,,,,,,,,,,
4,NBA,A.J. Price,,,281484.0,947907.0,885120.0,854389.0,762195.0,457588.0,...,,,,,,,,,,


In [11]:
# Review final NBA DataFrame info 
nba_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2204 entries, 0 to 2203
Data columns (total 29 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   League  2204 non-null   object
 1   Player  2204 non-null   object
 2   2017    499 non-null    Int64 
 3   2016    500 non-null    Int64 
 4   2015    513 non-null    Int64 
 5   2014    501 non-null    Int64 
 6   2013    481 non-null    Int64 
 7   2012    464 non-null    Int64 
 8   2011    460 non-null    Int64 
 9   2010    457 non-null    Int64 
 10  2009    461 non-null    Int64 
 11  2008    470 non-null    Int64 
 12  2007    496 non-null    Int64 
 13  2006    480 non-null    Int64 
 14  2005    471 non-null    Int64 
 15  2004    455 non-null    Int64 
 16  2003    452 non-null    Int64 
 17  2002    451 non-null    Int64 
 18  2001    456 non-null    Int64 
 19  2000    517 non-null    Int64 
 20  1999    427 non-null    Int64 
 21  1998    445 non-null    Int64 
 22  1997    416 non-null    

In [12]:
# Check Max values are still correct after rounding
nba_df.max()

League                   NBA
Player    Zydrunas Ilgauskas
2017                30963450
2016                25000000
2015                23500000
2014                30453805
2013                27849149
2012                25244493
2011                24806250
2010                23239562
2009                24751934
2008                23750000
2007                21000000
2006                20000000
2005                27696430
2004                28000000
2003                25200000
2002                22400000
2001                19610000
2000                17142858
1999                18500000
1998                33140000
1997                30140000
1996                18724000
1995                14660000
1994                 5740000
1993                 5720000
1992                 7070000
1991                 4250000
dtype: object

# NHL Data Cleaning 

In [13]:
# Read in csv for the NHL player salary data
# Utilize encoding='ISO-8859-1'
nhl_csv = '../data/raw/NHL_PlayerByYear.csv'
nhl_data = pd.read_csv(nhl_csv, sep=';', encoding='ISO-8859-1')

# Create DataFrame
nhl_df = pd.DataFrame(nhl_data)
nhl_df

Unnamed: 0,Player,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,A.J. Greer,741666.0,741666.0,,,,,,,,,,,,,,,,
1,Aaron Dell,625000.0,625000.0,,,,,,,,,,,,,,,,
2,Aaron Ekblad,7500000.0,925000.0,925000.0,1775000.0,,,,,,,,,,,,,,
3,Aaron Johnson,,,,,,650000.0,550000.0,,,,,,,,,,,
4,Aaron Ness,612500.0,612500.0,,,,875000.0,875000.0,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1871,Zane McIntyre,650000.0,925000.0,,,,,,,,,,,,,,,,
1872,Zbynek Michalek,,3200000.0,3200000.0,440860.0,4000000.0,4000000.0,4000000.0,4000000.0,1250000.0,1250000.0,1250000.0,1250000.0,,,,,,
1873,Zdeno Chara,4000000.0,6916667.0,6916667.0,6916667.0,6916667.0,6916667.0,6916665.0,7500000.0,7500000.0,7500000.0,7500000.0,7500000.0,,,1750000.0,1750000.0,1750000.0,632500.0
1874,Zemgus Girgensons,1600000.0,1150000.0,894167.0,1106667.0,1369167.0,,,,,,,,,,,,,


In [14]:
# Find max value for each year
nhl_df.max()

Player    Zenon Konopka
2017         10500000.0
2016         10500000.0
2015         10500000.0
2014          9538462.0
2013          9538462.0
2012          9538462.0
2011          9538462.0
2010          9650000.0
2009          9538462.0
2008          9538462.0
2007          9240000.0
2006          9240000.0
2005          9240000.0
2004          2550000.0
2003          9666667.0
2002          9666667.0
2001          9666667.0
2000          8750000.0
dtype: object

In [15]:
# Review NHL salary data 
nhl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1876 entries, 0 to 1875
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  1876 non-null   object 
 1   2017    1046 non-null   float64
 2   2016    1040 non-null   float64
 3   2015    964 non-null    float64
 4   2014    788 non-null    float64
 5   2013    790 non-null    float64
 6   2012    825 non-null    float64
 7   2011    957 non-null    float64
 8   2010    853 non-null    float64
 9   2009    734 non-null    float64
 10  2008    637 non-null    float64
 11  2007    469 non-null    float64
 12  2006    303 non-null    float64
 13  2005    186 non-null    float64
 14  2004    4 non-null      float64
 15  2003    107 non-null    float64
 16  2002    87 non-null     float64
 17  2001    71 non-null     float64
 18  2000    52 non-null     float64
dtypes: float64(18), object(1)
memory usage: 278.6+ KB


In [16]:
# Cell to round salary values to whole numbers (Same code as NBA cleaning, to match format)

# Identify numeric columns (excluding 'Player' column)
nhl_num_columns = nhl_df.select_dtypes(include=['number']).columns.difference(['Player'])

# Round and convert only the numeric columns (excluding 'Player' column)
nhl_df[nhl_num_columns] = nhl_df[nhl_num_columns].apply(lambda x: round(x).astype('Int64'))

# Display Rounded DataFrame
nhl_df.head() 

Unnamed: 0,Player,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,A.J. Greer,741666.0,741666.0,,,,,,,,,,,,,,,,
1,Aaron Dell,625000.0,625000.0,,,,,,,,,,,,,,,,
2,Aaron Ekblad,7500000.0,925000.0,925000.0,1775000.0,,,,,,,,,,,,,,
3,Aaron Johnson,,,,,,650000.0,550000.0,,,,,,,,,,,
4,Aaron Ness,612500.0,612500.0,,,,875000.0,875000.0,,,,,,,,,,,


In [17]:
# Add 'League' column, for purposes of the final merged DataFrame showing the league of each player

# Create and insert 'League'column, insert_pos pre-defined
nhl_df.insert(insert_pos, 'League', 'NHL')

# Display DF to verify column addition
nhl_df.head()

Unnamed: 0,League,Player,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,NHL,A.J. Greer,741666.0,741666.0,,,,,,,,,,,,,,,,
1,NHL,Aaron Dell,625000.0,625000.0,,,,,,,,,,,,,,,,
2,NHL,Aaron Ekblad,7500000.0,925000.0,925000.0,1775000.0,,,,,,,,,,,,,,
3,NHL,Aaron Johnson,,,,,,650000.0,550000.0,,,,,,,,,,,
4,NHL,Aaron Ness,612500.0,612500.0,,,,875000.0,875000.0,,,,,,,,,,,


In [18]:
# Final review of data
nhl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1876 entries, 0 to 1875
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   League  1876 non-null   object
 1   Player  1876 non-null   object
 2   2017    1046 non-null   Int64 
 3   2016    1040 non-null   Int64 
 4   2015    964 non-null    Int64 
 5   2014    788 non-null    Int64 
 6   2013    790 non-null    Int64 
 7   2012    825 non-null    Int64 
 8   2011    957 non-null    Int64 
 9   2010    853 non-null    Int64 
 10  2009    734 non-null    Int64 
 11  2008    637 non-null    Int64 
 12  2007    469 non-null    Int64 
 13  2006    303 non-null    Int64 
 14  2005    186 non-null    Int64 
 15  2004    4 non-null      Int64 
 16  2003    107 non-null    Int64 
 17  2002    87 non-null     Int64 
 18  2001    71 non-null     Int64 
 19  2000    52 non-null     Int64 
dtypes: Int64(18), object(2)
memory usage: 326.2+ KB


In [19]:
# Check max salary values are correct after rounding
nhl_df.max()

League              NHL
Player    Zenon Konopka
2017           10500000
2016           10500000
2015           10500000
2014            9538462
2013            9538462
2012            9538462
2011            9538462
2010            9650000
2009            9538462
2008            9538462
2007            9240000
2006            9240000
2005            9240000
2004            2550000
2003            9666667
2002            9666667
2001            9666667
2000            8750000
dtype: object

# NFL Data Cleaning

In [20]:
# Read in csv for NFL player salary data
# Utilize ISO-8859-1 encoding 
nfl_csv = '../data/raw/NFL_PlayerByYear.csv'
nfl_data = pd.read_csv(nfl_csv, sep=';', encoding='ISO-8859-1')

# Create DataFrame 
nfl_df = pd.DataFrame(nfl_data)
nfl_df

Unnamed: 0,Player,2017,2016,2015,2014,2013,2012,2011,2010,2009,...,2000,1999,1998,1997,1996,1995,1994,1993,1992,1991
0,A.J. Jenkins,,,,1.021.594,705.797,1.263.188,,,,...,,,,,,,,,,
1,A.J. Bouye,5.468.750,1.671.000,586.668,496.666,406.666,,,,,...,,,,,,,,,,
2,A.J. Cann,921.399,785.399,635.399,,,,,,,...,,,,,,,,,,
3,A.J. Derby,180.880,264.700,346.013,,,,,,,...,,,,,,,,,,
4,A.J. Edds,,,,116.471,,465.000,375.000,439.250,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5685,Zay Jones,1.232.696,,,,,,,,,...,,,,,,,,,,
5686,Zebrie Sanders,,,,,,322.625,,,,...,,,,,,,,,,
5687,Zeke Motta,,,,329.474,416.474,,,,,...,,,,,,,,,,
5688,Zoltan Mesko,,,,,,586.812,496.812,366.812,,...,,,,,,,,,,


In [21]:
# Examine NFL data
nfl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5690 entries, 0 to 5689
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  5690 non-null   object 
 1   2017    2115 non-null   object 
 2   2016    2094 non-null   object 
 3   2015    2077 non-null   object 
 4   2014    2046 non-null   object 
 5   2013    2015 non-null   object 
 6   2012    2156 non-null   object 
 7   2011    1923 non-null   object 
 8   2010    1718 non-null   object 
 9   2009    1542 non-null   object 
 10  2008    1379 non-null   object 
 11  2007    1219 non-null   object 
 12  2006    1027 non-null   object 
 13  2005    817 non-null    object 
 14  2004    623 non-null    object 
 15  2003    457 non-null    object 
 16  2002    374 non-null    object 
 17  2001    281 non-null    object 
 18  2000    252 non-null    object 
 19  1999    1 non-null      object 
 20  1998    169 non-null    object 
 21  1997    0 non-null      float64
 22  

To start the cleaning I will:
1. removing the periods/decimal places from the salary values to match the desired whole number format
2. convert the data types of the salary columns to numeric

In [22]:
# 1. Function to remove periods from object columns (Salary columns)
def remove_periods(s):
    if isinstance(s, str):
        return s.replace('.', '')
    return s

# Apply the custom function to all columns except 'Player' and 'League'
nfl_df.loc[:, nfl_df.columns.difference(['Player'])] = nfl_df.loc[:, nfl_df.columns.difference(['Player'])].applymap(remove_periods)

# Verify DF was modified correctly
nfl_df.head()

Unnamed: 0,Player,2017,2016,2015,2014,2013,2012,2011,2010,2009,...,2000,1999,1998,1997,1996,1995,1994,1993,1992,1991
0,A.J. Jenkins,,,,1021594.0,705797.0,1263188.0,,,,...,,,,,,,,,,
1,A.J. Bouye,5468750.0,1671000.0,586668.0,496666.0,406666.0,,,,,...,,,,,,,,,,
2,A.J. Cann,921399.0,785399.0,635399.0,,,,,,,...,,,,,,,,,,
3,A.J. Derby,180880.0,264700.0,346013.0,,,,,,,...,,,,,,,,,,
4,A.J. Edds,,,,116471.0,,465000.0,375000.0,439250.0,,...,,,,,,,,,,


In [23]:
# 2. Convert numeric columns to 'Int64' type, handling empty values
numeric_columns = nfl_df.columns.difference(['Player'])
nfl_df[numeric_columns] = nfl_df[numeric_columns].apply(
    lambda col: pd.to_numeric(col, errors='coerce', downcast='integer'))

# Verify Data Type Conversion
print(nfl_df.info())
print(nfl_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5690 entries, 0 to 5689
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  5690 non-null   object 
 1   2017    2115 non-null   float64
 2   2016    2094 non-null   float64
 3   2015    2077 non-null   float64
 4   2014    2046 non-null   float64
 5   2013    2015 non-null   float64
 6   2012    2156 non-null   float64
 7   2011    1923 non-null   float64
 8   2010    1718 non-null   float64
 9   2009    1542 non-null   float64
 10  2008    1379 non-null   float64
 11  2007    1219 non-null   float64
 12  2006    1027 non-null   float64
 13  2005    817 non-null    float64
 14  2004    623 non-null    float64
 15  2003    457 non-null    float64
 16  2002    374 non-null    float64
 17  2001    281 non-null    float64
 18  2000    252 non-null    float64
 19  1999    1 non-null      float64
 20  1998    169 non-null    float64
 21  1997    0 non-null      float64
 22  

## NOTE  
Above data type conversion successful, although left numeric values with ".0" endings which will need to be removed to match desired format

In [24]:
# Will round and convert to integers as above cleaning NHL needed 
nfl_df[numeric_columns] = nfl_df[numeric_columns].apply(lambda x: round(x).astype('Int64'))

nfl_df.head()

Unnamed: 0,Player,2017,2016,2015,2014,2013,2012,2011,2010,2009,...,2000,1999,1998,1997,1996,1995,1994,1993,1992,1991
0,A.J. Jenkins,,,,1021594.0,705797.0,1263188.0,,,,...,,,,,,,,,,
1,A.J. Bouye,5468750.0,1671000.0,586668.0,496666.0,406666.0,,,,,...,,,,,,,,,,
2,A.J. Cann,921399.0,785399.0,635399.0,,,,,,,...,,,,,,,,,,
3,A.J. Derby,180880.0,264700.0,346013.0,,,,,,,...,,,,,,,,,,
4,A.J. Edds,,,,116471.0,,465000.0,375000.0,439250.0,,...,,,,,,,,,,


In [25]:
# Add 'League' column, for purposes of the final merged DataFrame showing the league of each player

# Create and insert 'League'column, insert_pos pre-defined
nfl_df.insert(insert_pos, 'League', 'NFL')

# Display DF to verify column addition
nfl_df.head()

Unnamed: 0,League,Player,2017,2016,2015,2014,2013,2012,2011,2010,...,2000,1999,1998,1997,1996,1995,1994,1993,1992,1991
0,NFL,A.J. Jenkins,,,,1021594.0,705797.0,1263188.0,,,...,,,,,,,,,,
1,NFL,A.J. Bouye,5468750.0,1671000.0,586668.0,496666.0,406666.0,,,,...,,,,,,,,,,
2,NFL,A.J. Cann,921399.0,785399.0,635399.0,,,,,,...,,,,,,,,,,
3,NFL,A.J. Derby,180880.0,264700.0,346013.0,,,,,,...,,,,,,,,,,
4,NFL,A.J. Edds,,,,116471.0,,465000.0,375000.0,439250.0,...,,,,,,,,,,


In [26]:
# Final review of data 
nfl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5690 entries, 0 to 5689
Data columns (total 29 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   League  5690 non-null   object
 1   Player  5690 non-null   object
 2   2017    2115 non-null   Int64 
 3   2016    2094 non-null   Int64 
 4   2015    2077 non-null   Int64 
 5   2014    2046 non-null   Int64 
 6   2013    2015 non-null   Int64 
 7   2012    2156 non-null   Int64 
 8   2011    1923 non-null   Int64 
 9   2010    1718 non-null   Int64 
 10  2009    1542 non-null   Int64 
 11  2008    1379 non-null   Int64 
 12  2007    1219 non-null   Int64 
 13  2006    1027 non-null   Int64 
 14  2005    817 non-null    Int64 
 15  2004    623 non-null    Int64 
 16  2003    457 non-null    Int64 
 17  2002    374 non-null    Int64 
 18  2001    281 non-null    Int64 
 19  2000    252 non-null    Int64 
 20  1999    1 non-null      Int64 
 21  1998    169 non-null    Int64 
 22  1997    0 non-null      

In [27]:
# Check salary values are correct after rounding & reformatting
nfl_df.max()

League         NFL
Player        name
2017      24550000
2016      24200000
2015      23800000
2014      22412500
2013      20850000
2012      20500000
2011      17818000
2010      20320000
2009      23216666
2008      20716666
2007      16854168
2006      13828590
2005      14600000
2004      12400000
2003      15375333
2002      12926181
2001      10063210
2000       8637857
1999       1882142
1998       7624999
1997          <NA>
1996       5926666
1995      37140000
1994       4500000
1993       5600000
1992          <NA>
1991       2725000
dtype: object

In [28]:
# Cell to locate incorrectly formatted maximum salary value
max_value_index = nfl_df['1995'].idxmax()

# Retrieve the entire row with the maximum value
row_with_max_value = nfl_df.loc[max_value_index]

# Print the row with the maximum value
print("Row with the maximum value in column '1995':")
print(row_with_max_value)

Row with the maximum value in column '1995':
League           NFL
Player    Jeff Blake
2017            <NA>
2016            <NA>
2015            <NA>
2014            <NA>
2013            <NA>
2012            <NA>
2011            <NA>
2010            <NA>
2009            <NA>
2008            <NA>
2007            <NA>
2006            <NA>
2005          455000
2004         1000000
2003         2500000
2002          450000
2001            <NA>
2000         1690000
1999            <NA>
1998         2450000
1997            <NA>
1996         2050000
1995        37140000
1994          162000
1993            <NA>
1992            <NA>
1991            <NA>
Name: 2612, dtype: object


In [29]:
# Need to correct incorrect 1995 Salary value to $371400 not $37Million, as seen above
row_index = 2612
column_name = '1995'

# New corrected value
corrected_value = 371400 

# Change the value in the DataFrame
nfl_df.at[row_index, column_name] = corrected_value

# Verify the change
print("Updated value in row", row_index, "and column", column_name, ":")
print(nfl_df.at[row_index, column_name])

Updated value in row 2612 and column 1995 :
371400


In [30]:
# Double check value was changed
nfl_df.max()

League         NFL
Player        name
2017      24550000
2016      24200000
2015      23800000
2014      22412500
2013      20850000
2012      20500000
2011      17818000
2010      20320000
2009      23216666
2008      20716666
2007      16854168
2006      13828590
2005      14600000
2004      12400000
2003      15375333
2002      12926181
2001      10063210
2000       8637857
1999       1882142
1998       7624999
1997          <NA>
1996       5926666
1995       4500000
1994       4500000
1993       5600000
1992          <NA>
1991       2725000
dtype: object

# MLB Data Cleaning

In [31]:
# Read in csv for the MLB player salary data
# UTF-8 default encoding was not reading, encoding='ISO-8859-1' likely needed for all csv's from the data source
mlb_csv = '../data/raw/MLB_PlayerByYear.csv'
mlb_data = pd.read_csv(mlb_csv, sep=';', encoding='ISO-8859-1')

# Create DataFrame
mlb_df = pd.DataFrame(mlb_data)
mlb_df

Unnamed: 0,Player,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,Abel De Los Santos,,507.500,507.500,,,,,,,,,,,,,,,
1,A.J. Achter,,507.500,507.500,500.000,,,,,,,,,,,,,,
2,A.J. Alexy,,600.000,,,,,,,,,,,,,,,,
3,A.J. Bogucki,,150.000,,,,,,,,,,,,,,,,
4,A.J. Burnett,,,8.500.000,11.250.000,16.500.000,16.500.000,16.500.000,16.500.000,16.500.000,13.200.000,13.200.000,2.200.000,3.750.000,2.500.000,2.500.000,367.500,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3946,Zack Weiss,,,,,180.000,,,,,,,,,,,,,
3947,Zack Wheeler,800.000,546.250,546.250,512.375,490.000,,,,3.300.000,,,,,,,,,
3948,Zeke Spruill,,,,500.000,490.000,,,,,,,,,,,,,
3949,Zelous Wheeler,,,,500.000,,,,,,,,,,,,,,


In [32]:
# Analyze MLB data
mlb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3951 entries, 0 to 3950
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Player  3951 non-null   object
 1   2017    1811 non-null   object
 2   2016    1746 non-null   object
 3   2015    1462 non-null   object
 4   2014    1283 non-null   object
 5   2013    1402 non-null   object
 6   2012    1433 non-null   object
 7   2011    1353 non-null   object
 8   2010    977 non-null    object
 9   2009    929 non-null    object
 10  2008    728 non-null    object
 11  2007    622 non-null    object
 12  2006    509 non-null    object
 13  2005    396 non-null    object
 14  2004    282 non-null    object
 15  2003    222 non-null    object
 16  2002    166 non-null    object
 17  2001    110 non-null    object
 18  2000    84 non-null     object
dtypes: object(19)
memory usage: 586.6+ KB


In [33]:
# Define numeric columns to apply function to 
num_columns = mlb_df.columns.difference(['Player'])

# Define a function to remove the first two periods
def remove_periods(value):
    if isinstance(value, str):  # Check if value is a string
        return value.replace('.', '', 2)
    elif pd.notnull(value):  # Check if value is not null
        return str(value).replace('.', '', 2)  # Convert numeric value to string and remove periods
    else:
        return value  # Return value as is if it's null

# Apply the function to all specified columns
mlb_df[num_columns] = mlb_df[num_columns].applymap(remove_periods)


In [34]:
mlb_df.head()

Unnamed: 0,Player,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,Abel De Los Santos,,507500.0,507500.0,,,,,,,,,,,,,,,
1,A.J. Achter,,507500.0,507500.0,500000.0,,,,,,,,,,,,,,
2,A.J. Alexy,,600000.0,,,,,,,,,,,,,,,,
3,A.J. Bogucki,,150000.0,,,,,,,,,,,,,,,,
4,A.J. Burnett,,,8500000.0,11250000.0,16500000.0,16500000.0,16500000.0,16500000.0,16500000.0,13200000.0,13200000.0,2200000.0,3750000.0,2500000.0,2500000.0,367500.0,,


In [35]:
# Fucntion to convert Year columns to numeric 
def convert_to_numeric(col):
    if col.name != "Player":
        return pd.to_numeric(col, errors='coerce')
    else:
        return col

# Apply the function to all salary columns
mlb_df = mlb_df.apply(convert_to_numeric)

# Check data types
mlb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3951 entries, 0 to 3950
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  3951 non-null   object 
 1   2017    1811 non-null   float64
 2   2016    1746 non-null   float64
 3   2015    1462 non-null   float64
 4   2014    1283 non-null   float64
 5   2013    1402 non-null   float64
 6   2012    1433 non-null   float64
 7   2011    1353 non-null   float64
 8   2010    977 non-null    float64
 9   2009    929 non-null    float64
 10  2008    728 non-null    float64
 11  2007    622 non-null    float64
 12  2006    509 non-null    float64
 13  2005    396 non-null    float64
 14  2004    282 non-null    float64
 15  2003    222 non-null    float64
 16  2002    166 non-null    float64
 17  2001    110 non-null    float64
 18  2000    84 non-null     float64
dtypes: float64(18), object(1)
memory usage: 586.6+ KB


In [36]:
# Check max values on newly converted numeric columns 
mlb_df.max()

Player    Zoilo Almonte
2017         35571428.0
2016         34571428.0
2015         32571428.0
2014         26000000.0
2013         29000000.0
2012         30000000.0
2011         32000000.0
2010         33000000.0
2009         33000000.0
2008         28000000.0
2007         24700000.0
2006         22275000.0
2005         24700000.0
2004         25200000.0
2003         21275000.0
2002         19400000.0
2001         20250000.0
2000         13650000.0
dtype: object

## Observations of Data
1. data seems to be similar to NFL (object columns need to be converted to numeric)
2. periods used to seperate 1000's (need to be removed)

In [37]:
# Cell to round salary values to whole numbers (Same code as above cleaning, to match format)

# Round and convert only the numeric columns (excluding 'Player' column)
mlb_df[num_columns] = mlb_df[num_columns].apply(lambda x: round(x).astype('Int64'))

# # Display Rounded DataFrame
mlb_df.head() 

Unnamed: 0,Player,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,Abel De Los Santos,,507500.0,507500.0,,,,,,,,,,,,,,,
1,A.J. Achter,,507500.0,507500.0,500000.0,,,,,,,,,,,,,,
2,A.J. Alexy,,600000.0,,,,,,,,,,,,,,,,
3,A.J. Bogucki,,150000.0,,,,,,,,,,,,,,,,
4,A.J. Burnett,,,8500000.0,11250000.0,16500000.0,16500000.0,16500000.0,16500000.0,16500000.0,13200000.0,13200000.0,2200000.0,3750000.0,2500000.0,2500000.0,367500.0,,


In [38]:
# review of data
mlb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3951 entries, 0 to 3950
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Player  3951 non-null   object
 1   2017    1811 non-null   Int64 
 2   2016    1746 non-null   Int64 
 3   2015    1462 non-null   Int64 
 4   2014    1283 non-null   Int64 
 5   2013    1402 non-null   Int64 
 6   2012    1433 non-null   Int64 
 7   2011    1353 non-null   Int64 
 8   2010    977 non-null    Int64 
 9   2009    929 non-null    Int64 
 10  2008    728 non-null    Int64 
 11  2007    622 non-null    Int64 
 12  2006    509 non-null    Int64 
 13  2005    396 non-null    Int64 
 14  2004    282 non-null    Int64 
 15  2003    222 non-null    Int64 
 16  2002    166 non-null    Int64 
 17  2001    110 non-null    Int64 
 18  2000    84 non-null     Int64 
dtypes: Int64(18), object(1)
memory usage: 656.1+ KB


In [39]:
# Re-Check Max values to verify numbers were rounding successfully
mlb_df.max()

Player    Zoilo Almonte
2017           35571428
2016           34571428
2015           32571428
2014           26000000
2013           29000000
2012           30000000
2011           32000000
2010           33000000
2009           33000000
2008           28000000
2007           24700000
2006           22275000
2005           24700000
2004           25200000
2003           21275000
2002           19400000
2001           20250000
2000           13650000
dtype: object

In [40]:
# Add 'League' column, for purposes of the final merged DataFrame showing the league of each player

# Create and insert 'League'column
mlb_df.insert(insert_pos, 'League', 'MLB')

# Display DF to verify column addition
mlb_df.head()

Unnamed: 0,League,Player,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,MLB,Abel De Los Santos,,507500.0,507500.0,,,,,,,,,,,,,,,
1,MLB,A.J. Achter,,507500.0,507500.0,500000.0,,,,,,,,,,,,,,
2,MLB,A.J. Alexy,,600000.0,,,,,,,,,,,,,,,,
3,MLB,A.J. Bogucki,,150000.0,,,,,,,,,,,,,,,,
4,MLB,A.J. Burnett,,,8500000.0,11250000.0,16500000.0,16500000.0,16500000.0,16500000.0,16500000.0,13200000.0,13200000.0,2200000.0,3750000.0,2500000.0,2500000.0,367500.0,,


In [41]:
# Final data information check
mlb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3951 entries, 0 to 3950
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   League  3951 non-null   object
 1   Player  3951 non-null   object
 2   2017    1811 non-null   Int64 
 3   2016    1746 non-null   Int64 
 4   2015    1462 non-null   Int64 
 5   2014    1283 non-null   Int64 
 6   2013    1402 non-null   Int64 
 7   2012    1433 non-null   Int64 
 8   2011    1353 non-null   Int64 
 9   2010    977 non-null    Int64 
 10  2009    929 non-null    Int64 
 11  2008    728 non-null    Int64 
 12  2007    622 non-null    Int64 
 13  2006    509 non-null    Int64 
 14  2005    396 non-null    Int64 
 15  2004    282 non-null    Int64 
 16  2003    222 non-null    Int64 
 17  2002    166 non-null    Int64 
 18  2001    110 non-null    Int64 
 19  2000    84 non-null     Int64 
dtypes: Int64(18), object(2)
memory usage: 686.9+ KB


# Data Merging 
1. first draft of data cleaning complete
2. All 4 leagues will now be merged into one DataFrame for further project work

In [42]:
# Begin with merging NBA and NFL as there columns match identically
merged_nba_nfl = pd.merge(nba_df, nfl_df, on=['Player', 'League'], how='outer')

# Show DF and shape
merged_nba_nfl

Unnamed: 0,League,Player,2017_x,2016_x,2015_x,2014_x,2013_x,2012_x,2011_x,2010_x,...,2000_y,1999_y,1998_y,1997_y,1996_y,1995_y,1994_y,1993_y,1992_y,1991_y
0,NBA,A.C. Green,,,,,,,,,...,,,,,,,,,,
1,NBA,A.J. Bramlett,,,,,,,,,...,,,,,,,,,,
2,NBA,A.J. English,,,,,,,,,...,,,,,,,,,,
3,NBA,A.J. Guyton,,,,,,,,,...,,,,,,,,,,
4,NBA,A.J. Price,,,281484,947907,885120,854389,762195,457588,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7889,NFL,Zay Jones,,,,,,,,,...,,,,,,,,,,
7890,NFL,Zebrie Sanders,,,,,,,,,...,,,,,,,,,,
7891,NFL,Zeke Motta,,,,,,,,,...,,,,,,,,,,
7892,NFL,Zoltan Mesko,,,,,,,,,...,,,,,,,,,,


## Observations 
1. Merging is duplicating all year columns in suffix differentiating format, which I do not want 
2. pd.concat() may be the easier option 

In [43]:
# FIRST concatenation will be NBA and NFL as they possess matching columns 
concat_nba_nfl = pd.concat([nba_df, nfl_df], axis=0) # 0 concatenates along rows, as column # and names match

# Show DF and shape
concat_nba_nfl
# NBA 2204 rows + NFL 5690 rows = expected total 7894


Unnamed: 0,League,Player,2017,2016,2015,2014,2013,2012,2011,2010,...,2000,1999,1998,1997,1996,1995,1994,1993,1992,1991
0,NBA,A.C. Green,,,,,,,,,...,1700000,5125088,5095088,4851000,6473000,6472600,1885000,1750000,1750000,1750000
1,NBA,A.J. Bramlett,,,,,,,,,...,118974,,,,,,,,,
2,NBA,A.J. English,,,,,,,,,...,,,,,,,150000,406000,325000,275000
3,NBA,A.J. Guyton,,,,,,,,,...,,,,,,,,,,
4,NBA,A.J. Price,,,281484,947907,885120,854389,762195,457588,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5685,NFL,Zay Jones,1232696,,,,,,,,...,,,,,,,,,,
5686,NFL,Zebrie Sanders,,,,,,322625,,,...,,,,,,,,,,
5687,NFL,Zeke Motta,,,,329474,416474,,,,...,,,,,,,,,,
5688,NFL,Zoltan Mesko,,,,,,586812,496812,366812,...,,,,,,,,,,


In [44]:
# SECOND, concatenating NHL and MLB along rows as there columns match identically
concat_nhl_mlb = pd.concat([nhl_df, mlb_df], axis=0)

# Show DF and shape
concat_nhl_mlb
# NHL 1876 rows + MLB 3951 rows = expected total 5827

Unnamed: 0,League,Player,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,NHL,A.J. Greer,741666,741666,,,,,,,,,,,,,,,,
1,NHL,Aaron Dell,625000,625000,,,,,,,,,,,,,,,,
2,NHL,Aaron Ekblad,7500000,925000,925000,1775000,,,,,,,,,,,,,,
3,NHL,Aaron Johnson,,,,,,650000,550000,,,,,,,,,,,
4,NHL,Aaron Ness,612500,612500,,,,875000,875000,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3946,MLB,Zack Weiss,,,,,180000,,,,,,,,,,,,,
3947,MLB,Zack Wheeler,800000,546250,546250,512375,490000,,,,3300000,,,,,,,,,
3948,MLB,Zeke Spruill,,,,500000,490000,,,,,,,,,,,,,
3949,MLB,Zelous Wheeler,,,,500000,,,,,,,,,,,,,,


In [45]:
# FINAL, concatenation of both concatenated DataFrames

# Creating a DF will all league salary data
final_all_leagues = pd.concat([concat_nba_nfl, concat_nhl_mlb], axis=0)

# Show DF and shape
final_all_leagues
# (NBA & NFl 7894 rows) + (NHL & MLB 5827 rows) = expected total 13721

Unnamed: 0,League,Player,2017,2016,2015,2014,2013,2012,2011,2010,...,2000,1999,1998,1997,1996,1995,1994,1993,1992,1991
0,NBA,A.C. Green,,,,,,,,,...,1700000,5125088,5095088,4851000,6473000,6472600,1885000,1750000,1750000,1750000
1,NBA,A.J. Bramlett,,,,,,,,,...,118974,,,,,,,,,
2,NBA,A.J. English,,,,,,,,,...,,,,,,,150000,406000,325000,275000
3,NBA,A.J. Guyton,,,,,,,,,...,,,,,,,,,,
4,NBA,A.J. Price,,,281484,947907,885120,854389,762195,457588,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3946,MLB,Zack Weiss,,,,,180000,,,,...,,,,,,,,,,
3947,MLB,Zack Wheeler,800000,546250,546250,512375,490000,,,,...,,,,,,,,,,
3948,MLB,Zeke Spruill,,,,500000,490000,,,,...,,,,,,,,,,
3949,MLB,Zelous Wheeler,,,,500000,,,,,...,,,,,,,,,,


In [46]:
# Converting "wide" format data structure to "long" with pandas melt
id_vars = ['League', 'Player']  # Columns to keep as is
value_vars = [col for col in final_all_leagues.columns if col not in id_vars]  # Columns to melt

# Reshape the Dataframe
all_leagues_melted = pd.melt(final_all_leagues, id_vars=id_vars, value_vars=value_vars, var_name='Year', value_name='Salary')

#Display reshaped DataFrame
all_leagues_melted

Unnamed: 0,League,Player,Year,Salary
0,NBA,A.C. Green,2017,
1,NBA,A.J. Bramlett,2017,
2,NBA,A.J. English,2017,
3,NBA,A.J. Guyton,2017,
4,NBA,A.J. Price,2017,
...,...,...,...,...
370462,MLB,Zack Weiss,1991,
370463,MLB,Zack Wheeler,1991,
370464,MLB,Zeke Spruill,1991,
370465,MLB,Zelous Wheeler,1991,


In [47]:
# Review values in Salary column
all_leagues_melted.value_counts() 

League  Player               Year  Salary 
MLB      Abel De Los Santos  2015  507500     1
NFL     Kelvin Benjamin      2015  1741875    1
        Kellen Winslow       2011  8290000    1
                             2013  424412     1
        Kelly Holcomb        1995  56100      1
                                             ..
NBA     Jared Cunningham     2013  1156320    1
                             2014  1208400    1
                             2015  915243     1
                             2016  56052      1
NHL     Zenon Konopka        2013  479103     1
Length: 62433, dtype: int64

In [48]:
# Locate incorrectly cleaned high salary values
high_values = all_leagues_melted.value_counts() > 100000000


print(high_values)

League  Player               Year  Salary 
MLB      Abel De Los Santos  2015  507500     False
NFL     Kelvin Benjamin      2015  1741875    False
        Kellen Winslow       2011  8290000    False
                             2013  424412     False
        Kelly Holcomb        1995  56100      False
                                              ...  
NBA     Jared Cunningham     2013  1156320    False
                             2014  1208400    False
                             2015  915243     False
                             2016  56052      False
NHL     Zenon Konopka        2013  479103     False
Length: 62433, dtype: bool


In [49]:
# Information from melted long  and wide DataFrames
print(all_leagues_melted.info())
print(final_all_leagues.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370467 entries, 0 to 370466
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   League  370467 non-null  object
 1   Player  370467 non-null  object
 2   Year    370467 non-null  object
 3   Salary  62433 non-null   object
dtypes: object(4)
memory usage: 11.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 13721 entries, 0 to 3950
Data columns (total 29 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   League  13721 non-null  object
 1   Player  13721 non-null  object
 2   2017    5471 non-null   Int64 
 3   2016    5380 non-null   Int64 
 4   2015    5016 non-null   Int64 
 5   2014    4618 non-null   Int64 
 6   2013    4688 non-null   Int64 
 7   2012    4878 non-null   Int64 
 8   2011    4693 non-null   Int64 
 9   2010    4005 non-null   Int64 
 10  2009    3666 non-null   Int64 
 11  2008    3214 non-null   Int64 
 12  2007 

In [50]:
# Convert 'Year' and 'Salary' back to Integers 
melted_num_columns = ['Year','Salary']

all_leagues_melted[melted_num_columns] = all_leagues_melted[melted_num_columns].astype('Int64')

# Re-Check data types for conversion
all_leagues_melted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370467 entries, 0 to 370466
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   League  370467 non-null  object
 1   Player  370467 non-null  object
 2   Year    370467 non-null  Int64 
 3   Salary  62433 non-null   Int64 
dtypes: Int64(2), object(2)
memory usage: 12.0+ MB


In [51]:
# Identify row with incorrect min salary 333 --> should be 333000

min_value_index = all_leagues_melted['Salary'].idxmin()

row_with_min = all_leagues_melted.iloc[min_value_index]

print(row_with_min)

League                 NFL
Player    Terrence Johnson
Year                  2010
Salary                 333
Name: 103437, dtype: object


In [52]:
# Correct above value

# Set location and correct value
min_row = 103437
column_position = 'Salary'
correction = 333000

# Alter dataframe accordingly
all_leagues_melted.at[min_row, column_position] = correction

# Verify the change
print("Updated value in row", min_row, "and column", column_position, ":")
print(all_leagues_melted.at[min_row, column_position])

Updated value in row 103437 and column Salary :
333000


In [53]:
# Re-Check for Min value
print(all_leagues_melted['Salary'].idxmin())

196846


In [54]:
# Check Salary column for incorrect 3 digit salaries
col_to_check = 'Salary'

low_digit_rows = all_leagues_melted[all_leagues_melted[col_to_check].astype(str).str.len() == 3]

print('Rows with only 3 digits in Salary column:')
print(low_digit_rows)

Rows with only 3 digits in Salary column:
       League       Player  Year  Salary
196846    NFL  Jason McKie  2003     334


In [55]:
# Fix Row, spotrac.com lists his 2003 salary at $105882
row_to_fix = 196846
col_to_fix = 'Salary'
correct_value = 105882

all_leagues_melted.at[row_to_fix, col_to_fix] = correct_value

# Search for player to verfiy change
player_name = 'Jason McKie'  
league_name = 'NFL'  
year_value = 2003  

# Search for rows where Player, League, and Year columns match the specified values
filtered_rows = all_leagues_melted[(all_leagues_melted['Player'] == player_name) & (all_leagues_melted['League'] == league_name) & (all_leagues_melted['Year'] == year_value)]

# Verify change
print('Expected salary shown should be: 105882, actual value is:')
print(filtered_rows)

Expected salary shown should be: 105882, actual value is:
       League       Player  Year  Salary
196846    NFL  Jason McKie  2003  105882


**Export cleaned & re-shaped Data to processed folder**

In [56]:
# Save both version (wide & long) as CSVs to processed folder

# Wide version
final_all_leagues.to_csv('../data/processed/all_leagues_wide.csv', index=False)

# Long version
all_leagues_melted.to_csv('../data/processed/all_leagues_long.csv', index=False)

# Team Data Cleaning

## MLB Team Data

In [57]:
# Read in MLB team data
team_mlb_csv = '../data/raw/MLB_TeamByYear.csv'
team_mlb = pd.read_csv(team_mlb_csv, sep=';')

# Create DataFrame
team_mlb_df = pd.DataFrame(team_mlb)
team_mlb_df.head()

Unnamed: 0,team,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,arizona diamondbacks,132080400,93870000,69764831,89443700,87788000,78442066,66360316,59453666,71044333,91377793,50423879,27539000,7505000,17105750,20991666,26991666,20688000,19300000
1,atlanta braves,113898391,97375791,82066982,109268374,104318533,91401666,109294516,111608910,93752832,65790166,57713749,49984166,30924833,23903333,31923333,38477033,34216333,27338237
2,baltimore orioles,194737583,168157381,108338838,123376458,117209833,86289833,100595888,84997445,58260083,40668166,42468666,39034000,21893000,10615000,8500000,532500,600000,0
3,boston red sox,209084178,232959928,182184365,129485728,190154000,183150118,178305675,156725384,104605900,113490750,125504449,80719984,80358125,84075000,68489500,57200000,34400000,17020000
4,chicago cubs,187196280,184953447,120995925,64937523,83501423,91607066,132046096,132027537,116929833,97179834,62236583,54767833,50468000,32377000,8645000,5247000,2500000,200000


In [58]:
# Review data info
team_mlb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   team    32 non-null     object
 1   2017    32 non-null     int64 
 2   2016    32 non-null     int64 
 3   2015    32 non-null     int64 
 4   2014    32 non-null     int64 
 5   2013    32 non-null     int64 
 6   2012    32 non-null     int64 
 7   2011    32 non-null     int64 
 8   2010    32 non-null     int64 
 9   2009    32 non-null     int64 
 10  2008    32 non-null     int64 
 11  2007    32 non-null     int64 
 12  2006    32 non-null     int64 
 13  2005    32 non-null     int64 
 14  2004    32 non-null     int64 
 15  2003    32 non-null     int64 
 16  2002    32 non-null     int64 
 17  2001    32 non-null     int64 
 18  2000    32 non-null     int64 
dtypes: int64(18), object(1)
memory usage: 4.9+ KB


In [76]:
# Insert 'League' column for later filtering needs by league once merged
# Create and insert 'League'column
team_mlb_df.insert(insert_pos, 'League', 'MLB')  

# Verify success
team_mlb_df.head()

Unnamed: 0,League,team,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,MLB,arizona diamondbacks,132080400,93870000,69764831,89443700,87788000,78442066,66360316,59453666,71044333,91377793,50423879,27539000,7505000,17105750,20991666,26991666,20688000,19300000
1,MLB,atlanta braves,113898391,97375791,82066982,109268374,104318533,91401666,109294516,111608910,93752832,65790166,57713749,49984166,30924833,23903333,31923333,38477033,34216333,27338237
2,MLB,baltimore orioles,194737583,168157381,108338838,123376458,117209833,86289833,100595888,84997445,58260083,40668166,42468666,39034000,21893000,10615000,8500000,532500,600000,0
3,MLB,boston red sox,209084178,232959928,182184365,129485728,190154000,183150118,178305675,156725384,104605900,113490750,125504449,80719984,80358125,84075000,68489500,57200000,34400000,17020000
4,MLB,chicago cubs,187196280,184953447,120995925,64937523,83501423,91607066,132046096,132027537,116929833,97179834,62236583,54767833,50468000,32377000,8645000,5247000,2500000,200000


## NBA Team Data

In [69]:
# Read in NBA Team data
team_nba_csv = '../data/raw/NBA_TeamByYear.csv'
team_nba = pd.read_csv(team_nba_csv, sep=';')

# Create DataFrame
team_nba_df = pd.DataFrame(team_nba)
team_nba_df.head()

Unnamed: 0,Team,2017,2016,2015,2014,2013,2012,2011,2010,2009,...,2000,1999,1998,1997,1996,1995,1994,1993,1992,1991
0,Salary Cap,94.143.000,70.000.000,63.065.000,58.679.000,58.044.000,58.044.000,58.044.000,57.700.000,58.680.000,...,34.000.000,30.000.000,26.900.000,24.363.000,23.000.000,15.964.000,15.175.000,14.000.000,12.500.000,11.871.000
1,Atlanta Hawks,95.471.579,72.902.950,58.470.278,58.936.376,66.768.365,73.669.912,71.469.843,65.883.642,68.168.841,...,44.169.202,40.954.940,30.133.827,25.806.100,22.227.000,21.869.340,22.510.000,18.036.000,12.930.000,11.761.000
2,Boston Celtics,87.272.327,77.139.134,62.442.955,70.676.163,73.021.989,79.820.530,82.045.867,83.552.174,79.188.973,...,46.152.875,32.258.856,26.800.500,25.853.500,20.219.000,24.462.000,17.881.000,25.217.000,25.343.000,11.256.000
3,Brooklyn Nets,72.926.799,88.346.807,92.172.817,107.358.841,85.282.885,,,,,...,,,,,,,,,,
4,Charlotte Bobcats,,,,72.426.678,57.491.899,57.902.024,66.542.696,68.681.758,62.866.926,...,,,,,,,,,,


In [70]:
# Review data info
team_nba_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Team    37 non-null     object
 1   2017    31 non-null     object
 2   2016    31 non-null     object
 3   2015    31 non-null     object
 4   2014    31 non-null     object
 5   2013    31 non-null     object
 6   2012    31 non-null     object
 7   2011    31 non-null     object
 8   2010    31 non-null     object
 9   2009    31 non-null     object
 10  2008    31 non-null     object
 11  2007    31 non-null     object
 12  2006    31 non-null     object
 13  2005    31 non-null     object
 14  2004    30 non-null     object
 15  2003    30 non-null     object
 16  2002    30 non-null     object
 17  2001    30 non-null     object
 18  2000    30 non-null     object
 19  1999    30 non-null     object
 20  1998    30 non-null     object
 21  1997    30 non-null     object
 22  1996    30 non-null     ob

### Info Review - NBA
* can see the year columns are 'string' types, due to the period separators in the total team spend values
    * periods will need to be removed
    * data types will need to be converted to Int64 to match other league data

In [71]:
# Apply the remove_periods function to all columns except 'Team' Column
team_nba_df.loc[:, team_nba_df.columns.difference(['Team'])] = team_nba_df.loc[:, team_nba_df.columns.difference(['Team'])].applymap(remove_periods)

# Verify DF was modified correctly
team_nba_df.head()


Unnamed: 0,Team,2017,2016,2015,2014,2013,2012,2011,2010,2009,...,2000,1999,1998,1997,1996,1995,1994,1993,1992,1991
0,Salary Cap,94143000.0,70000000.0,63065000.0,58679000,58044000,58044000.0,58044000.0,57700000.0,58680000.0,...,34000000.0,30000000.0,26900000.0,24363000.0,23000000.0,15964000.0,15175000.0,14000000.0,12500000.0,11871000.0
1,Atlanta Hawks,95471579.0,72902950.0,58470278.0,58936376,66768365,73669912.0,71469843.0,65883642.0,68168841.0,...,44169202.0,40954940.0,30133827.0,25806100.0,22227000.0,21869340.0,22510000.0,18036000.0,12930000.0,11761000.0
2,Boston Celtics,87272327.0,77139134.0,62442955.0,70676163,73021989,79820530.0,82045867.0,83552174.0,79188973.0,...,46152875.0,32258856.0,26800500.0,25853500.0,20219000.0,24462000.0,17881000.0,25217000.0,25343000.0,11256000.0
3,Brooklyn Nets,72926799.0,88346807.0,92172817.0,107358841,85282885,,,,,...,,,,,,,,,,
4,Charlotte Bobcats,,,,72426678,57491899,57902024.0,66542696.0,68681758.0,62866926.0,...,,,,,,,,,,


In [72]:
# Convert numeric columns (all year columns) to Numeric type - Int64
team_nba_df.loc[:, team_nba_df.columns.difference(['Team'])] = team_nba_df.loc[:, team_nba_df.columns.difference(['Team'])].astype('Int64')

# Check success of data type conversion 
team_nba_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Team    37 non-null     object
 1   2017    31 non-null     object
 2   2016    31 non-null     object
 3   2015    31 non-null     object
 4   2014    31 non-null     object
 5   2013    31 non-null     object
 6   2012    31 non-null     object
 7   2011    31 non-null     object
 8   2010    31 non-null     object
 9   2009    31 non-null     object
 10  2008    31 non-null     object
 11  2007    31 non-null     object
 12  2006    31 non-null     object
 13  2005    31 non-null     object
 14  2004    30 non-null     object
 15  2003    30 non-null     object
 16  2002    30 non-null     object
 17  2001    30 non-null     object
 18  2000    30 non-null     object
 19  1999    30 non-null     object
 20  1998    30 non-null     object
 21  1997    30 non-null     object
 22  1996    30 non-null     ob

Above code did not work, .loc[] and .astype() method unsuccessful 

In [74]:
# Set difference by excluding Team column & convert to numeric
nba_num_columns = team_nba_df.columns.difference(['Team'])
team_nba_df[nba_num_columns] = team_nba_df[nba_num_columns].apply(pd.to_numeric, errors='coerce')

# Specify numeric type as Int64
team_nba_df[nba_num_columns] = team_nba_df[nba_num_columns].astype('Int64')
# Check success of conversion
team_nba_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Team    37 non-null     object
 1   2017    31 non-null     Int64 
 2   2016    31 non-null     Int64 
 3   2015    31 non-null     Int64 
 4   2014    31 non-null     Int64 
 5   2013    31 non-null     Int64 
 6   2012    31 non-null     Int64 
 7   2011    31 non-null     Int64 
 8   2010    31 non-null     Int64 
 9   2009    31 non-null     Int64 
 10  2008    31 non-null     Int64 
 11  2007    31 non-null     Int64 
 12  2006    31 non-null     Int64 
 13  2005    31 non-null     Int64 
 14  2004    30 non-null     Int64 
 15  2003    30 non-null     Int64 
 16  2002    30 non-null     Int64 
 17  2001    30 non-null     Int64 
 18  2000    30 non-null     Int64 
 19  1999    30 non-null     Int64 
 20  1998    30 non-null     Int64 
 21  1997    30 non-null     Int64 
 22  1996    30 non-null     In

In [75]:
# Insert 'League' column for later filtering needs by league once merged
# Create and insert 'League'column
team_nba_df.insert(insert_pos, 'League', 'NBA')  

# Verify success
team_nba_df.head()

Unnamed: 0,League,Team,2017,2016,2015,2014,2013,2012,2011,2010,...,2000,1999,1998,1997,1996,1995,1994,1993,1992,1991
0,NBA,Salary Cap,94143000.0,70000000.0,63065000.0,58679000,58044000,58044000.0,58044000.0,57700000.0,...,34000000.0,30000000.0,26900000.0,24363000.0,23000000.0,15964000.0,15175000.0,14000000.0,12500000.0,11871000.0
1,NBA,Atlanta Hawks,95471579.0,72902950.0,58470278.0,58936376,66768365,73669912.0,71469843.0,65883642.0,...,44169202.0,40954940.0,30133827.0,25806100.0,22227000.0,21869340.0,22510000.0,18036000.0,12930000.0,11761000.0
2,NBA,Boston Celtics,87272327.0,77139134.0,62442955.0,70676163,73021989,79820530.0,82045867.0,83552174.0,...,46152875.0,32258856.0,26800500.0,25853500.0,20219000.0,24462000.0,17881000.0,25217000.0,25343000.0,11256000.0
3,NBA,Brooklyn Nets,72926799.0,88346807.0,92172817.0,107358841,85282885,,,,...,,,,,,,,,,
4,NBA,Charlotte Bobcats,,,,72426678,57491899,57902024.0,66542696.0,68681758.0,...,,,,,,,,,,


## NFL Team Data

In [62]:
# Read in NFL Team Data
team_nfl_csv = '../data/raw/NFL_TeamByYear.csv'
team_nfl = pd.read_csv(team_nfl_csv, sep=';')

#Create DataFrame
team_nfl_df = pd.DataFrame(team_nfl)
team_nfl_df.head()

Unnamed: 0,Team,2017,2016,2015,2014,2013,2012,2011,2010,2009,...,2000,1999,1998,1997,1996,1995,1994,1993,1992,1991
0,arizona cardinals,161695128,142050412,131898567,117186279,101554783,118319856,112570561,87447659,102466299,...,17193499,0,15611366,0,4143666,4328834,6224665,6185002,0,2583332
1,atlanta falcons,157854648,127773922,113766524,121320590,104962454,124269245,123051711,115671068,90402045,...,8605416,0,3208333,0,5086000,2910500,5027089,7208750,0,3942083
2,baltimore ravens,136870254,135617906,117207709,109515752,110323504,121621563,115894675,110785006,91828447,...,12995750,0,3716140,0,6564901,0,0,0,0,0
3,buffalo bills,126167732,133546829,124698408,119736639,96676241,110891698,94786799,94608232,90019378,...,28925070,0,20909394,0,16812930,14154667,11686999,18795416,0,7032166
4,carolina panthers,163318040,126045316,129789582,105757589,91297223,121280383,120892742,77740912,98595879,...,8713900,0,9090000,0,7840000,5632000,0,115312370,0,0


In [63]:
# Review data info
team_nfl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Team    35 non-null     object
 1   2017    35 non-null     int64 
 2   2016    35 non-null     int64 
 3   2015    35 non-null     int64 
 4   2014    35 non-null     int64 
 5   2013    35 non-null     int64 
 6   2012    35 non-null     int64 
 7   2011    35 non-null     int64 
 8   2010    35 non-null     int64 
 9   2009    35 non-null     int64 
 10  2008    35 non-null     int64 
 11  2007    35 non-null     int64 
 12  2006    35 non-null     int64 
 13  2005    35 non-null     int64 
 14  2004    35 non-null     int64 
 15  2003    35 non-null     int64 
 16  2002    35 non-null     int64 
 17  2001    35 non-null     int64 
 18  2000    35 non-null     int64 
 19  1999    35 non-null     int64 
 20  1998    35 non-null     int64 
 21  1997    35 non-null     int64 
 22  1996    35 non-null     int6

In [77]:
# Insert 'League' column for later filtering needs by league once merged
# Create and insert 'League'column
team_nfl_df.insert(insert_pos, 'League', 'NFL')  

# Verify success
team_nfl_df.head()

Unnamed: 0,League,Team,2017,2016,2015,2014,2013,2012,2011,2010,...,2000,1999,1998,1997,1996,1995,1994,1993,1992,1991
0,NFL,arizona cardinals,161695128,142050412,131898567,117186279,101554783,118319856,112570561,87447659,...,17193499,0,15611366,0,4143666,4328834,6224665,6185002,0,2583332
1,NFL,atlanta falcons,157854648,127773922,113766524,121320590,104962454,124269245,123051711,115671068,...,8605416,0,3208333,0,5086000,2910500,5027089,7208750,0,3942083
2,NFL,baltimore ravens,136870254,135617906,117207709,109515752,110323504,121621563,115894675,110785006,...,12995750,0,3716140,0,6564901,0,0,0,0,0
3,NFL,buffalo bills,126167732,133546829,124698408,119736639,96676241,110891698,94786799,94608232,...,28925070,0,20909394,0,16812930,14154667,11686999,18795416,0,7032166
4,NFL,carolina panthers,163318040,126045316,129789582,105757589,91297223,121280383,120892742,77740912,...,8713900,0,9090000,0,7840000,5632000,0,115312370,0,0


## Team NHL Data

In [64]:
# Read in NHL Team Data
team_nhl_csv = '../data/raw/NHL_TeamByYear.csv'
team_nhl = pd.read_csv(team_nhl_csv, sep=';')

#Create DataFrame
team_nhl_df = pd.DataFrame(team_nhl)
team_nhl_df.head()

Unnamed: 0,team,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,anaheim-ducks,82268570,86617361,73981638,58041414,59463769,67064166,54986918,62038195,33069028,43448544,31838898,17178065,4216399,0,2525600,1625000,9234000,17940000
1,arizona-coyotes,63562921,76763342,75116149,42954690,58798944,0,61236666,57211999,41889667,875000,24090000,0,5270400,0,2800000,0,0,0
2,atlanta-thrashers,0,0,0,0,0,0,4300000,42896666,39522633,22925133,15026800,16639300,14789300,2000000,1905000,2130000,1130000,0
3,boston-bruins,82532498,80246666,80612667,66010938,67308559,68889643,65575474,69572142,59474166,50202498,32229898,25112032,12411400,0,9273000,3183333,3183333,1958333
4,buffalo-sabres,75188273,81405814,65649286,42406839,41989515,58394524,65660357,47324523,50790356,39497056,36178923,31066600,10888700,0,5315250,4602500,1050000,650000


### Data Review
* will need to convert 0 values to Nulls to match other league data
    * Otherwise these values will skew the largest during the EDA and Visualizing processes

In [65]:
# Review data info
team_nhl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   team    33 non-null     object
 1   2017    33 non-null     int64 
 2   2016    33 non-null     int64 
 3   2015    33 non-null     int64 
 4   2014    33 non-null     int64 
 5   2013    33 non-null     int64 
 6   2012    33 non-null     int64 
 7   2011    33 non-null     int64 
 8   2010    33 non-null     int64 
 9   2009    33 non-null     int64 
 10  2008    33 non-null     int64 
 11  2007    33 non-null     int64 
 12  2006    33 non-null     int64 
 13  2005    33 non-null     int64 
 14  2004    33 non-null     int64 
 15  2003    33 non-null     int64 
 16  2002    33 non-null     int64 
 17  2001    33 non-null     int64 
 18  2000    33 non-null     int64 
dtypes: int64(18), object(1)
memory usage: 5.0+ KB


In [78]:
# Insert 'League' column for later filtering needs by league once merged
# Create and insert 'League'column
team_nhl_df.insert(insert_pos, 'League', 'NHL')  

# Verify success
team_nhl_df.head()

Unnamed: 0,League,team,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,NHL,anaheim-ducks,82268570,86617361,73981638,58041414,59463769,67064166,54986918,62038195,33069028,43448544,31838898,17178065,4216399,0,2525600,1625000,9234000,17940000
1,NHL,arizona-coyotes,63562921,76763342,75116149,42954690,58798944,0,61236666,57211999,41889667,875000,24090000,0,5270400,0,2800000,0,0,0
2,NHL,atlanta-thrashers,0,0,0,0,0,0,4300000,42896666,39522633,22925133,15026800,16639300,14789300,2000000,1905000,2130000,1130000,0
3,NHL,boston-bruins,82532498,80246666,80612667,66010938,67308559,68889643,65575474,69572142,59474166,50202498,32229898,25112032,12411400,0,9273000,3183333,3183333,1958333
4,NHL,buffalo-sabres,75188273,81405814,65649286,42406839,41989515,58394524,65660357,47324523,50790356,39497056,36178923,31066600,10888700,0,5315250,4602500,1050000,650000
