In [1]:
# Import module/s
import pandas as pd
import numpy as np

In [9]:
# Read in NBA csv for player salary data
# Set encoding to ISO-8859-1
nba_csv = '../data/raw/NBA_PlayerByYear.csv'
nba_data = pd.read_csv(nba_csv, sep=';', encoding='ISO-8859-1')

# Create DataFrame
nba_df = pd.DataFrame(nba_data)
nba_df.head()

Unnamed: 0,Player,2017,2016,2015,2014,2013,2012,2011,2010,2009,...,2000,1999,1998,1997,1996,1995,1994,1993,1992,1991
0,A.C. Green,,,,,,,,,,...,1700000.0,5125088.0,5095088.0,4851000.0,6473000.0,6472600.0,1885000.0,1750000.0,1750000.0,1750000.0
1,A.J. Bramlett,,,,,,,,,,...,118974.0,,,,,,,,,
2,A.J. English,,,,,,,,,,...,,,,,,,150000.0,406000.0,325000.0,275000.0
3,A.J. Guyton,,,,,,,,,,...,,,,,,,,,,
4,A.J. Price,,,281484.0,947907.0,885120.0,854389.0,762195.0,457588.0,,...,,,,,,,,,,


In [10]:
# Get info on NBA data
nba_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2204 entries, 0 to 2203
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  2204 non-null   object 
 1   2017    499 non-null    float64
 2   2016    500 non-null    float64
 3   2015    513 non-null    float64
 4   2014    501 non-null    float64
 5   2013    481 non-null    float64
 6   2012    464 non-null    float64
 7   2011    460 non-null    float64
 8   2010    457 non-null    float64
 9   2009    461 non-null    float64
 10  2008    470 non-null    float64
 11  2007    496 non-null    float64
 12  2006    480 non-null    float64
 13  2005    471 non-null    float64
 14  2004    455 non-null    float64
 15  2003    452 non-null    float64
 16  2002    451 non-null    float64
 17  2001    456 non-null    float64
 18  2000    517 non-null    float64
 19  1999    447 non-null    object 
 20  1998    445 non-null    float64
 21  1997    416 non-null    float64
 22  

In [17]:
# Check for 'Unknown' values in the entire DataFrame
unknown_values_mask = nba_df.applymap(lambda x: x == 'Unknown')

# Get the rows and columns where 'Unknown' values are present
unknown_values_locations = (unknown_values_mask).any(axis=1)

# Display rows where 'Unknown' values are present
rows_with_unknown_values = nba_df[unknown_values_locations]
# print(rows_with_unknown_values)

# Display quantity of 'Unknown' values 
total_unknown_values = rows_with_unknown_values.count().sum()
print("Total 'Unknown' values:", total_unknown_values)


Total 'Unknown' values: 0


In [18]:
# Replace 'Unknown's with NaN to match other non-numeric (NaN) values
nba_df = nba_df.replace(to_replace='Unknown', value=np.nan)

# Re-Check 'Unknown' values
print("Total 'Unknown' values:", total_unknown_values)

Total 'Unknown' values: 0


In [20]:
# Convert the object type columns to floats
nba_df['1999'] = nba_df['1999'].astype(float)
nba_df['1996'] = nba_df['1996'].astype(float)

# Re-check data
nba_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2204 entries, 0 to 2203
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  2204 non-null   object 
 1   2017    499 non-null    float64
 2   2016    500 non-null    float64
 3   2015    513 non-null    float64
 4   2014    501 non-null    float64
 5   2013    481 non-null    float64
 6   2012    464 non-null    float64
 7   2011    460 non-null    float64
 8   2010    457 non-null    float64
 9   2009    461 non-null    float64
 10  2008    470 non-null    float64
 11  2007    496 non-null    float64
 12  2006    480 non-null    float64
 13  2005    471 non-null    float64
 14  2004    455 non-null    float64
 15  2003    452 non-null    float64
 16  2002    451 non-null    float64
 17  2001    456 non-null    float64
 18  2000    517 non-null    float64
 19  1999    427 non-null    float64
 20  1998    445 non-null    float64
 21  1997    416 non-null    float64
 22  

In [21]:
# Check if any values have decimal parts
#  Goal is to reformat all salary values into whole numbers with comma seperators for 1000's. Eg: 1,500,000 for 1.5M
has_decimal_values = (nba_df.applymap(lambda x: np.modf(x)[0] != 0)).any().any()

if has_decimal_values:
    print("DataFrame has values with decimal parts.")
    
    #nba_df.round(0, inplace=True)
else:
    print("DataFrame does not have values with decimal parts.")


TypeError: ufunc 'modf' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
# Modifies the salary values to exclude the decimal point, aiming to make all values in all 4 leagues whole numbers for cleanliness


In [None]:
# Read in csv for the NHL player salary data
# Utilize encoding='ISO-8859-1'
nhl_csv = '../data/raw/NHL_PlayerByYear.csv'
nhl_data = pd.read_csv(nhl_csv, sep=';', encoding='ISO-8859-1')

# Create DataFrame
nhl_df = pd.DataFrame(nhl_data)
nhl_df.head()

In [None]:
# Read in csv for NFL player salary data
# Utilize ISO-8859-1 encoding 
nfl_csv = '../data/raw/NFL_PlayerByYear.csv'
nfl_data = pd.read_csv(nfl_csv, sep=';', encoding='ISO-8859-1')

# Create DataFrame 
nfl_df = pd.DataFrame(nfl_data)
nfl_df.head()

In [None]:
# Read in csv for the MLB player salary data
# UTF-8 default encoding was not reading, encoding='ISO-8859-1' likely needed for all csv's from the data source
mlb_csv = '../data/raw/MLB_PlayerByYear.csv'
mlb_data = pd.read_csv(mlb_csv, sep=';', encoding='ISO-8859-1')

# Create DataFrame
mlb_df = pd.DataFrame(mlb_data)
mlb_df.head()