In [104]:
import pandas as pd

teams = pd.read_csv('data/original/teams.csv')
players = pd.read_csv('data/original/players.csv')
coaches = pd.read_csv('data/original/coaches.csv')
players_teams = pd.read_csv('data/original/players_teams.csv')
series_post = pd.read_csv('data/original/series_post.csv')
awards_players = pd.read_csv('data/original/awards_players.csv')
teams_post = pd.read_csv('data/original/teams_post.csv')

In [105]:
# Function to get columns with only one value
def get_empty_columns(data):
    for column in data.columns:
        if data[column].nunique() == 1:
            print("Column with " + str(data[column].nunique()) + " unique values: " + column)

## Cleaning teams.csv

In [106]:
get_empty_columns(teams)

Column with 1 unique values: lgID
Column with 1 unique values: seeded
Column with 1 unique values: tmORB
Column with 1 unique values: tmDRB
Column with 1 unique values: tmTRB
Column with 1 unique values: opptmORB
Column with 1 unique values: opptmDRB
Column with 1 unique values: opptmTRB


In [107]:
# Drop the columns that are not needed

teams = teams.drop(columns=['lgID', 'franchID', 'divID', 'seeded', 'arena', 'name', 'tmORB', 'tmDRB', 'tmTRB', 'opptmORB', 'opptmDRB', 'opptmTRB'])

## Cleaning players.csv

In [108]:
get_empty_columns(players)

Column with 1 unique values: firstseason
Column with 1 unique values: lastseason


In [109]:
# Drop the columns that are not needed

players = players.drop(columns=['firstseason', 'lastseason', 'college', 'collegeOther'])


In [110]:
def convert_height_to_cm(height):
    height = int(height)
    return int(height * 2.54)

def convert_weight_to_kg(weight):
    weight = int(weight)
    return int(weight * 0.453592)

players['height'] = players['height'].apply(convert_height_to_cm)
players['weight'] = players['weight'].apply(convert_weight_to_kg)

In [111]:
# check if any ids from players are not in players_teams and drop them from players if their weight is 0

players_ids = players['bioID'].unique()
players_teams_ids = players_teams['playerID'].unique()

for player_id in players_ids:
    if player_id not in players_teams_ids:
        # Drop only if weight and height are 0 and birthDate is 0000-00-00
        if players.loc[players['bioID'] == player_id, 'weight'].values[0] == 0 and players.loc[players['bioID'] == player_id, 'height'].values[0] == 0 and players.loc[players['bioID'] == player_id, 'birthDate'].values[0] == '0000-00-00':
            players = players[players.bioID != player_id]
    

In [126]:
# check if any players have 0 height or weight

print(players[players['pos'].isnull()])

# Outliers

players.describe()

Empty DataFrame
Columns: [bioID, pos, height, weight, birthDate, deathDate]
Index: []


Unnamed: 0,height,weight
count,813.0,813.0
mean,182.228782,71.95941
std,10.611909,19.351037
min,22.0,0.0
25%,175.0,66.0
50%,182.0,74.0
75%,190.0,82.0
max,203.0,115.0


Unnamed: 0,height,weight
count,813.0,813.0
mean,182.228782,71.95941
std,10.611909,19.351037
min,22.0,0.0
25%,175.0,66.0
50%,182.0,74.0
75%,190.0,82.0
max,203.0,115.0


## Cleaning coaches.csv

In [113]:
get_empty_columns(coaches)

Column with 1 unique values: lgID


In [114]:
# Drop the columns that are not needed

coaches = coaches.drop(columns=['lgID'])

## Cleaning players_teams.csv

In [115]:
# Drop the columns that are not needed

players_teams = players_teams.drop(columns=['lgID'])

## Cleaning series_post.csv

In [116]:
# Drop the columns that are not needed

series_post = series_post.drop(columns=['lgIDWinner', 'lgIDLoser'])

## Cleaning awards_players.csv

In [117]:
# Drop the columns that are not needed

awards_players = awards_players.drop(columns=['lgID'])

## Cleaning teams_post.csv

In [118]:
# Drop the columns that are not needed

teams_post = teams_post.drop(columns=['lgID'])


In [119]:
teams.to_csv('data/clean/cleaned_teams.csv', index=False)
players.to_csv('data/clean/cleaned_players.csv', index=False)
coaches.to_csv('data/clean/cleaned_coaches.csv', index=False)
players_teams.to_csv('data/clean/cleaned_players_teams.csv', index=False)
series_post.to_csv('data/clean/cleaned_series_post.csv', index=False)
awards_players.to_csv('data/clean/cleaned_awards_players.csv', index=False)
teams_post.to_csv('data/clean/cleaned_teams_post.csv', index=False)