In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd

# Read the seven raw CSV tables from data/original/ in one pass; the paths are
# listed in the same order as the names they are unpacked into.
(
    teams,
    players,
    coaches,
    players_teams,
    series_post,
    awards_players,
    teams_post,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/original/teams.csv',
        'data/original/players.csv',
        'data/original/coaches.csv',
        'data/original/players_teams.csv',
        'data/original/series_post.csv',
        'data/original/awards_players.csv',
        'data/original/teams_post.csv',
    )
)

In [2]:
# Function to report columns that carry no information (at most one distinct value)
def get_empty_columns(data):
    """Print every column of ``data`` that has at most one distinct non-null value.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are inspected.

    Returns
    -------
    None
        One line is printed per reported column, in column order:
        ``Column with <count> unique values: <column name>``.
    """
    # nunique() ignores NaN by default, so a completely empty (all-NaN) column
    # has a count of 0; `<= 1` reports those as well as constant columns.
    # The counts are computed once for the whole frame instead of twice per column.
    unique_counts = data.nunique()
    for column, count in unique_counts.items():
        if count <= 1:
            print("Column with " + str(count) + " unique values: " + column)

### Empty columns in Teams

In [3]:
# Print the columns of `teams` that get_empty_columns flags; the next cell drops them.
get_empty_columns(teams)

Column with 1 unique values: lgID
Column with 1 unique values: seeded
Column with 1 unique values: tmORB
Column with 1 unique values: tmDRB
Column with 1 unique values: tmTRB
Column with 1 unique values: opptmORB
Column with 1 unique values: opptmDRB
Column with 1 unique values: opptmTRB


### Drop columns that are not needed or have only one value - Teams

In [4]:
# Columns the report above listed as holding a single value in teams
teams_constant_cols = [
    'lgID', 'seeded',
    'tmORB', 'tmDRB', 'tmTRB',
    'opptmORB', 'opptmDRB', 'opptmTRB',
]
# Columns that are dropped because they are not needed
teams_unneeded_cols = ['divID', 'arena', 'name', 'attend']

teams = teams.drop(columns=teams_constant_cols + teams_unneeded_cols)

### Empty columns in Players

In [5]:
# Print the columns of `players` that get_empty_columns flags; the next cell drops them.
get_empty_columns(players)

Column with 1 unique values: firstseason
Column with 1 unique values: lastseason


### Drop columns that are not needed or have only one value - Players

In [6]:
# Columns the report above listed as holding a single value in players
players_constant_cols = ['firstseason', 'lastseason']
# Columns that are dropped because they are not needed
players_unneeded_cols = ['college', 'collegeOther', 'deathDate']

players = players.drop(columns=players_constant_cols + players_unneeded_cols)


### Remove irrelevant awards 

In [7]:
# Remove sportsmanship awards from awards_players (both label variants are filtered)

sportsmanship_labels = ['Kim Perrot Sportsmanship', 'Kim Perrot Sportsmanship Award']
is_sportsmanship = awards_players['award'].isin(sportsmanship_labels)
awards_players = awards_players[~is_sportsmanship]

### Empty columns in Coaches

In [8]:
# Print the columns of `coaches` that get_empty_columns flags; the next cell drops them.
get_empty_columns(coaches)

Column with 1 unique values: lgID


### Drop columns that are not needed or have only one value - Coaches

In [9]:
# Column the report above listed as holding a single value in coaches
coaches_constant_cols = ['lgID']
# Columns that are dropped because they are not needed
coaches_unneeded_cols = ['post_wins', 'post_losses']

coaches = coaches.drop(columns=coaches_constant_cols + coaches_unneeded_cols)

### Empty columns in Player Teams

In [10]:
# Print the columns of `players_teams` that get_empty_columns flags; they are dropped in a later cell.
get_empty_columns(players_teams)

Column with 1 unique values: lgID


### Drop players that have no bio information and never appear in players_teams

In [11]:
# Drop players who never appear in players_teams AND have no bio data at all:
# weight 0, height 0 and the placeholder birth date 0000-00-00 (all three required).

players_ids = players['bioID'].unique()
players_teams_ids = players_teams['playerID'].unique()

# Number of distinct player ids before any row is removed.
print(len(players_ids))

# Both conditions are built as boolean masks so the frame is filtered once,
# instead of scanning players_teams_ids and rebuilding `players` for every id
# inside a Python loop. The masks are evaluated per row, so a missing bioID
# no longer raises IndexError the way the `.values[0]` lookup did.
never_played = ~players['bioID'].isin(players_teams_ids)
missing_bio = (
    (players['weight'] == 0)
    & (players['height'] == 0)
    & (players['birthDate'] == '0000-00-00')
)
players = players[~(never_played & missing_bio)]

893


### Drop columns that are not needed or have only one value - Player Teams

In [12]:
# lgID was reported as single-valued for players_teams, so it is removed here

players_teams = players_teams.drop('lgID', axis='columns')

In [13]:
# Write the cleaned tables to data/clean/ without the DataFrame index.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; exist_ok=True keeps re-running this cell safe.
os.makedirs('data/clean', exist_ok=True)

teams.to_csv('data/clean/cleaned_teams.csv', index=False)
players.to_csv('data/clean/cleaned_players.csv', index=False)
coaches.to_csv('data/clean/cleaned_coaches.csv', index=False)
players_teams.to_csv('data/clean/cleaned_players_teams.csv', index=False)
awards_players.to_csv('data/clean/cleaned_awards_players.csv', index=False)