## Imports

In [19]:
import pandas as pd

In [20]:
# Load every pre-processed CSV from the data/ folder. The paths are read in the
# order listed and bound, in that same order, to the DataFrame names on the left.
(
    df_awards_players,
    df_coaches,
    df_players_teams,
    df_players,
    df_series_post,
    df_teams,
    df_teams_post,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/awards_players_processed.csv',
        'data/coaches_processed.csv',
        'data/players_teams_processed.csv',
        'data/players_processed.csv',
        'data/series_post_processed.csv',
        'data/teams_processed.csv',
        'data/teams_post_processed.csv',
    )
)

## Data Cleanup and analysis

Aggregate the data from series_post into the number of wins and the number of losses of each team, then join it to the Teams dataset.

In [21]:

# Left-join the series_post table onto the team statistics, keyed by (year, tmID).
teams_with_series = pd.merge(df_teams, df_series_post, on=["year", "tmID"], how="left")

# Only the columns introduced by this join are zero-filled: a team without a
# matching series_post row gets 0 there. NaNs that were already present in the
# original team columns are left untouched instead of being silently turned into 0.
series_post_cols = [col for col in teams_with_series.columns if col not in df_teams.columns]
df_teams = teams_with_series.fillna({col: 0 for col in series_post_cols})

# Print the column summary of the merged frame.
df_teams.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 56 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     142 non-null    int64  
 1   tmID                     142 non-null    object 
 2   rank                     142 non-null    float64
 3   playoff                  142 non-null    int64  
 4   o_fgm                    142 non-null    float64
 5   o_fga                    142 non-null    float64
 6   o_ftm                    142 non-null    float64
 7   o_fta                    142 non-null    float64
 8   o_3pm                    142 non-null    float64
 9   o_3pa                    142 non-null    float64
 10  o_oreb                   142 non-null    float64
 11  o_dreb                   142 non-null    float64
 12  o_reb                    142 non-null    float64
 13  o_asts                   142 non-null    float64
 14  o_pf                     1

In [22]:
# Attach the matching df_players row to every player/team record
# (playerID on the left matches bioID on the right; unmatched rows are kept).
players_teams_info_df = df_players_teams.merge(
    df_players,
    how='left',
    left_on='playerID',
    right_on='bioID',
)
players_teams_info_df.info()

# Add the awards data for the same player and year, then write the combined
# table to disk.
players_teams_info_df = players_teams_info_df.merge(
    df_awards_players,
    how='left',
    on=['playerID', 'year'],
)
players_teams_info_df.to_csv('players_teams_info.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1876 entries, 0 to 1875
Data columns (total 49 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   playerID            1876 non-null   object 
 1   year                1876 non-null   int64  
 2   tmID                1876 non-null   object 
 3   GP                  1876 non-null   float64
 4   GS                  1876 non-null   float64
 5   minutes             1876 non-null   float64
 6   points              1876 non-null   float64
 7   oRebounds           1876 non-null   float64
 8   dRebounds           1876 non-null   float64
 9   rebounds            1876 non-null   float64
 10  assists             1876 non-null   float64
 11  steals              1876 non-null   float64
 12  blocks              1876 non-null   float64
 13  turnovers           1876 non-null   float64
 14  PF                  1876 non-null   float64
 15  fgAttempted         1876 non-null   float64
 16  fgMade

In [23]:
# Regular-season stat column -> its post-season counterpart in
# players_teams_info_df. Each pair is summed per (year, tmID) and then folded
# into one combined total stored under the regular-season column name.
STAT_TO_POST_STAT = {
    "GP": "PostGP",
    "GS": "PostGS",
    "minutes": "PostMinutes",
    "points": "PostPoints",
    "oRebounds": "PostoRebounds",
    "dRebounds": "PostdRebounds",
    "rebounds": "PostRebounds",
    "assists": "PostAssists",
    "steals": "PostSteals",
    "blocks": "PostBlocks",
    "turnovers": "PostTurnovers",
    "PF": "PostPF",
    "fgAttempted": "PostfgAttempted",
    "fgMade": "PostfgMade",
    "ftAttempted": "PostftAttempted",
    "ftMade": "PostftMade",
    "threeAttempted": "PostthreeAttempted",
    "threeMade": "PostthreeMade",
    "dq": "PostDQ",
}

# Aggregation spec, kept in the original output column order: regular-season
# totals, post-season totals, then the remaining per-team features.
# NOTE: "minutes" is summed like every other counting stat. It used to be
# averaged per player and then added to the *summed* PostMinutes, which mixed a
# per-player mean with a team total.
player_agg_spec = {
    **dict.fromkeys(STAT_TO_POST_STAT, "sum"),
    **dict.fromkeys(STAT_TO_POST_STAT.values(), "sum"),
    "height": "mean",
    "weight": "mean",
    "award_count": "sum",
    # Add more columns you want to aggregate here
}

# One row per (year, tmID) with the aggregated player statistics.
agg_players_df = (
    players_teams_info_df
    .groupby(["year", "tmID"])
    .agg(player_agg_spec)
    .reset_index()
)

# Fold each post-season total into its regular-season column.
for stat_col, post_col in STAT_TO_POST_STAT.items():
    agg_players_df[stat_col] = agg_players_df[stat_col] + agg_players_df[post_col]

# The post-season columns are now redundant; drop them without mutating in place.
agg_players_df = agg_players_df.drop(columns=list(STAT_TO_POST_STAT.values()))

# Attach the per-team player aggregates to the team table. This is a left join,
# so a team with no matching player rows gets NaN in the new columns.
df_teams = pd.merge(df_teams, agg_players_df, on=['year', 'tmID'], how='left')

In [24]:
# Print the column summary (names, non-null counts, dtypes) of df_teams as it
# stands at this point in the notebook.
df_teams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 78 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     142 non-null    int64  
 1   tmID                     142 non-null    object 
 2   rank                     142 non-null    float64
 3   playoff                  142 non-null    int64  
 4   o_fgm                    142 non-null    float64
 5   o_fga                    142 non-null    float64
 6   o_ftm                    142 non-null    float64
 7   o_fta                    142 non-null    float64
 8   o_3pm                    142 non-null    float64
 9   o_3pa                    142 non-null    float64
 10  o_oreb                   142 non-null    float64
 11  o_dreb                   142 non-null    float64
 12  o_reb                    142 non-null    float64
 13  o_asts                   142 non-null    float64
 14  o_pf                     1

In [25]:
# Left-join the coaches table onto the team table, keyed by (year, tmID);
# team rows without a matching coaches row are kept.
df_teams = df_teams.merge(
    df_coaches,
    how='left',
    on=['year', 'tmID'],
)
df_teams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 82 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     142 non-null    int64  
 1   tmID                     142 non-null    object 
 2   rank                     142 non-null    float64
 3   playoff                  142 non-null    int64  
 4   o_fgm                    142 non-null    float64
 5   o_fga                    142 non-null    float64
 6   o_ftm                    142 non-null    float64
 7   o_fta                    142 non-null    float64
 8   o_3pm                    142 non-null    float64
 9   o_3pa                    142 non-null    float64
 10  o_oreb                   142 non-null    float64
 11  o_dreb                   142 non-null    float64
 12  o_reb                    142 non-null    float64
 13  o_asts                   142 non-null    float64
 14  o_pf                     1

Merge the teams data with teams_post and fill in the value 0 for the teams that did not have any wins or losses (missing values left by the join are replaced with 0).

In [26]:
# Left-join the teams_post table onto the team table, keyed by (year, tmID).
# NOTE: fillna(0) runs on the whole merged frame, so every remaining NaN in any
# column (not only the columns coming from df_teams_post) becomes 0.
df_teams = (
    df_teams
    .merge(df_teams_post, how='left', on=['year', 'tmID'])
    .fillna(0)
)
df_teams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 84 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     142 non-null    int64  
 1   tmID                     142 non-null    object 
 2   rank                     142 non-null    float64
 3   playoff                  142 non-null    int64  
 4   o_fgm                    142 non-null    float64
 5   o_fga                    142 non-null    float64
 6   o_ftm                    142 non-null    float64
 7   o_fta                    142 non-null    float64
 8   o_3pm                    142 non-null    float64
 9   o_3pa                    142 non-null    float64
 10  o_oreb                   142 non-null    float64
 11  o_dreb                   142 non-null    float64
 12  o_reb                    142 non-null    float64
 13  o_asts                   142 non-null    float64
 14  o_pf                     1

In [27]:
# Write the fully merged team table to data_merged.csv in the working
# directory, without the DataFrame index column.
df_teams.to_csv('data_merged.csv', index=False)