## Imports

In [1]:
import pandas as pd

In [2]:
# Load the seven input CSV files. The i-th path in the tuple below is read
# into the i-th name on the left-hand side, so keep both lists in step.
(
    df_awards_players,
    df_coaches,
    df_players_teams,
    df_players,
    df_series_post,
    df_teams,
    df_teams_post,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data/awards_players_processed.csv',
        'data/coaches_processed.csv',
        'data/players_teams_processed.csv',
        'data/players.csv',
        'data/series_post_processed.csv',
        'data/teams_processed.csv',
        'data/teams_post_processed.csv',
    )
]

## Data Cleanup and analysis

Aggregate the data from series_post by the number of wins and the number of losses of each player, then join it to the Teams dataset. (The code for this step is currently commented out in the next cell.)

In [3]:

# Join the team statistics with the second DataFrame
# df_teams = pd.merge(df_teams, df_series_post, on=["year", "tmID"], how="left").fillna(0)

# df_teams.info()


In [4]:
# Attach the columns of df_players to every player/team/season row, matching
# players_teams.playerID against players.bioID.
# validate='many_to_one' makes pandas raise a MergeError if bioID is not
# unique in df_players; without it, a duplicated key would silently duplicate
# player rows and inflate any later sum over them.
players_teams_info_df = pd.merge(
    df_players_teams,
    df_players,
    left_on='playerID',
    right_on='bioID',
    how='left',
    validate='many_to_one',
)
players_teams_info_df.info()

# Add the award columns for the same player and year. The left join keeps
# players without a matching award row (their award columns become NaN).
# The same many-to-one check guards against duplicated (playerID, year)
# keys in df_awards_players.
players_teams_info_df = pd.merge(
    players_teams_info_df,
    df_awards_players,
    on=['playerID', 'year'],
    how='left',
    validate='many_to_one',
)

# Save the combined player-level table, without the row index.
players_teams_info_df.to_csv('players_teams_info.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1290 entries, 0 to 1289
Data columns (total 52 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   playerID            1290 non-null   object 
 1   year                1290 non-null   int64  
 2   stint               1290 non-null   int64  
 3   tmID                1290 non-null   object 
 4   GP                  1290 non-null   float64
 5   GS                  1290 non-null   float64
 6   minutes             1290 non-null   float64
 7   points              1290 non-null   float64
 8   oRebounds           1290 non-null   float64
 9   dRebounds           1290 non-null   float64
 10  rebounds            1290 non-null   float64
 11  assists             1290 non-null   float64
 12  steals              1290 non-null   float64
 13  blocks              1290 non-null   float64
 14  turnovers           1290 non-null   float64
 15  PF                  1290 non-null   float64
 16  fgAtte

In [5]:
# Statistics that are averaged over a team's player rows; every other column
# listed below is summed.
averaged_columns = {"minutes", "height", "weight"}

# Columns to aggregate per (year, tmID). The aggregated frame keeps this
# order, so add new columns at the position where they should appear.
player_stat_columns = [
    # Regular-season statistics
    "GP",
    "GS",
    "minutes",
    "points",
    "oRebounds",
    "dRebounds",
    "rebounds",
    "assists",
    "steals",
    "blocks",
    "turnovers",
    "PF",
    "fgAttempted",
    "fgMade",
    "ftAttempted",
    "ftMade",
    "threeAttempted",
    "threeMade",
    # Post-season statistics
    "PostGP",
    "PostGS",
    "PostMinutes",
    "PostPoints",
    "PostoRebounds",
    "PostdRebounds",
    "PostRebounds",
    "PostAssists",
    "PostSteals",
    "PostBlocks",
    "PostTurnovers",
    "PostPF",
    "PostfgAttempted",
    "PostfgMade",
    "PostftAttempted",
    "PostftMade",
    "PostthreeAttempted",
    "PostthreeMade",
    "PostDQ",
    # Player attributes and awards
    "height",
    "weight",
    "award_count",
]

# Map every column to its reducer: "mean" for the averaged ones, "sum" otherwise.
aggregation_spec = {
    column: "mean" if column in averaged_columns else "sum"
    for column in player_stat_columns
}

# Collapse the player rows to one row per (year, tmID) and turn the group
# keys back into ordinary columns.
agg_players_df = (
    players_teams_info_df
    .groupby(["year", "tmID"])
    .agg(aggregation_spec)
    .reset_index()
)

# Left-join the per-team player aggregates onto the team table.
df_teams = df_teams.merge(agg_players_df, on=['year', 'tmID'], how='left')
#agg_players_df.head()

In [6]:
# Print the column list, non-null counts and dtypes of df_teams to inspect
# the result of merging in the per-team player aggregates.
df_teams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 94 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     122 non-null    int64  
 1   tmID                     122 non-null    object 
 2   rank                     122 non-null    float64
 3   playoff                  122 non-null    int64  
 4   o_fgm                    122 non-null    float64
 5   o_fga                    122 non-null    float64
 6   o_ftm                    122 non-null    float64
 7   o_fta                    122 non-null    float64
 8   o_3pm                    122 non-null    float64
 9   o_3pa                    122 non-null    float64
 10  o_oreb                   122 non-null    float64
 11  o_dreb                   122 non-null    float64
 12  o_reb                    122 non-null    float64
 13  o_asts                   122 non-null    float64
 14  o_pf                     1

In [7]:
# Left-join the coach columns onto each team row, keyed on year and team id,
# then print the resulting column list, non-null counts and dtypes.
df_teams = df_teams.merge(df_coaches, how='left', on=['year', 'tmID'])
df_teams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 98 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     122 non-null    int64  
 1   tmID                     122 non-null    object 
 2   rank                     122 non-null    float64
 3   playoff                  122 non-null    int64  
 4   o_fgm                    122 non-null    float64
 5   o_fga                    122 non-null    float64
 6   o_ftm                    122 non-null    float64
 7   o_fta                    122 non-null    float64
 8   o_3pm                    122 non-null    float64
 9   o_3pa                    122 non-null    float64
 10  o_oreb                   122 non-null    float64
 11  o_dreb                   122 non-null    float64
 12  o_reb                    122 non-null    float64
 13  o_asts                   122 non-null    float64
 14  o_pf                     1

Merge the teams with teams_post, filling in the value 0 for the teams that did not have any wins or losses. (The code for this step is currently commented out in the next cell.)

In [8]:
# df_teams = pd.merge(df_teams, df_teams_post, on=['year', 'tmID'], how='left').fillna(0)
# df_teams.info()

In [9]:
# Write the merged team-level table to data_merged.csv, omitting the row index.
df_teams.to_csv('data_merged.csv', index=False)