# Transformation

In [60]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler

In [61]:
# Reconcile a team-level stat with the sum of the matching player-level stat
def update_team_data(df_teams_path, df_players, player_stat_column, team_stat_column, output_path):
    """Replace a team-level stat with the sum of its players' stats where they disagree.

    The team table is read from ``df_teams_path``. ``player_stat_column`` of
    ``df_players`` is summed per (``tmID``, ``year``) and left-merged onto the
    team table. Every team row whose ``team_stat_column`` differs from that sum
    is printed and then overwritten with the sum. The result, without the
    temporary helper columns, is written to ``output_path``.

    Team rows that have no matching player rows get no player sum from the
    left merge (NaN). Such rows are left untouched instead of being reported
    as mismatches and overwritten with NaN.

    Parameters
    ----------
    df_teams_path : str or path-like
        CSV containing at least ``tmID``, ``year`` and ``team_stat_column``.
    df_players : pandas.DataFrame
        Player rows containing at least ``tmID``, ``year`` and
        ``player_stat_column``.
    player_stat_column : str
        Column of ``df_players`` to sum.
    team_stat_column : str
        Column of the team table to compare and update.
    output_path : str or path-like
        Destination CSV; may be the same file as ``df_teams_path``.

    Returns
    -------
    None
    """
    sum_column = f'sum_{player_stat_column}Player'
    diff_column = f'diff_{player_stat_column}'

    df_teams = pd.read_csv(df_teams_path)

    # Per-team, per-season total of the player stat.
    sum_stats = (
        df_players.groupby(['tmID', 'year'])[player_stat_column]
        .sum()
        .reset_index()
        .rename(columns={player_stat_column: sum_column})
    )
    df_compare = pd.merge(df_teams, sum_stats, on=['tmID', 'year'], how='left')

    df_compare[diff_column] = df_compare[team_stat_column] - df_compare[sum_column]

    # A NaN sum means "no player rows for this team/season"; NaN != 0 is True,
    # so without the notna() guard those rows would be overwritten with NaN.
    needs_update = df_compare[sum_column].notna() & (df_compare[diff_column] != 0)
    mismatches = df_compare[needs_update]

    if not mismatches.empty:
        print(f"Mismatches found for {team_stat_column}:")
        print(mismatches[['tmID', 'year', team_stat_column, sum_column, diff_column]])
        df_compare.loc[needs_update, team_stat_column] = df_compare.loc[needs_update, sum_column]

    # Drop the temporary comparison columns before saving.
    df_teams_updated = df_compare.drop(columns=[sum_column, diff_column])
    df_teams_updated.to_csv(output_path, index=False)


# Player-level rows; their per-team sums are the reference for the team tables.
df_players = pd.read_csv('../newData/players_teams_cleaned.csv')

# (player-level column, team-level column) pairs that are reconciled.
stat_mappings = [
    ('fgMade', 'o_fgm'),
    ('ftMade', 'o_ftm'),
    ('threeMade', 'o_3pm'),
    ('fgAttempted', 'o_fga'),
    ('ftAttempted', 'o_fta'),
    ('threeAttempted', 'o_3pa'),
    ('oRebounds', 'o_oreb'),
    ('dRebounds', 'o_dreb'),
    ('rebounds', 'o_reb'),
    ('assists', 'o_asts'),
    ('steals', 'o_stl'),
    ('turnovers', 'o_to'),
    ('blocks', 'o_blk'),
]

# Reconcile every mapped stat for the EA and WE team files; each file is
# read and rewritten in place once per stat.
for side in ['EA', 'WE']:
    teams_csv = f'../newData/teams_{side}_cleaned.csv'
    for player_stat, team_stat in stat_mappings:
        update_team_data(teams_csv, df_players, player_stat, team_stat, teams_csv)

# Player-level column names (first element of each mapping), dropped from the
# team files when present.
columns_to_remove = [player_col for player_col, _team_col in stat_mappings]

for path in ['../newData/teams_EA_cleaned.csv', '../newData/teams_WE_cleaned.csv']:
    df_teams_final = pd.read_csv(path).drop(columns=columns_to_remove, errors='ignore')
    df_teams_final.to_csv(path, index=False)



Mismatches found for o_fgm:
    tmID  year  o_fgm  sum_fgMadePlayer  diff_fgMade
0    ATL     9    895               818           77
1    ATL     9    895               818           77
2    ATL     9    895               818           77
3    ATL     9    895               818           77
4    ATL     9    895               818           77
..   ...   ...    ...               ...          ...
448  DET     8    972               903           69
449  DET     8    972               903           69
450  DET     8    972               903           69
451  DET     8    972               903           69
452  DET     8    972               903           69

[77 rows x 5 columns]
Mismatches found for o_ftm:
    tmID  year  o_ftm  sum_ftMadePlayer  diff_ftMade
0    ATL     9    542               476           66
1    ATL     9    542               476           66
2    ATL     9    542               476           66
3    ATL     9    542               476           66
4    ATL     9    54

In [62]:
# Remove the 'GP_y' column from both conference team files and save them in place.
# NOTE(review): 'GP_y' looks like the right-hand games-played column left over from an
# earlier merge (pandas' default '_y' suffix) — confirm against the cleaning notebook.

teams_EA = pd.read_csv('../newData/teams_EA_cleaned.csv')
teams_WE = pd.read_csv('../newData/teams_WE_cleaned.csv')

columns_to_remove = ['GP_y']
# errors='ignore': the files are overwritten below, so on a re-run the column is
# already gone and a plain drop() would raise KeyError.
teams_EA_new = teams_EA.drop(columns=columns_to_remove, errors='ignore')
teams_WE_new = teams_WE.drop(columns=columns_to_remove, errors='ignore')

teams_EA_new.to_csv('../newData/teams_EA_cleaned.csv', index=False)
teams_WE_new.to_csv('../newData/teams_WE_cleaned.csv', index=False)


In [66]:
# Some franchises changed their ID and name; the old ID is folded into the new
# one so each franchise has a single tmID and name.
def _rename_franchise(csv_path, old_id, new_id, new_name):
    """Rewrite ``old_id`` to ``new_id`` in 'tmID', set 'name' for every ``new_id`` row, save in place."""
    teams = pd.read_csv(csv_path)
    teams.loc[teams['tmID'] == old_id, 'tmID'] = new_id
    # Applies to rows that already carried new_id as well as the ones just rewritten.
    teams.loc[teams['tmID'] == new_id, 'name'] = new_name
    teams.to_csv(csv_path, index=False)
    return teams


# EA: ORL -> CON
teams_EA = _rename_franchise('../newData/teams_EA_cleaned.csv', 'ORL', 'CON', 'Connecticut Sun')

# WE: UTA -> SAS
teams_WE = _rename_franchise('../newData/teams_WE_cleaned.csv', 'UTA', 'SAS', 'San Antonio Silver Stars')

