# Deep Inspection
In this phase, we analyze the two tables in more detail to eliminate some inconsistent data


In [1]:
import pandas as pd
import utils

There is an inconsistency between each team's season totals and the sum of the individual players' stats for that team in that year. We decided to trust the player data: for each team and year, the team total is replaced by the sum of the corresponding player stats wherever the two differ.

In [2]:
def update_team_data(df_teams_path, df_players, player_stat_column, team_stat_column, output_path):
    """Overwrite a team-level stat with the sum of the matching player-level stat.

    The teams CSV is read from ``df_teams_path``, the per-(tmID, year) sum of
    ``player_stat_column`` is computed from ``df_players``, and every row whose
    ``team_stat_column`` differs from that sum is replaced by the sum. The
    result is written to ``output_path`` (which may be the input path).

    Rows whose (tmID, year) has no entry in ``df_players`` are left untouched:
    there is no player total to compare against, so the existing team value is
    kept instead of being replaced by NaN.

    Parameters
    ----------
    df_teams_path : str
        Path of the teams CSV; must contain ``tmID``, ``year`` and
        ``team_stat_column``.
    df_players : pandas.DataFrame
        Player rows containing ``tmID``, ``year`` and ``player_stat_column``.
    player_stat_column : str
        Name of the player-level column to sum.
    team_stat_column : str
        Name of the team-level column to check and update.
    output_path : str
        Destination CSV path.

    Returns
    -------
    None
        The updated table is only written to disk.
    """
    df_teams = pd.read_csv(df_teams_path)

    sum_column = f'sum_{player_stat_column}Player'
    diff_column = f'diff_{player_stat_column}'

    # sum of the player stat for each (team, year)
    sum_stats = df_players.groupby(['tmID', 'year'])[player_stat_column].sum().reset_index()
    sum_stats = sum_stats.rename(columns={player_stat_column: sum_column})
    df_compare = pd.merge(df_teams, sum_stats, on=['tmID', 'year'], how='left')

    # difference between the team value and the player total
    df_compare[diff_column] = df_compare[team_stat_column] - df_compare[sum_column]

    # The left merge leaves NaN where a team/year has no player rows. NaN != 0
    # is True, so without this guard those rows would be "corrected" to NaN.
    has_player_total = df_compare[sum_column].notna()
    needs_update = has_player_total & (df_compare[diff_column] != 0)
    mismatches = df_compare[needs_update]

    # print the differing values
    if not mismatches.empty:
        print(f"Mismatches found for {team_stat_column}:")
        print(mismatches[['tmID', 'year', team_stat_column, sum_column, diff_column]])

    missing_rows = int((~has_player_total).sum())
    if missing_rows:
        print(f"No player data for {missing_rows} row(s) of {team_stat_column}; team values kept.")

    df_compare.loc[needs_update, team_stat_column] = df_compare.loc[needs_update, sum_column]

    # remove temporary columns + save
    df_teams_updated = df_compare.drop(columns=[sum_column, diff_column])
    df_teams_updated.to_csv(output_path, index=False)


# Player-level rows used as the reference for the team totals.
df_players = pd.read_csv('../newData/players_teams_cleaned.csv')

# (player column, team column) pairs: the team column is checked against the
# per-team, per-year sum of the player column.
stat_mappings = [
    ('fgMade', 'o_fgm'),
    ('ftMade', 'o_ftm'),
    ('threeMade', 'o_3pm'),
    ('fgAttempted', 'o_fga'),
    ('ftAttempted', 'o_fta'),
    ('threeAttempted', 'o_3pa'),
    ('oRebounds', 'o_oreb'),
    ('dRebounds', 'o_dreb'),
    ('rebounds', 'o_reb'),
    ('assists', 'o_asts'),
    ('steals', 'o_stl'),
    ('turnovers', 'o_to'),
    ('blocks', 'o_blk'),
]

# update EAST and WEST: each call rewrites the conference file in place
for side in ['EA', 'WE']:
    side_path = f'../newData/teams_{side}_cleaned.csv'
    for player_stat, team_stat in stat_mappings:
        update_team_data(side_path, df_players, player_stat, team_stat, side_path)

# The player-level columns are removed from the team files afterwards; they
# are exactly the first element of every mapping, in the same order.
columns_to_remove = [player_column for player_column, _team_column in stat_mappings]

for path in ['../newData/teams_EA_cleaned.csv', '../newData/teams_WE_cleaned.csv']:
    # errors='ignore' keeps this step safe when a column is already absent
    df_teams_final = pd.read_csv(path).drop(columns=columns_to_remove, errors='ignore')
    df_teams_final.to_csv(path, index=False)



Mismatches found for o_fgm:
    tmID  year  o_fgm  sum_fgMadePlayer  diff_fgMade
0    ATL     9    895               818           77
1    ATL     9    895               818           77
2    ATL     9    895               818           77
3    ATL     9    895               818           77
4    ATL     9    895               818           77
..   ...   ...    ...               ...          ...
448  DET     8    972               903           69
449  DET     8    972               903           69
450  DET     8    972               903           69
451  DET     8    972               903           69
452  DET     8    972               903           69

[77 rows x 5 columns]
Mismatches found for o_ftm:
    tmID  year  o_ftm  sum_ftMadePlayer  diff_ftMade
0    ATL     9    542               476           66
1    ATL     9    542               476           66
2    ATL     9    542               476           66
3    ATL     9    542               476           66
4    ATL     9    54

Two teams (one from the East and one from the West) changed their names starting from year 4. We decided to update the tmID and the team name to the most recent ones for years 1, 2, and 3.

In [3]:
# East: every 'ORL' row becomes 'CON', then all 'CON' rows get the new name.
east_path = '../newData/teams_EA_cleaned.csv'
teams_EA = pd.read_csv(east_path)
was_orl = teams_EA['tmID'].eq('ORL')
teams_EA.loc[was_orl, 'tmID'] = 'CON'
is_con = teams_EA['tmID'].eq('CON')
teams_EA.loc[is_con, 'name'] = 'Connecticut Sun'
teams_EA.to_csv(east_path, index=False)

# West: every 'UTA' row becomes 'SAS', then all 'SAS' rows get the new name.
west_path = '../newData/teams_WE_cleaned.csv'
teams_WE = pd.read_csv(west_path)
was_uta = teams_WE['tmID'].eq('UTA')
teams_WE.loc[was_uta, 'tmID'] = 'SAS'
is_sas = teams_WE['tmID'].eq('SAS')
teams_WE.loc[is_sas, 'name'] = 'San Antonio Silver Stars'
teams_WE.to_csv(west_path, index=False)


### Merge of the two tables

In [4]:
# Load both conference tables and stack them into a single file.
conference_paths = ['../newData/teams_EA_cleaned.csv', '../newData/teams_WE_cleaned.csv']
df_ea, df_we = (pd.read_csv(conference_path) for conference_path in conference_paths)

# ignore_index=True gives the stacked frame a fresh 0..n-1 index
combined_df = pd.concat([df_ea, df_we], ignore_index=True)
combined_df.to_csv('../newData/combined_teams.csv', index=False)

# Transformation

In [5]:
# Column name kept for later use; it is not referenced in this section
# (presumably the prediction label -- TODO confirm).
target = "playoff"

# Work from the combined East + West table written by the merge step.
df = pd.read_csv('../newData/combined_teams.csv')

In [6]:
# Calculate the win rate of each team as a percentage: won / games played * 100
df['winrate'] = df['won_x'].div(df['GP_x']).mul(100)

### Eliminating Useless Attributes

In [7]:
# Columns removed before modelling. Each name is listed once; a missing
# column raises KeyError, exactly like the default behaviour of DataFrame.drop.
useless_columns = [
    # win/loss record and venue columns
    "franchID", "won_x", "lost_x",
    "homeW", "homeL", "awayW", "awayL", "confW", "confL",
    "min", "attend", "arena",
    # games played / stint / per-row regular-season columns
    "GP_y", "GP_x", "stint_x",
    "points", "PF", "GS", "minutes", "dq",
    # "Post"-prefixed columns
    "PostGP", "PostGS", "PostMinutes", "PostPoints",
    "PostRebounds", "PostoRebounds", "PostdRebounds",
    "PostAssists", "PostSteals", "PostBlocks", "PostTurnovers", "PostPF",
    "PostfgAttempted", "PostfgMade",
    "PostftAttempted", "PostftMade",
    "PostthreeAttempted", "PostthreeMade",
    "PostDQ",
    # remaining record columns
    "post_wins", "post_losses",
    "stint_y", "won_y", "lost_y",
]
df = df.drop(columns=useless_columns)

## Preparing Data

Counting the postseason rounds each team played (first round, semifinals, finals)

In [9]:
# A round counts as played whether it was lost ("L") or won ("W"); "N" counts as 0.
mapping = {"L": 1, "W": 1, "N": 0}
round_columns = ["semis", "finals", "firstRound"]

# Map every round column to 0/1, then add them up row by row.
round_flags = df[round_columns].apply(lambda round_column: round_column.map(mapping))
df["roundsPlayed"] = round_flags.sum(axis=1)

# The individual round columns are no longer needed.
df.drop(columns=round_columns, inplace=True)

Calculating, for each team and year, the mean height, weight and age of its players, the number of awards collected by its players, and whether its coach received an award

In [10]:
# Collapse the table to one row per (year, team).
# The single-row frames are collected in a list and concatenated once, instead
# of growing a DataFrame with pd.concat inside the loop.
team_rows = []

# groupby visits every existing (year, tmID) pair once, so no empty
# combinations have to be skipped and df is not re-scanned per pair.
for (year, team), small_df in df.groupby(["year", "tmID"], sort=False):
    # Start from the first row of the group; missing values in it become 0.
    team_row = pd.DataFrame([small_df.iloc[0]])
    team_row.fillna(0, inplace=True)

    # Number of non-null player award entries in the group.
    team_row["award_player"] = small_df["award_x"].count()
    # 1 when the first row carries a coach award (NaN was filled with 0 above).
    team_row["award_coach"] = team_row["award_y"].apply(lambda i: 1 if i != 0 else 0)
    team_row["height"] = small_df["height"].mean()
    team_row["weight"] = small_df["weight"].mean()

    # Use the scalar `year` of this group. The previous code passed the whole
    # df["year"] column and only produced the right value through index
    # alignment on assignment.
    # NOTE(review): utils.get_overall_age is defined outside this notebook;
    # presumably it returns a single (mean) birth year for the group -- confirm.
    team_row["playersAge"] = (2000 + year) - utils.get_overall_age(
        small_df["birthDate"]
    )

    # Per-player columns make no sense on a per-team row.
    team_rows.append(
        team_row.drop(columns=["playerID", "birthDate", "award_x", "award_y"])
    )

new_df = pd.concat(team_rows)

df = new_df.sort_values(by=["year", "tmID"])


Each statistic has a weight that can be modified. We compute a new value called ATR: the weighted sum of each team's offensive-minus-defensive differentials for points, rebounds, assists and field goals made.

In [11]:
# Tunable weight of each differential in the ATR score.
weights = {"pts": 1, "reb": 0.8, "asts": 0.5, "fgm": 0.7}

# Offensive minus defensive value for each statistic.
pts_diff = df["o_pts"] - df["d_pts"]
reb_diff = df["o_reb"] - df["d_reb"]
asts_diff = df["o_asts"] - df["d_asts"]
fgm_diff = df["o_fgm"] - df["d_fgm"]

df["ATR"] = (
    weights["pts"] * pts_diff
    + weights["reb"] * reb_diff
    + weights["asts"] * asts_diff
    + weights["fgm"] * fgm_diff
)

# The raw columns are folded into ATR and removed.
atr_source_columns = ["o_pts", "o_reb", "o_asts", "o_fgm", "d_pts", "d_reb", "d_asts", "d_fgm"]
df.drop(columns=atr_source_columns, inplace=True)

# TODO
# Check whether the other statistics can be useful or whether they can be removed

df.to_csv("../newData/mid_file.csv", index=False)

Calculated ATR.


# Feature Encoding