# Deep Inspection
In this phase, we analyze the two tables in more detail to eliminate some inconsistent data


In [14]:
import numpy as np
import pandas as pd
import transformation_utils as util
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler


There is an inconsistency between the team's total points for a year and the sum of the individual players' points for that team in that year. We decided to consider the sum of the player stats for each team for every year

In [15]:
df_players = pd.read_csv('../newData/players_teams_cleaned.csv')

stat_mappings = [
    ('fgMade', 'o_fgm'), ('ftMade', 'o_ftm'), ('threeMade', 'o_3pm'),
    ('fgAttempted', 'o_fga'), ('ftAttempted', 'o_fta'), ('threeAttempted', 'o_3pa'),
    ('oRebounds', 'o_oreb'), ('dRebounds', 'o_dreb'), ('rebounds', 'o_reb'),
    ('assists', 'o_asts'), ('steals', 'o_stl'), ('turnovers', 'o_to'), ('blocks', 'o_blk')
]

# update EAST e WEST
for side in ['EA', 'WE']:
    for player_stat, team_stat in stat_mappings:
        util.update_team_data(f'../newData/teams_{side}_cleaned.csv', df_players, player_stat, team_stat,
                         f'../newData/teams_{side}_cleaned.csv')

columns_to_remove = [
    'fgMade', 'ftMade', 'threeMade', 'fgAttempted', 'ftAttempted', 'threeAttempted',
    'oRebounds', 'dRebounds', 'rebounds', 'assists', 'steals', 'turnovers', 'blocks'
]

for path in ['../newData/teams_EA_cleaned.csv', '../newData/teams_WE_cleaned.csv']:
    df_teams_final = pd.read_csv(path)
    df_teams_final = df_teams_final.drop(columns=columns_to_remove, errors='ignore')
    df_teams_final.to_csv(path, index=False)

Mismatches found for o_fgm:
    tmID  year  o_fgm  sum_fgMadePlayer  diff_fgMade
833  CON     1    833               NaN          NaN
834  CON     1    833               NaN          NaN
835  CON     1    833               NaN          NaN
836  CON     1    833               NaN          NaN
837  CON     1    833               NaN          NaN
838  CON     1    833               NaN          NaN
839  CON     1    833               NaN          NaN
840  CON     1    833               NaN          NaN
841  CON     1    833               NaN          NaN
842  CON     1    833               NaN          NaN
843  CON     1    833               NaN          NaN
844  CON     1    833               NaN          NaN
845  CON     1    833               NaN          NaN
846  CON     2    768               NaN          NaN
847  CON     2    768               NaN          NaN
848  CON     2    768               NaN          NaN
849  CON     2    768               NaN          NaN
850  CON     2    

Two teams (one from the East and one from the West) changed their names starting from year 4. We decided to update the tmID and the team name to the most recent ones for years 1, 2, and 3.

In [16]:
teams_EA = pd.read_csv('../newData/teams_EA_cleaned.csv')
teams_EA.loc[teams_EA['tmID'] == 'ORL', 'tmID'] = 'CON'
teams_EA.loc[teams_EA['tmID'] == 'CON', 'name'] = 'Connecticut Sun'
teams_EA.to_csv('../newData/teams_EA_cleaned.csv', index=False)

teams_WE = pd.read_csv('../newData/teams_WE_cleaned.csv')
teams_WE.loc[teams_WE['tmID'] == 'UTA', 'tmID'] = 'SAS'
teams_WE.loc[teams_WE['tmID'] == 'SAS', 'name'] = 'San Antonio Silver Stars'
teams_WE.to_csv('../newData/teams_WE_cleaned.csv', index=False)


### Merge of the two tables

In [17]:
df_ea = pd.read_csv('../newData/teams_EA_cleaned.csv')
df_we = pd.read_csv('../newData/teams_WE_cleaned.csv')

combined_df = pd.concat([df_ea, df_we], ignore_index=True)
combined_df.to_csv('../newData/combined_teams.csv', index=False)

In [18]:
df = pd.read_csv('../newData/combined_teams.csv')
target = "playoff"

#Calculate the win rate of each team
df['winrate'] = df['won_x'] / df['GP_x'] * 100

df.to_csv('../newData/combined_teams.csv', index=False)

In [19]:
import pandas as pd

TPI_weights = {
    'o_pts': 1,       
    'o_fgm': 1,       
    'o_3pm': 1,       
    'o_ftm': 1,       
    'o_reb': 1,       
    'o_asts': 1,     
    'o_to': -1,       
    'o_pf': -1,       
    'd_reb': 1,       
    'd_stl': 1,       
    'd_blk': 1,      
    'd_pts': -1,      
    'd_pf': -1,       
    'd_to': 1       
}

# upload the dataset
df = pd.read_csv('../newData/combined_teams.csv')
# calculate 'TPI_Sum' for each team and year
df['TPI_Sum'] = 0
for index, row in df.iterrows():
    TPI_sum = sum(row[stat] * weight for stat, weight in TPI_weights.items() if stat in row)
    df.at[index, 'TPI_Sum'] = TPI_sum

# remove columns used to calculate TPI_Sum
columns_to_remove = list(TPI_weights.keys())
df.drop(columns=columns_to_remove, inplace=True)

# Transformation

### Eliminating Useless Attributes

In [20]:
df.drop(
    [
        "franchID",
        "won_x",
        "lost_x",
        "homeW",
        "homeL",
        "awayW",
        "awayL",
        "name",
        "confW",
        "confL",
        "min",
        "attend",
        "arena",
        "GP_y",
        "GP_x",
        "stint_x",
        "points",
        "PF",
        "GS",
        "minutes",
        "dq",
        "PostGP",
        "PostGS",
        "GS",
        "stint_y",
        "won_y",
        "lost_y",
    ],
    axis=1, inplace=True,
)

## Preparing Data

 Counting the games each team played in the postseason

In [21]:
mapping = {"L": 1, "W": 1, "N": 0}
for col in ["semis", "finals", "firstRound"]:
    df[col] = df[col].map(mapping)
df["roundsPlayed"] = df[["semis", "finals", "firstRound"]].sum(axis=1)

df.drop(["semis", "finals", "firstRound"], axis=1, inplace=True)

Calculating the mean of the height, weight and the age of the teams and the sum of the awards collected by players and coaches for each team for every year

In [22]:
new_df = pd.DataFrame()
for year in df["year"].unique():
    for team in df["tmID"].unique():
        small_df = df[(df["tmID"] == team) & (df["year"] == year)]
        if small_df.empty:
            continue

        d = pd.DataFrame([small_df.iloc[0]])
        d.fillna(0, inplace=True)
        d["award_player"] = small_df["award_x"].count()
        d["award_coach"] = d["award_y"].apply(lambda i: 1 if i != 0 else 0)
        d["height"] = small_df["height"].mean()
        d["weight"] = small_df["weight"].mean()
        d["playersAge"] = (2000 + df["year"]) - util.get_overall_age(
            small_df["birthDate"]
        )
        d.drop("playerID", axis=1, inplace=True)
        d.drop("birthDate", axis=1, inplace=True)
        d.drop("award_x", axis=1, inplace=True)
        d.drop("award_y", axis=1, inplace=True)
        d.drop("coachID", axis=1, inplace=True)

        new_df = pd.concat([new_df, d])

df = new_df.sort_values(by=["year", "tmID"])

# Future Encoding

This section of the code uses LabelEncoder to transform categorical variables into numerical values and separates the columns based on the type of variable

In [23]:
le = LabelEncoder()
df[target] = le.fit_transform(df[target])
df["confID"] = le.fit_transform(df["confID"])

#These columns contain key information that are used as predictive variables.
key_cols = ["confID", "year", "playoff", "tmID"]

numerical_cols = [col for col in df.columns if col not in key_cols]

## Scaling of Numerical Variables

In [24]:
def custom_scaling(df, numerical_cols):
    """
    Apply StandardScaler to columns with Gaussian distribution,
    and MinMaxScaler to other columns.

    Parameters:
    - df: DataFrame to scale
    - numerical_cols: List of numerical columns to scale

    Returns:
    - Scaled DataFrame
    """
    gaussian_cols = []
    other_cols = []

    # Identifying columns based on Gaussian distribution
    for col in numerical_cols:
        if abs(df[col].skew()) < 0.5:  # Assuming skewness < 0.5 indicates Gaussian
            gaussian_cols.append(col)
        else:
            other_cols.append(col)

    # Scaling
    if gaussian_cols:
        df[gaussian_cols] = StandardScaler().fit_transform(df[gaussian_cols])
    if other_cols:
        df[other_cols] = MinMaxScaler().fit_transform(df[other_cols])

    return df

In [25]:
# Call the custom scaling function
df = custom_scaling(df, numerical_cols)

# Save the transformed DataFrame
df.to_csv('../newData/transformed_data.csv', index=False)

### Adding PlayOffNextYear

In [26]:
df = pd.read_csv('../newData/transformed_data.csv')
df = df.sort_values(by=["tmID", "year"])
df['PlayOffNextYear'] = df['playoff'].shift(-1)
df.loc[df['tmID'] != df['tmID'].shift(-1), 'PlayOffNextYear'] = None
df.dropna(subset=['PlayOffNextYear'] , inplace=True)

# Imposta PlayOffNextYear a 0 per tutte le squadre dell'anno 10
transformed_data = pd.read_csv('../newData/transformed_data.csv')
y_filtered = transformed_data[transformed_data['year'] == 10]
y_filtered['PlayOffNextYear'] = np.nan

# Aggiungi le righe filtrate di y a x
df = pd.concat([df, y_filtered], ignore_index=True)
df = df.sort_values(by=["tmID", "year"])

df.to_csv('../newData/Shifted_playoff.csv', index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_filtered['PlayOffNextYear'] = np.nan
