In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

# Output locations, relative to the notebook's working directory:
# engineered CSVs go to `output_dir`, diagnostic plots to `figures_dir`.
output_dir = Path("engineered_data")
figures_dir = Path("figures/engineered_features")

# Create both up front so later cells can write without checking.
# `figures_dir` is nested, hence parents=True.
output_dir.mkdir(exist_ok=True)
figures_dir.mkdir(parents=True, exist_ok=True)

In [124]:
# Maps an input file's stem to its engineered DataFrame (filled by the loading cell).
processed_dfs = {}
# NOTE(review): never populated in the visible cells; a later cell reads
# `all_new_features` instead -- confirm which name is intended.
all_features = []


def _bin_win_pct(win_pct):
    """Bucket a win-percentage Series into ordinal bins 0..3.

    Bin edges are [0, 0.4], (0.4, 0.6], (0.6, 0.8], (0.8, 1.0]. The lowest
    edge is inclusive so that a team with a win percentage of exactly 0.0
    lands in bin 0 instead of becoming NaN.

    Raises:
        ValueError: if any value is null or lies outside [0, 1]; such a value
            cannot be assigned an integer bin.
    """
    codes = pd.cut(
        win_pct,
        bins=[0, 0.4, 0.6, 0.8, 1.0],
        labels=False,          # integer bin codes 0..3, same as the labels used before
        include_lowest=True,   # make the first bin closed on the left (0.0 is valid)
    )
    if codes.isna().any():
        raise ValueError(
            f"Column {win_pct.name!r} contains null or out-of-range values; "
            "win percentages must lie in [0, 1] to be binned."
        )
    return codes.astype(int)


def engineer_features(df):
    """Return a copy of ``df`` with engineered matchup features appended.

    Parameters:
        df: DataFrame holding one row per game. It must contain every column
            listed in ``required_cols`` below. ``HOME_LAST_N_AVG_PTS_ALLOWED``
            is optional; when absent a constant of 100 is used in its place.

    Returns:
        A new DataFrame (the input is not modified) with 47 additional
        feature columns, or ``None`` when any required column is missing.

    Raises:
        ValueError: if ``WIN_PCT_HOME`` / ``WIN_PCT_AWAY`` contain null values
            or values outside [0, 1].
    """
    df = df.copy()

    required_cols = [
        'HOME_LAST_N_WIN_PCT', 'AWAY_LAST_N_WIN_PCT',
        'HOME_LAST_N_AVG_PTS', 'AWAY_LAST_N_AVG_PTS_ALLOWED',
        'WIN_PCT_HOME', 'WIN_PCT_AWAY', 'FG_PCT_home', 'FG_PCT_away',
        'AST_home', 'AST_away', 'REB_home', 'REB_away'
    ]
    if not all(col in df.columns for col in required_cols):
        return None

    # Small constant added to denominators. It only prevents division by an
    # exact zero; a denominator close to -eps can still produce huge values.
    eps = 1e-5
    # Stand-in used when the optional HOME_LAST_N_AVG_PTS_ALLOWED column is missing.
    default_home_pts_allowed = 100

    # Binned win % for categorical splits
    df['home_win_bin'] = _bin_win_pct(df['WIN_PCT_HOME'])
    df['away_win_bin'] = _bin_win_pct(df['WIN_PCT_AWAY'])

    # Fundamental gaps
    df['win_pct_gap'] = df['HOME_LAST_N_WIN_PCT'] - df['AWAY_LAST_N_WIN_PCT']
    df['scoring_gap'] = df['HOME_LAST_N_AVG_PTS'] - df['AWAY_LAST_N_AVG_PTS_ALLOWED']
    df['defense_gap'] = df['AWAY_LAST_N_AVG_PTS_ALLOWED'] - df.get(
        'HOME_LAST_N_AVG_PTS_ALLOWED', default_home_pts_allowed
    )
    df['momentum_score'] = df['win_pct_gap'] + df['scoring_gap'] - df['defense_gap']

    # Assist-Rebound Ratios
    df['AST_REB_ratio_home'] = df['AST_home'] / (df['REB_home'] + eps)
    df['AST_REB_ratio_away'] = df['AST_away'] / (df['REB_away'] + eps)
    df['AST_REB_ratio_diff'] = df['AST_REB_ratio_home'] - df['AST_REB_ratio_away']

    # Shooting x AST synergy
    df['home_ast_shooting_synergy'] = df['AST_home'] * df['FG_PCT_home']
    df['away_ast_shooting_synergy'] = df['AST_away'] * df['FG_PCT_away']
    df['shooting_variance_gap'] = (df['FG_PCT_home'] - df['FG_PCT_away']) ** 2
    df['combined_shooting'] = df['FG_PCT_home'] * df['FG_PCT_away']

    # Higher-order nonlinear combos
    # NOTE: FG_PCT_home_x_AST_home duplicates home_ast_shooting_synergy; both
    # are kept so the output schema stays unchanged for downstream consumers.
    df['FG_PCT_home_x_AST_home'] = df['FG_PCT_home'] * df['AST_home']
    df['REB_home_minus_away'] = df['REB_home'] - df['REB_away']
    df['home_shooting_synergy'] = df['FG_PCT_home'] * df['AST_REB_ratio_home']
    df['away_shooting_synergy'] = df['FG_PCT_away'] * df['AST_REB_ratio_away']
    df['scoring_defense_ratio'] = df['scoring_gap'] / (df['defense_gap'] + eps)

    # Binary indicator flags
    df['high_ast_home'] = (df['AST_home'] > 25).astype(int)
    df['shooting_edge'] = (df['FG_PCT_home'] > df['FG_PCT_away']).astype(int)
    df['rebound_edge'] = (df['REB_home'] > df['REB_away']).astype(int)
    df['is_momentum_positive'] = (df['momentum_score'] > 0).astype(int)

    # Advanced nonlinearity
    df['momentum_log_gap'] = df['momentum_score'] * np.log1p(np.abs(df['win_pct_gap']))
    df['reb_gap_squared'] = (df['REB_home'] - df['REB_away']) ** 2
    df['ast_gap_squared'] = (df['AST_home'] - df['AST_away']) ** 2
    df['home_strength_ratio'] = df['WIN_PCT_HOME'] / (df['WIN_PCT_AWAY'] + eps)
    df['win_pct_gap_squared'] = df['win_pct_gap'] ** 2
    df['scoring_gap_squared'] = df['scoring_gap'] ** 2
    df['log_defense_gap'] = np.log1p(np.abs(df['defense_gap']))
    df['log_momentum_score'] = np.log1p(np.abs(df['momentum_score']))
    df['log_AST_home'] = np.log1p(df['AST_home'])
    df['squared_FG_PCT_home'] = df['FG_PCT_home'] ** 2
    # momentum_score contains point differentials, so this can grow very large.
    df['exp_momentum'] = np.exp(df['momentum_score'])
    df['reciprocal_defense_gap'] = 1 / (df['defense_gap'] + eps)
    df['asymmetry_shooting'] = np.abs(df['FG_PCT_home'] - df['FG_PCT_away'])

    # Interaction terms
    # NOTE: the two ast_shooting_synergy_* columns repeat the
    # *_ast_shooting_synergy columns above; kept for schema compatibility.
    df['momentum_win_interaction'] = df['momentum_score'] * df['win_pct_gap']
    df['ast_shooting_synergy_home'] = df['AST_home'] * df['FG_PCT_home']
    df['ast_shooting_synergy_away'] = df['AST_away'] * df['FG_PCT_away']
    df['reb_ast_synergy'] = df['REB_home'] * df['AST_REB_ratio_home']
    df['scoring_x_defense'] = df['scoring_gap'] * df['defense_gap']
    df['log_shooting_var_gap'] = np.log1p((df['FG_PCT_home'] - df['FG_PCT_away'])**2)
    df['win_pct_gap_cubed'] = df['win_pct_gap'] ** 3

    # Overall signal aggregation
    df['overall_team_form'] = (
        df['win_pct_gap'] +
        df['scoring_gap'] -
        df['defense_gap'] +
        df['home_shooting_synergy'] -
        df['away_shooting_synergy']
    )

    # Binning features: quartile codes; duplicate quantile edges are dropped,
    # so fewer than four distinct codes can result on low-variance columns.
    df['binned_momentum'] = pd.qcut(df['momentum_score'], q=4, labels=False, duplicates='drop')
    df['binned_fg_home'] = pd.qcut(df['FG_PCT_home'], q=4, labels=False, duplicates='drop')
    df['binned_reb_gap'] = pd.qcut(df['REB_home_minus_away'], q=4, labels=False, duplicates='drop')
    df['binned_log_ast'] = pd.qcut(df['log_AST_home'], q=4, labels=False, duplicates='drop')

    return df

In [125]:
# Load processed CSVs, engineer features for each one that has the required
# columns, and export the result per file.
# NOTE(review): `csv_files` is not defined in the cells visible here; since
# `.stem` / `.name` are used it is presumably an iterable of pathlib.Path
# objects -- confirm it is created by an earlier cell.
for file in csv_files:
    df = pd.read_csv(file)
    result = engineer_features(df)
    if result is None:
        # engineer_features returns None when a required column is missing.
        print(f"Skipped: {file.name} (no game structure)")
        continue

    name = file.stem
    processed_dfs[name] = result
    # This per-file export is written BEFORE the metadata / leaky columns are
    # dropped below. For the input `final_feature_dataset` its filename is
    # identical to the one the later save cell writes `df_all` to, so whichever
    # cell runs last determines what is on disk.
    result.to_csv(output_dir / f"engineered_{name}.csv", index=False)
    print(f"Processed: {name}, shape: {result.shape}")

# pd.concat raises an opaque "No objects to concatenate" on an empty input;
# fail with an actionable message instead.
if not processed_dfs:
    raise ValueError(
        "No input file contained the columns required by engineer_features; "
        "nothing to combine."
    )

df_all = pd.concat(processed_dfs.values(), ignore_index=True)
if 'GAME_ID' in df_all.columns:
    # Keep only rows that belong to an identifiable game.
    df_all = df_all[df_all['GAME_ID'].notna()]

# Drop unrelated player/team metadata
irrelevant_cols = [
    'PLAYER_NAME', 'PLAYER_ID', 'MIN', 'FGA', 'TEAM_CITY', 'TEAM_NAME',
    'ARENA', 'OWNER', 'COMMENT', 'HEADCOACH', 'DLEAGUEAFFILIATION',
    'LEAGUE_ID', 'SEASON_ID', 'TEAM_ABBREVIATION', 'TEAM', 'ROAD_RECORD',
    'HOME_RECORD', 'GENERALMANAGER', 'ABBREVIATION', 'MAX_YEAR', 'MIN_YEAR',
    'YEARFOUNDED', 'CITY', 'ARENACAPACITY', 'RETURNTOPLAY'
]
# Final scores are removed from the combined frame as leakage.
leaky_cols = ['PTS_home', 'PTS_away']

# errors="ignore" skips names that are not present, so no membership
# filtering is needed.
df_all = df_all.drop(columns=irrelevant_cols + leaky_cols, errors="ignore")


Processed: final_feature_dataset, shape: (26058, 104)
Skipped: games_details_processed.csv (no game structure)
Skipped: games_processed.csv (no game structure)
Skipped: games_with_rolling_metrics.csv (no game structure)
Skipped: players_processed.csv (no game structure)
Skipped: ranking_processed.csv (no game structure)
Skipped: teams_processed.csv (no game structure)
Skipped: team_metrics.csv (no game structure)


In [122]:
# Persist the combined, cleaned frame as CSV (row index omitted).
# NOTE: the loading loop writes a per-file export under this same filename
# for the input `final_feature_dataset`; this write replaces it.
final_path = output_dir.joinpath("engineered_final_feature_dataset.csv")
df_all.to_csv(final_path, index=False)
print(f"\n Final dataset saved to: {final_path}, shape: {df_all.shape}")


 Final dataset saved to: engineered_data\engineered_final_feature_dataset.csv, shape: (26058, 97)


In [123]:
# Visualise the engineered features of the main dataset: histogram grids
# (9 features per figure) and, for each target column that is present, a bar
# chart of absolute correlations. Figures are written to `figures_dir`.
if 'final_feature_dataset' in processed_dfs:
    main_df = processed_dfs['final_feature_dataset']

    # `all_new_features` is not defined in the cells visible here, so on a
    # fresh kernel the name may not exist. Use it when an earlier cell supplied
    # it; otherwise derive the engineered columns as "columns of the engineered
    # frame that are absent from the source CSV header".
    try:
        candidate_features = list(all_new_features)
    except NameError:
        source_csv = next(
            (path for path in csv_files if path.stem == 'final_feature_dataset'),
            None,
        )
        if source_csv is None:
            candidate_features = []
            print("Could not determine the engineered feature names; skipping plots.")
        else:
            # nrows=0 reads only the header row.
            original_cols = set(pd.read_csv(source_csv, nrows=0).columns)
            candidate_features = [c for c in main_df.columns if c not in original_cols]

    available_features = [f for f in candidate_features if f in main_df.columns]

    # Slicing in steps of 9 never yields an empty batch.
    feature_batches = [available_features[i:i+9] for i in range(0, len(available_features), 9)]

    for i, feature_batch in enumerate(feature_batches):
        fig = plt.figure(figsize=(15, 15))
        for j, feature in enumerate(feature_batch):
            ax = fig.add_subplot(3, 3, j + 1)
            try:
                sns.histplot(main_df[feature].dropna(), kde=True, ax=ax)
                ax.set_title(feature)
            except Exception as e:
                # Keep going with the remaining panels, but surface the reason.
                ax.set_title(f"{feature} (error plotting)")
                print(f"Could not plot {feature}: {e}")
        fig.tight_layout()
        fig.savefig(figures_dir / f"engineered_features_batch_{i+1}.png")
        plt.close(fig)  # release the figure; many are created in this loop

    target_variables = ['HOME_TEAM_WINS', 'PTS_home', 'PTS_away', 'home_win']
    for target in target_variables:
        if target not in main_df.columns or not available_features:
            continue
        try:
            corr_with_target = (
                main_df[available_features]
                .corrwith(main_df[target])
                .abs()
                .sort_values(ascending=False)
            )

            # Height grows with the number of features, capped at 20 inches.
            fig = plt.figure(figsize=(12, min(20, len(corr_with_target))))
            ax = fig.add_subplot(1, 1, 1)
            sns.barplot(x=corr_with_target.values, y=corr_with_target.index, ax=ax)
            ax.set_title(f'Engineered Features Correlation with {target}')
            fig.tight_layout()
            fig.savefig(figures_dir / f"engineered_feature_correlations_{target}.png")
            plt.close(fig)

            print(f"\nTop 10 engineered features by correlation with {target}:")
            for feature, corr in corr_with_target.head(10).items():
                print(f"- {feature}: {corr:.4f}")
        except Exception as e:
            print(f"Error calculating correlations with {target}: {e}")

print(f"\nAll engineered data files saved to {output_dir}")
print(f"Feature visualizations saved to {figures_dir}")


All engineered data files saved to engineered_data
Feature visualizations saved to figures\engineered_features
