In [1]:


import logging
import os
from datetime import datetime

import joblib
import matplotlib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

matplotlib.use('TkAgg')

import matplotlib.pyplot as plt
import seaborn as sns

from utility_functions import trim_set, get_nba_season_year
from team_mappings import team_mapping


In [9]:




# Build the training table: one merged player / MVP-vote / team frame per
# season, stacked into `train` and then passed through trim_set.
current_season_start_year = get_nba_season_year() - 1
print(f'Training with data up to {current_season_start_year}')

mvp_data_dir = '../Basketball Reference Stat Scraper/Awards/MVP'
player_data_dir = '../Basketball Reference Stat Scraper/player_stats/'
team_data_dir = '../Basketball Reference Stat Scraper/team_stats/'

# Team-table columns that keep their name; every other one gets a 'Team_' prefix.
unprefixed_team_cols = ('Team', 'Year', 'Tm')

full_data = []
for year in range(1980, current_season_start_year + 1):
    player_csv = os.path.join(player_data_dir, f'{year}_player_stats.csv')
    mvp_csv = os.path.join(mvp_data_dir, f'{year}_MVP.csv')
    team_csv = os.path.join(team_data_dir, f'{year}_tm_stats.csv')

    player_data = pd.read_csv(player_csv, index_col=0)
    mvp_data = pd.read_csv(mvp_csv, index_col=0)
    team_data = pd.read_csv(team_csv, index_col=0)

    # Drop the '*' markers from team names, then derive the 'Tm' join key
    # from the cleaned name via team_mapping.
    team_data['Team'] = team_data['Team'].str.strip('*')
    team_data['Tm'] = team_data['Team'].map(team_mapping)

    team_prefixes = {}
    for column in team_data.columns:
        if column not in unprefixed_team_cols:
            team_prefixes[column] = f'Team_{column}'
    team_data = team_data.rename(columns=team_prefixes)

    # Pass 'Tm' through team_mapping in the player and MVP tables; values
    # with no mapping entry are left as they were.
    player_data['Tm'] = player_data['Tm'].map(team_mapping).fillna(player_data['Tm'])
    mvp_data['Tm'] = mvp_data['Tm'].map(team_mapping).fillna(mvp_data['Tm'])

    # Left-join MVP voting onto the players (joined on every column the two
    # tables share); players without an MVP row end up with zeros.
    players_clean = player_data.assign(Player=player_data['Player'].str.strip('*'))
    merged_data = players_clean.merge(mvp_data, how='left').fillna(0)

    # Left-join the team table on (Tm, Year); unmatched rows get zeros.
    merged_data = merged_data.merge(team_data, on=['Tm', 'Year'], how='left').fillna(0)

    full_data.append(merged_data)

train = pd.concat(full_data)
df = trim_set(train)

# Scratch expression left disabled: df['G'] / df['Team_G']

# Display the rows whose Team_G is 0. Because of the fillna(0) above, this
# includes player rows for which the team merge found no matching team.
df.loc[df['Team_G'] == 0]
# feature_list = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
#                 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PER', 'TS%', '3PAr', 'FTr',
#                 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM',
#                 'DBPM', 'BPM', 'VORP',
#
#                 'Team_FG', 'Team_FGA', 'Team_FG%',
#                 'Team_3P', 'Team_3PA', 'Team_3P%', 'Team_2P', 'Team_2PA', 'Team_2P%',
#                 'Team_FT', 'Team_FTA', 'Team_FT%', 'Team_ORB', 'Team_DRB', 'Team_TRB',
#                 'Team_AST', 'Team_STL', 'Team_BLK', 'Team_TOV', 'Team_PF', 'Team_PTS',
#                 'Team_Age', 'Team_W', 'Team_L', 'Team_MOV', 'Team_SOS', 'Team_SRS',
#                 'Team_ORtg', 'Team_DRtg', 'Team_NRtg', 'Team_Pace', 'Team_FTr',
#                 'Team_3PAr', 'Team_TS%',
#
#                 ]









Training with data up to 2023


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Team_MOV,Team_SOS,Team_SRS,Team_ORtg,Team_DRtg,Team_NRtg,Team_Pace,Team_FTr,Team_3PAr,Team_TS%
68,Walter Davis,SF,25,PHO,75,0,30.8,8.8,15.5,0.563,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83,Alex English,SF,26,TOT,78,0,30.8,7.1,14.3,0.497,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
165,Bob Lanier,C,31,TOT,63,0,33.8,7.4,13.8,0.537,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
194,Calvin Natt,SF,23,TOT,78,0,36.6,8.0,16.6,0.479,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
221,Truck Robinson,PF,28,PHO,82,0,33.0,6.6,13.0,0.512,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,RaiQuan Gray,PF,23,BRK,1,0,35.0,6.0,12.0,0.500,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
229,Kyrie Irving,PG,30,TOT,60,60,37.4,9.9,20.1,0.494,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
242,Cameron Johnson,PF,26,TOT,42,41,28.5,5.3,11.3,0.470,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
425,D'Angelo Russell,PG,26,TOT,71,71,32.5,6.3,13.4,0.469,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
        x = df[feature_list]
        y = df['Share']
        pipe = Pipeline([
            ('feat_selection', SelectPercentile(percentile=60)),
            ('scaler', MinMaxScaler()),
            ('reg', RandomForestRegressor(n_jobs=-1))
        ])

        pipe.fit(x, y)
        print(os.getcwd())
        filename = f'../Models/MVP/{get_nba_season_year()}/MVP_Predictions_V2_{datetime.now().strftime("%Y%m%d")}.pkl'
        joblib.dump(pipe, filename)

        logging.info("Model training completed and model saved.")

        import seaborn as sns

        # After fitting the pipeline
        support = pipe.named_steps['feat_selection'].get_support()
        selected_features = [feature for feature, selected in zip(feature_list, support) if selected]

        # Extract feature importances from the fitted model
        feature_importances = pipe.named_steps['reg'].feature_importances_

        # Create a DataFrame with feature names and their importance scores
        importances_df = pd.DataFrame({'Feature': selected_features, 'Importance': feature_importances})

        # Sort the feature importances in descending order
        importances_df = importances_df.sort_values(by='Importance', ascending=False)

        # Plotting using Seaborn
        plt.figure(figsize=(10, 6))
        sns.barplot(x='Importance', y='Feature', data=importances_df.head(20))  # Adjust number as needed
        plt.title('Top 20 Feature Importances')
        plt.xlabel('Importance Score')
        plt.ylabel('Features')
        plt.show()


    except Exception as e:
        logging.error(f"An error occurred: {e}")
