In [1]:
import os
import pandas as pd
import numpy as np


main_directory = os.path.normpath(os.getcwd() + os.sep + os.pardir)
data_directory = os.path.join(main_directory, 'data_cleaning')

player_df = pd.read_csv(os.path.join(data_directory,'cleaned_player_stat_w_MVP_defensive_player.csv'),index_col=0)

player_df.head()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,Rank,Player,Position,Age,Team,Games,Games started,Minutes played per game,Field goals per game,Field goals attempt per game,...,Total rebounds per game,Assist per game,Steals per game,Blocks per game,Turn overs per game,Personal fouls per game,Points per game,Player-additional,MVP,Defensive Player of the Year
0,1,Mahmoud Abdul-Rauf,PG,31,VAN,41,0,486,120,246,...,25,76,9,1,26,50,266,abdulma02,0,0
1,2,Tariq Abdul-Wahad,SG,26,DEN,29,12,420,43,111,...,59,22,14,13,34,54,111,abdulta01,0,0
2,3,Shareef Abdur-Rahim,SF,24,VAN,81,81,3241,604,1280,...,735,250,90,77,231,238,1663,abdursh01,0,0
3,4,Cory Alexander,PG,27,ORL,26,0,227,18,56,...,25,36,16,0,25,29,52,alexaco01,0,0
4,5,Courtney Alexander,PG,23,TOT,65,24,1382,239,573,...,143,62,45,5,75,139,618,alexaco02,0,0


In [2]:
player_df.head()

Unnamed: 0,Rank,Player,Position,Age,Team,Games,Games started,Minutes played per game,Field goals per game,Field goals attempt per game,...,Total rebounds per game,Assist per game,Steals per game,Blocks per game,Turn overs per game,Personal fouls per game,Points per game,Player-additional,MVP,Defensive Player of the Year
0,1,Mahmoud Abdul-Rauf,PG,31,VAN,41,0,486,120,246,...,25,76,9,1,26,50,266,abdulma02,0,0
1,2,Tariq Abdul-Wahad,SG,26,DEN,29,12,420,43,111,...,59,22,14,13,34,54,111,abdulta01,0,0
2,3,Shareef Abdur-Rahim,SF,24,VAN,81,81,3241,604,1280,...,735,250,90,77,231,238,1663,abdursh01,0,0
3,4,Cory Alexander,PG,27,ORL,26,0,227,18,56,...,25,36,16,0,25,29,52,alexaco01,0,0
4,5,Courtney Alexander,PG,23,TOT,65,24,1382,239,573,...,143,62,45,5,75,139,618,alexaco02,0,0


### Lets build a random forest classifier for the player data and predict MVPs

In [3]:
player_df.shape

(13927, 33)

In [4]:
player_df.dropna(inplace=True)
player_df.shape

(13927, 33)

In [5]:
player_df.dtypes

Rank                                     int64
Player                                  object
Position                                object
Age                                      int64
Team                                    object
Games                                    int64
Games started                            int64
Minutes played per game                  int64
Field goals per game                     int64
Field goals attempt per game             int64
Field goal percent                     float64
3 point field goal per game              int64
3 point field goal attempt per game      int64
3 point field goal percentage          float64
2 point field goal per game              int64
2 point field goal attempt per game      int64
2 point field goal percentage          float64
Effective field goal percentage        float64
Free throws per game                     int64
Free throw attempt per game              int64
Free throw percentage                  float64
Offensive reb

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X = player_df.drop(columns=['Player', 'Team', 'Player-additional', 'MVP','Defensive Player of the Year'])

X = pd.get_dummies(X)

y = player_df['MVP']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3,5,8,10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_classifier = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

best_rf_classifier = RandomForestClassifier(**best_params, random_state=42)

best_rf_classifier.fit(X_train, y_train)

y_pred = best_rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Best Parameters: {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 0.997486986178424


### Lets build a random forest classifier for the player data and predict Best Defensive player

In [9]:
X = player_df.drop(columns=['Player', 'Team', 'Player-additional', 'MVP','Defensive Player of the Year'])

X = pd.get_dummies(X)

y = player_df['Defensive Player of the Year']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3,5,8,10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_classifier = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

best_rf_classifier = RandomForestClassifier(**best_params, random_state=42)

best_rf_classifier.fit(X_train, y_train)

y_pred = best_rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Best Parameters: {'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy: 0.9980254891401903
