In [1]:
# Import the modules
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN

In [2]:
# Read the dataset
super_data_df = pd.read_csv("Resources/madness_label_final_data.csv")

# Drop unnecessary columns
X = super_data_df.drop(columns=["Years", "Madness", "Team"])
y = super_data_df["Madness"]

# Apply preprocessing to handle string columns
X['PTS'] = X['PTS'].str.replace(',', '').astype(float)
X['DRebs'] = X['DRebs'].str.replace(',', '').astype(float)
X['REB'] = X['REB'].str.replace(',', '').astype(float)
X['FGM_y'] = X['FGM_y'].str.replace(',', '').astype(float)
X['FGA_y'] = X['FGA_y'].str.replace(',', '').astype(float)
X['FTA'] = X['FTA'].str.replace(',', '').astype(float)

# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Apply ADASYN to handle class imbalance
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(random_state=1)
rf_model.fit(X_train_resampled, y_train_resampled)

# Make predictions
rf_prediction = rf_model.predict(X_test)

In [3]:
cm_imbalanced = confusion_matrix(y_test, rf_prediction)
cm_imbalanced_df = pd.DataFrame(cm_imbalanced)
cm_imbalanced_df

Unnamed: 0,0,1
0,216,56
1,59,82


In [4]:
# Print classification report 1
print(classification_report(y_test, rf_prediction))

              precision    recall  f1-score   support

         0.0       0.79      0.79      0.79       272
         1.0       0.59      0.58      0.59       141

    accuracy                           0.72       413
   macro avg       0.69      0.69      0.69       413
weighted avg       0.72      0.72      0.72       413



## Optimization 1: ADASYN & SelectFromModel Threshold ##

In [9]:
# Read the dataset
super_data_df = pd.read_csv("Resources/madness_label_final_data.csv")

# Drop unnecessary columns
X = super_data_df.drop(columns=["Years", "Madness", "Team"])
y = super_data_df["Madness"]

# Apply preprocessing to handle string columns
X['PTS'] = X['PTS'].str.replace(',', '').astype(float)
X['DRebs'] = X['DRebs'].str.replace(',', '').astype(float)
X['REB'] = X['REB'].str.replace(',', '').astype(float)
X['FGM_y'] = X['FGM_y'].str.replace(',', '').astype(float)
X['FGA_y'] = X['FGA_y'].str.replace(',', '').astype(float)
X['FTA'] = X['FTA'].str.replace(',', '').astype(float)

# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Apply ADASYN to handle class imbalance
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=1)

# Define hyperparameters for grid search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best model from grid search
best_rf_model = grid_search.best_estimator_

# Fit the best model on the resampled training data
best_rf_model.fit(X_train_resampled, y_train_resampled)

# Get feature importances
feature_importances = best_rf_model.feature_importances_

# Select features based on importance
sfm = SelectFromModel(best_rf_model, threshold=0.02)  # Adjust threshold as needed
X_train_selected = sfm.fit_transform(X_train_resampled, y_train_resampled)
X_test_selected = sfm.transform(X_test)

# Train the model with selected features
best_rf_model.fit(X_train_selected, y_train_resampled)

# Make predictions
rf_prediction = best_rf_model.predict(X_test_selected)

# Evaluate the model
print(classification_report(y_test, rf_prediction))

405 fits failed out of a total of 1215.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
202 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/jaxontige/anaconda3/envs/dev/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jaxontige/anaconda3/envs/dev/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/jaxontige/anaconda3/envs/dev/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/jaxontige/anaconda3/envs/dev/lib/python3.10/site-packages/sklearn/utils/

              precision    recall  f1-score   support

         0.0       0.79      0.78      0.78       272
         1.0       0.58      0.60      0.59       141

    accuracy                           0.71       413
   macro avg       0.68      0.69      0.68       413
weighted avg       0.72      0.71      0.72       413



In [12]:
cm_imbalanced = confusion_matrix(y_test, rf_prediction)
cm_imbalanced_df = pd.DataFrame(cm_imbalanced)
cm_imbalanced_df

Unnamed: 0,0,1
0,211,61
1,57,84


In [13]:
# Print classification report 2 
#(After Implementing SelectFromModel Threshold Optimization)
print(classification_report(y_test, rf_prediction))

              precision    recall  f1-score   support

         0.0       0.79      0.78      0.78       272
         1.0       0.58      0.60      0.59       141

    accuracy                           0.71       413
   macro avg       0.68      0.69      0.68       413
weighted avg       0.72      0.71      0.72       413



In [14]:
# Read the dataset
super_data_df = pd.read_csv("Resources/madness_label_final_data.csv")

# Drop unnecessary columns
X = super_data_df.drop(columns=["Years", "Madness", "Team"])
y = super_data_df["Madness"]

# Apply preprocessing to handle string columns
X['PTS'] = X['PTS'].str.replace(',', '').astype(float)
X['DRebs'] = X['DRebs'].str.replace(',', '').astype(float)
X['REB'] = X['REB'].str.replace(',', '').astype(float)
X['FGM_y'] = X['FGM_y'].str.replace(',', '').astype(float)
X['FGA_y'] = X['FGA_y'].str.replace(',', '').astype(float)
X['FTA'] = X['FTA'].str.replace(',', '').astype(float)

# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Apply ADASYN to handle class imbalance
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(random_state=1)
rf_model.fit(X_train_resampled, y_train_resampled)

# Initialize and train the logistic regression model
rf_model = RandomForestClassifier(class_weight={0: 1.1, 1: 0.9}, random_state=1)
rf_model.fit(X_train_resampled, y_train_resampled)

# Make predictions
rf_prediction = rf_model.predict(X_test)

In [15]:
cm_imbalanced = confusion_matrix(y_test, rf_prediction)
cm_imbalanced_df = pd.DataFrame(cm_imbalanced)
cm_imbalanced_df

Unnamed: 0,0,1
0,216,56
1,63,78


In [16]:
# Print classification report 3
# (After Implementing Class Weight Adjustments)
print(classification_report(y_test, rf_prediction))

              precision    recall  f1-score   support

         0.0       0.77      0.79      0.78       272
         1.0       0.58      0.55      0.57       141

    accuracy                           0.71       413
   macro avg       0.68      0.67      0.68       413
weighted avg       0.71      0.71      0.71       413

