In [19]:
from pscore_data_reader_preproc import read_and_process_data, model_evaluation
import xgboost as xgb
import pandas as pd

In [5]:
# Read in the data using the pre-processing code
x_train, y_train, x_test, y_test = read_and_process_data(
     r"E:\github_repos\Private_Projects\NCAA_FBS_AP_Ranking_Predictions\python_ap\scripts_and_data\data\score_pred_train_data.csv",
     r"E:\github_repos\Private_Projects\NCAA_FBS_AP_Ranking_Predictions\python_ap\scripts_and_data\data\score_pred_test_data.csv",
     True
)

# check to make sure the data looks right
x_train.shape
x_test.shape

train data has shape: (14648, 437)
test data has shape: (132, 437)


(132, 437)

In [11]:
# Set up matrix for train and test
trainm = xgb.DMatrix(x_train, label=y_train)
testm = xgb.DMatrix(x_test, label=y_test)

In [15]:
# Define the parameters for XGBoost - can adjust this as needed
params = {
    'objective': 'reg:squarederror',  # For regression problem
    'eval_metric': 'rmse',  # Evaluation metric
    'max_depth': 6,  # Max depth of trees
    'learning_rate': 0.1
}

In [21]:
# Train the XGBoost model
model = xgb.train(params, trainm, num_boost_round=100)

In [25]:
# Extract feature importance for different types
importance_weight = model.get_score(importance_type='weight')
importance_gain = model.get_score(importance_type='gain')
importance_cover = model.get_score(importance_type='cover')

# Convert dictionaries to DataFrames
df_weight = pd.DataFrame(importance_weight.items(), columns=['Feature', 'Weight'])
df_gain = pd.DataFrame(importance_gain.items(), columns=['Feature', 'Gain'])
df_cover = pd.DataFrame(importance_cover.items(), columns=['Feature', 'Cover'])

# Merge the DataFrames into a single DataFrame
importance_df = df_weight.merge(df_gain, on='Feature', how='outer').merge(df_cover, on='Feature', how='outer')

# Fill NaN values with 0 if the columns are not available for a specific feature
importance_df = importance_df.fillna(0)

# Calculate correlation between the features and the dependent variable (y_train)
correlations = x_train.copy()  # Copy the feature matrix
correlations['y'] = y_train  # Add the target variable to the dataset
correlation_with_dv = correlations.corr()['y'].drop('y')  # Correlation of all features with y

# Add the correlation values to the importance DataFrame
importance_df['Correlation_with_DV'] = importance_df['Feature'].map(correlation_with_dv).fillna(0)

# Save the DataFrame to a CSV file
importance_df.to_csv('feature_importance_with_correlation.csv', index=False)
