In [None]:
import pandas as pd

from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_regression

# Feature Selection for the final models
## Loading in the dataset

In [None]:
# Load the cleaned BGG dataset from the local data folder; every later cell
# derives its working frames from `df`.
df = pd.read_excel('./data/BGG_Complete_Cleaned_Dataset.xlsx')

In [None]:
# Build the frame used for feature scoring without mutating `df`:
#   1. keep only rows that have no missing values,
#   2. replace the 'Domains' and 'Mechanics_Categories' columns with integer
#      category codes so that every column is numeric,
#   3. drop the individual domain / mechanic columns listed below.
# NOTE(review): category codes follow the order of the category labels, so
# correlation and F-test scores for the two encoded columns depend on an
# arbitrary ordering - confirm this is acceptable for the ranking.
df_selection = (
    df.dropna()
    .assign(
        Domains=lambda frame: frame['Domains'].astype('category').cat.codes,
        Mechanics_Categories=lambda frame: frame['Mechanics_Categories'].astype('category').cat.codes,
    )
    .drop(
        columns=[
            # domain columns
            'Strategy Games',
            'Abstract Games',
            'Thematic Games',
            'Party Games',
            'Wargames',
            'Customizable Games',
            'Children\'s Games',
            'Family Games',
            # shared catch-all column
            'Other',
            # mechanic columns
            'Action and Turn Management',
            'Resource Management',
            'Interaction and Conflict',
            'Game Progression and Mechanics',
            'Auxiliary Mechanics',
            'Narrative and Thematic',
            'Movement and Positioning',
            'Specialized Mechanics',
            'Strategic Elements',
        ]
    )
)

In [None]:
# Pairwise Pearson correlation between all columns of the scoring frame;
# the per-target cells below read one column of this matrix each.
correlation_matrix = df_selection.corr(method='pearson')

### Feature selection for Complexity Average
#### Correlation scores

In [None]:
# Correlation of every column with Complexity Average, strongest positive first.
correlation_matrix_complexity = (
    correlation_matrix
    .loc[:, 'Complexity Average']
    .sort_values(ascending=False)
)
correlation_matrix_complexity

#### F-Test scores

In [None]:
# Univariate F-test of each remaining column against Complexity Average.
y_complexity = df_selection['Complexity Average']
X_complexity = df_selection.drop(columns=['Complexity Average'])

f_scores_complexity, p_values_complexity = f_regression(X_complexity, y_complexity)

# One row per feature, highest F-score first.
f_test_results_complexity = pd.DataFrame(
    {
        'Feature': X_complexity.columns,
        'F-Score': f_scores_complexity,
        'P-Value': p_values_complexity,
    }
).sort_values(by='F-Score', ascending=False)

f_test_results_complexity

#### Mutual Information scores

In [None]:
# Mutual information between each remaining column and Complexity Average.
# The estimator receives plain NumPy arrays; random_state is fixed so the
# scores are repeatable between runs.
# NOTE(review): with a dense array, discrete_features='auto' treats every
# column as continuous, including the integer-coded 'Domains' and
# 'Mechanics_Categories' columns - confirm this is intended.
y_complexity_mi = df_selection['Complexity Average'].values
X_complexity_mi = df_selection.drop(columns=['Complexity Average']).values

mutual_info_complexity = mutual_info_regression(
    X_complexity_mi,
    y_complexity_mi,
    discrete_features='auto',
    n_neighbors=5,
    random_state=42,
)

# One row per feature, highest mutual information first.
mutual_info_results_complexity = pd.DataFrame(
    {
        'Feature': df_selection.drop(columns=['Complexity Average']).columns,
        'Mutual Information': mutual_info_complexity,
    }
).sort_values(by='Mutual Information', ascending=False)
mutual_info_results_complexity

#### Exporting the best fitted features for the Complexity Average

In [None]:
# Export the feature set for the Complexity Average model.
# The columns were chosen by hand after comparing the correlation, F-test and
# mutual information rankings; the columns dropped here are the ones judged
# least useful for this target.
# Rows with missing values are removed first. `dropna()` returns a new frame,
# so `df` itself is left untouched.
df_selected_features_complexity = df.dropna().drop(
    columns=[
        'Min Players',
        'Max Players',
        'Users Rated',
        'Owned Users',
        'Domains',
        'Mechanics_Categories',
    ]
)
df_selected_features_complexity.to_excel('./data/Selected_Features_Complexity_Average.xlsx', index=False)

### Feature selection for Rating Average & Owned Users
#### Correlation scores

In [None]:
# Correlation of every column with Rating Average, strongest positive first.
correlation_matrix_rating = (
    correlation_matrix
    .loc[:, 'Rating Average']
    .sort_values(ascending=False)
)
correlation_matrix_rating

In [None]:
# Correlation of every column with Owned Users, strongest positive first.
correlation_matrix_owned = (
    correlation_matrix
    .loc[:, 'Owned Users']
    .sort_values(ascending=False)
)
correlation_matrix_owned

#### F-Test scores

In [None]:
# Univariate F-test of each remaining column against Rating Average.
y_rating = df_selection['Rating Average']
X_rating = df_selection.drop(columns=['Rating Average'])

f_scores_rating, p_values_rating = f_regression(X_rating, y_rating)

# One row per feature, highest F-score first.
f_test_results_rating = pd.DataFrame(
    {
        'Feature': X_rating.columns,
        'F-Score': f_scores_rating,
        'P-Value': p_values_rating,
    }
).sort_values(by='F-Score', ascending=False)

f_test_results_rating

In [None]:
# Univariate F-test of each remaining column against Owned Users.
y_owned = df_selection['Owned Users']
X_owned = df_selection.drop(columns=['Owned Users'])

f_scores_owned, p_values_owned = f_regression(X_owned, y_owned)

# One row per feature, highest F-score first.
f_test_results_owned = pd.DataFrame(
    {
        'Feature': X_owned.columns,
        'F-Score': f_scores_owned,
        'P-Value': p_values_owned,
    }
).sort_values(by='F-Score', ascending=False)

f_test_results_owned

#### Mutual Information scores

In [None]:
# Mutual information between each remaining column and Rating Average.
# The estimator receives plain NumPy arrays; random_state is fixed so the
# scores are repeatable between runs.
# NOTE(review): with a dense array, discrete_features='auto' treats every
# column as continuous, including the integer-coded 'Domains' and
# 'Mechanics_Categories' columns - confirm this is intended.
y_rating_mi = df_selection['Rating Average'].values
X_rating_mi = df_selection.drop(columns=['Rating Average']).values

mutual_info_rating = mutual_info_regression(
    X_rating_mi,
    y_rating_mi,
    discrete_features='auto',
    n_neighbors=5,
    random_state=42,
)

# One row per feature, highest mutual information first.
mutual_info_results_rating = pd.DataFrame(
    {
        'Feature': df_selection.drop(columns=['Rating Average']).columns,
        'Mutual Information': mutual_info_rating,
    }
).sort_values(by='Mutual Information', ascending=False)
mutual_info_results_rating

In [None]:
# Mutual information between each remaining column and Owned Users.
# The estimator receives plain NumPy arrays; random_state is fixed so the
# scores are repeatable between runs.
# NOTE(review): with a dense array, discrete_features='auto' treats every
# column as continuous, including the integer-coded 'Domains' and
# 'Mechanics_Categories' columns - confirm this is intended.
y_owned_mi = df_selection['Owned Users'].values
X_owned_mi = df_selection.drop(columns=['Owned Users']).values

mutual_info_owned = mutual_info_regression(
    X_owned_mi,
    y_owned_mi,
    discrete_features='auto',
    n_neighbors=5,
    random_state=42,
)

# One row per feature, highest mutual information first.
mutual_info_results_owned = pd.DataFrame(
    {
        'Feature': df_selection.drop(columns=['Owned Users']).columns,
        'Mutual Information': mutual_info_owned,
    }
).sort_values(by='Mutual Information', ascending=False)
mutual_info_results_owned

#### Exporting the best fitted features for the owned users

In [None]:
# Export the feature set for the Owned Users model.
# The columns were picked by hand after comparing the correlation, F-test and
# mutual information rankings.
# NOTE(review): the columns are taken straight from `df` with no dropna(), so
# rows with missing values are written to the file, whereas the Complexity
# Average export removes them - confirm this difference is intended.
df_selected_features_owned_users = df[
    [
        # mechanic columns
        'Specialized Mechanics',
        'Interaction and Conflict',
        'Strategic Elements',
        'Other',
        'Game Progression and Mechanics',
        'Action and Turn Management',
        'Resource Management',
        'Auxiliary Mechanics',
        'Movement and Positioning',
        'Narrative and Thematic',
        # general columns, including the 'Owned Users' target
        'Min Age',
        'Owned Users',
        'Rating Average',
        'Users Rated',
        # domain columns
        'Strategy Games',
        'Thematic Games',
        'Wargames',
        'Family Games',
        'Customizable Games',
        'Children\'s Games',
        'Abstract Games',
        'Party Games',
    ]
]
df_selected_features_owned_users.to_excel('./data/Selected_Features_Owned_Users.xlsx', index=False)

#### Exporting the best fitted features for the average rating

In [None]:
# Export the feature set for the Rating Average model.
# The columns were picked by hand after comparing the correlation, F-test and
# mutual information rankings.
# NOTE(review): the columns are taken straight from `df` with no dropna(), so
# rows with missing values are written to the file, whereas the Complexity
# Average export removes them - confirm this difference is intended.
df_selected_features_rating_average = df[
    [
        # mechanic columns
        'Specialized Mechanics',
        'Interaction and Conflict',
        'Strategic Elements',
        'Other',
        'Game Progression and Mechanics',
        'Action and Turn Management',
        'Resource Management',
        'Auxiliary Mechanics',
        'Movement and Positioning',
        'Narrative and Thematic',
        # general columns, including the 'Rating Average' target
        'Min Age',
        'Rating Average',
        'Users Rated',
        'Play Time',
    ]
]
df_selected_features_rating_average.to_excel('./data/Selected_Features_Rating_Average.xlsx', index=False)