This project has a few steps:
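First, we will use 4 feature selection methods: RF feature importance, XGB feature importance, Fisher score, and Chi-squared score. The first three each select their top 15 features; the Chi-squared method keeps every feature whose p-value is below 0.05.
Then, we will train 4 models with hyperparameter tuning (grid search with 5-fold cross-validation): Logistic Regression, Random Forest, XGBoost, and a Neural Network, each on every selected feature set.
Later, we will use bootstrapping to estimate the test-set accuracy (point estimate and confidence interval) of each tuned model.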

In [None]:
# imports
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from skfeature.function.similarity_based import fisher_score
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
# load the data
# The CSV is read from the current working directory.
current_dir = os.getcwd()
csv_file = os.path.join(current_dir, 'spotify-2023.csv')
try:
    # 'utf-8-sig' also strips a leading byte-order mark if the file has one.
    df = pd.read_csv(csv_file, encoding='utf-8-sig')
except UnicodeDecodeError:
    # latin1 (an alias of ISO-8859-1) maps every possible byte to a character,
    # so this fallback can never raise UnicodeDecodeError; a further
    # ISO-8859-1 retry would be unreachable and is therefore not attempted.
    df = pd.read_csv(csv_file, encoding='latin1')

In [None]:
# preprocess the data
# Missing-value policy, applied column by column:
#   - 'key'              -> the placeholder label 'Unknown'
#   - 'in_shazam_charts' -> 0
# No rows are dropped for 'streams': per the original author's note, that
# column contains no NaN values.
for column_name, fill_value in (('key', 'Unknown'), ('in_shazam_charts', 0)):
    df[column_name] = df[column_name].fillna(fill_value)

In [None]:
# Split the data into features and target
# Binarise the target: 1 when a track's stream count is strictly above the
# median of the 'streams' column, 0 otherwise.
median_streams = df['streams'].median()
df['streams'] = (df['streams'] > median_streams).astype('int64')

# Target vector
y = df['streams']

# Feature matrix: drop the target and the identifier columns, then one-hot
# encode whatever categorical columns remain.
X = pd.get_dummies(df.drop(columns=['streams', 'track_name', 'artist(s)_name']))

# Hold out 21% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.21, random_state=42
)

In [None]:
# Feature selection
# Method 1: rank features by the importances of a random forest fitted on the
# training split.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
feature_importances_rf = rf.feature_importances_

# Pair every column name with its importance, most important first
features = X_train.columns
feature_importances_rf_df = pd.DataFrame(
    {'feature': features, 'importance': feature_importances_rf}
).sort_values(by='importance', ascending=False)

# Horizontal bar chart of the ranking
plt.figure(figsize=(10, 8))
sns.barplot(data=feature_importances_rf_df, x='importance', y='feature')
plt.title('Feature Importances- Random Forest')
plt.show()

In [None]:
# Use XGBoost for feature selection
# The target is already 0/1, so these are plain aliases of the existing
# train/test labels (no conversion happens here).
y_train_XGB, y_test_XGB = y_train, y_test

# Fit an XGBoost classifier on the training split and read its importances
xgb = XGBClassifier(n_estimators=100, random_state=42)
xgb.fit(X_train, y_train_XGB)
feature_importances = xgb.feature_importances_

# Pair every column name with its importance, most important first
features = X_train.columns
feature_importances_df = pd.DataFrame(
    {'feature': features, 'importance': feature_importances}
).sort_values(by='importance', ascending=False)

# Horizontal bar chart of the ranking
plt.figure(figsize=(10, 8))
sns.barplot(data=feature_importances_df, x='importance', y='feature')
plt.title('Feature Importances- XGBoost')
plt.show()

In [None]:
# Use fisher score for feature selection
# Compute one value per feature from the raw numpy views of the training data.
# NOTE(review): some skfeature releases return feature-rank indices from
# fisher_score by default rather than raw scores -- confirm which one the
# installed version returns, since the values are treated as scores below.
feature_fisher_scores = fisher_score.fisher_score(X_train.values, y_train.values)

# Pair every column name with its value, highest first
feature_fisher_scores_df = pd.DataFrame(
    {'feature': features, 'score': feature_fisher_scores}
).sort_values(by='score', ascending=False)

# Horizontal bar chart of the ranking
plt.figure(figsize=(10, 8))
sns.barplot(data=feature_fisher_scores_df, x='score', y='feature')
plt.title('Fisher Scores')
plt.show()

In [None]:
# Use chi-squared score for feature selection
# chi2 returns (statistics, p-values); only the p-values are plotted here.
_, p_values = chi2(X_train, y_train)

# Table of p-values, smallest (most significant) first
p_values_df = pd.DataFrame(
    {'feature': features, 'p_value': p_values}
).sort_values(by='p_value', ascending=True)

# Horizontal bar chart of the p-values
plt.figure(figsize=(10, 8))
sns.barplot(data=p_values_df, x='p_value', y='feature')
plt.title('Chi-squared p-values')
plt.show()

In [None]:
# Use chi-squared score for feature selection
# chi2 returns the test statistics and the p-values together, so a single call
# provides both (the test is deterministic; there is no need to run it twice).
feature_scores, p_values = chi2(X_train, y_train)

# Table of chi-squared statistics, highest first
feature_scores_df = pd.DataFrame({'feature': features, 'score': feature_scores})
feature_scores_df = feature_scores_df.sort_values(by='score', ascending=False)

# Table of p-values, smallest (most significant) first
p_values_df = pd.DataFrame({'feature': features, 'p_value': p_values})
p_values_df = p_values_df.sort_values(by='p_value', ascending=True)

# Keep and show only the features that are significant at the 0.05 level;
# later cells use p_values_df['feature'] as the chi-squared feature set.
p_values_df = p_values_df[p_values_df['p_value'] < 0.05]
print(p_values_df)

In [None]:
# Save the top 15 features from each feature selection method
# Each ranking table is already sorted best-first, so the first 15 rows are
# the selected features. The matching test-set slice is prepared alongside.

# Random forest importances
top_15_rf_features = feature_importances_rf_df['feature'].head(15)
top_15_rf_features.to_csv('top_15_rf_features.csv', index=False)
rf_features_test = X_test[top_15_rf_features]

# XGBoost importances
top_15_xgb_features = feature_importances_df['feature'].head(15)
top_15_xgb_features.to_csv('top_15_xgb_features.csv', index=False)
xgb_features_test = X_test[top_15_xgb_features]

# Fisher scores
top_15_fisher_features = feature_fisher_scores_df['feature'].head(15)
top_15_fisher_features.to_csv('top_15_fisher_features.csv', index=False)
fisher_features_test = X_test[top_15_fisher_features]

# Chi-squared: not a top-15 cut -- every feature with p-value < 0.05 is kept
p_values_df.to_csv('p_values.csv', index=False)
chi2_features_test = X_test[p_values_df['feature']]

In [None]:
# Train the models
# Logistic Regression
# One 5-fold grid search (scored on accuracy) per feature-selection method.
# Plain, untuned instance; not used by the searches below but kept so the
# notebook-level name `log_reg` stays defined.
log_reg = LogisticRegression(random_state=42)

# The same search space is used for every feature set, so it is defined once.
log_reg_grid = {'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 1], 'max_iter': [100, 200]}

# Random-forest-selected features
log_reg_rf = LogisticRegression(random_state=42)
log_reg_rf_Grid = GridSearchCV(log_reg_rf, log_reg_grid, cv=5, verbose=2, scoring='accuracy')
log_reg_rf_features_grid = log_reg_rf_Grid.fit(X_train[top_15_rf_features], y_train)

# XGBoost-selected features
log_reg_XGB = LogisticRegression(random_state=42)
log_reg_XGB_Grid = GridSearchCV(log_reg_XGB, log_reg_grid, cv=5, verbose=2, scoring='accuracy')
log_reg_XGB_features_grid = log_reg_XGB_Grid.fit(X_train[top_15_xgb_features], y_train)

# Fisher-selected features
log_reg_Fisher = LogisticRegression(random_state=42)
log_reg_Fisher_Grid = GridSearchCV(log_reg_Fisher, log_reg_grid, cv=5, verbose=2, scoring='accuracy')
log_reg_Fisher_features_grid = log_reg_Fisher_Grid.fit(X_train[top_15_fisher_features], y_train)

# Chi-squared-selected features (all features with p-value < 0.05)
log_reg_Chi2 = LogisticRegression(random_state=42)
log_reg_Chi2_Grid = GridSearchCV(log_reg_Chi2, log_reg_grid, cv=5, verbose=2, scoring='accuracy')
log_reg_Chi2_features_grid = log_reg_Chi2_Grid.fit(X_train[p_values_df['feature']], y_train)

# Save the best hyperparameters for each feature selection method
best_hyperparameters = {'rf_features': log_reg_rf_features_grid.best_params_, 'XGB_features': log_reg_XGB_features_grid.best_params_,
                        'Fisher_features': log_reg_Fisher_features_grid.best_params_, 'Chi2_features': log_reg_Chi2_features_grid.best_params_}
# One row per feature set, one column per hyperparameter. Building the frame
# from the nested dict with index=[0] would look up label 0 among the
# parameter names and produce a single all-NaN row.
best_hyperparameters_df = pd.DataFrame(
    [{'feature_set': name, **params} for name, params in best_hyperparameters.items()]
)
# Model-specific file name so the other models' results do not overwrite it.
best_hyperparameters_df.to_csv('log_reg_best_hyperparameters.csv', index=False)

# save the best model for each feature selection method
log_reg_rf_features = log_reg_rf_features_grid.best_estimator_
log_reg_XGB_features = log_reg_XGB_features_grid.best_estimator_
log_reg_Fisher_features = log_reg_Fisher_features_grid.best_estimator_
log_reg_Chi2_features = log_reg_Chi2_features_grid.best_estimator_

In [None]:
# Show the tuned logistic regression estimator chosen for each feature set
# (one line per feature-selection method).
print(
    f'Logistic Regression with RF features: {log_reg_rf_features}',
    f'Logistic Regression with XGB features: {log_reg_XGB_features}',
    f'Logistic Regression with Fisher features: {log_reg_Fisher_features}',
    f'Logistic Regression with Chi2 features: {log_reg_Chi2_features}',
    sep='\n',
)

In [None]:
# Random Forest
# One 5-fold grid search (scored on accuracy) per feature-selection method.
# Plain, untuned instance; not used by the searches below. Note that this
# rebinds `rf`, which previously held the forest fitted for feature selection.
rf = RandomForestClassifier(random_state=42)

# The same search space is used for every feature set, so it is defined once.
rf_grid = {'n_estimators': [100, 300, 500], 'max_depth': [10, 20, 30, 40, None], 'min_samples_leaf': [1, 2, 4]}

# Random-forest-selected features
rf_rf = RandomForestClassifier(random_state=42)
rf_rf_Grid = GridSearchCV(rf_rf, rf_grid, cv=5, verbose=2, scoring='accuracy')
rf_rf_features_grid = rf_rf_Grid.fit(X_train[top_15_rf_features], y_train)

# XGBoost-selected features
rf_XGB = RandomForestClassifier(random_state=42)
rf_XGB_Grid = GridSearchCV(rf_XGB, rf_grid, cv=5, verbose=2, scoring='accuracy')
rf_XGB_features_grid = rf_XGB_Grid.fit(X_train[top_15_xgb_features], y_train)

# Fisher-selected features
rf_Fisher = RandomForestClassifier(random_state=42)
rf_Fisher_Grid = GridSearchCV(rf_Fisher, rf_grid, cv=5, verbose=2, scoring='accuracy')
rf_Fisher_features_grid = rf_Fisher_Grid.fit(X_train[top_15_fisher_features], y_train)

# Chi-squared-selected features (all features with p-value < 0.05)
rf_Chi2 = RandomForestClassifier(random_state=42)
rf_Chi2_Grid = GridSearchCV(rf_Chi2, rf_grid, cv=5, verbose=2, scoring='accuracy')
rf_Chi2_features_grid = rf_Chi2_Grid.fit(X_train[p_values_df['feature']], y_train)

# Save the best hyperparameters for each feature selection method
best_hyperparameters = {'rf_features': rf_rf_features_grid.best_params_, 'XGB_features': rf_XGB_features_grid.best_params_,
                        'Fisher_features': rf_Fisher_features_grid.best_params_, 'Chi2_features': rf_Chi2_features_grid.best_params_}
# One row per feature set, one column per hyperparameter. Building the frame
# from the nested dict with index=[0] would look up label 0 among the
# parameter names and produce a single all-NaN row.
best_hyperparameters_df = pd.DataFrame(
    [{'feature_set': name, **params} for name, params in best_hyperparameters.items()]
)
# Model-specific file name so the other models' results do not overwrite it.
best_hyperparameters_df.to_csv('rf_best_hyperparameters.csv', index=False)

# save the best model for each feature selection method
rf_rf_features = rf_rf_features_grid.best_estimator_
rf_XGB_features = rf_XGB_features_grid.best_estimator_
rf_Fisher_features = rf_Fisher_features_grid.best_estimator_
rf_Chi2_features = rf_Chi2_features_grid.best_estimator_

In [None]:
# Show the tuned random forest estimator chosen for each feature set
# (one line per feature-selection method).
print(
    f'Random Forest with RF features: {rf_rf_features}',
    f'Random Forest with XGB features: {rf_XGB_features}',
    f'Random Forest with Fisher features: {rf_Fisher_features}',
    f'Random Forest with Chi2 features: {rf_Chi2_features}',
    sep='\n',
)

In [None]:
# combine the chosen features in a dataframe
# Each input Series keeps the row labels of the table it was taken from, and
# concat(axis=1) aligns on those labels rather than on rank position; cells
# are NaN wherever a label is absent from a given method's selection.
# NOTE(review): if a rank-by-rank table was intended, the indices would need
# to be reset first -- confirm the desired layout.
chosen_features = pd.concat(
    [top_15_rf_features, top_15_xgb_features, top_15_fisher_features, p_values_df['feature']],
    axis=1,
    keys=['rf_features', 'XGB_features', 'Fisher_features', 'Chi2_features'],
)
chosen_features.to_csv('chosen_features.csv', index=False)
print(chosen_features)

In [None]:
# XGBoost
# One 5-fold grid search (scored on accuracy) per feature-selection method.
# Plain, untuned instance; not used by the searches below. Note that this
# rebinds `xgb`, which previously held the model fitted for feature selection.
xgb = XGBClassifier(random_state=42)

# The same search space is used for every feature set, so it is defined once.
xgb_grid = {'n_estimators': [100, 300, 500], 'max_depth': [3, 5, 7, 9],
            'learning_rate': [0.01, 0.05, 0.1, 0.2]}

# Random-forest-selected features
xgb_rf = XGBClassifier(random_state=42)
xgb_rf_Grid = GridSearchCV(xgb_rf, xgb_grid, cv=5, verbose=2, scoring='accuracy')
xgb_rf_features_grid = xgb_rf_Grid.fit(X_train[top_15_rf_features], y_train_XGB)

# XGBoost-selected features
xgb_XGB = XGBClassifier(random_state=42)
xgb_XGB_Grid = GridSearchCV(xgb_XGB, xgb_grid, cv=5, verbose=2, scoring='accuracy')
xgb_XGB_features_grid = xgb_XGB_Grid.fit(X_train[top_15_xgb_features], y_train_XGB)

# Fisher-selected features
xgb_Fisher = XGBClassifier(random_state=42)
xgb_Fisher_Grid = GridSearchCV(xgb_Fisher, xgb_grid, cv=5, verbose=2, scoring='accuracy')
xgb_Fisher_features_grid = xgb_Fisher_Grid.fit(X_train[top_15_fisher_features], y_train_XGB)

# Chi-squared-selected features (all features with p-value < 0.05)
xgb_Chi2 = XGBClassifier(random_state=42)
xgb_Chi2_Grid = GridSearchCV(xgb_Chi2, xgb_grid, cv=5, verbose=2, scoring='accuracy')
xgb_Chi2_features_grid = xgb_Chi2_Grid.fit(X_train[p_values_df['feature']], y_train_XGB)

# Save the best hyperparameters for each feature selection method
best_hyperparameters = {'rf_features': xgb_rf_features_grid.best_params_, 'XGB_features': xgb_XGB_features_grid.best_params_,
                        'Fisher_features': xgb_Fisher_features_grid.best_params_, 'Chi2_features': xgb_Chi2_features_grid.best_params_}
# One row per feature set, one column per hyperparameter. Building the frame
# from the nested dict with index=[0] would look up label 0 among the
# parameter names and produce a single all-NaN row.
best_hyperparameters_df = pd.DataFrame(
    [{'feature_set': name, **params} for name, params in best_hyperparameters.items()]
)
# Model-specific file name so the other models' results do not overwrite it.
best_hyperparameters_df.to_csv('xgb_best_hyperparameters.csv', index=False)

# save the best model for each feature selection method
xgb_rf_features = xgb_rf_features_grid.best_estimator_
xgb_XGB_features = xgb_XGB_features_grid.best_estimator_
xgb_Fisher_features = xgb_Fisher_features_grid.best_estimator_
xgb_Chi2_features = xgb_Chi2_features_grid.best_estimator_

In [None]:
# Show the tuned XGBoost estimator chosen for each feature set
# (one line per feature-selection method).
print(
    f'XGBoost with RF features: {xgb_rf_features}',
    f'XGBoost with XGB features: {xgb_XGB_features}',
    f'XGBoost with Fisher features: {xgb_Fisher_features}',
    f'XGBoost with Chi2 features: {xgb_Chi2_features}',
    sep='\n',
)

In [None]:
# Neural Network
# One 5-fold grid search (scored on accuracy) per feature-selection method.
# Plain, untuned instance; not used by the searches below but kept so the
# notebook-level name `nn` stays defined.
nn = MLPClassifier(random_state=42)

# The same search space is used for every feature set, so it is defined once.
nn_grid = {'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,), (20, 20, 20, 20)], 'activation': ['tanh', 'relu'],
           'solver': ['sgd', 'adam'], 'alpha': [0.001, 0.05]}

# Random-forest-selected features
nn_rf = MLPClassifier(random_state=42)
nn_rf_Grid = GridSearchCV(nn_rf, nn_grid, cv=5, verbose=2, scoring='accuracy')
nn_rf_features_grid = nn_rf_Grid.fit(X_train[top_15_rf_features], y_train)

# XGBoost-selected features
nn_XGB = MLPClassifier(random_state=42)
nn_XGB_Grid = GridSearchCV(nn_XGB, nn_grid, cv=5, verbose=2, scoring='accuracy')
nn_XGB_features_grid = nn_XGB_Grid.fit(X_train[top_15_xgb_features], y_train)

# Fisher-selected features
nn_Fisher = MLPClassifier(random_state=42)
nn_Fisher_Grid = GridSearchCV(nn_Fisher, nn_grid, cv=5, verbose=2, scoring='accuracy')
nn_Fisher_features_grid = nn_Fisher_Grid.fit(X_train[top_15_fisher_features], y_train)

# Chi-squared-selected features (all features with p-value < 0.05)
nn_Chi2 = MLPClassifier(random_state=42)
nn_Chi2_Grid = GridSearchCV(nn_Chi2, nn_grid, cv=5, verbose=2, scoring='accuracy')
nn_Chi2_features_grid = nn_Chi2_Grid.fit(X_train[p_values_df['feature']], y_train)

# Save the best hyperparameters for each feature selection method
best_hyperparameters = {'rf_features': nn_rf_features_grid.best_params_, 'XGB_features': nn_XGB_features_grid.best_params_,
                        'Fisher_features': nn_Fisher_features_grid.best_params_, 'Chi2_features': nn_Chi2_features_grid.best_params_}
# One row per feature set, one column per hyperparameter. Building the frame
# from the nested dict with index=[0] would look up label 0 among the
# parameter names and produce a single all-NaN row.
best_hyperparameters_df = pd.DataFrame(
    [{'feature_set': name, **params} for name, params in best_hyperparameters.items()]
)
# Model-specific file name so the other models' results do not overwrite it.
best_hyperparameters_df.to_csv('nn_best_hyperparameters.csv', index=False)

# save the best model for each feature selection method
nn_rf_features = nn_rf_features_grid.best_estimator_
nn_XGB_features = nn_XGB_features_grid.best_estimator_
nn_Fisher_features = nn_Fisher_features_grid.best_estimator_
nn_Chi2_features = nn_Chi2_features_grid.best_estimator_

In [None]:
# Show the tuned neural network estimator chosen for each feature set
# (one line per feature-selection method).
print(
    f'Neural Network with RF features: {nn_rf_features}',
    f'Neural Network with XGB features: {nn_XGB_features}',
    f'Neural Network with Fisher features: {nn_Fisher_features}',
    f'Neural Network with Chi2 features: {nn_Chi2_features}',
    sep='\n',
)

Testing the models to get a sense of their performance

In [None]:
# Use the test set to evaluate the models
# Logistic Regression
# For each feature set: predict on the held-out test split with the tuned
# model, then score the predictions. (The "_val_" names refer to this test
# split; there is no separate validation split.)
log_reg_rf_features_val_pred = log_reg_rf_features.predict(X_test[top_15_rf_features])
log_reg_rf_features_val_accuracy = accuracy_score(y_test, log_reg_rf_features_val_pred)

log_reg_XGB_features_val_pred = log_reg_XGB_features.predict(X_test[top_15_xgb_features])
log_reg_XGB_features_val_accuracy = accuracy_score(y_test, log_reg_XGB_features_val_pred)

log_reg_Fisher_features_val_pred = log_reg_Fisher_features.predict(X_test[top_15_fisher_features])
log_reg_Fisher_features_val_accuracy = accuracy_score(y_test, log_reg_Fisher_features_val_pred)

log_reg_Chi2_features_val_pred = log_reg_Chi2_features.predict(X_test[p_values_df['feature']])
log_reg_Chi2_features_val_accuracy = accuracy_score(y_test, log_reg_Chi2_features_val_pred)

# Collect the four accuracies and persist them as a one-row table
log_reg_val_accuracies = {
    'rf_features': log_reg_rf_features_val_accuracy,
    'XGB_features': log_reg_XGB_features_val_accuracy,
    'Fisher_features': log_reg_Fisher_features_val_accuracy,
    'Chi2_features': log_reg_Chi2_features_val_accuracy,
}
log_reg_val_accuracies_df = pd.DataFrame([log_reg_val_accuracies])
log_reg_val_accuracies_df.to_csv('log_reg_val_accuracies.csv', index=False)

# Report the feature set with the highest accuracy
log_reg_best_model = max(log_reg_val_accuracies, key=log_reg_val_accuracies.get)
print(f'The best logistic regression model is trained with {log_reg_best_model} features and has an accuracy of {log_reg_val_accuracies[log_reg_best_model]}')

In [None]:
# Random Forest
# For each feature set: predict on the held-out test split with the tuned
# model, then score the predictions. (The "_val_" names refer to this test
# split; there is no separate validation split.)
rf_rf_features_val_pred = rf_rf_features.predict(rf_features_test)
rf_rf_features_val_accuracy = accuracy_score(y_test, rf_rf_features_val_pred)

rf_XGB_features_val_pred = rf_XGB_features.predict(xgb_features_test)
rf_XGB_features_val_accuracy = accuracy_score(y_test, rf_XGB_features_val_pred)

rf_Fisher_features_val_pred = rf_Fisher_features.predict(fisher_features_test)
rf_Fisher_features_val_accuracy = accuracy_score(y_test, rf_Fisher_features_val_pred)

rf_Chi2_features_val_pred = rf_Chi2_features.predict(chi2_features_test)
rf_Chi2_features_val_accuracy = accuracy_score(y_test, rf_Chi2_features_val_pred)

# Collect the four accuracies and persist them as a one-row table
rf_val_accuracies = {
    'rf_features': rf_rf_features_val_accuracy,
    'XGB_features': rf_XGB_features_val_accuracy,
    'Fisher_features': rf_Fisher_features_val_accuracy,
    'Chi2_features': rf_Chi2_features_val_accuracy,
}
rf_val_accuracies_df = pd.DataFrame([rf_val_accuracies])
rf_val_accuracies_df.to_csv('rf_val_accuracies.csv', index=False)

# Report the feature set with the highest accuracy
rf_best_model = max(rf_val_accuracies, key=rf_val_accuracies.get)
print(f'The best random forest model is trained with {rf_best_model} features and has an accuracy of {rf_val_accuracies[rf_best_model]}')

In [None]:
# XGBoost
# For each feature set: predict on the held-out test split with the tuned
# model, then score the predictions. (The "_val_" names refer to this test
# split; there is no separate validation split.)
xgb_rf_features_val_pred = xgb_rf_features.predict(rf_features_test)
xgb_rf_features_val_accuracy = accuracy_score(y_test_XGB, xgb_rf_features_val_pred)

xgb_XGB_features_val_pred = xgb_XGB_features.predict(xgb_features_test)
xgb_XGB_features_val_accuracy = accuracy_score(y_test_XGB, xgb_XGB_features_val_pred)

xgb_Fisher_features_val_pred = xgb_Fisher_features.predict(fisher_features_test)
xgb_Fisher_features_val_accuracy = accuracy_score(y_test_XGB, xgb_Fisher_features_val_pred)

xgb_Chi2_features_val_pred = xgb_Chi2_features.predict(chi2_features_test)
xgb_Chi2_features_val_accuracy = accuracy_score(y_test_XGB, xgb_Chi2_features_val_pred)

# Collect the four accuracies and persist them as a one-row table
xgb_val_accuracies = {
    'rf_features': xgb_rf_features_val_accuracy,
    'XGB_features': xgb_XGB_features_val_accuracy,
    'Fisher_features': xgb_Fisher_features_val_accuracy,
    'Chi2_features': xgb_Chi2_features_val_accuracy,
}
xgb_val_accuracies_df = pd.DataFrame([xgb_val_accuracies])
xgb_val_accuracies_df.to_csv('xgb_val_accuracies.csv', index=False)

# Report the feature set with the highest accuracy
xgb_best_model = max(xgb_val_accuracies, key=xgb_val_accuracies.get)
print(f'The best XGBoost model is trained with {xgb_best_model} features and has an accuracy of {xgb_val_accuracies[xgb_best_model]}')

In [None]:
# Neural Network
# For each feature set: predict on the held-out test split with the tuned
# model, then score the predictions. (The "_val_" names refer to this test
# split; there is no separate validation split.)
nn_rf_features_val_pred = nn_rf_features.predict(rf_features_test)
nn_rf_features_val_accuracy = accuracy_score(y_test, nn_rf_features_val_pred)

nn_XGB_features_val_pred = nn_XGB_features.predict(xgb_features_test)
nn_XGB_features_val_accuracy = accuracy_score(y_test, nn_XGB_features_val_pred)

nn_Fisher_features_val_pred = nn_Fisher_features.predict(fisher_features_test)
nn_Fisher_features_val_accuracy = accuracy_score(y_test, nn_Fisher_features_val_pred)

nn_Chi2_features_val_pred = nn_Chi2_features.predict(chi2_features_test)
nn_Chi2_features_val_accuracy = accuracy_score(y_test, nn_Chi2_features_val_pred)

# Collect the four accuracies and persist them as a one-row table
nn_val_accuracies = {
    'rf_features': nn_rf_features_val_accuracy,
    'XGB_features': nn_XGB_features_val_accuracy,
    'Fisher_features': nn_Fisher_features_val_accuracy,
    'Chi2_features': nn_Chi2_features_val_accuracy,
}
nn_val_accuracies_df = pd.DataFrame([nn_val_accuracies])
nn_val_accuracies_df.to_csv('nn_val_accuracies.csv', index=False)

# Report the feature set with the highest accuracy
nn_best_model = max(nn_val_accuracies, key=nn_val_accuracies.get)
print(f'The best neural network model is trained with {nn_best_model} features and has an accuracy of {nn_val_accuracies[nn_best_model]}')

In [None]:
def bootstrap_accuracy(y_test, pred, num_iterations=1000, confidence_level=0.9, random_state=None):
    """
    Estimate the accuracy of a classification model using bootstrap resampling.

    Each iteration draws ``n`` positions with replacement from the paired
    (true label, predicted label) observations and records the fraction of
    matches. The point estimate is the mean of those fractions and the
    interval is taken from their empirical percentiles.

    Parameters:
        y_test (array-like): True class labels (numpy array, list or pandas
            Series; values are used positionally).
        pred (array-like): Predicted class labels, same shape as ``y_test``.
        num_iterations (int): Number of bootstrap iterations. Default is 1000.
        confidence_level (float): Confidence level for the confidence
            interval, between 0 and 1. Default is 0.9.
        random_state (int or None): Seed for reproducible resampling. When
            None (the default) numpy's global random state is used.

    Returns:
        tuple: ``(point_estimate, (lower_bound, upper_bound))``.

    Raises:
        ValueError: If the inputs are empty or differ in shape, if
            ``num_iterations`` is less than 1, or if ``confidence_level`` is
            outside [0, 1].
    """
    # Work on plain arrays so resampling is positional: indexing a pandas
    # Series with an integer array is label-based, and lists cannot be
    # indexed with an array at all.
    y_true = np.asarray(y_test)
    y_pred = np.asarray(pred)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_test and pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    n = y_true.shape[0] if y_true.ndim else 0
    if n == 0:
        raise ValueError("y_test and pred must contain at least one observation")
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")
    if not 0 <= confidence_level <= 1:
        raise ValueError("confidence_level must be between 0 and 1")

    # Use the global numpy state unless a seed was given; both expose randint.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    accuracies = np.empty(num_iterations)
    for i in range(num_iterations):
        indices = rng.randint(0, n, n)  # Bootstrap sample positions
        accuracies[i] = np.mean(y_true[indices] == y_pred[indices])

    # Point estimate: mean accuracy over all bootstrap samples
    point_estimate = np.mean(accuracies)

    # Percentile interval: cut alpha/2 from each tail of the bootstrap distribution
    alpha = 1 - confidence_level
    lower_percentile = alpha / 2 * 100
    upper_percentile = (1 - alpha / 2) * 100
    lower_bound = np.percentile(accuracies, lower_percentile)
    upper_bound = np.percentile(accuracies, upper_percentile)

    return point_estimate, (lower_bound, upper_bound)

# Example usage:
# Bootstrap the test-set accuracy of the logistic regression model trained on
# the RF-selected features. The labels are converted to a numpy array first;
# the predictions come straight from the model's predict call.
test, pred = np.array(y_test), log_reg_rf_features_val_pred
accuracy, confidence_interval = bootstrap_accuracy(test, pred)
print("Point estimate of accuracy:", accuracy)
print("Confidence interval:", confidence_interval)

In [None]:
# perform the bootstrap accuracy for all models
# From here on y_test is a plain numpy array.
y_test = np.array(y_test)

# Logistic Regression
# Bootstrap estimate (point estimate + confidence interval) per feature set
log_reg_rf_features_accuracy, log_reg_rf_features_confidence_interval = bootstrap_accuracy(
    y_test, log_reg_rf_features_val_pred)
log_reg_XGB_features_accuracy, log_reg_XGB_features_confidence_interval = bootstrap_accuracy(
    y_test, log_reg_XGB_features_val_pred)
log_reg_Fisher_features_accuracy, log_reg_Fisher_features_confidence_interval = bootstrap_accuracy(
    y_test, log_reg_Fisher_features_val_pred)
log_reg_Chi2_features_accuracy, log_reg_Chi2_features_confidence_interval = bootstrap_accuracy(
    y_test, log_reg_Chi2_features_val_pred)

# One report line per feature set
print(
    f'Logistic Regression with RF features: {log_reg_rf_features_accuracy} ({log_reg_rf_features_confidence_interval})',
    f'Logistic Regression with XGB features: {log_reg_XGB_features_accuracy} ({log_reg_XGB_features_confidence_interval})',
    f'Logistic Regression with Fisher features: {log_reg_Fisher_features_accuracy} ({log_reg_Fisher_features_confidence_interval})',
    f'Logistic Regression with Chi2 features: {log_reg_Chi2_features_accuracy} ({log_reg_Chi2_features_confidence_interval})',
    sep='\n',
)

In [None]:
# Random Forest
# Bootstrap estimate (point estimate + confidence interval) per feature set
rf_rf_features_accuracy, rf_rf_features_confidence_interval = bootstrap_accuracy(
    y_test, rf_rf_features_val_pred)
rf_XGB_features_accuracy, rf_XGB_features_confidence_interval = bootstrap_accuracy(
    y_test, rf_XGB_features_val_pred)
rf_Fisher_features_accuracy, rf_Fisher_features_confidence_interval = bootstrap_accuracy(
    y_test, rf_Fisher_features_val_pred)
rf_Chi2_features_accuracy, rf_Chi2_features_confidence_interval = bootstrap_accuracy(
    y_test, rf_Chi2_features_val_pred)

# One report line per feature set
print(
    f'Random Forest with RF features: {rf_rf_features_accuracy} ({rf_rf_features_confidence_interval})',
    f'Random Forest with XGB features: {rf_XGB_features_accuracy} ({rf_XGB_features_confidence_interval})',
    f'Random Forest with Fisher features: {rf_Fisher_features_accuracy} ({rf_Fisher_features_confidence_interval})',
    f'Random Forest with Chi2 features: {rf_Chi2_features_accuracy} ({rf_Chi2_features_confidence_interval})',
    sep='\n',
)

In [None]:
# XGBoost
# From here on y_test_XGB is a plain numpy array.
y_test_XGB = np.array(y_test_XGB)

# Bootstrap estimate (point estimate + confidence interval) per feature set
xgb_rf_features_accuracy, xgb_rf_features_confidence_interval = bootstrap_accuracy(
    y_test_XGB, xgb_rf_features_val_pred)
xgb_XGB_features_accuracy, xgb_XGB_features_confidence_interval = bootstrap_accuracy(
    y_test_XGB, xgb_XGB_features_val_pred)
xgb_Fisher_features_accuracy, xgb_Fisher_features_confidence_interval = bootstrap_accuracy(
    y_test_XGB, xgb_Fisher_features_val_pred)
xgb_Chi2_features_accuracy, xgb_Chi2_features_confidence_interval = bootstrap_accuracy(
    y_test_XGB, xgb_Chi2_features_val_pred)

# One report line per feature set
print(
    f'XGBoost with RF features: {xgb_rf_features_accuracy} ({xgb_rf_features_confidence_interval})',
    f'XGBoost with XGB features: {xgb_XGB_features_accuracy} ({xgb_XGB_features_confidence_interval})',
    f'XGBoost with Fisher features: {xgb_Fisher_features_accuracy} ({xgb_Fisher_features_confidence_interval})',
    f'XGBoost with Chi2 features: {xgb_Chi2_features_accuracy} ({xgb_Chi2_features_confidence_interval})',
    sep='\n',
)

In [None]:
# Neural Network
# Bootstrap estimate (point estimate + confidence interval) per feature set
nn_rf_features_accuracy, nn_rf_features_confidence_interval = bootstrap_accuracy(
    y_test, nn_rf_features_val_pred)
nn_XGB_features_accuracy, nn_XGB_features_confidence_interval = bootstrap_accuracy(
    y_test, nn_XGB_features_val_pred)
nn_Fisher_features_accuracy, nn_Fisher_features_confidence_interval = bootstrap_accuracy(
    y_test, nn_Fisher_features_val_pred)
nn_Chi2_features_accuracy, nn_Chi2_features_confidence_interval = bootstrap_accuracy(
    y_test, nn_Chi2_features_val_pred)

# One report line per feature set
print(
    f'Neural Network with RF features: {nn_rf_features_accuracy} ({nn_rf_features_confidence_interval})',
    f'Neural Network with XGB features: {nn_XGB_features_accuracy} ({nn_XGB_features_confidence_interval})',
    f'Neural Network with Fisher features: {nn_Fisher_features_accuracy} ({nn_Fisher_features_confidence_interval})',
    f'Neural Network with Chi2 features: {nn_Chi2_features_accuracy} ({nn_Chi2_features_confidence_interval})',
    sep='\n',
)

In [None]:
# Heading followed by the 15 features selected via random forest importance
print('Top 15 features for Random Forest:', top_15_rf_features, sep='\n')

In [None]:
# Heading followed by the 15 features selected via XGBoost importance
print('Top 15 features for XGBoost:', top_15_xgb_features, sep='\n')

In [None]:
# Heading followed by the 15 features selected via Fisher score
print('Top 15 features for Fisher Score:', top_15_fisher_features, sep='\n')

In [None]:
# Heading followed by the features whose chi-squared p-value is below 0.05
print('P-values under 0.05 in Chi-Squared test:', p_values_df['feature'], sep='\n')