In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.base import clone
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder



In [2]:
# Load the dataset and turn every column into a numeric feature.
data = pd.read_csv('dataset.csv')
data['BPM'] = data['BPM'].round(2)

# Nominal columns -> integer codes. fit_transform refits the encoder on each
# column it is applied to, so one LabelEncoder instance is enough here.
label_cols = ['While working', 'Instrumentalist', 'Composer', 'Exploratory', 'Foreign languages', 'Fav genre']
data[label_cols] = data[label_cols].apply(LabelEncoder().fit_transform)

# Ordered categories (lowest to highest) for the ordinal columns
frequency_order = ['Never', 'Rarely', 'Sometimes', 'Very frequently']
effect_order = ['Worsen', 'No effect', 'Improve']

freq_cols = [
    'Frequency [Classical]', 'Frequency [Country]', 'Frequency [EDM]', 'Frequency [Folk]',
    'Frequency [Gospel]', 'Frequency [Hip hop]', 'Frequency [Jazz]', 'Frequency [K pop]',
    'Frequency [Latin]', 'Frequency [Lofi]', 'Frequency [Metal]', 'Frequency [Pop]',
    'Frequency [R&B]', 'Frequency [Rap]', 'Frequency [Rock]', 'Frequency [Video game music]'
]

# Ordinal encoding. OrdinalEncoder needs one category list per column; the
# count is taken from freq_cols so the two cannot drift apart when a
# frequency column is added or removed.
encoder1 = OrdinalEncoder(categories=[frequency_order] * len(freq_cols))
encoder2 = OrdinalEncoder(categories=[effect_order])
data[freq_cols] = encoder1.fit_transform(data[freq_cols])
data[['Music effects']] = encoder2.fit_transform(data[['Music effects']])

# Features are every column except the target; the target is the encoded
# 'Music effects' column.
X = np.array(data.drop('Music effects', axis=1))
y = np.array(data['Music effects'])

# SMOTE oversampling towards the requested per-class sample counts.
# NOTE(review): resampling (here) and scaling (below) are fit on the whole
# dataset before any fold split, so folds drawn from `kf` afterwards contain
# synthetic rows and scaling statistics derived from their own held-out rows.
# Cross-validated scores computed on this X/y are therefore likely optimistic;
# consider moving both steps into an imblearn Pipeline evaluated per fold.
sampling_strategy = {0: 500, 1: 500, 2: 600}
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X, y = smote.fit_resample(X, y)

# Shared 5-fold splitter (shuffled, fixed seed) for cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)


# Random Forest

In [4]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Feature selection: keep the columns whose ExtraTrees importance passes
# SelectFromModel's default threshold.
# NOTE(review): the selector is fit on every row of X before cross-validation,
# so the chosen features have already seen each fold's held-out rows.
importance_ranker = ExtraTreesClassifier(n_estimators=100, random_state=42)
selector = SelectFromModel(importance_ranker)
X_selected = selector.fit_transform(X, y)

# Random forest search space: 3 * 4 * 3 * 3 * 3 * 2 = 648 candidates
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[6, 8, 10, 12],
    min_samples_split=[10, 15, 20],
    min_samples_leaf=[5, 10, 15],
    max_features=['sqrt', 'log2', None],
    class_weight=['balanced', None],
)

# oob_score=True so the fitted forest exposes `oob_score_`
rf_model = RandomForestClassifier(random_state=42, oob_score=True)

# Exhaustive search over the grid, scored by accuracy on the shared folds
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=kf,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_selected, y)

# Winning configuration
best_rf_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Best Parameters: {'class_weight': 'balanced', 'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 200}


In [5]:
# K-fold evaluation of the tuned random forest.
# Each fold trains a fresh clone (same hyperparameters, unfitted) so that
# `best_rf_model` itself is not overwritten by a model trained on a single
# fold's training split.
train_accuracies = []
test_accuracies = []
confusion_matrices = []

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train a per-fold copy of the best configuration
    fold_model = clone(best_rf_model)
    fold_model.fit(X_train, y_train)

    # Accuracy on the rows the fold model was trained on
    train_pred = fold_model.predict(X_train)
    train_accuracy = accuracy_score(y_train, train_pred)
    train_accuracies.append(train_accuracy)

    # Accuracy on the held-out rows
    y_pred = fold_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    test_accuracies.append(test_accuracy)

    # Per-fold classification report and confusion matrix
    print(f'\nTest Accuracy: {test_accuracy:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])
    confusion_matrices.append(cm)
    print(f'Confusion Matrix for this Fold:\n{cm}')

# Element-wise mean of the per-fold confusion matrices
avg_confusion_matrix = np.mean(confusion_matrices, axis=0)
print(f'Average Confusion Matrix:\n{avg_confusion_matrix}')

# A large gap between these two averages indicates overfitting
avg_train_accuracy = np.mean(train_accuracies)
avg_test_accuracy = np.mean(test_accuracies)
print(f'\nAverage Training Accuracy: {avg_train_accuracy:.4f}')
print(f'Average Test Accuracy: {avg_test_accuracy:.4f}')

# Out-of-bag score of `best_rf_model` as it was fitted before this cell; the
# per-fold clones above leave that fit untouched, so the value no longer
# describes only the last fold's model.
oob_score = best_rf_model.oob_score_
print(f'OOB Accuracy: {oob_score:.4f}')


Test Accuracy: 0.8469
Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.97        93
         1.0       0.80      0.67      0.73        96
         2.0       0.80      0.88      0.84       131

    accuracy                           0.85       320
   macro avg       0.85      0.84      0.85       320
weighted avg       0.85      0.85      0.84       320

Confusion Matrix for this Fold:
[[ 92   0   1]
 [  4  64  28]
 [  0  16 115]]

Test Accuracy: 0.8656
Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92        87
         1.0       0.87      0.74      0.80       105
         2.0       0.85      0.90      0.87       128

    accuracy                           0.87       320
   macro avg       0.87      0.87      0.87       320
weighted avg       0.87      0.87      0.86       320

Confusion Matrix for this Fold:
[[ 84   1   2]
 [  9  78  18]
 [  2 

# SVM

In [7]:
from sklearn.svm import SVC

# Feature selection: recursive feature elimination driven by a linear-kernel
# SVC, dropping one feature per step until 10 remain.
# NOTE(review): the selector is fit on every row of X before cross-validation,
# so the chosen features have already seen each fold's held-out rows.
selector = RFE(
    estimator=SVC(kernel='linear', random_state=42),
    n_features_to_select=10,
    step=1,
)
X_selected = selector.fit_transform(X, y)

# SVM search space: 4 values of C x 2 kernels = 8 candidates
# ('poly' kernel left out for simplicity)
param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    gamma=['scale'],
    kernel=['linear', 'rbf'],
    class_weight=['balanced'],
)

# Estimator that is actually tuned; probability=True enables predict_proba
svm_model = SVC(probability=True, random_state=42)

# Exhaustive search over the grid, scored by accuracy on the shared folds
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    cv=kf,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_selected, y)

# Winning configuration
best_svm_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Parameters: {'C': 10, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf'}


In [8]:
# K-fold evaluation of the tuned SVM.
# cross_val_score clones the estimator internally and gives the headline scores.
scores = cross_val_score(best_svm_model, X_selected, y, cv=kf, scoring='accuracy')
print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean():.4f}')


# Manual pass over the same folds for per-fold reports. Each fold trains a
# fresh clone (same hyperparameters, unfitted) so that `best_svm_model` itself
# is not overwritten by a model trained on a single fold's training split.
confusion_matrices = []
train_accuracies = []
test_accuracies = []
for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train a per-fold copy of the best configuration
    fold_model = clone(best_svm_model)
    fold_model.fit(X_train, y_train)

    # Predict on the held-out rows
    y_pred = fold_model.predict(X_test)

    # Accuracy on the training rows and on the held-out rows
    train_accuracy = fold_model.score(X_train, y_train)
    test_accuracy = accuracy_score(y_test, y_pred)
    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)

    print(f'\nFold Test Accuracy: {test_accuracy:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

    # Confusion matrix for this fold, with a fixed label order
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])
    confusion_matrices.append(cm)
    print(f'Confusion Matrix for this Fold:\n{cm}')

# Element-wise mean of the per-fold confusion matrices
avg_confusion_matrix = np.mean(confusion_matrices, axis=0)
print(f'Average Confusion Matrix:\n{avg_confusion_matrix}')

# A large gap between these two averages indicates overfitting
avg_train_accuracy = np.mean(train_accuracies)
avg_test_accuracy = np.mean(test_accuracies)
print(f'Average Train Accuracy: {avg_train_accuracy:.4f}')
print(f'Average Test Accuracy: {avg_test_accuracy:.4f}')




Cross-Validation Scores: [0.79375  0.79375  0.85625  0.809375 0.775   ]
Mean Cross-Validation Score: 0.8056

Fold Test Accuracy: 0.7937
Classification Report:
              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97        93
         1.0       0.65      0.71      0.68        96
         2.0       0.79      0.71      0.75       131

    accuracy                           0.79       320
   macro avg       0.80      0.81      0.80       320
weighted avg       0.79      0.79      0.79       320

Confusion Matrix for this Fold:
[[93  0  0]
 [ 4 68 24]
 [ 2 36 93]]

Fold Test Accuracy: 0.7937
Classification Report:
              precision    recall  f1-score   support

         0.0       0.90      1.00      0.95        87
         1.0       0.72      0.75      0.73       105
         2.0       0.78      0.69      0.73       128

    accuracy                           0.79       320
   macro avg       0.80      0.81      0.80       320
weighted avg  

# Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

# Base estimator, shared by feature selection and the grid search below
logistic_model = LogisticRegression(random_state=42)

# Feature selection: recursive feature elimination, dropping one feature per
# step until 10 remain.
# NOTE(review): the selector is fit on every row of X before cross-validation,
# so the chosen features have already seen each fold's held-out rows.
selector = RFE(
    estimator=logistic_model,
    n_features_to_select=10,
    step=1,
)
X_selected = selector.fit_transform(X, y)

# Logistic regression search space: only C varies (4 candidates)
# NOTE(review): recent scikit-learn releases appear to deprecate 'liblinear'
# for multiclass targets - confirm against the installed version.
param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l2'],
    solver=['liblinear'],
    class_weight=['balanced'],
)

# Exhaustive search over the grid, scored by accuracy on the shared folds
grid_search = GridSearchCV(
    estimator=logistic_model,
    param_grid=param_grid,
    cv=kf,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_selected, y)

# Winning configuration
best_logistic_model = grid_search.best_estimator_
print(f'Best Parameters: {grid_search.best_params_}')

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Parameters: {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}


In [11]:
# K-fold evaluation of the tuned logistic regression.
# cross_val_score clones the estimator internally and gives the headline scores.
scores = cross_val_score(best_logistic_model, X_selected, y, cv=kf, scoring='accuracy')
print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean():.4f}')

# Per-fold confusion matrices and train/test accuracies
confusion_matrices = []
train_accuracies = []
test_accuracies = []

# Manual pass over the same folds for per-fold reports. Each fold trains a
# fresh clone (same hyperparameters, unfitted) so that `best_logistic_model`
# itself is not overwritten by a model trained on a single fold's training split.
for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train a per-fold copy of the best configuration
    fold_model = clone(best_logistic_model)
    fold_model.fit(X_train, y_train)

    # Predict on the held-out rows
    y_pred = fold_model.predict(X_test)

    # Accuracy on the held-out rows and on the training rows
    test_accuracy = accuracy_score(y_test, y_pred)
    train_accuracy = accuracy_score(y_train, fold_model.predict(X_train))
    test_accuracies.append(test_accuracy)
    train_accuracies.append(train_accuracy)
    print(f'\nTest Accuracy: {test_accuracy:.4f}')
    print(f'Training Accuracy: {train_accuracy:.4f}')

    print('Classification Report:')
    print(classification_report(y_test, y_pred))

    # Confusion matrix for this fold, with a fixed label order
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])
    confusion_matrices.append(cm)
    print(f'Confusion Matrix for this Fold:\n{cm}')

# Element-wise mean of the per-fold confusion matrices
avg_confusion_matrix = np.mean(confusion_matrices, axis=0)
print(f"Average Confusion Matrix:\n{avg_confusion_matrix}")

# A large gap between these two averages indicates overfitting
avg_train_accuracy = np.mean(train_accuracies)
avg_test_accuracy = np.mean(test_accuracies)
print(f'\nAverage Training Accuracy: {avg_train_accuracy:.4f}')
print(f'Average Test Accuracy: {avg_test_accuracy:.4f}')


Cross-Validation Scores: [0.66875  0.6375   0.646875 0.69375  0.60625 ]
Mean Cross-Validation Score: 0.6506

Test Accuracy: 0.6687
Training Accuracy: 0.6641
Classification Report:
              precision    recall  f1-score   support

         0.0       0.73      0.91      0.81        93
         1.0       0.52      0.44      0.47        96
         2.0       0.71      0.66      0.69       131

    accuracy                           0.67       320
   macro avg       0.65      0.67      0.66       320
weighted avg       0.66      0.67      0.66       320

Confusion Matrix for this Fold:
[[85  7  1]
 [19 42 35]
 [12 32 87]]

Test Accuracy: 0.6375
Training Accuracy: 0.6766
Classification Report:
              precision    recall  f1-score   support

         0.0       0.67      0.98      0.80        87
         1.0       0.59      0.42      0.49       105
         2.0       0.62      0.59      0.60       128

    accuracy                           0.64       320
   macro avg       0.63   

# KNN

In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Feature selection: recursive feature elimination ranked by an
# ExtraTreesClassifier, dropping one feature per step until 10 remain.
# NOTE(review): the selector is fit on every row of X before cross-validation,
# so the chosen features have already seen each fold's held-out rows.
tree_ranker = ExtraTreesClassifier(n_estimators=100, random_state=42)
selector = RFE(estimator=tree_ranker, n_features_to_select=10, step=1)
X_selected = selector.fit_transform(X, y)

# KNN search space: 4 * 2 * 2 = 16 candidates
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

knn_model = KNeighborsClassifier()

# Exhaustive search over the grid, scored by accuracy on the shared folds
grid_search = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid,
    cv=kf,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_selected, y)

# Winning configuration
best_knn_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}


In [14]:
# K-fold evaluation of the tuned KNN classifier.
# cross_val_score clones the estimator internally and gives the headline scores.
scores = cross_val_score(best_knn_model, X_selected, y, cv=kf, scoring='accuracy')
print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean():.4f}')

# Per-fold confusion matrices and held-out accuracies
confusion_matrices = []
accuracies = []

# Manual pass over the same folds for per-fold reports. Each fold trains a
# fresh clone (same hyperparameters, unfitted) so that `best_knn_model` itself
# is not overwritten by a model trained on a single fold's training split.
for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train a per-fold copy of the best configuration
    fold_model = clone(best_knn_model)
    fold_model.fit(X_train, y_train)

    # Predict on the held-out rows
    y_pred = fold_model.predict(X_test)

    # Accuracy on the held-out rows
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f'\nAccuracy: {accuracy:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

    # Confusion matrix for this fold, with a fixed label order
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])
    confusion_matrices.append(cm)
    print(f'Confusion Matrix for this Fold:\n{cm}')

# Element-wise mean of the per-fold confusion matrices
avg_confusion_matrix = np.mean(confusion_matrices, axis=0)
print(f'Average Confusion Matrix:\n{avg_confusion_matrix}')

# `accuracies` holds held-out (test) accuracies, so the mean is stored under a
# test-accuracy name rather than a training-accuracy one.
avg_test_accuracy = np.mean(accuracies)
print(f'\nAverage Test Accuracy: {avg_test_accuracy:.4f}')



Cross-Validation Scores: [0.8375   0.83125  0.85625  0.846875 0.8375  ]
Mean Cross-Validation Score: 0.8419

Accuracy: 0.8375
Classification Report:
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96        93
         1.0       0.74      0.79      0.76        96
         2.0       0.85      0.76      0.80       131

    accuracy                           0.84       320
   macro avg       0.84      0.85      0.84       320
weighted avg       0.84      0.84      0.84       320

Confusion Matrix for this Fold:
[[93  0  0]
 [ 3 76 17]
 [ 5 27 99]]

Accuracy: 0.8313
Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      1.00      0.94        87
         1.0       0.76      0.84      0.80       105
         2.0       0.87      0.71      0.78       128

    accuracy                           0.83       320
   macro avg       0.83      0.85      0.84       320
weighted avg       0.83      0.83 

# Decision Tree

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Base estimator, shared by feature selection and the grid search below
dt_model = DecisionTreeClassifier(random_state=42)

# Feature selection: recursive feature elimination, dropping one feature per
# step until 10 remain.
# NOTE(review): the selector is fit on every row of X before cross-validation,
# so the chosen features have already seen each fold's held-out rows.
selector = RFE(
    estimator=dt_model,
    n_features_to_select=10,
    step=1,
)
X_selected = selector.fit_transform(X, y)

# Decision tree search space: 4 * 3 * 3 * 2 = 72 candidates
param_grid = dict(
    max_depth=[4, 6, 8, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# Exhaustive search over the grid, scored by accuracy on the shared folds
grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    cv=kf,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_selected, y)

# Winning configuration
best_dt_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 5}


In [17]:
# K-fold evaluation of the tuned decision tree.
# cross_val_score clones the estimator internally and gives the headline scores.
scores = cross_val_score(best_dt_model, X_selected, y, cv=kf, scoring='accuracy')
print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean():.4f}')

# Per-fold confusion matrices and train/test accuracies (overfitting check)
confusion_matrices = []
train_accuracies = []
test_accuracies = []

# Manual pass over the same folds for per-fold reports. Each fold trains a
# fresh clone (same hyperparameters, unfitted) so that `best_dt_model` itself
# is not overwritten by a model trained on a single fold's training split.
for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train a per-fold copy of the best configuration
    fold_model = clone(best_dt_model)
    fold_model.fit(X_train, y_train)

    # Predict on the held-out rows
    y_pred = fold_model.predict(X_test)

    # Accuracy on the held-out rows
    test_accuracy = accuracy_score(y_test, y_pred)
    test_accuracies.append(test_accuracy)

    # Accuracy on the rows the fold model was trained on
    y_train_pred = fold_model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_accuracies.append(train_accuracy)

    print(f'\nTest Accuracy: {test_accuracy:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

    # Confusion matrix for this fold, with a fixed label order
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])
    confusion_matrices.append(cm)
    print(f'Confusion Matrix for this Fold:\n{cm}')

# Element-wise mean of the per-fold confusion matrices
avg_confusion_matrix = np.mean(confusion_matrices, axis=0)
print(f'Average Confusion Matrix:\n{avg_confusion_matrix}')

# A large gap between these two averages indicates overfitting
avg_train_accuracy = np.mean(train_accuracies)
avg_test_accuracy = np.mean(test_accuracies)

print(f'\nAverage Training Accuracy: {avg_train_accuracy:.4f}')
print(f'Average Test Accuracy: {avg_test_accuracy:.4f}')


Cross-Validation Scores: [0.746875 0.815625 0.759375 0.79375  0.70625 ]
Mean Cross-Validation Score: 0.7644

Test Accuracy: 0.7469
Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.92      0.90        93
         1.0       0.64      0.64      0.64        96
         2.0       0.72      0.70      0.71       131

    accuracy                           0.75       320
   macro avg       0.75      0.75      0.75       320
weighted avg       0.74      0.75      0.75       320

Confusion Matrix for this Fold:
[[86  1  6]
 [ 6 61 29]
 [ 6 33 92]]

Test Accuracy: 0.8156
Classification Report:
              precision    recall  f1-score   support

         0.0       0.91      0.92      0.91        87
         1.0       0.82      0.68      0.74       105
         2.0       0.76      0.86      0.81       128

    accuracy                           0.82       320
   macro avg       0.83      0.82      0.82       320
weighted avg       0.82 