### Dataset

In [1]:
import pandas as pd

# Feature tables for the train and test splits, read from CSV files addressed
# by relative paths under ../../data.
train_features_csv = '../../data/genre_class_train_df.csv'
test_features_csv = '../../data/genre_class_test_df.csv'

X_train = pd.read_csv(train_features_csv)
X_test = pd.read_csv(test_features_csv)

In [2]:
import numpy as np

# Target arrays for the train and test splits, stored as .npy files next to
# the feature CSVs (presumably row-aligned with X_train / X_test; TODO confirm).
train_labels_npy = '../../data/genre_class_train_y.npy'
test_labels_npy = '../../data/genre_class_test_y.npy'

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, and
# unpickling can execute arbitrary code. Acceptable only for files this
# project produced itself; confirm, or re-save the labels in a non-object dtype.
y_train = np.load(train_labels_npy, allow_pickle=True)
y_test = np.load(test_labels_npy, allow_pickle=True)

### Cross-Validation (K = 10)
#### Baseline

In [3]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_validate

# Reference point for the real models: a DummyClassifier with default
# settings, scored with one-vs-rest ROC AUC over 10 cross-validation splits.
dummy_clf = DummyClassifier()
cv_results = cross_validate(
    dummy_clf,
    X_train,
    y_train,
    scoring='roc_auc_ovr',
    cv=10,
)

# One line per split, numbered from 0.
print("Cross-Validation Results: ")
for split_idx, split_auc in enumerate(cv_results['test_score']):
    print(f"Split {split_idx}: {split_auc}")

Cross-Validation Results: 
Split 0: 0.5
Split 1: 0.5
Split 2: 0.5
Split 3: 0.5
Split 4: 0.5
Split 5: 0.5
Split 6: 0.5
Split 7: 0.5
Split 8: 0.5
Split 9: 0.5


#### Naive Bayes

In [3]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate

# Gaussian Naive Bayes with default settings, evaluated the same way as the
# other models: one-vs-rest ROC AUC over 10 cross-validation splits.
nb_clf = GaussianNB()
cv_results = cross_validate(
    nb_clf,
    X_train,
    y_train,
    scoring='roc_auc_ovr',
    cv=10,
)

# One line per split, numbered from 0.
print("Cross-Validation Results: ")
for split_idx, split_auc in enumerate(cv_results['test_score']):
    print(f"Split {split_idx}: {split_auc}")

Cross-Validation Results: 
Split 0: 0.5160407055635572
Split 1: 0.5153708620658187
Split 2: 0.5121918050161856
Split 3: 0.5116289649632666
Split 4: 0.5116616833948087
Split 5: 0.5133545528979042
Split 6: 0.5133230275543502
Split 7: 0.515579810454476
Split 8: 0.5099320396948464
Split 9: 0.5136766531269641


#### Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# Logistic regression fitted with the SAGA solver; random_state is pinned so
# repeated runs use the same seed. Evaluated with one-vs-rest ROC AUC over
# 10 cross-validation splits.
# NOTE(review): max_iter is left at its default and no feature scaling is
# applied in this cell; SAGA may not converge on unscaled features. Check
# for ConvergenceWarning before trusting these scores.
lr_clf = LogisticRegression(solver='saga', random_state=42)
cv_results = cross_validate(
    lr_clf,
    X_train,
    y_train,
    scoring='roc_auc_ovr',
    cv=10,
)

# One line per split, numbered from 0.
print("Cross-Validation Results: ")
for split_idx, split_auc in enumerate(cv_results['test_score']):
    print(f"Split {split_idx}: {split_auc}")



Cross-Validation Results: 
Split 0: 0.5124231121793161
Split 1: 0.5111649765212117
Split 2: 0.5084898499055066
Split 3: 0.5089509851247662
Split 4: 0.508503936085081
Split 5: 0.5085836046472482
Split 6: 0.5093719668353593
Split 7: 0.5098473146050797
Split 8: 0.5108464640670476
Split 9: 0.5103836567392642




### Statistical Testing
#### Between Best Models

In [7]:
from scipy.stats import ttest_rel

# Per-split ROC AUC scores for Naive Bayes and Logistic Regression, copied
# from the cross-validation outputs printed earlier (split order 0-9).
# NOTE(review): these literals go stale if the CV cells are re-run with
# different data or settings.
nb_cv = [
    0.5160407055635572,
    0.5153708620658187,
    0.5121918050161856,
    0.5116289649632666,
    0.5116616833948087,
    0.5133545528979042,
    0.5133230275543502,
    0.515579810454476,
    0.5099320396948464,
    0.5136766531269641,
]
lr_cv = [
    0.5124231121793161,
    0.5111649765212117,
    0.5084898499055066,
    0.5089509851247662,
    0.508503936085081,
    0.5085836046472482,
    0.5093719668353593,
    0.5098473146050797,
    0.5108464640670476,
    0.5103836567392642,
]

# Paired t-test: entry k of each list is the score on split k.
paired_result = ttest_rel(nb_cv, lr_cv)
t_stat = paired_result.statistic
p_value = paired_result.pvalue

print(f"t-statistic: {t_stat}")
print(f"p_value: {p_value}")

t-statistic: 6.173231535467158
p_value: 0.00016406147325362017


#### Between Baseline and Best Model

In [8]:
from scipy.stats import ttest_rel

# Per-split ROC AUC scores for Naive Bayes, copied from the cross-validation
# output printed earlier (split order 0-9).
# NOTE(review): these literals go stale if the CV cells are re-run with
# different data or settings.
nb_cv = [
    0.5160407055635572,
    0.5153708620658187,
    0.5121918050161856,
    0.5116289649632666,
    0.5116616833948087,
    0.5133545528979042,
    0.5133230275543502,
    0.515579810454476,
    0.5099320396948464,
    0.5136766531269641,
]
# The baseline scored exactly 0.5 on every split, so its list is that
# constant repeated once per Naive Bayes score.
dummy_cv = [0.5] * len(nb_cv)

# Paired t-test of Naive Bayes against the constant baseline scores.
paired_result = ttest_rel(nb_cv, dummy_cv)
t_stat = paired_result.statistic
p_value = paired_result.pvalue

print(f"t-statistic: {t_stat}")
print(f"p_value: {p_value}")

t-statistic: 21.233001971169585
p_value: 5.353598287380803e-09
