In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, RidgeClassifierCV
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
import statsmodels.api as sm
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [None]:
# Read the feature table from disk; the bare `df` on the last line makes the
# notebook render a preview of the frame.
DATA_PATH = 'music_features.csv'
df = pd.read_csv(DATA_PATH)
df

In [None]:
# Turn the text labels into integer codes and keep a code -> name lookup
# that the later cells use to print readable class names.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

class_names = label_encoder.classes_
class_codes = label_encoder.transform(class_names)
label_mapping = {code: name for code, name in zip(class_codes, class_names)}
print("Label mapping:", label_mapping)

In [None]:
# Columns that get z-scored: seven summary descriptors plus mfcc1..mfcc20.
# NOTE(review): 'beats' is not in this list, and the mean/std are taken over
# the whole table (before any train/test split) - confirm both are intended.
summary_features = [
    'tempo', 'chroma_stft', 'rmse', 'spectral_centroid',
    'spectral_bandwidth', 'rolloff', 'zero_crossing_rate',
]
mfcc_features = [f'mfcc{i}' for i in range(1, 21)]
features_to_standardize = summary_features + mfcc_features


def _zscore(column):
    """Centre a column on its mean and divide by its (pandas default) standard deviation."""
    return (column - column.mean()) / column.std()


# `apply` hands each column to `_zscore` in turn, so every column is scaled
# with its own statistics.
df[features_to_standardize] = df[features_to_standardize].apply(_zscore)

1. Correlation matrix

In [None]:
# Pairwise correlations between the feature columns only: the file name and
# the two label columns are removed first.
non_feature_columns = ['filename', 'label', 'label_encoded']
data_for_corr = df.drop(columns=non_feature_columns)
corr_matrix = data_for_corr.corr()

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix of Music Features')
plt.show()

2. VIF calculation

In [None]:
# Feature-only frame for the VIF computation (file name and label columns removed).
df_vif = df.drop(['filename', 'label', 'label_encoded'], axis=1)

In [None]:
# statsmodels' VIF regresses each column on the remaining columns exactly as
# given - it does not add an intercept. Any column that is not mean-centred
# (here at least 'beats', which is not standardized above) would therefore get
# a distorted VIF, so the design matrix carries an explicit constant.
vif_design = sm.add_constant(df_vif, has_constant='add')

vals = [
    VIF(vif_design.values, i)
    for i in range(1, vif_design.shape[1])  # position 0 is the constant; it is not reported
]
vif = pd.DataFrame({'vif': vals}, index=df_vif.columns)
vif

3. Previously used models (logistic regression)

Without 'beats'

In [None]:
# Baseline: multinomial logistic regression on every feature except 'beats'.
feature_columns = (
    ['tempo', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff']
    + [f'mfcc{i}' for i in range(1, 21)]
)
X = df[feature_columns]
y = df['label_encoded']

X_train, X_test, y_train, y_test_logistic = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

lm = LogisticRegression(class_weight='balanced', multi_class='multinomial', max_iter=1000)
lm.fit(X_train, y_train)

y_pred_logistic = lm.predict(X_test)

# One fixed class order drives the metrics, the confusion matrix and the tick
# labels, so a row or column can never be paired with the wrong genre name.
class_labels = np.unique(y)

print("Logistic Regression Results on Test Set")
p, r, f, s = precision_recall_fscore_support(
    y_test_logistic, y_pred_logistic, labels=class_labels
)
# Index the metric arrays by position in `class_labels`, not by the label value.
for position, label in enumerate(class_labels):
    print(f"Genre: {label_mapping[label]}")
    print(f"  Precision: {p[position]:.4f}")
    print(f"  Recall: {r[position]:.4f}")
    print(f"  F-score: {f[position]:.4f}")
    print(f"  Support: {s[position]}")

# Tick labels are derived from the encoder mapping (capitalised for display)
# instead of a hand-typed list that has to be kept in sync with the data.
target_names = [str(label_mapping[label]).capitalize() for label in class_labels]

# `labels=` guarantees one row/column per class even if a class were absent
# from the test predictions.
mat_logistic = confusion_matrix(y_test_logistic, y_pred_logistic, labels=class_labels)

plt.figure(figsize=(8, 6))
# Transposed so that true labels run along the x-axis and predictions along the y-axis.
sns.heatmap(mat_logistic.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=target_names, yticklabels=target_names)
plt.xlabel('True Label')
plt.ylabel('Predicted Label')
plt.title('Confusion Matrix - Logistic Regression')
plt.show()

With 'beats'

In [None]:
# Same baseline as the previous section, with 'beats' added to the predictors.
feature_columns = (
    ['tempo', 'beats', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff']
    + [f'mfcc{i}' for i in range(1, 21)]
)
X = df[feature_columns]
y = df['label_encoded']

X_train, X_test, y_train, y_test_logistic = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 'beats' is not among the columns standardized earlier in the notebook, so it
# would enter the penalised model on a different scale from every other
# predictor. Scaling inside the pipeline (fitted on the training rows only)
# puts all inputs on a common scale, as the ridge and lasso sections do.
lm = Pipeline([
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(class_weight='balanced', multi_class='multinomial', max_iter=1000)),
])
lm.fit(X_train, y_train)

y_pred_logistic = lm.predict(X_test)

# One fixed class order drives the metrics, the confusion matrix and the tick
# labels, so a row or column can never be paired with the wrong genre name.
class_labels = np.unique(y)

print("Logistic Regression Results on Test Set")
p, r, f, s = precision_recall_fscore_support(
    y_test_logistic, y_pred_logistic, labels=class_labels
)
# Index the metric arrays by position in `class_labels`, not by the label value.
for position, label in enumerate(class_labels):
    print(f"Genre: {label_mapping[label]}")
    print(f"  Precision: {p[position]:.4f}")
    print(f"  Recall: {r[position]:.4f}")
    print(f"  F-score: {f[position]:.4f}")
    print(f"  Support: {s[position]}")

# Tick labels are derived from the encoder mapping (capitalised for display)
# instead of a hand-typed list that has to be kept in sync with the data.
target_names = [str(label_mapping[label]).capitalize() for label in class_labels]

# `labels=` guarantees one row/column per class even if a class were absent
# from the test predictions.
mat_logistic = confusion_matrix(y_test_logistic, y_pred_logistic, labels=class_labels)

plt.figure(figsize=(8, 6))
# Transposed so that true labels run along the x-axis and predictions along the y-axis.
sns.heatmap(mat_logistic.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=target_names, yticklabels=target_names)
plt.xlabel('True Label')
plt.ylabel('Predicted Label')
plt.title('Confusion Matrix - Logistic Regression')
plt.show()

Multinomial

In [None]:
# Multinomial logistic regression evaluated with stratified 5-fold CV; the
# per-fold coefficient matrices are kept to summarise their mean and spread.
X = df[['tempo', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff'] +
       [f'mfcc{i}' for i in range(1, 21)]]
y = df['label_encoded']

lm = LogisticRegression(class_weight='balanced', multi_class='multinomial', max_iter=1000)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
class_names = list(label_mapping.values())  # a real list for classification_report
coefficients = []

for train_idx, test_idx in cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    lm.fit(X_train, y_train)

    # Copy so the stored matrix cannot alias the estimator's state across refits.
    coefficients.append(lm.coef_.copy())

    y_pred = lm.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=class_names))


# Stack the folds: axis 0 is the fold, axes 1/2 are indexed below as
# [class, feature].
coefficients = np.array(coefficients)
n_folds = coefficients.shape[0]
avg_coefficients = coefficients.mean(axis=0)
# Standard error of the mean = sample standard deviation / sqrt(n). The sample
# standard deviation needs ddof=1; NumPy's default (ddof=0) is the population
# formula and would understate the spread for only five folds.
# The folds share training rows, so treat this as a rough stability indicator.
std_error_coefficients = coefficients.std(axis=0, ddof=1) / np.sqrt(n_folds)

# Long-format table: one row per (class, feature) pair.
coef_data = [
    {
        'Class': class_name,
        'Feature': feature_name,
        'Average Coefficient': avg_coefficients[class_idx, feature_idx],
        'Standard Error': std_error_coefficients[class_idx, feature_idx],
    }
    for class_idx, class_name in label_mapping.items()
    for feature_idx, feature_name in enumerate(X.columns)
]

coef_df = pd.DataFrame(coef_data)
print(coef_df)

class_averages = coef_df.groupby('Class').agg(
    Average_Coefficient=('Average Coefficient', 'mean'),
    Average_Standard_Error=('Standard Error', 'mean')
).reset_index()
print(class_averages)

class_name_to_visualize = "rock"  # Change to any class you want to visualize
# Largest absolute coefficients first.
class_df = coef_df[coef_df['Class'] == class_name_to_visualize].sort_values(
    by='Average Coefficient', key=abs, ascending=False
)

plt.figure(figsize=(10, 6))
plt.barh(class_df['Feature'], class_df['Average Coefficient'], xerr=class_df['Standard Error'])
plt.xlabel('Coefficient Value')
plt.title(f'Average Coefficients with Standard Errors for Class: {class_name_to_visualize}')
plt.tight_layout()
plt.show()

N-best

In [None]:
# Number of features kept for the reduced model.
N_TOP_FEATURES = 10

# Split first: the held-out rows must play no part in choosing the features,
# otherwise the test score below is optimistically biased.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Rank features with a model fitted on the training rows only.
lm_full = LogisticRegression(class_weight='balanced', multi_class='multinomial', max_iter=1000)
lm_full.fit(X_train_full, y_train)

# Importance of a feature = mean absolute coefficient across the classes.
coef_magnitudes = np.abs(lm_full.coef_).mean(axis=0)
# argsort is ascending, so the last N positions are the N largest magnitudes.
top_features = np.argsort(coef_magnitudes)[-N_TOP_FEATURES:]
top_features_names = X.columns[top_features]

X_top = X[top_features_names]
X_train = X_train_full[top_features_names]
X_test = X_test_full[top_features_names]

lm_top = LogisticRegression(class_weight='balanced', multi_class='multinomial', max_iter=1000)
lm_top.fit(X_train, y_train)

y_pred = lm_top.predict(X_test)
print("Logistic Model with Top-N Variables")
print(classification_report(y_test, y_pred, target_names=list(label_mapping.values())))
print("Top Features:", top_features_names)

VIF < 5

In [None]:
# Variance inflation factor of every column of X, computed on the raw array.
X_vif = X.copy()
vif_scores = [VIF(X_vif.values, col_idx) for col_idx in range(X_vif.shape[1])]
vif_data = pd.DataFrame({'Feature': X_vif.columns, 'VIF': vif_scores})

# Keep only the features whose VIF is below 5.
is_low_vif = vif_data['VIF'] < 5
low_vif_features = vif_data.loc[is_low_vif, 'Feature']
X_low_vif = X[low_vif_features]

X_train, X_test, y_train, y_test = train_test_split(
    X_low_vif, y, test_size=0.2, random_state=42, stratify=y
)

lm_vif = LogisticRegression(class_weight='balanced', multi_class='multinomial', max_iter=1000)
lm_vif.fit(X_train, y_train)
y_pred = lm_vif.predict(X_test)

print("Logistic Model with Low-VIF Variables")
print(classification_report(y_test, y_pred, target_names=label_mapping.values()))
print("Low VIF Features:", low_vif_features.tolist())

Backward Selection

In [None]:
def backward_selection(X, y, significance_level=1):
    """Backward elimination for a multinomial logit model.

    Fits ``sm.MNLogit`` on ``X`` plus a constant column, then repeatedly
    drops the feature whose largest per-class p-value is the highest and
    refits, for as long as the largest p-value in the table exceeds
    ``significance_level``.

    Parameters
    ----------
    X : feature table handed to ``sm.add_constant``; features are removed
        by column label.
    y : response passed to ``sm.MNLogit``.
    significance_level : threshold a p-value must exceed to trigger a removal.
        NOTE(review): p-values cannot exceed 1, so with the default of 1 no
        feature is ever eliminated - confirm this is intended.

    Returns
    -------
    tuple
        The last fitted results object and the column labels of the final
        design matrix.
    """
    design = sm.add_constant(X)
    model = sm.MNLogit(y, design).fit()

    while True:
        # The first row of the p-value table is skipped (presumably the
        # constant added above - it is never a removal candidate).
        feature_pvalues = model.pvalues.iloc[1:]
        worst_pvalue = feature_pvalues.max().max()

        # Written as `not (a > b)` so a NaN maximum also ends the loop.
        if not worst_pvalue > significance_level:
            break

        excluded_feature = feature_pvalues.max(axis=1).idxmax()
        design = design.drop(columns=[excluded_feature])
        model = sm.MNLogit(y, design).fit()

    return model, design.columns

# Run the selection on the full feature table, then evaluate a multinomial
# logit restricted to the surviving columns on a held-out split.
# NOTE(review): the selection sees all rows, including those that end up in
# the test split - confirm this is acceptable.
model, selected_features = backward_selection(X, y)
print("Selected Features from Backward Selection:", selected_features)

kept_columns = selected_features.drop("const")
X_selected = X[kept_columns]
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)

train_design = sm.add_constant(X_train)
test_design = sm.add_constant(X_test)
log_reg = sm.MNLogit(y_train, train_design).fit()

# Predicted class = column position of the highest predicted probability.
y_pred_prob = log_reg.predict(test_design)
y_pred_class = y_pred_prob.values.argmax(axis=1)

unique_labels = sorted(np.unique(y))
target_names = [label_mapping[code] for code in unique_labels]

print("Backward Selection Classification Metrics")
print(classification_report(y_test, y_pred_class, target_names=target_names, labels=unique_labels))

PCA

In [None]:
# Logistic regression on the first 10 principal components; the projection is
# part of the pipeline, so it is fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pca = PCA(n_components=10)
lm_pca = LogisticRegression(class_weight='balanced', multi_class='multinomial', max_iter=1000)
pcr_pipeline = Pipeline(steps=[
    ('pca', pca),
    ('logistic', lm_pca),
])
pcr_pipeline.fit(X_train, y_train)

y_pred = pcr_pipeline.predict(X_test)
print("Logistic Model with Principal Components")
print(classification_report(y_test, y_pred, target_names=label_mapping.values()))

Ridge

In [None]:
# Ridge classifier: the penalty strength is chosen by 5-fold CV from a
# log-spaced grid running from 1e4 down to 1e-2.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

lambdas = np.logspace(4, -2, 50)

ridge_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('ridge', RidgeClassifierCV(alphas=lambdas, cv=5)),
])
ridge_pipeline.fit(X_train, y_train)
y_pred_class = ridge_pipeline.predict(X_test)

# Readable class names, in sorted label order, for the report and the plot.
unique_labels = sorted(np.unique(y_test))
target_names = [label_mapping[code] for code in unique_labels]

print("Ridge Classifier Classification Metrics")
print(classification_report(y_test, y_pred_class, target_names=target_names))

mat_logistic = confusion_matrix(y_test, y_pred_class)

fig, ax = plt.subplots(figsize=(8, 6))
# Transposed so that true labels run along the x-axis and predictions along the y-axis.
sns.heatmap(mat_logistic.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=target_names, yticklabels=target_names, ax=ax)
ax.set_xlabel('True Label')
ax.set_ylabel('Predicted Label')
ax.set_title('Confusion Matrix - Ridge Regularization')
plt.show()

Lasso

In [None]:
# L1-penalised (lasso) logistic regression; the inverse penalty strength C is
# tuned by a randomized search over a log-spaced grid.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

C_values = np.logspace(-4, 4, 50)

l1_logreg = LogisticRegression(
    penalty='l1',
    solver='saga',
    max_iter=10000,
    class_weight='balanced',
    random_state=42,
    multi_class='multinomial',
    n_jobs=-1,
)
lasso_classifier_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logreg', l1_logreg),
])

# 20 random draws from the grid, each scored by 5-fold CV accuracy.
random_search = RandomizedSearchCV(
    estimator=lasso_classifier_pipeline,
    param_distributions={'logreg__C': C_values},
    n_iter=20,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

best_model = random_search.best_estimator_
y_pred_class = best_model.predict(X_test)

# Readable class names, in sorted label order, for the report and the plot.
unique_labels = sorted(np.unique(y_test))
target_names = [label_mapping[code] for code in unique_labels]

print("Lasso Classifier Metrics")
print(classification_report(y_test, y_pred_class, target_names=target_names))

# Pair each feature with its per-class weights and keep those with at least
# one non-zero weight.
fitted_logreg = best_model.named_steps['logreg']
coefficients = list(zip(fitted_logreg.coef_.T, X.columns))
non_zero_coefficients = [
    (weights, feature) for weights, feature in coefficients if (weights != 0).any()
]

print("Non-zero Coefficients:")
for coef, name in non_zero_coefficients:
    print(f"{name}: {coef}")

mat_logistic = confusion_matrix(y_test, y_pred_class)

fig, ax = plt.subplots(figsize=(8, 6))
# Transposed so that true labels run along the x-axis and predictions along the y-axis.
sns.heatmap(mat_logistic.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=target_names, yticklabels=target_names, ax=ax)
ax.set_xlabel('True Label')
ax.set_ylabel('Predicted Label')
ax.set_title('Confusion Matrix - Lasso Regularization')
plt.show()

Elastic net

In [None]:
# Elastic-net logistic regression; LogisticRegressionCV tunes the penalty with
# 5-fold CV over ten l1_ratio values between 0.1 and 1.0.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# The scaler is fitted on the training rows and re-used for the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

l1_ratio_grid = np.linspace(0.1, 1.0, 10)
elastic_net_classifier = LogisticRegressionCV(
    penalty='elasticnet',
    solver='saga',
    max_iter=10000,
    class_weight='balanced',
    random_state=42,
    cv=5,
    l1_ratios=l1_ratio_grid,
    n_jobs=-1,
)
elastic_net_classifier.fit(X_train_scaled, y_train)

y_pred_class = elastic_net_classifier.predict(X_test_scaled)

# Readable class names, in sorted label order, for the report and the plot.
unique_labels = sorted(np.unique(y_test))
target_names = [label_mapping[code] for code in unique_labels]

print("Elastic Net Classification Metrics")
print(classification_report(y_test, y_pred_class, target_names=target_names))

# Pair each feature with its per-class weights and keep those with at least
# one non-zero weight.
coefficients = list(zip(elastic_net_classifier.coef_.T, X.columns))
non_zero_coefficients = [
    (weights, feature) for weights, feature in coefficients if (weights != 0).any()
]

print("Non-zero Coefficients:")
for coef, name in non_zero_coefficients:
    print(f"{name}: {coef}")

mat_logistic = confusion_matrix(y_test, y_pred_class)

fig, ax = plt.subplots(figsize=(8, 6))
# Transposed so that true labels run along the x-axis and predictions along the y-axis.
sns.heatmap(mat_logistic.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=target_names, yticklabels=target_names, ax=ax)
ax.set_xlabel('True Label')
ax.set_ylabel('Predicted Label')
ax.set_title('Confusion Matrix - Elastic Net Regularization')
plt.show()