In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the pre-extracted audio feature table. Later cells rely on a 'filename'
# column, a 'label' column (genre) and the numeric feature columns.
df = pd.read_csv('music_features.csv')
# Bare last expression so the frame is rendered as the cell output.
df

In [None]:
# Turn the genre strings into integer class codes and keep the fitted encoder
# around so the codes can be translated back to names.
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
df['label_encoded'] = label_encoder.transform(df['label'])

# code -> genre lookup, used by the reporting loops further down.
genre_names = label_encoder.classes_
genre_codes = label_encoder.transform(genre_names)
label_mapping = {code: name for code, name in zip(genre_codes, genre_names)}
print("Label mapping:", label_mapping)


In [None]:
# Every numeric feature column that is modelled further down.
# 'beats' is included on purpose: it is used by the VIF table and by the
# "With 'beats'" logistic model, so it has to be on the same scale as 'tempo'
# and the other features for the regularised fits to be comparable.
features_to_standardize = [
    'tempo', 'beats', 'chroma_stft', 'rmse', 'spectral_centroid',
    'spectral_bandwidth', 'rolloff', 'zero_crossing_rate'
] + [f'mfcc{i}' for i in range(1, 21)]

# Vectorised z-score: (x - column mean) / column sample std (pandas ddof=1),
# i.e. the same formula the per-column loop applied, done in one assignment.
# NOTE(review): the statistics are taken over the whole table, before any
# train/test split, so test rows influence the scaling of the training data.
feature_block = df[features_to_standardize]
df[features_to_standardize] = (feature_block - feature_block.mean()) / feature_block.std()

1. Correlation matrix

In [None]:
# Pairwise correlations between the numeric features only: the identifier
# column and both forms of the target are removed first.
non_feature_columns = ['filename', 'label', 'label_encoded']
data_for_corr = df.drop(columns=non_feature_columns)
corr_matrix = data_for_corr.corr()

# Un-annotated heatmap; the diverging palette separates positive from
# negative correlations.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix of Music Features')
plt.show()

2. VIF calculation

In [None]:
# Feature-only frame for the VIF computation: drop the identifier column and
# both representations of the target.
vif_excluded_columns = ['filename', 'label', 'label_encoded']
df_vif = df.drop(columns=vif_excluded_columns)
# print(df_vif.dtypes)

In [None]:
# variance_inflation_factor regresses column i on the other columns of the
# matrix it is handed and does not add an intercept of its own. Without a
# constant column the R^2 it uses is uncentred, which inflates the VIF of any
# feature whose mean is not zero. Append an explicit column of ones and only
# report the VIF of the real features (indices 0 .. n_features - 1).
# A plain float ndarray is passed because older statsmodels releases index the
# argument with `exog[:, i]`, which a DataFrame does not support.
vif_design = np.column_stack([df_vif.to_numpy(dtype=float), np.ones(len(df_vif))])

vals = [VIF(vif_design, i) for i in range(df_vif.shape[1])]
vif = pd.DataFrame({'vif': vals}, index=df_vif.columns)
vif

3. Previously Used models (Logistic Regression)

Without 'beats'

In [None]:
# Baseline multinomial logistic regression on the feature set WITHOUT 'beats'
# (and without 'zero_crossing_rate'), evaluated on a stratified 80/20 hold-out.
X = df[['tempo', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff'] +
       [f'mfcc{i}' for i in range(1, 21)]]
y = df['label_encoded']

X_train, X_test, y_train, y_test_logistic = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn, and the default solver already fits a multinomial model when
# there are more than two classes.
lm = LogisticRegression(class_weight='balanced', max_iter=1000)
lm.fit(X_train, y_train)

y_pred_logistic = lm.predict(X_test)

# Fix the class order once so the per-class metrics and the confusion matrix
# rows/columns are guaranteed to line up with the encoder's codes.
class_codes = np.unique(y)

print("Logistic Regression Results on Test Set")
p, r, f, s = precision_recall_fscore_support(y_test_logistic, y_pred_logistic, labels=class_codes)
for label, genre in label_mapping.items():
    print(f"Genre: {genre}")
    print(f"  Precision: {p[label]:.4f}")
    print(f"  Recall: {r[label]:.4f}")
    print(f"  F-score: {f[label]:.4f}")
    print(f"  Support: {s[label]}")

# Tick labels come from the encoder mapping instead of a hand-typed list, so
# they cannot drift out of sync with the class codes.
target_names = [str(genre).capitalize() for genre in label_mapping.values()]

mat_logistic = confusion_matrix(y_test_logistic, y_pred_logistic, labels=class_codes)

# confusion_matrix is indexed [true, predicted]; the transpose puts predicted
# labels on the y-axis and true labels on the x-axis, matching the axis titles.
plt.figure(figsize=(8, 6))
sns.heatmap(mat_logistic.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=target_names, yticklabels=target_names)
plt.xlabel('True Label')
plt.ylabel('Predicted Label')
plt.title('Confusion Matrix - Logistic Regression')
plt.show()

With 'beats'

In [None]:
# Same baseline as the previous section, with 'beats' added to the feature
# set, evaluated on the identical stratified 80/20 split (same seed).
# NOTE(review): confirm 'beats' is on the same scale as the other features
# before comparing this model with the one without it.
X = df[['tempo', 'beats', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff'] +
       [f'mfcc{i}' for i in range(1, 21)]]
y = df['label_encoded']

X_train, X_test, y_train, y_test_logistic = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn, and the default solver already fits a multinomial model when
# there are more than two classes.
lm = LogisticRegression(class_weight='balanced', max_iter=1000)
lm.fit(X_train, y_train)

y_pred_logistic = lm.predict(X_test)

# Fix the class order once so the per-class metrics and the confusion matrix
# rows/columns are guaranteed to line up with the encoder's codes.
class_codes = np.unique(y)

print("Logistic Regression Results on Test Set")
p, r, f, s = precision_recall_fscore_support(y_test_logistic, y_pred_logistic, labels=class_codes)
for label, genre in label_mapping.items():
    print(f"Genre: {genre}")
    print(f"  Precision: {p[label]:.4f}")
    print(f"  Recall: {r[label]:.4f}")
    print(f"  F-score: {f[label]:.4f}")
    print(f"  Support: {s[label]}")

# Tick labels come from the encoder mapping instead of a hand-typed list, so
# they cannot drift out of sync with the class codes.
target_names = [str(genre).capitalize() for genre in label_mapping.values()]

mat_logistic = confusion_matrix(y_test_logistic, y_pred_logistic, labels=class_codes)

# confusion_matrix is indexed [true, predicted]; the transpose puts predicted
# labels on the y-axis and true labels on the x-axis, matching the axis titles.
plt.figure(figsize=(8, 6))
sns.heatmap(mat_logistic.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=target_names, yticklabels=target_names)
plt.xlabel('True Label')
plt.ylabel('Predicted Label')
plt.title('Confusion Matrix - Logistic Regression')
plt.show()

In [None]:
# 5-fold stratified CV of the logistic model (feature set without 'beats').
# Per fold: print a classification report and keep the fitted coefficient
# matrix, so the stability of each coefficient across folds can be summarised.
X = df[['tempo', 'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff'] +
       [f'mfcc{i}' for i in range(1, 21)]]
y = df['label_encoded']

# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn, and the default solver already fits a multinomial model when
# there are more than two classes.
lm = LogisticRegression(class_weight='balanced', max_iter=1000)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
genre_names = list(label_mapping.values())
coefficients = []

for train_idx, test_idx in cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    lm.fit(X_train, y_train)

    # Copy so that a later refit of the same estimator can never alias a
    # previously stored fold.
    coefficients.append(lm.coef_.copy())

    y_pred = lm.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=genre_names))


# Stacked as [fold, class, feature].
coefficients = np.array(coefficients)
n_folds = coefficients.shape[0]
avg_coefficients = np.mean(coefficients, axis=0)
# Standard error of the across-fold mean: the SAMPLE standard deviation
# (ddof=1) over sqrt(n_folds). The population std (ddof=0) systematically
# understates the spread with only a handful of folds.
std_error_coefficients = np.std(coefficients, axis=0, ddof=1) / np.sqrt(n_folds)

# Long-format table: one row per (class, feature) pair.
coef_data = [
    {
        'Class': class_name,
        'Feature': feature_name,
        'Average Coefficient': avg_coefficients[class_idx, feature_idx],
        'Standard Error': std_error_coefficients[class_idx, feature_idx],
    }
    for class_idx, class_name in label_mapping.items()
    for feature_idx, feature_name in enumerate(X.columns)
]

coef_df = pd.DataFrame(coef_data)
print(coef_df)

# Per-class means of the coefficient and of its standard error.
class_averages = coef_df.groupby('Class').agg(
    Average_Coefficient=('Average Coefficient', 'mean'),
    Average_Standard_Error=('Standard Error', 'mean')
).reset_index()
print(class_averages)

class_name_to_visualize = "rock"  # Change to any class you want to visualize
# Features of the chosen class ordered by absolute coefficient, largest first.
class_df = coef_df[coef_df['Class'] == class_name_to_visualize].sort_values(
    by='Average Coefficient', key=abs, ascending=False
)

plt.figure(figsize=(10, 6))
plt.barh(class_df['Feature'], class_df['Average Coefficient'], xerr=class_df['Standard Error'])
plt.xlabel('Coefficient Value')
plt.title(f'Average Coefficients with Standard Errors for Class: {class_name_to_visualize}')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Split FIRST, then rank features on the training rows only. Ranking on the
# full dataset would let the held-out rows influence which features are kept
# and make the test score below optimistically biased. The seed and
# stratification match the other sections of the notebook.
X_train_all, X_test_all, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Ranking model (variable name kept from the original cell; it is now fitted
# on the training split rather than the full data).
# `multi_class` is not passed: it is deprecated in recent scikit-learn and the
# default solver already fits a multinomial model for more than two classes.
lm_full = LogisticRegression(class_weight='balanced', max_iter=1000)
lm_full.fit(X_train_all, y_train)

# Rank features by absolute coefficient values (mean over classes)
coef_magnitudes = np.abs(lm_full.coef_).mean(axis=0)
top_features = np.argsort(coef_magnitudes)[-10:]  # Select top 10 variables
top_features_names = X.columns[top_features]

# Refit logistic regression using only top-N features, on the same split
X_top = X[top_features_names]
X_train = X_train_all[top_features_names]
X_test = X_test_all[top_features_names]

lm_top = LogisticRegression(class_weight='balanced', max_iter=1000)
lm_top.fit(X_train, y_train)

# Evaluate the model
y_pred = lm_top.predict(X_test)
print("Logistic Model with Top-N Variables")
print(classification_report(y_test, y_pred, target_names=list(label_mapping.values())))
print("Top Features:", top_features_names)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF

# Calculate VIF for every column of the current feature matrix X
X_vif = X.copy()
vif_data = pd.DataFrame({
    'Feature': X_vif.columns,
    'VIF': [VIF(X_vif.values, i) for i in range(X_vif.shape[1])]
})

# Filter features with VIF < 5
low_vif_features = vif_data.loc[vif_data['VIF'] < 5, 'Feature']
if low_vif_features.empty:
    # Fail here with a clear message instead of an opaque error from fitting
    # a model on a zero-column matrix.
    raise ValueError("No feature has VIF < 5; cannot fit the low-VIF model.")
X_low_vif = X[low_vif_features.tolist()]

# Refit logistic regression using low VIF variables
X_train, X_test, y_train, y_test = train_test_split(X_low_vif, y, test_size=0.2, random_state=42, stratify=y)

# `multi_class` is not passed: it is deprecated in recent scikit-learn and the
# default solver already fits a multinomial model for more than two classes.
lm_vif = LogisticRegression(class_weight='balanced', max_iter=1000)
lm_vif.fit(X_train, y_train)

# Evaluate the model
y_pred = lm_vif.predict(X_test)
print("Logistic Model with Low-VIF Variables")
print(classification_report(y_test, y_pred, target_names=list(label_mapping.values())))
print("Low VIF Features:", low_vif_features.tolist())

In [None]:
import statsmodels.api as sm

# Backward selection function with correction
def backward_selection(X, y, significance_level=0.05):
    """Backward elimination for a multinomial logit fitted with statsmodels.

    Starting from all columns of ``X`` plus an added constant, the model is
    refitted repeatedly. On each pass the term with the largest p-value
    (taken over all outcome equations) is removed, until no remaining term
    has a p-value above ``significance_level``.

    Parameters
    ----------
    X : feature table passed to ``sm.add_constant``.
    y : target passed to ``sm.MNLogit``.
    significance_level : float, default 0.05
        Threshold a term's largest p-value must not exceed to be kept.

    Returns
    -------
    (model, columns)
        The last fitted result and the column index of the final design
        matrix (which still contains the added constant).
    """
    design = sm.add_constant(X)
    model = sm.MNLogit(y, design).fit()

    while True:
        # Skip the first row of the p-value table: assumed to be the
        # intercept that add_constant put in front.
        feature_pvalues = model.pvalues.iloc[1:]

        # Largest p-value over every term and every outcome equation.
        if not feature_pvalues.max().max() > significance_level:
            break

        # The term whose worst-case p-value is the largest gets dropped,
        # then the model is fitted again on the reduced design.
        weakest_feature = feature_pvalues.max(axis=1).idxmax()
        design = design.drop(columns=[weakest_feature])
        model = sm.MNLogit(y, design).fit()

    return model, design.columns

# Run the elimination on the current feature matrix with the default 5%
# threshold spelled out, then show which design-matrix columns survived.
model, selected_features = backward_selection(X, y, significance_level=0.05)
print("Selected Features from Backward Selection:", selected_features)


In [None]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Principal-component logistic regression: project onto the first 10
# components, then classify. Wrapping both steps in a Pipeline means the PCA
# is fitted on the training rows only.
pca = PCA(n_components=10)
# `multi_class` is not passed: it is deprecated in recent scikit-learn and the
# default solver already fits a multinomial model for more than two classes.
lm_pca = LogisticRegression(class_weight='balanced', max_iter=1000)
pcr_pipeline = Pipeline([('pca', pca), ('logistic', lm_pca)])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
pcr_pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pcr_pipeline.predict(X_test)
print("Logistic Model with Principal Components")
print(classification_report(y_test, y_pred, target_names=list(label_mapping.values())))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Data preparation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Regularization parameters
alpha = 1.0  # Regularization strength; scikit-learn takes the inverse, C = 1/alpha

# Settings shared by the three penalised models.
# * `saga` is the solver used for all three penalties so they are compared
#   on equal footing; it shuffles the data, so `random_state` is fixed to make
#   the cell reproducible.
# * `multi_class` is not passed: it is deprecated in recent scikit-learn and
#   a multinomial model is already the default for more than two classes.
shared_kwargs = dict(solver='saga', C=1 / alpha, max_iter=1000, random_state=42)
genre_names = list(label_mapping.values())


def _fit_and_report(title, estimator):
    """Fit `estimator` on the training split, print its test report, return the predictions."""
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    print(title)
    print(classification_report(y_test, predictions, target_names=genre_names))
    return predictions


# Ridge (L2 Regularization)
ridge_model = LogisticRegression(penalty='l2', **shared_kwargs)
ridge_predictions = _fit_and_report("Ridge Logistic Regression", ridge_model)

# LASSO (L1 Regularization)
lasso_model = LogisticRegression(penalty='l1', **shared_kwargs)
lasso_predictions = _fit_and_report("LASSO Logistic Regression", lasso_model)

# Elastic Net (Combination of L1 and L2)
elastic_net_model = LogisticRegression(penalty='elasticnet', l1_ratio=0.5, **shared_kwargs)
elastic_net_predictions = _fit_and_report("Elastic Net Logistic Regression", elastic_net_model)


**TODO:** Something is still wrong with the ridge regression; it needs to be fixed.

In [None]:
# DISABLED EXPERIMENT, kept for reference only: nothing in this cell executes.
# It sketches a cross-validated ridge fit via ElasticNetCV(l1_ratio=0) inside a
# StandardScaler pipeline, followed by a cross-validated MSE.
# NOTE(review): presumably this is the ridge code the preceding note flags as
# broken; confirm before re-enabling.
# import numpy as np
# import pandas as pd
# from sklearn.linear_model import ElasticNetCV
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import cross_validate, KFold

# # Data preparation
# X_standardized = (X - X.mean(axis=0)) / X.std(axis=0)  # Standardize features
# lambdas = 10**np.linspace(8, -2, 100)  # Define lambda (alpha) values

# # Ridge regression using ElasticNetCV with l1_ratio=0
# ridge_cv = ElasticNetCV(alphas=lambdas, l1_ratio=0, cv=5, max_iter=100000)

# # Create a pipeline with standardization and ridge regression
# ridge_pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('ridge', ridge_cv)
# ])

# # Fit the model
# ridge_pipeline.fit(X, y)

# # Extract best alpha (lambda) and coefficients
# best_lambda = ridge_cv.alpha_
# ridge_coefficients = ridge_cv.coef_

# print("Best Lambda (Ridge):", best_lambda)
# print("Ridge Coefficients:", ridge_coefficients)

# # Cross-validated MSE
# cv_results = cross_validate(ridge_pipeline, X, y, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
# cv_mse = -np.mean(cv_results['test_score'])

# print("Cross-Validated MSE (Ridge):", cv_mse)

Lasso

In [None]:
from sklearn.linear_model import ElasticNetCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_validate
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# NOTE(review): this section fits a *regression* on the integer label codes,
# so the MSE treats the genres as if they were ordered numbers; confirm
# that is the intended analysis.

# Shuffled, seeded folds. Passing a bare integer as `cv` to a regressor uses
# contiguous, unshuffled folds, which is unreliable if the rows of the file
# are grouped by genre. One splitter is shared by the alpha search and by the
# outer evaluation so both see the same partitioning.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Prepare LASSO Regression (l1_ratio=1 makes the elastic net a pure LASSO)
lasso_cv = ElasticNetCV(l1_ratio=1, n_alphas=100, cv=kfold, max_iter=10000)

# Create a pipeline for standardization and LASSO
lasso_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', lasso_cv)
])

# Fit the pipeline
lasso_pipeline.fit(X, y)

# Extract the tuned LASSO model
tuned_lasso = lasso_pipeline.named_steps['lasso']

# Best alpha (lambda) value
best_lambda = tuned_lasso.alpha_
print("Best Lambda (LASSO):", best_lambda)

# Cross-validated MSE of the whole pipeline (alpha is re-tuned inside each
# outer training fold)
cv_results = cross_validate(lasso_pipeline, X, y, cv=kfold, scoring='neg_mean_squared_error', return_train_score=True)
cv_mse = -np.mean(cv_results['test_score'])
print("Cross-Validated MSE (LASSO):", cv_mse)

# Extract coefficients
lasso_coefficients = tuned_lasso.coef_
print("LASSO Coefficients:", lasso_coefficients)

# Number of non-zero coefficients
non_zero_coefficients = np.sum(lasso_coefficients != 0)
print("Number of Non-Zero Coefficients:", non_zero_coefficients)

In [None]:
from sklearn.linear_model import lasso_path

# Compute LASSO path: one coefficient trajectory per feature along the
# automatically chosen grid of penalties.
alphas, coefs, _ = lasso_path(X, y)

# x-axis shared by every trajectory; larger values mean a weaker penalty.
neg_log_alphas = -np.log10(alphas)

# Plot the coefficient paths (each row of `coefs` is drawn as one line)
plt.figure(figsize=(8, 6))
for coef in coefs:
    plt.plot(neg_log_alphas, coef)
# Raw string: '\l' is not a valid escape sequence, and a plain literal raises
# a SyntaxWarning on recent Python versions. The label text is unchanged.
plt.xlabel(r'$-\log(\lambda)$', fontsize=20)
plt.ylabel('Coefficients', fontsize=20)
plt.title('LASSO Coefficient Paths')
plt.show()

In [None]:
# Plot cross-validated MSE against the penalty strength, with one-standard-
# error bars and a dashed line at the selected alpha.
mse_path = tuned_lasso.mse_path_
# Number of CV folds read from the fitted model rather than hard-coded.
n_folds = mse_path.shape[1]
mean_mse = mse_path.mean(axis=1)
std_mse = mse_path.std(axis=1)

plt.figure(figsize=(8, 6))
plt.errorbar(-np.log10(tuned_lasso.alphas_), mean_mse, yerr=std_mse / np.sqrt(n_folds))
plt.axvline(-np.log10(tuned_lasso.alpha_), color='k', linestyle='--')
# Raw string: '\l' is not a valid escape sequence in a plain literal.
plt.xlabel(r'$-\log(\lambda)$', fontsize=20)
plt.ylabel('Cross-validated MSE', fontsize=20)
plt.title('LASSO Cross-Validation Error')
# No fixed y-limits: the target is the integer label code, so the MSE is
# nowhere near the 50,000-250,000 window that was hard-coded here and the
# curve would fall entirely outside the axes. Matplotlib autoscaling is used.
plt.show()