In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

In [3]:
# --- Configuration ---
RAW_DATA_PATH = "../data/raw/YearPredictionMSD.txt"  # raw data file read by the loading cell
PLOT_SAVE_DIR = "../results/plots/"  # every figure in this notebook is saved here

# Create the plot directory up front; an already existing directory is fine.
os.makedirs(PLOT_SAVE_DIR, exist_ok=True)

# --- Optional: expose ../src on sys.path so its helpers can be imported ---
# The path is resolved against the current working directory and is appended
# only once, so re-running this cell does not grow sys.path.
module_path = os.path.abspath(os.path.join(os.pardir, 'src'))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
from data_processing import load_data, create_decade_bins
# If the src helpers are not reused, redefine them (or the needed parts) below.

In [None]:
# --- Data Loading ---
print(f"Loading raw data from: {RAW_DATA_PATH}")

# The file is read without a header row, so names are supplied explicitly:
# 'Year' first, then Feature_1 .. Feature_<N_FEATURES>.
# NOTE(review): meant to match the naming in data_processing.py - confirm.
N_FEATURES = 90
colnames = ['Year', *(f'Feature_{i+1}' for i in range(N_FEATURES))]

try:
    df_raw = pd.read_csv(RAW_DATA_PATH, header=None, names=colnames)
    print(f"Data loaded successfully. Shape: {df_raw.shape}")
    print("\nFirst 5 rows of raw data:", df_raw.head(), sep="\n")
    print("\nBasic data info:")
    df_raw.info()
except FileNotFoundError:
    # Give a readable hint, then re-raise so the notebook stops here instead
    # of failing later on an undefined df_raw.
    print(f"ERROR: Raw data file not found at {RAW_DATA_PATH}. Please ensure it's downloaded.")
    raise
except Exception as e:
    # Any other failure is reported and propagated as well.
    print(f"An error occurred during data loading: {e}")
    raise

In [None]:
# --- Decade Binning (Reproduce logic from data_processing.py) ---
print("\nCreating decade bins for analysis...")
min_year = 1920  # reference decade; it receives label 0

# Floor each year to the first year of its decade (e.g. 1987 -> 1980).
df_raw['Decade_Start'] = df_raw['Year'].floordiv(10).mul(10)

# Integer label = number of decades after min_year. Years before min_year
# would give negative labels, so they are clipped into label 0.
df_raw['Decade_Label'] = (
    df_raw['Decade_Start']
    .sub(min_year)
    .floordiv(10)
    .astype(int)
    .clip(lower=0)
)

# Readable names for labels 0..9 ("1920s" .. "2010s"); a label outside that
# range has no entry in the map and therefore becomes NaN in Decade_Name.
decade_map = {i: f"{min_year + i*10}s" for i in range(10)}
df_raw['Decade_Name'] = df_raw['Decade_Label'].map(decade_map)
print("Decade columns ('Decade_Label', 'Decade_Name') added.")

In [None]:
# --- 1. Class Balance Analysis ---
print("\n--- 1. Class Balance Analysis ---")

# Bar chart of the number of songs per decade. The bar order comes from
# decade_map, whose values were inserted in chronological order.
plt.figure(figsize=(10, 6))
sns.countplot(
    data=df_raw,
    x='Decade_Name',
    order=list(decade_map.values()),
    palette='viridis',
)
plt.gca().set(
    title='Distribution of Songs Across Decades',
    xlabel='Decade',
    ylabel='Number of Songs',
)
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig(os.path.join(PLOT_SAVE_DIR, 'eda_decade_distribution.png'))
plt.show()

# The same information as a table; sorting the decade names alphabetically
# also puts them in chronological order.
decade_counts = df_raw['Decade_Name'].value_counts().sort_index()
print("\nSong Counts per Decade:", decade_counts, sep="\n")
print(f"\nObservations: The dataset is heavily imbalanced, with a vast majority of songs from the 2000s, followed by the 1990s. Earlier decades have significantly fewer samples.")

In [None]:
# --- 2. Feature Distribution Analysis ---
print("\n--- 2. Feature Distribution Analysis ---")
# Detailed plots are limited to a subset of columns: positions 1..12, i.e.
# the twelve columns that directly follow 'Year'.
# NOTE(review): described in this notebook as the timbre averages - confirm
# against the dataset documentation. Covariance features could be added later.
features_to_plot = df_raw.columns[slice(1, 13)]
print(f"Plotting distributions for features: {list(features_to_plot)}")

In [None]:
# Histogram (with KDE overlay) of every selected feature, one panel each.
# The grid is fixed at 3x4, which fits exactly the 12 selected features;
# adjust the grid if features_to_plot is changed.
plt.figure(figsize=(15, 10))
for i, col in enumerate(features_to_plot):
    plt.subplot(3, 4, i + 1)  # subplot positions are 1-based
    sns.histplot(df_raw[col], kde=True, bins=50)
    plt.title(col)
    # The panel title already names the feature; drop the axis labels to
    # keep the grid uncluttered.
    plt.xlabel('')
    plt.ylabel('')
# y=1.02 puts the overall title just above the figure canvas so it cannot
# overlap the top row of panel titles.
plt.suptitle('Distribution of First 12 Features (Timbre Averages)', y=1.02)
plt.tight_layout()
# bbox_inches='tight' expands the saved area to include the suptitle. Without
# it the file is cropped to the canvas and the title (which sits above the
# canvas) is missing from the PNG even though the inline display shows it.
plt.savefig(
    os.path.join(PLOT_SAVE_DIR, 'eda_feature_distributions_hist.png'),
    bbox_inches='tight',
)
plt.show()

In [None]:
# Box plots summarise the spread of each selected feature side by side and
# draw the points that fall beyond the whiskers.
plt.figure(figsize=(15, 6))
sns.boxplot(data=df_raw.loc[:, features_to_plot], palette='viridis')
plt.gca().set(
    title='Box Plots of First 12 Features (Timbre Averages)',
    xlabel='Feature',
    ylabel='Value',
)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(os.path.join(PLOT_SAVE_DIR, 'eda_feature_distributions_box.png'))
plt.show()

In [None]:
# Reading guide for the distribution plots, followed by summary statistics
# for the same subset of columns.
print(f"\nObservations: Examine the plots for skewness, modality (number of peaks), and spread. Many features might appear roughly normally distributed but could have long tails (indicating outliers).")
print("Numerical summary:", df_raw[features_to_plot].describe(), sep="\n")

In [None]:
# --- 3. Correlation Analysis ---
print("\n--- 3. Correlation Analysis ---")
# Only columns named 'Feature_*' take part; the name filter leaves out Year
# and the derived decade columns.
feature_cols = df_raw.columns[df_raw.columns.str.startswith('Feature_')].tolist()
correlation_matrix = df_raw.loc[:, feature_cols].corr()

plt.figure(figsize=(18, 15))
# Cell annotations stay off: with 90 features the numbers would be unreadable.
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=False,
    fmt=".1f",
    linewidths=0.5,
)
plt.title('Correlation Matrix of Audio Features')
plt.tight_layout()
plt.savefig(os.path.join(PLOT_SAVE_DIR, 'eda_correlation_heatmap.png'))
plt.show()

In [None]:
# Find highly correlated pairs (optional)
threshold = 0.8
# True on and above the diagonal: this hides the self-correlations and the
# mirrored duplicate of every pair, leaving the strictly lower triangle.
mask = ~np.tril(np.ones(correlation_matrix.shape, dtype=bool), k=-1)
# Blank the masked cells, then blank every remaining cell whose absolute
# correlation is not above the threshold.
highly_correlated_filtered = (
    correlation_matrix
    .mask(mask)
    .pipe(lambda lower: lower.where(lower.abs() > threshold))
)

# Flatten to a Series indexed by feature pair, drop the blanked cells and
# sort by signed value (largest first, so negative correlations come last).
corr_pairs = highly_correlated_filtered.unstack().dropna().sort_values(ascending=False)

print(f"\nHighly Correlated Feature Pairs (Threshold > {threshold}):")
if corr_pairs.empty:
    print(f"No feature pairs found with absolute correlation above the threshold {threshold}.")
else:
    print(corr_pairs)

In [None]:
# --- 4. Outlier Analysis (using Box Plots from Feature Distribution) ---
# Nothing is computed here; the cell only points the reader back to the box plots.
print(
    "\n--- 4. Outlier Analysis ---",
    "Refer back to the box plots generated in the 'Feature Distribution Analysis' section.",
    "Box plots visually indicate potential outliers as points beyond the 'whiskers'.",
    sep="\n",
)


In [15]:
# Example: IQR-based bounds for a single feature
feature_example = 'Feature_1'
# One quantile call returns both quartiles, in the order requested.
Q1, Q3 = df_raw[feature_example].quantile([0.25, 0.75])
IQR = Q3 - Q1
# Values more than 1.5 * IQR beyond either quartile count as potential outliers.
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

In [None]:
# Rows whose example-feature value lies strictly below the lower bound or
# strictly above the upper bound.
outliers = df_raw.loc[
    df_raw[feature_example].lt(lower_bound) | df_raw[feature_example].gt(upper_bound)
]
# Report the bounds and how many rows (count and share of all rows) fall outside them.
print(
    f"\nExample Outlier Check for '{feature_example}':",
    f"  IQR: {IQR:.2f}",
    f"  Lower Bound (Q1 - 1.5*IQR): {lower_bound:.2f}",
    f"  Upper Bound (Q3 + 1.5*IQR): {upper_bound:.2f}",
    f"  Number of potential outliers (based on 1.5*IQR rule): {len(outliers)}",
    f"  Percentage of potential outliers: {len(outliers) / len(df_raw) * 100:.2f}%",
    sep="\n",
)

In [None]:
# Narrative only: records the outlier-handling decision and the alternatives
# that were considered. Nothing is computed in this cell.
print(
    f"\nObservations & Handling Strategy:",
    " - Many features show points beyond the 1.5*IQR whiskers, suggesting the presence of outliers.",
    " - Strategy Decision: For this project, we used StandardScaler in data_processing.py. While StandardScaler is sensitive to outliers, deep learning models (especially with techniques like Batch Norm, which we might test later) can sometimes be relatively robust.",
    " - Alternative strategies (not implemented here but considered):",
    "   - Use RobustScaler: Scales using percentiles, less sensitive to outliers.",
    "   - Clipping: Cap feature values at certain percentiles (e.g., 1st and 99th).",
    "   - Transformation: Apply log or Box-Cox transforms if features are highly skewed.",
    " - Chosen Approach: Proceed with StandardScaler, acknowledging the presence of outliers. We will monitor model performance and may revisit outlier handling if necessary.",
    sep="\n",
)

In [None]:
# --- 5. Categorical Features ---
print("\n--- 5. Categorical Features ---")
# Check data types again after loading.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df_raw.info()
# Collect the columns with object/category dtype. The derived Decade_Name
# column is dropped from the reported list (errors='ignore' keeps this safe
# when that column is absent or was not selected).
categorical_cols = df_raw.select_dtypes(include=['object', 'category']).columns
print(f"\nPotential categorical columns detected (excluding Decade_Name): {list(categorical_cols.drop('Decade_Name', errors='ignore'))}")
# NOTE(review): the statement below is fixed text, not derived from the check
# above - verify it against the printed list.
print("Observations: As expected for this dataset, all original predictor columns (Feature_1 to Feature_90) are numeric (float64). No categorical feature embedding strategy is required for the predictors.")

In [None]:
# --- Summary of EDA Findings ---
# Fixed recap text; none of these statements are computed from the data here.
print(
    "\n--- Summary of Key EDA Findings ---",
    "1.  **Target Variable (Decade):** Heavily imbalanced, dominated by 2000s and 1990s.",
    "2.  **Features:** All 90 predictor features are numeric (float).",
    "3.  **Distributions:** Feature distributions vary. Some are roughly normal, others might be skewed or have multiple peaks (visual inspection needed per feature).",
    "4.  **Correlations:** Some correlations exist between features, particularly noted visually within blocks (e.g., early timbre features, later covariance features). No extremely high correlations (>0.95) jumped out immediately in the sample check, but moderate correlations are present.",
    "5.  **Outliers:** Potential outliers detected in many features based on visual inspection of box plots and IQR checks.",
    "6.  **Missing Values:** No missing values detected by `df.info()` (consistent with dataset description).",
    "7.  **Preprocessing Decisions (Recap):**",
    "    - Decade binning successfully converted regression to classification.",
    "    - Stratified splitting addressed the class imbalance during data partitioning.",
    "    - StandardScaler was used for feature scaling, acknowledging outlier presence.",
    "    - No categorical encoding needed for predictors.",
    sep="\n",
)