In [7]:
# ===============================
# STEP 1: Imports and Setup
# ===============================
import numpy as np
np.complex = complex  # Patch for librosa compatibility

import os
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# ===============================
# STEP 2: Load Data
# ===============================
AUDIO_DIR = 'audios/train'  # Make sure this is the correct path
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'
SUBMISSION_CSV = 'sample_submission.csv'

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

print("✅ Train data shape:", train_df.shape)
print("✅ Test data shape:", test_df.shape)
print("✅ Train columns:", train_df.columns)

# ===============================
# STEP 3: Feature Extraction
# ===============================
def extract_features(filepath):
    try:
        y, sr = librosa.load(filepath, sr=16000)
        y, _ = librosa.effects.trim(y)

        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        rmse = librosa.feature.rms(y=y)

        features = np.hstack([
            np.mean(mfcc, axis=1), np.std(mfcc, axis=1),
            np.mean(chroma, axis=1), np.std(chroma, axis=1),
            np.mean(zcr), np.std(zcr),
            np.mean(rmse), np.std(rmse)
        ])
        return features
    except Exception as e:
        print(f"❌ Error processing {filepath}: {e}")
        return np.zeros(60,)  # Adjust size based on feature vector

# ===============================
# STEP 4: Extract Training Features
# ===============================
X, y = [], []

for _, row in train_df.iterrows():
    file_path = os.path.join(AUDIO_DIR, row['filename'])
    if os.path.exists(file_path):
        features = extract_features(file_path)
        X.append(features)
        y.append(row['label'])
    else:
        print(f"❌ File not found: {file_path}")

X = np.array(X)
y = np.array(y)

# ===============================
# STEP 5: Preprocess and Split
# ===============================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# ===============================
# STEP 6: Train Model
# ===============================
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# ==============================
# ==============================
# Evaluation
# ==============================
train_preds = model.predict(X_train)
val_preds = model.predict(X_val)

train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))

print(f"Train RMSE: {train_rmse:.4f}")
print(f"Validation RMSE: {val_rmse:.4f}")


# ===============================
# STEP 8: Predict Test Set
# ===============================
X_test = []
for _, row in test_df.iterrows():
    file_path = os.path.join(AUDIO_DIR, row['filename'])
    if os.path.exists(file_path):
        features = extract_features(file_path)
        X_test.append(features)
    else:
        print(f"❌ Test file not found: {file_path}")
        X_test.append(np.zeros_like(X[0]))

X_test = scaler.transform(np.array(X_test))
test_preds = model.predict(X_test)

# ===============================
# STEP 9: Create Submission File
# ===============================
submission = pd.read_csv(SUBMISSION_CSV)
submission['label'] = test_preds
submission.to_csv('submission.csv', index=False)

# ===============================
# STEP 10: Summary
# ===============================
print("\n✅ Training Complete!")
print(f"📁 Submission file saved as submission.csv")


✅ Train data shape: (444, 2)
✅ Test data shape: (204, 1)
✅ Train columns: Index(['filename', 'label'], dtype='object')
Train RMSE: 0.3272
Validation RMSE: 0.8650
❌ Test file not found: audios/train\audio_804.wav
❌ Test file not found: audios/train\audio_1028.wav
❌ Test file not found: audios/train\audio_865.wav
❌ Test file not found: audios/train\audio_774.wav
❌ Test file not found: audios/train\audio_1138.wav
❌ Test file not found: audios/train\audio_278.wav
❌ Test file not found: audios/train\audio_1212.wav
❌ Test file not found: audios/train\audio_178.wav
❌ Test file not found: audios/train\audio_542.wav
❌ Test file not found: audios/train\audio_248.wav
❌ Test file not found: audios/train\audio_872.wav
❌ Test file not found: audios/train\audio_954.wav
❌ Test file not found: audios/train\audio_853.wav
❌ Test file not found: audios/train\audio_171.wav
❌ Test file not found: audios/train\audio_922.wav
❌ Test file not found: audios/train\audio_915.wav
❌ Test file not found: audios/train