In [1]:
import os
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


In [2]:
def extract_features(file_path):
    try:
        y, sr = librosa.load(file_path, sr=None)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        return np.mean(mfcc.T, axis=0)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None


In [3]:
train_df = pd.read_csv('train.csv')
audio_files = os.listdir('audios_train')

train_df['file_exists'] = train_df['filename'].apply(lambda x: x in audio_files)
print("Total entries:", len(train_df))
print("Files that exist:", train_df['file_exists'].sum())

# Filter only existing audio files
train_df = train_df[train_df['file_exists']]


Total entries: 444
Files that exist: 444


In [4]:
X = []
y = []

for _, row in train_df.iterrows():
    file_path = os.path.join('audios_train', row['filename'])
    features = extract_features(file_path)
    if features is not None:
        X.append(features)
        y.append(row['label'])

X = np.array(X)
y = np.array(y)

print("Extracted features:", X.shape)


Extracted features: (444, 13)


In [5]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [6]:
# Filter out labels with less than 2 occurrences
from collections import Counter

label_counts = Counter(y)
filtered_indices = [i for i, label in enumerate(y) if label_counts[label] >= 2]

X = X[filtered_indices]
y = y[filtered_indices]


In [7]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [8]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
val_preds = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, val_preds))


ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")


NameError: name 'X_test' is not defined

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Assuming 'X' is your feature set and 'y' is your target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")


Mean Squared Error: 1.0942
R² Score: 0.2103


In [11]:
test_df = pd.read_csv('test.csv')
test_audio_files = os.listdir('audios_test')

test_df['file_exists'] = test_df['filename'].apply(lambda x: x in test_audio_files)
test_df = test_df[test_df['file_exists']]

X_test = []
test_ids = []

for _, row in test_df.iterrows():
    file_path = os.path.join('audios_test', row['filename'])
    features = extract_features(file_path)
    if features is not None:
        X_test.append(features)
        test_ids.append(row['filename'])

X_test = np.array(X_test)
test_preds = model.predict(X_test)


In [20]:
submission = pd.DataFrame({
    'filename': test_ids,
    'label': test_preds
})

submission.to_csv('submission.csv', index=False)
print("✅ submission.csv created!")


✅ submission.csv created!
