In [1]:
import os
import numpy as np
import librosa
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Define the path to the audio files
audio_path_pop = '/home/minhah/vc/genres_original/pop/'

# Initialize an empty list to hold features and labels
features = []
labels = []

# Define a function to extract features from an audio file
def extract_features(y, sr):
    # Extract pitch (using librosa's pitch detection)
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    pitch = pitches[magnitudes > np.median(magnitudes)].mean() if magnitudes.any() else 0

    # Extract tempo (using librosa's tempo detection)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)

    return pitch, tempo

# Initialize lists to hold pitch and tempo values
pitches = []
tempos = []

# Loop through each file in the pop directory and extract features
for file_name in os.listdir(audio_path_pop):
    if file_name.endswith('.wav'):
        full_file_name = os.path.join(audio_path_pop, file_name)
        
        # Load the audio file
        y, sr = librosa.load(full_file_name, sr=None)
        
        # Divide audio into 1-second segments
        for start in range(0, len(y), sr):
            end = start + sr
            segment = y[start:end]
            if len(segment) < sr:
                break

            # Extract features from the segment
            pitch, tempo = extract_features(segment, sr)
            # Append the pitch and tempo to the lists
            pitches.append(pitch)
            tempos.append(tempo)

# Convert lists to numpy arrays for easy manipulation
pitches = np.array(pitches)
tempos = np.array(tempos)

# Calculate the range for pitch and tempo
pitch_min = np.min(pitches)
pitch_max = np.max(pitches)
tempo_min = np.min(tempos)
tempo_max = np.max(tempos)

print(f'Pitch range: {pitch_min} to {pitch_max}')
print(f'Tempo range: {tempo_min} to {tempo_max}')

# Define thresholds
pitch_threshold = 1268  # Example threshold for pitch
tempo_threshold = 136  # Example threshold for tempo

# Loop through each file in the pop directory and extract features
for file_name in os.listdir(audio_path_pop):
    if file_name.endswith('.wav'):
        full_file_name = os.path.join(audio_path_pop, file_name)
        
        # Load the audio file
        y, sr = librosa.load(full_file_name, sr=None)
        
        # Divide audio into 1-second segments
        for start in range(0, len(y), sr):
            end = start + sr
            segment = y[start:end]
            if len(segment) < sr:
                break

            # Extract features from the segment
            pitch, tempo = extract_features(segment, sr)
            
            if pitch is not None and tempo[0] is not None:
                # Define the label based on pitch and tempo
                if pitch < pitch_threshold and tempo[0] < tempo_threshold:
                    label = 'backward'
                elif pitch < pitch_threshold and tempo[0] >= tempo_threshold:
                    label = 'left'
                elif pitch >= pitch_threshold and tempo[0] < tempo_threshold:
                    label = 'forward'
                else:
                    label = 'right'
                
                # Append the features and label to the lists
                features.append([pitch, tempo[0]])
                labels.append(label)

# Convert features and labels to numpy arrays
X = np.array(features)
y = np.array(labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature values
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the Support Vector Classifier (SVC)
classifier = SVC(kernel='linear', random_state=42)

# Train the classifier
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

# Save the trained classifier and scaler
joblib.dump(classifier, 'classifier.joblib')
joblib.dump(scaler, 'scaler.joblib')

Pitch range: 288.100341796875 to 2310.0126953125
Tempo range: 67.99958881578948 to 287.109375
              precision    recall  f1-score   support

    backward       1.00      0.99      1.00       146
     forward       0.99      1.00      1.00       158
        left       1.00      1.00      1.00       141
       right       1.00      1.00      1.00       155

    accuracy                           1.00       600
   macro avg       1.00      1.00      1.00       600
weighted avg       1.00      1.00      1.00       600



['scaler.joblib']

In [1]:
import os
import numpy as np
import librosa
import joblib

# Define a function to extract features from an audio file
def extract_features(y, sr):
    # Extract pitch (using librosa's pitch detection)
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    pitch = pitches[magnitudes > np.median(magnitudes)].mean() if magnitudes.any() else 0

    # Extract tempo (using librosa's tempo detection)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)

    return pitch, tempo

# Function to process a single audio file and return directions per second
def process_audio_file(file_path):
    # Load the trained classifier and scaler
    classifier = joblib.load('classifier.joblib')
    scaler = joblib.load('scaler.joblib')
    # Load the audio file
    y, sr = librosa.load(file_path, sr=None)
    directions = []
    
    # Divide audio into 1-second segments
    for start in range(0, len(y), sr):
        end = start + sr
        segment = y[start:end]
        if len(segment) < sr:
            break

        # Extract features from the segment
        pitch, tempo = extract_features(segment, sr)
        
        if pitch is not None and tempo is not None:
            # Predict the direction using the trained classifier
            feature = np.array([[pitch, tempo[0]]])
            feature = scaler.transform(feature)  # Scale the features
            direction = classifier.predict(feature)[0]
            directions.append(direction)
    
    return directions

# Define the path to the rock audio files
audio_path = '/home/minhah/vc/genres_original/rock/rock.00086.wav'
directions = process_audio_file(audio_path)

# Print the directions
print(f'Directions:\n{directions}')


Directions:
['forward', 'right', 'right', 'forward', 'forward', 'forward', 'right', 'forward', 'forward', 'forward', 'forward', 'forward', 'forward', 'right', 'forward', 'forward', 'backward', 'forward', 'forward', 'backward', 'forward', 'forward', 'forward', 'forward', 'right', 'right', 'forward', 'forward', 'forward', 'forward']


In [2]:
# Define the path to the rock audio files
audio_path = '/home/minhah/vc/genres_original/country/country.00002.wav'
directions = process_audio_file(audio_path)

# Print the directions
print(f'Directions:\n{directions}')

Directions:
['left', 'left', 'left', 'left', 'backward', 'left', 'left', 'left', 'left', 'right', 'left', 'backward', 'right', 'right', 'left', 'left', 'left', 'left', 'left', 'left', 'left', 'backward', 'left', 'backward', 'left', 'forward', 'left', 'left', 'backward', 'left']
