In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the feature table from disk.
features_df = pd.read_csv('features.csv')

# Every column except the last is treated as a feature; the last column is the label.
feature_matrix = features_df.iloc[:, :-1].values
raw_labels = features_df.iloc[:, -1].values

# Map the raw labels to integer class ids (the fitted encoder keeps the mapping).
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(raw_labels)

# Convert to PyTorch tensors: float32 features, int64 targets.
X = torch.tensor(feature_matrix, dtype=torch.float32)
y = torch.tensor(encoded_labels, dtype=torch.int64)

# Hold out 40% of the rows, then halve that hold-out into validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Standardize the features: statistics are learned on the training split only
# and then applied unchanged to all three splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_val, X_test = [
    scaler.transform(split) for split in (X_train, X_val, X_test)
]

# Define the RNN model
class RNN(nn.Module):
    """A single ``nn.RNN`` layer followed by a linear classification head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the recurrent hidden state.
        num_classes: Number of logits produced per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        # batch_first=True: inputs and outputs are indexed (batch, seq, feature).
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return the logits computed from the RNN output at the last time step."""
        sequence_states, _final_state = self.rnn(x)
        last_step = sequence_states[:, -1, :]
        return self.fc(last_step)

# Seed torch so that weight initialisation (and therefore the whole run) is repeatable.
torch.manual_seed(42)

input_size = X_train.shape[1]
hidden_size = 64  # You can adjust this as needed
# Take the class count from the fitted encoder rather than from the training split:
# a class that happens to be missing from y_train would otherwise produce
# out-of-range targets for CrossEntropyLoss on the validation/test splits.
num_classes = len(label_encoder.classes_)

model = RNN(input_size, hidden_size, num_classes)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert each split to a tensor once; the data does not change between epochs.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

# Each sample is fed to the RNN as a sequence of length 1: (batch, 1, features).
train_inputs = X_train_tensor.unsqueeze(1)
val_inputs = X_val_tensor.unsqueeze(1)
test_inputs = X_test_tensor.unsqueeze(1)

# Training loop (full-batch: one optimizer step per epoch)
num_epochs = 300  # You can adjust this as needed
log_every = 1  # Print metrics every `log_every` epochs

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(train_inputs)
    # The logits are already (batch, num_classes); squeezing them would break
    # the loss for a batch of one sample or a single-class problem.
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = model(val_inputs)
        val_loss = criterion(val_outputs, y_val)
        _, val_predicted = torch.max(val_outputs, 1)
        val_accuracy = accuracy_score(y_val.numpy(), val_predicted.numpy())

    if (epoch + 1) % log_every == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {loss.item():.4f}')
        print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss.item():.4f}, Validation Accuracy: {val_accuracy:.4f}')

# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    test_outputs = model(test_inputs)
    _, test_predicted = torch.max(test_outputs, 1)
    test_accuracy = accuracy_score(y_test.numpy(), test_predicted.numpy())

print("Test Accuracy:", test_accuracy)


Epoch [1/300], Training Loss: 0.6664
Epoch [1/300], Validation Loss: 0.5877, Validation Accuracy: 0.7377
Epoch [2/300], Training Loss: 0.5916
Epoch [2/300], Validation Loss: 0.5218, Validation Accuracy: 0.8147
Epoch [3/300], Training Loss: 0.5246
Epoch [3/300], Validation Loss: 0.4633, Validation Accuracy: 0.8803
Epoch [4/300], Training Loss: 0.4651
Epoch [4/300], Validation Loss: 0.4117, Validation Accuracy: 0.9202
Epoch [5/300], Training Loss: 0.4128
Epoch [5/300], Validation Loss: 0.3666, Validation Accuracy: 0.9487
Epoch [6/300], Training Loss: 0.3670
Epoch [6/300], Validation Loss: 0.3273, Validation Accuracy: 0.9587
Epoch [7/300], Training Loss: 0.3272
Epoch [7/300], Validation Loss: 0.2933, Validation Accuracy: 0.9629
Epoch [8/300], Training Loss: 0.2927
Epoch [8/300], Validation Loss: 0.2638, Validation Accuracy: 0.9672
Epoch [9/300], Training Loss: 0.2630
Epoch [9/300], Validation Loss: 0.2383, Validation Accuracy: 0.9694
Epoch [10/300], Training Loss: 0.2373
Epoch [10/300], V

Epoch [97/300], Training Loss: 0.0251
Epoch [97/300], Validation Loss: 0.0234, Validation Accuracy: 0.9936
Epoch [98/300], Training Loss: 0.0249
Epoch [98/300], Validation Loss: 0.0232, Validation Accuracy: 0.9936
Epoch [99/300], Training Loss: 0.0247
Epoch [99/300], Validation Loss: 0.0230, Validation Accuracy: 0.9936
Epoch [100/300], Training Loss: 0.0245
Epoch [100/300], Validation Loss: 0.0228, Validation Accuracy: 0.9936
Epoch [101/300], Training Loss: 0.0243
Epoch [101/300], Validation Loss: 0.0226, Validation Accuracy: 0.9936
Epoch [102/300], Training Loss: 0.0241
Epoch [102/300], Validation Loss: 0.0224, Validation Accuracy: 0.9943
Epoch [103/300], Training Loss: 0.0239
Epoch [103/300], Validation Loss: 0.0222, Validation Accuracy: 0.9943
Epoch [104/300], Training Loss: 0.0237
Epoch [104/300], Validation Loss: 0.0220, Validation Accuracy: 0.9943
Epoch [105/300], Training Loss: 0.0235
Epoch [105/300], Validation Loss: 0.0218, Validation Accuracy: 0.9943
Epoch [106/300], Training

Epoch [179/300], Training Loss: 0.0145
Epoch [179/300], Validation Loss: 0.0122, Validation Accuracy: 0.9979
Epoch [180/300], Training Loss: 0.0144
Epoch [180/300], Validation Loss: 0.0122, Validation Accuracy: 0.9979
Epoch [181/300], Training Loss: 0.0143
Epoch [181/300], Validation Loss: 0.0121, Validation Accuracy: 0.9979
Epoch [182/300], Training Loss: 0.0142
Epoch [182/300], Validation Loss: 0.0120, Validation Accuracy: 0.9979
Epoch [183/300], Training Loss: 0.0141
Epoch [183/300], Validation Loss: 0.0119, Validation Accuracy: 0.9979
Epoch [184/300], Training Loss: 0.0141
Epoch [184/300], Validation Loss: 0.0118, Validation Accuracy: 0.9979
Epoch [185/300], Training Loss: 0.0140
Epoch [185/300], Validation Loss: 0.0118, Validation Accuracy: 0.9979
Epoch [186/300], Training Loss: 0.0139
Epoch [186/300], Validation Loss: 0.0117, Validation Accuracy: 0.9979
Epoch [187/300], Training Loss: 0.0138
Epoch [187/300], Validation Loss: 0.0116, Validation Accuracy: 0.9979
Epoch [188/300], Tr

Epoch [274/300], Training Loss: 0.0088
Epoch [274/300], Validation Loss: 0.0069, Validation Accuracy: 0.9986
Epoch [275/300], Training Loss: 0.0088
Epoch [275/300], Validation Loss: 0.0069, Validation Accuracy: 0.9986
Epoch [276/300], Training Loss: 0.0088
Epoch [276/300], Validation Loss: 0.0068, Validation Accuracy: 0.9986
Epoch [277/300], Training Loss: 0.0087
Epoch [277/300], Validation Loss: 0.0068, Validation Accuracy: 0.9986
Epoch [278/300], Training Loss: 0.0087
Epoch [278/300], Validation Loss: 0.0068, Validation Accuracy: 0.9986
Epoch [279/300], Training Loss: 0.0086
Epoch [279/300], Validation Loss: 0.0067, Validation Accuracy: 0.9986
Epoch [280/300], Training Loss: 0.0086
Epoch [280/300], Validation Loss: 0.0067, Validation Accuracy: 0.9986
Epoch [281/300], Training Loss: 0.0085
Epoch [281/300], Validation Loss: 0.0067, Validation Accuracy: 0.9986
Epoch [282/300], Training Loss: 0.0085
Epoch [282/300], Validation Loss: 0.0066, Validation Accuracy: 0.9986
Epoch [283/300], Tr

In [2]:
import joblib

# Save the learned weights as a state_dict. A later cell restores the model with
# torch.load('rnn_model.pth') + load_state_dict, so this file has to be written
# here for the notebook to run top to bottom on a fresh kernel.
torch.save(model.state_dict(), 'rnn_model.pth')

# Also keep the whole-object dump of the trained PyTorch RNN model.
# NOTE: this is a pickle of the full module; only load it from trusted sources,
# and prefer the state_dict file above for restoring the model.
joblib.dump(model, 'pytorch_model_new.pkl')

['pytorch_model_new.pkl']

In [3]:
import os
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm

def _time_average(feature_matrix):
    """Collapse a per-frame librosa feature matrix to one mean value per feature row."""
    # presumably librosa returns (n_features, n_frames); transposing and averaging
    # over axis 0 therefore averages across frames -- TODO confirm against librosa docs
    return np.mean(feature_matrix.T, axis=0)


def extract_features(file_name):
    """Load an audio file and summarise it as a single 1-D feature vector.

    The vector is the concatenation, in this order, of the time-averaged
    MFCCs (40 coefficients), chroma STFT, spectral contrast, spectral centroid,
    zero-crossing rate and spectral rolloff.

    Args:
        file_name: Path of the audio file to load with ``librosa.load``.

    Returns:
        A 1-D numpy array of features, or ``None`` if loading or feature
        extraction raised an exception (the failure is printed, not re-raised).
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # Per-frame features; the order here fixes the column order of the result.
        per_frame_features = [
            # MFCC (Mel-frequency cepstral coefficients)
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40),
            # Chroma feature
            librosa.feature.chroma_stft(y=audio, sr=sample_rate),
            # Spectral contrast
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate),
            # Spectral centroid
            librosa.feature.spectral_centroid(y=audio, sr=sample_rate),
            # Zero-crossing rate
            librosa.feature.zero_crossing_rate(y=audio),
            # Spectral rolloff
            librosa.feature.spectral_rolloff(y=audio, sr=sample_rate),
        ]

        # Combine all time-averaged features into a 1D array
        return np.hstack([_time_average(matrix) for matrix in per_frame_features])
    except Exception as e:
        # Best-effort: report which file failed and why, then let the caller skip it.
        print(f"Error encountered while parsing file: {file_name} ({type(e).__name__}: {e})")
        return None

# Specify the directories containing the .wav files
directories = ['something']

# Collect one feature vector per successfully parsed file, then build the
# DataFrame once at the end instead of growing it inside the loop.
feature_rows = []

for directory in directories:
    print(f"Processing files in {directory} directory")
    # Sorted so the file order (and hence the column order below) is deterministic.
    for filename in tqdm(sorted(os.listdir(directory))):
        if not filename.endswith('.wav'):
            continue
        file_path = os.path.join(directory, filename)
        # extract_features reports its own errors and returns None on failure.
        features = extract_features(file_path)
        if features is not None:
            feature_rows.append(features)

# Keep the layout the following cells expect: one column per file, one row per
# feature, so that `features_df.T` yields one row per file. Concatenating the
# per-file Series along axis 0 stacked every file into a single long column,
# which only had the right shape when exactly one file was processed.
features_df = pd.DataFrame(feature_rows).T



# Rename the DataFrame columns as needed
# features_df.columns = [list_of_feature_names]

# Now, you have the features in the 'features_df' DataFrame.

Processing files in something directory


100%|█████████████████████████████████████████████| 1/1 [00:03<00:00,  3.21s/it]


In [5]:
# Transpose so that the rows of features_df become the columns of X_new.
X_new = features_df.transpose()
X_new.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,61
0,-224.141861,123.548615,-11.934964,35.14872,-2.978472,11.988633,-10.192609,7.214376,-13.9877,1.976113,...,20.868756,15.711451,19.730115,19.38868,19.780333,19.137458,41.547228,1692.69833,0.082245,3374.557483


In [7]:
# Build a fresh RNN with the same sizes used for the trained model;
# load_state_dict needs matching parameter shapes.
loaded_model = RNN(input_size, hidden_size, num_classes)

# Read the saved weights from disk, then copy them into the fresh model.
saved_state = torch.load('rnn_model.pth')
loaded_model.load_state_dict(saved_state)


<All keys matched successfully>

In [9]:
loaded_model.eval()  # Set the model to evaluation mode

# Standardize the new data with the scaler fitted on the training split.
# The result goes into its own variable so X_new stays unscaled and this cell
# can be re-run without standardizing the same data twice.
X_new_scaled = scaler.transform(X_new)
X_new_tensor = torch.tensor(X_new_scaled, dtype=torch.float32)

with torch.no_grad():
    # Same input layout as in training: each sample is a sequence of length 1.
    new_outputs = loaded_model(X_new_tensor.unsqueeze(1))
    _, new_predicted = torch.max(new_outputs, 1)

# 'new_predicted' holds the encoded class indices (argmax over the logits)
print(new_predicted)

# Map the indices back to the original label values seen by the LabelEncoder
new_labels = label_encoder.inverse_transform(new_predicted.numpy())
print(new_labels)

tensor([1])
