In [12]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, TimeDistributed, Flatten
from tensorflow.keras.utils import to_categorical

# For reproducibility
np.random.seed(42)


In [13]:
def extract_features(file_path, n_mfcc=40, max_pad_len=174):
    """Extract MFCC features from an audio file and pad them to fixed length."""
    try:
        audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast') 
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
        
        # Pad the MFCCs to the maximum length
        pad_width = max_pad_len - mfccs.shape[1]
        if pad_width > 0:
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_pad_len]
        
        return mfccs
    except Exception as e:
        print(f"Error encountered while parsing file: {file_path}")
        return None


In [14]:
def load_dataset(dataset_dir):
    """Load and label phishing and non-phishing audio files."""
    labels = []
    mfcc_features = []
    
    for label_type in ['Phishing', 'NonPhishing']:
        folder_path = os.path.join(dataset_dir, label_type)
        
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.mp3'):
                file_path = os.path.join(folder_path, file_name)
                mfcc = extract_features(file_path)
                if mfcc is not None:
                    mfcc_features.append(mfcc)
                    labels.append(label_type)
    
    return np.array(mfcc_features), np.array(labels)

dataset_dir = 'PhishingVoiceDataset'
X, y = load_dataset(dataset_dir)

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_encoded = to_categorical(y_encoded)

# Reshape X for LSTM (samples, time_steps, features)
X = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


Error encountered while parsing file: PhishingVoiceDataset/Phishing/17.mp3
Error encountered while parsing file: PhishingVoiceDataset/Phishing/10.mp3
Error encountered while parsing file: PhishingVoiceDataset/Phishing/28.mp3
Error encountered while parsing file: PhishingVoiceDataset/Phishing/25.mp3
Error encountered while parsing file: PhishingVoiceDataset/Phishing/26.mp3
Error encountered while parsing file: PhishingVoiceDataset/Phishing/14.mp3
Error encountered while parsing file: PhishingVoiceDataset/Phishing/3.mp3
Error encountered while parsing file: PhishingVoiceDataset/Phishing/31.mp3
Error encountered while parsing file: PhishingVoiceDataset/Phishing/4.mp3
Error encountered while parsing file: PhishingVoiceDataset/Phishing/9.mp3
Error encountered while parsing file: PhishingVoiceDataset/Phishing/33.mp3
Error encountered while parsing file: PhishingVoiceDataset/Phishing/22.mp3
Error encountered while parsing file: PhishingVoiceDataset/Phishing/39.mp3
Error encountered while pars

ValueError: zero-size array to reduction operation maximum which has no identity