In [1]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# 1. Dataset Path
PHISHING_PATH = 'PhishingVoiceDataset/Phishing/'
NONPHISHING_PATH = 'PhishingVoiceDataset/NonPhishing/'

# 2. Function to Extract MFCC Features from Audio Files
def extract_mfcc(file_path, n_mfcc=13):
    audio, sample_rate = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    return np.mean(mfccs.T, axis=0)

# 3. Load Dataset and Extract Features
def load_dataset(phishing_path, non_phishing_path):
    features = []
    labels = []

    # Phishing audio files
    for file in os.listdir(phishing_path):
        if file.endswith('.mp3') or file.endswith('.wav'):
            file_path = os.path.join(phishing_path, file)
            mfcc_features = extract_mfcc(file_path)
            features.append(mfcc_features)
            labels.append(1)  # 1 for phishing

    # Non-phishing audio files
    for file in os.listdir(non_phishing_path):
        if file.endswith('.mp3') or file.endswith('.wav'):
            file_path = os.path.join(non_phishing_path, file)
            mfcc_features = extract_mfcc(file_path)
            features.append(mfcc_features)
            labels.append(0)  # 0 for non-phishing

    return np.array(features), np.array(labels)

# 4. Load and Prepare Data
X, y = load_dataset(PHISHING_PATH, NONPHISHING_PATH)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Standardize Features (Feature Scaling)
scaler = StandardScaler()

# 6. Create a Pipeline for Scaling and Random Forest
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1))  # n_jobs=-1 uses all available CPU cores
])

# 7. Hyperparameter Optimization with GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200, 300],  # Number of trees in the forest
    'classifier__max_depth': [10, 20, None],  # Maximum depth of the tree
    'classifier__min_samples_split': [2, 5, 10],  # Minimum samples required to split an internal node
    'classifier__min_samples_leaf': [1, 2, 4],  # Minimum samples required to be at a leaf node
    'classifier__bootstrap': [True, False],  # Whether bootstrap samples are used when building trees
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# 8. Train the Model with Best Hyperparameters
grid_search.fit(X_train, y_train)

# Get the best parameters
print(f"Best parameters found: {grid_search.best_params_}")

# 9. Evaluate the Optimized Model
y_pred = grid_search.predict(X_test)

# Accuracy and Classification Report
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized Model Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred, target_names=['Non-Phishing', 'Phishing']))

# 10. Cross-Validation to Evaluate Generalization
cross_val_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5)
print(f"Cross-Validation Accuracy: {cross_val_scores.mean():.2f} (+/- {cross_val_scores.std():.2f})")

# 11. Save the Optimized Model
import pickle
with open('optimized_phishing_voice_model.pkl', 'wb') as model_file:
    pickle.dump(grid_search.best_estimator_, model_file)

print("Optimized model saved successfully as 'optimized_phishing_voice_model.pkl'.")


Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV] END classifier__bootstrap=True, classifier__max_depth=10, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.3s
[CV] END classifier__bootstrap=True, classifier__max_depth=10, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.3s
[CV] END classifier__bootstrap=True, classifier__max_depth=10, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.2s
[CV] END classifier__bootstrap=True, classifier__max_depth=10, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.3s
[CV] END classifier__bootstrap=True, classifier__max_depth=10, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=200; total time=   0.5s
[CV] END classifier__bootstrap=True, classifier__max

In [3]:
import os
import librosa
import numpy as np
import pickle

# Load the saved model
with open('optimized_phishing_voice_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Function to Extract MFCC Features from an Audio File
def extract_mfcc(file_path, n_mfcc=13):
    audio, sample_rate = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    return np.mean(mfccs.T, axis=0)

# Function to Predict Phishing or Non-Phishing from an Audio File
def predict_phishing(file_path):
    mfcc_features = extract_mfcc(file_path)
    mfcc_features = np.array([mfcc_features])  # Reshape for model input
    prediction = model.predict(mfcc_features)
    
    if prediction[0] == 1:
        print(f"The file '{os.path.basename(file_path)}' is predicted to be a PHISHING audio.")
    else:
        print(f"The file '{os.path.basename(file_path)}' is predicted to be a NON-PHISHING audio.")

# Test on Audio Files in /mnt/data folder
mnt_data_path = 'mnt'

# Loop through all files in the /mnt/data/ directory
for file_name in os.listdir(mnt_data_path):
    if file_name.endswith('.mp3') or file_name.endswith('.wav'):
        file_path = os.path.join(mnt_data_path, file_name)
        predict_phishing(file_path)


The file 'n5.mp3' is predicted to be a NON-PHISHING audio.
The file 'p1.mp3' is predicted to be a PHISHING audio.
The file 'n4.mp3' is predicted to be a NON-PHISHING audio.
The file 'n3.mp3' is predicted to be a NON-PHISHING audio.
The file 'p4.mp3' is predicted to be a PHISHING audio.
The file 'p3.mp3' is predicted to be a PHISHING audio.
The file 'p5.mp3' is predicted to be a PHISHING audio.
The file 'p2.mp3' is predicted to be a PHISHING audio.
The file 'n2.mp3' is predicted to be a NON-PHISHING audio.
The file 'n1.mp3' is predicted to be a NON-PHISHING audio.
