In [3]:
import os
import numpy as np
import librosa
import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

# Directory paths
base_path = 'PhishingVoiceDataset'
phishing_path = os.path.join(base_path, 'Phishing')
non_phishing_path = os.path.join(base_path, 'NonPhishing')

# Function to extract features (MFCCs) from an audio file
def extract_features(file_path):
    try:
        audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        mfccs_processed = np.mean(mfccs.T, axis=0)
    except Exception as e:
        print("Error encountered while parsing file: ", file_path)
        return None 
    return mfccs_processed

# Data augmentation function (time stretching and pitch shifting)
def augment_audio(audio, sample_rate):
    audio_time_stretch = librosa.effects.time_stretch(audio, rate=0.9)  # Time stretch
    audio_pitch_shift = librosa.effects.pitch_shift(audio, sr=sample_rate, n_steps=2)  # Pitch shift
    return audio_time_stretch, audio_pitch_shift

# Parse files, extract features, and apply data augmentation
features = []
labels = []

for folder, label in [(phishing_path, 1), (non_phishing_path, 0)]:
    for file in os.listdir(folder):
        file_path = os.path.join(folder, file)
        if file_path.endswith('.mp3'):
            audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
            mfccs = extract_features(file_path)
            if mfccs is not None:
                features.append(mfccs)
                labels.append(label)
                
                # Apply Data Augmentation
                augmented_time_stretch, augmented_pitch_shift = augment_audio(audio, sample_rate)
                mfccs_aug_time = np.mean(librosa.feature.mfcc(y=augmented_time_stretch, sr=sample_rate, n_mfcc=40).T, axis=0)
                mfccs_aug_pitch = np.mean(librosa.feature.mfcc(y=augmented_pitch_shift, sr=sample_rate, n_mfcc=40).T, axis=0)
                
                # Append augmented data
                features.append(mfccs_aug_time)
                labels.append(label)
                features.append(mfccs_aug_pitch)
                labels.append(label)

# Convert into Numpy arrays
features = np.array(features)
labels = np.array(labels)

# Feature scaling
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Compute class weights to handle class imbalance
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Model Selection: Gradient Boosting Classifier with Grid Search for Hyperparameter Optimization
model = GradientBoostingClassifier(random_state=0)

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.05],
    'max_depth': [3, 4, 5]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)

# Cross-validation setup using KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

# Train and evaluate the model using KFold Cross-Validation
best_model = None
best_score = 0
for train_idx, test_idx in kfold.split(features_scaled):
    X_train, X_test = features_scaled[train_idx], features_scaled[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]
    
    # Fit the model using GridSearchCV
    grid_search.fit(X_train, y_train)
    print(f"Best parameters: {grid_search.best_params_}")
    
    # Evaluate the model
    predictions = grid_search.predict(X_test)
    score = grid_search.score(X_test, y_test)
    
    print(classification_report(y_test, predictions))
    
    # Save the best performing model
    if score > best_score:
        best_score = score
        best_model = grid_search.best_estimator_

# Save the best model to disk
joblib.dump(best_model, 'phishing_detection_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
print(f"Best model accuracy: {best_score}")


Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
              precision    recall  f1-score   support

           0       0.74      0.64      0.68        22
           1       0.72      0.81      0.76        26

    accuracy                           0.73        48
   macro avg       0.73      0.72      0.72        48
weighted avg       0.73      0.73      0.73        48

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
              precision    recall  f1-score   support

           0       0.79      0.62      0.70        24
           1       0.69      0.83      0.75        24

    accuracy                           0.73        48
   macro avg       0.74      0.73      0.73        48
weighted avg       0.74      0.73      0.73        48

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best par

In [4]:
import os
import joblib
import numpy as np
import librosa

# Load the saved model and scaler
loaded_model = joblib.load('phishing_detection_model.pkl')
loaded_scaler = joblib.load('scaler.pkl')

# Function to extract features (MFCCs) from an audio file
def extract_features(file_path):
    try:
        audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        mfccs_processed = np.mean(mfccs.T, axis=0)
    except Exception as e:
        print("Error encountered while parsing file: ", file_path)
        return None 
    return mfccs_processed

# Function to predict on a new file
def predict_phishing(file_path, model, scaler):
    # Extract MFCCs from the audio file
    features = extract_features(file_path)
    
    # Check if feature extraction was successful
    if features is not None:
        # Scale the features using the loaded scaler
        features_scaled = scaler.transform([features])
        
        # Make the prediction (0 = Non-Phishing, 1 = Phishing)
        prediction = model.predict(features_scaled)
        proba = model.predict_proba(features_scaled)
        
        return prediction[0], proba
    else:
        return None, None

# Directory paths for test files
test_folder_path = 'mnt'

# Iterate over all files in the test folder and predict
results = []

for root, dirs, files in os.walk(test_folder_path):
    for file in files:
        if file.endswith('.mp3'):  # Ensure only MP3 files are processed
            file_path = os.path.join(root, file)
            prediction, proba = predict_phishing(file_path, loaded_model, loaded_scaler)
            
            if prediction is not None:
                if prediction == 1:
                    result = f"File: {file} is predicted as Phishing with probability {proba[0][1]:.2f}"
                else:
                    result = f"File: {file} is predicted as Non-Phishing with probability {proba[0][0]:.2f}"
                
                print(result)  # Print each result to the console
                results.append(result)
            else:
                print(f"Failed to extract features from {file}")

# Save the results to a file
with open('phishing_detection_results.txt', 'w') as f:
    for result in results:
        f.write(result + '\n')

print("Bulk prediction completed. Results saved to phishing_detection_results.txt")


File: n5.mp3 is predicted as Non-Phishing with probability 0.66
File: p1.mp3 is predicted as Phishing with probability 0.99
File: n4.mp3 is predicted as Non-Phishing with probability 0.60
File: n3.mp3 is predicted as Non-Phishing with probability 0.96
File: p4.mp3 is predicted as Non-Phishing with probability 1.00
File: p3.mp3 is predicted as Phishing with probability 1.00
File: p5.mp3 is predicted as Phishing with probability 0.98
File: p2.mp3 is predicted as Phishing with probability 0.92
File: n2.mp3 is predicted as Non-Phishing with probability 0.98
Bulk prediction completed. Results saved to phishing_detection_results.txt
