# The Audio Model

## Conversion from .g729a to .wav
Since the dataset contains audio files in the .g729a format, they must be converted to .wav format.

This code:


*   Defines input folders for stego (positive) and non-stego (negative) audio files.
*   Defines corresponding output folders for .wav conversions.
*   Uses ffmpeg via Python's subprocess to convert .g729a files to .wav.
*   Automatically creates output folders if they don’t exist.











In [None]:
import os
import subprocess

# Source directories holding the raw .g729a recordings:
# stego (positive) samples and non-stego (negative) samples.
positive_folder = r'dataset/g729a_Steg'
negative_folder = r'dataset/g729a_0'

# Destination directories for the converted .wav files.
positive_output = r'dataset/positive_wav'
negative_output = r'dataset/negative_wav'

# Make sure both destinations exist; exist_ok avoids an error on re-runs.
for _output_dir in (positive_output, negative_output):
    os.makedirs(_output_dir, exist_ok=True)

def convert_to_wav(input_folder, output_folder):
    """Convert every ``.g729a`` file in *input_folder* to ``.wav`` with ffmpeg.

    Each output file keeps the input's base name and is written to
    *output_folder*, which must already exist. Files with any other
    extension are ignored.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.g729a`` files.
        output_folder: Directory receiving the ``.wav`` files.

    Returns:
        None. A message is printed for every file ffmpeg fails to convert.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable (or *input_folder*)
            cannot be found.
    """
    for filename in os.listdir(input_folder):
        if not filename.endswith('.g729a'):
            continue

        input_path = os.path.join(input_folder, filename)
        output_filename = os.path.splitext(filename)[0] + '.wav'
        output_path = os.path.join(output_folder, output_filename)

        # -y overwrites an existing output file; -f g729 tells ffmpeg to
        # read the input as G.729 instead of guessing the format.
        command = ['ffmpeg', '-y', '-f', 'g729', '-i', input_path, output_path]
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # A non-zero exit code means no usable .wav was produced; report it
        # instead of silently leaving a gap in the dataset.
        if result.returncode != 0:
            stderr_lines = result.stderr.decode(errors='replace').strip().splitlines()
            reason = stderr_lines[-1] if stderr_lines else f"exit code {result.returncode}"
            print(f"ffmpeg failed for {input_path}: {reason}")

# Run the conversion for each (source, destination) pair:
# positive samples first, then negative samples.
for _src_dir, _dst_dir in (
    (positive_folder, positive_output),
    (negative_folder, negative_output),
):
    convert_to_wav(_src_dir, _dst_dir)


We use ffmpeg to convert all the files of the dataset to .wav and save the converted files in new folders.

## Feature extraction

Once files are in .wav format, features that reflect the impact of steganographic embedding on audio signals are extracted.

The `extract_features` function uses the Librosa library to extract 13 MFCCs, Spectral Centroid, Spectral Bandwidth, Spectral Flatness and Zero-Crossing Rate, each averaged over time.

The `extract_lpc_features` function computes the LPC coefficients of each frame and summarises them by their mean, their variance and the mean absolute difference between consecutive frames (delta).


In [None]:
import os
import librosa
import numpy as np
import pandas as pd
from scipy.signal.windows import hamming
from librosa import lpc
import soundfile as sf
from tqdm import tqdm



def extract_features(y,sr):
    """Summarise a signal with time-averaged spectral descriptors.

    Args:
        y: Audio samples passed straight to the librosa feature functions.
        sr: Sampling rate of ``y``.

    Returns:
        A flat array: the 13 per-coefficient MFCC means followed by the mean
        spectral centroid, spectral bandwidth, spectral flatness and
        zero-crossing rate, in that order.
    """
    # One mean per MFCC coefficient (averaged along the frame axis).
    mfcc_means = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).mean(axis=1)

    # Each remaining descriptor is collapsed to a single scalar.
    scalar_descriptors = [
        np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
        np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr)),
        np.mean(librosa.feature.spectral_flatness(y=y)),
        np.mean(librosa.feature.zero_crossing_rate(y)),
    ]

    return np.hstack([mfcc_means, *scalar_descriptors])

def extract_lpc_features(audio, sr, frame_length=0.05, frame_shift=0.01, order=10, min_duration=0.2):
    """Compute frame-wise LPC statistics for an audio signal.

    The signal is cut into overlapping frames. Each frame is multiplied by a
    Hamming window, peak-normalised and fitted with an LPC model. The
    per-coefficient mean, variance and mean absolute frame-to-frame
    difference are concatenated into one vector.

    Args:
        audio: 1-D array of samples. It is never modified.
        sr: Sampling rate of ``audio`` in Hz.
        frame_length: Frame duration in seconds.
        frame_shift: Distance between consecutive frame starts in seconds.
        order: LPC order; each statistic contributes ``order`` values.
        min_duration: Signals shorter than this many seconds are skipped.

    Returns:
        A 1-D array ``[mean..., var..., delta...]`` of length ``3 * order``,
        or ``None`` when the signal is too short, no frame yields LPC
        coefficients, or an unexpected error occurs (a message is printed).
    """
    try:
        # Duration check
        duration = len(audio) / sr
        if duration < min_duration:
            print(f"Skipped (too short): {duration:.3f}s < {min_duration}s")
            return None

        frame_size = int(frame_length * sr)
        hop_size = int(frame_shift * sr)
        num_frames = 1 + (len(audio) - frame_size) // hop_size

        # The window depends only on the frame size, so build it once.
        window = hamming(frame_size)

        lpcs = []
        for i in range(num_frames):
            start = i * hop_size
            segment = audio[start:start + frame_size]
            if len(segment) < frame_size:
                break

            # Multiply into a NEW array. The slice is a view of ``audio``, so
            # an in-place ``*=`` would overwrite the caller's samples and,
            # because frames overlap, window the same samples repeatedly.
            frame = segment * window
            frame = frame / (np.max(np.abs(frame)) + 1e-6)

            try:
                a = lpc(frame, order=order)
                if len(a) == order + 1:  # LPC includes a0=1
                    lpcs.append(a[1:])  # Skip the first coeff (it's always 1)
            except Exception as e:
                # Best effort: one bad frame must not discard the whole file.
                print(f"Skipped unstable frame: {e}")
                continue

        lpcs = np.array(lpcs)
        if len(lpcs) == 0:
            return None

        # Compute mean, variance, and delta of LPCs
        mean_lpc = np.mean(lpcs, axis=0)
        var_lpc = np.var(lpcs, axis=0)
        delta_lpc = np.mean(np.abs(np.diff(lpcs, axis=0)), axis=0)

        return np.concatenate([mean_lpc, var_lpc, delta_lpc])

    except Exception as e:
        print(f"Error extracting LPC features: {e}")
        return None

Once individual feature extraction functions are ready, the entire dataset (positive and negative samples) is processed to extract features and compile them into a structured .csv file for model training.

Since the dataset separates the negative and positive samples into different folders, we need to assign a label to each file (0 for clean and 1 for stego) before building the combined .csv file.

In [None]:
def process_folder(folder_path, output_csv, label=None):
    """Extract features from every ``.wav`` file in a folder and save a CSV.

    Args:
        folder_path: Directory scanned (non-recursively) for ``.wav`` files.
        output_csv: Path of the CSV file to write.
        label: Optional class label appended to every row in a ``label``
            column. When ``None`` the column is omitted.

    Returns:
        None. Nothing is written when no file yields features; files that
        fail or yield no LPC features are reported and skipped.
    """
    data = []
    for file in tqdm(os.listdir(folder_path)):
        if not file.lower().endswith(".wav"):
            continue

        file_path = os.path.join(folder_path, file)
        try:
            audio, sr = sf.read(file_path)
            if audio.ndim > 1:  # Multi-channel input: keep the first channel
                audio = audio[:, 0]

            lpc_features = extract_lpc_features(audio, sr)
            if lpc_features is None:
                # The LPC extractor signals "unusable file" with None;
                # concatenating it would only raise an opaque error.
                print(f"Skipped {file}: no LPC features extracted")
                continue

            basic_features = extract_features(audio, sr)
            row = list(np.concatenate([basic_features, lpc_features]))
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"Error processing {file}: {e}")
            continue

        if label is not None:
            row.append(label)
        data.append(row)

    if not data:
        print(f"No features extracted from folder: {folder_path}")
        return

    columns = (
        [f"mfcc_{i+1}" for i in range(13)]
        + ["spectral_centroid", "spectral_bandwidth", "spectral_flatness", "zero_crossing_rate"]
        + [f"mean_lpc_{i}" for i in range(10)]
        + [f"var_lpc_{i}" for i in range(10)]
        + [f"delta_lpc_{i}" for i in range(10)]
    )
    # The rows only carry a label value when one was supplied, so the header
    # must follow suit or the DataFrame constructor rejects the data.
    if label is not None:
        columns.append('label')

    df = pd.DataFrame(data, columns=columns)
    df.to_csv(output_csv, index=False)
    print(f"Saved features to {output_csv}")


''' The code execution for the feature extraction for the full dataset
clean_folder = "dataset\\negative_wav"
stego_folder = "dataset\\positive_wav"

process_folder(clean_folder, "dataset\\clean_features.csv", label=0)
process_folder(stego_folder, "dataset\\stego_features.csv", label=1)

#Merging the two .csv files

df_clean = pd.read_csv("dataset\\clean_features.csv")
df_stego = pd.read_csv("dataset\\stego_features.csv")
df_all = pd.concat([df_clean, df_stego], ignore_index=True)
df_all.to_csv("dataset\\combined_features.csv", index=False)
'''



' The code execution for the feature extraction for the full dataset \nclean_folder = "dataset\\negative_wav"\nstego_folder = "dataset\\positive_wav"\n\nprocess_folder(clean_folder, "dataset\\clean_features.csv", label=0)\nprocess_folder(stego_folder, "dataset\\stego_features.csv", label=1)\n\n#Merging the two .csv files \n\ndf_clean = pd.read_csv("dataset\\clean_features.csv")\ndf_stego = pd.read_csv("dataset\\stego_features.csv")\ndf_all = pd.concat([df_clean, df_stego], ignore_index=True)\ndf_all.to_csv("dataset\\combined_features.csv", index=False)\n'

`extract_features_file` returns the combined feature vector for a single audio file, given its samples and sampling rate.

In [None]:
def extract_features_file(audio,sr):
    """Build the combined feature vector for one audio signal.

    Args:
        audio: Array of audio samples.
        sr: Sampling rate of ``audio``.

    Returns:
        The output of ``extract_features`` followed by the output of
        ``extract_lpc_features`` as one 1-D array, or ``None`` when the LPC
        features could not be computed.
    """
    lpc_feature = extract_lpc_features(audio, sr)
    if lpc_feature is None:
        # extract_lpc_features returns None for unusable input; report that
        # to the caller instead of letting np.concatenate raise on it.
        return None

    basic_features = extract_features(audio, sr)
    return np.concatenate([basic_features, lpc_feature])

## Training the Model

The model_learning.py script trains an XGBoost model on the saved combined_features.csv, which contains the features of every file in the dataset together with its label (0 for clean and 1 for stego).

In [None]:
import pandas as pd
import os
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from xgboost import XGBClassifier
import joblib

# Load the combined feature table.
csv_file = pd.read_csv('dataset//combined_features.csv')

# Separate the feature columns from the target column.
features = csv_file.drop(columns=['label'])
label = csv_file['label']

# Split into a training set and a test set (20% held out);
# the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=42,
)

# Configure the classifier.
model = XGBClassifier(
    n_estimators=100,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)

# Fit on the training split only.
print('training the model ...')
model.fit(x_train, y_train)
print("model trained")

To test the model, we print its accuracy and a full classification report, and plot the confusion matrix.

In [None]:
# Predict labels for the held-out test split.
y_pred = model.predict(x_test)

# Overall accuracy on the test split.
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")

# Full classification report.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix, drawn as an annotated heatmap with the same class
# names on both axes.
class_names = ["Clean", "Stego"]
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()


We saved the model as a .pkl file

In [None]:
# Persist the trained classifier to disk; the prediction cell reloads it
# from this same file name.
joblib.dump(model,"audio_detection_model.pkl")

## Final Prediction Model
To test the model on a new audio file, we run this code, which first checks that the file is a .wav file, then extracts its features and predicts whether the file is stego or clean.

In [None]:
import joblib
import numpy as np
from feature_extraction import extract_features_file # same one as before
import os

# Load the saved model (only the classifier was saved; there is no scaler)
model = joblib.load("audio_detection_model.pkl")


# Predict function
def predict(file_path):
    """Classify a ``.wav`` file as steganographic or clean and print the result.

    Args:
        file_path: Path to the audio file. It must end with ``.wav``
            (case-insensitive) and exist on disk.

    Returns:
        None. The verdict, or the reason the file was skipped, is printed.
    """
    if not file_path.lower().endswith(".wav"):
        print(f"Unsupported file type: {file_path}. Only .wav files are supported.")
        return

    if not os.path.isfile(file_path):
        print(f"File not found: {file_path}")
        return

    # Imported locally because this cell does not import soundfile at the
    # top, and the two guard paths above do not need it.
    import soundfile as sf

    # extract_features_file expects decoded samples plus their sampling
    # rate, not a path, so the file has to be read first.
    audio, sr = sf.read(file_path)
    if audio.ndim > 1:  # Multi-channel input: keep the first channel, as process_folder does
        audio = audio[:, 0]

    features = extract_features_file(audio, sr)
    if features is None:
        print("Could not extract features.")
        return

    # The model expects a 2-D (samples, features) input; this is one sample.
    prediction = model.predict(features.reshape(1, -1))
    print(f"Prediction for {file_path}: {'STEGANOGRAPHIC' if prediction[0] == 1 else 'CLEAN'}")