In [38]:
!pip install audiomentations



In [39]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import OneClassSVM
import os
import joblib
import json
import librosa
from audiomentations import AddGaussianNoise

In [40]:
def extract_features(audio_file, speaker=None, output_csv='/content/audio_features_augmented.csv'):
    y, sr = librosa.load(audio_file)
    duration = librosa.get_duration(y=y, sr=sr)

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfcc_mean = np.mean(mfcc, axis=1)
    mfcc_std = np.std(mfcc, axis=1)
    mfcc_delta = librosa.feature.delta(mfcc)
    mfcc_delta_mean = np.mean(mfcc_delta, axis=1)
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
    mfcc_delta2_mean = np.mean(mfcc_delta2, axis=1)

    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    rolloff_mean = np.mean(rolloff)
    rolloff_std = np.std(rolloff)
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    centroid_mean = np.mean(centroid)
    centroid_std = np.std(centroid)
    bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    bandwidth_mean = np.mean(bandwidth)
    bandwidth_std = np.std(bandwidth)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    contrast_mean = np.mean(contrast)
    contrast_std = np.std(contrast)
    flatness = librosa.feature.spectral_flatness(y=y)
    flatness_mean = np.mean(flatness)
    flatness_std = np.std(flatness)

    rms = librosa.feature.rms(y=y)
    rms_mean = np.mean(rms)
    rms_std = np.std(rms)
    zcr = librosa.feature.zero_crossing_rate(y=y)
    zcr_mean = np.mean(zcr)
    zcr_std = np.std(zcr)
    f0, _, _ = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
    f0_mean = np.mean(f0[~np.isnan(f0)]) if np.any(~np.isnan(f0)) else 0
    f0_std = np.std(f0[~np.isnan(f0)]) if np.any(~np.isnan(f0)) else 0
    f0_min = np.min(f0[~np.isnan(f0)]) if np.any(~np.isnan(f0)) else 0
    f0_max = np.max(f0[~np.isnan(f0)]) if np.any(~np.isnan(f0)) else 0

    order = 12
    autocorr = np.correlate(y, y, mode='full')
    autocorr = autocorr[len(autocorr)//2:len(autocorr)//2+order+1]
    r = autocorr[1:]
    R = autocorr[:-1]
    from scipy.linalg import solve_toeplitz
    lpc_coeffs = solve_toeplitz((R, R), r)[:order]
    lpc_coeffs = np.concatenate(([1], -lpc_coeffs))

    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    chroma_mean = np.mean(chroma, axis=1)

    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    onset_mean = np.mean(onset_env)
    onset_std = np.std(onset_env)

    features = {
        'file': audio_file,
        'speaker': speaker,
        'duration': duration,
        **{f'mfcc_{i}_mean': mfcc_mean[i] for i in range(13)},
        **{f'mfcc_{i}_std': mfcc_std[i] for i in range(13)},
        **{f'mfcc_delta_{i}_mean': mfcc_delta_mean[i] for i in range(13)},
        **{f'mfcc_delta2_{i}_mean': mfcc_delta2_mean[i] for i in range(13)},
        'rolloff_mean': rolloff_mean,
        'rolloff_std': rolloff_std,
        'centroid_mean': centroid_mean,
        'centroid_std': centroid_std,
        'bandwidth_mean': bandwidth_mean,
        'bandwidth_std': bandwidth_std,
        'contrast_mean': contrast_mean,
        'contrast_std': contrast_std,
        'flatness_mean': flatness_mean,
        'flatness_std': flatness_std,
        'rms_mean': rms_mean,
        'rms_std': rms_std,
        'zcr_mean': zcr_mean,
        'zcr_std': zcr_std,
        'f0_mean': f0_mean,
        'f0_std': f0_std,
        'f0_min': f0_min,
        'f0_max': f0_max,
        **{f'lpc_{i}': lpc_coeffs[i] for i in range(13)},
        **{f'chroma_{i}_mean': chroma_mean[i] for i in range(12)},
        'onset_mean': onset_mean,
        'onset_std': onset_std
    }
    pd.DataFrame([features]).to_csv(output_csv, mode='a', index=False, header=not os.path.exists(output_csv))
    return features

In [41]:
def augment_audio(audio_file, speaker, output_csv='/content/audio_features_augmented.csv', n_augmentations=2):
    y, sr = librosa.load(audio_file)
    augment = AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=1.0)
    for i in range(n_augmentations):
        y_aug = augment(y, sample_rate=sr)
        augmented_file = f"aug_{i}_{os.path.basename(audio_file)}"
        features = extract_features(audio_file, speaker=speaker, output_csv=output_csv)
        if features is not None:
            features['file'] = augmented_file
            pd.DataFrame([features]).to_csv(output_csv, mode='a', index=False, header=not os.path.exists(output_csv))

In [42]:
print("Loading dataset...")
try:
    df = pd.read_csv('/content/audio_features.csv')
    print(f"Loaded {len(df)} samples")
except FileNotFoundError:
    print("Error: /content/audio_features.csv not found")
    exit()

# Augment data using files in /content/audios with yes_approve_ and confirm_ prefixes
speakers = df['speaker'].tolist()
if os.path.exists('/content/audio_features_augmented.csv'):
    os.remove('/content/audio_features_augmented.csv')
for speaker in set(speakers):  # Unique speakers
    for prefix in ['_approve', '_confirm']:
        audio_file = f"{speaker}{prefix}.m4a"
        audio_path = os.path.join('/content/', audio_file)
        if os.path.exists(audio_path):
            augment_audio(audio_path, speaker, n_augmentations=2)
        else:
            print(f"Warning: {audio_path} not found, skipping augmentation")
df = pd.read_csv('/content/audio_features_augmented.csv') if os.path.exists('/content/audio_features_augmented.csv') else df
print(f"Loaded {len(df)} samples after augmentation")

# Extract features
feature_cols = [col for col in df.columns if col not in ['file', 'speaker', 'command', 'duration']]
X = df[feature_cols].values
file_names = df['file'].tolist()

Loading dataset...
Loaded 6 samples


  y, sr = librosa.load(audio_file)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(audio_file)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(audio_file)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(audio_file)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(audio_file)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(audio_file)
	Deprecated as of librosa version 0

Loaded 24 samples after augmentation


In [47]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import LeaveOneOut
from sklearn.svm import OneClassSVM
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
import numpy as np

# Assuming feature_cols and X are defined
# And also y_true is your true labels array with 1 and -1 labels

# Feature selection
selector = VarianceThreshold(threshold=0.01)
X_selected = selector.fit_transform(X)
selected_features = np.array(feature_cols)[selector.get_support()]
print(f"Selected {X_selected.shape[1]} features")

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)

# Prepare LOOCV
loo = LeaveOneOut()

y_preds = []
y_trues = []

for train_idx, test_idx in loo.split(X_scaled):
    X_fold_train, X_fold_test = X_scaled[train_idx], X_scaled[test_idx]
    y_fold_train, y_fold_test = y_true[train_idx], y_true[test_idx]

    # Train model on train fold
    model = OneClassSVM(kernel='rbf', nu=0.1, gamma='auto')
    model.fit(X_fold_train)

    # Predict on test fold
    y_pred = model.predict(X_fold_test)

    y_preds.append(y_pred[0])
    y_trues.append(y_fold_test[0])

# Convert to numpy arrays
y_preds = np.array(y_preds)
y_trues = np.array(y_trues)

# Calculate metrics
acc = accuracy_score(y_trues, y_preds)
prec = precision_score(y_trues, y_preds, pos_label=1)
rec = recall_score(y_trues, y_preds, pos_label=1)
f1 = f1_score(y_trues, y_preds, pos_label=1)

print(f"Accuracy: {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall: {rec:.3f}")
print(f"F1 Score: {f1:.3f}")

# Calculate inlier_counts
inlier_counts = np.sum(y_preds == 1)

Selected 61 features
Accuracy: 0.167
Precision: 1.000
Recall: 0.167
F1 Score: 0.286


In [48]:
model_output_dir = '/content/models'
os.makedirs(model_output_dir, exist_ok=True)
joblib.dump(model, f'{model_output_dir}/voiceprint_verification_model.pkl')
joblib.dump(scaler, f'{model_output_dir}/voiceprint_scaler.pkl')
joblib.dump(selected_features, f'{model_output_dir}/voiceprint_feature_columns.pkl')

# Save metadata
model_metadata = {
    'model_type': 'OneClassSVM',
    'feature_count': X_selected.shape[1],
    'selected_features': selected_features.tolist(),
    'sample_count': len(df),
    'inlier_proportion': float(np.mean(inlier_counts)),
    'datetime': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
}
with open(f'{model_output_dir}/voiceprint_model_metadata.json', 'w') as f:
    json.dump(model_metadata, f, indent=2)

print(f"Model saved to {model_output_dir}/")

Model saved to /content/models/


In [49]:
# Extract features dictionary for a new audio file
features_dict = extract_features('/content/Unknown-voice.m4a', speaker='unknown')

if features_dict is None:
    print("Feature extraction failed.")
else:
    # Convert to DataFrame (1-row)
    import pandas as pd
    test_df = pd.DataFrame([features_dict])

feature_cols = joblib.load('/content/models/voiceprint_feature_columns.pkl')
scaler = joblib.load('/content/models/voiceprint_scaler.pkl')

# Select only the features the model was trained on
X_new = test_df[feature_cols].values

# Scale features
X_new_scaled = scaler.transform(X_new)

model = joblib.load('/content/models/voiceprint_verification_model.pkl')

# Predict (1 = accepted/inlier, -1 = rejected/outlier)
prediction = model.predict(X_new_scaled)[0]

if prediction == 1:
    print("Audio accepted: Speaker is authorized.")
else:
    print("Audio rejected: Speaker is NOT authorized.")



  y, sr = librosa.load(audio_file)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Audio rejected: Speaker is NOT authorized.
