In [6]:
import os
import librosa
import librosa.display
import numpy as np
import joblib
from pydub import AudioSegment
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import soundfile as sf
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [7]:
class SpeakerIdentification:
    def __init__(self):
        self.samplerate = 16000
        self.model = None
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()

        # Reference speakers and their folder paths
        self.reference_speakers = {
            "Kaviya": r"D:\VS Workspace\Biometrics project\dataset\kaviya",
            "Vamsi": r"D:\VS Workspace\Biometrics project\dataset\vamsi",
            "Lavanya": r"D:\VS Workspace\Biometrics project\dataset\lavanya",
            "Raji": r"D:\VS Workspace\Biometrics project\dataset\raji",
            "Raman": r"D:\VS Workspace\Biometrics project\dataset\raman",
        }

        # Load the pre-trained model if it exists, otherwise train a new one
        self.load_model()

    def print_feature_importances(self):
        """Print the feature importances from the trained Random Forest model."""
        if self.model is None:
            print("Model is not trained yet.")
            return
        
        rf_model = self.model[0]  # Random Forest model is stored at index 0 of self.model
        
        # Get feature importances from the RandomForest model
        feature_importances = rf_model.feature_importances_
        sorted_idx = np.argsort(feature_importances)[::-1]  # Sort by importance
        
        print("Feature importances:")
        for idx in sorted_idx:
            print(f"Feature {idx}: {feature_importances[idx]}")

    def get_audio_files(self, folder_path):
        """Retrieve all MP3 and WAV files from the specified folder."""
        return [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(".mp3") or file.endswith(".wav")]

    def convert_mp3_to_wav(self, mp3_path):
        """Convert MP3 file to WAV format."""
        wav_path = mp3_path.replace(".mp3", ".wav")
        if not os.path.exists(wav_path):
            try:
                sound = AudioSegment.from_mp3(mp3_path)
                sound = sound.set_channels(1)  # Convert to mono
                sound = sound.set_frame_rate(self.samplerate)  # Convert to 16kHz
                sound.export(wav_path, format="wav")
                print(f"Converted {mp3_path} to {wav_path}")
            except Exception as e:
                print(f"Error converting {mp3_path} to WAV: {e}")
        return wav_path

    def extract_features(self, file_path):
        """Extract audio features for speaker identification."""
        try:
            audio_data, sr = librosa.load(file_path, sr=self.samplerate)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            return None

        # Extract MFCCs (Mel Frequency Cepstral Coefficients)
        mfcc = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=13)
        mfcc_delta = librosa.feature.delta(mfcc)  # Delta of MFCC
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)  # Delta-Delta of MFCC

        # Extract Chroma Features
        chroma = librosa.feature.chroma_stft(y=audio_data, sr=sr)

        # Extract Spectral Contrast
        spectral_contrast = librosa.feature.spectral_contrast(y=audio_data, sr=sr)

        # Extract Tonnetz (Harmonic Features)
        tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(audio_data), sr=sr)

        # Extract Zero Crossing Rate
        zcr = librosa.feature.zero_crossing_rate(y=audio_data)

        # Extract Root Mean Square Energy
        rms = librosa.feature.rms(y=audio_data)

        # Extract Mel Spectrogram
        mel_spectrogram = librosa.feature.melspectrogram(y=audio_data, sr=sr)

        # Extract Pitch (Fundamental Frequency)
        pitches, magnitudes = librosa.core.piptrack(y=audio_data, sr=sr)
        pitch = np.mean(pitches, axis=1)

        # Combine all features into a single array (mean of each feature across time frames)
        features = []

        # Ensure each feature array has at least one value; else, use zero arrays
        features.append(np.mean(mfcc, axis=1) if mfcc.size > 0 else np.zeros(13))  # MFCC
        features.append(np.mean(mfcc_delta, axis=1) if mfcc_delta.size > 0 else np.zeros(13))  # Delta MFCC
        features.append(np.mean(mfcc_delta2, axis=1) if mfcc_delta2.size > 0 else np.zeros(13))  # Delta-Delta MFCC
        features.append(np.mean(chroma, axis=1) if chroma.size > 0 else np.zeros(12))  # Chroma
        features.append(np.mean(spectral_contrast, axis=1) if spectral_contrast.size > 0 else np.zeros(7))  # Spectral Contrast
        features.append(np.mean(tonnetz, axis=1) if tonnetz.size > 0 else np.zeros(6))  # Tonnetz
        features.append(np.mean(zcr, axis=1) if zcr.size > 0 else np.zeros(1))  # Zero Crossing Rate
        features.append(np.mean(rms, axis=1) if rms.size > 0 else np.zeros(1))  # Root Mean Square Energy
        features.append(np.mean(mel_spectrogram, axis=1) if mel_spectrogram.size > 0 else np.zeros(mel_spectrogram.shape[0]))  # Mel Spectrogram
        features.append(np.mean(pitch, axis=0) if pitch.size > 0 else np.zeros(1))  # Pitch

        # Ensure that every feature in the list is an array with the same dimensions
        features = [f if f.ndim > 0 else np.zeros(1) for f in features]

        # Concatenate all features into a single vector
        return np.concatenate(features)

    def load_data(self):
        """Load audio data and extract features from the reference speakers."""
        X = []
        y = []
        for speaker, folder_path in self.reference_speakers.items():
            audio_files = self.get_audio_files(folder_path)
            for file_path in audio_files:
                # Convert MP3 to WAV if necessary
                if file_path.endswith(".mp3"):
                    file_path = self.convert_mp3_to_wav(file_path)

                features = self.extract_features(file_path)
                if features is not None:
                    X.append(features)
                    y.append(speaker)

        X = np.array(X)
        y = np.array(y)
        return X, y

    def train_model(self):
        """Train the speaker identification model using RandomForest and GradientBoosting."""
        # Load data and encode speaker labels
        X, y = self.load_data()
        y_encoded = self.label_encoder.fit_transform(y)

        # Split data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.5, random_state=42)

        # Scale the features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Apply PCA for dimensionality reduction
        pca = PCA(n_components=0.95)  # Retain 95% of the variance
        X_train_pca = pca.fit_transform(X_train_scaled)
        X_test_pca = pca.transform(X_test_scaled)

        # Apply LDA for supervised dimensionality reduction (with class labels)
        lda = LDA(n_components=1)  # Reduce to 1 component for classification
        X_train_lda = lda.fit_transform(X_train_pca, y_train)
        X_test_lda = lda.transform(X_test_pca)

        # Train RandomForest and GradientBoosting classifiers
        rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
        gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

        rf_model.fit(X_train_lda, y_train)
        gb_model.fit(X_train_lda, y_train)

        # Stacking: Combine predictions from both models
        rf_preds = rf_model.predict(X_train_lda)
        gb_preds = gb_model.predict(X_train_lda)

        stacking_train_data = np.vstack([rf_preds, gb_preds]).T
        stacking_model = LogisticRegression()
        stacking_model.fit(stacking_train_data, y_train)

        # Save the models and scaler for later use
        self.model = (rf_model, gb_model, stacking_model, pca, lda)
        print(f"Model is trained successfully.")

        # Save the models and components using joblib
        joblib.dump(self.model, 'speaker_model.pkl')
        joblib.dump(self.scaler, 'scaler.pkl')
        joblib.dump(self.label_encoder, 'label_encoder.pkl')

        # Evaluate on the test set
        rf_test_preds = rf_model.predict(X_test_lda)
        gb_test_preds = gb_model.predict(X_test_lda)
        stacking_test_data = np.vstack([rf_test_preds, gb_test_preds]).T
        final_test_preds = stacking_model.predict(stacking_test_data)

        accuracy = accuracy_score(y_test, final_test_preds)
        print(f"Model accuracy: {accuracy * 100:.2f}%")

    def load_model(self):
        """Load the pre-trained models and scaler."""
        if os.path.exists('speaker_model.pkl'):
            self.model = joblib.load('speaker_model.pkl')
            self.scaler = joblib.load('scaler.pkl')
            self.label_encoder = joblib.load('label_encoder.pkl')
            print(f"Model loaded successfully.")
        else:
            print("No pre-trained model found. Training a new model...")
            self.train_model()

    def perform_identification(self, file_path):
        """Perform speaker identification for a given audio file."""
        features = self.extract_features(file_path)
        if features is None:
            print("Error extracting features from the audio file.")
            return

        features_scaled = self.scaler.transform([features])

        # Apply PCA to the input features
        pca, lda = self.model[3], self.model[4]
        features_pca = pca.transform(features_scaled)

        # Apply LDA to the PCA-transformed features
        features_lda = lda.transform(features_pca)

        # Make predictions using both models and stack them
        rf_model, gb_model, stacking_model = self.model[0], self.model[1], self.model[2]
        rf_pred = rf_model.predict(features_lda)
        gb_pred = gb_model.predict(features_lda)

        stacked_data = np.vstack([rf_pred, gb_pred]).T
        final_pred = stacking_model.predict(stacked_data)

        # Convert the predicted label back to the speaker's name
        try:
            speaker_name = self.label_encoder.inverse_transform(final_pred)[0]
        except IndexError:
            speaker_name = None

        # Print the result
        if speaker_name and speaker_name in self.reference_speakers:
            print(f"Access granted.")
            print(f"Identified Speaker: {speaker_name}")
        else:
            print("Access denied.")
    


In [8]:
# Initialize the model
speaker_id = SpeakerIdentification()


No pre-trained model found. Training a new model...
Model is trained successfully.
Model accuracy: 100.00%


In [9]:

# Provide the path to your pre-recorded audio file
audio_file_path = r"test\kaviya6.wav"

# Perform identification
speaker_id.perform_identification(audio_file_path)

Access granted.
Identified Speaker: Kaviya


In [10]:
# Assuming you have the trained model and have loaded it
def print_feature_importances(self):
    rf_model = self.model[0]  # Random Forest model is stored at index 0 of self.model

    # Get feature importances from the RandomForest model
    feature_importances = rf_model.feature_importances_
    sorted_idx = np.argsort(feature_importances)[::-1]  # Sort by importance

    print("Feature importances:")
    for idx in sorted_idx:
        print(f"Feature {idx}: {feature_importances[idx]}")

# Call the function to print feature importances
speaker_id.print_feature_importances()


Feature importances:
Feature 0: 1.0


In [26]:
speaker_id = SpeakerIdentification()
speaker_id.train_model()  # This will train and save the models


AttributeError: 'SpeakerIdentification' object has no attribute 'pca'