In [210]:

# # Suppress specific warnings
# warnings.filterwarnings("ignore", category=UserWarning)
# warnings.filterwarnings("ignore", category=FutureWarning)

# # Check for PySoundFile installation
# try:
#     import soundfile as sf
# except ImportError:
#     print("PySoundFile not found. You may install it with 'pip install soundfile' to avoid audio loading warnings.")

# # Function to extract MFCC features from an audio file
# def extract_features(audio_file, max_pad_len=174):
#     try:
#         y, sr = librosa.load(audio_file)
#         mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
#         pad_width = max_pad_len - mfccs.shape[1]
#         if pad_width > 0:
#             mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='mean')
#         else:
#             mfccs = mfccs[:, :max_pad_len]
#         return mfccs
#     except Exception as e:
#         print(f"Error processing {audio_file}: {e}")
#         return None

# # Function to load data from a folder and extract features
# def load_data(folder_path, label, max_pad_len=174):
#     features = []
#     labels = []
#     for filename in os.listdir(folder_path):
#         if filename.endswith(('.wav', '.mp3', '.flac')):
#             audio_file = os.path.join(folder_path, filename)
#             mfccs = extract_features(audio_file, max_pad_len)
#             if mfccs is not None:
#                 features.append(mfccs.flatten())  # Flatten MFCC features
#                 labels.append(label)
#     return features, labels

# # Paths to real and fake audio folders
# real_folder = './rf5000/'
# fake_folder = './df5000/'

# # Load real and fake audio data
# real_features, real_labels = load_data(real_folder, label=1)  # Label 1 for real
# fake_features, fake_labels = load_data(fake_folder, label=0)  # Label 0 for fake

# # Combine real and fake data
# all_features = real_features + fake_features
# all_labels = real_labels + fake_labels

# # Convert to numpy arrays for processing
# all_features = np.array(all_features)
# all_labels = np.array(all_labels)


# # Shuffle the data
# all_features, all_labels = shuffle(all_features, all_labels, random_state=42)

# # Print features and labels to verify
# print("All Features Shape:", all_features.shape)
# print("All Labels:", all_labels)

# # Data preprocessing: split, scale, and reduce dimensions
# X_train, X_test, y_train, y_test = train_test_split(all_features, all_labels, test_size=0.2, random_state=42)

# # Scaling the data
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# # Print the number of features before PCA
# print("Number of features before PCA:", X_train.shape[1])

# # Apply PCA with 800 components
# pca = PCA(n_components=800)
# X_train = pca.fit_transform(X_train)
# X_test = pca.transform(X_test)

# # Print the number of components after PCA
# print("Number of components after PCA:", X_train.shape[1])

# # Model setup with Bagging and Boosting classifiers
# bagging_clf = BaggingClassifier(n_estimators=50, random_state=42)
# boosting_clf = AdaBoostClassifier(n_estimators=50, random_state=42)

# # Train and evaluate Bagging model
# bagging_clf.fit(X_train, y_train)
# y_pred_bagging = bagging_clf.predict(X_test)
# accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
# print(f"Bagging Model accuracy: {accuracy_bagging * 100:.2f}%")
# print("Bagging Classification Report:\n", classification_report(y_test, y_pred_bagging, target_names=["Fake", "Real"]))

# # Train and evaluate Boosting model
# boosting_clf.fit(X_train, y_train)
# y_pred_boosting = boosting_clf.predict(X_test)
# accuracy_boosting = accuracy_score(y_test, y_pred_boosting)
# print(f"Boosting Model accuracy: {accuracy_boosting * 100:.2f}%")
# print("Boosting Classification Report:\n", classification_report(y_test, y_pred_boosting, target_names=["Fake", "Real"]))


In [211]:
import os
import numpy as np
import librosa
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Suppress specific warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Function to extract MFCC features from an audio file
def extract_features(audio_file, max_pad_len=174):
    try:
        y, sr = librosa.load(audio_file)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        pad_width = max_pad_len - mfccs.shape[1]
        if pad_width > 0:
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='mean')
        else:
            mfccs = mfccs[:, :max_pad_len]
        return mfccs
    except Exception as e:
        print(f"Error processing {audio_file}: {e}")
        return None

# Function to load data from a folder and extract features
def load_data(folder_path, label, max_pad_len=174):
    features = []
    labels = []
    for filename in os.listdir(folder_path):
        if filename.endswith(('.wav', '.mp3', '.flac')):
            audio_file = os.path.join(folder_path, filename)
            mfccs = extract_features(audio_file, max_pad_len)
            if mfccs is not None:
                features.append(mfccs.flatten())  # Flatten MFCC features
                labels.append(label)
    return features, labels


# # Paths to real and fake audio folders
# real_folder_3000 = './real3000/'
# fake_folder_3000 = './fake3000/'
# real_folder_5000 = './rf5000/'
# fake_folder_5000 = './df5000/'

# # Load real and fake audio data from each folder
# real_features_3000, real_labels_3000 = load_data(real_folder_3000, label=1)  # Label 1 for real
# fake_features_3000, fake_labels_3000 = load_data(fake_folder_3000, label=0)  # Label 0 for fake

# real_features_5000, real_labels_5000 = load_data(real_folder_5000, label=1)  # Label 1 for real
# fake_features_5000, fake_labels_5000 = load_data(fake_folder_5000, label=0)  # Label 0 for fake

# # Combine data from all folders
# all_features = real_features_3000 + fake_features_3000 + real_features_5000 + fake_features_5000
# all_labels = real_labels_3000 + fake_labels_3000 + real_labels_5000 + fake_labels_5000

# # Convert to numpy arrays for further processing
# all_features = np.array(all_features)
# all_labels = np.array(all_labels)

# # Shuffle the combined dataset
# all_features, all_labels = shuffle(all_features, all_labels, random_state=42)

# # Verify the combined data
# print("Combined Features Shape:", all_features.shape)
# print("Combined Labels Shape:", all_labels.shape)
# print("Sample Labels:", all_labels[:10])  # Display first 10 labels to verify



# Paths to real and fake audio folders
real_folder = './rf5000/'
fake_folder = './df5000/'
# Load real and fake audio data
real_features, real_labels = load_data(real_folder, label=1) # Label 1 for real
fake_features, fake_labels = load_data(fake_folder, label=0)  # Label 0 for fake

# Combine real and fake data


In [212]:
all_features = real_features + fake_features
all_labels = real_labels + fake_labels

# Convert to numpy arrays for processing
all_features = np.array(all_features)
all_labels = np.array(all_labels)

# Shuffle the data
all_features, all_labels = shuffle(all_features, all_labels, random_state=42)


print("Combined Features Shape:", all_features.shape)
print("Combined Labels Shape:", all_labels.shape)
print("Sample Labels:", all_labels[:10]) 


# Print the number of features before PCA
print("Number of features before PCA:", all_features.shape[1])

# Data preprocessing: split, scale, and reduce dimensions
X_train, X_test, y_train, y_test = train_test_split(all_features, all_labels, test_size=0.2, random_state=42)

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# # Apply PCA with 800 components
# pca = PCA(n_components=800)
# X_train = pca.fit_transform(X_train)
# X_test = pca.transform(X_test)

print("Number of components after PCA:", X_train.shape[1])

Combined Features Shape: (6015, 2262)
Combined Labels Shape: (6015,)
Sample Labels: [1 0 0 0 1 1 0 0 0 0]
Number of features before PCA: 2262
Number of components after PCA: 2262


In [213]:
print("Number of rows (data points):", all_features.shape[0])


Number of rows (data points): 6015


In [214]:
# BEFORE REAL & FAKE DATA
# Define the base models for stacking
logistic_clf = LogisticRegression(random_state=42, max_iter=200)
bagging_clf = BaggingClassifier(n_estimators=25, random_state=42)
boosting_clf = AdaBoostClassifier(n_estimators=25, random_state=42)


stacking_clf = StackingClassifier(
    estimators=[
        ('logistic', logistic_clf),
        ('bagging', bagging_clf),
        ('boosting', boosting_clf),
    ],
    final_estimator=LogisticRegression(class_weight = 'balanced')
)

# Train the stacking model
stacking_clf.fit(X_train, y_train)











# # Predict and evaluate
# y_pred_stacking = stacking_clf.predict(X_test)
# accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
# print(f"Stacking Model accuracy: {accuracy_stacking * 100:.2f}%")
# print("Stacking Classification Report:\n", classification_report(y_test, y_pred_stacking, target_names=["Fake", "Real"]))

In [215]:

# Predict probabilities for the test data
y_pred_proba = stacking_clf.predict_proba(X_test)

# Apply custom threshold: If the probability of 'Real' (class 1) is greater than 0.7, predict 'Real'
y_pred_stacking = (y_pred_proba[:, 1] >= 0.50).astype(int)

# Evaluate accuracy and generate classification report
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
print(f"Stacking Model accuracy with custom threshold: {accuracy_stacking * 100:.2f}%")
print("Stacking Classification Report with custom threshold:\n", classification_report(y_test, y_pred_stacking, target_names=["Fake", "Real"]))

Stacking Model accuracy with custom threshold: 99.33%
Stacking Classification Report with custom threshold:
               precision    recall  f1-score   support

        Fake       1.00      0.99      0.99       578
        Real       0.99      1.00      0.99       625

    accuracy                           0.99      1203
   macro avg       0.99      0.99      0.99      1203
weighted avg       0.99      0.99      0.99      1203



In [216]:
# Display the first 10 actual labels and the first 10 predicted labels
print("First 10 Actual Labels:", y_test[:100].tolist())
print("First 10 Predicted Labels:", y_pred_stacking[:100].tolist())

First 10 Actual Labels: [0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0]
First 10 Predicted Labels: [0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0]


In [217]:
# Display the first 10 actual labels and the first 10 predicted labels
print("First 10 Actual Labels:", y_test[:100].tolist())
print("First 10 Predicted Labels:", y_pred_stacking[:100].tolist())

First 10 Actual Labels: [0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0]
First 10 Predicted Labels: [0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0]


In [218]:
from collections import Counter

# Count the occurrences of 0 and 1 in actual labels
actual_label_counts = Counter(y_test[:100])  # Counting first 100 labels
print("Actual Labels Count:", actual_label_counts)

# Count the occurrences of 0 and 1 in predicted labels
predicted_label_counts = Counter(y_pred_stacking[:100])  # Counting first 100 predictions
print("Predicted Labels Count:", predicted_label_counts)


Actual Labels Count: Counter({1: 59, 0: 41})
Predicted Labels Count: Counter({1: 59, 0: 41})


In [220]:
from sklearn.utils import shuffle
import numpy as np

# Paths to real and fake audio folders
new_real_folder = './narendramodi/'

# Load real audio data
new_real_features, new_real_labels = load_data(new_real_folder, label=1)  # Label 1 for real

# Check if the data is loaded correctly
print("Real Labels:", new_real_labels[:])  # Display first 10 labels to verify

# Convert to numpy arrays for processing
new_real_features = np.array(new_real_features)
new_real_labels = np.array(new_real_labels)
print(new_real_features.shape[1])
# Shuffle the real data
new_real_features, new_real_labels = shuffle(new_real_features, new_real_labels, random_state=42)


# new_real_features = scaler.transform(new_real_features)
# new_real_features = pca.transform(new_real_features)
# Check if the trained model exists and predict with it
try:
    y_pred_real = stacking_clf.predict(new_real_features)

    # Display the predictions for real data
    print("Predictions on Real Audio Data (First 10):", y_pred_real[:])
    print("Actual Labels for Real Audio Data (First 10):", new_real_labels[:])
    
    # Evaluate model on real data
    accuracy_real = accuracy_score(new_real_labels, y_pred_real) * 100
    print(f"Prediction Accuracy on Real Audio Data: {accuracy_real:.2f}%")
    
    # Generate a classification report
    print("Classification Report for Real Audio Data:\n", classification_report(new_real_labels, y_pred_real, target_names=["Fake", "Real"]))

except NameError:
    print("Error: The model 'stacking_clf' is not defined. Please ensure the model is trained before making predictions.")


Real Labels: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
2262
Predictions on Real Audio Data (First 10): [1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Actual Labels for Real Audio Data (First 10): [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Prediction Accuracy on Real Audio Data: 96.67%
Classification Report for Real Audio Data:
               precision    recall  f1-score   support

        Fake       0.00      0.00      0.00         0
        Real       1.00      0.97      0.98        30

    accuracy                           0.97        30
   macro avg       0.50      0.48      0.49        30
weighted avg       1.00      0.97      0.98        30



In [234]:
from sklearn.utils import shuffle
import numpy as np

# Paths to real and fake audio folders
new_real_folder = './krishna_bhat/'

# Load real audio data
new_real_features, new_real_labels = load_data(new_real_folder, label=1)  # Label 1 for real

# Check if the data is loaded correctly
print("Real Labels:", new_real_labels[:])  # Display first 10 labels to verify

# Convert to numpy arrays for processing
new_real_features = np.array(new_real_features)
new_real_labels = np.array(new_real_labels)
print(new_real_features.shape[1])
# Shuffle the real data
new_real_features, new_real_labels = shuffle(new_real_features, new_real_labels, random_state=42)


# new_real_features = scaler.transform(new_real_features)
# new_real_features = pca.transform(new_real_features)
# Check if the trained model exists and predict with it
try:
    y_pred_real = stacking_clf.predict(new_real_features)

    # Display the predictions for real data
    print("Predictions on Real Audio Data (First 10):", y_pred_real[:])
    print("Actual Labels for Real Audio Data (First 10):", new_real_labels[:])
    
    # Evaluate model on real data
    accuracy_real = accuracy_score(new_real_labels, y_pred_real) * 100
    print(f"Prediction Accuracy on Real Audio Data: {accuracy_real:.2f}%")
    
    # Generate a classification report
    print("Classification Report for Real Audio Data:\n", classification_report(new_real_labels, y_pred_real, target_names=["Fake", "Real"]))

except NameError:
    print("Error: The model 'stacking_clf' is not defined. Please ensure the model is trained before making predictions.")


Real Labels: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
2262
Predictions on Real Audio Data (First 10): [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

In [226]:
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Paths to real audio folder
new_real_folder = './fakenamo/chunk_5.wav'

# Load real audio data
new_real_features, new_real_labels = load_data(new_real_folder, label=0)  # Label 1 for real

# Check if the data is loaded correctly
print("Real Labels:", new_real_labels[:])  # Display all labels to verify

# Convert to numpy arrays for processing
new_real_features = np.array(new_real_features)
new_real_labels = np.array(new_real_labels)
print("Feature dimension:", new_real_features.shape[1])

# Shuffle the real data
new_real_features, new_real_labels = shuffle(new_real_features, new_real_labels, random_state=42)

# # Preprocess the data using the same scaler and PCA as before
# new_real_features = scaler.transform(new_real_features)
# new_real_features = pca.transform(new_real_features)

# Check if the trained model exists and predict with probabilities
try:
    # Get prediction probabilities for real data
    y_pred_proba_real = stacking_clf.predict_proba(new_real_features)

    # Apply custom threshold: if probability of 'Real' (class 1) > 0.7, classify as 'Real'
    threshold = 0.5
    y_pred_real = (y_pred_proba_real[:, 1] >= threshold).astype(int)

    # Display the predictions for real data
    print("Predictions on Real Audio Data:", y_pred_real[:])
    print("Actual Labels for Real Audio Data:", new_real_labels[:])
    
    # Evaluate model on real data
    accuracy_real = accuracy_score(new_real_labels, y_pred_real) * 100
    print(f"Prediction Accuracy on Real Audio Data with custom threshold ({threshold * 100:.0f}%): {accuracy_real:.2f}%")
    
    # Generate a classification report
    print("Classification Report for Real Audio Data with custom threshold:\n", classification_report(new_real_labels, y_pred_real, target_names=["Fake", "Real"]))

except NameError:
    print("Error: The model 'stacking_clf' is not defined. Please ensure the model is trained before making predictions.")


NotADirectoryError: [Errno 20] Not a directory: './fakenamo/chunk_5.wav'

In [232]:
from sklearn.utils import shuffle
import numpy as np

# Paths to real and fake audio folders
new_real_folder = './realnamo/'
new_fake_folder = './fakenamo/'

# Load real and fake audio data
new_real_features, new_real_labels = load_data(new_real_folder, label=1)  # Label 1 for real
new_fake_features, new_fake_labels = load_data(new_fake_folder, label=0)  # Label 0 for fake

# Check if the data is loaded correctly
print("Real Labels:", new_real_labels[:10])  # Display first 10 labels to verify
print("Fake Labels:", new_fake_labels[:10])  # Display first 10 labels to verify

# Convert to numpy arrays for processing
new_real_features = np.array(new_real_features)
new_real_labels = np.array(new_real_labels)
new_fake_features = np.array(new_fake_features)
new_fake_labels = np.array(new_fake_labels)

# Shuffle the real and fake data
new_real_features, new_real_labels = shuffle(new_real_features, new_real_labels, random_state=42)
new_fake_features, new_fake_labels = shuffle(new_fake_features, new_fake_labels, random_state=42)

# # Preprocess the data using the same scaler and PCA as before
# new_real_features = scaler.transform(new_real_features)
# new_real_features = pca.transform(new_real_features)

# new_fake_features = scaler.transform(new_fake_features)
# new_fake_features = pca.transform(new_fake_features)

# Combine real and fake features and labels
all_features = np.concatenate((new_real_features, new_fake_features), axis=0)
all_labels = np.concatenate((new_real_labels, new_fake_labels), axis=0)

# Shuffle the combined data
all_features, all_labels = shuffle(all_features, all_labels, random_state=42)

# Check if the trained model exists and predict with it
try:
    # Predict for the combined data (real and fake)
    y_pred_all = stacking_clf.predict(all_features)

    # Display predictions for the first 10 samples
    print("Predictions on Combined Audio Data (First 10):", y_pred_all[:10])
    print("Actual Labels for Combined Audio Data (First 10):", all_labels[:10])

    # Evaluate model on the combined data
    accuracy_all = accuracy_score(all_labels, y_pred_all) * 100
    print(f"Prediction Accuracy on Combined Audio Data: {accuracy_all:.2f}%")

    # Generate a classification report for the combined data
    print("Classification Report for Combined Audio Data:\n", classification_report(all_labels, y_pred_all, target_names=["Fake", "Real"]))

except NameError:
    print("Error: The model 'stacking_clf' is not defined. Please ensure the model is trained before making predictions.")


Real Labels: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Fake Labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Predictions on Combined Audio Data (First 10): [1 0 0 1 1 1 0 1 0 0]
Actual Labels for Combined Audio Data (First 10): [1 0 0 1 1 1 0 1 0 0]
Prediction Accuracy on Combined Audio Data: 100.00%
Classification Report for Combined Audio Data:
               precision    recall  f1-score   support

        Fake       1.00      1.00      1.00        10
        Real       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20



In [None]:
# from sklearn.ensemble import VotingClassifier

# # Model setup with Bagging and Boosting classifiers
# bagging_clf = BaggingClassifier(n_estimators=50, random_state=42)
# boosting_clf = AdaBoostClassifier(n_estimators=50, random_state=42)

# # Train Bagging model
# bagging_clf.fit(X_train, y_train)
# y_pred_bagging = bagging_clf.predict(X_test)
# accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
# print(f"Bagging Model accuracy: {accuracy_bagging * 100:.2f}%")
# print("Bagging Classification Report:\n", classification_report(y_test, y_pred_bagging, target_names=["Fake", "Real"]))

# # Train Boosting model
# boosting_clf.fit(X_train, y_train)
# y_pred_boosting = boosting_clf.predict(X_test)
# accuracy_boosting = accuracy_score(y_test, y_pred_boosting)
# print(f"Boosting Model accuracy: {accuracy_boosting * 100:.2f}%")
# print("Boosting Classification Report:\n", classification_report(y_test, y_pred_boosting, target_names=["Fake", "Real"]))

# # Ensemble with Soft Voting
# voting_clf = VotingClassifier(
#     estimators=[('bagging', bagging_clf), ('boosting', boosting_clf)],
#     voting='soft'
# )

# # Train and evaluate the ensemble model
# voting_clf.fit(X_train, y_train)
# y_pred_voting = voting_clf.predict(X_test)
# accuracy_voting = accuracy_score(y_test, y_pred_voting)
# print(f"Ensemble Model (Soft Voting) accuracy: {accuracy_voting * 100:.2f}%")
# print("Ensemble Classification Report:\n", classification_report(y_test, y_pred_voting, target_names=["Fake", "Real"]))


In [None]:
# import os
# import numpy as np
# import librosa
# import warnings
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler
# from sklearn.utils import shuffle

# # Suppress specific warnings
# warnings.filterwarnings("ignore", category=UserWarning)
# warnings.filterwarnings("ignore", category=FutureWarning)

# # Check for PySoundFile installation
# try:
#     import soundfile as sf
# except ImportError:
#     print("PySoundFile not found. You may install it with 'pip install soundfile' to avoid audio loading warnings.")

# # Function to extract MFCC features from an audio file
# def extract_features(audio_file, max_pad_len=174):
#     try:
#         y, sr = librosa.load(audio_file)
#         mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
#         pad_width = max_pad_len - mfccs.shape[1]
#         if pad_width > 0:
#             mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='mean')
#         else:
#             mfccs = mfccs[:, :max_pad_len]
#         return mfccs
#     except Exception as e:
#         print(f"Error processing {audio_file}: {e}")
#         return None

# # Function to load data from a folder and extract features
# def load_data(folder_path, label, max_pad_len=174):
#     features = []
#     labels = []
#     for filename in os.listdir(folder_path):
#         if filename.endswith(('.wav', '.mp3', '.flac')):
#             audio_file = os.path.join(folder_path, filename)
#             mfccs = extract_features(audio_file, max_pad_len)
#             if mfccs is not None:
#                 features.append(mfccs.flatten())  # Flatten MFCC features
#                 labels.append(label)
#     return features, labels

# # Paths to real and fake audio folders
# real_folder = './rf5000/'
# fake_folder = './df5000/'

# # Load real and fake audio data
# real_features, real_labels = load_data(real_folder, label=1)  # Label 1 for real
# fake_features, fake_labels = load_data(fake_folder, label=0)  # Label 0 for fake

# # Combine real and fake data
# all_features = np.array(real_features + fake_features)
# all_labels = np.array(real_labels + fake_labels)

# # Shuffle the data
# all_features, all_labels = shuffle(all_features, all_labels, random_state=42)





