# Deepfake Voice Detection Pipeline

This notebook demonstrates the complete machine learning pipeline for detecting deepfake voices.

## Table of Contents
1. [Data Loading and Exploration](#data-loading)
2. [Feature Analysis](#feature-analysis)
3. [Model Training](#model-training)
4. [Model Evaluation](#model-evaluation)
5. [Feature Importance](#feature-importance)
6. [Model Deployment](#model-deployment)

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including convergence and deprecation warnings from the libraries used
# below. Kept as-is to preserve the notebook's output; consider narrowing it.
warnings.filterwarnings('ignore')

# Set style
# 'seaborn-v0_8' is the name used by matplotlib >= 3.6; earlier releases ship
# the same style sheet as 'seaborn'. Pick the first one this installation
# knows about so the cell does not raise on an older matplotlib. If neither
# is available the matplotlib default style is left in place.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

print("Libraries imported successfully!")

## 1. Data Loading and Exploration {#data-loading}

In [None]:
# Read the balanced dataset CSV directly from its hosted location
url = "https://hebbkx1anhila5yf.public.blob.vercel-storage.com/DATASET-balanced-JcqFJYhgnWK5P8zrmIuuMwyj9BIpH9.csv"
df = pd.read_csv(url)

# Quick orientation: table dimensions, column names and class counts
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nLabel distribution:")
label_tally = df['LABEL'].value_counts()
print(label_tally)

# Preview the first five rows (rendered as the cell's output)
df.head(5)

In [None]:
# Headline numbers for the raw table; the feature count excludes the one
# label column, hence the "- 1".
n_rows, n_cols = len(df), len(df.columns)
n_missing = df.isnull().sum().sum()
n_duplicates = df.duplicated().sum()

print("Dataset Info:")
print(f"Total samples: {n_rows}")
print(f"Features: {n_cols - 1}")
print(f"Missing values: {n_missing}")
print(f"Duplicate rows: {n_duplicates}")

# How many columns there are of each dtype
dtype_counts = df.dtypes.value_counts()
print("\nData types:")
print(dtype_counts)

In [None]:
# Class balance shown two ways: absolute counts (bar) and shares (pie).
# The counts are computed once and reused by both panels.
label_counts = df['LABEL'].value_counts()
class_colors = ['#FF6B6B', '#4ECDC4']

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: bar chart of counts
label_counts.plot(kind='bar', ax=ax1, color=class_colors)
ax1.set(title='Label Distribution', xlabel='Label', ylabel='Count')
ax1.tick_params(axis='x', rotation=0)

# Right panel: pie chart of percentages (y-label blanked, it adds nothing)
label_counts.plot(kind='pie', ax=ax2, autopct='%1.1f%%', colors=class_colors)
ax2.set(title='Label Distribution (Percentage)', ylabel='')

fig.tight_layout()
plt.show()

## 2. Feature Analysis {#feature-analysis}

In [None]:
# Every column except the target is treated as a model input
feature_columns = df.columns[df.columns != 'LABEL'].tolist()
print(f"Feature columns ({len(feature_columns)}): {feature_columns}")

# Force all feature columns to numeric dtype; values that cannot be parsed
# become NaN (errors='coerce') rather than raising. This updates `df` itself.
df[feature_columns] = df[feature_columns].apply(pd.to_numeric, errors='coerce')

# Report how many NaNs the feature columns now contain
n_missing_features = df[feature_columns].isnull().sum().sum()
print(f"\nMissing values after conversion: {n_missing_features}")

In [None]:
# Descriptive statistics for each feature column (rendered as the cell output)
df.loc[:, feature_columns].describe()

In [None]:
# Pairwise feature correlations drawn as a heatmap. Cell annotations are
# switched off; the colour scale is centred on zero.
correlation_matrix = df[feature_columns].corr()

fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(
    correlation_matrix,
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.1,
    ax=ax,
)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()

In [None]:
# Overlaid per-class histograms for a hand-picked subset of features.
# `key_features` is reused by the box-plot cell that follows.
key_features = [
    'chroma_stft', 'rms', 'spectral_centroid', 'spectral_bandwidth',
    'rolloff', 'zero_crossing_rate', 'mfcc1', 'mfcc2',
]

fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.ravel()

for ax, feature in zip(axes, key_features):
    # One semi-transparent histogram per class, REAL drawn first
    for label in ('REAL', 'FAKE'):
        class_values = df.loc[df['LABEL'] == label, feature]
        ax.hist(class_values, alpha=0.7, label=label, bins=30)

    ax.set(title=f'{feature} Distribution', xlabel=feature, ylabel='Frequency')
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# One box plot per key feature, split by class label, on a 2x4 grid
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.ravel()

for ax, feature in zip(axes, key_features):
    sns.boxplot(data=df, x='LABEL', y=feature, ax=ax)
    ax.set_title(f'{feature} by Label')

fig.tight_layout()
plt.show()

## 3. Model Training {#model-training}

In [None]:
# Feature matrix and raw label vector as NumPy arrays; any NaN left in the
# features is replaced by 0.0.
X = np.nan_to_num(df[feature_columns].to_numpy(), nan=0.0)
y = df['LABEL'].to_numpy()

# Map the string labels to integer codes
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

label_mapping = {cls: code for code, cls in enumerate(label_encoder.classes_)}

print(f"Feature matrix shape: {X.shape}")
print(f"Label distribution: {np.bincount(y_encoded)}")
print(f"Label mapping: {label_mapping}")

In [None]:
# Split data
# The split happens BEFORE scaling so that the scaler's mean/variance are
# learned from the training rows only. Fitting the scaler on the full matrix
# first would leak test-set statistics into training and make the reported
# test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Scale features: fit on the training split, then apply the same transform
# to the held-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any code still referring to `X_scaled` keeps working; it is the
# whole matrix transformed with the training-fitted scaler.
X_scaled = scaler.transform(X)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Training label distribution: {np.bincount(y_train)}")
print(f"Test label distribution: {np.bincount(y_test)}")

In [None]:
# Candidate classifiers, each seeded with random_state=42.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42, probability=True)
}

# name -> fitted model, CV scores, test metrics and test predictions
model_results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Fit on the training split; this fitted instance is what gets stored
    model.fit(X_train, y_train)

    # 5-fold cross-validated F1 on the training split
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
    cv_mean, cv_std = cv_scores.mean(), cv_scores.std()

    # Hard predictions and class probabilities on the held-out split
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    # Held-out metrics, keyed by the names the later cells look up
    test_scores = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
    }

    model_results[name] = {
        'model': model,
        'cv_scores': cv_scores,
        'cv_mean': cv_mean,
        'cv_std': cv_std,
        **test_scores,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
    }

    print(f"CV F1 Score: {cv_mean:.4f} (+/- {cv_std * 2:.4f})")
    print(f"Test Accuracy: {test_scores['accuracy']:.4f}")
    print(f"Test F1 Score: {test_scores['f1']:.4f}")

## 4. Model Evaluation {#model-evaluation}

In [None]:
# One row per trained model, one column per headline metric.
# `display_name -> key in model_results[...]`, in output column order.
metric_columns = {
    'CV F1 Mean': 'cv_mean',
    'CV F1 Std': 'cv_std',
    'Test Accuracy': 'accuracy',
    'Test Precision': 'precision',
    'Test Recall': 'recall',
    'Test F1': 'f1',
}

comparison_rows = [
    {'Model': model_name,
     **{column: scores[key] for column, key in metric_columns.items()}}
    for model_name, scores in model_results.items()
]
comparison_df = pd.DataFrame(comparison_rows)

print("Model Comparison:")
print(comparison_df.round(4))

In [None]:
# One bar chart per test metric on a 2x2 grid, models along the x-axis
metrics = ['Test Accuracy', 'Test Precision', 'Test Recall', 'Test F1']
bar_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

for ax, metric in zip(axes, metrics):
    comparison_df.plot(x='Model', y=metric, kind='bar', ax=ax, color=bar_colors)
    ax.set(title=f'{metric} Comparison', ylabel=metric)
    ax.tick_params(axis='x', rotation=45)
    # The panel title already names the metric, so drop the legend
    ax.legend().remove()

fig.tight_layout()
plt.show()

In [None]:
# Confusion matrices, one panel per trained model.
# Tick labels come from the fitted label encoder rather than a hard-coded
# ['FAKE', 'REAL'] list, so they always match the integer codes used in
# y_test / y_pred. The panel count follows the number of models instead of
# being fixed at three.
class_names = [str(cls) for cls in label_encoder.classes_]
class_codes = np.arange(len(class_names))
n_models = len(model_results)

# squeeze=False keeps `axes` two-dimensional even for a single model
fig, axes = plt.subplots(1, n_models, figsize=(6 * n_models, 5), squeeze=False)
axes = axes.ravel()

for ax, (name, results) in zip(axes, model_results.items()):
    # labels= pins row/column order to the encoder's codes and keeps the
    # matrix square even if a model never predicts one of the classes
    cm = confusion_matrix(y_test, results['y_pred'], labels=class_codes)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(f'{name} - Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()

In [None]:
# ROC curve for every model on a shared axis, with a diagonal reference line
fig, ax = plt.subplots(figsize=(10, 8))

for name, results in model_results.items():
    # Column 1 of predict_proba is the probability of the class encoded as 1
    positive_scores = results['y_pred_proba'][:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    roc_auc = auc(fpr, tpr)

    ax.plot(fpr, tpr, linewidth=2, label=f'{name} (AUC = {roc_auc:.3f})')

ax.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Random Classifier')
ax.set(
    xlim=(0.0, 1.0),
    ylim=(0.0, 1.05),
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curves Comparison',
)
ax.legend(loc="lower right")
ax.grid(True, alpha=0.3)
plt.show()

## 5. Feature Importance {#feature-importance}

In [None]:
# Select the model with the highest test F1 and, when it exposes
# `feature_importances_`, rank and plot the features by importance.
best_row = comparison_df['Test F1'].idxmax()
best_model_name = comparison_df.loc[best_row, 'Model']
best_model = model_results[best_model_name]['model']

print(f"Best performing model: {best_model_name}")

if not hasattr(best_model, 'feature_importances_'):
    print(f"{best_model_name} does not support feature importance.")
else:
    # Pair each feature name with its importance, most important first
    importance = best_model.feature_importances_
    feature_importance_df = pd.DataFrame(
        {'feature': feature_columns, 'importance': importance}
    ).sort_values('importance', ascending=False)

    # Horizontal bar chart of the 15 highest-ranked features
    top_features = feature_importance_df.head(15)
    plt.figure(figsize=(12, 8))
    sns.barplot(data=top_features, x='importance', y='feature', palette='viridis')
    plt.title(f'Top 15 Feature Importance - {best_model_name}')
    plt.xlabel('Importance')
    plt.ylabel('Features')
    plt.tight_layout()
    plt.show()

    print("\nTop 10 Most Important Features:")
    print(feature_importance_df.head(10))

In [None]:
# Feature importance heatmap (for MFCC features)
# Changes from the earlier version of this cell:
#   * importances are read from `best_model` directly instead of relying on
#     an `importance` variable left behind by another cell;
#   * the grid is sized from the number of MFCC columns actually present
#     (padded with NaN), so it no longer fails unless there are exactly 20;
#   * each cell is annotated with its real feature name, because fixed
#     "MFCC1..MFCC5" column labels are wrong for every row after the first.
if hasattr(best_model, 'feature_importances_'):
    all_importances = np.asarray(best_model.feature_importances_)

    # Keep MFCC columns in their original order, alongside their importance
    mfcc_features = [col for col in feature_columns if col.startswith('mfcc')]
    mfcc_importance = [
        all_importances[idx]
        for idx, col in enumerate(feature_columns)
        if col.startswith('mfcc')
    ]

    if not mfcc_features:
        print("No MFCC features found; skipping MFCC importance heatmap.")
    else:
        # Lay the features out row by row, 5 per row; unused trailing cells
        # stay NaN, which the heatmap leaves blank.
        n_cols = 5
        n_rows = -(-len(mfcc_features) // n_cols)  # ceiling division
        n_cells = n_rows * n_cols

        padded_values = np.full(n_cells, np.nan)
        padded_values[:len(mfcc_importance)] = mfcc_importance
        mfcc_matrix = padded_values.reshape(n_rows, n_cols)

        cell_labels = np.full(n_cells, '', dtype=object)
        cell_labels[:len(mfcc_features)] = [
            f'{feature_name}\n{value:.4f}'
            for feature_name, value in zip(mfcc_features, mfcc_importance)
        ]
        cell_labels = cell_labels.reshape(n_rows, n_cols)

        plt.figure(figsize=(10, 6))
        # fmt='' because the annotations are already formatted strings
        sns.heatmap(mfcc_matrix, annot=cell_labels, fmt='', cmap='YlOrRd',
                    xticklabels=False,
                    yticklabels=[f'Group {i+1}' for i in range(n_rows)])
        plt.title('MFCC Feature Importance Heatmap')
        plt.tight_layout()
        plt.show()

## 6. Model Deployment {#model-deployment}

In [None]:
# Persist the winning model, the preprocessing objects and the dataset
import joblib
import os

# All stored metrics come from the best model's entry in model_results
best_results = model_results[best_model_name]

# Make sure the output folder for model artifacts exists
os.makedirs('../models', exist_ok=True)

# Model bundle: the fitted estimator, a snake_case type tag and its scores
model_data = {
    'model': best_model,
    'model_type': '_'.join(best_model_name.lower().split(' ')),
    'training_history': {
        **{key: best_results[key] for key in ('cv_scores', 'cv_mean', 'cv_std')},
        'val_metrics': {
            metric: best_results[metric]
            for metric in ('accuracy', 'precision', 'recall', 'f1')
        },
    },
}

joblib.dump(model_data, '../models/best_model.pkl')
print("Model saved to ../models/best_model.pkl")

# Preprocessing bundle: the scaler, label encoder and feature column list
preprocessor_data = dict(
    scaler=scaler,
    label_encoder=label_encoder,
    feature_columns=feature_columns,
)

joblib.dump(preprocessor_data, '../models/preprocessor.pkl')
print("Preprocessor saved to ../models/preprocessor.pkl")

# Write the DataFrame (in its current, numeric-coerced state) next to the models
os.makedirs('../data', exist_ok=True)
df.to_csv('../data/DATASET-balanced.csv', index=False)
print("Dataset saved to ../data/DATASET-balanced.csv")

In [None]:
# Model summary: a plain-text recap of the selected model, the data it was
# trained on, its scores, and the files written by the deployment cell.
best_results = model_results[best_model_name]

print("=" * 50)
print("MODEL TRAINING SUMMARY")
print("=" * 50)
print(f"Best Model: {best_model_name}")
print(f"Dataset Size: {len(df)} samples")
print(f"Features: {len(feature_columns)}")
print(f"Classes: {label_encoder.classes_}")
print("\nPerformance Metrics:")
# \u00b1 is the plus-minus sign; the previous text contained a mis-decoded
# two-character sequence in its place. The escape keeps it encoding-proof.
print(f"  Cross-Validation F1: {best_results['cv_mean']:.4f} \u00b1 {best_results['cv_std']:.4f}")
print(f"  Test Accuracy: {best_results['accuracy']:.4f}")
print(f"  Test Precision: {best_results['precision']:.4f}")
print(f"  Test Recall: {best_results['recall']:.4f}")
print(f"  Test F1 Score: {best_results['f1']:.4f}")
print("\nFiles Saved:")
print("  - ../models/best_model.pkl")
print("  - ../models/preprocessor.pkl")
print("  - ../data/DATASET-balanced.csv")
print("=" * 50)