In [1]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
import torchaudio
import numpy as np
from torch import nn
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.amp import GradScaler, autocast

class AudioDeepfakeDataset(Dataset):
    def __init__(self, data_dirs, sample_rate=16000, max_length=4.0):
        self.data_dirs = data_dirs
        self.sample_rate = sample_rate
        self.max_length = max_length
        self.max_samples = int(max_length * sample_rate)
        
        self.audio_files = []
        self.labels = []
        
        for data_dir in data_dirs:
            data_dir = Path(data_dir)
            label = 0 if 'real' in data_dir.name.lower() else 1
            for audio_file in data_dir.glob('*.wav'):
                self.audio_files.append(str(audio_file))
                self.labels.append(label)
        
        assert len(self.audio_files) > 0, "No audio files found in the provided directories."
    
    def __len__(self):
        return len(self.audio_files)
    
    def __getitem__(self, idx):
        audio_path = self.audio_files[idx]
        label = self.labels[idx]
        
        waveform, orig_sample_rate = torchaudio.load(audio_path)
        
        if orig_sample_rate != self.sample_rate:
            resampler = torchaudio.transforms.Resample(orig_sample_rate, self.sample_rate)
            waveform = resampler(waveform)
        
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        
        waveform = (waveform - waveform.mean()) / (waveform.std() + 1e-6)
        
        num_samples = waveform.shape[1]
        if num_samples > self.max_samples:
            waveform = waveform[:, :self.max_samples]
        elif num_samples < self.max_samples:
            padding = torch.zeros(1, self.max_samples - num_samples)
            waveform = torch.cat([waveform, padding], dim=1)
        
        return waveform.squeeze(0), label

def collate_fn(batch):
    waveforms, labels = zip(*batch)
    waveforms = torch.stack([wf for wf in waveforms])
    labels = torch.tensor(labels, dtype=torch.long)
    return waveforms, labels

def get_dataloaders(train_dirs, val_dirs, test_dirs, batch_size=16, num_workers=8):
    train_dataset = AudioDeepfakeDataset(train_dirs)
    val_dataset = AudioDeepfakeDataset(val_dirs)
    test_dataset = AudioDeepfakeDataset(test_dirs)
    
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        collate_fn=collate_fn,
        pin_memory=True
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        collate_fn=collate_fn,
        pin_memory=True
    )
    
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        collate_fn=collate_fn,
        pin_memory=True
    )
    
    return train_loader, val_loader, test_loader

class AudioDeepfakeModel(nn.Module):
    def __init__(self, model_name="facebook/wav2vec2-base", num_labels=2):
        super(AudioDeepfakeModel, self).__init__()
        self.wav2vec2 = Wav2Vec2ForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )
        self.wav2vec2.gradient_checkpointing_enable()  # Enable gradient checkpointing
        self.wav2vec2.wav2vec2.feature_extractor.eval()
        for param in self.wav2vec2.wav2vec2.feature_extractor.parameters():
            param.requires_grad = False
    
    def forward(self, input_values, labels=None):
        outputs = self.wav2vec2(input_values, labels=labels)
        return outputs

def compute_metrics(labels, preds):
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='binary')
    return {"accuracy": accuracy, "f1": f1}

def train_model(model, train_loader, val_loader, output_dir, num_epochs=15, patience=5, accum_steps=4):
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, total_iters=500)
    scaler = GradScaler('cuda')
    
    train_losses, val_losses = [], []
    train_accuracies, val_accuracies = [], []
    best_f1 = 0
    best_val_loss = float('inf')
    epochs_no_improve = 0
    
    for epoch in range(num_epochs):
        model.train()
        train_loss, train_correct, train_total = 0, 0, 0
        train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
        
        optimizer.zero_grad()
        for i, batch in enumerate(train_pbar):
            waveforms, labels = batch
            waveforms, labels = waveforms.to(device), labels.to(device)
            
            with autocast('cuda'):
                outputs = model(waveforms, labels=labels)
                loss = outputs.loss / accum_steps
            
            scaler.scale(loss).backward()
            
            if (i + 1) % accum_steps == 0 or (i + 1) == len(train_loader):
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()
            
            train_loss += loss.item() * accum_steps
            preds = outputs.logits.argmax(dim=-1)
            train_correct += (preds == labels).sum().item()
            train_total += labels.size(0)
            
            train_pbar.set_postfix({
                "loss": f"{train_loss/train_total:.4f}",
                "acc": f"{train_correct/train_total:.4f}"
            })
        
        train_loss /= len(train_loader)
        train_accuracy = train_correct / train_total
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)
        
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        val_preds, val_labels = [], []
        val_pbar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]")
        
        with torch.no_grad():
            for batch in val_pbar:
                waveforms, labels = batch
                waveforms, labels = waveforms.to(device), labels.to(device)
                
                with autocast('cuda'):
                    outputs = model(waveforms, labels=labels)
                    loss = outputs.loss
                
                val_loss += loss.item()
                preds = outputs.logits.argmax(dim=-1)
                val_correct += (preds == labels).sum().item()
                val_total += labels.size(0)
                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())
                
                val_pbar.set_postfix({
                    "loss": f"{val_loss/val_total:.4f}",
                    "acc": f"{val_correct/val_total:.4f}"
                })
        
        val_loss /= len(val_loader)
        val_accuracy = val_correct / val_total
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)
        
        metrics = compute_metrics(val_labels, val_preds)
        val_f1 = metrics["f1"]
        
        if epoch == 0 or val_f1 > best_f1:
            best_f1 = val_f1
            model.wav2vec2.save_pretrained(os.path.join(output_dir, "best_model"))
            processor.save_pretrained(os.path.join(output_dir, "best_model"))
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"Early stopping triggered after {epoch+1} epochs.")
                break
        
        print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")
    
    return train_losses, train_accuracies, val_losses, val_accuracies

def plot_metrics(train_losses, train_accuracies, val_losses, val_accuracies, output_dir):
    epochs = range(1, len(train_losses) + 1)
    
    plt.figure(figsize=(12, 5))
    
    plt.subplot(1, 2, 1)
    plt.plot(epochs, train_losses, 'b-', label='Train Loss')
    plt.plot(epochs, val_losses, 'r-', label='Val Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    
    plt.subplot(1, 2, 2)
    plt.plot(epochs, train_accuracies, 'b-', label='Train Accuracy')
    plt.plot(epochs, val_accuracies, 'r-', label='Val Accuracy')
    plt.title('Training and Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid(True)
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'metrics_plot.png'))
    plt.close()

train_dirs = [
    "/teamspace/studios/this_studio/audio_detect/dataset/split_data/train/real",
    "/teamspace/studios/this_studio/audio_detect/dataset/split_data/train/fake"
]
val_dirs = [
    "/teamspace/studios/this_studio/audio_detect/dataset/split_data/val/real",
    "/teamspace/studios/this_studio/audio_detect/dataset/split_data/val/fake"
]
test_dirs = [
    "/teamspace/studios/this_studio/audio_detect/dataset/split_data/test/real",
    "/teamspace/studios/this_studio/audio_detect/dataset/split_data/test/fake"
]

train_loader, val_loader, test_loader = get_dataloaders(
    train_dirs,
    val_dirs,
    test_dirs,
    batch_size=16,
    num_workers=8
)



In [2]:
model = AudioDeepfakeModel(model_name="facebook/wav2vec2-base", num_labels=2)

output_dir = "saved_model"
os.makedirs(output_dir, exist_ok=True)

train_losses, train_accuracies, val_losses, val_accuracies = train_model(
    model, train_loader, val_loader, output_dir, num_epochs=15, patience=5, accum_steps=4
)

plot_metrics(train_losses, train_accuracies, val_losses, val_accuracies, output_dir)

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Epoch 1/15 [Train]: 100%|██████████| 2001/2001 [06:33<00:00,  5.09it/s, loss=0.0232, acc=0.8267]
Epoch 1/15 [Val]: 100%|██████████| 251/251 [00:34<00:00,  7.22it/s, loss=0.0139, acc=0.9075]


Epoch 1: Train Loss: 0.3709, Train Acc: 0.8267, Val Loss: 0.2220, Val Acc: 0.9075, Val F1: 0.9029


Epoch 2/15 [Train]: 100%|██████████| 2001/2001 [06:23<00:00,  5.22it/s, loss=0.0089, acc=0.9429]
Epoch 2/15 [Val]: 100%|██████████| 251/251 [00:29<00:00,  8.38it/s, loss=0.0063, acc=0.9643]


Epoch 2: Train Loss: 0.1416, Train Acc: 0.9429, Val Loss: 0.1001, Val Acc: 0.9643, Val F1: 0.9634


Epoch 3/15 [Train]: 100%|██████████| 2001/2001 [06:21<00:00,  5.24it/s, loss=0.0054, acc=0.9656]
Epoch 3/15 [Val]: 100%|██████████| 251/251 [00:25<00:00,  9.77it/s, loss=0.0055, acc=0.9650]


Epoch 3: Train Loss: 0.0866, Train Acc: 0.9656, Val Loss: 0.0871, Val Acc: 0.9650, Val F1: 0.9642


Epoch 4/15 [Train]: 100%|██████████| 2001/2001 [06:21<00:00,  5.24it/s, loss=0.0038, acc=0.9759]
Epoch 4/15 [Val]: 100%|██████████| 251/251 [00:24<00:00, 10.21it/s, loss=0.0113, acc=0.9423]


Epoch 4: Train Loss: 0.0610, Train Acc: 0.9759, Val Loss: 0.1805, Val Acc: 0.9423, Val F1: 0.9391


Epoch 5/15 [Train]: 100%|██████████| 2001/2001 [06:20<00:00,  5.26it/s, loss=0.0032, acc=0.9807]
Epoch 5/15 [Val]: 100%|██████████| 251/251 [00:21<00:00, 11.47it/s, loss=0.0049, acc=0.9755]


Epoch 5: Train Loss: 0.0518, Train Acc: 0.9807, Val Loss: 0.0789, Val Acc: 0.9755, Val F1: 0.9750


Epoch 6/15 [Train]: 100%|██████████| 2001/2001 [06:21<00:00,  5.25it/s, loss=0.0023, acc=0.9861]
Epoch 6/15 [Val]: 100%|██████████| 251/251 [00:20<00:00, 12.32it/s, loss=0.0043, acc=0.9738]


Epoch 6: Train Loss: 0.0367, Train Acc: 0.9861, Val Loss: 0.0680, Val Acc: 0.9738, Val F1: 0.9733


Epoch 7/15 [Train]: 100%|██████████| 2001/2001 [06:20<00:00,  5.26it/s, loss=0.0022, acc=0.9870]
Epoch 7/15 [Val]: 100%|██████████| 251/251 [00:20<00:00, 12.34it/s, loss=0.0058, acc=0.9758]


Epoch 7: Train Loss: 0.0355, Train Acc: 0.9870, Val Loss: 0.0927, Val Acc: 0.9758, Val F1: 0.9753


Epoch 8/15 [Train]: 100%|██████████| 2001/2001 [06:19<00:00,  5.27it/s, loss=0.0016, acc=0.9908]
Epoch 8/15 [Val]: 100%|██████████| 251/251 [00:19<00:00, 13.11it/s, loss=0.0029, acc=0.9833]


Epoch 8: Train Loss: 0.0258, Train Acc: 0.9908, Val Loss: 0.0470, Val Acc: 0.9833, Val F1: 0.9832


Epoch 9/15 [Train]: 100%|██████████| 2001/2001 [06:20<00:00,  5.25it/s, loss=0.0016, acc=0.9912]
Epoch 9/15 [Val]: 100%|██████████| 251/251 [00:19<00:00, 13.01it/s, loss=0.0031, acc=0.9818]


Epoch 9: Train Loss: 0.0254, Train Acc: 0.9912, Val Loss: 0.0493, Val Acc: 0.9818, Val F1: 0.9819


Epoch 10/15 [Train]: 100%|██████████| 2001/2001 [06:21<00:00,  5.25it/s, loss=0.0014, acc=0.9923]
Epoch 10/15 [Val]: 100%|██████████| 251/251 [00:19<00:00, 13.10it/s, loss=0.0045, acc=0.9795]


Epoch 10: Train Loss: 0.0223, Train Acc: 0.9923, Val Loss: 0.0722, Val Acc: 0.9795, Val F1: 0.9791


Epoch 11/15 [Train]: 100%|██████████| 2001/2001 [06:20<00:00,  5.26it/s, loss=0.0012, acc=0.9931]
Epoch 11/15 [Val]: 100%|██████████| 251/251 [00:19<00:00, 13.00it/s, loss=0.0091, acc=0.9733]


Epoch 11: Train Loss: 0.0191, Train Acc: 0.9931, Val Loss: 0.1455, Val Acc: 0.9733, Val F1: 0.9725


Epoch 12/15 [Train]: 100%|██████████| 2001/2001 [06:21<00:00,  5.25it/s, loss=0.0010, acc=0.9943]
Epoch 12/15 [Val]: 100%|██████████| 251/251 [00:19<00:00, 12.93it/s, loss=0.0111, acc=0.9623]


Epoch 12: Train Loss: 0.0166, Train Acc: 0.9943, Val Loss: 0.1772, Val Acc: 0.9623, Val F1: 0.9608


Epoch 13/15 [Train]: 100%|██████████| 2001/2001 [06:23<00:00,  5.22it/s, loss=0.0011, acc=0.9944]
Epoch 13/15 [Val]: 100%|██████████| 251/251 [00:17<00:00, 14.55it/s, loss=0.0036, acc=0.9863]


Early stopping triggered after 13 epochs.


In [None]:
import os
import torch
import torchaudio
import numpy as np
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
from sklearn.metrics import confusion_matrix, classification_report
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

class AudioDeepfakeDataset(Dataset):
    def __init__(self, data_dirs, sample_rate=16000, max_length=4.0):
        self.data_dirs = data_dirs
        self.sample_rate = sample_rate
        self.max_length = max_length
        self.max_samples = int(max_length * sample_rate)
        
        self.audio_files = []
        self.labels = []
        
        for data_dir in data_dirs:
            data_dir = Path(data_dir)
            label = 0 if 'real' in data_dir.name.lower() else 1
            for audio_file in data_dir.glob('*.wav'):
                self.audio_files.append(str(audio_file))
                self.labels.append(label)
        
        assert len(self.audio_files) > 0, "No audio files found in the provided directories."
    
    def __len__(self):
        return len(self.audio_files)
    
    def __getitem__(self, idx):
        audio_path = self.audio_files[idx]
        label = self.labels[idx]
        
        waveform, orig_sample_rate = torchaudio.load(audio_path)
        
        if orig_sample_rate != self.sample_rate:
            resampler = torchaudio.transforms.Resample(orig_sample_rate, self.sample_rate)
            waveform = resampler(waveform)
        
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        
        waveform = (waveform - waveform.mean()) / (waveform.std() + 1e-6)
        
        num_samples = waveform.shape[1]
        if num_samples > self.max_samples:
            waveform = waveform[:, :self.max_samples]
        elif num_samples < self.max_samples:
            padding = torch.zeros(1, self.max_samples - num_samples)
            waveform = torch.cat([waveform, padding], dim=1)
        
        return waveform.squeeze(0), label

def collate_fn(batch):
    waveforms, labels = zip(*batch)
    waveforms = torch.stack([wf for wf in waveforms])
    labels = torch.tensor(labels, dtype=torch.long)
    return waveforms, labels

def get_test_dataloader(test_dirs, batch_size=16, num_workers=8):
    test_dataset = AudioDeepfakeDataset(test_dirs)
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        collate_fn=collate_fn,
        pin_memory=True
    )
    return test_loader

def evaluate_model(model, test_loader, processor, output_dir):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    
    test_loss, test_correct, test_total = 0, 0, 0
    test_preds, test_labels = [], []
    
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating on Test Data"):
            waveforms, labels = batch
            waveforms, labels = waveforms.to(device), labels.to(device)
            
            outputs = model(waveforms, labels=labels)
            loss = outputs.loss
            
            test_loss += loss.item()
            preds = outputs.logits.argmax(dim=-1)
            test_correct += (preds == labels).sum().item()
            test_total += labels.size(0)
            test_preds.extend(preds.cpu().numpy())
            test_labels.extend(labels.cpu().numpy())
    
    test_loss /= len(test_loader)
    test_accuracy = test_correct / test_total
    
    # Compute confusion matrix
    cm = confusion_matrix(test_labels, test_preds)
    
    # Compute classification report
    class_report = classification_report(test_labels, test_preds, target_names=["Real", "Fake"])
    
    # Save classification report
    with open(os.path.join(output_dir, "classification_report.txt"), "w") as f:
        f.write(class_report)
    
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
    print("\nClassification Report:\n", class_report)
    
    return cm, test_labels, test_preds, test_loss, test_accuracy

def plot_confusion_matrix(cm, output_dir, class_names=["Real", "Fake"]):
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "confusion_matrix.png"))
    plt.close()

def main():
    # Define test data directories
    test_dirs = [
        "/teamspace/studios/this_studio/audio_detect/dataset/split_data/test/real",
        "/teamspace/studios/this_studio/audio_detect/dataset/split_data/test/fake"
    ]
    

In [5]:
    # Output directory
    output_dir = "saved_model"
    os.makedirs(output_dir, exist_ok=True)
    
    # Load saved model and processor
    model = Wav2Vec2ForSequenceClassification.from_pretrained(os.path.join(output_dir, "best_model"))
    processor = Wav2Vec2Processor.from_pretrained(os.path.join(output_dir, "best_model"))
    
    # Get test data loader
    test_loader = get_test_dataloader(test_dirs, batch_size=16, num_workers=8)
    
    # Evaluate model
    cm, test_labels, test_preds, test_loss, test_accuracy = evaluate_model(model, test_loader, processor, output_dir)
    
    # Plot and save confusion matrix
    plot_confusion_matrix(cm, output_dir)

Evaluating on Test Data: 100%|██████████| 251/251 [00:37<00:00,  6.67it/s]


Test Loss: 0.0743, Test Accuracy: 0.9830

Classification Report:
               precision    recall  f1-score   support

        Real       0.97      0.99      0.98      2001
        Fake       0.99      0.97      0.98      2001

    accuracy                           0.98      4002
   macro avg       0.98      0.98      0.98      4002
weighted avg       0.98      0.98      0.98      4002



In [None]:
import torch
import torchaudio
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

def preprocess_audio(audio_path, sample_rate=16000, max_length=4.0):
    # Load audio
    waveform, orig_sample_rate = torchaudio.load(audio_path)
    
    # Resample to 16kHz if needed
    if orig_sample_rate != sample_rate:
        resampler = torchaudio.transforms.Resample(orig_sample_rate, sample_rate)
        waveform = resampler(waveform)
    
    # Convert to mono if stereo
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    
    # Normalize (zero mean, unit variance)
    waveform = (waveform - waveform.mean()) / (waveform.std() + 1e-6)
    
    # Trim or pad to 4 seconds
    max_samples = int(max_length * sample_rate)
    num_samples = waveform.shape[1]
    if num_samples > max_samples:
        waveform = waveform[:, :max_samples]
    elif num_samples < max_samples:
        padding = torch.zeros(1, max_samples - num_samples)
        waveform = torch.cat([waveform, padding], dim=1)
    
    return waveform.squeeze(0)

def predict_audio(model, processor, audio_path, device):
    # Preprocess audio
    waveform = preprocess_audio(audio_path)
    
    # Move to device and add batch dimension
    waveform = waveform.unsqueeze(0).to(device)
    
    # Get model prediction
    model.eval()
    with torch.no_grad():
        outputs = model(waveform)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)
        predicted_label = logits.argmax(dim=-1).item()
        confidence = probabilities[0, predicted_label].item()
    
    return predicted_label, confidence

def main():
    # Paths
    model_path = "saved_model/best_model"
    audio_path = "path/to/your/audio_clip.wav"  # Replace with your audio file path
    
    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Load model and processor
    model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path).to(device)
    processor = Wav2Vec2Processor.from_pretrained(model_path)
    
    # Predict
    label, confidence = predict_audio(model, processor, audio_path, device)
    
    # Output result
    class_name = "Real" if label == 0 else "Fake"
    print(f"Prediction: {class_name}")
    print(f"Confidence: {confidence:.4f}")



In [10]:
import os
import torch
import torchaudio
import numpy as np
import glob
import random
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

def preprocess_audio(audio_path, sample_rate=16000, max_length=4.0):
    waveform, orig_sample_rate = torchaudio.load(audio_path)
    if orig_sample_rate != sample_rate:
        resampler = torchaudio.transforms.Resample(orig_sample_rate, sample_rate)
        waveform = resampler(waveform)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    waveform = (waveform - waveform.mean()) / (waveform.std() + 1e-6)
    max_samples = int(max_length * sample_rate)
    num_samples = waveform.shape[1]
    if num_samples > max_samples:
        waveform = waveform[:, :max_samples]
    elif num_samples < max_samples:
        padding = torch.zeros(1, max_samples - num_samples)
        waveform = torch.cat([waveform, padding], dim=1)
    return waveform.squeeze(0)

def predict_audio(model, processor, audio_path, device):
    waveform = preprocess_audio(audio_path)
    waveform = waveform.unsqueeze(0).to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(waveform)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)
        predicted_label = logits.argmax(dim=-1).item()
        confidence = probabilities[0, predicted_label].item()
    return predicted_label, confidence

def main():
    model_path = "saved_model/best_model"
    real_dir = "/teamspace/studios/this_studio/audio_detect/dataset/merged_data/real"
    real_files = glob.glob(os.path.join(real_dir, "*.wav"))
    
    if not real_files:
        print("No .wav files found in the real directory.")
        return
    
    audio_path = random.choice(real_files)
    file_name = os.path.basename(audio_path)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path).to(device)
    processor = Wav2Vec2Processor.from_pretrained(model_path)
    
    label, confidence = predict_audio(model, processor, audio_path, device)
    
    class_name = "Real" if label == 0 else "Fake"
    print(f"Selected Audio File: {file_name}")
    print(f"Prediction: {class_name}")
    print(f"Confidence: {confidence:.4f}")

if __name__ == "__main__":
    main()

Selected Audio File: real_20250503_235443_996155_9964_file31829.wav
Prediction: Real
Confidence: 0.9999


In [2]:
import os
import torch
import torchaudio
import numpy as np
import glob
import random
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

def preprocess_audio(audio_path, sample_rate=16000, max_length=4.0):
    waveform, orig_sample_rate = torchaudio.load(audio_path)
    if orig_sample_rate != sample_rate:
        resampler = torchaudio.transforms.Resample(orig_sample_rate, sample_rate)
        waveform = resampler(waveform)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    waveform = (waveform - waveform.mean()) / (waveform.std() + 1e-6)
    max_samples = int(max_length * sample_rate)
    num_samples = waveform.shape[1]
    if num_samples > max_samples:
        waveform = waveform[:, :max_samples]
    elif num_samples < max_samples:
        padding = torch.zeros(1, max_samples - num_samples)
        waveform = torch.cat([waveform, padding], dim=1)
    return waveform.squeeze(0)

def predict_audio(model, processor, audio_path, device):
    waveform = preprocess_audio(audio_path)
    waveform = waveform.unsqueeze(0).to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(waveform)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)
        predicted_label = logits.argmax(dim=-1).item()
        confidence = probabilities[0, predicted_label].item()
    return predicted_label, confidence

def main():
    model_path = "saved_model/best_model"
    fake_dir = "/teamspace/studios/this_studio/audio_detect/dataset/merged_data/fake"
    fake_files = glob.glob(os.path.join(fake_dir, "*.wav"))
    
    if not fake_files:
        print("No .wav files found in the fake directory.")
        return
    
    audio_path = random.choice(fake_files)
    file_name = os.path.basename(audio_path)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path).to(device)
    processor = Wav2Vec2Processor.from_pretrained(model_path)
    
    label, confidence = predict_audio(model, processor, audio_path, device)
    
    class_name = "Real" if label == 0 else "Fake"
    print(f"Selected Audio File: {file_name}")
    print(f"Prediction: {class_name}")
    print(f"Confidence: {confidence:.4f}")

if __name__ == "__main__":
    main()

Selected Audio File: fake_20250503_234905_944772_2089_C_31439_05_D.wav
Prediction: Fake
Confidence: 0.9999


In [1]:
import os
import zipfile

def zip_folder(folder_path, output_path):
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, start=folder_path)
                zipf.write(file_path, arcname)

# Paths
folder_to_zip = '/teamspace/studios/this_studio/audio_detect/model_trainging'
output_zip = '/teamspace/studios/this_studio/audio_detect/model_trainging.zip'

zip_folder(folder_to_zip, output_zip)
