In [1]:

# import packages
import torch
import gc
from transformers import  pipeline, AutoProcessor,AutoFeatureExtractor, AutoModelForAudioClassification, Wav2Vec2Processor, AutoConfig
from huggingface_hub import notebook_login
import librosa
import os
import pandas as pd
#import torchaudio
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from torch.nn import CrossEntropyLoss



  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
# Custom dataset
class AudioDataset(Dataset):
    def __init__(self, df, feature_extractor):
        self.df = df
        self.feature_extractor = feature_extractor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_path = self.df.iloc[idx]['file_path']
        label = self.df.iloc[idx]['label']
        waveform, sample_rate = librosa.load(file_path, sr=16000)
        inputs = self.feature_extractor(waveform, sampling_rate=sample_rate, return_tensors="pt")
        return inputs, label

In [3]:
class FullAudioDataset(Dataset):
    def __init__(self, df, feature_extractor):
        self.df = df
        self.feature_extractor = feature_extractor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_path = self.df.iloc[idx]['file_path']
        waveform, sample_rate = librosa.load(file_path, sr=16000)
        inputs = self.feature_extractor(waveform, sampling_rate=sample_rate, return_tensors="pt")
        file_name = self.df.iloc[idx]['file_name_without_ext']
        return inputs, file_name

In [4]:
# Path to your directory containing the .wav files and the CSV file
csv_file_path_fake = '/Users/saji/Desktop/juliusbaer-main/client_profiles/fake_recordings.csv'
# Read the CSV file
df_fake = pd.read_csv(csv_file_path_fake)

# Path to your directory containing the .wav files and the CSV file
csv_file_path_real = '/Users/saji/Desktop/juliusbaer-main/client_profiles/real_recordings.csv'
# Read the CSV file
df_real = pd.read_csv(csv_file_path_real)

In [5]:
#feature_extractor = AutoFeatureExtractor.from_pretrained("MelodyMachine/Deepfake-audio-detection-V2")
#model = AutoModelForAudioClassification.from_pretrained("MelodyMachine/Deepfake-audio-detection-V2")


In [6]:
feature_extractor = AutoFeatureExtractor.from_pretrained("MattyB95/AST-ASVspoof2019-Synthetic-Voice-Detection-New")
model = AutoModelForAudioClassification.from_pretrained("MattyB95/AST-ASVspoof2019-Synthetic-Voice-Detection-New")




In [7]:
#feature_extractor = AutoFeatureExtractor.from_pretrained("HyperMoon/wav2vec2-base-960h-finetuned-deepfake")
#model = AutoModelForAudioClassification.from_pretrained("HyperMoon/wav2vec2-base-960h-finetuned-deepfake") 

In [8]:
#feature_extractor = AutoFeatureExtractor.from_pretrained("abhishtagatya/hubert-base-960h-itw-deepfake")
#model = AutoModelForAudioClassification.from_pretrained("abhishtagatya/hubert-base-960h-itw-deepfake") 

In [9]:
#feature_extractor = AutoFeatureExtractor.from_pretrained("Gustking/wav2vec2-large-xlsr-deepfake-audio-classification")
#model = AutoModelForAudioClassification.from_pretrained("Gustking/wav2vec2-large-xlsr-deepfake-audio-classification") 

In [10]:
# Freeze all layers except the classifier
for param in model.parameters():
    param.requires_grad = False

for param in model.classifier.parameters():
    param.requires_grad = True

#for param in model.projector.parameters():
#    param.requires_grad = True

#optimizer = Adam(model.classifier.parameters(), lr=5e-4)


In [11]:
PATH = "/Users/saji/Desktop/juliusbaer-main/notebooks/audio_data/all/"

# Create the full paths
fake_wav_list = PATH + df_fake['rec_id'] + ".wav"
fake_labels = [0] * 20

# Create the full paths
real_wav_list = PATH + df_real['rec_id'] + ".wav"
real_labels = [1] * 20



# Combine the lists
combined_wav_list = pd.concat([fake_wav_list, real_wav_list]).tolist()
combined_labels = fake_labels + real_labels

# Create a DataFrame from the combined lists
df_data = pd.DataFrame({
    'file_path': combined_wav_list,
    'label': combined_labels
})

In [12]:
train_df, val_df = train_test_split(df_data, test_size=0.6, stratify=df_data['label'], random_state=999)


In [13]:
train_dataset = AudioDataset(train_df, feature_extractor)
val_dataset = AudioDataset(val_df, feature_extractor)



In [14]:
# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)


In [15]:



# Define the optimizer
#best matty
optimizer = Adam(model.classifier.parameters(), lr=8e-4, weight_decay=1e-3)

#optimizer = Adam(model.classifier.parameters(), lr=4e-5, weight_decay=1e-3)

#optimizer = Adam(list(model.classifier.parameters()) + list(model.projector.parameters()), lr=2e-4, weight_decay=1e-5)


# Define the loss function
criterion = CrossEntropyLoss()


In [16]:
# Fine-tune the classifier
model.train()
num_epochs = 8

best_val_loss = float('inf')


for epoch in range(num_epochs):
    total_train_loss = 0.0
    model.train()
    for inputs, label in train_dataloader:
        input_values = inputs["input_values"].squeeze(0)  # Remove batch dimension
        #attention_mask = inputs["attention_mask"].squeeze(0)  # Remove batch dimension

        # Move tensors to the appropriate device
        input_values = input_values.to(model.device)
        #attention_mask = attention_mask.to(model.device)
        label = torch.tensor(label).to(model.device)

        # Forward pass
        outputs = model(input_values=input_values)
        logits = outputs.logits
        #print(logits)
        #print(label)
        loss = criterion(logits, label)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
    
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Validate the model
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for inputs, label in val_dataloader:
            input_values = inputs["input_values"].squeeze(0)  # Remove batch dimension
            #attention_mask = inputs["attention_mask"].squeeze(0)  # Remove batch dimension

            # Move tensors to the appropriate device
            input_values = input_values.to(model.device)
            #attention_mask = attention_mask.to(model.device)
            label = torch.tensor(label).to(model.device)

            # Forward pass
            outputs = model(input_values=input_values)
            logits = outputs.logits
            loss = criterion(logits, label)

            total_val_loss += loss.item()
    
    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")
    
    # Save the model after each epoch
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': avg_val_loss,
    }, f'model_epoch_{epoch + 1}.pth')

    # Save the best model based on validation loss
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_val_loss,
        }, 'best_model.pth')

  label = torch.tensor(label).to(model.device)
  label = torch.tensor(label).to(model.device)


Epoch 1/8, Training Loss: 6.4145, Validation Loss: 3.8111
Epoch 2/8, Training Loss: 1.3764, Validation Loss: 0.7511
Epoch 3/8, Training Loss: 1.0878, Validation Loss: 0.9358
Epoch 4/8, Training Loss: 0.4681, Validation Loss: 0.4514
Epoch 5/8, Training Loss: 0.5493, Validation Loss: 0.4573
Epoch 6/8, Training Loss: 0.3924, Validation Loss: 0.3902
Epoch 7/8, Training Loss: 0.3298, Validation Loss: 0.3398
Epoch 8/8, Training Loss: 0.3025, Validation Loss: 0.3635


In [20]:
model.eval()

# Initialize lists to hold the data
predicted_labels_list = []
probabilities_list = []
true_labels_list = []

with torch.no_grad():
    for inputs, labels in val_dataloader:
        input_values = inputs["input_values"].squeeze(0)  # Remove batch dimension
        # Move tensor to the appropriate device
        input_values = input_values.to(model.device)
        labels = torch.tensor(labels).to(model.device)

        # Forward pass
        outputs = model(input_values=input_values)
        logits = outputs.logits

        # Apply softmax to convert logits into probabilities
        probabilities = torch.nn.functional.softmax(logits, dim=-1)

        # Get the predicted labels
        _, predicted_labels = torch.max(probabilities, dim=-1)

        # Convert tensors to list and store in the lists
        predicted_labels_list.extend(predicted_labels.cpu().numpy())
        probabilities_list.extend(probabilities.cpu().numpy())
        true_labels_list.extend(labels.cpu().numpy())

# Create a DataFrame from the lists
df = pd.DataFrame({
    'True_Labels': true_labels_list,
    'Predicted_Labels': predicted_labels_list,
    'Probabilities': probabilities_list
})



  labels = torch.tensor(labels).to(model.device)


In [17]:
feature_extractor_base = AutoFeatureExtractor.from_pretrained("MattyB95/AST-ASVspoof2019-Synthetic-Voice-Detection-New")
model_base = AutoModelForAudioClassification.from_pretrained("MattyB95/AST-ASVspoof2019-Synthetic-Voice-Detection-New")


In [17]:
model_base.eval()

# Initialize lists to hold the data
predicted_labels_list = []
probabilities_list = []
true_labels_list = []

with torch.no_grad():
    for inputs, labels in val_dataloader:
        input_values = inputs["input_values"].squeeze(0)  # Remove batch dimension
        # Move tensor to the appropriate device
        input_values = input_values.to(model_base.device)
        labels = torch.tensor(labels).to(model_base.device)

        # Forward pass
        outputs = model_base(input_values=input_values)
        logits = outputs.logits

        # Apply softmax to convert logits into probabilities
        probabilities = torch.nn.functional.softmax(logits, dim=-1)

        # Get the predicted labels
        _, predicted_labels = torch.max(probabilities, dim=-1)

        # Convert tensors to list and store in the lists
        predicted_labels_list.extend(predicted_labels.cpu().numpy())
        probabilities_list.extend(probabilities.cpu().numpy())
        true_labels_list.extend(labels.cpu().numpy())

# Create a DataFrame from the lists
df_base = pd.DataFrame({
    'True_Labels': true_labels_list,
    'Predicted_Labels': predicted_labels_list,
    'Probabilities': probabilities_list
})


  labels = torch.tensor(labels).to(model_base.device)


In [63]:
feature_extractor_load = AutoFeatureExtractor.from_pretrained("MattyB95/AST-ASVspoof2019-Synthetic-Voice-Detection-New")
model_load = AutoModelForAudioClassification.from_pretrained("MattyB95/AST-ASVspoof2019-Synthetic-Voice-Detection-New")

# Load the saved state dictionary
checkpoint = torch.load('/Users/saji/Desktop/juliusbaer-main/notebooks/matty/best_model_epoch_8_no_scheduler_matty.pth')

# Load the model state dictionary
model_load.load_state_dict(checkpoint['model_state_dict'])



In [36]:
model_load.eval()

# Initialize lists to hold the data
predicted_labels_list = []
probabilities_list = []
true_labels_list = []

with torch.no_grad():
    for inputs, labels in val_dataloader:
        input_values = inputs["input_values"].squeeze(0)  # Remove batch dimension
        # Move tensor to the appropriate device
        input_values = input_values.to(model_load.device)
        labels = torch.tensor(labels).to(model_load.device)

        # Forward pass
        outputs = model_load(input_values=input_values)
        logits = outputs.logits

        # Apply softmax to convert logits into probabilities
        probabilities = torch.nn.functional.softmax(logits, dim=-1)

        # Get the predicted labels
        _, predicted_labels = torch.max(probabilities, dim=-1)

        # Convert tensors to list and store in the lists
        predicted_labels_list.extend(predicted_labels.cpu().numpy())
        probabilities_list.extend(probabilities.cpu().numpy())
        true_labels_list.extend(labels.cpu().numpy())

# Create a DataFrame from the lists
df_load = pd.DataFrame({
    'True_Labels': true_labels_list,
    'Predicted_Labels': predicted_labels_list,
    'Probabilities': probabilities_list
})

  labels = torch.tensor(labels).to(model_load.device)


NameError: name 'feature_extractor_load' is not defined

In [26]:
feature_extractor_load = AutoFeatureExtractor.from_pretrained("MattyB95/AST-ASVspoof2019-Synthetic-Voice-Detection-New")
model_load = AutoModelForAudioClassification.from_pretrained("MattyB95/AST-ASVspoof2019-Synthetic-Voice-Detection-New")

# Load the saved state dictionary
checkpoint = torch.load('/Users/saji/Desktop/juliusbaer-main/notebooks/matty_200/best_model.pth')

# Load the model state dictionary
model_load.load_state_dict(checkpoint['model_state_dict'])


<All keys matched successfully>

In [30]:
path = "/Users/saji/Desktop/juliusbaer-main/notebooks/audio_data/all/"

# Get all files with ".wav" ending, including subdirectories
full_paths = []
file_names = []
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith('.wav'):
            full_path = os.path.join(root, file)
            file_name_without_ext = os.path.splitext(file)[0]
            full_paths.append(full_path)
            file_names.append(file_name_without_ext)

# Create a DataFrame with columns "file_name" and "file_name_without_ext"
df_full = pd.DataFrame({
    'file_path': full_paths,
    'file_name_without_ext': file_names
})

full_data = FullAudioDataset(df_full, feature_extractor_load)

fulldataloader = DataLoader(full_data, batch_size=1, shuffle=False)

In [31]:
model_load.eval()

predicted_labels_list = []
probabilities_list = []
probabilities_list_0 = []
probabilities_list_1 = []
name_list = []

with torch.no_grad():
    for inputs, name in fulldataloader:
        #print(name)
        input_values = inputs["input_values"].squeeze(0)  # Remove batch dimension
        # Move tensor to the appropriate device
        input_values = input_values.to(model_load.device)

        # Forward pass
        outputs = model_load(input_values=input_values)
        logits = outputs.logits

        # Apply softmax to convert logits into probabilities
        probabilities = torch.nn.functional.softmax(logits, dim=-1)

        # Get the predicted labels
        _, predicted_labels = torch.max(probabilities, dim=-1)

        # Convert tensors to list and store in the lists
        predicted_labels_list.extend(predicted_labels.cpu().numpy())
        #print(probabilities.cpu().numpy()[0][1])
        probabilities_list.extend(probabilities.cpu().numpy())
        probabilities_list_0.extend([probabilities.cpu().numpy()[0][0]])
        probabilities_list_1.extend([probabilities.cpu().numpy()[0][1]])

        # name is a list of strings; extend the name_list with these names
        name_list.extend(name)  # Assuming name is a list of strings

# Create a DataFrame from the lists
df_full = pd.DataFrame({
    'Name': name_list,
    'Predicted_Labels': predicted_labels_list,
    'Probabilities': probabilities_list,
    'Probabilities_0': probabilities_list_0,
    'Probabilities_1': probabilities_list_1,
})

        

In [32]:
df_full.to_csv('/Users/saji/Desktop/juliusbaer-main/notebooks/fake_pred_trained_model_200.csv', index=False)
