## Import data

In [None]:
## The data is not pushed to repo, only model and training logs etc are uploaded

import os
import torchaudio
from datasets import DatasetDict, load_dataset

def prepare_dataset(directory):
    data = {"path": [], "label": []}
    labels = {"fake": 0, "real": 1}  # Map fake to 0 and real to 1

    for label, label_id in labels.items():
        folder_path = os.path.join(directory, label)
        for file in os.listdir(folder_path):
            if file.endswith(".wav"):
                data["path"].append(os.path.join(folder_path, file))
                data["label"].append(label_id)
    return data

# Prepare train, validation, and test datasets
train_data = prepare_dataset(r"dataset\for-norm\for-norm\testing")
val_data = prepare_dataset(r"dataset\for-norm\for-norm\testing")
test_data = prepare_dataset(r"dataset\for-norm\for-norm\testing")


In [4]:
from datasets import Dataset

train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)
test_dataset = Dataset.from_dict(test_data)

dataset = DatasetDict({"train": train_dataset, "validation": val_dataset, "test": test_dataset})


## Import Model

In [5]:
from transformers import AutoProcessor
from transformers import AutoModelForAudioClassification

# Initialize processor
model_name = "facebook/wav2vec2-base"  # Replace with your model if different
model = AutoModelForAudioClassification.from_pretrained(model_name, num_labels=2)  # Adjust `num_labels` based on your dataset
processor = AutoProcessor.from_pretrained(model_name)


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Preprocess Data

In [None]:
import torch


def preprocess_function(batch):
    audio = torchaudio.load(batch["path"])[0].squeeze().numpy()
    inputs = processor(
        audio,
        sampling_rate=16000,
        padding=True,
        truncation=True,
        max_length=32000,  
        return_tensors="pt"
    )
    batch["input_values"] = inputs.input_values[0]
    # Ensure labels are converted to LongTensor
    batch["label"] = torch.tensor(batch["label"], dtype=torch.long)  # Convert label to LongTensor
    return batch

processed_dataset = dataset.map(preprocess_function, remove_columns=["path"], batched=False)
# Set format to torch tensors for compatibility with PyTorch
processed_dataset.set_format(type="torch", columns=["input_values", "label"])

# Double-check the label type again
print(processed_dataset["train"][0]["label"], type(processed_dataset["train"][0]["label"]))
print(processed_dataset["train"][0]["label"].dtype)  # Should print torch.int64


Map:   0%|          | 0/4634 [00:00<?, ? examples/s]

Map:   0%|          | 0/4634 [00:00<?, ? examples/s]

Map:   0%|          | 0/4634 [00:00<?, ? examples/s]

tensor(0) <class 'torch.Tensor'>
torch.int64


## Map Training Labels

In [12]:
# Ensure labels are in numerical format (e.g., 0, 1)
id2label = {0: "Fake", 1: "Real"}  # Define the mapping based on your dataset
label2id = {v: k for k, v in id2label.items()}  # Reverse mapping


print("Labels:", id2label)

# Update the model's configuration with labels
model.config.id2label = id2label
model.config.label2id = label2id

print("Labels:", model.config.id2label)  # Verify


Labels: {0: 'Fake', 1: 'Real'}
Labels: {0: 'Fake', 1: 'Real'}


In [None]:
from transformers import DataCollatorWithPadding

# Use the processor's tokenizer for padding
data_collator = DataCollatorWithPadding(tokenizer=processor, padding=True)


## Initialize Training Arguments

In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    fp16=True,  
    push_to_hub=False,
)
print("TrainingArguments initialized successfully!")


TrainingArguments initialized successfully!




In [9]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["validation"],
    tokenizer=processor,  # Required for the data collator
    data_collator=data_collator,
)


## Start Training

In [33]:
trainer.train()


  0%|          | 0/1740 [00:00<?, ?it/s]

{'loss': 0.6607, 'grad_norm': 2.5936832427978516, 'learning_rate': 4.971264367816092e-05, 'epoch': 0.02}
{'loss': 0.4545, 'grad_norm': 5.04218864440918, 'learning_rate': 4.9425287356321845e-05, 'epoch': 0.03}
{'loss': 0.1779, 'grad_norm': 0.8874927163124084, 'learning_rate': 4.913793103448276e-05, 'epoch': 0.05}
{'loss': 0.0833, 'grad_norm': 0.40262681245803833, 'learning_rate': 4.885057471264368e-05, 'epoch': 0.07}
{'loss': 0.0948, 'grad_norm': 0.5579108595848083, 'learning_rate': 4.85632183908046e-05, 'epoch': 0.09}
{'loss': 0.0128, 'grad_norm': 0.1585635393857956, 'learning_rate': 4.827586206896552e-05, 'epoch': 0.1}
{'loss': 0.0696, 'grad_norm': 0.12149885296821594, 'learning_rate': 4.798850574712644e-05, 'epoch': 0.12}
{'loss': 0.0065, 'grad_norm': 0.09655608981847763, 'learning_rate': 4.770114942528736e-05, 'epoch': 0.14}
{'loss': 0.0052, 'grad_norm': 0.08148041367530823, 'learning_rate': 4.741379310344828e-05, 'epoch': 0.16}
{'loss': 0.0041, 'grad_norm': 0.07030971348285675, 'le

  0%|          | 0/580 [00:00<?, ?it/s]

{'eval_loss': 0.00017967642634175718, 'eval_runtime': 566.6055, 'eval_samples_per_second': 8.179, 'eval_steps_per_second': 1.024, 'epoch': 1.0}
{'loss': 0.0002, 'grad_norm': 0.005443067755550146, 'learning_rate': 3.3045977011494256e-05, 'epoch': 1.02}
{'loss': 0.0002, 'grad_norm': 0.005919346585869789, 'learning_rate': 3.275862068965517e-05, 'epoch': 1.03}
{'loss': 0.0002, 'grad_norm': 0.00538916140794754, 'learning_rate': 3.24712643678161e-05, 'epoch': 1.05}
{'loss': 0.0002, 'grad_norm': 0.00514333276078105, 'learning_rate': 3.218390804597701e-05, 'epoch': 1.07}
{'loss': 0.0002, 'grad_norm': 0.005011783912777901, 'learning_rate': 3.1896551724137935e-05, 'epoch': 1.09}
{'loss': 0.0002, 'grad_norm': 0.005112846381962299, 'learning_rate': 3.160919540229885e-05, 'epoch': 1.1}
{'loss': 0.0002, 'grad_norm': 0.004895139951258898, 'learning_rate': 3.132183908045977e-05, 'epoch': 1.12}
{'loss': 0.0002, 'grad_norm': 0.004565018694847822, 'learning_rate': 3.103448275862069e-05, 'epoch': 1.14}
{'

  0%|          | 0/580 [00:00<?, ?it/s]

{'eval_loss': 6.0841484810225666e-05, 'eval_runtime': 581.8939, 'eval_samples_per_second': 7.964, 'eval_steps_per_second': 0.997, 'epoch': 2.0}
{'loss': 0.0001, 'grad_norm': 0.0022026619408279657, 'learning_rate': 1.6379310344827585e-05, 'epoch': 2.02}
{'loss': 0.0001, 'grad_norm': 0.002242449903860688, 'learning_rate': 1.6091954022988507e-05, 'epoch': 2.03}
{'loss': 0.0001, 'grad_norm': 0.002464097458869219, 'learning_rate': 1.5804597701149425e-05, 'epoch': 2.05}
{'loss': 0.0001, 'grad_norm': 0.0022003022022545338, 'learning_rate': 1.5517241379310346e-05, 'epoch': 2.07}
{'loss': 0.0001, 'grad_norm': 0.0021785416174679995, 'learning_rate': 1.5229885057471265e-05, 'epoch': 2.09}
{'loss': 0.0001, 'grad_norm': 0.0021638190373778343, 'learning_rate': 1.4942528735632185e-05, 'epoch': 2.1}
{'loss': 0.0001, 'grad_norm': 0.0022439502645283937, 'learning_rate': 1.4655172413793103e-05, 'epoch': 2.12}
{'loss': 0.0001, 'grad_norm': 0.0020717435982078314, 'learning_rate': 1.4367816091954022e-05, 'e

  0%|          | 0/580 [00:00<?, ?it/s]

{'eval_loss': 4.472154250834137e-05, 'eval_runtime': 541.083, 'eval_samples_per_second': 8.564, 'eval_steps_per_second': 1.072, 'epoch': 3.0}
{'train_runtime': 10729.5411, 'train_samples_per_second': 1.296, 'train_steps_per_second': 0.162, 'train_loss': 0.01232232698998762, 'epoch': 3.0}


TrainOutput(global_step=1740, training_loss=0.01232232698998762, metrics={'train_runtime': 10729.5411, 'train_samples_per_second': 1.296, 'train_steps_per_second': 0.162, 'total_flos': 2.5228134820702045e+17, 'train_loss': 0.01232232698998762, 'epoch': 3.0})

## Save The Model

In [34]:
# Save the trained model and processor
trainer.save_model("./trained_model")  # Saves the model to the specified directory
processor.save_pretrained("./trained_model")  # Saves the processor as well


[]

In [35]:
# Evaluate the model on the validation dataset
evaluation_metrics = trainer.evaluate()

# Print the evaluation metrics
print("Evaluation Metrics:")
for metric, value in evaluation_metrics.items():
    print(f"{metric}: {value}")


  0%|          | 0/580 [00:00<?, ?it/s]

Evaluation Metrics:
eval_loss: 4.472154250834137e-05
eval_runtime: 527.4045
eval_samples_per_second: 8.786
eval_steps_per_second: 1.1
epoch: 3.0


In [10]:
from transformers import AutoProcessor, AutoModelForAudioClassification

# Load the trained model and processor
model_path = "./trained_model"  # Path to your saved model
model = AutoModelForAudioClassification.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path)


## Single Audio Testing

In [34]:
def prepare_audio(file_path, sampling_rate=16000, duration=10):
    """
    Prepares audio by loading, resampling, and returning it in manageable chunks.
    
    Parameters:
    - file_path: Path to the audio file.
    - sampling_rate: Target sampling rate for the audio.
    - duration: Duration in seconds for each chunk.
    
    Returns:
    - A list of audio chunks, each as a numpy array.
    """
    # Load and resample the audio file
    waveform, original_sampling_rate = torchaudio.load(file_path)
    
    # Convert stereo to mono if necessary
    if waveform.shape[0] > 1:  # More than 1 channel
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    
    # Resample if needed
    if original_sampling_rate != sampling_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=original_sampling_rate, new_freq=sampling_rate)
        waveform = resampler(waveform)
    
    # Calculate chunk size in samples
    chunk_size = sampling_rate * duration
    audio_chunks = []

    # Split the audio into chunks
    for start in range(0, waveform.shape[1], chunk_size):
        chunk = waveform[:, start:start + chunk_size]
        
        # Pad the last chunk if it's shorter than the chunk size
        if chunk.shape[1] < chunk_size:
            padding = chunk_size - chunk.shape[1]
            chunk = torch.nn.functional.pad(chunk, (0, padding))
        
        audio_chunks.append(chunk.squeeze().numpy())
    
    return audio_chunks


In [35]:
def predict_audio(file_path):
    """
    Predicts the class of an audio file by aggregating predictions from chunks.
    
    Args:
        file_path (str): Path to the audio file.

    Returns:
        str: Predicted class label.
    """
    # Prepare audio chunks
    audio_chunks = prepare_audio(file_path)
    predictions = []

    for i, chunk in enumerate(audio_chunks):
        # Prepare input for the model
        print(f"Chunk shape: {chunk.shape}")
        inputs = processor(
            chunk, sampling_rate=16000, return_tensors="pt", padding=True
        )
        
        # Perform inference
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            print(f"Logits for chunk {i + 1}: {logits}")  # Print the logits
            predicted_class = torch.argmax(logits, dim=1).item()
            predictions.append(predicted_class)
    
    # Aggregate predictions (e.g., majority voting)
    aggregated_prediction = max(set(predictions), key=predictions.count)
    
    # Convert class ID to label
    return model.config.id2label[aggregated_prediction]

# Example: Test a single audio file
file_path = r"D:\Year 3 Sem 2\Godamlah\Deepfake\deepfake model ver3\data\KAGGLE\AUDIO\FAKE\biden-to-linus.wav"  # Replace with your audio file path
predicted_class = predict_audio(file_path)
print(f"Predicted Class: {predicted_class}")


Chunk shape: (160000,)
Logits for chunk 1: tensor([[ 4.6742, -5.1778]])
Chunk shape: (160000,)
Logits for chunk 2: tensor([[ 4.7219, -5.2332]])
Chunk shape: (160000,)
Logits for chunk 3: tensor([[ 4.7545, -5.2641]])
Chunk shape: (160000,)
Logits for chunk 4: tensor([[ 4.6714, -5.1740]])
Chunk shape: (160000,)
Logits for chunk 5: tensor([[ 4.7660, -5.2743]])
Chunk shape: (160000,)
Logits for chunk 6: tensor([[ 4.7724, -5.2836]])
Chunk shape: (160000,)
Logits for chunk 7: tensor([[ 4.7268, -5.2362]])
Chunk shape: (160000,)
Logits for chunk 8: tensor([[ 4.6898, -5.1898]])
Chunk shape: (160000,)
Logits for chunk 9: tensor([[ 4.6646, -5.1708]])
Chunk shape: (160000,)
Logits for chunk 10: tensor([[ 4.5948, -5.0867]])
Chunk shape: (160000,)
Logits for chunk 11: tensor([[ 4.7512, -5.2579]])
Chunk shape: (160000,)
Logits for chunk 12: tensor([[-4.5599,  5.0363]])
Chunk shape: (160000,)
Logits for chunk 13: tensor([[-0.4980,  0.5546]])
Chunk shape: (160000,)
Logits for chunk 14: tensor([[ 4.7295

## Batch Testing

In [36]:
import os

def batch_predict(test_folder, limit=10):
    """
    Batch processes audio files for predictions.

    Args:
        test_folder (str): Path to the folder containing audio files.
        limit (int): Maximum number of files to process. Set to None for all files.

    Returns:
        list: A list of dictionaries containing file names and predicted classes.
    """
    results = []
    files = os.listdir(test_folder)

    # Limit the number of files processed if a limit is provided
    if limit is not None:
        files = files[:limit]

    # Process each file in the folder
    for file_name in files:
        file_path = os.path.join(test_folder, file_name)
        try:
            predicted_class = predict_audio(file_path)  # Use the predict_audio function
            results.append({"file": file_name, "predicted_class": predicted_class})
        except Exception as e:
            print(f"Error processing {file_name}: {e}")
    
    return results

# Specify the folder path and limit
test_folder = r"D:\Year 3 Sem 2\Godamlah\Deepfake\deepfake model ver3\data\real life test audio"  # Replace with your test folder path
results = batch_predict(test_folder, limit=10)

# Print results
for result in results:
    print(result)


Chunk shape: (160000,)
Logits for chunk 1: tensor([[-3.3933,  3.7590]])
Chunk shape: (160000,)
Logits for chunk 1: tensor([[-3.3933,  3.7590]])
Chunk shape: (160000,)
Logits for chunk 1: tensor([[-1.5531,  1.7190]])
Chunk shape: (160000,)
Logits for chunk 1: tensor([[-1.5917,  1.7620]])
Chunk shape: (160000,)
Logits for chunk 1: tensor([[ 4.7569, -5.2631]])
Chunk shape: (160000,)
Logits for chunk 1: tensor([[ 4.7569, -5.2630]])
Chunk shape: (160000,)
Logits for chunk 1: tensor([[-4.5033,  4.9768]])
Chunk shape: (160000,)
Logits for chunk 1: tensor([[-4.5029,  4.9765]])
Chunk shape: (160000,)
Logits for chunk 1: tensor([[ 4.7639, -5.2653]])
Chunk shape: (160000,)
Logits for chunk 1: tensor([[ 4.7639, -5.2653]])
{'file': 'human voice 1 to mr beast.mp3', 'predicted_class': 'Real'}
{'file': 'human voice 1 to mr beast.wav', 'predicted_class': 'Real'}
{'file': 'human voice 1.mp3', 'predicted_class': 'Real'}
{'file': 'human voice 1.wav', 'predicted_class': 'Real'}
{'file': 'human voice 2 to J

In [14]:
import evaluate

# Load the accuracy metric
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)  # Get the predicted class
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    return {"accuracy": accuracy["accuracy"]}

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["validation"],
    tokenizer=processor,  # Required for padding
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Add this line
)

# Evaluate the model
metrics = trainer.evaluate()
print(metrics)


  0%|          | 0/580 [00:00<?, ?it/s]

{'eval_loss': 4.472154250834137e-05, 'eval_model_preparation_time': 0.0032, 'eval_accuracy': 1.0, 'eval_runtime': 1168.3696, 'eval_samples_per_second': 3.966, 'eval_steps_per_second': 0.496}
