In [1]:
import os
import torch
import torch.nn as nn
import torchaudio
import torchaudio.transforms as transforms
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Set GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [11]:
!pip install pydub


Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [13]:
from pydub import AudioSegment
import os

def convert_mp3_to_wav(mp3_dir, output_dir):
    """
    Converts all MP3 files in the given directory to WAV format and saves them in the output directory.
    
    Args:
        mp3_dir (str): Directory containing the MP3 files.
        output_dir (str): Directory where the converted WAV files will be saved.
    """
    
    os.makedirs(output_dir, exist_ok=True)
    
    # Loop through files in the given directory
    for file in os.listdir(mp3_dir):
        if file.endswith(".mp3"):
            mp3_path = os.path.join(mp3_dir, file)
            wav_path = os.path.join(output_dir, file.replace(".mp3", ".wav"))
            
            try:
                # Convert MP3 to WAV
                audio = AudioSegment.from_mp3(mp3_path)
                audio.export(wav_path, format="wav")
                print(f"Converted {mp3_path} to {wav_path}")
            except Exception as e:
                print(f"Error converting {mp3_path}: {e}")

# Example usage:
sarcastic_input_dir = "C:/Users/Rifat/Music/dsv1audio/sarcastic"
nonsarcastic_input_dir = "C:/Users/Rifat/Music/dsv1audio/nonsarcastic"
sarcastic_output_dir = "C:/Users/Rifat/Music/dsv1audio/sarcastic_wav"
nonsarcastic_output_dir = "C:/Users/Rifat/Music/dsv1audio/nonsarcastic_wav"

# Convert MP3 to WAV
convert_mp3_to_wav(sarcastic_input_dir, sarcastic_output_dir)
convert_mp3_to_wav(nonsarcastic_input_dir, nonsarcastic_output_dir)

print("MP3 to WAV conversion completed!")


Converted C:/Users/Rifat/Music/dsv1audio/sarcastic\dia1000_utt3.mp3 to C:/Users/Rifat/Music/dsv1audio/sarcastic_wav\dia1000_utt3.wav
Converted C:/Users/Rifat/Music/dsv1audio/sarcastic\dia1003_utt3.mp3 to C:/Users/Rifat/Music/dsv1audio/sarcastic_wav\dia1003_utt3.wav
Converted C:/Users/Rifat/Music/dsv1audio/sarcastic\dia1003_utt4.mp3 to C:/Users/Rifat/Music/dsv1audio/sarcastic_wav\dia1003_utt4.wav
Converted C:/Users/Rifat/Music/dsv1audio/sarcastic\dia1005_utt16.mp3 to C:/Users/Rifat/Music/dsv1audio/sarcastic_wav\dia1005_utt16.wav
Converted C:/Users/Rifat/Music/dsv1audio/sarcastic\dia1005_utt21.mp3 to C:/Users/Rifat/Music/dsv1audio/sarcastic_wav\dia1005_utt21.wav
Converted C:/Users/Rifat/Music/dsv1audio/sarcastic\dia1005_utt9.mp3 to C:/Users/Rifat/Music/dsv1audio/sarcastic_wav\dia1005_utt9.wav
Converted C:/Users/Rifat/Music/dsv1audio/sarcastic\dia1007_utt2.mp3 to C:/Users/Rifat/Music/dsv1audio/sarcastic_wav\dia1007_utt2.wav
Converted C:/Users/Rifat/Music/dsv1audio/sarcastic\dia1009_utt1.m

In [31]:
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader, random_split
import os
import torch.nn.functional as F

class SarcasmAudioDataset(Dataset):
    def __init__(self, root_dir, label, transform=None, target_length=16000):
       
        self.root_dir = root_dir
        self.label = label
        self.audio_files = [os.path.join(root_dir, file) for file in os.listdir(root_dir) if file.endswith('.wav')]
        self.transform = transform
        self.target_length = target_length

    def __len__(self):
        return len(self.audio_files)

    def __getitem__(self, idx):
        
        audio_path = self.audio_files[idx]
        waveform, sample_rate = torchaudio.load(audio_path)
        
        # Apply transformations
        if self.transform:
            waveform = self.transform(waveform)
        
        # Convert to mono if it's stereo
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        
        
        waveform = waveform.squeeze(0)  

        
        if waveform.size(0) < self.target_length:
            waveform = F.pad(waveform, (0, self.target_length - waveform.size(0)))  # Padding
        elif waveform.size(0) > self.target_length:
            waveform = waveform[:self.target_length]  # Truncating

        return waveform, self.label


transform = None  

# dataset directories
sarcastic_dir = "C:/Users/Rifat/Music/dsv1audio/sarcastic"
nonsarcastic_dir = "C:/Users/Rifat/Music/dsv1audio/nonsarcastic"


sarcastic_dataset = SarcasmAudioDataset(sarcastic_dir, label=1, transform=transform, target_length=16000)
nonsarcastic_dataset = SarcasmAudioDataset(nonsarcastic_dir, label=0, transform=transform, target_length=16000)

# splitting datasets into train, validation, and test sets
full_dataset = sarcastic_dataset + nonsarcastic_dataset
train_size = int(0.7 * len(full_dataset))
val_size = int(0.15 * len(full_dataset))
test_size = len(full_dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(full_dataset, [train_size, val_size, test_size])


train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

print("Data loading complete!")


Data loading complete!


In [37]:
import torch
import torchaudio
import torch.nn as nn

class AudioClassifier(nn.Module):
    def __init__(self):
        super(AudioClassifier, self).__init__()
        
        self.wav2vec = torchaudio.pipelines.WAV2VEC2_BASE.get_model()
        self.classifier = nn.Sequential(
            nn.Linear(768, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 2)  
        )

    def forward(self, x):
        
        with torch.no_grad():
            features, _ = self.wav2vec(x)  
        features = features.mean(dim=1) 
        return self.classifier(features)


model = AudioClassifier().to(device)


criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [39]:
save_path = "C:/Users/Rifat/Music/Models/ver0o1"

def train_model(model, train_loader, val_loader, num_epochs=10):
    best_acc = 0.0
    for epoch in range(num_epochs):
        model.train()
        train_loss, correct, total = 0.0, 0, 0
        
        print(f"Starting Epoch {epoch+1}/{num_epochs}")
        for batch_idx, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            if batch_idx % 10 == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}")
        
        train_acc = 100. * correct / total
        val_acc = evaluate_model(model, val_loader)
        print(f"Epoch {epoch+1}, Train Accuracy: {train_acc:.2f}%, Validation Accuracy: {val_acc:.2f}%")

        
        if val_acc > best_acc:
            best_acc = val_acc
            model_filename = f"{save_path}/audio_wav2vec_epoch{epoch+1}_val{val_acc:.2f}.pt"
            torch.save(model.state_dict(), model_filename)
            print(f"Model saved as {model_filename}")

def evaluate_model(model, val_loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return 100. * correct / total

train_model(model, train_loader, val_loader)


Starting Epoch 1/10
Epoch [1/10], Batch [1/97], Loss: 0.6767
Epoch [1/10], Batch [11/97], Loss: 0.5744
Epoch [1/10], Batch [21/97], Loss: 0.5634
Epoch [1/10], Batch [31/97], Loss: 0.6774
Epoch [1/10], Batch [41/97], Loss: 0.8068
Epoch [1/10], Batch [51/97], Loss: 0.6740
Epoch [1/10], Batch [61/97], Loss: 0.6714
Epoch [1/10], Batch [71/97], Loss: 0.6217
Epoch [1/10], Batch [81/97], Loss: 0.6883
Epoch [1/10], Batch [91/97], Loss: 0.6307
Epoch 1, Train Accuracy: 59.61%, Validation Accuracy: 62.05%
Model saved as C:/Users/Rifat/Music/Models/ver0o1/audio_wav2vec_epoch1_val62.05.pt
Starting Epoch 2/10
Epoch [2/10], Batch [1/97], Loss: 0.6934
Epoch [2/10], Batch [11/97], Loss: 0.7416
Epoch [2/10], Batch [21/97], Loss: 0.6181
Epoch [2/10], Batch [31/97], Loss: 0.7286
Epoch [2/10], Batch [41/97], Loss: 0.6347
Epoch [2/10], Batch [51/97], Loss: 1.1710
Epoch [2/10], Batch [61/97], Loss: 0.6923
Epoch [2/10], Batch [71/97], Loss: 0.6795
Epoch [2/10], Batch [81/97], Loss: 0.7089
Epoch [2/10], Batch 