# AUDIO EXTRACTION

In [1]:
# import os
# import subprocess

# def list_videos(base_dir, extensions=('.mp4',)):
#     """
#     List all video files in the dataset directory.
#     :param base_dir: Path to the dataset base directory
#     :param extensions: Tuple of file extensions to include
#     :return: List of file paths for all videos
#     """
#     video_files = []
#     for root, dirs, files in os.walk(base_dir):
#         for file in files:
#             if file.endswith(extensions):
#                 video_files.append(os.path.join(root, file))
#     return video_files

# def extract_audio(video_files, output_base_dir):
#     """
#     Extract audio from video files and save as .wav files.
#     :param video_files: List of video file paths
#     :param output_base_dir: Base directory for saving audio files
#     """
#     for video_file in video_files:
#         # Generate output directory and file paths
#         relative_path = os.path.relpath(video_file, input_dir)
#         output_dir = os.path.join(output_base_dir, os.path.dirname(relative_path))
#         os.makedirs(output_dir, exist_ok=True)
#         output_file = os.path.join(output_dir, os.path.basename(video_file).replace('.mp4', '.wav'))
        
#         # Run FFmpeg to extract audio
#         subprocess.run(['ffmpeg', '-i', video_file, '-vn', '-acodec', 'pcm_s16le', '-ar', '44100', '-ac', '2', output_file])
#         print(f"Extracted: {video_file} -> {output_file}")



In [2]:
# # Base directories
# input_dir = '/kaggle/input/first-impression-v2-train-dataset/train-1'
# output_base_dir = '/kaggle/working/fi_audio_train_dataset/train-1'

# # List video files
# video_files = list_videos(input_dir)

# print(f"Found {len(video_files)} video files.")
# extract_audio(video_files, output_base_dir)

In [3]:
# print("done")

---

# FEATURE EXTRACTION

In [4]:
# import os
# import shutil

# # Specify the folder to delete
# folder_path = "/kaggle/working/segmented_audio"

# # Check if the folder exists
# if os.path.exists(folder_path):
#     # Delete the folder and its contents
#     shutil.rmtree(folder_path)
#     print(f"Folder '{folder_path}' has been deleted.")
# else:
#     print(f"Folder '{folder_path}' does not exist.")


In [4]:
# import os

# # Specify the file to delete
# file_path = "/kaggle/working/audio_feature_extraction/train_6_af.csv"

# # Check if the file exists
# if os.path.exists(file_path):
#     # Delete the file
#     os.remove(file_path)
#     print(f"File '{file_path}' has been deleted.")
# else:
#     print(f"File '{file_path}' does not exist.")


File '/kaggle/working/audio_feature_extraction/train_6_af.csv' does not exist.


## DEEP AND HAND-CRAFTED (HC) FEATURE EXTRACTION: TRAIN-1

In [6]:
import os

def list_audio_files(base_dir, extension=".wav"):
    """
    Recursively collect the audio files stored under a directory.

    :param base_dir: Root directory to walk
    :param extension: Filename suffix to keep (default is ".wav")
    :return: List of matching file paths, each joined with its containing folder
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(base_dir)
        for name in names
        if name.endswith(extension)
    ]

# Locate the .wav files of this training split. base_dir is a notebook-level variable
# and is read again by the later cell that runs the full pipeline.
base_dir = "/kaggle/working/fi_audio_train_dataset/train-1"  # Replace with the path to your dataset
audio_files = list_audio_files(base_dir)
print(f"Found {len(audio_files)} audio files.")


Found 960 audio files.


In [7]:
import os
import pandas as pd

# Folder (relative to the working directory) that receives the per-split feature CSVs
output_dir = "audio_feature_extraction"

# Create output directory if not exists
os.makedirs(output_dir, exist_ok=True)

# Output file for this split and its column layout:
# two ID columns, then 512 deep-feature columns and 25 hand-crafted-feature columns
output_file = "audio_feature_extraction/train_1_af.csv"
columns = ['Audio_ID', 'Segment_ID'] + [f'Deep_{i}' for i in range(512)] + [f'Hc_{i}' for i in range(25)]

# Write the header row only when the file is new. save_features appends rows with
# header=False, so a file left over from an earlier run keeps its contents and
# simply receives more rows.
if not os.path.exists(output_file):
    pd.DataFrame(columns=columns).to_csv(output_file, index=False)


In [8]:
from pydub import AudioSegment

def segment_audio(audio_path, segment_duration=2, skip_duration=3):
    """
    Cut an audio file into fixed-length windows taken at a regular stride.

    Windows start at 0, skip_duration, 2 * skip_duration, ... for as long as a
    complete window still fits inside the recording. A recording shorter than
    one window yields a single window padded with trailing silence.

    :param audio_path: Path of the audio file to load with pydub
    :param segment_duration: Window length in seconds (default 2)
    :param skip_duration: Distance between consecutive window starts in seconds (default 3)
    :return: List of pydub AudioSegment windows, each segment_duration long
    :raises ValueError: If either duration is not positive
    """
    segment_ms = int(segment_duration * 1000)
    step_ms = int(skip_duration * 1000)
    if segment_ms <= 0 or step_ms <= 0:
        raise ValueError("segment_duration and skip_duration must be positive")

    audio = AudioSegment.from_file(audio_path)
    total_ms = len(audio)  # pydub reports length in milliseconds
    if total_ms == 0:
        return []

    # The previous bound, int(duration - segment_duration), was exclusive: it dropped the
    # final window whenever that value was a multiple of the stride, returned nothing for
    # clips shorter than one window, and made the padding branch unreachable.
    last_start_ms = max(total_ms - segment_ms, 0)
    segments = []
    for start_ms in range(0, last_start_ms + 1, step_ms):
        segment = audio[start_ms:start_ms + segment_ms]
        shortfall_ms = segment_ms - len(segment)
        if shortfall_ms > 0:
            # Pad with silence at the recording's own frame rate
            segment = segment + AudioSegment.silent(duration=shortfall_ms, frame_rate=audio.frame_rate)
        segments.append(segment)
    return segments



In [9]:
import numpy as np
import librosa
from PIL import Image
import torch
import torchvision.transforms as transforms

# Load torchvision's VGG-16 (hub tag v0.10.0) with its pretrained weights.
# NOTE(review): this was previously described as an "emotional" VGG-16, but nothing in
# this call loads emotion-specific weights - confirm whether a fine-tuned checkpoint was intended.
vgg_model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg16', pretrained=True)
# Switch the network to inference mode
vgg_model.eval()

def extract_deep_features(segments):
    """
    Turn audio segments into 512-dimensional VGG-16 descriptors.

    Each segment is downmixed to mono and resampled to 16 kHz so the sample
    array really matches the ``sr=16000`` given to librosa, rendered as a
    log-Mel spectrogram image, and passed through the convolutional part of
    the notebook-level ``vgg_model``. The final feature map is averaged over
    its spatial positions, giving one value per channel.

    :param segments: Iterable of pydub AudioSegment objects
    :return: List with one numpy array of shape (1, 512) per segment
    """
    target_sr = 16000
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    features = []
    for segment in segments:
        # get_array_of_samples() interleaves channels and keeps the file's own frame
        # rate, so normalise both before treating the data as 16 kHz mono.
        mono = segment.set_channels(1).set_frame_rate(target_sr)
        samples = np.array(mono.get_array_of_samples(), dtype=np.float32)

        # Compute log-Mel spectrogram
        mel_spec = librosa.feature.melspectrogram(y=samples, sr=target_sr, n_mels=128, fmax=8000, hop_length=512, win_length=2048)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Map the [-80, 0] dB range onto [0, 255] for image conversion
        log_mel_spec = np.clip(log_mel_spec, a_min=-80, a_max=0)
        log_mel_spec = (log_mel_spec + 80) * (255.0 / 80.0)

        # Convert to a 3-channel image, resize and add the batch dimension
        image = Image.fromarray(log_mel_spec.astype(np.uint8)).convert("RGB")
        image = transform(image).unsqueeze(0)

        with torch.no_grad():
            # Calling vgg_model(image) would return the 1000 ImageNet class scores;
            # the 512 "Deep_*" columns need the 512-channel convolutional output,
            # pooled over height and width.
            feature_map = vgg_model.features(image)
            deep_feature = feature_map.mean(dim=(2, 3)).numpy()
        features.append(deep_feature)

    return features


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


In [10]:
# pip install opensmile

In [11]:
import opensmile
from opensmile import Smile, FeatureSet, FeatureLevel

# Shared openSMILE extractor used by extract_egemaps:
# eGeMAPSv02 feature set computed at the Functionals level.
smile = Smile(
    feature_set=FeatureSet.eGeMAPSv02,
    feature_level=FeatureLevel.Functionals
)

def extract_egemaps(segment):
    """
    Extract eGeMAPS features using openSMILE.

    The segment is downmixed to mono, scaled to floats in [-1, 1] and passed to
    the notebook-level ``smile`` extractor together with its real frame rate.

    :param segment: pydub AudioSegment
    :return: Hand-crafted features as a numpy array
    """
    # Interleaved stereo samples would otherwise be read as one double-length channel.
    mono = segment.set_channels(1)
    samples = np.array(mono.get_array_of_samples(), dtype=np.float32)
    # pydub yields raw integer PCM values; floating-point audio is conventionally in [-1, 1].
    samples /= float(mono.max_possible_amplitude)
    # Report the segment's true rate instead of a hard-coded 16000.
    features = smile.process_signal(samples, sampling_rate=mono.frame_rate).to_numpy()
    return features


In [12]:
def save_features(audio_id, segment_id, deep_feat, hc_feat, output_path=None):
    """
    Append one segment's features as a single header-less row of a CSV file.

    :param audio_id: ID of the audio file
    :param segment_id: ID of the audio segment
    :param deep_feat: Deep features for the segment; averaged over axis 0, first 512 values kept
    :param hc_feat: Hand-crafted features for the segment; averaged over axis 0, first 25 values kept
    :param output_path: CSV file to append to; defaults to the notebook-level ``output_file``
    """
    target = output_file if output_path is None else output_path

    # Average once per call; the means used to be recomputed for each of the 537 columns.
    deep_mean = deep_feat.mean(axis=0)
    hc_mean = hc_feat.mean(axis=0)

    combined_features = {
        'Audio_ID': audio_id,
        'Segment_ID': segment_id,
    }
    combined_features.update({f'Deep_{i}': deep_mean[i] for i in range(512)})
    combined_features.update({f'Hc_{i}': hc_mean[i] for i in range(25)})

    # Append to CSV (the header is written when the file is created)
    feature_row = pd.DataFrame([combined_features])
    feature_row.to_csv(target, mode='a', header=False, index=False)


In [13]:
def process_audio_files(audio_files):
    """
    Run segmentation, feature extraction and CSV export for every audio file.

    :param audio_files: Iterable of audio file paths
    """
    for wav_path in audio_files:
        # The file name without its extension serves as the audio ID
        file_name = os.path.basename(wav_path)
        audio_id, _ext = os.path.splitext(file_name)
        print(f"Processing {audio_id}...")

        for segment_id, chunk in enumerate(segment_audio(wav_path)):
            # One deep and one hand-crafted feature set per segment, written immediately
            deep_feat = extract_deep_features([chunk])[0]
            hc_feat = extract_egemaps(chunk)
            save_features(audio_id, segment_id, deep_feat, hc_feat)

    print(f"All features saved to {output_file}")


In [14]:
# Step 1: Re-list the audio files under base_dir. base_dir is not assigned in this cell;
# it keeps the value set earlier in the notebook (the old commented-out "train-2" override was stale).
audio_files = list_audio_files(base_dir)
print(f"Found {len(audio_files)} audio files.")

# Step 2: Segment every file, extract its features and append them to output_file
process_audio_files(audio_files)  # defined in the cell above


Found 960 audio files.
Processing 69BopbFc34U.004...
Processing p_Z4QGqu4Qo.004...
Processing 6TkMavTyimI.000...
Processing kdPRPO3wgg0.004...
Processing PTV5TBVRHxo.002...
Processing dmycfNpiWCE.005...
Processing fEZrAGQoh_g.001...
Processing u50B6bIkN9g.002...
Processing zvX4fNj0uxk.004...
Processing si_gZCrLa4A.005...
Processing D4TU65xbF4g.000...
Processing 8i7H-uSCQcc.004...
Processing ya8Ec_yTai8.002...
Processing t899haDGi38.003...
Processing ZiKxJbVI5_g.002...
Processing Sb1b8JMVhLs.002...
Processing bFwtVtZodIg.002...
Processing KahSwziq8F0.002...
Processing PtA7yAu9-VE.005...
Processing C0UMWrEJ2x0.005...
Processing JO44XCaQGVY.000...
Processing GwKmjEb3qN0.001...
Processing 78zauTEQ-k8.001...
Processing orlXEgAepGo.002...
Processing ld7mBR4v3yU.002...
Processing fEix0DPOWtg.005...
Processing Vk3scj5vhd4.004...
Processing LeQKH1vAVpg.005...
Processing 2fzLibPAtvI.005...
Processing L-C1blPD_ec.000...
Processing e2EmGXRJ1K0.001...
Processing nDsTSmE73M0.000...
Processing OD_gyC

In [15]:
print("done")

done


## DEEP AND HAND-CRAFTED (HC) FEATURE EXTRACTION: TRAIN-2

In [16]:
import os

def list_audio_files(base_dir, extension=".wav"):
    """
    Recursively collect the audio files stored under a directory.

    :param base_dir: Root directory to walk
    :param extension: Filename suffix to keep (default is ".wav")
    :return: List of matching file paths, each joined with its containing folder
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(base_dir)
        for name in names
        if name.endswith(extension)
    ]

# Locate the .wav files of this training split. base_dir is a notebook-level variable
# and is read again by the later cell that runs the full pipeline.
base_dir = "/kaggle/working/fi_audio_train_dataset/train-2"  # Replace with the path to your dataset
audio_files = list_audio_files(base_dir)
print(f"Found {len(audio_files)} audio files.")


Found 960 audio files.


In [17]:
import os
import pandas as pd

# Folder (relative to the working directory) that receives the per-split feature CSVs
output_dir = "audio_feature_extraction"

# Create output directory if not exists
os.makedirs(output_dir, exist_ok=True)

# Output file for this split and its column layout:
# two ID columns, then 512 deep-feature columns and 25 hand-crafted-feature columns
output_file = "audio_feature_extraction/train_2_af.csv"
columns = ['Audio_ID', 'Segment_ID'] + [f'Deep_{i}' for i in range(512)] + [f'Hc_{i}' for i in range(25)]

# Write the header row only when the file is new. save_features appends rows with
# header=False, so a file left over from an earlier run keeps its contents and
# simply receives more rows.
if not os.path.exists(output_file):
    pd.DataFrame(columns=columns).to_csv(output_file, index=False)


In [18]:
from pydub import AudioSegment

def segment_audio(audio_path, segment_duration=2, skip_duration=3):
    """
    Cut an audio file into fixed-length windows taken at a regular stride.

    Windows start at 0, skip_duration, 2 * skip_duration, ... for as long as a
    complete window still fits inside the recording. A recording shorter than
    one window yields a single window padded with trailing silence.

    :param audio_path: Path of the audio file to load with pydub
    :param segment_duration: Window length in seconds (default 2)
    :param skip_duration: Distance between consecutive window starts in seconds (default 3)
    :return: List of pydub AudioSegment windows, each segment_duration long
    :raises ValueError: If either duration is not positive
    """
    segment_ms = int(segment_duration * 1000)
    step_ms = int(skip_duration * 1000)
    if segment_ms <= 0 or step_ms <= 0:
        raise ValueError("segment_duration and skip_duration must be positive")

    audio = AudioSegment.from_file(audio_path)
    total_ms = len(audio)  # pydub reports length in milliseconds
    if total_ms == 0:
        return []

    # The previous bound, int(duration - segment_duration), was exclusive: it dropped the
    # final window whenever that value was a multiple of the stride, returned nothing for
    # clips shorter than one window, and made the padding branch unreachable.
    last_start_ms = max(total_ms - segment_ms, 0)
    segments = []
    for start_ms in range(0, last_start_ms + 1, step_ms):
        segment = audio[start_ms:start_ms + segment_ms]
        shortfall_ms = segment_ms - len(segment)
        if shortfall_ms > 0:
            # Pad with silence at the recording's own frame rate
            segment = segment + AudioSegment.silent(duration=shortfall_ms, frame_rate=audio.frame_rate)
        segments.append(segment)
    return segments



In [19]:
import numpy as np
import librosa
from PIL import Image
import torch
import torchvision.transforms as transforms

# Load torchvision's VGG-16 (hub tag v0.10.0) with its pretrained weights.
# NOTE(review): this was previously described as an "emotional" VGG-16, but nothing in
# this call loads emotion-specific weights - confirm whether a fine-tuned checkpoint was intended.
vgg_model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg16', pretrained=True)
# Switch the network to inference mode
vgg_model.eval()

def extract_deep_features(segments):
    """
    Turn audio segments into 512-dimensional VGG-16 descriptors.

    Each segment is downmixed to mono and resampled to 16 kHz so the sample
    array really matches the ``sr=16000`` given to librosa, rendered as a
    log-Mel spectrogram image, and passed through the convolutional part of
    the notebook-level ``vgg_model``. The final feature map is averaged over
    its spatial positions, giving one value per channel.

    :param segments: Iterable of pydub AudioSegment objects
    :return: List with one numpy array of shape (1, 512) per segment
    """
    target_sr = 16000
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    features = []
    for segment in segments:
        # get_array_of_samples() interleaves channels and keeps the file's own frame
        # rate, so normalise both before treating the data as 16 kHz mono.
        mono = segment.set_channels(1).set_frame_rate(target_sr)
        samples = np.array(mono.get_array_of_samples(), dtype=np.float32)

        # Compute log-Mel spectrogram
        mel_spec = librosa.feature.melspectrogram(y=samples, sr=target_sr, n_mels=128, fmax=8000, hop_length=512, win_length=2048)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Map the [-80, 0] dB range onto [0, 255] for image conversion
        log_mel_spec = np.clip(log_mel_spec, a_min=-80, a_max=0)
        log_mel_spec = (log_mel_spec + 80) * (255.0 / 80.0)

        # Convert to a 3-channel image, resize and add the batch dimension
        image = Image.fromarray(log_mel_spec.astype(np.uint8)).convert("RGB")
        image = transform(image).unsqueeze(0)

        with torch.no_grad():
            # Calling vgg_model(image) would return the 1000 ImageNet class scores;
            # the 512 "Deep_*" columns need the 512-channel convolutional output,
            # pooled over height and width.
            feature_map = vgg_model.features(image)
            deep_feature = feature_map.mean(dim=(2, 3)).numpy()
        features.append(deep_feature)

    return features


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


In [20]:
# pip install opensmile

In [21]:
import opensmile
from opensmile import Smile, FeatureSet, FeatureLevel

# Shared openSMILE extractor used by extract_egemaps:
# eGeMAPSv02 feature set computed at the Functionals level.
smile = Smile(
    feature_set=FeatureSet.eGeMAPSv02,
    feature_level=FeatureLevel.Functionals
)

def extract_egemaps(segment):
    """
    Extract eGeMAPS features using openSMILE.

    The segment is downmixed to mono, scaled to floats in [-1, 1] and passed to
    the notebook-level ``smile`` extractor together with its real frame rate.

    :param segment: pydub AudioSegment
    :return: Hand-crafted features as a numpy array
    """
    # Interleaved stereo samples would otherwise be read as one double-length channel.
    mono = segment.set_channels(1)
    samples = np.array(mono.get_array_of_samples(), dtype=np.float32)
    # pydub yields raw integer PCM values; floating-point audio is conventionally in [-1, 1].
    samples /= float(mono.max_possible_amplitude)
    # Report the segment's true rate instead of a hard-coded 16000.
    features = smile.process_signal(samples, sampling_rate=mono.frame_rate).to_numpy()
    return features


In [22]:
def save_features(audio_id, segment_id, deep_feat, hc_feat, output_path=None):
    """
    Append one segment's features as a single header-less row of a CSV file.

    :param audio_id: ID of the audio file
    :param segment_id: ID of the audio segment
    :param deep_feat: Deep features for the segment; averaged over axis 0, first 512 values kept
    :param hc_feat: Hand-crafted features for the segment; averaged over axis 0, first 25 values kept
    :param output_path: CSV file to append to; defaults to the notebook-level ``output_file``
    """
    target = output_file if output_path is None else output_path

    # Average once per call; the means used to be recomputed for each of the 537 columns.
    deep_mean = deep_feat.mean(axis=0)
    hc_mean = hc_feat.mean(axis=0)

    combined_features = {
        'Audio_ID': audio_id,
        'Segment_ID': segment_id,
    }
    combined_features.update({f'Deep_{i}': deep_mean[i] for i in range(512)})
    combined_features.update({f'Hc_{i}': hc_mean[i] for i in range(25)})

    # Append to CSV (the header is written when the file is created)
    feature_row = pd.DataFrame([combined_features])
    feature_row.to_csv(target, mode='a', header=False, index=False)


In [23]:
def process_audio_files(audio_files):
    """
    Run segmentation, feature extraction and CSV export for every audio file.

    :param audio_files: Iterable of audio file paths
    """
    for wav_path in audio_files:
        # The file name without its extension serves as the audio ID
        file_name = os.path.basename(wav_path)
        audio_id, _ext = os.path.splitext(file_name)
        print(f"Processing {audio_id}...")

        for segment_id, chunk in enumerate(segment_audio(wav_path)):
            # One deep and one hand-crafted feature set per segment, written immediately
            deep_feat = extract_deep_features([chunk])[0]
            hc_feat = extract_egemaps(chunk)
            save_features(audio_id, segment_id, deep_feat, hc_feat)

    print(f"All features saved to {output_file}")


In [24]:
# Step 1: Re-list the audio files under base_dir. base_dir is not assigned in this cell;
# it keeps the value set earlier in the notebook, so no override is needed here.
audio_files = list_audio_files(base_dir)
print(f"Found {len(audio_files)} audio files.")

# Step 2: Segment every file, extract its features and append them to output_file
process_audio_files(audio_files)  # defined in the cell above


Found 960 audio files.
Processing djsoQkroHpo.001...
Processing 3Vr5-zedeWk.004...
Processing xGo_wImhwkQ.003...
Processing rTG9gtci67c.004...
Processing CQMH9Qguuao.003...
Processing dB-kMg4t-V8.001...
Processing BLeDNfK2quI.002...
Processing 44rxmXiga90.002...
Processing M0U48Lm33A8.003...
Processing C-48U5oDuvw.003...
Processing mBJTwQsDC4M.003...
Processing bJktioZ5Yxk.004...
Processing 7LHmNEH65Pk.002...
Processing huIQlWsIEvY.004...
Processing noepjVnUVFY.001...
Processing dB65He57Ki8.005...
Processing EeI8iXLDfc0.005...
Processing 176vWywoq9E.004...
Processing fBMKlYUjA9E.000...
Processing G7QTNbKbu_4.005...
Processing NDC375coN1o.005...
Processing TaTMtWvSmu4.000...
Processing fBcc1UMtZME.001...
Processing TNtcyfM9jak.003...
Processing 8aLr0vNobr4.001...
Processing p7-JUvStF4w.000...
Processing UzgW75Fd4jU.001...
Processing c4XnKouozXU.003...
Processing 3gKpBq-1yG4.000...
Processing Kmrd1MsZKmQ.001...
Processing jgyDXrhO3n4.000...
Processing EeI8iXLDfc0.001...
Processing US4Pxg

In [25]:
print("done")

done


## DEEP AND HAND-CRAFTED (HC) FEATURE EXTRACTION: TRAIN-3

In [26]:
import os

def list_audio_files(base_dir, extension=".wav"):
    """
    Recursively collect the audio files stored under a directory.

    :param base_dir: Root directory to walk
    :param extension: Filename suffix to keep (default is ".wav")
    :return: List of matching file paths, each joined with its containing folder
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(base_dir)
        for name in names
        if name.endswith(extension)
    ]

# Locate the .wav files of this training split. base_dir is a notebook-level variable
# and is read again by the later cell that runs the full pipeline.
base_dir = "/kaggle/working/fi_audio_train_dataset/train-3"  # Replace with the path to your dataset
audio_files = list_audio_files(base_dir)
print(f"Found {len(audio_files)} audio files.")


Found 960 audio files.


In [27]:
import os
import pandas as pd

# Folder (relative to the working directory) that receives the per-split feature CSVs
output_dir = "audio_feature_extraction"

# Create output directory if not exists
os.makedirs(output_dir, exist_ok=True)

# Output file for this split and its column layout:
# two ID columns, then 512 deep-feature columns and 25 hand-crafted-feature columns
output_file = "audio_feature_extraction/train_3_af.csv"
columns = ['Audio_ID', 'Segment_ID'] + [f'Deep_{i}' for i in range(512)] + [f'Hc_{i}' for i in range(25)]

# Write the header row only when the file is new. save_features appends rows with
# header=False, so a file left over from an earlier run keeps its contents and
# simply receives more rows.
if not os.path.exists(output_file):
    pd.DataFrame(columns=columns).to_csv(output_file, index=False)


In [28]:
from pydub import AudioSegment

def segment_audio(audio_path, segment_duration=2, skip_duration=3):
    """
    Cut an audio file into fixed-length windows taken at a regular stride.

    Windows start at 0, skip_duration, 2 * skip_duration, ... for as long as a
    complete window still fits inside the recording. A recording shorter than
    one window yields a single window padded with trailing silence.

    :param audio_path: Path of the audio file to load with pydub
    :param segment_duration: Window length in seconds (default 2)
    :param skip_duration: Distance between consecutive window starts in seconds (default 3)
    :return: List of pydub AudioSegment windows, each segment_duration long
    :raises ValueError: If either duration is not positive
    """
    segment_ms = int(segment_duration * 1000)
    step_ms = int(skip_duration * 1000)
    if segment_ms <= 0 or step_ms <= 0:
        raise ValueError("segment_duration and skip_duration must be positive")

    audio = AudioSegment.from_file(audio_path)
    total_ms = len(audio)  # pydub reports length in milliseconds
    if total_ms == 0:
        return []

    # The previous bound, int(duration - segment_duration), was exclusive: it dropped the
    # final window whenever that value was a multiple of the stride, returned nothing for
    # clips shorter than one window, and made the padding branch unreachable.
    last_start_ms = max(total_ms - segment_ms, 0)
    segments = []
    for start_ms in range(0, last_start_ms + 1, step_ms):
        segment = audio[start_ms:start_ms + segment_ms]
        shortfall_ms = segment_ms - len(segment)
        if shortfall_ms > 0:
            # Pad with silence at the recording's own frame rate
            segment = segment + AudioSegment.silent(duration=shortfall_ms, frame_rate=audio.frame_rate)
        segments.append(segment)
    return segments



In [29]:
import numpy as np
import librosa
from PIL import Image
import torch
import torchvision.transforms as transforms

# Load torchvision's VGG-16 (hub tag v0.10.0) with its pretrained weights.
# NOTE(review): this was previously described as an "emotional" VGG-16, but nothing in
# this call loads emotion-specific weights - confirm whether a fine-tuned checkpoint was intended.
vgg_model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg16', pretrained=True)
# Switch the network to inference mode
vgg_model.eval()

def extract_deep_features(segments):
    """
    Turn audio segments into 512-dimensional VGG-16 descriptors.

    Each segment is downmixed to mono and resampled to 16 kHz so the sample
    array really matches the ``sr=16000`` given to librosa, rendered as a
    log-Mel spectrogram image, and passed through the convolutional part of
    the notebook-level ``vgg_model``. The final feature map is averaged over
    its spatial positions, giving one value per channel.

    :param segments: Iterable of pydub AudioSegment objects
    :return: List with one numpy array of shape (1, 512) per segment
    """
    target_sr = 16000
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    features = []
    for segment in segments:
        # get_array_of_samples() interleaves channels and keeps the file's own frame
        # rate, so normalise both before treating the data as 16 kHz mono.
        mono = segment.set_channels(1).set_frame_rate(target_sr)
        samples = np.array(mono.get_array_of_samples(), dtype=np.float32)

        # Compute log-Mel spectrogram
        mel_spec = librosa.feature.melspectrogram(y=samples, sr=target_sr, n_mels=128, fmax=8000, hop_length=512, win_length=2048)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Map the [-80, 0] dB range onto [0, 255] for image conversion
        log_mel_spec = np.clip(log_mel_spec, a_min=-80, a_max=0)
        log_mel_spec = (log_mel_spec + 80) * (255.0 / 80.0)

        # Convert to a 3-channel image, resize and add the batch dimension
        image = Image.fromarray(log_mel_spec.astype(np.uint8)).convert("RGB")
        image = transform(image).unsqueeze(0)

        with torch.no_grad():
            # Calling vgg_model(image) would return the 1000 ImageNet class scores;
            # the 512 "Deep_*" columns need the 512-channel convolutional output,
            # pooled over height and width.
            feature_map = vgg_model.features(image)
            deep_feature = feature_map.mean(dim=(2, 3)).numpy()
        features.append(deep_feature)

    return features


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


In [30]:
# pip install opensmile

In [31]:
import opensmile
from opensmile import Smile, FeatureSet, FeatureLevel

# Shared openSMILE extractor used by extract_egemaps:
# eGeMAPSv02 feature set computed at the Functionals level.
smile = Smile(
    feature_set=FeatureSet.eGeMAPSv02,
    feature_level=FeatureLevel.Functionals
)

def extract_egemaps(segment):
    """
    Extract eGeMAPS features using openSMILE.

    The segment is downmixed to mono, scaled to floats in [-1, 1] and passed to
    the notebook-level ``smile`` extractor together with its real frame rate.

    :param segment: pydub AudioSegment
    :return: Hand-crafted features as a numpy array
    """
    # Interleaved stereo samples would otherwise be read as one double-length channel.
    mono = segment.set_channels(1)
    samples = np.array(mono.get_array_of_samples(), dtype=np.float32)
    # pydub yields raw integer PCM values; floating-point audio is conventionally in [-1, 1].
    samples /= float(mono.max_possible_amplitude)
    # Report the segment's true rate instead of a hard-coded 16000.
    features = smile.process_signal(samples, sampling_rate=mono.frame_rate).to_numpy()
    return features


In [32]:
def save_features(audio_id, segment_id, deep_feat, hc_feat, output_path=None):
    """
    Append one segment's features as a single header-less row of a CSV file.

    :param audio_id: ID of the audio file
    :param segment_id: ID of the audio segment
    :param deep_feat: Deep features for the segment; averaged over axis 0, first 512 values kept
    :param hc_feat: Hand-crafted features for the segment; averaged over axis 0, first 25 values kept
    :param output_path: CSV file to append to; defaults to the notebook-level ``output_file``
    """
    target = output_file if output_path is None else output_path

    # Average once per call; the means used to be recomputed for each of the 537 columns.
    deep_mean = deep_feat.mean(axis=0)
    hc_mean = hc_feat.mean(axis=0)

    combined_features = {
        'Audio_ID': audio_id,
        'Segment_ID': segment_id,
    }
    combined_features.update({f'Deep_{i}': deep_mean[i] for i in range(512)})
    combined_features.update({f'Hc_{i}': hc_mean[i] for i in range(25)})

    # Append to CSV (the header is written when the file is created)
    feature_row = pd.DataFrame([combined_features])
    feature_row.to_csv(target, mode='a', header=False, index=False)


In [33]:
def process_audio_files(audio_files):
    """
    Run segmentation, feature extraction and CSV export for every audio file.

    :param audio_files: Iterable of audio file paths
    """
    for wav_path in audio_files:
        # The file name without its extension serves as the audio ID
        file_name = os.path.basename(wav_path)
        audio_id, _ext = os.path.splitext(file_name)
        print(f"Processing {audio_id}...")

        for segment_id, chunk in enumerate(segment_audio(wav_path)):
            # One deep and one hand-crafted feature set per segment, written immediately
            deep_feat = extract_deep_features([chunk])[0]
            hc_feat = extract_egemaps(chunk)
            save_features(audio_id, segment_id, deep_feat, hc_feat)

    print(f"All features saved to {output_file}")


In [34]:
# Step 1: Re-list the audio files under base_dir. base_dir is not assigned in this cell;
# it keeps the value set earlier in the notebook (the old commented-out "train-2" override was stale).
audio_files = list_audio_files(base_dir)
print(f"Found {len(audio_files)} audio files.")

# Step 2: Segment every file, extract its features and append them to output_file
process_audio_files(audio_files)  # defined in the cell above


Found 960 audio files.
Processing BS0wgLXqFgc.003...
Processing bcRPLKygrNk.000...
Processing FoWuHvikyqU.001...
Processing F4UeAogUMMs.004...
Processing lLObIVKYDUI.004...
Processing ai-RZb0ZpKI.002...
Processing IMCEXoAkZv4.005...
Processing JFKZEGDhcRs.000...
Processing 6wIEiqmuHOM.000...
Processing PooFvZH6fK0.001...
Processing 5Eez38v8TuU.005...
Processing SyaRn3MHEIE.003...
Processing TNjEJGdqmH0.005...
Processing 3taD1fEPfC8.004...
Processing OhHg1lyxe4I.000...
Processing Gk94xrcmFts.000...
Processing s6gN_358tk4.001...
Processing U_QQ8EuuWPg.003...
Processing Ea0UKBPCm3Q.001...
Processing 8n6G3V3-Tk8.005...
Processing rGNeR3p1jKU.000...
Processing KJ643kfjqLY.001...
Processing 9Crw2RtrBcY.001...
Processing b6eoqD1J_qU.003...
Processing fIiXuRlTPa0.000...
Processing si_gZCrLa4A.002...
Processing ev42vngXERU.001...
Processing cgGCyBMdGrA.001...
Processing 8PXQ_5tUv74.005...
Processing HTF8k56_Oxo.004...
Processing iPblZBQpafs.001...
Processing Gk94xrcmFts.002...
Processing p6UMx8

In [35]:
print("done")

done


## DEEP AND HAND-CRAFTED (HC) FEATURE EXTRACTION: TRAIN-4

In [36]:
import os

def list_audio_files(base_dir, extension=".wav"):
    """
    Recursively collect the audio files stored under a directory.

    :param base_dir: Root directory to walk
    :param extension: Filename suffix to keep (default is ".wav")
    :return: List of matching file paths, each joined with its containing folder
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(base_dir)
        for name in names
        if name.endswith(extension)
    ]

# Locate the .wav files of this training split. base_dir is a notebook-level variable
# and is read again by the later cell that runs the full pipeline.
base_dir = "/kaggle/working/fi_audio_train_dataset/train-4"  # Replace with the path to your dataset
audio_files = list_audio_files(base_dir)
print(f"Found {len(audio_files)} audio files.")


Found 960 audio files.


In [37]:
import os
import pandas as pd

# Folder (relative to the working directory) that receives the per-split feature CSVs
output_dir = "audio_feature_extraction"

# Create output directory if not exists
os.makedirs(output_dir, exist_ok=True)

# Output file for this split and its column layout:
# two ID columns, then 512 deep-feature columns and 25 hand-crafted-feature columns
output_file = "audio_feature_extraction/train_4_af.csv"
columns = ['Audio_ID', 'Segment_ID'] + [f'Deep_{i}' for i in range(512)] + [f'Hc_{i}' for i in range(25)]

# Write the header row only when the file is new. save_features appends rows with
# header=False, so a file left over from an earlier run keeps its contents and
# simply receives more rows.
if not os.path.exists(output_file):
    pd.DataFrame(columns=columns).to_csv(output_file, index=False)


In [38]:
from pydub import AudioSegment

def segment_audio(audio_path, segment_duration=2, skip_duration=3):
    """
    Cut an audio file into fixed-length windows taken at a regular stride.

    Windows start at 0, skip_duration, 2 * skip_duration, ... for as long as a
    complete window still fits inside the recording. A recording shorter than
    one window yields a single window padded with trailing silence.

    :param audio_path: Path of the audio file to load with pydub
    :param segment_duration: Window length in seconds (default 2)
    :param skip_duration: Distance between consecutive window starts in seconds (default 3)
    :return: List of pydub AudioSegment windows, each segment_duration long
    :raises ValueError: If either duration is not positive
    """
    segment_ms = int(segment_duration * 1000)
    step_ms = int(skip_duration * 1000)
    if segment_ms <= 0 or step_ms <= 0:
        raise ValueError("segment_duration and skip_duration must be positive")

    audio = AudioSegment.from_file(audio_path)
    total_ms = len(audio)  # pydub reports length in milliseconds
    if total_ms == 0:
        return []

    # The previous bound, int(duration - segment_duration), was exclusive: it dropped the
    # final window whenever that value was a multiple of the stride, returned nothing for
    # clips shorter than one window, and made the padding branch unreachable.
    last_start_ms = max(total_ms - segment_ms, 0)
    segments = []
    for start_ms in range(0, last_start_ms + 1, step_ms):
        segment = audio[start_ms:start_ms + segment_ms]
        shortfall_ms = segment_ms - len(segment)
        if shortfall_ms > 0:
            # Pad with silence at the recording's own frame rate
            segment = segment + AudioSegment.silent(duration=shortfall_ms, frame_rate=audio.frame_rate)
        segments.append(segment)
    return segments



In [39]:
import numpy as np
import librosa
from PIL import Image
import torch
import torchvision.transforms as transforms

# Load torchvision's VGG-16 (hub tag v0.10.0) with its pretrained weights.
# NOTE(review): this was previously described as an "emotional" VGG-16, but nothing in
# this call loads emotion-specific weights - confirm whether a fine-tuned checkpoint was intended.
vgg_model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg16', pretrained=True)
# Switch the network to inference mode
vgg_model.eval()

def extract_deep_features(segments):
    """
    Turn audio segments into 512-dimensional VGG-16 descriptors.

    Each segment is downmixed to mono and resampled to 16 kHz so the sample
    array really matches the ``sr=16000`` given to librosa, rendered as a
    log-Mel spectrogram image, and passed through the convolutional part of
    the notebook-level ``vgg_model``. The final feature map is averaged over
    its spatial positions, giving one value per channel.

    :param segments: Iterable of pydub AudioSegment objects
    :return: List with one numpy array of shape (1, 512) per segment
    """
    target_sr = 16000
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    features = []
    for segment in segments:
        # get_array_of_samples() interleaves channels and keeps the file's own frame
        # rate, so normalise both before treating the data as 16 kHz mono.
        mono = segment.set_channels(1).set_frame_rate(target_sr)
        samples = np.array(mono.get_array_of_samples(), dtype=np.float32)

        # Compute log-Mel spectrogram
        mel_spec = librosa.feature.melspectrogram(y=samples, sr=target_sr, n_mels=128, fmax=8000, hop_length=512, win_length=2048)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Map the [-80, 0] dB range onto [0, 255] for image conversion
        log_mel_spec = np.clip(log_mel_spec, a_min=-80, a_max=0)
        log_mel_spec = (log_mel_spec + 80) * (255.0 / 80.0)

        # Convert to a 3-channel image, resize and add the batch dimension
        image = Image.fromarray(log_mel_spec.astype(np.uint8)).convert("RGB")
        image = transform(image).unsqueeze(0)

        with torch.no_grad():
            # Calling vgg_model(image) would return the 1000 ImageNet class scores;
            # the 512 "Deep_*" columns need the 512-channel convolutional output,
            # pooled over height and width.
            feature_map = vgg_model.features(image)
            deep_feature = feature_map.mean(dim=(2, 3)).numpy()
        features.append(deep_feature)

    return features


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


In [40]:
# pip install opensmile

In [41]:
import opensmile
from opensmile import Smile, FeatureSet, FeatureLevel

# Shared openSMILE extractor used by extract_egemaps:
# eGeMAPSv02 feature set computed at the Functionals level.
smile = Smile(
    feature_set=FeatureSet.eGeMAPSv02,
    feature_level=FeatureLevel.Functionals
)

def extract_egemaps(segment):
    """
    Extract eGeMAPS features using openSMILE.

    The segment is downmixed to mono, scaled to floats in [-1, 1] and passed to
    the notebook-level ``smile`` extractor together with its real frame rate.

    :param segment: pydub AudioSegment
    :return: Hand-crafted features as a numpy array
    """
    # Interleaved stereo samples would otherwise be read as one double-length channel.
    mono = segment.set_channels(1)
    samples = np.array(mono.get_array_of_samples(), dtype=np.float32)
    # pydub yields raw integer PCM values; floating-point audio is conventionally in [-1, 1].
    samples /= float(mono.max_possible_amplitude)
    # Report the segment's true rate instead of a hard-coded 16000.
    features = smile.process_signal(samples, sampling_rate=mono.frame_rate).to_numpy()
    return features


In [42]:
def save_features(audio_id, segment_id, deep_feat, hc_feat, output_path=None):
    """
    Append one segment's features as a single header-less row of a CSV file.

    :param audio_id: ID of the audio file
    :param segment_id: ID of the audio segment
    :param deep_feat: Deep features for the segment; averaged over axis 0, first 512 values kept
    :param hc_feat: Hand-crafted features for the segment; averaged over axis 0, first 25 values kept
    :param output_path: CSV file to append to; defaults to the notebook-level ``output_file``
    """
    target = output_file if output_path is None else output_path

    # Average once per call; the means used to be recomputed for each of the 537 columns.
    deep_mean = deep_feat.mean(axis=0)
    hc_mean = hc_feat.mean(axis=0)

    combined_features = {
        'Audio_ID': audio_id,
        'Segment_ID': segment_id,
    }
    combined_features.update({f'Deep_{i}': deep_mean[i] for i in range(512)})
    combined_features.update({f'Hc_{i}': hc_mean[i] for i in range(25)})

    # Append to CSV (the header is written when the file is created)
    feature_row = pd.DataFrame([combined_features])
    feature_row.to_csv(target, mode='a', header=False, index=False)


In [43]:
def process_audio_files(audio_files):
    """
    Run segmentation, feature extraction and CSV export for every audio file.

    :param audio_files: Iterable of audio file paths
    """
    for wav_path in audio_files:
        # The file name without its extension serves as the audio ID
        file_name = os.path.basename(wav_path)
        audio_id, _ext = os.path.splitext(file_name)
        print(f"Processing {audio_id}...")

        for segment_id, chunk in enumerate(segment_audio(wav_path)):
            # One deep and one hand-crafted feature set per segment, written immediately
            deep_feat = extract_deep_features([chunk])[0]
            hc_feat = extract_egemaps(chunk)
            save_features(audio_id, segment_id, deep_feat, hc_feat)

    print(f"All features saved to {output_file}")


In [44]:
# Step 1: Re-list the audio files under base_dir. base_dir is not assigned in this cell;
# it keeps the value set earlier in the notebook (the old commented-out "train-2" override was stale).
audio_files = list_audio_files(base_dir)
print(f"Found {len(audio_files)} audio files.")

# Step 2: Segment every file, extract its features and append them to output_file
process_audio_files(audio_files)  # defined in the cell above


Found 960 audio files.
Processing Zi7QHI_5ipU.004...
Processing nex61B854Ms.003...
Processing G-25EWOIGNs.000...
Processing MrYEK0nvnAo.002...
Processing shAhNNV-zHA.005...
Processing k2buv6xZ4_o.003...
Processing Rops7WYMCCY.001...
Processing lkdQjIunxSE.000...
Processing Oe7ItP7gS4w.004...
Processing OTXGM3Guxy4.000...
Processing 8mZZlnbmOYE.001...
Processing atdhaGEh_0c.000...
Processing 4zkY1SG-7xc.003...
Processing PKyqjYem0dY.004...
Processing 9UWCHrG5mys.001...
Processing 9CPKW0sqR3E.005...
Processing HPEemWelY1E.000...
Processing AP0aklGHino.003...
Processing FkzVX7SQ9wE.002...
Processing kL-CeaXG9jM.002...
Processing LuLUAwn0aWU.002...
Processing obr7ktzX8w8.004...
Processing P6g1mZ3f76U.002...
Processing -Wqk9eex6bQ.002...
Processing vhugKRUnd-c.004...
Processing _7hGh6VU4IU.005...
Processing jZhlaIUFMLc.000...
Processing 5ghk5950BhU.005...
Processing KnbEca4Ibwk.000...
Processing 39BJkEXJpgc.003...
Processing C2MEPH7x9m0.001...
Processing cY3nHc5fDYE.003...
Processing W0LRjS

In [45]:
print("done")

done


## DEEP AND HAND-CRAFTED (HC) FEATURE EXTRACTION: TRAIN-5

In [11]:
import os

def list_audio_files(base_dir, extension=".wav"):
    """
    Recursively collect the audio files stored under a directory.

    :param base_dir: Root directory to walk
    :param extension: Filename suffix to keep (default is ".wav")
    :return: List of matching file paths, each joined with its containing folder
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(base_dir)
        for name in names
        if name.endswith(extension)
    ]

# Locate the .wav files of this training split and report how many were found.
base_dir = "/kaggle/working/fi_audio_train_dataset/train-5"  # Replace with the path to your dataset
audio_files = list_audio_files(base_dir)
print(f"Found {len(audio_files)} audio files.")


Found 1040 audio files.


In [12]:
import os
import pandas as pd

# Folder (relative to the working directory) that receives the per-split feature CSVs
output_dir = "audio_feature_extraction"

# Create output directory if not exists
os.makedirs(output_dir, exist_ok=True)

# Output file for this split and its column layout:
# two ID columns, then 512 deep-feature columns and 25 hand-crafted-feature columns
output_file = "audio_feature_extraction/train_5_af.csv"
columns = ['Audio_ID', 'Segment_ID'] + [f'Deep_{i}' for i in range(512)] + [f'Hc_{i}' for i in range(25)]

# Write the header row only when the file does not exist yet; an existing file
# (e.g. from an earlier run) is left untouched by this cell.
if not os.path.exists(output_file):
    pd.DataFrame(columns=columns).to_csv(output_file, index=False)


In [13]:
from pydub import AudioSegment

def segment_audio(audio_path, segment_duration=2, skip_duration=3):
    """
    Cut an audio file into fixed-length windows taken at a regular stride.

    Windows start at 0, skip_duration, 2 * skip_duration, ... for as long as a
    complete window still fits inside the recording. A recording shorter than
    one window yields a single window padded with trailing silence.

    :param audio_path: Path of the audio file to load with pydub
    :param segment_duration: Window length in seconds (default 2)
    :param skip_duration: Distance between consecutive window starts in seconds (default 3)
    :return: List of pydub AudioSegment windows, each segment_duration long
    :raises ValueError: If either duration is not positive
    """
    segment_ms = int(segment_duration * 1000)
    step_ms = int(skip_duration * 1000)
    if segment_ms <= 0 or step_ms <= 0:
        raise ValueError("segment_duration and skip_duration must be positive")

    audio = AudioSegment.from_file(audio_path)
    total_ms = len(audio)  # pydub reports length in milliseconds
    if total_ms == 0:
        return []

    # The previous bound, int(duration - segment_duration), was exclusive: it dropped the
    # final window whenever that value was a multiple of the stride, returned nothing for
    # clips shorter than one window, and made the padding branch unreachable.
    last_start_ms = max(total_ms - segment_ms, 0)
    segments = []
    for start_ms in range(0, last_start_ms + 1, step_ms):
        segment = audio[start_ms:start_ms + segment_ms]
        shortfall_ms = segment_ms - len(segment)
        if shortfall_ms > 0:
            # Pad with silence at the recording's own frame rate
            segment = segment + AudioSegment.silent(duration=shortfall_ms, frame_rate=audio.frame_rate)
        segments.append(segment)
    return segments



In [14]:
import numpy as np
import librosa
from PIL import Image
import torch
import torchvision.transforms as transforms

# Load torchvision's VGG-16 with its stock pretrained weights through torch.hub.
# NOTE(review): no emotion-specific checkpoint is loaded here even though the
# pipeline refers to an "emotional" VGG-16 — confirm whether fine-tuned weights
# were meant to be loaded on top of this model.
vgg_model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg16', pretrained=True)
vgg_model.eval()  # switch the model to evaluation mode for inference

def extract_deep_features(segments):
    """
    Embed audio segments as 512-dimensional VGG-16 features of their log-Mel spectrograms.

    Each segment is converted to 16 kHz mono, turned into a log-Mel spectrogram
    image, and passed through the convolutional part of the module-level
    `vgg_model`. The resulting feature maps are averaged over their spatial
    positions, giving one value per convolutional channel (512 for VGG-16).
    Running the full model instead would return its 1000 classification
    scores, of which the 512 `Deep_*` columns would only keep an arbitrary
    prefix.

    :param segments: Iterable of pydub AudioSegment objects
    :return: List with one numpy array of shape (1, 512) per segment
    """
    features = []
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    for segment in segments:
        # The spectrogram below is computed for sr=16000 on a 1-D signal, so
        # bring the segment to exactly that format first. Multi-channel audio
        # would otherwise arrive as interleaved samples, and any other frame
        # rate would be misread as 16 kHz. No-op for audio already 16 kHz mono.
        mono_16k = segment.set_channels(1).set_frame_rate(16000)

        # Compute log-Mel spectrogram
        samples = np.array(mono_16k.get_array_of_samples(), dtype=np.float32)
        mel_spec = librosa.feature.melspectrogram(y=samples, sr=16000, n_mels=128, fmax=8000, hop_length=512, win_length=2048)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize the spectrogram to range 0-255 for image conversion
        log_mel_spec = np.clip(log_mel_spec, a_min=-80, a_max=0)  # Log scale
        log_mel_spec = (log_mel_spec + 80) * (255.0 / 80.0)  # Normalize to [0, 255]

        # Convert to a 3-channel image, resize and add the batch dimension
        image = Image.fromarray(log_mel_spec.astype(np.uint8)).convert("RGB")
        image = transform(image).unsqueeze(0)

        # Extract features using the VGG-16 convolutional stack only
        with torch.no_grad():
            conv_maps = vgg_model.features(image)  # (1, channels, H, W)
            # Global average pooling over the spatial axes -> (1, channels)
            deep_feature = conv_maps.mean(dim=(2, 3)).numpy()
        features.append(deep_feature)

    return features


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


In [15]:
pip install opensmile

Collecting opensmile
  Downloading opensmile-2.5.0-py3-none-manylinux_2_17_x86_64.whl.metadata (15 kB)
Collecting audobject>=0.6.1 (from opensmile)
  Downloading audobject-0.7.11-py3-none-any.whl.metadata (2.6 kB)
Collecting audinterface>=0.7.0 (from opensmile)
  Downloading audinterface-1.2.2-py3-none-any.whl.metadata (4.1 kB)
Collecting audeer>=1.18.0 (from audinterface>=0.7.0->opensmile)
  Downloading audeer-2.2.0-py3-none-any.whl.metadata (4.1 kB)
Collecting audformat<2.0.0,>=1.0.1 (from audinterface>=0.7.0->opensmile)
  Downloading audformat-1.3.1-py3-none-any.whl.metadata (4.6 kB)
Collecting audiofile>=1.3.0 (from audinterface>=0.7.0->opensmile)
  Downloading audiofile-1.5.0-py3-none-any.whl.metadata (4.9 kB)
Collecting audmath>=1.4.1 (from audinterface>=0.7.0->opensmile)
  Downloading audmath-1.4.1-py3-none-any.whl.metadata (3.6 kB)
Collecting audresample<2.0.0,>=1.1.0 (from audinterface>=0.7.0->opensmile)
  Downloading audresample-1.3.3-py3-none-manylinux_2_17_x86_64.whl.metada

In [16]:
import opensmile
from opensmile import Smile, FeatureSet, FeatureLevel

# eGeMAPSv02 extractor at the low-level-descriptor level: one row per analysis
# frame with 25 descriptors. This matches the 25 `Hc_*` columns of the output
# CSV and the mean over axis 0 taken in `save_features`. The functionals level
# returns a single row of 88 statistics instead, of which only the first 25
# would end up in the CSV.
smile = Smile(
    feature_set=FeatureSet.eGeMAPSv02,
    feature_level=FeatureLevel.LowLevelDescriptors
)

def extract_egemaps(segment):
    """
    Extract eGeMAPS features using openSMILE.

    The segment is converted to 16 kHz mono and its integer PCM samples are
    scaled to floats in [-1, 1) before being handed to the module-level `smile`
    extractor, so the signal matches the `sampling_rate` declared in the call
    and the amplitude range of normalized floating-point audio.

    :param segment: pydub AudioSegment
    :return: Hand-crafted features as a 2-D numpy array (rows x features), as
             produced by the `smile` extractor
    """
    # Multi-channel audio would otherwise arrive as interleaved samples, and
    # any other frame rate would be misread as 16 kHz.
    mono_16k = segment.set_channels(1).set_frame_rate(16000)

    # Full-scale value of the signed PCM format, e.g. 32768 for 16-bit samples
    full_scale = float(1 << (8 * mono_16k.sample_width - 1))
    samples = np.array(mono_16k.get_array_of_samples(), dtype=np.float32) / full_scale

    features = smile.process_signal(samples, sampling_rate=16000).to_numpy()
    return features


In [17]:
def save_features(audio_id, segment_id, deep_feat, hc_feat, csv_path=None):
    """
    Append one row with the features of a single segment to the feature CSV.

    Both feature arrays are averaged over their first axis; the first 512
    entries of the deep average and the first 25 entries of the hand-crafted
    average are written, after the two identifier columns.

    :param audio_id: ID of the audio file
    :param segment_id: ID of the audio segment
    :param deep_feat: Deep features for the segment (2-D array, at least 512 columns)
    :param hc_feat: Hand-crafted features for the segment (2-D array, at least 25 columns)
    :param csv_path: CSV file to append to; defaults to the module-level `output_file`
    """
    # Aggregate once per call (the means do not depend on the column index)
    deep_mean = deep_feat.mean(axis=0)
    hc_mean = hc_feat.mean(axis=0)

    combined_features = {
        'Audio_ID': audio_id,
        'Segment_ID': segment_id,
    }
    combined_features.update({f'Deep_{i}': deep_mean[i] for i in range(512)})
    combined_features.update({f'Hc_{i}': hc_mean[i] for i in range(25)})

    # Append to CSV; the header row is written when the file is initialised
    target_path = output_file if csv_path is None else csv_path
    feature_row = pd.DataFrame([combined_features])
    feature_row.to_csv(target_path, mode='a', header=False, index=False)


In [18]:
def process_audio_files(audio_files):
    """
    Run segmentation, feature extraction and CSV export for every audio file.

    :param audio_files: Iterable of audio file paths
    """
    for audio_path in audio_files:
        # File name without directory and extension identifies the recording
        file_name = os.path.basename(audio_path)
        audio_id = os.path.splitext(file_name)[0]
        print(f"Processing {audio_id}...")

        segments = segment_audio(audio_path)
        for segment_id in range(len(segments)):
            segment = segments[segment_id]
            # Deep features first, then hand-crafted ones, written as one CSV row
            save_features(
                audio_id,
                segment_id,
                extract_deep_features([segment])[0],
                extract_egemaps(segment),
            )

    print(f"All features saved to {output_file}")


In [19]:
# Step 1: List all audio files in the dataset directory
# (`base_dir` is the value assigned in the listing cell at the top of this section)
audio_files = list_audio_files(base_dir)
print(f"Found {len(audio_files)} audio files.")

# Step 2: Process all audio files
process_audio_files(audio_files)  # Segments each file and appends its feature rows to `output_file`


Found 1040 audio files.
Processing NvwD448H-40.001...
Processing f39E4ct09Cc.000...
Processing sBHv1jYmZQE.003...
Processing xA3GCTL5o-k.000...
Processing GTmHSF6vNWc.002...
Processing c2LeNguVNrI.002...
Processing 4kIHxR6s1L4.001...
Processing XPV4pavXh94.001...
Processing RvoxbgEP49Q.003...
Processing MuYYY3XaJ7Q.005...
Processing nQxTTYWy7pA.001...
Processing MkVsl0hdphE.000...
Processing dmOKZLeFKCM.002...
Processing WUNXs4r_xCU.002...
Processing 8py6SksNsZQ.002...
Processing xmD4FKMq2rk.000...
Processing sPMNhG1Sehc.005...
Processing HWuh0biZQ_c.001...
Processing RsYSOFRFszk.001...
Processing b39BQbVhOAg.004...
Processing s1DOqsQoN5s.002...
Processing ua5c716lu5s.005...
Processing cyma9Q6QF6M.001...
Processing 2TMl5EMrdVI.005...
Processing laPGbkC87us.004...
Processing wr4dP9MuHME.004...
Processing o7J2fWIJndQ.005...
Processing HhC2cGFFZeY.001...
Processing ztyBhnjtrz0.003...
Processing jrhvXPaXo-0.000...
Processing 5Wmlo8Z5yVA.003...
Processing 9a-3LrB7cDI.005...
Processing DyZlh

In [20]:
# Completion marker printed once this cell is reached
print("done")

done


## DEEP AND HC FEATURE EXTRACTION

In [21]:
import os

def list_audio_files(base_dir, extension=".wav"):
    """
    Recursively collect the audio files stored below a directory.

    :param base_dir: Root directory to walk
    :param extension: Suffix a file name must end with to be kept (default is ".wav")
    :return: List of paths (directory joined with file name), in os.walk order
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(base_dir)
        for name in names
        if name.endswith(extension)
    ]

# Collect the .wav files of the train-6 split. `base_dir` is set here and is
# read again by the processing cell at the end of this section.
base_dir = "/kaggle/working/fi_audio_train_dataset/train-6"  # Replace with the path to your dataset
audio_files = list_audio_files(base_dir)
print(f"Found {len(audio_files)} audio files.")


Found 1120 audio files.


In [22]:
import os
import pandas as pd

# Where the extracted features of this split are written
output_dir = "audio_feature_extraction"
output_file = "audio_feature_extraction/train_6_af.csv"

# CSV layout: two identifier columns, then 512 deep and 25 hand-crafted feature columns
columns = ['Audio_ID', 'Segment_ID']
columns.extend(f'Deep_{i}' for i in range(512))
columns.extend(f'Hc_{i}' for i in range(25))

# Make sure the output directory is there
os.makedirs(output_dir, exist_ok=True)

# Write a header-only CSV unless the file already exists (rows are appended to it later)
if not os.path.exists(output_file):
    header_only = pd.DataFrame(columns=columns)
    header_only.to_csv(output_file, index=False)
    del header_only


In [23]:
from pydub import AudioSegment

def segment_audio(audio_path, segment_duration=2, skip_duration=3):
    """
    Cut an audio file into fixed-length segments taken at a regular stride.

    Every window that fits completely inside the recording is returned,
    including one that ends exactly at the end of the file. A recording that
    is shorter than a single segment yields one segment padded with silence;
    an empty recording yields no segments.

    :param audio_path: Path of the audio file to load with pydub
    :param segment_duration: Length of every returned segment, in seconds
    :param skip_duration: Distance between consecutive segment starts, in seconds
    :return: List of pydub AudioSegment objects
    """
    audio = AudioSegment.from_file(audio_path)
    total_ms = len(audio)  # pydub reports length in milliseconds
    if total_ms == 0:
        return []

    segment_ms = int(segment_duration * 1000)
    skip_ms = int(skip_duration * 1000)

    # Latest start that still leaves room for a full segment. Clamped to 0 so
    # that a too-short recording still produces one (padded) segment.
    last_start_ms = max(total_ms - segment_ms, 0)

    segments = []
    # `+ 1` makes the stop inclusive: a window ending exactly at the end of the
    # recording must not be dropped.
    for start_ms in range(0, last_start_ms + 1, skip_ms):
        segment = audio[start_ms:start_ms + segment_ms]
        missing_ms = segment_ms - len(segment)
        if missing_ms > 0:
            # Pad with silence at the recording's own frame rate so appending
            # it does not resample the segment.
            segment = segment + AudioSegment.silent(duration=missing_ms, frame_rate=audio.frame_rate)
        segments.append(segment)
    return segments



In [24]:
import numpy as np
import librosa
from PIL import Image
import torch
import torchvision.transforms as transforms

# Load torchvision's VGG-16 with its stock pretrained weights through torch.hub.
# NOTE(review): no emotion-specific checkpoint is loaded here even though the
# pipeline refers to an "emotional" VGG-16 — confirm whether fine-tuned weights
# were meant to be loaded on top of this model.
vgg_model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg16', pretrained=True)
vgg_model.eval()  # switch the model to evaluation mode for inference

def extract_deep_features(segments):
    """
    Embed audio segments as 512-dimensional VGG-16 features of their log-Mel spectrograms.

    Each segment is converted to 16 kHz mono, turned into a log-Mel spectrogram
    image, and passed through the convolutional part of the module-level
    `vgg_model`. The resulting feature maps are averaged over their spatial
    positions, giving one value per convolutional channel (512 for VGG-16).
    Running the full model instead would return its 1000 classification
    scores, of which the 512 `Deep_*` columns would only keep an arbitrary
    prefix.

    :param segments: Iterable of pydub AudioSegment objects
    :return: List with one numpy array of shape (1, 512) per segment
    """
    features = []
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    for segment in segments:
        # The spectrogram below is computed for sr=16000 on a 1-D signal, so
        # bring the segment to exactly that format first. Multi-channel audio
        # would otherwise arrive as interleaved samples, and any other frame
        # rate would be misread as 16 kHz. No-op for audio already 16 kHz mono.
        mono_16k = segment.set_channels(1).set_frame_rate(16000)

        # Compute log-Mel spectrogram
        samples = np.array(mono_16k.get_array_of_samples(), dtype=np.float32)
        mel_spec = librosa.feature.melspectrogram(y=samples, sr=16000, n_mels=128, fmax=8000, hop_length=512, win_length=2048)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize the spectrogram to range 0-255 for image conversion
        log_mel_spec = np.clip(log_mel_spec, a_min=-80, a_max=0)  # Log scale
        log_mel_spec = (log_mel_spec + 80) * (255.0 / 80.0)  # Normalize to [0, 255]

        # Convert to a 3-channel image, resize and add the batch dimension
        image = Image.fromarray(log_mel_spec.astype(np.uint8)).convert("RGB")
        image = transform(image).unsqueeze(0)

        # Extract features using the VGG-16 convolutional stack only
        with torch.no_grad():
            conv_maps = vgg_model.features(image)  # (1, channels, H, W)
            # Global average pooling over the spatial axes -> (1, channels)
            deep_feature = conv_maps.mean(dim=(2, 3)).numpy()
        features.append(deep_feature)

    return features


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


In [25]:
# pip install opensmile

In [26]:
import opensmile
from opensmile import Smile, FeatureSet, FeatureLevel

# eGeMAPSv02 extractor at the low-level-descriptor level: one row per analysis
# frame with 25 descriptors. This matches the 25 `Hc_*` columns of the output
# CSV and the mean over axis 0 taken in `save_features`. The functionals level
# returns a single row of 88 statistics instead, of which only the first 25
# would end up in the CSV.
smile = Smile(
    feature_set=FeatureSet.eGeMAPSv02,
    feature_level=FeatureLevel.LowLevelDescriptors
)

def extract_egemaps(segment):
    """
    Extract eGeMAPS features using openSMILE.

    The segment is converted to 16 kHz mono and its integer PCM samples are
    scaled to floats in [-1, 1) before being handed to the module-level `smile`
    extractor, so the signal matches the `sampling_rate` declared in the call
    and the amplitude range of normalized floating-point audio.

    :param segment: pydub AudioSegment
    :return: Hand-crafted features as a 2-D numpy array (rows x features), as
             produced by the `smile` extractor
    """
    # Multi-channel audio would otherwise arrive as interleaved samples, and
    # any other frame rate would be misread as 16 kHz.
    mono_16k = segment.set_channels(1).set_frame_rate(16000)

    # Full-scale value of the signed PCM format, e.g. 32768 for 16-bit samples
    full_scale = float(1 << (8 * mono_16k.sample_width - 1))
    samples = np.array(mono_16k.get_array_of_samples(), dtype=np.float32) / full_scale

    features = smile.process_signal(samples, sampling_rate=16000).to_numpy()
    return features


In [27]:
def save_features(audio_id, segment_id, deep_feat, hc_feat, csv_path=None):
    """
    Append one row with the features of a single segment to the feature CSV.

    Both feature arrays are averaged over their first axis; the first 512
    entries of the deep average and the first 25 entries of the hand-crafted
    average are written, after the two identifier columns.

    :param audio_id: ID of the audio file
    :param segment_id: ID of the audio segment
    :param deep_feat: Deep features for the segment (2-D array, at least 512 columns)
    :param hc_feat: Hand-crafted features for the segment (2-D array, at least 25 columns)
    :param csv_path: CSV file to append to; defaults to the module-level `output_file`
    """
    # Aggregate once per call (the means do not depend on the column index)
    deep_mean = deep_feat.mean(axis=0)
    hc_mean = hc_feat.mean(axis=0)

    combined_features = {
        'Audio_ID': audio_id,
        'Segment_ID': segment_id,
    }
    combined_features.update({f'Deep_{i}': deep_mean[i] for i in range(512)})
    combined_features.update({f'Hc_{i}': hc_mean[i] for i in range(25)})

    # Append to CSV; the header row is written when the file is initialised
    target_path = output_file if csv_path is None else csv_path
    feature_row = pd.DataFrame([combined_features])
    feature_row.to_csv(target_path, mode='a', header=False, index=False)


In [28]:
def process_audio_files(audio_files):
    """
    Run segmentation, feature extraction and CSV export for every audio file.

    :param audio_files: Iterable of audio file paths
    """
    for audio_path in audio_files:
        # File name without directory and extension identifies the recording
        file_name = os.path.basename(audio_path)
        audio_id = os.path.splitext(file_name)[0]
        print(f"Processing {audio_id}...")

        segments = segment_audio(audio_path)
        for segment_id in range(len(segments)):
            segment = segments[segment_id]
            # Deep features first, then hand-crafted ones, written as one CSV row
            save_features(
                audio_id,
                segment_id,
                extract_deep_features([segment])[0],
                extract_egemaps(segment),
            )

    print(f"All features saved to {output_file}")


In [29]:
# Step 1: List all audio files in the dataset directory
# (`base_dir` is the value assigned in the listing cell at the top of this section)
audio_files = list_audio_files(base_dir)
print(f"Found {len(audio_files)} audio files.")

# Step 2: Process all audio files
process_audio_files(audio_files)  # Segments each file and appends its feature rows to `output_file`


Found 1120 audio files.
Processing rGNeR3p1jKU.001...
Processing ezMO8EHKAVw.000...
Processing 4yogPbHFQ9o.002...
Processing 38fE5B4ghw0.005...
Processing JciLbidX1Q8.003...
Processing o-fNF5QVMNo.004...
Processing jz9kgvYbBYA.005...
Processing Rwsss8y1bf0.002...
Processing NpiglwrZzsA.001...
Processing U-XrzfFzMkg.004...
Processing DqA3j7juNs4.000...
Processing lAJWoIcgXlU.001...
Processing 3VTL2kFt17M.002...
Processing hX2TVefa5bQ.002...
Processing IFXaH_E54iA.004...
Processing X9AEEGzWSU8.003...
Processing cP_dSXql7z8.004...
Processing om-9kFEKJIs.000...
Processing alucqviYJFE.004...
Processing T0drx76OLAc.000...
Processing Or2_bP4lWYI.001...
Processing g00oS-Kt248.001...
Processing bkzgccBFztM.001...
Processing FM5UaJS-bhg.002...
Processing YzcZY1mUZfc.002...
Processing f3ZbA4UqGKE.005...
Processing P7poMmhYnM0.001...
Processing bAxlTgobVHE.002...
Processing ana5C73n9bY.000...
Processing 2pNKs0i3YkY.002...
Processing teRgvZ63_2I.003...
Processing jolhoLhAurM.005...
Processing aqd4_

In [30]:
# Completion marker printed once this cell is reached
print("done")

done


In [31]:
import os
import zipfile

def compress_folder_to_zip(folder_path, output_zip_path):
    """
    Compress every file below a folder into a single ZIP archive.

    Files are stored under their path relative to `folder_path`. If the
    archive itself is located inside `folder_path`, it is left out so the
    half-written ZIP is never added to itself.

    :param folder_path: Directory whose files are archived (walked recursively)
    :param output_zip_path: Destination archive; ".zip" is appended if missing
    :raises ValueError: If `folder_path` is not an existing directory
    """
    # Check if the folder exists
    if not os.path.isdir(folder_path):
        raise ValueError(f"Folder path '{folder_path}' does not exist.")

    # Ensure the output file has the `.zip` extension
    if not output_zip_path.endswith(".zip"):
        output_zip_path += ".zip"

    archive_abspath = os.path.abspath(output_zip_path)

    # Create a ZIP file
    with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                # The archive is created before the walk starts, so it shows up
                # in the listing when it lives inside the folder; skip it.
                if os.path.abspath(file_path) == archive_abspath:
                    continue
                # Add file to ZIP with a relative path
                zipf.write(file_path, os.path.relpath(file_path, folder_path))

    print(f"Folder successfully compressed into: {output_zip_path}")

# Bundle the extracted feature files into one archive.
# NOTE(review): presumably this is the `audio_feature_extraction` output directory
# used by the extraction cells, resolved against /kaggle/working — confirm the
# notebook's working directory.
folder_path = "/kaggle/working/audio_feature_extraction"  # Replace with your folder path on Kaggle
output_zip_path = "/kaggle/working/audio_feature_extraction.zip"  # Output file will be saved in the working directory

compress_folder_to_zip(folder_path, output_zip_path)


Folder successfully compressed into: /kaggle/working/audio_feature_extraction.zip
