In [None]:
import os
import shutil
from pathlib import Path
from collections import defaultdict

def copy_emotion_files(source_folder, target_folder, emotions, num_files=100):
    """Copy up to ``num_files`` files per emotion into ``target_folder``.

    A file belongs to an emotion when the emotion string occurs in its file
    name (case-sensitive). Each file is copied at most once, for the first
    emotion in ``emotions`` that matches and still has quota left.

    Args:
        source_folder: Folder whose direct children are scanned.
        target_folder: Destination folder; created (with parents) if missing.
        emotions: Iterable of emotion substrings to look for in file names.
        num_files: Maximum number of files copied per emotion.

    Returns:
        None. Progress is reported with one ``print`` per copied file.
    """
    os.makedirs(target_folder, exist_ok=True)

    # Seed a counter for EVERY requested emotion up front. With a lazily
    # populated defaultdict the "all quotas reached" check only saw emotions
    # that had already been encountered, so copying stopped as soon as the
    # first emotion was full.
    emotion_counts = {emotion: 0 for emotion in emotions}
    if not emotion_counts:
        return

    # Sorted so the same files are selected on every run.
    for file in sorted(Path(source_folder).glob('*')):
        # shutil.copy cannot copy directories; skip anything that is not a file.
        if not file.is_file():
            continue

        for emotion in emotion_counts:
            if emotion in file.name and emotion_counts[emotion] < num_files:
                shutil.copy(str(file), target_folder)
                emotion_counts[emotion] += 1
                print(f'Copied: {file.name} to {target_folder}')

                # Stop once every requested emotion has reached its quota.
                if all(count >= num_files for count in emotion_counts.values()):
                    return
                # One file counts toward one emotion only.
                break

# Example usage: copy at most 100 files per emotion from the SUST folder into
# a smaller subset folder. Files are copied, so the source folder is untouched.
source_folder = './data/SUST'  # Replace with your source folder path
target_folder = './data/speech_subset'  # Replace with your target folder path
emotions = ['ANGRY', 'HAPPY', 'NEUTRAL', 'SAD', 'SURPRISE']  # Substrings looked up in each file name

copy_emotion_files(source_folder, target_folder, emotions, num_files=100)

In [19]:
import os
import random
import shutil
from pathlib import Path

def transfer_random_files(src_dir, dst_dir, num_files_per_emotion, move=False, emotions=None):
    """Copy or move a random sample of ``.wav`` files for several emotions.

    For each emotion, files in ``src_dir`` matching ``*{emotion}*.wav`` are
    collected and ``num_files_per_emotion`` of them are picked with
    ``random.sample``. If fewer files exist, a warning is printed and all of
    them are transferred.

    Args:
        src_dir: Folder searched (non-recursively) for ``.wav`` files.
        dst_dir: Destination folder; created (with parents) if missing.
        num_files_per_emotion: Number of files requested per emotion.
        move: If True the files are moved out of ``src_dir``; otherwise copied.
        emotions: Optional iterable of emotion substrings. Defaults to
            ``['ANGRY', 'HAPPY', 'SAD', 'SURPRISE']``.

    Returns:
        None. A summary line is printed at the end.
    """
    if emotions is None:
        # NOTE(review): NEUTRAL is not in the default list; presumably it is
        # handled separately. Pass ``emotions=`` explicitly to include it.
        emotions = ['ANGRY', 'HAPPY', 'SAD', 'SURPRISE']

    # Create the destination directory if it does not exist
    Path(dst_dir).mkdir(parents=True, exist_ok=True)

    transfer = shutil.move if move else shutil.copy
    transferred = {}

    for emotion in emotions:
        # Find all files for the current emotion
        emotion_files = list(Path(src_dir).glob(f'*{emotion}*.wav'))

        if len(emotion_files) < num_files_per_emotion:
            print(f"Warning: Not enough files for {emotion}. Found only {len(emotion_files)} files.")
            num_files_to_copy = len(emotion_files)
        else:
            num_files_to_copy = num_files_per_emotion

        # Randomly select the specified number of files
        selected_files = random.sample(emotion_files, num_files_to_copy)

        # Plain strings are passed because shutil.move did not accept
        # path-like sources on older Python versions.
        for file_path in selected_files:
            transfer(str(file_path), str(dst_dir))

        transferred[emotion] = num_files_to_copy

    action = "moved" if move else "copied"
    if all(count == num_files_per_emotion for count in transferred.values()):
        print(f"{action.capitalize()} {num_files_per_emotion} files for each emotion to {dst_dir}")
    else:
        # At least one emotion fell short: report what was really transferred
        # instead of claiming the requested count for every emotion.
        per_emotion = ", ".join(f"{name}: {count}" for name, count in transferred.items())
        print(f"{action.capitalize()} files to {dst_dir} ({per_emotion})")

def copy_or_move_emotion_files(emotion, num_files, src_dir, dst_dir, move=False):
    """Transfer a random sample of one emotion's ``.wav`` files.

    Files in ``src_dir`` matching ``*{emotion}*.wav`` are candidates. Up to
    ``num_files`` of them are drawn with ``random.sample`` and then copied to
    ``dst_dir`` (or moved there when ``move`` is true). When fewer candidates
    exist than requested, a warning is printed and all of them are used.
    The destination folder is created if needed, and a one-line summary is
    printed at the end.
    """
    Path(dst_dir).mkdir(parents=True, exist_ok=True)

    candidates = list(Path(src_dir).glob(f'*{emotion}*.wav'))
    available = len(candidates)

    # Start from the requested amount and cap it at what is actually there.
    num_files_to_process = num_files
    if available < num_files:
        print(f"Warning: Not enough files for {emotion}. Found only {available} files.")
        num_files_to_process = available

    transfer = shutil.move if move else shutil.copy
    for chosen in random.sample(candidates, num_files_to_process):
        transfer(chosen, dst_dir)

    verb = "Moved" if move else "Copied"
    print(f"{verb} {num_files_to_process} files for emotion '{emotion}' to {dst_dir}")

# Example usage: move 194 randomly chosen NEUTRAL files from the UIU folder
# into the combined training folder.
src_directory = './data/UIU'
num_files = 194
dst_directory = './data/SUBESCOxUIU/train'
emotion = 'NEUTRAL'

# The trailing positional True is ``move``: files are removed from
# src_directory, so re-running this cell samples from whatever is left there.
copy_or_move_emotion_files(emotion, num_files, src_directory, dst_directory, True)
# Alternative (disabled): transfer the same number of files for each emotion in
# transfer_random_files' built-in list instead of a single emotion.
# transfer_random_files(src_directory, dst_directory, num_files, move=True)



Moved 194 files for emotion 'NEUTRAL' to ./data/SUBESCOxUIU/train


In [None]:
import os
import random
import csv

def split_dataset(directory, train_csv, test_csv, train_ratio=0.8, seed=None):
    """Split the ``.wav`` files of a folder into train/test lists per emotion.

    Every ``.wav`` file in ``directory`` is assigned to the first emotion
    (ANGRY, HAPPY, NEUTRAL, SAD, SURPRISE) whose name occurs in the upper-cased
    file name; files matching none are ignored. Each emotion's files are
    shuffled and the first ``int(len(files) * train_ratio)`` go to the training
    list, the rest to the test list. Both lists are written as single-column,
    headerless CSV files containing bare file names.

    Args:
        directory: Folder scanned (non-recursively) for ``.wav`` files.
        train_csv: Output path for the training file names.
        test_csv: Output path for the test file names.
        train_ratio: Fraction of each emotion's files put in the training list.
        seed: Optional seed. When given, the split is reproducible; when None
            the module-level ``random`` state is used, as before.

    Returns:
        None.
    """
    emotions = ("ANGRY", "HAPPY", "NEUTRAL", "SAD", "SURPRISE")

    # Dictionary to store files by emotion
    emotion_files = {}

    # os.listdir order is arbitrary; sort so a seeded shuffle is reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".wav"):
            continue
        upper_name = filename.upper()
        for emotion in emotions:
            if emotion in upper_name:
                emotion_files.setdefault(emotion, []).append(filename)
                break

    rng = random.Random(seed) if seed is not None else random

    # Split each emotion's files into training and testing
    train_files = []
    test_files = []
    for files in emotion_files.values():
        rng.shuffle(files)
        split_point = int(len(files) * train_ratio)
        train_files.extend(files[:split_point])
        test_files.extend(files[split_point:])

    for csv_path, names in ((train_csv, train_files), (test_csv, test_files)):
        # The output folder (e.g. ``.../split``) may not exist yet.
        parent = os.path.dirname(csv_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(csv_path, 'w', newline='') as csvfile:
            csv.writer(csvfile).writerows([name] for name in names)

# Example usage: write an 80/20 per-emotion split of the labelled UIU data.
data_dir = './data/uiu-labelData'
# NOTE(review): this list is not passed to split_dataset, which defines its own
# emotion list internally; it only sets a notebook-global name.
emotions = ['ANGRY', 'HAPPY', 'NEUTRAL', 'SAD', 'SURPRISE']
train_csv = './data/uiu-labelData/split/train_files.csv'
test_csv = './data/uiu-labelData/split/test_files.csv'
split_dataset(data_dir, train_csv, test_csv)


In [None]:
import os
import csv

def retrieve_files_from_csv(directory, train_csv, test_csv):
    """Load train/test file names from CSV and turn them into paths.

    Args:
        directory: Folder that each file name is joined onto.
        train_csv: CSV whose first column holds the training file names.
        test_csv: CSV whose first column holds the test file names.

    Returns:
        Tuple ``(train_paths, test_paths)`` of lists of
        ``os.path.join(directory, name)`` strings, in CSV order.
    """
    def read_csv(csv_file):
        # newline='' is what the csv module expects for files it parses.
        with open(csv_file, 'r', newline='') as file:
            # csv.reader yields [] for blank lines; skip them instead of
            # failing with IndexError on row[0].
            return [row[0] for row in csv.reader(file) if row]

    train_files = read_csv(train_csv)
    test_files = read_csv(test_csv)

    train_paths = [os.path.join(directory, filename) for filename in train_files]
    test_paths = [os.path.join(directory, filename) for filename in test_files]

    return train_paths, test_paths

# Example usage: rebuild full paths from the split CSVs and list them.
# NOTE(review): file names are joined onto the ``split`` folder, which is also
# where the CSVs live; the earlier split cell scanned './data/uiu-labelData'
# for the .wav files -- confirm this is the folder that holds the audio.
directory = './data/uiu-labelData/split'
train_csv = './data/uiu-labelData/split/train_files.csv'
test_csv = './data/uiu-labelData/split/test_files.csv'
train_paths, test_paths = retrieve_files_from_csv(directory, train_csv, test_csv)

# Prints one line per file, so the output grows with the dataset size.
print("Training files:")
for path in train_paths:
    print(path)

print("\nTesting files:")
for path in test_paths:
    print(path)

In [None]:
import os
import shutil

def copy_files_except_list(csv_file, source_dir, target_dir, has_header=True):
    """Copy every ``.wav`` file that is NOT listed in a CSV file.

    The first column of ``csv_file`` is read; only the base name of each entry
    is kept, so entries may be bare names or paths. Regular files in
    ``source_dir`` ending in ``.wav`` whose name is not in that set are copied
    to ``target_dir`` with ``shutil.copy2``.

    Args:
        csv_file: CSV listing the files to leave out (first column).
        source_dir: Folder scanned (non-recursively) for ``.wav`` files.
        target_dir: Destination folder; created (with parents) if missing.
        has_header: When True (default, the previous behavior) the first CSV
            line is skipped. Pass False for headerless lists, otherwise the
            first file name is silently dropped from the exclusion set.

    Returns:
        None.
    """
    import csv  # local import: this cell only imports os and shutil

    # Parse with the csv module so extra columns or quoted fields do not end up
    # inside the file name.
    with open(csv_file, 'r', newline='') as f:
        rows = list(csv.reader(f))
    if has_header:
        rows = rows[1:]

    filenames_to_exclude = {
        os.path.basename(row[0].strip())
        for row in rows
        if row and row[0].strip()
    }

    # Ensure the target directory exists
    os.makedirs(target_dir, exist_ok=True)

    # Copy only .wav files from source_dir to target_dir except those in filenames_to_exclude
    for filename in os.listdir(source_dir):
        full_path = os.path.join(source_dir, filename)
        if os.path.isfile(full_path) and filename.endswith('.wav') and filename not in filenames_to_exclude:
            shutil.copy2(full_path, os.path.join(target_dir, filename))

# Usage: copy every SUBESCO_900 .wav file that is not listed in the training
# CSV into a separate test-data folder.
csv_file = './data/SUBESCOxUIU/csv/train_filenames.csv'
source_dir = './data/SUBESCO_900'
# Unlike the other paths in this cell, this one has no leading './'.
target_dir = 'data/test-data/SUBESCO_900'
copy_files_except_list(csv_file, source_dir, target_dir)


In [None]:
import os

def add_suffix_to_wav_files(directory, suffix):
    """Rename each ``.wav`` file in ``directory`` to ``<name>_<suffix><ext>``.

    Only regular files directly inside ``directory`` whose name ends with
    ``.wav`` are touched; the rename happens in place with ``os.rename``.
    Running it again appends the suffix once more.
    """
    for entry in os.listdir(directory):
        source_path = os.path.join(directory, entry)

        # Leave non-.wav entries and directories alone.
        if not entry.endswith('.wav') or not os.path.isfile(source_path):
            continue

        stem, extension = os.path.splitext(entry)
        destination_path = os.path.join(directory, f"{stem}_{suffix}{extension}")
        os.rename(source_path, destination_path)

# Usage: both values below are placeholders and must be replaced before this
# cell is run; as written the directory name is not a real path.
directory = 'path_to_your_directory'
suffix = 'your_suffix'
# Renames files in place; running the cell twice appends the suffix twice.
add_suffix_to_wav_files(directory, suffix)


In [None]:
import os
import csv
import re
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt

def count_files_by_emotion(csv_file, emotions):
    """Count how many file names in a CSV belong to each emotion.

    The first column of ``csv_file`` is treated as a file name. A name is
    counted for the first emotion in ``emotions`` that occurs in it as a
    case-sensitive substring; names matching no emotion are not counted.

    Args:
        csv_file: Path to a CSV with file names in the first column.
        emotions: Iterable of emotion strings.

    Returns:
        Dict mapping every emotion to its count (0 when nothing matched).
    """
    with open(csv_file, 'r', newline='') as file:
        # Blank lines come back as empty rows; skip them rather than raising
        # IndexError on row[0].
        filenames = [row[0] for row in csv.reader(file) if row]

    emotion_counts = {emotion: 0 for emotion in emotions}

    for filename in filenames:
        # Iterate the dict keys, not ``emotions``: a one-shot iterable would
        # already be exhausted by the comprehension above.
        matched = next((emotion for emotion in emotion_counts if emotion in filename), None)
        if matched is not None:
            emotion_counts[matched] += 1

    return emotion_counts

def draw_emotion_counts_table(train_csv, test_csv,
                              title="SUST-Count of Train and Test Files for Each Emotion",
                              emotions=None):
    """Print and plot per-emotion file counts for a train/test split.

    Counts come from ``count_files_by_emotion`` applied to each CSV. The
    resulting table is printed as a DataFrame and rendered as a matplotlib
    table figure.

    Args:
        train_csv: CSV listing the training file names.
        test_csv: CSV listing the test file names.
        title: Figure title. The default keeps the previous hard-coded text;
            pass the real dataset name when the CSVs are not from SUST.
        emotions: Optional iterable of emotion names; defaults to
            ANGRY, HAPPY, NEUTRAL, SAD, SURPRISE.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    if emotions is None:
        emotions = ['ANGRY', 'HAPPY', 'NEUTRAL', 'SAD', 'SURPRISE']
    else:
        # Materialise once: the list is reused several times below.
        emotions = list(emotions)

    train_counts = count_files_by_emotion(train_csv, emotions)
    test_counts = count_files_by_emotion(test_csv, emotions)

    data = {
        'Emotion': emotions,
        'Train Count': [train_counts[emotion] for emotion in emotions],
        'Test Count': [test_counts[emotion] for emotion in emotions]
    }

    df = pd.DataFrame(data)
    print(df)

    # Plotting table: figure height scales with the number of rows.
    fig, ax = plt.subplots(figsize=(8, len(df) * 0.5))
    ax.axis('tight')
    ax.axis('off')
    table = ax.table(cellText=df.values, colLabels=df.columns, cellLoc='center', loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(12)
    table.scale(1, 1.5)
    ax.set_title(title)
    plt.show()

# Example usage: summarise the SUBESCO train/test split.
# NOTE(review): draw_emotion_counts_table titles its figure "SUST-Count ...",
# while these CSVs are under ./data/SUBESCO -- confirm the title is intended.
train_csv = './data/SUBESCO/split/train_files.csv'
test_csv = './data/SUBESCO/split/test_files.csv'
draw_emotion_counts_table(train_csv, test_csv)


In [1]:
import librosa
import pathlib
import numpy as np
import csv
from sklearn.model_selection import train_test_split
from tqdm import tqdm

def save_array_to_npy(array, file_name):
    """Write ``array`` to ``file_name`` using ``np.save``."""
    np.save(file=file_name, arr=array)

def load_array_from_npy(file_name):
    """Return the array stored in ``file_name``, read with ``np.load``."""
    loaded = np.load(file=file_name)
    return loaded

def get_cwt_mel(path, n_fft, hop_length, n_mels):
    """Load one audio file and return a flat MFCC + STFT + chroma feature vector.

    The audio is loaded at 16 kHz, zero-padded or truncated to 4 seconds, and
    three features are computed with a hop length of 250: 128 MFCCs, the
    magnitude STFT with ``n_fft=254``, and a 128-bin chroma computed from that
    STFT. They are concatenated along axis 1 and flattened to 1-D.

    NOTE(review): despite the function name and the ``log_mel_spectrogram``
    variable, no wavelet transform or mel spectrogram is computed here.

    Args:
        path: Audio file to load.
        n_fft: Ignored -- the STFT below always uses ``n_fft=254``.
        hop_length: Ignored -- overwritten with 250 before first use.
        n_mels: Ignored -- never read.

    Returns:
        1-D numpy array holding the flattened, concatenated features.
    """
    
    y, sr = librosa.load(path, sr=16000)
    file_length = np.size(y)
    
    # Fixed clip length: 4 s * 16000 Hz = 64000 samples.
    duration = 4
    samples = duration * sr

    # Right-pad short clips with zeros, cut long clips to the first 4 seconds.
    if file_length < samples:
        y = np.concatenate((y, np.zeros(samples - file_length)), axis=0)
    else:
        y = y[:samples]

    # Overrides the hop_length argument. The trailing note looks like a table of
    # hop length -> resulting frame count from earlier experiments (TODO confirm).
    hop_length = 250 #125=1025, 250=513, 501=256,1001
    mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=128)
    # n_fft=254 presumably yields 1 + 254 // 2 = 128 frequency rows, matching the
    # 128 rows of the other two features -- confirm against the librosa docs.
    stft = np.abs(librosa.stft(y, n_fft=254, hop_length=hop_length))
    # NOTE(review): sr is not forwarded here, so chroma_stft uses its own default
    # sample rate rather than 16000 -- confirm this is intended.
    chroma = librosa.feature.chroma_stft(S=stft, n_chroma=128)
    # NOTE(review): the three features are joined along axis 1 and flattened.
    # The caller reshapes the flat vector to (128, 257, 3); that reshape does
    # not put mfcc/stft/chroma into separate slices of the last axis (stacking
    # along a new last axis would). Saved arrays and trained models may depend
    # on the current layout, so confirm before changing it.
    log_mel_spectrogram = np.concatenate((mfcc, stft, chroma), axis=1)
    log_mel_spectrogram = log_mel_spectrogram.reshape((-1,))
    # print(f"log_mel_spectrogram shape = {log_mel_spectrogram.shape}")

    # Clear temporary variables
    del mfcc, stft, chroma, y

    return log_mel_spectrogram

def classify_files_new(path):
    """Extract features and integer labels for every ``.wav`` file in a folder.

    Files directly inside ``path`` are grouped by the emotion name found in
    their FILE NAME (ANGRY=0, HAPPY=1, NEUTRAL=2, SAD=3, SURPRISE=4). For each
    group the emotion name and file count are printed, ``get_cwt_mel`` is run
    on every file, and the results are appended to the training arrays. No
    test split is made: the returned test arrays are always empty.

    Args:
        path: Folder containing the ``.wav`` files.

    Returns:
        Tuple ``(train_data_x, train_data_y, test_data_x, test_data_y)``:
        ``train_data_x`` has shape ``(n_files, 128, 257, 3)``, ``train_data_y``
        has shape ``(n_files,)``, ``test_data_x`` has shape
        ``(0, 128, 257, 3)`` and ``test_data_y`` is empty.
    """
    dataset_dict = {
        'total': 0,
        'file_dict': {
            'ANGRY': {'represent': 0, 'count': 0, 'all_data': []},
            'HAPPY': {'represent': 1, 'count': 0, 'all_data': []},
            'NEUTRAL': {'represent': 2, 'count': 0, 'all_data': []},
            'SAD': {'represent': 3, 'count': 0, 'all_data': []},
            'SURPRISE': {'represent': 4, 'count': 0, 'all_data': []}
        }
    }

    wav_path = pathlib.Path(path)
    emotion_file_list = [str(file_name) for file_name in wav_path.glob('*.wav')]

    # Shape each flat feature vector is reshaped to. It must match what
    # get_cwt_mel produces (128 * 257 * 3 values per file).
    # Earlier experiments used (128, 513, 3).
    feature_shape = (128, 257, 3)

    train_data_x, train_data_y = [], []
    test_data_x, test_data_y = [], []

    for emotion_label, emotion_info in dataset_dict['file_dict'].items():
        print(emotion_label)

        # Match on the file name only. Matching on the whole path string would
        # assign every file to an emotion that merely appears in a folder name.
        emotion_classify_file_list = [
            file_path for file_path in emotion_file_list
            if emotion_label in pathlib.Path(file_path).name
        ]
        files_count = len(emotion_classify_file_list)

        print(files_count)

        emotion_info['count'] = files_count
        dataset_dict['total'] += files_count

        # The n_fft / hop_length / n_mels values are passed for signature
        # compatibility; get_cwt_mel overrides or ignores them internally.
        emotion_data = [
            get_cwt_mel(file_path, n_fft=2048, hop_length=512, n_mels=128)
            for file_path in tqdm(emotion_classify_file_list, desc="Processing files")
        ]

        labels = np.full(files_count, emotion_info['represent'])

        # 100% training data. To hold out a test split instead, split
        # emotion_data / labels here (e.g. with sklearn's train_test_split).
        x_train, x_test, y_train, y_test = emotion_data, [], labels, []

        train_data_x = np.append(train_data_x, np.array(x_train)).reshape(-1, *feature_shape)
        test_data_x = np.append(test_data_x, np.array(x_test)).reshape(-1, *feature_shape)

        train_data_y = np.append(train_data_y, y_train)
        test_data_y = np.append(test_data_y, y_test)

    return train_data_x, train_data_y, test_data_x, test_data_y

# Feature extraction for the held-out test folder. classify_files_new puts all
# data in its "train" outputs, so the train_* names below hold this folder's
# data even though the folder is the test set. ``path`` is reused by the cell
# that saves the arrays.
path = './data/SUBESCOxUIU/test'
train_data_x, train_data_y, test_data_x, test_data_y = classify_files_new(path)

print(train_data_x.shape)
print(train_data_y.shape)
# The test_* outputs are always empty, so their shapes are not printed.
# print(test_data_x.shape)
# print(test_data_y.shape)

ANGRY
261


Processing files: 100%|██████████| 261/261 [00:23<00:00, 11.15it/s]


HAPPY
261


Processing files: 100%|██████████| 261/261 [00:09<00:00, 26.97it/s]


NEUTRAL
248


Processing files: 100%|██████████| 248/248 [00:08<00:00, 28.92it/s]


SAD
261


Processing files: 100%|██████████| 261/261 [00:09<00:00, 28.03it/s]


SURPRISE
261


Processing files: 100%|██████████| 261/261 [00:09<00:00, 27.54it/s]


(1292, 128, 257, 3)
(1292,)


In [2]:
def save_array_to_npy(array, file_name):
    """Write ``array`` to ``file_name`` using ``np.save``.

    Same helper as the one defined in the feature-extraction cell; this cell
    defines it again.
    """
    np.save(file=file_name, arr=array)

def load_array_from_npy(file_name):
    """Return the array stored in ``file_name``, read with ``np.load``.

    Same helper as the one defined in the feature-extraction cell; this cell
    defines it again.
    """
    loaded = np.load(file=file_name)
    return loaded

# Persist the extracted arrays under <path>/npy and read them back as a check.
# Relies on ``path``, ``pathlib``, ``np`` and the train_*/test_* arrays from the
# feature-extraction cell having been run first.
# ``data_type`` only controls the file names and the printed labels; the data
# itself lives in the train_* variables.
data_type = 'test'

#if saved directory does not exist, create it
pathlib.Path(path + '/npy').mkdir(parents=True, exist_ok=True)

save_array_to_npy(train_data_x, path + f"/npy/{data_type}_data_x.npy")
save_array_to_npy(train_data_y, path + f"/npy/{data_type}_data_y.npy")

# Drop the in-memory arrays so the reload below proves the round trip.
# Re-running this cell without re-running the extraction cell fails here,
# because test_data_x / test_data_y no longer exist after the first run.
del train_data_x, train_data_y, test_data_x, test_data_y

train_data_x = load_array_from_npy(path + f"/npy/{data_type}_data_x.npy")
train_data_y = load_array_from_npy(path + f"/npy/{data_type}_data_y.npy")

print(f"{data_type}_data_x shape:", train_data_x.shape)
print(f"{data_type}_data_y shape:", train_data_y.shape)

test_data_x shape: (1292, 128, 257, 3)
test_data_y shape: (1292,)


In [7]:
import os
import wave
from tqdm import tqdm

def calculate_avg_audio_duration(directory):
    """Return ``(average_duration_seconds, file_count)`` for a folder's WAVs.

    Regular files directly inside ``directory`` whose name ends with ``.wav``
    are opened with the ``wave`` module; each duration is frames / frame rate.
    When no such file exists the result is ``(0.0, 0)``.
    """
    candidates = []
    for entry in os.listdir(directory):
        if not entry.endswith('.wav'):
            continue
        if os.path.isfile(os.path.join(directory, entry)):
            candidates.append(entry)

    durations = []
    for entry in tqdm(candidates, desc="Processing files"):
        with wave.open(os.path.join(directory, entry), 'r') as handle:
            frame_count = handle.getnframes()
            frame_rate = handle.getframerate()
            durations.append(frame_count / float(frame_rate))

    if not durations:
        return 0.0, 0  # No .wav files found

    return sum(durations, 0.0) / len(durations), len(durations)

# Usage: average clip length over the .wav files directly inside the combined
# dataset folder (sub-folders are not descended into).
directory = './data/SUBESCOxUIU'
avg_duration, total_files = calculate_avg_audio_duration(directory)
print(f"Total .wav files: {total_files}")
print(f"Average audio duration: {avg_duration} seconds")



Processing files: 100%|██████████| 6466/6466 [01:30<00:00, 71.73it/s] 

Total .wav files: 6466
Average audio duration: 3.876932452973861 seconds



