In [2]:
import pandas as pd
from pydub import AudioSegment
import os

In [3]:
# Assuming all CSV files and MP3 files are in the same directory
csv_directory = 'csv'
audio_directory = 'audio'
output_directory = 'output'

# Create output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# List all CSV and MP3 files
csv_files = [f for f in os.listdir(csv_directory) if f.endswith('.csv')]
audio_files = [f for f in os.listdir(audio_directory) if f.endswith('.mp3')]

In [7]:
for csv_file in csv_files:
    audio_file = csv_file.replace('.csv', '.mp3')
    csv_path = os.path.join(csv_directory, csv_file)
    
    try:
        df = pd.read_csv(csv_path, header=None, usecols=[0, 2, 3, 7])
    except UnicodeDecodeError:
        df = pd.read_csv(csv_path, header=None, usecols=[0, 2, 3, 7], encoding='cp1252')  # You can try 'iso-8859-1' or 'cp1252'
    
    audio_path = os.path.join(audio_directory, audio_file)
    audio = AudioSegment.from_mp3(audio_path)

    for index, row in df.iterrows():
        start_time = int(row[2] * 1000)
        end_time = int(row[3] * 1000)
        segment = audio[start_time:end_time]

        segment_filename = f"{os.path.splitext(audio_file)[0]}_segment_{index}.mp3"
        segment_path = os.path.join(output_directory, segment_filename)
        segment.export(segment_path, format='mp3')

        df.loc[index, 'audio_clip_path'] = segment_path

    modified_csv_path = os.path.join(csv_directory, f"modified_{csv_file}")
    df.to_csv(modified_csv_path, index=False)

print("Audio processing complete. Modified CSVs and audio clips are saved.")

Audio processing complete. Modified CSVs and audio clips are saved.


In [10]:
from datasets import Dataset, Audio, DatasetDict
from huggingface_hub import HfApi, HfFolder, Repository
import pandas as pd
import os

# Directories
csv_directory = 'csv'
audio_segment_directory = 'output'

# List all segment files
audio_files = [f for f in os.listdir(audio_segment_directory) if f.endswith('.mp3')]
audio_files.sort()

In [20]:
# Initialize dataset list
data = []

# Process each audio segment file
for audio_file in audio_files:
    # Find the corresponding CSV file and row index
    base_name = "_".join(audio_file.split("_")[:-2]) + ".csv"
    # index = int(audio_file.split("_")[-1].replace('.mp3', ''))
    segment_index = int(audio_file.split("_")[-1].replace('.mp3', ''))  # segment index

    # Load the CSV file
    csv_path = os.path.join(csv_directory, base_name)
    
    try:
        df = pd.read_csv(csv_path, header=None, usecols=[0, 2, 3, 7])
    except UnicodeDecodeError:
        df = pd.read_csv(csv_path, header=None, usecols=[0, 2, 3, 7], encoding='cp1252')  # You can try 'iso-8859-1' or 'cp1252'
    
    
    # df = pd.read_csv(csv_path, header=None, usecols=[0, 2, 3, 7])

    # Retrieve metadata from the CSV file
    # utterance_word = df.iloc[index][0]
    # class_label = df.iloc[index][7]
    utterance_word = df.iloc[segment_index][0]
    class_label = df.iloc[segment_index][7]
    
    # Full path to the audio file
    full_path = os.path.join(audio_segment_directory, audio_file)

    # Append to data list
    data.append({
        'path': full_path,
        'utterance': utterance_word,
        'class': class_label
    })

In [21]:
data

[{'path': 'output\\F_0101_10y4m_1_segment_0.mp3',
  'utterance': 'and',
  'class': 0},
 {'path': 'output\\F_0101_10y4m_1_segment_1.mp3',
  'utterance': 'yjeu',
  'class': 0},
 {'path': 'output\\F_0101_10y4m_1_segment_10.mp3',
  'utterance': 'it',
  'class': 0},
 {'path': 'output\\F_0101_10y4m_1_segment_11.mp3',
  'utterance': 'UM',
  'class': 1},
 {'path': 'output\\F_0101_10y4m_1_segment_12.mp3',
  'utterance': 'they',
  'class': 0},
 {'path': 'output\\F_0101_10y4m_1_segment_13.mp3',
  'utterance': 'had',
  'class': 0},
 {'path': 'output\\F_0101_10y4m_1_segment_14.mp3',
  'utterance': 'animation',
  'class': 0},
 {'path': 'output\\F_0101_10y4m_1_segment_15.mp3',
  'utterance': 'and',
  'class': 0},
 {'path': 'output\\F_0101_10y4m_1_segment_16.mp3',
  'utterance': 'UM',
  'class': 1},
 {'path': 'output\\F_0101_10y4m_1_segment_17.mp3',
  'utterance': 'computer',
  'class': 0},
 {'path': 'output\\F_0101_10y4m_1_segment_18.mp3',
  'utterance': 'work',
  'class': 0},
 {'path': 'output\\F_01

In [23]:
# Create a dataset from the list
dataset = Dataset.from_pandas(pd.DataFrame(data))

In [24]:
dataset

Dataset({
    features: ['path', 'utterance', 'class'],
    num_rows: 5534
})

In [28]:
from datasets import Features, ClassLabel, Value

features = Features({
    'path': Audio(sampling_rate=16_000),
    'utterance': Value('string'),
    'class': ClassLabel(num_classes=8)  # Adjust num_classes as necessary
})

In [29]:
features

{'path': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'utterance': Value(dtype='string', id=None),
 'class': ClassLabel(names=['0', '1', '2', '3', '4', '5', '6', '7'], id=None)}

In [30]:
# Cast the dataset to these features
dataset = dataset.cast(features)

Casting the dataset:   0%|          | 0/5534 [00:00<?, ? examples/s]

In [31]:
dataset

Dataset({
    features: ['path', 'utterance', 'class'],
    num_rows: 5534
})

In [33]:
dataset.push_to_hub("HamdanXI/uclass_clipped_labeled")

Map:   0%|          | 0/5534 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]