In [1]:
# MIT License

# Copyright (c) 2024 GitHub

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

## Import Libraries

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# from scipy.io import wavfile
from IPython.display import Audio
import torch
import torchaudio
import torchaudio.transforms as transforms

In [3]:
# Load the audio file
waveform, sample_rate = torchaudio.load("../data/coraal/audio/wav/ATL_se0_ag1_f_01_1.wav")

print(f"Waveform shape: {waveform.shape}")
print(f"Sample rate: {sample_rate}")


Waveform shape: torch.Size([1, 82158296])
Sample rate: 44100


In [4]:
# Apply a Mel spectrogram transformation
mel_spectrogram = transforms.MelSpectrogram(sample_rate=sample_rate)(waveform)

print(f"Mel spectrogram shape: {mel_spectrogram.shape}")



Mel spectrogram shape: torch.Size([1, 128, 410792])


## Trying this again

In [5]:
# Set the path to your directory
transcript_dir = '../data/coraal/transcript/text/'

# Create a list of file paths
transcript_paths = [os.path.join(transcript_dir, filename) for filename in os.listdir(transcript_dir) if filename.endswith('.txt')]
transcript_paths

['../data/coraal/transcript/text/ATL_se0_ag2_f_02_1.txt',
 '../data/coraal/transcript/text/ATL_se0_ag2_m_02_1.txt',
 '../data/coraal/transcript/text/ATL_se0_ag2_f_01_1.txt',
 '../data/coraal/transcript/text/ATL_se0_ag2_m_03_1.txt',
 '../data/coraal/transcript/text/ATL_se0_ag2_m_01_1.txt',
 '../data/coraal/transcript/text/ATL_se0_ag1_m_05_1.txt',
 '../data/coraal/transcript/text/ATL_se0_ag1_m_03_1.txt',
 '../data/coraal/transcript/text/ATL_se0_ag1_f_01_1.txt',
 '../data/coraal/transcript/text/ATL_se0_ag1_f_03_1.txt',
 '../data/coraal/transcript/text/ATL_se0_ag1_m_01_1.txt',
 '../data/coraal/transcript/text/ATL_se0_ag1_m_04_2.txt',
 '../data/coraal/transcript/text/ATL_se0_ag1_m_04_1.txt',
 '../data/coraal/transcript/text/ATL_se0_ag1_f_02_1.txt',
 '../data/coraal/transcript/text/ATL_se0_ag1_m_02_1.txt']

In [6]:
# Set the path to your directory
audio_directory = '../data/coraal/audio/wav'

# Create a list of file paths
audio_paths = [os.path.join(audio_directory, filename) for filename in os.listdir(audio_directory) if filename.endswith('.wav')]
audio_paths

['../data/coraal/audio/wav/ATL_se0_ag2_m_02_1.wav',
 '../data/coraal/audio/wav/ATL_se0_ag2_f_02_1.wav',
 '../data/coraal/audio/wav/ATL_se0_ag2_m_01_1.wav',
 '../data/coraal/audio/wav/ATL_se0_ag2_f_01_1.wav',
 '../data/coraal/audio/wav/ATL_se0_ag2_m_03_1.wav',
 '../data/coraal/audio/wav/ATL_se0_ag1_m_05_1.wav',
 '../data/coraal/audio/wav/ATL_se0_ag1_f_03_1.wav',
 '../data/coraal/audio/wav/ATL_se0_ag1_m_01_1.wav',
 '../data/coraal/audio/wav/ATL_se0_ag1_m_03_1.wav',
 '../data/coraal/audio/wav/ATL_se0_ag1_f_01_1.wav',
 '../data/coraal/audio/wav/ATL_se0_ag1_m_04_1.wav',
 '../data/coraal/audio/wav/ATL_se0_ag1_m_04_2.wav',
 '../data/coraal/audio/wav/ATL_se0_ag1_m_02_1.wav',
 '../data/coraal/audio/wav/ATL_se0_ag1_f_02_1.wav']

In [7]:
def transcript_properties(filepath):
    df = pd.read_csv(filepath, delimiter="\t", index_col="Line")
    df['Transcript Path'] = filepath
    return df

# List to hold individual DataFrames
transcript_df = []

# Iterate through each file path, read the DataFrame, and append it to the list
for path in transcript_paths:
    transcript_df.append(transcript_properties(path))

# Concatenate all DataFrames in the list into a single DataFrame
transcript_df = pd.concat(transcript_df).reset_index()

# Display the combined DataFrame
transcript_df.head()

Unnamed: 0,Line,Spkr,StTime,Content,EnTime,Transcript Path
0,1,ATL_int_01,0.7526,Hey what's going on?,2.5113,../data/coraal/transcript/text/ATL_se0_ag2_f_0...
1,2,ATL_int_01,2.5113,(pause 0.63),3.1447,../data/coraal/transcript/text/ATL_se0_ag2_f_0...
2,3,ATL_int_01,3.1447,I'm here with,4.1659,../data/coraal/transcript/text/ATL_se0_ag2_f_0...
3,4,ATL_int_01,4.1659,(pause 0.92),5.083,../data/coraal/transcript/text/ATL_se0_ag2_f_0...
4,5,ATL_int_01,5.083,/RD-NAME-2/.,5.8536,../data/coraal/transcript/text/ATL_se0_ag2_f_0...


In [8]:
def split_audio_to_tensors(audio_path, interval=30):
    waveform, sample_rate = torchaudio.load(audio_path)
    total_length = waveform.size(1) / sample_rate  # in seconds
    num_chunks = int(np.ceil(total_length / interval))
    
    tensors = []
    start_times = []
    end_times = []

    for i in range(num_chunks):
        start = int(i * interval * sample_rate)
        end = int(min((i + 1) * interval * sample_rate, waveform.size(1)))
        chunk_waveform = waveform[:, start:end]
        tensors.append(chunk_waveform)
        start_times.append(start / sample_rate)
        end_times.append(end / sample_rate)
    
    return tensors, start_times, end_times

In [9]:
def split_transcript(transcript_df, interval=30):
    transcript_df['StTime'] = pd.to_timedelta(transcript_df['StTime'], unit='s')
    transcript_df['EnTime'] = pd.to_timedelta(transcript_df['EnTime'], unit='s')
    
    new_rows = []
    current_content = []
    current_start_time = pd.to_timedelta(0, unit='s')
    current_end_time = pd.to_timedelta(interval, unit='s')
    
    for _, row in transcript_df.iterrows():
        start_time = row['StTime']
        end_time = row['EnTime']
        content = row['Content']
        
        if start_time >= current_end_time:
            # Save the current chunk
            new_rows.append({
                'Content': ' '.join(current_content),
                'Start Time': current_start_time.total_seconds(),
                'End Time': current_end_time.total_seconds()
            })
            # Reset for the next chunk
            current_content = []
            current_start_time = current_end_time
            current_end_time += pd.to_timedelta(interval, unit='s')
        
        current_content.append(content)
    
    # Add the last chunk
    if current_content:
        new_rows.append({
            'Content': ' '.join(current_content),
            'Start Time': current_start_time.total_seconds(),
            'End Time': current_end_time.total_seconds()
        })
    
    return pd.DataFrame(new_rows)

In [11]:
# Load a small subset of data
def load_small_subset(transcript_path, audio_path, num_rows=120):
    # Load transcript data
    transcript_df = pd.read_csv(transcript_path, delimiter='\t')
    
    # Select a subset of rows
    transcript_subset = transcript_df.head(num_rows).copy()

    # Load audio data
    waveform, sample_rate = torchaudio.load(audio_path)
    
    return transcript_subset, waveform, sample_rate

In [12]:
# Load a small subset of data for demonstration
transcript_path = '../data/coraal/transcript/text/ATL_se0_ag1_f_01_1.txt'  # Replace with your transcript file path
audio_path = '../data/coraal/audio/wav/ATL_se0_ag1_f_01_1.wav'  # Replace with your audio file path

# Process data
transcript_df_subset, waveform, sample_rate = load_small_subset(transcript_path, audio_path, num_rows=120)
tensors, start_times, end_times = split_audio_to_tensors(audio_path, interval=30)

# Create DataFrame for audio chunks
audio_data_subset = pd.DataFrame({
    'Audio Tensor': tensors,
    'Start Time': start_times,
    'End Time': end_times,
    'Audio Path': [audio_path] * len(tensors)
})

# Process the transcript subset
transcript_df_subset = split_transcript(transcript_df_subset, interval=30)

# Merge DataFrames
audio_data_subset['Start Time'] = audio_data_subset['Start Time'].apply(lambda x: round(x, 2))
audio_data_subset['End Time'] = audio_data_subset['End Time'].apply(lambda x: round(x, 2))
transcript_df_subset['Start Time'] = transcript_df_subset['Start Time'].apply(lambda x: round(x, 2))
transcript_df_subset['End Time'] = transcript_df_subset['End Time'].apply(lambda x: round(x, 2))

merged_df = pd.merge_asof(transcript_df_subset.sort_values('Start Time'),
                          audio_data_subset.sort_values('Start Time'),
                          on='Start Time', direction='backward')

# Drop redundant columns
merged_df = merged_df.drop(columns=['End Time_y'])
merged_df = merged_df.rename(columns={'End Time_x': 'End Time'})

# Display the merged DataFrame
merged_df.head()

Unnamed: 0,Content,Start Time,End Time,Audio Tensor,Audio Path
0,"They talking about, don't send him to his dadd...",0.0,30.0,"[[tensor(0.0025), tensor(0.0028), tensor(0.002...",../data/coraal/audio/wav/ATL_se0_ag1_f_01_1.wav
1,<ts> (pause 0.19) Ninety-five. Okay. Nineteen-...,30.0,60.0,"[[tensor(-0.0178), tensor(-0.0201), tensor(-0....",../data/coraal/audio/wav/ATL_se0_ag1_f_01_1.wav
2,"Okay. Let's see, any other places lived? Nah, ...",60.0,90.0,"[[tensor(0.0188), tensor(0.0175), tensor(0.016...",../data/coraal/audio/wav/ATL_se0_ag1_f_01_1.wav
3,that's a hell of a connection. (pause 0.47) Le...,90.0,120.0,"[[tensor(0.0022), tensor(0.0021), tensor(0.002...",../data/coraal/audio/wav/ATL_se0_ag1_f_01_1.wav
4,(pause 1.05) You know. (pause 0.47) She the fo...,120.0,150.0,"[[tensor(-0.0041), tensor(-0.0051), tensor(-0....",../data/coraal/audio/wav/ATL_se0_ag1_f_01_1.wav


In [13]:
merged_df[merged_df.columns].iloc[0]

Content         They talking about, don't send him to his dadd...
Start Time                                                    0.0
End Time                                                     30.0
Audio Tensor    [[tensor(0.0025), tensor(0.0028), tensor(0.002...
Audio Path        ../data/coraal/audio/wav/ATL_se0_ag1_f_01_1.wav
Name: 0, dtype: object

In [14]:
merged_df["Content"].iloc[0]

"They talking about, don't send him to his daddy. (pause 0.28) You just need to go file for child support. [/Oh man/.] [Bye.] Why? (pause 0.80) Why? Okay, what's your name? /RD-NAME-2/ (pause 0.52) /RD-NAME-1/ what? (pause 0.48) /RD-NAME-3/ Okay. (pause 0.61) And, uh, (pause 0.39) are you a male or female? I'm a girl, I think. [<laugh>] [I'm just playing.] Okay. (pause 0.19) And your ethnicity? Hum, I'm supposed to say, black or non-hispanic. Okay. (pause 0.74) Um, year of birth?"

In [16]:
merged_df["Audio Tensor"].iloc[0].shape

torch.Size([1, 1323000])