In [None]:
# MIT License

# Copyright (c) 2024 GitHub

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


# African American Vernacular English

African American Vernacular English (AAVE), also known as African American English (AAE) or Black English, is a distinctive dialect of English spoken primarily by African Americans. It has its roots in the linguistic legacy of African slaves brought to the United States, blending elements of West African languages with English. AAVE is characterized by unique grammatical, phonological, and syntactic features, such as the use of double negatives, the dropping of consonant clusters, and a specific set of verb tenses. Although often stigmatized and misunderstood, AAVE is a legitimate, rule-governed variety of English with deep cultural and historical significance. It serves as a powerful tool of identity and cultural expression within the African-American community.

The Corpus of Regional African American Language (CORAAL) is a corpus of African American Language varieties. The data set provides several hours of audio and transcripts from a variety of communities. The UF Data Studio has added much of the data set to Hugging Face: [CORAAL Data Set](https://huggingface.co/datasets/zsayers/CORAAL).

## Import Libraries

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import wavfile
from IPython.display import Audio

## Component Details
### CORAAL:ATL (Atlanta, GA 2017; Version 2020.05)

CORAAL:ATL consists of 13 primary speakers across 14 audio files, collected in 2017 and 2018 by Patrick Slay Brooks, a music producer in Atlanta ([www.slayinrecords.com](http://www.slayinrecords.com)), specifically for CORAAL. Speakers represent a modern friendship network in Atlanta, GA. Atlanta has been described as a “black mecca” in the South (Hobson 2010), especially in the context of the so-called reverse Great Migration, the movement of African Americans from Northern and Western cities back to the (urban) South. Brooks has a friendship group that highlights a diversity of experiences in Atlanta. Speakers range from being born and raised in Atlanta to growing up in places like New York City, Washington DC, and Los Angeles, CA. As with all sub-components, see metadata for speaker details.

Speakers were interviewed by Brooks for CORAAL to fill a 2 x 2 demographic matrix. In file naming, like with CORAAL:PRV and CORAAL:ROC, the socioeconomic group is listed as “0” (e.g., ATL_se0_ag1_m_01_1) to denote no focus on socioeconomic groups (not to indicate a group lower than 1). We have attempted to capture and include in the metadata broad information about speakers’ demographic backgrounds, such as length of residence and other places lived, but leave questions of interpretation up to end users.

### CORAAL:ATL data

The 14 audio files are 44.1 kHz, 16 bit, mono in WAV format, totaling 8.6 hours and 93.5K
words. Interviews were recorded on a Zoom H5 recorder, with either a lapel microphone or an
internal microphone, between 2017 and 2018. Interviews are sociolinguistic-style interviews
and conversations on topics such as life in Atlanta, and the interviewee’s neighborhood,
schooling, and work history.

Speaker numbers are listed in each cell. 
 
| Socio-Economic Group | Gender | Age Group 1 (under 29) | Age Group 2 (30 to 50) |
|----------------------|--------|-------------------------|-------------------------|
| Group 0             | Female | 3                       | 2                       |
|                      | Male   | 5                       | 3                       |

In [None]:
# Set the path to your directory
data_directory = '../data/coraal/audio/wav'

# Create a list of file paths. os.listdir() returns entries in arbitrary
# order, so the names are sorted to keep file_paths[0] (used as the running
# example below) the same on every machine and every run.
file_paths = [
    os.path.join(data_directory, filename)
    for filename in sorted(os.listdir(data_directory))
    if filename.endswith('.wav')
]


In [None]:
# List every discovered audio file, one path per line
if file_paths:
    print("\n".join(file_paths))

In [None]:
# Use the first discovered audio file as the running example for the cells below
test_path = file_paths[0]

## Visualize some basic properties of the audio recordings

In [None]:
# Load the audio file; wavfile.read reports the sample rate in samples/second (Hz)
sample_rate_hz, y = wavfile.read(test_path)
sr = sample_rate_hz / 1000  # Sample rate in kHz, the unit later cells display

num_samples = len(y)
num_channels = 1 if y.ndim == 1 else y.shape[1]  # Check if mono or stereo

# Duration in seconds = samples / (samples per second).
# Dividing by the kHz value instead would yield milliseconds.
duration = num_samples / sample_rate_hz

# Generate a time array (in seconds) for plotting
time = np.linspace(0, duration, num_samples)

### Audio Properties

1. **Audio Information**: Properties of the audio sample (frequency, samples, duration, channels)
1. **Waveform**: Amplitude of the audio signal over time.
1. **Spectrogram**: Frequency content of the audio signal over time.

In [None]:
# Report the basic properties of the example recording, one per line
print(
    f"File: {test_path}",
    f"Sample Rate: {sr} kHz",
    f"Duration: {duration:.2f} seconds",
    f"Number of Samples: {num_samples}",
    f"Number of Channels: {num_channels}",
    sep="\n",
)

In [None]:
# `sr` holds the sample rate in kHz in this notebook, so convert back to Hz
# to build a time axis that really is in seconds (matching the axis label).
waveform_time_s = np.arange(len(y)) / (sr * 1000)

# A single full-size axes: the waveform is the only panel in this figure
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(waveform_time_s, y)
ax.set_title(f'Waveform of {test_path}')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Amplitude')
plt.show()

In [None]:
# `sr` holds the sample rate in kHz in this notebook; specgram needs Fs in Hz
# for the axes to come out in seconds and Hz, as the labels below state.
spectrogram_rate_hz = sr * 1000

# specgram expects a 1-D signal; fall back to the first channel for stereo data
spectrogram_signal = y if y.ndim == 1 else y[:, 0]

# A single full-size axes: the spectrogram is the only panel in this figure
fig, ax = plt.subplots(figsize=(12, 4))
ax.specgram(spectrogram_signal, Fs=spectrogram_rate_hz, NFFT=1024, noverlap=512, cmap='plasma')
ax.set_title('Spectrogram')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Frequency (Hz)')
plt.show()

In [None]:
# Render an inline audio player for the example recording
display(Audio(test_path))

## Audio File Analysis

1. **Duration Analysis**: Calculate the duration of each audio file to understand the average length of interviews.
1. **Sample Rate and Channels**: Check the sample rate and number of channels to ensure consistency across your dataset.
1. **Signal Visualization**: Plot waveforms of a few audio samples to visualize variations in speech patterns.
1. **Spectrograms**: Generate spectrograms to analyze frequency content and observe characteristics of speech patterns.

In [None]:
def audio_properties(filepath):
    """Summarise the basic properties of one WAV file.

    Parameters
    ----------
    filepath : str
        Path of the WAV file to read with ``scipy.io.wavfile.read``.

    Returns
    -------
    dict
        Keys ``"Audio Path"``, ``"Sample Rate (kHz)"``, ``"Duration (s)"``,
        ``"Number of Samples"`` and ``"Number of Channels"``.
    """
    # wavfile.read reports the sample rate in samples/second (Hz)
    sample_rate_hz, y = wavfile.read(filepath)
    sr = sample_rate_hz / 1000  # Change to kHz for reporting only

    num_samples = len(y)
    num_channels = 1 if y.ndim == 1 else y.shape[1]  # Check if mono or stereo

    # Seconds = samples / (samples per second). Dividing by the kHz value
    # would report milliseconds under a "(s)" label.
    duration = num_samples / sample_rate_hz

    return {"Audio Path": filepath,
            "Sample Rate (kHz)": sr,
            "Duration (s)": duration,
            "Number of Samples": num_samples,
            "Number of Channels": num_channels}

In [None]:
# Try the helper on a single file before applying it to the whole list
audio_properties(file_paths[0])

In [None]:
# Gather the property dictionary of every audio file in a single pass
audio_prop_list = [audio_properties(audio_path) for audio_path in file_paths]

print(audio_prop_list)

In [None]:
# Tabulate the per-file properties: one row per file, one column per property
df = pd.DataFrame(audio_prop_list)
display(df)

In [None]:
# Summary statistics of the property table
df.describe()

In [None]:
# Column dtypes and non-null counts of the property table
df.info()

In [None]:
# Extremes of the per-file durations stored in the property table
longest_duration = df['Duration (s)'].max()
shortest_duration = df['Duration (s)'].min()

print(f"Shortest Sample Duration: {shortest_duration} "
      f"\nLongest Sample Duration: {longest_duration}")

In [None]:
# Smallest and largest sample counts across the audio files.
# The variable names are kept for any later cell that reads them, but the
# values come from the 'Number of Samples' column: they are counts, not
# frequencies, so the printed labels say so.
shortest_sample_freq = df['Number of Samples'].min()
longest_sample_freq = df['Number of Samples'].max()

print("Fewest Samples:", shortest_sample_freq,
      "\nMost Samples:", longest_sample_freq)

## Analyzing Transcript

In [None]:
# Set the path to your directory
# NOTE: this rebinds data_directory and file_paths, which previously referred
# to the audio (.wav) files; from here on they refer to the transcripts.
data_directory = '../data/coraal/transcript/text/'

# Create a list of file paths. os.listdir() returns entries in arbitrary
# order, so the names are sorted to keep the list order reproducible.
file_paths = [
    os.path.join(data_directory, filename)
    for filename in sorted(os.listdir(data_directory))
    if filename.endswith('.txt')
]


In [None]:
# Inspect the discovered transcript paths
file_paths

In [None]:
def transcript_properties(filepath):
    """Read one tab-separated transcript file into a DataFrame.

    The table is indexed by its ``Line`` column and gains an ``Audio Path``
    column holding ``filepath`` on every row.
    """
    transcript = pd.read_csv(filepath, sep="\t", index_col="Line")
    return transcript.assign(**{'Audio Path': filepath})

In [None]:
# Preview the first rows of every transcript table
for transcript_path in file_paths:
    preview = transcript_properties(transcript_path).head()
    display(preview)

In [None]:
# Load the transcript that the segmentation code below reads, through the
# shared helper so the path is written once and the parsing stays consistent.
transcript_path = "../data/coraal/transcript/text/ATL_se0_ag2_m_02_1.txt"
transcript_df = transcript_properties(transcript_path)

In [None]:
# Inspect the loaded transcript table
transcript_df

## Getting 10 or 15 second segments of our data

In [None]:
import numpy as np
from scipy.io import wavfile
import pandas as pd

def audio_properties(filepath, segment_duration=10, transcript=None):
    """Split a WAV file into fixed-length segments and attach transcript text.

    Note: this definition reuses the name of the per-file summary helper
    defined earlier in the notebook, so it replaces that helper once this
    cell has run.

    Parameters
    ----------
    filepath : str
        Path of the WAV file to read with ``scipy.io.wavfile.read``.
    segment_duration : int or float, default 10
        Length of each segment in seconds; the final segment may be shorter.
    transcript : pandas.DataFrame, optional
        Table with ``StTime``, ``EnTime`` and ``Content`` columns. When
        omitted, the notebook-level ``transcript_df`` is used.

    Returns
    -------
    list of dict
        One dictionary per segment with the keys ``"Audio Path"``,
        ``"Start Time (s)"``, ``"End Time (s)"``, ``"Sample Rate (kHz)"``,
        ``"Number of Samples"``, ``"Number of Channels"`` and ``"Content"``
        (the space-joined text of every transcript line overlapping the
        segment).

    Raises
    ------
    ValueError
        If ``segment_duration`` is not positive.
    """
    if segment_duration <= 0:
        raise ValueError("segment_duration must be a positive number of seconds")
    if transcript is None:
        # Backward-compatible default: the transcript loaded earlier in the notebook
        transcript = transcript_df

    # Load the audio file; wavfile.read reports the sample rate in Hz
    sr, y = wavfile.read(filepath)
    sr_khz = sr / 1000  # Change to kHz
    total_samples = len(y)
    num_channels = 1 if y.ndim == 1 else y.shape[1]

    total_duration = total_samples / sr
    num_segments = int(np.ceil(total_duration / segment_duration))

    # Extract the transcript columns once so each segment needs only a
    # vectorised comparison instead of a Python loop over every row.
    line_starts = transcript['StTime'].to_numpy()
    line_ends = transcript['EnTime'].to_numpy()
    line_texts = transcript['Content'].to_numpy()
    has_text = transcript['Content'].notna().to_numpy()  # skip missing content

    segments = []
    for seg_index in range(num_segments):
        start_time = seg_index * segment_duration
        end_time = min((seg_index + 1) * segment_duration, total_duration)

        # Round instead of truncating: total_duration * sr can land a hair
        # below len(y) in floating point, which would drop the last sample.
        start_sample = min(int(round(start_time * sr)), total_samples)
        end_sample = min(int(round(end_time * sr)), total_samples)

        # A transcript line overlaps the segment when it ends after the
        # segment starts and starts before the segment ends.
        overlaps = has_text & (line_ends > start_time) & (line_starts < end_time)
        combined_content = ' '.join(str(text) for text in line_texts[overlaps])

        segments.append({
            "Audio Path": filepath,
            "Start Time (s)": start_time,
            "End Time (s)": end_time,
            "Sample Rate (kHz)": sr_khz,
            "Number of Samples": end_sample - start_sample,
            "Number of Channels": num_channels,
            "Content": combined_content.strip()  # Join all overlapping content
        })

    return segments

# Usage
segment = audio_properties(file_paths[0])
segment_df = pd.DataFrame(segment)

# Display the resulting DataFrame
display(segment_df)