### Import libraries

In [1]:
# ! don't use this unless you're me
yo_mama_path = "/Volumes/YoMama/Rizzerator/model/"

In [2]:
import whisper
import soundfile as sf
import json

import librosa
import os
"""
if librosa.__version__ != '0.6.2':
    os.system('pip3 install librosa==0.6.2')
    import librosa
"""

import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# model imports
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from transformers import BertModel, BertTokenizer
from torch.nn import CosineSimilarity
import torch

  from .autonotebook import tqdm as notebook_tqdm


### Loop through files

In [3]:
# 1. Loop through .wav files and their corresponding .json files
data_path_caller = yo_mama_path + "data/audio/caller/"
data_path_agent = yo_mama_path + "data/audio/agent/"
transcript_path = yo_mama_path + "data/transcript/"

all_wav_files_caller = [f for f in os.listdir(data_path_caller) if f.endswith(".wav")]

# Select 3 random samples
# ! Increase later, keeping this low to test
wav_files_caller = random.sample(all_wav_files_caller, 6)

# Use the same filenames for the agent audio files
wav_files_agent = wav_files_caller

json_files = [f.replace(".wav", ".json") for f in wav_files_caller]

### Make function to extract features

In [1]:
### This is the current one below

In [4]:
import librosa.display
from scipy.signal import lfilter, butter # do i need this? i can't remember
from scipy.signal import find_peaks
from pydub import AudioSegment
from pydub.silence import split_on_silence

# Function to extract acoustic features
def extract_acoustic_features(audio_file, start_time, end_time, window_size=0.1, hop_size=0.05):
    # Load the audio file with a specified start and end time
    y, sr = librosa.load(audio_file, sr=None, offset=start_time, duration=end_time - start_time)
    
    # If the audio file is empty, return None
    if len(y) == 0:
        return None
    
    # Set the target sample rate and resample the audio if necessary
    target_sr = 4000
    if sr > target_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    # Calculate window and hop lengths based on sample rate and provided parameters
    window_length = int(window_size * sr)
    hop_length = int(hop_size * sr)

    # Initialize lists to store feature values
    zcr_values = []
    energy_values = []
    spectral_centroid_values = []
    spectral_spread_values = []
    pitch_values = []
    spectral_entropy_values = []
    mfcc_values = []
    formant_values = []

    # Calculate spectral entropy and handle potential errors
    def spectral_entropy(spectrogram, normalize=True):
        psd_norm = np.divide(spectrogram, spectrogram.sum(axis=0, keepdims=True))
        log_psd_norm = -np.log2(psd_norm, where=(psd_norm > 0))
        entropy = np.sum(psd_norm * log_psd_norm, axis=0)
        if normalize:
            entropy /= np.log2(psd_norm.shape[0])
        return entropy 
    

    def formants(y, sr, n_formants=5, order=16):
        epsilon = 1e-8  # Add a small constant value to avoid zero-valued samples
        y = y + epsilon
        lpc = librosa.lpc(y, order=order)
        
        if not np.isfinite(lpc).all():  # Check if lpc contains any inf or nan values
            return [], []  # Return empty lists if inf or nan values are present

        roots = np.roots(lpc)
        roots = roots[np.imag(roots) >= 0]
        angles = np.angle(roots)
        freqs = sorted(sr * angles / (2 * np.pi))
        bandwidths = -0.5 * sr * np.log(np.abs(roots))
        freqs = freqs[:n_formants]
        bandwidths = bandwidths[:n_formants]
        return freqs, bandwidths
    

    # Iterate through the audio signal using a sliding window
    for start in range(0, len(y) - window_length, hop_length):
        # Extract the current window
        end = start + window_length
        y_window = y[start:end]

        # Calculate various features for the current window
        zcr = np.mean(librosa.feature.zero_crossing_rate(y_window))
        energy = np.mean(librosa.feature.rms(y=y_window))
        spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y_window, sr=sr))
        spectral_spread = np.mean(librosa.feature.spectral_bandwidth(y=y_window, sr=sr))

        # Calculate pitch and handle potential errors
         # ! Causing significant errors in pitch and spectral entropy feature extraction. Gives nan values all the time
        try:
            pitch, harmonic_strength, _ = librosa.pyin(y_window, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
            mean_pitch = np.mean(pitch)
        except librosa.util.exceptions.ParameterError:
            pitch = np.nan

        
        
        try:
            spectrogram = np.abs(librosa.stft(y_window, n_fft=2048, hop_length=512))
            entropy = np.mean(spectral_entropy(spectrogram))
            spectral_entropy_values.append(entropy)
        except librosa.util.exceptions.ParameterError:
            spectral_entropy = np.nan

        # Calculate MFCCs for the current window
        mfcc = np.mean(librosa.feature.mfcc(y=y_window, sr=sr, n_mfcc=13), axis=1)

        # Calculate formant values for the current window (Updated)
        formant_frequencies, formant_bandwidths = formants(y_window, sr)
        formant_values.append(formant_frequencies)

        # Append feature values to the corresponding lists
        zcr_values.append(zcr)
        energy_values.append(energy)
        spectral_centroid_values.append(spectral_centroid)
        spectral_spread_values.append(spectral_spread)
        pitch_values.append(pitch)
        spectral_entropy_values.append(spectral_entropy)
        mfcc_values.append(mfcc)

    # Calculate speaking rate by counting the number of audio chunks
    audio = AudioSegment.from_file(audio_file)
    audio_chunks = split_on_silence(audio, min_silence_len=200, silence_thresh=-36)
    speaking_rate = len(audio_chunks) / (end_time - start_time)

    # Calculate jitter and shimmer
    pitch_contour = librosa.yin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7')) 
    jitter = np.mean(np.abs(np.diff(pitch_contour)) / np.mean(pitch_contour)) 
    shimmer = np.mean(np.abs(np.diff(librosa.feature.rms(y=y))) / np.mean(librosa.feature.rms(y=y)))

    
   

    # Combine all extracted features into a single list
    features = [zcr_values, energy_values, spectral_centroid_values, spectral_spread_values, pitch_values, spectral_entropy_values, mfcc_values, speaking_rate, jitter, shimmer, formant_values]
    # Convert the list of features to a NumPy array
    features_array = np.array(features)

    return features_array


In [7]:
"""
# Function to extract acoustic features
def extract_acoustic_features(audio_file, start_time, end_time, window_size=0.1, hop_size=0.05):
    y, sr = librosa.load(audio_file, sr=None, offset=start_time, duration=end_time - start_time)
    
    # Return nothing if length of audio file is 0
    if len(y) == 0:
        return None
    
    # Resample the audio if necessary
    target_sr = 4000
    if sr > target_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    # Calculate window and hop lengths based on sample rate and provided parameters
    window_length = int(window_size * sr)
    hop_length = int(hop_size * sr)

    # Initialize lists to store feature values
    zcr_values = []
    energy_values = []
    spectral_centroid_values = []
    spectral_spread_values = []
    pitch_values = []
    spectral_entropy_values = []

    # Iterate through the audio signal using a sliding window
    for start in range(0, len(y) - window_length, hop_length):
        end = start + window_length
        # Extract the current window
        y_window = y[start:end]

        # Calculate various features for the current window
        zcr = np.mean(librosa.feature.zero_crossing_rate(y_window))
        energy = np.mean(librosa.feature.rms(y=y_window))
        spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y_window, sr=sr))
        spectral_spread = np.mean(librosa.feature.spectral_bandwidth(y=y_window, sr=sr))

        # Calculate pitch and handle potential errors
        # ! Causing significant errors in pitch and spectral entropy feature extraction. Gives nan values all the time
        try:
            pitch = np.mean(librosa.feature.tonnetz(y=y_window, sr=sr))
        except librosa.util.exceptions.ParameterError:
            pitch = np.nan

        try:
            spectral_entropy = np.mean(librosa.feature.spectral_contrast(y=y_window, sr=sr))
        except librosa.util.exceptions.ParameterError:
            spectral_entropy = np.nan

        # Append feature values to the corresponding lists
        zcr_values.append(zcr)
        energy_values.append(energy)
        spectral_centroid_values.append(spectral_centroid)
        spectral_spread_values.append(spectral_spread)
        pitch_values.append(pitch)
        spectral_entropy_values.append(spectral_entropy)

    # Combine all extracted features into a single list
    features = [zcr_values, energy_values, spectral_centroid_values, spectral_spread_values, pitch_values, spectral_entropy_values]
    features_array = np.array(features)

    return features_array

"""

### Transcribe audio with Whisper

In [5]:
# 2. Transcribe audio using Whisper
model = whisper.load_model("base.en")

def transcribe_audio_whisper(audio_file, start_time, end_time):
    # Load the audio file
    y, sr = librosa.load(audio_file, sr=None, offset=start_time, duration=end_time - start_time)
    
    # Save the segmented audio to a temporary file
    temp_file = yo_mama_path + "temp.wav"
    sf.write(temp_file, y, sr)
    
    # !Implement actual Whisper transcription here
    transcript = model.transcribe(temp_file)
    
    # Remove the temporary file
    os.remove(temp_file)

    return transcript

## Download pretrained models for fine tuning

### Word2Vec

In [10]:
# Load pre-trained Word2Vec model
import gensim.downloader as api

# Show all available models in gensim-data
print(list(api.info()['models'].keys()))

word2vec_model = api.load('word2vec-google-news-300')

#word2vec_model = Word2Vec(dataset)  # train w2v model

#word2vec_model = gensim.downloader.load('word2vec-google-news-300')

# For saving and loading Word2Vec model
word2vec_model_path = os.path.join(yo_mama_path, 'word2vec_model')

# Save the model to a file
word2vec_model.save(yo_mama_path + 'word2vec_model')

# Load the saved model
#saved_w2v = gensim.models.Word2Vec.load('word2vec_model')

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


### GloVe Model

In [5]:
# Load pre-trained GloVe model
glove_file = "glove.6B/glove.6B.100d.txt"
glove_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

# Save the GloVe model in the word2vec format
word2vec_glove_file = yo_mama_path + "glove_word2vec.txt"
#glove_model.save_word2vec_format(word2vec_glove_file)

### Bert Model

In [6]:
# Load pre-trained BERT model and tokenizer
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Cosine similarity for comparing embeddings
cosine_similarity = CosineSimilarity(dim=1)

# Save the model and tokenizer to a directory
#bert_model.save_pretrained('bert_model')
#bert_tokenizer.save_pretrained('bert_tokenizer')

# Load the saved model and tokenizer
#saved_model = BertModel.from_pretrained('bert_model')
#saved_tokenizer = BertTokenizer.from_pretrained('bert_tokenizer')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Function to extract linguistic features

In [7]:
import nltk

"""
nltk.download('popular')

"""

nltk.data.path.append(yo_mama_path)
nltk.download('popular', download_dir=yo_mama_path)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt', download_dir=yo_mama_path)
nltk.download('stopwords', download_dir=yo_mama_path)
nltk.download('wordnet', download_dir=yo_mama_path)

def preprocess(text):
    # Tokenize
    words = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return words

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Volumes/YoMama/Rizzerator/model/...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Volumes/YoMama/Rizzerator/model/...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Volumes/YoMama/Rizzerator/model/...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Volumes/YoMama/Rizzerator/model/...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Volumes/YoMama/Rizzerator/model/...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Volumes/YoMama/Rizzerator/m

In [11]:
# 3. Extract linguistic features using Word2Vec, GloVe, and BERT 
def extract_linguistic_features(transcript):
    # Word2Vec features
    # Preprocess the transcript to tokenize and clean the text
    words = preprocess(transcript)
    # Initialize an empty list to store Word2Vec features
    word2vec_features = []

    # Iterate over each word in the preprocessed transcript
    for word in words:
        # If the word is present in the Word2Vec model, append its vector representation to the word2vec_features list
        if word in word2vec_model:
            word2vec_features.append(word2vec_model[word])
        # If the word is not present in the Word2Vec model, append a zero vector of the same dimension as the model's vectors
        else:
            word2vec_features.append(np.zeros(word2vec_model.vector_size))

    # Calculate the mean of the Word2Vec features along axis 0 (i.e., column-wise mean)
    word2vec_features = np.mean(word2vec_features, axis=0)

    # GloVe features
    # Calculate the mean GloVe vector for words in the transcript that are present in the GloVe model
    glove_features = np.mean([glove_model[word] for word in transcript.split() if word in glove_model], axis=0)

    # BERT features
    # Ensure no gradients are calculated for this operation (speeds up computation and reduces memory usage)
    with torch.no_grad():
        # Tokenize the transcript using the BERT tokenizer and convert it to a tensor, then add a batch dimension
        input_ids = torch.tensor(bert_tokenizer.encode(transcript)).unsqueeze(0)
        # Pass the input IDs through the BERT model to get the output
        bert_output = bert_model(input_ids)
        # Extract the mean of the last hidden state tensor along dimension 1 (i.e., average the hidden states of all tokens), then remove the batch dimension
        # ! NGL, I have no idea what this does
        # What is happening here? 
        # The BERT model outputs a tuple of 2 tensors: the last hidden state and the pooler output
        # The last hidden state is a tensor of shape (batch_size, sequence_length, hidden_size)
        # The pooler output is a tensor of shape (batch_size, hidden_size)
        # # The mean of the last hidden state tensor along dimension 1 is a tensor of shape (batch_size, hidden_size)
        # # The squeeze() method removes the batch dimension
        bert_features = bert_output.last_hidden_state.mean(dim=1).squeeze()

    # Return the extracted Word2Vec, GloVe, and BERT features
    return word2vec_features, glove_features, bert_features


### Perform acoustic feature extraction

Current problems:
 - Calculating the "mean of an empty slice" in some cases. What's going on there?
 - RuntimeWarning: invalid value encountered in double_scalars
 - Whisper transcript sometimes not lining up correctly with the provided transcript. Should we just use the provided transcript?
 - Sometimes the provided transcript is "[noise]", other times "[noise][noise]", and other times "[noise][noise][noise]". These all need to be filtered out.

In [12]:
# 4. Extract acoustic features and create data
data = []
count = 0

for wav_file_caller, wav_file_agent, json_file in zip(wav_files_caller, wav_files_agent, json_files):
    with open(transcript_path + json_file) as f:
        json_data = json.load(f)

    human_transcripts = [entry["human_transcript"] for entry in json_data if (entry["human_transcript"] != "[noise]" or entry["human_transcript"] != "[noise][noise]" or entry["human_transcript"] != "[noise][noise][noise]")]
    emotion_scores = [entry["emotion"] for entry in json_data if (entry["human_transcript"] != "[noise]" or entry["human_transcript"] != "[noise][noise]" or entry["human_transcript"] != "[noise][noise][noise]")]
    offset_durations = [(entry["offset_ms"], entry["duration_ms"]) for entry in json_data if (entry["human_transcript"] != "[noise]" or entry["human_transcript"] != "[noise][noise]" or entry["human_transcript"] != "[noise][noise][noise]")]
    speaker_roles = [entry["speaker_role"] for entry in json_data if (entry["human_transcript"] != "[noise]" or entry["human_transcript"] != "[noise][noise]" or entry["human_transcript"] != "[noise][noise][noise]")]

    # Extract acoustic features and linguistic features for each human transcript
    for transcript, emotion_score, (offset_ms, duration_ms), speaker_role in zip(human_transcripts, emotion_scores, offset_durations, speaker_roles):
        start_time = offset_ms / 1000
        end_time = (offset_ms + duration_ms) / 1000

        # Change file path according to speaker role
        if speaker_role == "caller":
            audio_file = data_path_caller + wav_file_caller
        else:
            audio_file = data_path_agent + wav_file_agent
        
        acoustic_features = extract_acoustic_features(audio_file, start_time, end_time)

        if acoustic_features is not None:
            # ! The issue with the whisper transcript is that its not as good as the human_transcript
            """
            whisper_transcript = transcribe_audio_whisper(audio_file, start_time, end_time)
            print(whisper_transcript) # Printing just to see what's going on
            whisper_text = whisper_transcript['text'] # Extract the text portion from the dictionary
            """
            whisper_text = transcript
            word2vec_features, glove_features, bert_features = extract_linguistic_features(whisper_text)

            row = (audio_file, speaker_role, transcript, word2vec_features, glove_features, bert_features, *acoustic_features, emotion_score)
            data.append(row)
    count += 1  # Increment the counter
    print(f"Finished processing: {count}")  # Print the current progress

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = ret.dtype.type(ret / rcount)
  psd_norm = np.divide(spectrogram, spectrogram.sum(axis=0, keepdims=True))
  bandwidths = -0.5 * sr * np.log(np.abs(roots))
  shimmer = np.mean(np.abs(np.diff(librosa.feature.rms(y=y))) / np.mean(librosa.feature.rms(y=y)))
  entropy = np.sum(psd_norm * log_psd_norm, axis=0)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Finished processing: 1


LinAlgError                               Traceback (most recent call last)
Cell In[25], line 25
     22 else:
     23     audio_file = data_path_agent + wav_file_agent
---> 25 acoustic_features = extract_acoustic_features(audio_file, start_time, end_time)
     27 if acoustic_features is not None:
     28     # ! The issue with the whisper transcript is that its not as good as the human_transcript
     29     """
     30     whisper_transcript = transcribe_audio_whisper(audio_file, start_time, end_time)
     31     print(whisper_transcript) # Printing just to see what's going on
     32     whisper_text = whisper_transcript['text'] # Extract the text portion from the dictionary
     33     """

Cell In[13], line 91, in extract_acoustic_features(audio_file, start_time, end_time, window_size, hop_size)
     88 mfcc = np.mean(librosa.feature.mfcc(y=y_window, sr=sr, n_mfcc=13), axis=1)
     90 # Calculate formant values for the current window (Updated)
---> 91 formant_frequencies, formant_bandwidths = formants(y_window, sr)
     92 formant_values.append(formant_frequencies)
     94 # Append feature values to the corresponding lists

Cell In[13], line 48, in extract_acoustic_features..formants(y, sr, n_formants, order)
     46 def formants(y, sr, n_formants=5, order=16):
     47     lpc = librosa.lpc(y, order=order)
---> 48     roots = np.roots(lpc)
...
    207 for a in arrays:
    208     if not isfinite(a).all():
--> 209         raise LinAlgError("Array must not contain infs or NaNs")

LinAlgError: Array must not contain infs or NaNs

### Export to CSV

In [14]:
# 5. Export data to a CSV file
column_names = ["Audio", "SpeakerRole", "Transcript", "Word2Vec", "GloVe", "BERT", "ZCR", "Energy", "SpectralCentroid", "SpectralSpread", "Pitch", "SpectralEntropy", "MFCC", "SpeakingRate", "Jitter", "Shimmer", "FormantValues", "EmotionScore"]
df = pd.DataFrame(data, columns=column_names)
df.to_csv(yo_mama_path + "output.csv")

### Save feature arrays as NumPy files

In [None]:
np.save(yo_mama_path + "zcr_values.npy", np.vstack(df["ZCR"].values))
np.save(yo_mama_path + "energy_values.npy", np.vstack(df["Energy"].values))
np.save(yo_mama_path + "spectral_centroid_values.npy", np.vstack(df["SpectralCentroid"].values))
np.save(yo_mama_path + "spectral_spread_values.npy", np.vstack(df["SpectralSpread"].values))
np.save(yo_mama_path + "pitch_values.npy", np.vstack(df["Pitch"].values))
np.save(yo_mama_path + "spectral_entropy_values.npy", np.vstack(df["SpectralEntropy"].values))