### Import libraries

In [86]:
import json
import tempfile

import soundfile as sf
import whisper

# os is needed right away: the version check below shells out through os.system.
import librosa, os
# When the loaded librosa is not exactly 0.6.2, install that version with pip3.
# NOTE(review): librosa is already imported at this point, so the second
# `import librosa` below presumably returns the cached module rather than the
# freshly installed one; a kernel restart is likely required. Confirm.
# NOTE(review): other cells call librosa.feature.rms and pass orig_sr=/target_sr=
# keywords to librosa.resample; verify that those exist in 0.6.2.
# NOTE(review): `pip3` may not target the interpreter this kernel runs on. Confirm.
if librosa.__version__ != '0.6.2':
    os.system('pip3 install librosa==0.6.2')
    import librosa

import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore", category=UserWarning)



### Select audio files and their matching transcripts

In [80]:
# 1. Pick a random subset of caller recordings and derive the matching
#    agent-audio and transcript file names from them.
data_path_caller = "data/audio/caller/"
data_path_agent = "data/audio/agent/"
transcript_path = "data/transcript/"

# Number of calls to process; raise this for a full run.
N_SAMPLES = 3

# os.listdir returns entries in arbitrary order; sorting gives a stable
# population so a seeded `random` draws the same files on every run.
all_wav_files_caller = sorted(
    f for f in os.listdir(data_path_caller) if f.endswith(".wav")
)

# random.sample raises ValueError when asked for more items than exist,
# so cap the request at the number of available files.
wav_files_caller = random.sample(
    all_wav_files_caller, min(N_SAMPLES, len(all_wav_files_caller))
)

# Use the same filenames for the agent audio files
wav_files_agent = wav_files_caller

# Swap only the extension: str.replace would also rewrite a ".wav" that
# happens to appear earlier in the file name.
json_files = [os.path.splitext(f)[0] + ".json" for f in wav_files_caller]

### Define the acoustic feature extraction function

In [84]:
# Function to extract acoustic features
def extract_acoustic_features(audio_file, start_time, end_time, target_sr=4000):
    """Compute six summary acoustic features for one time slice of an audio file.

    The slice from ``start_time`` to ``end_time`` is loaded at the file's native
    sample rate and downsampled to ``target_sr`` when the native rate is higher.
    Every feature is the mean over the matrix returned by the corresponding
    ``librosa.feature`` function.

    Parameters
    ----------
    audio_file : str
        Path of the audio file to read.
    start_time, end_time : float
        Slice boundaries in seconds.
    target_sr : int or None, optional
        Upper bound on the analysis sample rate (default 4000). ``None``
        disables downsampling.

    Returns
    -------
    tuple
        ``(zcr, energy, spectral_centroid, spectral_spread, pitch,
        spectral_entropy)``. The last two names are kept for callers, but
        ``pitch`` is the mean of ``librosa.feature.tonnetz`` and
        ``spectral_entropy`` is the mean of ``librosa.feature.spectral_contrast``.
        Either is NaN when librosa raises ``ParameterError`` for it. All six are
        NaN when the slice contains no samples.
    """
    y, sr = librosa.load(audio_file, sr=None, offset=start_time, duration=end_time - start_time)

    # A slice that starts past the end of the file comes back empty; the
    # feature functions cannot process that, so report "no data" explicitly.
    if y.size == 0:
        return (np.nan,) * 6

    if target_sr is not None and sr > target_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    # The RMS feature is exposed as `rms` in newer librosa releases and as
    # `rmse` in older ones (including the 0.6.2 pin used by this notebook).
    rms_fn = getattr(librosa.feature, "rms", None)
    if rms_fn is None:
        rms_fn = librosa.feature.rmse

    zcr = np.mean(librosa.feature.zero_crossing_rate(y))
    energy = np.mean(rms_fn(y=y))
    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    spectral_spread = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))

    # NOTE(review): at the default 4000 Hz analysis rate the default frequency
    # ranges of tonnetz / spectral_contrast may exceed Nyquist, in which case the
    # two features below are always NaN. Confirm, and if so raise target_sr or
    # pass narrower frequency ranges.
    try:
        pitch = np.mean(librosa.feature.tonnetz(y=y, sr=sr))
    except librosa.util.exceptions.ParameterError:
        pitch = np.nan

    try:
        spectral_entropy = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr))
    except librosa.util.exceptions.ParameterError:
        spectral_entropy = np.nan

    return zcr, energy, spectral_centroid, spectral_spread, pitch, spectral_entropy

### Transcribe audio with Whisper

In [66]:
# 2. Transcribe audio using Whisper
# NOTE(review): librosa.output looks to be missing from newer librosa releases,
# in which case this import fails. Confirm against the installed version.
import librosa.output
# Load the Whisper checkpoint named "base.en" once at module level;
# transcribe_audio_whisper reads this global `model` on every call.
model = whisper.load_model("base.en")

def transcribe_audio_whisper(audio_file, start_time, end_time):
    """Transcribe one time slice of an audio file with the module-level Whisper model.

    The slice from ``start_time`` to ``end_time`` is loaded at the file's native
    sample rate, written to a uniquely named temporary WAV file, and that file
    is passed to ``model.transcribe``. The temporary file is removed even when
    writing or transcription fails.

    Parameters
    ----------
    audio_file : str
        Path of the audio file to read.
    start_time, end_time : float
        Slice boundaries in seconds.

    Returns
    -------
    Whatever ``model.transcribe`` returns for the slice, unmodified.
    """
    # Load the audio slice
    y, sr = librosa.load(audio_file, sr=None, offset=start_time, duration=end_time - start_time)

    # Reserve a unique path instead of a fixed "temp.wav" in the working
    # directory, which could overwrite an unrelated file of that name.
    fd, temp_file = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # only the path is needed; soundfile reopens it by name
    try:
        # soundfile is used for writing because librosa.output.write_wav is
        # not available in every librosa release.
        sf.write(temp_file, y, sr)
        return model.transcribe(temp_file)
    finally:
        # Always clean up, including when writing or transcription raises.
        os.remove(temp_file)

### Function to extract linguistic features

In [67]:
# 3. Extract linguistic features using Word2Vec, GloVe, and BERT (replace with actual feature extraction code)
def extract_linguistic_features(transcript):
    """Placeholder for linguistic feature extraction.

    ``transcript`` is accepted but not inspected yet. The function returns a
    fixed 3-tuple of marker strings standing in for the Word2Vec, GloVe and
    BERT features, in that order.
    """
    # TODO: replace the markers below with real embeddings of `transcript`.
    placeholder_features = (
        "word2vec_features",
        "glove_features",
        "bert_features",
    )
    return placeholder_features

### Extract features and transcripts for every utterance

In [87]:
# 4. Extract acoustic features and create data
# Each row holds: caller file, agent file, human transcript, Whisper transcript,
# three linguistic feature slots, six caller acoustic features, six agent
# acoustic features, and the emotion label.
data = []
count = 0  # Number of calls processed so far; stays 0 when there are no files

for count, (wav_file_caller, wav_file_agent, json_file) in enumerate(
    zip(wav_files_caller, wav_files_agent, json_files), start=1
):
    caller_audio = data_path_caller + wav_file_caller
    agent_audio = data_path_agent + wav_file_agent

    # Explicit encoding so the result does not depend on the platform default.
    with open(transcript_path + json_file, encoding="utf-8") as f:
        json_data = json.load(f)

    # One pass over the utterances, skipping those transcribed as pure noise.
    for entry in json_data:
        transcript = entry["human_transcript"]
        if transcript == "[noise]":
            continue

        emotion_score = entry["emotion"]
        offset_ms = entry["offset_ms"]
        duration_ms = entry["duration_ms"]

        # The transcript stores milliseconds; the audio helpers take seconds.
        start_time = offset_ms / 1000
        end_time = (offset_ms + duration_ms) / 1000

        # The same time window is applied to both channels of the call.
        acoustic_features_caller = extract_acoustic_features(caller_audio, start_time, end_time)
        acoustic_features_agent = extract_acoustic_features(agent_audio, start_time, end_time)

        # model.transcribe returns a result dict; keep only its text so the
        # "WhisperTranscript" column is comparable to the human transcript
        # rather than holding a dict repr.
        whisper_result = transcribe_audio_whisper(caller_audio, start_time, end_time)
        if isinstance(whisper_result, dict):
            whisper_transcript = whisper_result["text"].strip()
        else:
            whisper_transcript = whisper_result

        word2vec_features, glove_features, bert_features = extract_linguistic_features(whisper_transcript)

        row = (
            wav_file_caller,
            wav_file_agent,
            transcript,
            whisper_transcript,
            word2vec_features,
            glove_features,
            bert_features,
            *acoustic_features_caller,
            *acoustic_features_agent,
            emotion_score,
        )
        data.append(row)

    print(f"Finished processing: {count}")  # Print the current progress

Finished processing: 1


AttributeError: module 'soundfile' has no attribute 'SoundFileRuntimeError'

### Export to CSV

In [62]:
# 5. Export data to a CSV file
# Column order must mirror the tuple layout of each row in `data`.
column_names = [
    # Source files and text
    "CallerAudio",
    "AgentAudio",
    "Transcript",
    "WhisperTranscript",
    # Linguistic feature slots
    "Word2Vec",
    "GloVe",
    "BERT",
    # Caller-channel acoustic features
    "CallerZCR",
    "CallerEnergy",
    "CallerSpectralCentroid",
    "CallerSpectralSpread",
    "CallerPitch",
    "CallerSpectralEntropy",
    # Agent-channel acoustic features
    "AgentZCR",
    "AgentEnergy",
    "AgentSpectralCentroid",
    "AgentSpectralSpread",
    "AgentPitch",
    "AgentSpectralEntropy",
    # Label
    "EmotionScore",
]
df = pd.DataFrame(data, columns=column_names)
# Written with pandas defaults, so the row index is included as the first column.
df.to_csv("output.csv")