In [2]:
import os
import pandas as pd
import librosa

# Locations are relative to the notebook's working directory. They are built
# with os.path.join so they resolve on every OS (a hard-coded "\" separator
# only works on Windows).
crema_path = os.path.join('..', '..', 'DataSet')   # folder scanned for the .wav clips
data = []                                          # one metadata dict per clip, filled by the scan loop
savingFilePath = os.path.join('..', "Analytics")   # folder that receives the generated CSV reports

# Sentence code (second "_"-separated field of a clip's file name) -> spoken sentence.
# Every sentence ends with a period so the 'Sentence' column and the per-sentence
# group-by keys are formatted consistently.
sentence_map = {
    'DFA': "Don't forget a jacket.",
    'IEO': "It's eleven o'clock.",
    'IOM': "I'm on my way to the meeting.",
    'ITH': "I think I have a doctor's appointment.",
    'ITS': "I think I have seen this before.",
    'IWL': "I would like a new alarm clock.",
    'IWW': "I wonder what this is about.",
    'MTI': "Maybe tomorrow it will be cold.",
    'TAI': "The airplane is almost full.",
    'TIE': "That is exactly what happened.",
    'TSI': "The surface is slick.",
    'WSI': "We'll stop in a couple of minutes."
}

# Emotion code (third "_"-separated field of a clip's file name) -> readable label.
emotion_map = dict(
    ANG='Angry',
    DIS='Disgust',
    FEA='Fear',
    HAP='Happy',
    NEU='Neutral',
    SAD='Sad',
)

# Actor IDs labelled as female, kept as strings because they are compared with
# the ID text taken straight from the file name. Any ID not listed here is
# labelled 'Male' by the scan loop.
female_id_list = (
    "1002 1003 1004 1006 1007 1008 1009 1010 1012 1013 1018 "
    "1020 1021 1024 1025 1028 1029 1030 1037 1043 1046 1047 "
    "1049 1052 1053 1054 1055 1056 1058 1060 1061 1063 1072 "
    "1073 1074 1075 1076 1078 1079 1082 1084 1089 1091"
).split()

# Build one metadata record per clip and export the table as CSV.
# File names are expected to look like
#     <actor id>_<sentence code>_<emotion code>_<intensity>.wav
female_ids = set(female_id_list)  # hoisted: constant-time membership test inside the loop

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the exported CSV reproducible from run to run.
for file in sorted(os.listdir(crema_path)):
    if not file.endswith(".wav"):
        continue

    # Drop the extension before splitting so the last field is the bare intensity code.
    stem, _ = os.path.splitext(file)
    parts = stem.split('_')
    actor_id = parts[0]
    sentence_code = parts[1]
    emotion_code = parts[2]
    intensity = parts[3]
    if intensity in ["X", "XX"]:
        # "X"/"XX" are reported as "Unspecified" in the exported table.
        intensity = "Unspecified"

    # sr=None keeps the file's native sampling rate, so samples / rate is the
    # clip length in seconds.
    file_path = os.path.join(crema_path, file)
    audio, sr = librosa.load(file_path, sr=None)
    duration = len(audio) / sr

    sex = 'Female' if actor_id in female_ids else 'Male'

    data.append({
        'File Name': file,
        'Actor ID': int(actor_id),
        'Sex': sex,
        'Sentence Code': sentence_code,
        # Unknown sentence codes fall back to the raw code; unknown emotion
        # codes raise KeyError so a mislabelled file is noticed immediately.
        'Sentence': sentence_map.get(sentence_code, sentence_code),
        'Emotion': emotion_map[emotion_code],
        'Intensity': intensity,
        'Duration (s)': round(duration, 2)
    })

df = pd.DataFrame(data)

# Create the report folder on first run; to_csv does not create missing directories.
os.makedirs(savingFilePath, exist_ok=True)
df.to_csv(os.path.join(savingFilePath, "crema_d_dataset.csv"), index=False)

In [3]:
# Pull the clip-length column out of the metadata table and display its
# summary statistics (count, mean, std, min, quartiles, max).
df_voice_duration = df.loc[:, "Duration (s)"]

df_voice_duration.describe()

count    7442.000000
mean        2.543600
std         0.505569
min         1.270000
25%         2.200000
50%         2.500000
75%         2.840000
max         5.000000
Name: Duration (s), dtype: float64

In [4]:
# Number of clips for every (emotion, intensity) combination, exported as CSV.
pair_sizes = df.groupby(['Emotion', 'Intensity']).size()
emotion_intensity_counts = pair_sizes.reset_index(name='Count')

# NOTE(review): the file name is misspelt ("entensity"); it is kept unchanged
# here because other consumers may already read the file under this name.
report_path = os.path.join(savingFilePath, "emotion_entensity_count.csv")
emotion_intensity_counts.to_csv(report_path, index=False)

In [5]:
# Number of clips for every (sentence, emotion) combination, exported as CSV.
pair_sizes = df.groupby(['Sentence', 'Emotion']).size()
emotion_per_sentence = pair_sizes.reset_index(name='Count')

report_path = os.path.join(savingFilePath, "sentence_emotion_count.csv")
emotion_per_sentence.to_csv(report_path, index=False)


In [6]:
# Number of clips per emotion label, exported as CSV.
group_sizes = df.groupby(['Emotion']).size()
emotion_counts = group_sizes.reset_index(name='Count')

report_path = os.path.join(savingFilePath, "emotions_count.csv")
emotion_counts.to_csv(report_path, index=False)

In [7]:
# Number of clips per spoken sentence, exported as CSV.
group_sizes = df.groupby(['Sentence']).size()
sentence = group_sizes.reset_index(name='Count')

report_path = os.path.join(savingFilePath, "sentences_count.csv")
sentence.to_csv(report_path, index=False)