In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import librosa
from IPython.display import Audio

In [18]:
# Read the pickled waveform DataFrame from disk and show it.
# NOTE(review): the path is absolute and machine-specific, and pickle.load can
# execute arbitrary code, so only point this at a file you trust.
wav_pickle_path = '/Users/maryam/code/MaryamS-61/phoneme_classification/raw_data/Crema_wav_8000.pkl'
with open(wav_pickle_path, mode='rb') as pkl_in:
    loaded_df = pickle.load(pkl_in)
loaded_df

Unnamed: 0,Filepath,Label,FileNames,Durations,RawAudio
0,/content/drive/MyDrive/SpeechEmotionDetection/...,SAD,1081_ITH_SAD_XX.wav,2.402375,"[-0.0040296167, -0.0063531175, -0.005278146, -..."
1,/content/drive/MyDrive/SpeechEmotionDetection/...,ANG,1081_IEO_ANG_LO.wav,2.369000,"[0.0025463065, 0.0035758591, 0.003084241, 0.00..."
2,/content/drive/MyDrive/SpeechEmotionDetection/...,NEU,1079_TSI_NEU_XX.wav,2.502500,"[0.0019027907, 0.0035785118, 0.0033085535, 0.0..."
3,/content/drive/MyDrive/SpeechEmotionDetection/...,HAP,1080_IEO_HAP_LO.wav,2.168813,"[-0.005338631, -0.008408369, -0.009417038, -0...."
4,/content/drive/MyDrive/SpeechEmotionDetection/...,SAD,1079_IEO_SAD_HI.wav,2.469125,"[-0.002982853, -0.00370352, -0.0032668808, -0...."
...,...,...,...,...,...
7437,/content/drive/MyDrive/SpeechEmotionDetection/...,HAP,1007_IWL_HAP_XX.wav,3.203187,"[0.004853568, 0.0076506957, 0.0071736197, 0.00..."
7438,/content/drive/MyDrive/SpeechEmotionDetection/...,DIS,1008_IEO_DIS_LO.wav,2.635937,"[0.00041006063, 0.0012284823, 0.002319729, 0.0..."
7439,/content/drive/MyDrive/SpeechEmotionDetection/...,SAD,1006_TSI_SAD_XX.wav,3.069750,"[0.0074983486, 0.010029516, 0.007830538, 0.008..."
7440,/content/drive/MyDrive/SpeechEmotionDetection/...,ANG,1006_TAI_ANG_XX.wav,3.203187,"[7.5466116e-05, -8.896645e-05, -0.00049790763,..."


In [52]:
# Abandoned experiment, kept for reference only: flatten every zero-padded
# waveform into one wide CSV row. The CSV ended up taking more space than the
# pickle. Written as real comments (not a triple-quoted string) so the cell
# does not evaluate and echo a string as its output.
#
# x = []
# for i in range(len(loaded_df)):
#     feature = loaded_df['ZeroPadded'].iloc[i]
#     for j in feature:
#         x.append(j)
#
# x = np.array(x)
# x = x.reshape((loaded_df.shape[0], loaded_df.ZeroPadded[0].shape[0]))
# df = pd.DataFrame(x)
# df['Label'] = loaded_df.Label
# df.to_csv('Crema_wav_zeropadded.csv', index=False)

# Computing Spectrograms

In [12]:
# Build one dB-scaled magnitude spectrogram per clip.
# The column values are iterated directly instead of being looked up by label
# with loaded_df.RawAudio[n], so this keeps working when the index is not
# exactly 0..N-1 (for example after rows have been filtered out).
spectrograms = []
for y in loaded_df['RawAudio']:
    # Short-Time Fourier Transform (STFT) with librosa's default parameters
    D = librosa.stft(y)

    # Magnitude (amplitude) of the complex STFT
    S = np.abs(D)

    # Convert to decibels relative to this clip's own maximum (ref=np.max)
    db_magnitude = librosa.amplitude_to_db(S, ref=np.max)
    spectrograms.append(db_magnitude)

In [13]:
# Remove the path, duration and waveform columns now that the spectrograms
# have been computed. One drop call with errors='ignore' means re-running the
# cell, or running it on a frame that lacks one of these columns, no longer
# raises a KeyError.
loaded_df = loaded_df.drop(
    columns=['Filepath', 'Durations', 'ZeroPadded', 'RawAudio'],
    errors='ignore',
)

In [14]:
# Attach the per-clip spectrograms (one array per row) as a new column.
loaded_df = loaded_df.assign(Spectrograms=spectrograms)

In [15]:
# Display the frame (pandas truncates the repr) to confirm that the
# 'Spectrograms' column is present.
loaded_df

Unnamed: 0,Label,FileNames,Spectrograms
0,SAD,1081_ITH_SAD_XX.wav,"[[-50.85168, -37.53515, -48.915, -62.227776, -..."
1,ANG,1081_IEO_ANG_LO.wav,"[[-47.93593, -80.0, -53.17414, -56.48402, -52...."
2,NEU,1079_TSI_NEU_XX.wav,"[[-38.109177, -48.644703, -53.782932, -46.1585..."
3,HAP,1080_IEO_HAP_LO.wav,"[[-59.201244, -68.94307, -51.2046, -48.89463, ..."
4,SAD,1079_IEO_SAD_HI.wav,"[[-40.663616, -47.30068, -43.389248, -35.18830..."
...,...,...,...
7437,HAP,1007_IWL_HAP_XX.wav,"[[-48.35008, -59.10016, -61.12514, -56.870613,..."
7438,DIS,1008_IEO_DIS_LO.wav,"[[-54.37221, -53.520206, -46.53382, -50.349686..."
7439,SAD,1006_TSI_SAD_XX.wav,"[[-31.927896, -34.087387, -80.0, -60.489388, -..."
7440,ANG,1006_TAI_ANG_XX.wav,"[[-48.0942, -51.274445, -70.236664, -59.672203..."


In [16]:
# Step 1: find the widest spectrogram, i.e. the largest size along axis 1
spec_widths = loaded_df['Spectrograms'].map(lambda spec: spec.shape[1])
max_shape1_size = spec_widths.max()
max_shape1_size

79

In [17]:
# Step 2: Pad each array along axis 1 to match the maximum size
def zero_pad_to_max(arr, max_size, pad_value=0.0):
    """Right-pad a 2-D array along axis 1 so it has exactly ``max_size`` columns.

    Parameters
    ----------
    arr : array-like
        2-D input. Its values are copied into the leftmost columns of the result.
    max_size : int
        Number of columns in the returned array.
    pad_value : float, optional
        Value written into the added columns. Defaults to ``0.0``, which is
        plain zero padding. For dB-scaled spectrograms computed relative to
        their own peak, 0 is the loudest level, so a caller may prefer a floor
        value such as the array's minimum.

    Returns
    -------
    numpy.ndarray
        A new float64 array of shape ``(arr.shape[0], max_size)``. The input
        is not modified.

    Raises
    ------
    ValueError
        If ``arr`` is not 2-D, or if it already has more than ``max_size``
        columns (this function never truncates).
    """
    arr = np.asarray(arr)
    if arr.ndim != 2:
        raise ValueError(f"expected a 2-D array, got {arr.ndim} dimension(s)")

    n_rows, n_cols = arr.shape
    if n_cols > max_size:
        raise ValueError(
            f"array has {n_cols} columns, which exceeds max_size={max_size}"
        )

    # float64 output matches what np.zeros produced in the original version.
    padded_arr = np.full((n_rows, max_size), pad_value, dtype=np.float64)
    padded_arr[:, :n_cols] = arr
    return padded_arr

In [18]:
# Pad every spectrogram to the shared width and store the result in its own column.
loaded_df['ZeroPaddedSpec'] = loaded_df['Spectrograms'].apply(
    lambda spec: zero_pad_to_max(spec, max_shape1_size)
)

In [24]:
# Spot-check one arbitrary row: its second dimension should equal max_shape1_size.
sample_padded_spec = loaded_df['ZeroPaddedSpec'][1587]
sample_padded_spec.shape

(1025, 79)

In [25]:
# Persist the frame, including the spectrogram columns, next to the notebook.
# NOTE(review): both the unpadded and the padded arrays are written; confirm
# that both are needed, since this makes the file larger.
with open('Crema_Spectrogram_8000.pkl', mode='wb') as pkl_out:
    pickle.dump(loaded_df, pkl_out)

# Computing MFCCs

In [26]:
# Reload the waveform DataFrame from the pickle, replacing the frame that the
# spectrogram cells modified.
# NOTE(review): the path is absolute and machine-specific, and pickle.load can
# execute arbitrary code, so only point this at a file you trust.
wav_pickle_path = '/Users/maryam/code/MaryamS-61/phoneme_classification/raw_data/Crema_wav_8000.pkl'
with open(wav_pickle_path, mode='rb') as pkl_in:
    loaded_df = pickle.load(pkl_in)

In [28]:
# Sampling rate passed to librosa; constant, so it is set once instead of on
# every loop iteration (presumably the 8000 in the pickle's file name — confirm).
sr = 8000

# One MFCC matrix per clip, computed from the 'ZeroPadded' waveform column.
# n_mfcc is not passed, so librosa's default number of coefficients is used.
# The column values are iterated directly rather than looked up by label, so
# this does not depend on the index being exactly 0..N-1.
mfccs = [librosa.feature.mfcc(y=y, sr=sr) for y in loaded_df['ZeroPadded']]


In [29]:
# Remove the path, duration and waveform columns now that the MFCCs have been
# computed. One drop call with errors='ignore' means re-running the cell, or
# running it on a frame that lacks one of these columns, no longer raises a
# KeyError.
loaded_df = loaded_df.drop(
    columns=['Filepath', 'Durations', 'ZeroPadded', 'RawAudio'],
    errors='ignore',
)

In [30]:
# Attach the per-clip MFCC matrices (one array per row) as a new column.
loaded_df = loaded_df.assign(MFCCs=mfccs)

In [31]:
# Widest MFCC matrix, i.e. the largest size along axis 1
mfcc_widths = loaded_df['MFCCs'].map(lambda coeffs: coeffs.shape[1])
max_shape1_size = mfcc_widths.max()
max_shape1_size

79

In [32]:
# Pad every MFCC matrix to the shared width and store the result in its own column.
loaded_df['ZeroPaddedMFCCs'] = loaded_df['MFCCs'].apply(
    lambda coeffs: zero_pad_to_max(coeffs, max_shape1_size)
)

In [33]:
# Display the frame (pandas truncates the repr) to confirm that the 'MFCCs'
# and 'ZeroPaddedMFCCs' columns are present.
loaded_df

Unnamed: 0,Label,FileNames,MFCCs,ZeroPaddedMFCCs
0,SAD,1081_ITH_SAD_XX.wav,"[[-470.74683, -442.41135, -439.28793, -425.882...","[[-470.746826171875, -442.4113464355469, -439...."
1,ANG,1081_IEO_ANG_LO.wav,"[[-477.07513, -443.85883, -442.00867, -325.783...","[[-477.07513427734375, -443.85882568359375, -4..."
2,NEU,1079_TSI_NEU_XX.wav,"[[-465.94504, -439.0108, -439.37466, -441.0335...","[[-465.9450378417969, -439.01080322265625, -43..."
3,HAP,1080_IEO_HAP_LO.wav,"[[-458.74774, -433.89124, -434.67215, -437.715...","[[-458.74774169921875, -433.8912353515625, -43..."
4,SAD,1079_IEO_SAD_HI.wav,"[[-472.89673, -443.3953, -441.54398, -438.3957...","[[-472.896728515625, -443.3952941894531, -441...."
...,...,...,...,...
7437,HAP,1007_IWL_HAP_XX.wav,"[[-451.7584, -431.30545, -432.76337, -429.9307...","[[-451.7583923339844, -431.3054504394531, -432..."
7438,DIS,1008_IEO_DIS_LO.wav,"[[-496.57794, -457.69095, -452.06445, -453.168...","[[-496.57794189453125, -457.6909484863281, -45..."
7439,SAD,1006_TSI_SAD_XX.wav,"[[-448.04645, -434.92166, -435.18106, -431.146...","[[-448.04644775390625, -434.9216613769531, -43..."
7440,ANG,1006_TAI_ANG_XX.wav,"[[-471.9001, -434.3496, -433.11316, -436.66855...","[[-471.90008544921875, -434.349609375, -433.11..."


In [36]:
# Persist the frame, including the MFCC columns, next to the notebook.
with open('Crema_MFCC_8000.pkl', mode='wb') as pkl_out:
    pickle.dump(loaded_df, pkl_out)