# Feature Extraction
In the following cells, we iterate over the **recordings** in the Alcohol Language Corpus (ALC). Note that one _session_ contains several recordings, each usually a few seconds long.

For each recording, we use the openSMILE toolkit to extract eGeMAPSv02 functional features. The results are written to a single CSV file in which each line represents one recording from the ALC and contains its features and its label (alcoholized/sober).

In [1]:
import opensmile, audiofile, os, json
import pandas as pd
from tqdm import tqdm

In [2]:
# Root directory of the ALC recordings (one sub-directory per session).
# The location can be overridden with the ALC_PATH environment variable so the
# notebook is not tied to one machine. os.path.join(..., '') guarantees a
# trailing path separator, because later cells build file paths by plain
# string concatenation.
ALCpath = os.path.join(
    os.environ.get(
        'ALC_PATH',
        '/mount/studenten/arbeitsdaten-studenten1/team-lab-phonetics/2023/data/ALC/',
    ),
    '',
)
# Input: metadata table that is loaded into `df1`.
dataframe_path = '../data/dataframe.csv'
# Output: feature table produced by this notebook.
dataset_path = '../data/dataset_large.csv'

In [3]:
# Load the metadata table (one row per utterance) and preview the first two rows.
df1 = pd.read_csv(dataframe_path)
df1.head(2)

Unnamed: 0,session,utterance,utt,spn,o_utt,item,o_item,alc,sex,age,...,aak,bak,ges,ces,wea,irreg,anncom,specom,type,content
0,ses4038,5444038020_h_00,5444038020,544,5443046000.0,20,20.0,na,M,27,...,0.0,0.0,f5,r1,SUN,1|0|0|0|0|1|0|0|0,,,R,T
1,ses4038,5444038033_h_00,5444038033,544,,33,,na,M,27,...,0.0,0.0,f5,r1,SUN,0|0|0|0|0|0|0|0|0,,,R,A


In [4]:
# Binary target: 'cna' and 'a' are treated as the same class (1), 'na' is 0.
# The identity entries (1 -> 1, 0 -> 0) make this cell safe to re-run: without
# them a second execution would map the already-numeric labels to NaN.
d = {'cna': 1, 'a': 1, 'na': 0, 1: 1, 0: 0}
df1['alc'] = df1['alc'].map(d)  # any label missing from `d` becomes NaN
df1.head(2)

Unnamed: 0,session,utterance,utt,spn,o_utt,item,o_item,alc,sex,age,...,aak,bak,ges,ces,wea,irreg,anncom,specom,type,content
0,ses4038,5444038020_h_00,5444038020,544,5443046000.0,20,20.0,0,M,27,...,0.0,0.0,f5,r1,SUN,1|0|0|0|0|1|0|0|0,,,R,T
1,ses4038,5444038033_h_00,5444038033,544,,33,,0,M,27,...,0.0,0.0,f5,r1,SUN,0|0|0|0|0|0|0|0|0,,,R,A


In [5]:
# Read the ordered list of output column names; the file is expected to hold
# the names separated by ', '.
with open('../data/columns.txt', 'r') as f:
    # strip() first so a trailing newline does not end up inside the last
    # column name (which would break the later column selection by name).
    cols = f.read().strip().split(', ')

In [6]:
# Set RESUME = True to continue from a previously saved feature table instead
# of starting from an empty one (replaces un-commenting a read_csv line by hand).
RESUME = False

if RESUME and os.path.exists(dataset_path):
    # Continue from the checkpoint written by the extraction loop.
    df = pd.read_csv(dataset_path)
else:
    # Fresh start: empty table that already has the final column layout.
    df = pd.DataFrame(columns=cols)
df.head()

Unnamed: 0,full_path,session,utterance,utt,alc,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp


In [7]:
# Lazily created extractor shared by all calls to extract_features().
_SMILE = None


def extract_features(rec, smile=None):
    """Extract eGeMAPSv02 functionals for a single audio file.

    Parameters
    ----------
    rec : str
        Path of the audio file to read.
    smile : opensmile.Smile, optional
        Extractor to use. When omitted, one eGeMAPSv02 / Functionals extractor
        is created on first use and reused by later calls, so the extractor is
        not rebuilt for every recording.

    Returns
    -------
    Whatever ``smile.process_signal`` returns for the file's signal; in this
    notebook that is a one-row DataFrame indexed by (start, end).
    """
    global _SMILE
    if smile is None:
        if _SMILE is None:
            _SMILE = opensmile.Smile(
                feature_set=opensmile.FeatureSet.eGeMAPSv02,
                feature_level=opensmile.FeatureLevel.Functionals,
            )
        smile = _SMILE

    # always_2d=True is passed so the signal is returned as a 2-D array.
    signal, sampling_rate = audiofile.read(rec, always_2d=True)
    return smile.process_signal(signal, sampling_rate)

In [8]:
# Size of the input metadata table as (rows, columns).
df1.shape

(15180, 22)

In [9]:
# Disabled pre-check, kept for reference: it splits the expected .wav paths
# into those that exist on disk (wav_exists) and those that do not (errors).
# wav_exists = []
# errors = []
# for i, row in tqdm(df1.iterrows()):
#     cur_path = ALCpath + row['session'] + f"/{row['utterance']}" + '.wav'
#     if os.path.exists(cur_path):
#         wav_exists.append(cur_path)
#     else:
#         errors.append(cur_path)

In [9]:
# Extract features for every recording listed in df1 and collect them in `df`.
# Paths whose .wav file does not exist are collected in `errors`.
errors = []

# Paths that already have a row in `df` (non-empty when continuing from a saved
# table). Built once and updated incrementally instead of being rebuilt from
# the DataFrame on every iteration.
done_paths = set(df['full_path'].tolist())
# New rows are buffered and appended to `df` in batches; concatenating after
# every single recording would copy the whole table each time.
pending_rows = []

for i, row in tqdm(df1.iterrows(), total=len(df1)):
    cur_path = f"{ALCpath}{row['session']}/{row['utterance']}.wav"
    if cur_path in done_paths:
        continue  # features for this file were already extracted

    if os.path.exists(cur_path):
        new_row = extract_features(cur_path).reset_index(drop=True)
        new_row['full_path'] = cur_path
        # Copy the identifying metadata and the label one column at a time
        # (explicit scalar assignment, no reliance on Series alignment).
        for col in ('session', 'utterance', 'utt', 'alc'):
            new_row[col] = row[col]
        new_row = new_row[cols]  # enforce the final column order
        pending_rows.append(new_row)
        done_paths.add(cur_path)
    else:
        errors.append(cur_path)

    # Periodic checkpoint so a crash does not lose hours of extraction.
    if i % 100 == 0:
        if pending_rows:
            df = pd.concat([df, *pending_rows], ignore_index=True)
            pending_rows = []
        df.to_csv(dataset_path, index=False)

# Append whatever was extracted after the last checkpoint.
if pending_rows:
    df = pd.concat([df, *pending_rows], ignore_index=True)
    pending_rows = []
        

15180it [4:48:17,  1.14s/it]


In [10]:
# Size of the collected feature table as (rows, columns).
df.shape

(15180, 93)

In [11]:
# Final save of the complete feature table; uses the configured output path
# instead of repeating the literal, and omits the index column.
df.to_csv(dataset_path, index=False)

In [31]:
# Preview the first two rows of the feature table.
df.head(2)

Unnamed: 0,full_path,session,utterance,utt,alc,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
0,/mount/studenten/arbeitsdaten-studenten1/team-...,ses4038,5444038020_h_00,5444038020,0,27.536736,0.236611,24.957035,27.387535,30.500528,...,-0.024768,-0.007154,0.121985,4.028777,2.898551,0.126,0.074726,0.207895,0.208821,-28.878849
1,/mount/studenten/arbeitsdaten-studenten1/team-...,ses4038,5444038033_h_00,5444038033,0,25.317753,0.289597,22.173527,25.276594,27.43298,...,-0.021063,-0.005162,0.065592,3.837472,2.739726,0.145833,0.152067,0.18,0.295583,-29.606325


In [49]:
# NOTE(review): `k` is not defined in any cell visible here, so this cell
# raises NameError on a fresh kernel. It looks like leftover debugging of the
# column selection; define `k` or remove the cell.
new_row.reset_index(drop=True)[k.columns]

Unnamed: 0_level_0,Unnamed: 1_level_0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,session,utterance,utt,alc
start,end,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0 days,0 days 00:00:08.940000,26.727541,0.146232,24.222298,27.345308,29.597071,5.374773,279.819427,300.203003,140.060852,134.58313,...,2.364865,0.190952,0.122859,0.202727,0.263166,-28.521378,ses4038,5444038009_h_00,5444038009,0


In [32]:
# Debug view of the most recently built `new_row` with a fresh 0-based index
# (the result is displayed only; `new_row` itself is not modified).
new_row.reset_index(drop=True)

Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,session,utterance,utt,alc
0,26.727541,0.146232,24.222298,27.345308,29.597071,5.374773,279.819427,300.203003,140.060852,134.58313,...,2.364865,0.190952,0.122859,0.202727,0.263166,-28.521378,ses4038,5444038007_h_00,5444038007,0


In [24]:
# Show the 'long_path' value stored under index label 0.
# NOTE(review): no visible cell creates a 'long_path' column; presumably it
# comes from dataframe.csv. Confirm.
df1['long_path'][0]

'/mount/studenten/arbeitsdaten-studenten1/team-lab-phonetics/2023/data/ALC/ses4038/5444038020_h_00_annot.json'

In [23]:
# Bare string literal: this cell only echoes the path as its output.
'/mount/studenten/arbeitsdaten-studenten1/team-lab-phonetics/2023/data/ALC/ses4038/5444038024_h_00_annot.json'

'/mount/studenten/arbeitsdaten-studenten1/team-lab-phonetics/2023/data/ALC/ses4038/5444038024_h_00_annot.json'