# Feature Extraction
In the following cells, we iterate over the **recordings** in the ALC. Note that one _session_ contains several recordings, each usually a couple of seconds long.

For each recording, we use the openSMILE toolkit to extract eGeMAPS (v02) low-level descriptors. The results are collected in a single CSV file: each row represents one recording from the ALC and contains its features together with the label (intoxicated/sober).

In [13]:
import opensmile, audiofile, os, json
import pandas as pd
from tqdm import tqdm

In [14]:
# Root directory of the ALC recordings; the extraction code appends
# '<session>/<utterance>.wav' to it.
ALCpath = '/mount/studenten/arbeitsdaten-studenten1/team-lab-phonetics/2023/data/ALC/'
# CSV with the per-utterance metadata; loaded into `df1`.
dataframe_path = '../data/dataframe.csv'
# NOTE(review): not referenced by any cell visible in this section — confirm
# it is still needed.
dataset_path = '../data/dataset_large.csv'

In [3]:
# Load the per-utterance metadata table and preview its first two rows.
df1 = pd.read_csv(dataframe_path)
df1.head(2)

Unnamed: 0,session,utterance,utt,spn,o_utt,item,o_item,alc,sex,age,...,aak,bak,ges,ces,wea,irreg,anncom,specom,type,content
0,ses4038,5444038020_h_00,5444038020,544,5443046000.0,20,20.0,na,M,27,...,0.0,0.0,f5,r1,SUN,1|0|0|0|0|1|0|0|0,,,R,T
1,ses4038,5444038033_h_00,5444038033,544,,33,,na,M,27,...,0.0,0.0,f5,r1,SUN,0|0|0|0|0|0|0|0|0,,,R,A


In [4]:
# Binary label encoding: 'a' and 'cna' are treated as the same class (1),
# 'na' is the other class (0). Labels missing from the table become NaN.
d = dict(cna=1, a=1, na=0)
df1['alc'] = df1['alc'].map(d)
df1.head(2)

Unnamed: 0,session,utterance,utt,spn,o_utt,item,o_item,alc,sex,age,...,aak,bak,ges,ces,wea,irreg,anncom,specom,type,content
0,ses4038,5444038020_h_00,5444038020,544,5443046000.0,20,20.0,0,M,27,...,0.0,0.0,f5,r1,SUN,1|0|0|0|0|1|0|0|0,,,R,T
1,ses4038,5444038033_h_00,5444038033,544,,33,,0,M,27,...,0.0,0.0,f5,r1,SUN,0|0|0|0|0|0|0|0|0,,,R,A


In [5]:
# Read the column layout of the sequence dataset from a comma-separated
# text file. Every name is stripped so that a trailing newline (or irregular
# spacing after a comma) in the file cannot end up inside a column name.
with open('../data/columns-Seq.txt', 'r') as f:
    cols = [name.strip() for name in f.read().split(',')]

In [6]:
# Start from an empty frame that only carries the column layout read above.
df = pd.DataFrame(columns = cols)
df.head()

Unnamed: 0,full_path,session,utterance,utt,alc,Loudness_sma3,alphaRatio_sma3,hammarbergIndex_sma3,slope0-500_sma3,slope500-1500_sma3,...,logRelF0-H1-A3_sma3nz,F1frequency_sma3nz,F1bandwidth_sma3nz,F1amplitudeLogRelF0_sma3nz,F2frequency_sma3nz,F2bandwidth_sma3nz,F2amplitudeLogRelF0_sma3nz,F3frequency_sma3nz,F3bandwidth_sma3nz,F3amplitudeLogRelF0_sma3nz


In [7]:
def extract_features(rec):
    """Extract frame-level eGeMAPSv02 low-level descriptors for one recording.

    Args:
        rec: Path of the audio file to read.

    Returns:
        The result of ``opensmile.Smile.process_signal`` for the recording:
        low-level descriptors (not functionals), i.e. a sequence of values
        per descriptor rather than one summary value per file.
    """
    # always_2d=True asks audiofile for a two-dimensional signal even when
    # the file is mono.
    signal, sampling_rate = audiofile.read(rec, always_2d=True)

    # A single extractor is sufficient: the string level 'lld' and
    # FeatureLevel.LowLevelDescriptors select the same feature level.
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPSv02,
        feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
    )

    return smile.process_signal(signal, sampling_rate)

In [8]:
# Number of rows and columns in the metadata table.
df1.shape

(15180, 22)

In [9]:
# Preview the first two metadata rows.
df1.head(2)

Unnamed: 0,session,utterance,utt,spn,o_utt,item,o_item,alc,sex,age,...,aak,bak,ges,ces,wea,irreg,anncom,specom,type,content
0,ses4038,5444038020_h_00,5444038020,544,5443046000.0,20,20.0,0,M,27,...,0.0,0.0,f5,r1,SUN,1|0|0|0|0|1|0|0|0,,,R,T
1,ses4038,5444038033_h_00,5444038033,544,,33,,0,M,27,...,0.0,0.0,f5,r1,SUN,0|0|0|0|0|0|0|0|0,,,R,A


In [10]:
# Disabled extraction loop, kept for reference. When enabled it walks over
# `df1`, extracts the descriptors for every recording found on disk, stores
# each descriptor as one list-valued cell per recording, collects paths that
# do not exist in `errors`, and checkpoints to '../data/dataset_seq.csv'.
# NOTE(review): the trailing `[:200]` slices the ROWS of the one-row `df_new`,
#   so it has no effect; if the intent was to cap each recording at 200
#   frames, the lists themselves would have to be truncated — confirm.
# NOTE(review): `set(df['full_path'].tolist())` is rebuilt for every row, and
#   the periodic checkpoint sits inside the outer `else`, so it is skipped for
#   rows that were already processed.
# errors = []

# for i, row in tqdm(df1.iterrows()):
#     cur_path = ALCpath + row['session'] + f"/{row['utterance']}" + '.wav'
#     if cur_path in set(df['full_path'].tolist()):
#         pass
#     else:
#         if os.path.exists(cur_path):
#             new_rows = extract_features(cur_path).reset_index(drop=True)
#             df_new = pd.DataFrame(columns = new_rows.columns)
#             for col in new_rows.columns:
#                 df_new[col] = [new_rows[col].tolist()]
#             df_new['full_path'] = cur_path
#             df_new['session'] = row['session']
#             df_new['utterance'] = row['utterance']
#             df_new['utt'] = row['utt']
#             df_new['alc'] = row['alc']
# #             new_cols = new_rows.columns
#             df_new = pd.concat([df_new[cols[-5:]], df_new[cols[:-5]]], axis=1)[:200]
#             df = pd.concat([df, df_new], ignore_index=True)
#         else:
#             errors.append(cur_path)
#         if i % 1000 == 0:
#             df.to_csv('../data/dataset_seq.csv', index=False)

In [11]:
# df.to_csv('../data/dataset_seq.csv', index=False)

In [15]:
# Load the previously extracted sequence features from disk.
# NOTE(review): the per-frame lists were written to CSV as text, so they
# presumably come back as strings here — parse them before numeric use.
df = pd.read_csv('../data/dataset_seq.csv')
df.head(1)

Unnamed: 0,full_path,session,utterance,utt,alc,Loudness_sma3,alphaRatio_sma3,hammarbergIndex_sma3,slope0-500_sma3,slope500-1500_sma3,...,logRelF0-H1-A3_sma3nz,F1frequency_sma3nz,F1bandwidth_sma3nz,F1amplitudeLogRelF0_sma3nz,F2frequency_sma3nz,F2bandwidth_sma3nz,F2amplitudeLogRelF0_sma3nz,F3frequency_sma3nz,F3bandwidth_sma3nz,F3amplitudeLogRelF0_sma3nz
0,/mount/studenten/arbeitsdaten-studenten1/team-...,ses4038,5444038020_h_00,5444038020,0,"[0.06648889929056168, 0.08417747169733047, 0.0...","[1.0440891981124878, 2.068268060684204, 2.8674...","[2.5069081783294678, 1.3846784830093384, 0.947...","[0.015179522335529327, 0.016644515097141266, 0...","[-0.010823157615959644, -0.004189341329038143,...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[500.5526428222656, 546.5957641601562, 657.120...","[923.6268920898438, 938.3113403320312, 988.179...","[-201.0, -201.0, -201.0, -201.0, -201.0, -201....","[1589.6429443359375, 1558.5521240234375, 1772....","[936.32421875, 1548.3433837890625, 1494.097656...","[-201.0, -201.0, -201.0, -201.0, -201.0, -201....","[2268.088623046875, 2182.937744140625, 2449.39...","[854.4310913085938, 805.93896484375, 930.70843...","[-201.0, -201.0, -201.0, -201.0, -201.0, -201...."


In [16]:
# Flatten the "ses" lists of all special-case entries into one list of
# session ids and show it.
with open('../data/special_cases.txt', 'r') as f:
    special_cases_dicts = json.load(f)
special_sessions = [
    session_id
    for sc in special_cases_dicts
    for session_id in sc["ses"]
]
print(special_sessions)

['ses2028', 'ses1024', 'ses1032', 'ses1025', 'ses2012', 'ses2017', 'ses4007', 'ses3016', 'ses5008', 'ses2046', 'ses1055', 'ses4013', 'ses3018']


In [17]:
# Reload the raw metadata table.
# NOTE(review): this replaces the `df1` whose 'alc' column was mapped to 0/1
# earlier, so the metadata-side label is the raw string again.
dataframe_path = '../data/dataframe.csv'
df1 = pd.read_csv(dataframe_path)
df1.head(1)

Unnamed: 0,session,utterance,utt,spn,o_utt,item,o_item,alc,sex,age,...,aak,bak,ges,ces,wea,irreg,anncom,specom,type,content
0,ses4038,5444038020_h_00,5444038020,544,5443046000.0,20,20.0,na,M,27,...,0.0,0.0,f5,r1,SUN,1|0|0|0|0|1|0|0|0,,,R,T


In [18]:
# Inner-join features and metadata on the utterance id. Columns that exist in
# both frames receive pandas' default suffixes: '_x' for the feature side,
# '_y' for the metadata side.
df_merged = df.merge(df1, how='inner', on='utterance')
df_merged.head(2)

Unnamed: 0,full_path,session_x,utterance,utt_x,alc_x,Loudness_sma3,alphaRatio_sma3,hammarbergIndex_sma3,slope0-500_sma3,slope500-1500_sma3,...,aak,bak,ges,ces,wea,irreg,anncom,specom,type,content
0,/mount/studenten/arbeitsdaten-studenten1/team-...,ses4038,5444038020_h_00,5444038020,0,"[0.06648889929056168, 0.08417747169733047, 0.0...","[1.0440891981124878, 2.068268060684204, 2.8674...","[2.5069081783294678, 1.3846784830093384, 0.947...","[0.015179522335529327, 0.016644515097141266, 0...","[-0.010823157615959644, -0.004189341329038143,...",...,0.0,0.0,f5,r1,SUN,1|0|0|0|0|1|0|0|0,,,R,T
1,/mount/studenten/arbeitsdaten-studenten1/team-...,ses4038,5444038033_h_00,5444038033,0,"[0.04158048331737518, 0.04811205342411995, 0.0...","[-2.5216071605682373, -6.138274669647217, -8.8...","[10.313282012939453, 15.12503719329834, 18.845...","[0.0004236238601151854, -0.006380358245223761,...","[-0.00458249868825078, -0.01265237107872963, -...",...,0.0,0.0,f5,r1,SUN,0|0|0|0|0|0|0|0|0,,,R,A


In [19]:
# Give the feature-side duplicates their plain names back by dropping the
# '_x' suffix added by the merge; the metadata-side '_y' copies keep theirs.
# Only a trailing '_x' is removed: cutting at the first '_x' anywhere in the
# name would truncate a column such as 'slope_x500'.
df_merged.columns = [col[:-2] if col.endswith('_x') else col for col in df_merged.columns]
df_merged.head(3)

Unnamed: 0,full_path,session,utterance,utt,alc,Loudness_sma3,alphaRatio_sma3,hammarbergIndex_sma3,slope0-500_sma3,slope500-1500_sma3,...,aak,bak,ges,ces,wea,irreg,anncom,specom,type,content
0,/mount/studenten/arbeitsdaten-studenten1/team-...,ses4038,5444038020_h_00,5444038020,0,"[0.06648889929056168, 0.08417747169733047, 0.0...","[1.0440891981124878, 2.068268060684204, 2.8674...","[2.5069081783294678, 1.3846784830093384, 0.947...","[0.015179522335529327, 0.016644515097141266, 0...","[-0.010823157615959644, -0.004189341329038143,...",...,0.0,0.0,f5,r1,SUN,1|0|0|0|0|1|0|0|0,,,R,T
1,/mount/studenten/arbeitsdaten-studenten1/team-...,ses4038,5444038033_h_00,5444038033,0,"[0.04158048331737518, 0.04811205342411995, 0.0...","[-2.5216071605682373, -6.138274669647217, -8.8...","[10.313282012939453, 15.12503719329834, 18.845...","[0.0004236238601151854, -0.006380358245223761,...","[-0.00458249868825078, -0.01265237107872963, -...",...,0.0,0.0,f5,r1,SUN,0|0|0|0|0|0|0|0|0,,,R,A
2,/mount/studenten/arbeitsdaten-studenten1/team-...,ses4038,5444038035_h_00,5444038035,0,"[0.08644378930330276, 0.06861668825149536, 0.0...","[-6.439538478851318, -10.99951457977295, -14.5...","[14.246787071228027, 19.226457595825195, 24.01...","[-0.03010634519159794, -0.031465787440538406, ...","[0.00977783091366291, 0.0007202308624982834, -...",...,0.0,0.0,f5,r1,SUN,9999|0|0|0|0|0|0|0|0,,,L,N


In [20]:
# Flag rows that belong to one of the special sessions:
# ss = 1.0 for special sessions, 0.0 for all others.
df_merged['ss'] = df_merged['session'].isin(special_sessions).astype(float)

In [22]:
# Keep the special-session rows in their own frame and report its size.
special_df = df_merged.loc[df_merged['ss'].eq(1)]
special_df.shape

(570, 52)

In [23]:
# Continue with the regular (non-special) sessions only.
df_merged = df_merged.loc[df_merged['ss'].eq(0)]

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# 70/30 split into the training set and a held-out pool, stratified jointly on
# the 'alc', 'sex' and 'type' columns; the fixed seed makes it reproducible.
# NOTE(review): rows are split per recording, not per session/speaker, so
# recordings of one session can presumably land on both sides of the split —
# confirm this is intended (a group-aware splitter would avoid it).
train_df, dev_test_df = train_test_split(df_merged,
                                         train_size=0.7,
                                         random_state=10,
                                         shuffle=True,
                                         stratify=df_merged[['alc', 'sex', 'type']]
                                        )

In [27]:
# Label balance of the training split.
train_df['alc'].value_counts()

alc
0    6553
1    3674
Name: count, dtype: int64

In [33]:
# Split the held-out pool again: 70% of it becomes the dev set and the
# remaining 30% the test set, using the same seed and stratification columns.
dev_df, test_df = train_test_split(dev_test_df,
                                         train_size=0.7,
                                         random_state=10,
                                         shuffle=True,
                                         stratify=dev_test_df[['alc', 'sex', 'type']]
                                        )

In [34]:
# Label balance of the test split (before the special sessions are added).
test_df['alc'].value_counts()

alc
0    843
1    472
Name: count, dtype: int64

In [35]:
# Label balance of the special-session rows.
special_df['alc'].value_counts()

alc
0    360
1    210
Name: count, dtype: int64

In [36]:
# Append every special-session row to the test set, so these sessions are
# only ever used for testing. The original row index is kept as-is.
test_df = pd.concat([test_df, special_df])

In [37]:
# Persist the three splits; index=False leaves the row index out of the files.
train_df.to_csv('../data/train_set_seq.csv', index=False)
dev_df.to_csv('../data/dev_set_seq.csv', index=False)
test_df.to_csv('../data/test_set_seq.csv', index=False)

In [38]:
# Spot-check five random test rows; no seed is given, so the selection
# differs between runs.
test_df.sample(5)

Unnamed: 0,full_path,session,utterance,utt,alc,Loudness_sma3,alphaRatio_sma3,hammarbergIndex_sma3,slope0-500_sma3,slope500-1500_sma3,...,bak,ges,ces,wea,irreg,anncom,specom,type,content,ss
508,/mount/studenten/arbeitsdaten-studenten1/team-...,ses4043,5614043020_h_00,5614043020,0,"[0.07133859395980835, 0.05786380544304848, 0.0...","[1.9069299697875977, -1.607491135597229, -6.19...","[-0.4582394063472748, 5.203735828399658, 12.58...","[0.026657328009605408, 0.009465198032557964, -...","[0.0030577259603887796, 0.0030403481796383858,...",...,0.0,f5,r1,SUN,0|0|0|0|0|0|0|0|0,,,R,T,0.0
11043,/mount/studenten/arbeitsdaten-studenten1/team-...,ses1051,0501051023_h_00,501051023,1,"[0.03849634900689125, 0.036688562482595444, 0....","[-20.199365615844727, -22.5689754486084, -26.2...","[27.322616577148438, 31.297470092773438, 35.94...","[-0.030584625899791718, -0.06504722684621811, ...","[-0.010195462964475155, -0.004339489620178938,...",...,0.00062,f10,r1,SUN,0|0|0|0|0|0|0|0|0,,,R,R,0.0
8045,/mount/studenten/arbeitsdaten-studenten1/team-...,ses1032,0311032011_h_00,311032011,1,"[0.044511016458272934, 0.04239967465400696, 0....","[-10.81092357635498, -10.14420223236084, -8.98...","[20.27562141418457, 21.64961814880371, 21.1663...","[-0.018056973814964294, -0.039109233766794205,...","[0.0010875758016481996, 0.007086829748004675, ...",...,0.00092,f10,r1,SUN,9999|0|0|0|0|0|0|0|0,,no_BAK_measurement,L,N,1.0
4039,/mount/studenten/arbeitsdaten-studenten1/team-...,ses1024,0231024024_h_00,231024024,1,"[0.15153075754642487, 0.1400376260280609, 0.11...","[-27.680112838745117, -26.4633731842041, -28.2...","[34.56613540649414, 33.986446380615234, 35.893...","[0.051540181040763855, 0.05475561320781708, 0....","[-0.009238596074283123, -0.009574376046657562,...",...,0.00039,f10,r1,SUN,0|0|0|1|0|0|0|0|0,,no_BAK_measurement,R,R,1.0
14188,/mount/studenten/arbeitsdaten-studenten1/team-...,ses2087,1032087033_h_00,1032087033,0,"[0.060047101229429245, 0.055871278047561646, 0...","[-6.083073139190674, -6.0999531745910645, -5.1...","[10.534053802490234, 13.045882225036621, 11.91...","[-0.009160788729786873, -0.0073128375224769115...","[-0.018528707325458527, -0.009310659021139145,...",...,0.0,f5,r1,SUN,0|0|0|0|0|0|0|0|0,,,R,A,0.0
