# Setting a baseline

In this notebook, we set a couple of baselines, implementing "traditional" Machine Learning algorithms to classify a given recording as intoxicated or not.

We experiment with
- PassiveAggressiveClassifier
- LogisticRegression
- RandomForestClassifier
- DecisionTreeClassifier

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import re
import json
from sklearn.model_selection import train_test_split

In [2]:
import warnings
# Install a blanket "ignore" filter: no warning of any category is shown from here on.
# NOTE(review): this also hides warnings raised by pandas / scikit-learn in the cells
# below — consider narrowing the filter to a specific category.
warnings.filterwarnings("ignore")

In [3]:
# Read the dataset CSV (path is relative to the notebook's working directory)
# and preview the first two rows.
df = pd.read_csv('../data/dataset_large.csv')
df.head(2)

Unnamed: 0,full_path,session,utterance,utt,alc,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
0,/mount/studenten/arbeitsdaten-studenten1/team-...,ses4038,5444038020_h_00,5444038020,0,27.536736,0.236611,24.957035,27.387535,30.500528,...,-0.024768,-0.007154,0.121985,4.028777,2.898551,0.126,0.074726,0.207895,0.208821,-28.878849
1,/mount/studenten/arbeitsdaten-studenten1/team-...,ses4038,5444038033_h_00,5444038033,0,25.317753,0.289597,22.173527,25.276594,27.43298,...,-0.021063,-0.005162,0.065592,3.837472,2.739726,0.145833,0.152067,0.18,0.295583,-29.606325


In [4]:
# Number of rows in the loaded dataset.
len(df)

15180

In [5]:
# The special-cases file holds JSON: a list of dicts, each with a "ses" list of
# session ids. Flatten all of those lists into one list of session ids.
with open('../data/special_cases.txt', 'r') as fh:
    special_cases_dicts = json.load(fh)
special_sessions = [
    session_id
    for case in special_cases_dicts
    for session_id in case["ses"]
]
print(special_sessions)

['ses2028', 'ses1024', 'ses1032', 'ses1025', 'ses2012', 'ses2017', 'ses4007', 'ses3016', 'ses5008', 'ses2046', 'ses1055', 'ses4013', 'ses3018']


In [6]:
# Read the second table; it is joined with `df` on the 'utterance' column further down.
dataframe_path = '../data/dataframe.csv'
df1 = pd.read_csv(dataframe_path)

In [7]:
# Both tables should have the same number of rows before they are merged.
len(df1) == len(df)

True

In [8]:
# Join the two tables on the shared 'utterance' key (inner join, the pandas default).
# Columns present in both frames get the default '_x' (left) / '_y' (right) suffixes.
# validate='one_to_one' makes pandas raise a MergeError if 'utterance' is duplicated
# on either side, instead of silently multiplying rows in the result.
df_merged = pd.merge(df, df1, on='utterance', validate='one_to_one')
df_merged.head(2)

Unnamed: 0,full_path,session_x,utterance,utt_x,alc_x,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,...,aak,bak,ges,ces,wea,irreg,anncom,specom,type,content
0,/mount/studenten/arbeitsdaten-studenten1/team-...,ses4038,5444038020_h_00,5444038020,0,27.536736,0.236611,24.957035,27.387535,30.500528,...,0.0,0.0,f5,r1,SUN,1|0|0|0|0|1|0|0|0,,,R,T
1,/mount/studenten/arbeitsdaten-studenten1/team-...,ses4038,5444038033_h_00,5444038033,0,25.317753,0.289597,22.173527,25.276594,27.43298,...,0.0,0.0,f5,r1,SUN,0|0|0|0|0|0|0|0|0,,,R,A


In [9]:
# (rows, columns) of the merged frame.
df_merged.shape

(15180, 114)

In [10]:
# Flag rows that belong to a special session: 'ss' is 1.0 for them and 0.0 otherwise.
# The float dtype matches what the two-step .loc assignment produces.
is_special_session = df_merged['session_x'].isin(special_sessions)
df_merged['ss'] = is_special_session.astype(float)

In [11]:
# Shape of the subset flagged as special sessions.
df_merged.loc[df_merged['ss'].eq(1)].shape

(570, 115)

In [12]:
# Drop the '_x' suffix that the merge added to the left frame's overlapping columns
# (e.g. 'session_x' -> 'session'). Only a *trailing* '_x' is removed, so a column
# that merely contains '_x' somewhere in its name is no longer truncated.
# Columns carrying the '_y' suffix are left as they are.
df_merged.columns = [
    col[:-len('_x')] if col.endswith('_x') else col
    for col in df_merged.columns
]
df_merged.head(3)

Unnamed: 0,full_path,session,utterance,utt,alc,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,...,bak,ges,ces,wea,irreg,anncom,specom,type,content,ss
0,/mount/studenten/arbeitsdaten-studenten1/team-...,ses4038,5444038020_h_00,5444038020,0,27.536736,0.236611,24.957035,27.387535,30.500528,...,0.0,f5,r1,SUN,1|0|0|0|0|1|0|0|0,,,R,T,0.0
1,/mount/studenten/arbeitsdaten-studenten1/team-...,ses4038,5444038033_h_00,5444038033,0,25.317753,0.289597,22.173527,25.276594,27.43298,...,0.0,f5,r1,SUN,0|0|0|0|0|0|0|0|0,,,R,A,0.0
2,/mount/studenten/arbeitsdaten-studenten1/team-...,ses4038,5444038035_h_00,5444038035,0,25.888353,0.14151,23.589926,25.110443,29.70009,...,0.0,f5,r1,SUN,9999|0|0|0|0|0|0|0|0,,,L,N,0.0


In [13]:
# Print "<column> <number of missing values>" for every column that has at least
# one null. The per-column counts are computed once up front.
# The loop variable is deliberately still called `col`: the next cell reads it.
for col, n_missing in zip(df_merged.columns, df_merged.isnull().sum()):
    if n_missing > 0:
        print(col, n_missing)

o_utt 5204
o_item 5204
anncom 15171
specom 14610


In [14]:
# Null mask for the 'ss' flag column. The column is named explicitly rather than
# taken from a loop variable left over by an earlier cell, so this cell gives the
# same result regardless of which cells were executed before it.
df_merged['ss'].isnull()

0        False
1        False
2        False
3        False
4        False
         ...  
15175    False
15176    False
15177    False
15178    False
15179    False
Name: ss, Length: 15180, dtype: bool

In [17]:
# Keep the rows of the special sessions aside; they are appended to the test set later.
special_mask = df_merged['ss'] == 1
special_df = df_merged[special_mask]
special_df

Unnamed: 0,full_path,session,utterance,utt,alc,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,...,bak,ges,ces,wea,irreg,anncom,specom,type,content,ss
780,/mount/studenten/arbeitsdaten-studenten1/team-...,ses2028,0312028028_h_00,312028028,0,38.016663,0.082399,34.887970,38.229580,40.351944,...,0.00000,f3,r3,SUN,2|0|1|0|0|1|1|0|0,,no_BAK_measurement,R,A,1.0
781,/mount/studenten/arbeitsdaten-studenten1/team-...,ses2028,0312028026_h_00,312028026,0,35.323150,0.202597,32.752200,35.884804,39.580902,...,0.00000,f3,r3,SUN,9999|0|0|0|0|0|0|0|0,,no_BAK_measurement,L,N,1.0
782,/mount/studenten/arbeitsdaten-studenten1/team-...,ses2028,0312028045_h_00,312028045,0,36.367165,0.115730,34.713676,36.968040,38.553364,...,0.00000,f3,r3,SUN,9999|0|0|0|0|0|0|0|0,,no_BAK_measurement,E,C,1.0
783,/mount/studenten/arbeitsdaten-studenten1/team-...,ses2028,0312028060_h_00,312028060,0,36.813965,0.106350,34.158554,36.421770,40.039360,...,0.00000,f3,r3,SUN,9999|0|0|0|0|0|0|0|0,,no_BAK_measurement,L,S,1.0
784,/mount/studenten/arbeitsdaten-studenten1/team-...,ses2028,0312028016_h_00,312028016,0,38.168660,0.099769,36.126114,37.852604,41.255936,...,0.00000,f3,r3,SUN,1|0|0|0|0|1|0|0|0,,no_BAK_measurement,R,T,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15115,/mount/studenten/arbeitsdaten-studenten1/team-...,ses3016,5153016024_h_00,5153016024,1,25.536276,0.101817,23.910210,26.147352,26.912758,...,0.00098,f5,r1,SUN,0|0|0|0|0|0|0|0|0,,strange_velar_plosive,R,R,1.0
15116,/mount/studenten/arbeitsdaten-studenten1/team-...,ses3016,5153016014_h_00,5153016014,1,23.491589,0.281059,21.110440,22.700867,23.511711,...,0.00098,f5,r1,SUN,1|0|1|0|2|0|0|0|0,,strange_velar_plosive,M,Q,1.0
15117,/mount/studenten/arbeitsdaten-studenten1/team-...,ses3016,5153016004_h_00,5153016004,1,24.097082,0.190799,21.855799,23.978815,25.485561,...,0.00098,f5,r1,SUN,0|0|1|0|0|0|0|0|0,,strange_velar_plosive,R,A,1.0
15118,/mount/studenten/arbeitsdaten-studenten1/team-...,ses3016,5153016006_h_00,5153016006,1,23.330366,0.106243,21.816680,23.351744,25.766888,...,0.00098,f5,r1,SUN,9999|0|0|0|0|0|0|0|0,,strange_velar_plosive,L,N,1.0


In [18]:
# From here on `df_merged` only holds rows that are NOT from a special session.
# NOTE(review): `special_df` must be built before this cell runs — afterwards the
# ss == 1 rows are gone from `df_merged`.
df_merged = df_merged.loc[df_merged['ss'].eq(0)]

In [43]:
# Distribution of the 'alc' label in the remaining (non-special) rows.
df_merged['alc'].value_counts()

alc
0    9360
1    5250
Name: count, dtype: int64

In [44]:
# 70 % of the rows go to training; the remaining 30 % are split into dev and test later.
# The split is stratified jointly on label ('alc'), 'sex' and 'type', with a fixed
# seed so it is reproducible.
# NOTE(review): rows are assigned individually and nothing here groups by 'session',
# so one session can contribute rows to several splits — confirm this is intended.
strata = df_merged[['alc', 'sex', 'type']]
train_df, dev_test_df = train_test_split(
    df_merged,
    train_size=0.7,
    random_state=10,
    shuffle=True,
    stratify=strata,
)

In [45]:
# Distribution of the 'alc' label in the training split.
train_df['alc'].value_counts()

alc
0    6553
1    3674
Name: count, dtype: int64

In [46]:
# Split the hold-out rows again: train_size=0.7 puts 70 % of them into dev and the
# remaining 30 % into test. Same seed and same joint stratification
# ('alc', 'sex', 'type') as the first split.
holdout_strata = dev_test_df[['alc', 'sex', 'type']]
dev_df, test_df = train_test_split(
    dev_test_df,
    train_size=0.7,
    random_state=10,
    shuffle=True,
    stratify=holdout_strata,
)

In [47]:
# Distribution of the 'alc' label in the test split.
test_df['alc'].value_counts()

alc
0    843
1    472
Name: count, dtype: int64

In [48]:
# Distribution of the 'alc' label among the special-session rows.
special_df['alc'].value_counts()

alc
0    360
1    210
Name: count, dtype: int64

In [49]:
# Append the special-session rows to the test set.
# The rows produced by the split all have ss == 0, so filtering on that first is a
# no-op on the first run. On a re-run it drops the special rows appended earlier,
# which keeps the cell idempotent instead of duplicating `special_df` each time.
test_df = pd.concat([test_df[test_df['ss'] == 0], special_df])

In [50]:
# Write each split to its CSV file; the DataFrame index is not written.
split_outputs = {
    '../data/train_set.csv': train_df,
    '../data/dev_set.csv': dev_df,
    '../data/test_set.csv': test_df,
}
for out_path, split_frame in split_outputs.items():
    split_frame.to_csv(out_path, index=False)