In [1]:
import pandas as pd
import os
from tqdm.notebook import tqdm
from pydub import AudioSegment
import librosa
import numpy as np
import pickle
from itertools import chain
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier



In [2]:
# df = pd.read_csv('../input/sep28k/SEP-28k_labels.csv')
df = pd.read_csv('./extract_data/fluencybank_labels.csv')
df.head()

Unnamed: 0,Show,EpId,ClipId,Start,Stop,Unsure,PoorAudioQuality,Prolongation,Block,SoundRep,WordRep,DifficultToUnderstand,Interjection,NoStutteredWords,NaturalPause,Music,NoSpeech
0,FluencyBank,10,0,88960,136960,0,0,0,1,0,0,0,0,2,0,0,0
1,FluencyBank,10,1,1271520,1319520,0,0,0,0,0,0,0,0,3,0,0,0
2,FluencyBank,10,2,1813760,1861760,0,0,1,0,0,0,0,0,2,0,0,0
3,FluencyBank,10,3,1842720,1890720,0,0,1,0,0,0,0,0,2,1,0,0
4,FluencyBank,10,4,1893280,1941280,0,0,0,0,0,3,0,0,0,0,0,0


In [3]:
df.shape

(4144, 17)

In [4]:
df.columns

Index(['Show', 'EpId', 'ClipId', 'Start', 'Stop', 'Unsure', 'PoorAudioQuality',
       'Prolongation', 'Block', 'SoundRep', 'WordRep', 'DifficultToUnderstand',
       'Interjection', 'NoStutteredWords', 'NaturalPause', 'Music',
       'NoSpeech'],
      dtype='object')

In [5]:
df['Prolongation'].value_counts()

Prolongation
0    3208
1     598
2     217
3     121
Name: count, dtype: int64

In [6]:
df['WordRep'].value_counts()

WordRep
0    3204
1     510
3     226
2     204
Name: count, dtype: int64

In [7]:
df['SoundRep'].value_counts()

SoundRep
0    2996
1     599
2     372
3     177
Name: count, dtype: int64

In [8]:
df['Rep'] = df['WordRep'] + df['SoundRep']
df['Rep']

0       0
1       0
2       0
3       0
4       3
       ..
4139    0
4140    1
4141    3
4142    3
4143    0
Name: Rep, Length: 4144, dtype: int64

In [9]:
df['Rep'].value_counts()

Rep
0    2432
1     670
2     490
3     430
4      85
5      32
6       5
Name: count, dtype: int64

In [10]:
df['Name'] = df[df.columns[0:3]].apply(
    lambda x: '_'.join(x.dropna().astype(str)),
    axis=1
)
df.head()

Unnamed: 0,Show,EpId,ClipId,Start,Stop,Unsure,PoorAudioQuality,Prolongation,Block,SoundRep,WordRep,DifficultToUnderstand,Interjection,NoStutteredWords,NaturalPause,Music,NoSpeech,Rep,Name
0,FluencyBank,10,0,88960,136960,0,0,0,1,0,0,0,0,2,0,0,0,0,FluencyBank_10_0
1,FluencyBank,10,1,1271520,1319520,0,0,0,0,0,0,0,0,3,0,0,0,0,FluencyBank_10_1
2,FluencyBank,10,2,1813760,1861760,0,0,1,0,0,0,0,0,2,0,0,0,0,FluencyBank_10_2
3,FluencyBank,10,3,1842720,1890720,0,0,1,0,0,0,0,0,2,1,0,0,0,FluencyBank_10_3
4,FluencyBank,10,4,1893280,1941280,0,0,0,0,0,3,0,0,0,0,0,0,3,FluencyBank_10_4


In [11]:
df.columns

Index(['Show', 'EpId', 'ClipId', 'Start', 'Stop', 'Unsure', 'PoorAudioQuality',
       'Prolongation', 'Block', 'SoundRep', 'WordRep', 'DifficultToUnderstand',
       'Interjection', 'NoStutteredWords', 'NaturalPause', 'Music', 'NoSpeech',
       'Rep', 'Name'],
      dtype='object')

In [12]:
df['Name']

0          FluencyBank_10_0
1          FluencyBank_10_1
2          FluencyBank_10_2
3          FluencyBank_10_3
4          FluencyBank_10_4
               ...         
4139    FluencyBank_985_191
4140    FluencyBank_985_192
4141    FluencyBank_985_193
4142    FluencyBank_985_194
4143    FluencyBank_985_195
Name: Name, Length: 4144, dtype: object

In [13]:
df = df.sort_values(by='Name')
df

Unnamed: 0,Show,EpId,ClipId,Start,Stop,Unsure,PoorAudioQuality,Prolongation,Block,SoundRep,WordRep,DifficultToUnderstand,Interjection,NoStutteredWords,NaturalPause,Music,NoSpeech,Rep,Name
2285,FluencyBank,107,0,0,40480,0,0,0,0,0,0,0,0,3,0,0,0,0,FluencyBank_107_0
2286,FluencyBank,107,1,824160,872160,0,0,0,0,0,0,0,0,3,2,0,0,0,FluencyBank_107_1
2295,FluencyBank,107,10,1528160,1576160,0,0,0,2,1,0,0,0,0,0,0,0,1,FluencyBank_107_10
2296,FluencyBank,107,11,1550720,1598720,0,0,1,2,0,0,0,0,0,0,0,0,0,FluencyBank_107_11
2297,FluencyBank,107,12,1579360,1627360,0,0,1,0,0,0,0,0,2,0,0,0,0,FluencyBank_107_12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2194,FluencyBank,99,95,6969280,7017280,0,0,0,1,1,2,0,1,0,0,0,0,3,FluencyBank_99_95
2195,FluencyBank,99,96,6998880,7046880,0,0,0,2,0,0,0,3,1,0,0,0,0,FluencyBank_99_96
2196,FluencyBank,99,97,498400,546400,0,0,0,0,0,3,0,3,0,0,0,0,3,FluencyBank_99_97
2197,FluencyBank,99,98,525600,573600,0,0,0,0,0,0,0,2,3,0,0,0,0,FluencyBank_99_98


In [14]:
os.stat("./extract_data/clips/stuttering-clips/clips/HeStutters_0_9.wav").st_size
os.stat("./extract_data/clips/stuttering-clips/clips/HeStutters_1_1.wav").st_size

96044

In [15]:
from tqdm.notebook import tqdm

CLIPS_DIR = "./extract_data/clips/stuttering-clips/clips/"
ignore_list = []
for filename in tqdm(os.listdir(CLIPS_DIR)):
    file_path = CLIPS_DIR + filename
    if os.stat(file_path).st_size == 44:
        ignore_list.append(filename)
        filename = filename[:-4]
        df = df[df.Name != filename]

# ignore fluency bank clips as well        
for filename in tqdm(os.listdir(CLIPS_DIR)):
    # if 'FluencyBank' in filename:
    if 'FluencyBank' not in filename:
        filename = filename[:-4]
        ignore_list.append(filename)

  0%|          | 0/32321 [00:00<?, ?it/s]

  0%|          | 0/32321 [00:00<?, ?it/s]

In [16]:
print(len(ignore_list))
df.head()

28590


Unnamed: 0,Show,EpId,ClipId,Start,Stop,Unsure,PoorAudioQuality,Prolongation,Block,SoundRep,WordRep,DifficultToUnderstand,Interjection,NoStutteredWords,NaturalPause,Music,NoSpeech,Rep,Name
2285,FluencyBank,107,0,0,40480,0,0,0,0,0,0,0,0,3,0,0,0,0,FluencyBank_107_0
2286,FluencyBank,107,1,824160,872160,0,0,0,0,0,0,0,0,3,2,0,0,0,FluencyBank_107_1
2295,FluencyBank,107,10,1528160,1576160,0,0,0,2,1,0,0,0,0,0,0,0,1,FluencyBank_107_10
2296,FluencyBank,107,11,1550720,1598720,0,0,1,2,0,0,0,0,0,0,0,0,0,FluencyBank_107_11
2297,FluencyBank,107,12,1579360,1627360,0,0,1,0,0,0,0,0,2,0,0,0,0,FluencyBank_107_12


In [17]:
df.shape

(4051, 19)

In [18]:
features = {}
directory = CLIPS_DIR

for filename in tqdm(os.listdir(CLIPS_DIR)):
    filename = filename[:-4]
    if ignore_list.count(filename) == 0:
        sound_file, sr = librosa.load(CLIPS_DIR + filename + '.wav', sr=16000)
        mfccs = librosa.feature.mfcc(y=sound_file, sr=sr, n_mfcc=13)
        features[filename] = mfccs

  0%|          | 0/32321 [00:00<?, ?it/s]



In [19]:
import pickle
WORKING_DIR = './workingdirec'
# pickle.dump(features, open(os.path.join(WORKING_DIR, 'features.pkl'), 'wb'))
with open('features.pickle', 'wb') as handle:
    pickle.dump(features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
with open('./features.pickle', 'rb') as handle:
    features = pickle.load(handle)
len(features)

4144

In [21]:
features['FluencyBank_985_195']


array([[-4.50995544e+02, -4.12957489e+02, -4.09316711e+02, ...,
        -2.88209137e+02, -3.14227142e+02, -3.43718506e+02],
       [ 9.52003784e+01,  9.47118454e+01,  9.44824524e+01, ...,
         1.05262833e+02,  1.06273315e+02,  1.10916092e+02],
       [ 2.08783951e+01,  2.51269932e+01,  2.86390991e+01, ...,
        -4.59372177e+01, -4.44208107e+01, -3.49026451e+01],
       ...,
       [-7.02341557e+00, -1.16951466e+01, -1.16473293e+01, ...,
        -1.46959820e+01, -1.87271881e+01, -1.65342026e+01],
       [-6.07410669e-02, -7.30059910e+00, -6.47276735e+00, ...,
         7.60739517e+00,  3.69215584e+00,  2.18128347e+00],
       [-1.03952427e+01, -7.93300343e+00, -7.75530863e+00, ...,
        -1.86722298e+01, -1.83898964e+01, -1.16343937e+01]], dtype=float32)

In [22]:
to_be_removed = []
for f in features:
    mfccs = list(chain.from_iterable(features[f]))
    features[f] = mfccs
    if len(features[f]) != 1222:
        to_be_removed.append(f)
        df = df[df.Name != f]

for f in to_be_removed:
    features.pop(f)

In [23]:
len(features['FluencyBank_985_195'])

1222

In [24]:
df_rep = pd.DataFrame.from_dict(features)
df_rep = df_rep.transpose()
df_rep = df_rep.reset_index()
df_rep = df_rep.sort_values(by='index')
df_rep

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,1212,1213,1214,1215,1216,1217,1218,1219,1220,1221
0,FluencyBank_010_0,-347.740692,-317.819641,-320.320618,-332.317169,-343.356598,-344.344421,-354.105988,-366.142670,-372.481995,...,1.723361,1.272797,-5.230965,-14.970407,-21.465916,-18.692142,-21.989422,-28.344925,-29.615908,-14.021580
1,FluencyBank_010_1,-438.671997,-419.725128,-417.393494,-410.429810,-408.278900,-409.253845,-408.491180,-408.270386,-411.358856,...,-8.160137,-9.380219,-7.065189,-0.610028,-4.598778,-9.972732,-12.226573,-8.808855,-7.627140,-5.797943
2,FluencyBank_010_10,-161.647873,-123.057701,-139.113174,-158.738495,-191.542191,-225.345139,-251.198715,-286.218719,-315.354279,...,-21.150717,-10.462812,-4.572368,-3.187361,-10.327148,-14.540721,-14.137384,-15.553209,-20.055593,-17.260246
3,FluencyBank_010_11,-427.028564,-397.329529,-394.007721,-395.439880,-397.534607,-395.810181,-401.564575,-346.869751,-223.398605,...,-7.202801,-8.293388,-10.565519,-14.671823,-12.170221,-4.913915,0.071892,-3.458159,-7.450909,-3.462155
4,FluencyBank_010_12,-293.398438,-282.992462,-221.619446,-157.533371,-125.753029,-112.959358,-92.637047,-67.506477,-80.915756,...,-7.929729,-7.808949,-7.652052,7.907707,9.569012,6.291955,-0.370439,-23.622349,-23.704752,-22.841469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,FluencyBank_985_95,-443.876617,-401.284729,-387.183258,-378.139984,-374.589996,-372.226715,-372.407257,-381.320831,-396.568726,...,2.072358,-0.333659,-0.964053,3.226261,-1.507389,-7.928948,-2.500397,-7.246117,-13.018373,-11.126150
3949,FluencyBank_985_96,-248.297775,-236.968079,-256.489441,-179.804199,-138.778915,-140.303513,-169.274170,-187.856918,-204.508316,...,-13.776359,-17.644371,-16.403625,-8.473763,-7.692134,-9.149691,-11.734518,-14.262653,-16.372873,-16.159088
3950,FluencyBank_985_97,-451.358978,-411.656891,-409.293213,-412.606598,-408.201843,-409.166840,-412.488556,-409.688263,-407.622314,...,-15.045193,-22.120049,-20.163528,-16.951302,-18.500633,-15.504421,-14.256115,-10.287129,-3.862890,-5.044336
3951,FluencyBank_985_98,-403.948334,-322.281982,-267.254852,-257.917694,-266.802399,-297.248413,-343.174957,-376.987091,-390.500305,...,-8.378985,-7.709820,-11.258406,-12.396635,-6.766040,-7.524735,-10.935984,-14.497244,-11.677958,-8.171494


In [25]:
df_rep['Rep'] = df['Rep']
df_rep

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,1213,1214,1215,1216,1217,1218,1219,1220,1221,Rep
0,FluencyBank_010_0,-347.740692,-317.819641,-320.320618,-332.317169,-343.356598,-344.344421,-354.105988,-366.142670,-372.481995,...,1.272797,-5.230965,-14.970407,-21.465916,-18.692142,-21.989422,-28.344925,-29.615908,-14.021580,0.0
1,FluencyBank_010_1,-438.671997,-419.725128,-417.393494,-410.429810,-408.278900,-409.253845,-408.491180,-408.270386,-411.358856,...,-9.380219,-7.065189,-0.610028,-4.598778,-9.972732,-12.226573,-8.808855,-7.627140,-5.797943,0.0
2,FluencyBank_010_10,-161.647873,-123.057701,-139.113174,-158.738495,-191.542191,-225.345139,-251.198715,-286.218719,-315.354279,...,-10.462812,-4.572368,-3.187361,-10.327148,-14.540721,-14.137384,-15.553209,-20.055593,-17.260246,0.0
3,FluencyBank_010_11,-427.028564,-397.329529,-394.007721,-395.439880,-397.534607,-395.810181,-401.564575,-346.869751,-223.398605,...,-8.293388,-10.565519,-14.671823,-12.170221,-4.913915,0.071892,-3.458159,-7.450909,-3.462155,0.0
4,FluencyBank_010_12,-293.398438,-282.992462,-221.619446,-157.533371,-125.753029,-112.959358,-92.637047,-67.506477,-80.915756,...,-7.808949,-7.652052,7.907707,9.569012,6.291955,-0.370439,-23.622349,-23.704752,-22.841469,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,FluencyBank_985_95,-443.876617,-401.284729,-387.183258,-378.139984,-374.589996,-372.226715,-372.407257,-381.320831,-396.568726,...,-0.333659,-0.964053,3.226261,-1.507389,-7.928948,-2.500397,-7.246117,-13.018373,-11.126150,1.0
3949,FluencyBank_985_96,-248.297775,-236.968079,-256.489441,-179.804199,-138.778915,-140.303513,-169.274170,-187.856918,-204.508316,...,-17.644371,-16.403625,-8.473763,-7.692134,-9.149691,-11.734518,-14.262653,-16.372873,-16.159088,0.0
3950,FluencyBank_985_97,-451.358978,-411.656891,-409.293213,-412.606598,-408.201843,-409.166840,-412.488556,-409.688263,-407.622314,...,-22.120049,-20.163528,-16.951302,-18.500633,-15.504421,-14.256115,-10.287129,-3.862890,-5.044336,1.0
3951,FluencyBank_985_98,-403.948334,-322.281982,-267.254852,-257.917694,-266.802399,-297.248413,-343.174957,-376.987091,-390.500305,...,-7.709820,-11.258406,-12.396635,-6.766040,-7.524735,-10.935984,-14.497244,-11.677958,-8.171494,0.0


In [26]:
df_rep.shape

(3953, 1224)

In [27]:
df_rep['Rep'].value_counts()

Rep
0.0    2250
1.0     616
2.0     457
3.0     404
4.0      84
5.0      29
6.0       5
Name: count, dtype: int64

In [28]:
df_rep = df_rep[df_rep['Rep'].notna()]
df_rep

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,1213,1214,1215,1216,1217,1218,1219,1220,1221,Rep
0,FluencyBank_010_0,-347.740692,-317.819641,-320.320618,-332.317169,-343.356598,-344.344421,-354.105988,-366.142670,-372.481995,...,1.272797,-5.230965,-14.970407,-21.465916,-18.692142,-21.989422,-28.344925,-29.615908,-14.021580,0.0
1,FluencyBank_010_1,-438.671997,-419.725128,-417.393494,-410.429810,-408.278900,-409.253845,-408.491180,-408.270386,-411.358856,...,-9.380219,-7.065189,-0.610028,-4.598778,-9.972732,-12.226573,-8.808855,-7.627140,-5.797943,0.0
2,FluencyBank_010_10,-161.647873,-123.057701,-139.113174,-158.738495,-191.542191,-225.345139,-251.198715,-286.218719,-315.354279,...,-10.462812,-4.572368,-3.187361,-10.327148,-14.540721,-14.137384,-15.553209,-20.055593,-17.260246,0.0
3,FluencyBank_010_11,-427.028564,-397.329529,-394.007721,-395.439880,-397.534607,-395.810181,-401.564575,-346.869751,-223.398605,...,-8.293388,-10.565519,-14.671823,-12.170221,-4.913915,0.071892,-3.458159,-7.450909,-3.462155,0.0
4,FluencyBank_010_12,-293.398438,-282.992462,-221.619446,-157.533371,-125.753029,-112.959358,-92.637047,-67.506477,-80.915756,...,-7.808949,-7.652052,7.907707,9.569012,6.291955,-0.370439,-23.622349,-23.704752,-22.841469,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,FluencyBank_985_95,-443.876617,-401.284729,-387.183258,-378.139984,-374.589996,-372.226715,-372.407257,-381.320831,-396.568726,...,-0.333659,-0.964053,3.226261,-1.507389,-7.928948,-2.500397,-7.246117,-13.018373,-11.126150,1.0
3949,FluencyBank_985_96,-248.297775,-236.968079,-256.489441,-179.804199,-138.778915,-140.303513,-169.274170,-187.856918,-204.508316,...,-17.644371,-16.403625,-8.473763,-7.692134,-9.149691,-11.734518,-14.262653,-16.372873,-16.159088,0.0
3950,FluencyBank_985_97,-451.358978,-411.656891,-409.293213,-412.606598,-408.201843,-409.166840,-412.488556,-409.688263,-407.622314,...,-22.120049,-20.163528,-16.951302,-18.500633,-15.504421,-14.256115,-10.287129,-3.862890,-5.044336,1.0
3951,FluencyBank_985_98,-403.948334,-322.281982,-267.254852,-257.917694,-266.802399,-297.248413,-343.174957,-376.987091,-390.500305,...,-7.709820,-11.258406,-12.396635,-6.766040,-7.524735,-10.935984,-14.497244,-11.677958,-8.171494,0.0


In [29]:
X_rep = df_rep.drop(['index', 'Rep'], axis=1)
X_rep

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1212,1213,1214,1215,1216,1217,1218,1219,1220,1221
0,-347.740692,-317.819641,-320.320618,-332.317169,-343.356598,-344.344421,-354.105988,-366.142670,-372.481995,-374.783325,...,1.723361,1.272797,-5.230965,-14.970407,-21.465916,-18.692142,-21.989422,-28.344925,-29.615908,-14.021580
1,-438.671997,-419.725128,-417.393494,-410.429810,-408.278900,-409.253845,-408.491180,-408.270386,-411.358856,-412.211578,...,-8.160137,-9.380219,-7.065189,-0.610028,-4.598778,-9.972732,-12.226573,-8.808855,-7.627140,-5.797943
2,-161.647873,-123.057701,-139.113174,-158.738495,-191.542191,-225.345139,-251.198715,-286.218719,-315.354279,-334.004761,...,-21.150717,-10.462812,-4.572368,-3.187361,-10.327148,-14.540721,-14.137384,-15.553209,-20.055593,-17.260246
3,-427.028564,-397.329529,-394.007721,-395.439880,-397.534607,-395.810181,-401.564575,-346.869751,-223.398605,-167.180069,...,-7.202801,-8.293388,-10.565519,-14.671823,-12.170221,-4.913915,0.071892,-3.458159,-7.450909,-3.462155
4,-293.398438,-282.992462,-221.619446,-157.533371,-125.753029,-112.959358,-92.637047,-67.506477,-80.915756,-117.480995,...,-7.929729,-7.808949,-7.652052,7.907707,9.569012,6.291955,-0.370439,-23.622349,-23.704752,-22.841469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,-443.876617,-401.284729,-387.183258,-378.139984,-374.589996,-372.226715,-372.407257,-381.320831,-396.568726,-409.386047,...,2.072358,-0.333659,-0.964053,3.226261,-1.507389,-7.928948,-2.500397,-7.246117,-13.018373,-11.126150
3949,-248.297775,-236.968079,-256.489441,-179.804199,-138.778915,-140.303513,-169.274170,-187.856918,-204.508316,-223.737442,...,-13.776359,-17.644371,-16.403625,-8.473763,-7.692134,-9.149691,-11.734518,-14.262653,-16.372873,-16.159088
3950,-451.358978,-411.656891,-409.293213,-412.606598,-408.201843,-409.166840,-412.488556,-409.688263,-407.622314,-408.622650,...,-15.045193,-22.120049,-20.163528,-16.951302,-18.500633,-15.504421,-14.256115,-10.287129,-3.862890,-5.044336
3951,-403.948334,-322.281982,-267.254852,-257.917694,-266.802399,-297.248413,-343.174957,-376.987091,-390.500305,-302.544739,...,-8.378985,-7.709820,-11.258406,-12.396635,-6.766040,-7.524735,-10.935984,-14.497244,-11.677958,-8.171494


In [30]:
y_rep = df_rep['Rep']
y_rep

0       0.0
1       0.0
2       0.0
3       0.0
4       3.0
       ... 
3948    1.0
3949    0.0
3950    1.0
3951    0.0
3952    3.0
Name: Rep, Length: 3845, dtype: float64

In [31]:
X_rep_train, X_rep_test, y_rep_train, y_rep_test =  train_test_split(X_rep, y_rep, 
                                                                     test_size=0.3, 
                                                                     random_state=42)

In [32]:
clf = DecisionTreeClassifier(criterion='entropy',random_state=5)
clf.fit(X_rep_train,y_rep_train)
clf.score(X_rep_test,y_rep_test) * 100


41.94107452339688

In [33]:
model = SVC(kernel="linear")
model = model.fit(X_rep_train, y_rep_train)
model.score(X_rep_test, y_rep_test) * 100

42.7209705372617

In [34]:
df_word = pd.DataFrame.from_dict(features)
df_word = df_word.transpose()
df_word = df_word.reset_index()
df_word = df_word.sort_values(by='index')
df_word

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,1212,1213,1214,1215,1216,1217,1218,1219,1220,1221
0,FluencyBank_010_0,-347.740692,-317.819641,-320.320618,-332.317169,-343.356598,-344.344421,-354.105988,-366.142670,-372.481995,...,1.723361,1.272797,-5.230965,-14.970407,-21.465916,-18.692142,-21.989422,-28.344925,-29.615908,-14.021580
1,FluencyBank_010_1,-438.671997,-419.725128,-417.393494,-410.429810,-408.278900,-409.253845,-408.491180,-408.270386,-411.358856,...,-8.160137,-9.380219,-7.065189,-0.610028,-4.598778,-9.972732,-12.226573,-8.808855,-7.627140,-5.797943
2,FluencyBank_010_10,-161.647873,-123.057701,-139.113174,-158.738495,-191.542191,-225.345139,-251.198715,-286.218719,-315.354279,...,-21.150717,-10.462812,-4.572368,-3.187361,-10.327148,-14.540721,-14.137384,-15.553209,-20.055593,-17.260246
3,FluencyBank_010_11,-427.028564,-397.329529,-394.007721,-395.439880,-397.534607,-395.810181,-401.564575,-346.869751,-223.398605,...,-7.202801,-8.293388,-10.565519,-14.671823,-12.170221,-4.913915,0.071892,-3.458159,-7.450909,-3.462155
4,FluencyBank_010_12,-293.398438,-282.992462,-221.619446,-157.533371,-125.753029,-112.959358,-92.637047,-67.506477,-80.915756,...,-7.929729,-7.808949,-7.652052,7.907707,9.569012,6.291955,-0.370439,-23.622349,-23.704752,-22.841469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,FluencyBank_985_95,-443.876617,-401.284729,-387.183258,-378.139984,-374.589996,-372.226715,-372.407257,-381.320831,-396.568726,...,2.072358,-0.333659,-0.964053,3.226261,-1.507389,-7.928948,-2.500397,-7.246117,-13.018373,-11.126150
3949,FluencyBank_985_96,-248.297775,-236.968079,-256.489441,-179.804199,-138.778915,-140.303513,-169.274170,-187.856918,-204.508316,...,-13.776359,-17.644371,-16.403625,-8.473763,-7.692134,-9.149691,-11.734518,-14.262653,-16.372873,-16.159088
3950,FluencyBank_985_97,-451.358978,-411.656891,-409.293213,-412.606598,-408.201843,-409.166840,-412.488556,-409.688263,-407.622314,...,-15.045193,-22.120049,-20.163528,-16.951302,-18.500633,-15.504421,-14.256115,-10.287129,-3.862890,-5.044336
3951,FluencyBank_985_98,-403.948334,-322.281982,-267.254852,-257.917694,-266.802399,-297.248413,-343.174957,-376.987091,-390.500305,...,-8.378985,-7.709820,-11.258406,-12.396635,-6.766040,-7.524735,-10.935984,-14.497244,-11.677958,-8.171494


In [35]:
df_word['WordRep'] = df['WordRep']
df_word

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,1213,1214,1215,1216,1217,1218,1219,1220,1221,WordRep
0,FluencyBank_010_0,-347.740692,-317.819641,-320.320618,-332.317169,-343.356598,-344.344421,-354.105988,-366.142670,-372.481995,...,1.272797,-5.230965,-14.970407,-21.465916,-18.692142,-21.989422,-28.344925,-29.615908,-14.021580,0.0
1,FluencyBank_010_1,-438.671997,-419.725128,-417.393494,-410.429810,-408.278900,-409.253845,-408.491180,-408.270386,-411.358856,...,-9.380219,-7.065189,-0.610028,-4.598778,-9.972732,-12.226573,-8.808855,-7.627140,-5.797943,0.0
2,FluencyBank_010_10,-161.647873,-123.057701,-139.113174,-158.738495,-191.542191,-225.345139,-251.198715,-286.218719,-315.354279,...,-10.462812,-4.572368,-3.187361,-10.327148,-14.540721,-14.137384,-15.553209,-20.055593,-17.260246,0.0
3,FluencyBank_010_11,-427.028564,-397.329529,-394.007721,-395.439880,-397.534607,-395.810181,-401.564575,-346.869751,-223.398605,...,-8.293388,-10.565519,-14.671823,-12.170221,-4.913915,0.071892,-3.458159,-7.450909,-3.462155,0.0
4,FluencyBank_010_12,-293.398438,-282.992462,-221.619446,-157.533371,-125.753029,-112.959358,-92.637047,-67.506477,-80.915756,...,-7.808949,-7.652052,7.907707,9.569012,6.291955,-0.370439,-23.622349,-23.704752,-22.841469,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,FluencyBank_985_95,-443.876617,-401.284729,-387.183258,-378.139984,-374.589996,-372.226715,-372.407257,-381.320831,-396.568726,...,-0.333659,-0.964053,3.226261,-1.507389,-7.928948,-2.500397,-7.246117,-13.018373,-11.126150,0.0
3949,FluencyBank_985_96,-248.297775,-236.968079,-256.489441,-179.804199,-138.778915,-140.303513,-169.274170,-187.856918,-204.508316,...,-17.644371,-16.403625,-8.473763,-7.692134,-9.149691,-11.734518,-14.262653,-16.372873,-16.159088,0.0
3950,FluencyBank_985_97,-451.358978,-411.656891,-409.293213,-412.606598,-408.201843,-409.166840,-412.488556,-409.688263,-407.622314,...,-22.120049,-20.163528,-16.951302,-18.500633,-15.504421,-14.256115,-10.287129,-3.862890,-5.044336,1.0
3951,FluencyBank_985_98,-403.948334,-322.281982,-267.254852,-257.917694,-266.802399,-297.248413,-343.174957,-376.987091,-390.500305,...,-7.709820,-11.258406,-12.396635,-6.766040,-7.524735,-10.935984,-14.497244,-11.677958,-8.171494,0.0


In [36]:
df_word['WordRep'].value_counts()

WordRep
0.0    2961
1.0     482
3.0     213
2.0     189
Name: count, dtype: int64

In [37]:
df_word = df_word[df_word['WordRep'].notna()]
df_word

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,1213,1214,1215,1216,1217,1218,1219,1220,1221,WordRep
0,FluencyBank_010_0,-347.740692,-317.819641,-320.320618,-332.317169,-343.356598,-344.344421,-354.105988,-366.142670,-372.481995,...,1.272797,-5.230965,-14.970407,-21.465916,-18.692142,-21.989422,-28.344925,-29.615908,-14.021580,0.0
1,FluencyBank_010_1,-438.671997,-419.725128,-417.393494,-410.429810,-408.278900,-409.253845,-408.491180,-408.270386,-411.358856,...,-9.380219,-7.065189,-0.610028,-4.598778,-9.972732,-12.226573,-8.808855,-7.627140,-5.797943,0.0
2,FluencyBank_010_10,-161.647873,-123.057701,-139.113174,-158.738495,-191.542191,-225.345139,-251.198715,-286.218719,-315.354279,...,-10.462812,-4.572368,-3.187361,-10.327148,-14.540721,-14.137384,-15.553209,-20.055593,-17.260246,0.0
3,FluencyBank_010_11,-427.028564,-397.329529,-394.007721,-395.439880,-397.534607,-395.810181,-401.564575,-346.869751,-223.398605,...,-8.293388,-10.565519,-14.671823,-12.170221,-4.913915,0.071892,-3.458159,-7.450909,-3.462155,0.0
4,FluencyBank_010_12,-293.398438,-282.992462,-221.619446,-157.533371,-125.753029,-112.959358,-92.637047,-67.506477,-80.915756,...,-7.808949,-7.652052,7.907707,9.569012,6.291955,-0.370439,-23.622349,-23.704752,-22.841469,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,FluencyBank_985_95,-443.876617,-401.284729,-387.183258,-378.139984,-374.589996,-372.226715,-372.407257,-381.320831,-396.568726,...,-0.333659,-0.964053,3.226261,-1.507389,-7.928948,-2.500397,-7.246117,-13.018373,-11.126150,0.0
3949,FluencyBank_985_96,-248.297775,-236.968079,-256.489441,-179.804199,-138.778915,-140.303513,-169.274170,-187.856918,-204.508316,...,-17.644371,-16.403625,-8.473763,-7.692134,-9.149691,-11.734518,-14.262653,-16.372873,-16.159088,0.0
3950,FluencyBank_985_97,-451.358978,-411.656891,-409.293213,-412.606598,-408.201843,-409.166840,-412.488556,-409.688263,-407.622314,...,-22.120049,-20.163528,-16.951302,-18.500633,-15.504421,-14.256115,-10.287129,-3.862890,-5.044336,1.0
3951,FluencyBank_985_98,-403.948334,-322.281982,-267.254852,-257.917694,-266.802399,-297.248413,-343.174957,-376.987091,-390.500305,...,-7.709820,-11.258406,-12.396635,-6.766040,-7.524735,-10.935984,-14.497244,-11.677958,-8.171494,0.0


In [38]:
X_word = df_word.drop(['index', 'WordRep'], axis=1)
X_word

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1212,1213,1214,1215,1216,1217,1218,1219,1220,1221
0,-347.740692,-317.819641,-320.320618,-332.317169,-343.356598,-344.344421,-354.105988,-366.142670,-372.481995,-374.783325,...,1.723361,1.272797,-5.230965,-14.970407,-21.465916,-18.692142,-21.989422,-28.344925,-29.615908,-14.021580
1,-438.671997,-419.725128,-417.393494,-410.429810,-408.278900,-409.253845,-408.491180,-408.270386,-411.358856,-412.211578,...,-8.160137,-9.380219,-7.065189,-0.610028,-4.598778,-9.972732,-12.226573,-8.808855,-7.627140,-5.797943
2,-161.647873,-123.057701,-139.113174,-158.738495,-191.542191,-225.345139,-251.198715,-286.218719,-315.354279,-334.004761,...,-21.150717,-10.462812,-4.572368,-3.187361,-10.327148,-14.540721,-14.137384,-15.553209,-20.055593,-17.260246
3,-427.028564,-397.329529,-394.007721,-395.439880,-397.534607,-395.810181,-401.564575,-346.869751,-223.398605,-167.180069,...,-7.202801,-8.293388,-10.565519,-14.671823,-12.170221,-4.913915,0.071892,-3.458159,-7.450909,-3.462155
4,-293.398438,-282.992462,-221.619446,-157.533371,-125.753029,-112.959358,-92.637047,-67.506477,-80.915756,-117.480995,...,-7.929729,-7.808949,-7.652052,7.907707,9.569012,6.291955,-0.370439,-23.622349,-23.704752,-22.841469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,-443.876617,-401.284729,-387.183258,-378.139984,-374.589996,-372.226715,-372.407257,-381.320831,-396.568726,-409.386047,...,2.072358,-0.333659,-0.964053,3.226261,-1.507389,-7.928948,-2.500397,-7.246117,-13.018373,-11.126150
3949,-248.297775,-236.968079,-256.489441,-179.804199,-138.778915,-140.303513,-169.274170,-187.856918,-204.508316,-223.737442,...,-13.776359,-17.644371,-16.403625,-8.473763,-7.692134,-9.149691,-11.734518,-14.262653,-16.372873,-16.159088
3950,-451.358978,-411.656891,-409.293213,-412.606598,-408.201843,-409.166840,-412.488556,-409.688263,-407.622314,-408.622650,...,-15.045193,-22.120049,-20.163528,-16.951302,-18.500633,-15.504421,-14.256115,-10.287129,-3.862890,-5.044336
3951,-403.948334,-322.281982,-267.254852,-257.917694,-266.802399,-297.248413,-343.174957,-376.987091,-390.500305,-302.544739,...,-8.378985,-7.709820,-11.258406,-12.396635,-6.766040,-7.524735,-10.935984,-14.497244,-11.677958,-8.171494


In [39]:
y_word = df_word['WordRep']
y_word

0       0.0
1       0.0
2       0.0
3       0.0
4       3.0
       ... 
3948    0.0
3949    0.0
3950    1.0
3951    0.0
3952    3.0
Name: WordRep, Length: 3845, dtype: float64

In [40]:
X_word_train, X_word_test, y_word_train, y_word_test =  train_test_split(X_word, y_word, 
                                                                     test_size=0.1, 
                                                                     random_state=42)

In [41]:
clf = DecisionTreeClassifier(criterion='entropy',random_state=5)
clf.fit(X_word_train,y_word_train)
clf.score(X_word_test,y_word_test) * 100


61.81818181818181

In [42]:
model = SVC(kernel="linear")
model = model.fit(X_word_train, y_word_train)
model.score(X_word_test, y_word_test) * 100

58.44155844155844

In [43]:
df_sound = pd.DataFrame.from_dict(features)
df_sound = df_sound.transpose()
df_sound = df_sound.reset_index()
df_sound = df_sound.sort_values(by='index')
df_sound['SoundRep'] = df['SoundRep']
df_sound

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,1213,1214,1215,1216,1217,1218,1219,1220,1221,SoundRep
0,FluencyBank_010_0,-347.740692,-317.819641,-320.320618,-332.317169,-343.356598,-344.344421,-354.105988,-366.142670,-372.481995,...,1.272797,-5.230965,-14.970407,-21.465916,-18.692142,-21.989422,-28.344925,-29.615908,-14.021580,0.0
1,FluencyBank_010_1,-438.671997,-419.725128,-417.393494,-410.429810,-408.278900,-409.253845,-408.491180,-408.270386,-411.358856,...,-9.380219,-7.065189,-0.610028,-4.598778,-9.972732,-12.226573,-8.808855,-7.627140,-5.797943,0.0
2,FluencyBank_010_10,-161.647873,-123.057701,-139.113174,-158.738495,-191.542191,-225.345139,-251.198715,-286.218719,-315.354279,...,-10.462812,-4.572368,-3.187361,-10.327148,-14.540721,-14.137384,-15.553209,-20.055593,-17.260246,0.0
3,FluencyBank_010_11,-427.028564,-397.329529,-394.007721,-395.439880,-397.534607,-395.810181,-401.564575,-346.869751,-223.398605,...,-8.293388,-10.565519,-14.671823,-12.170221,-4.913915,0.071892,-3.458159,-7.450909,-3.462155,0.0
4,FluencyBank_010_12,-293.398438,-282.992462,-221.619446,-157.533371,-125.753029,-112.959358,-92.637047,-67.506477,-80.915756,...,-7.808949,-7.652052,7.907707,9.569012,6.291955,-0.370439,-23.622349,-23.704752,-22.841469,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,FluencyBank_985_95,-443.876617,-401.284729,-387.183258,-378.139984,-374.589996,-372.226715,-372.407257,-381.320831,-396.568726,...,-0.333659,-0.964053,3.226261,-1.507389,-7.928948,-2.500397,-7.246117,-13.018373,-11.126150,1.0
3949,FluencyBank_985_96,-248.297775,-236.968079,-256.489441,-179.804199,-138.778915,-140.303513,-169.274170,-187.856918,-204.508316,...,-17.644371,-16.403625,-8.473763,-7.692134,-9.149691,-11.734518,-14.262653,-16.372873,-16.159088,0.0
3950,FluencyBank_985_97,-451.358978,-411.656891,-409.293213,-412.606598,-408.201843,-409.166840,-412.488556,-409.688263,-407.622314,...,-22.120049,-20.163528,-16.951302,-18.500633,-15.504421,-14.256115,-10.287129,-3.862890,-5.044336,0.0
3951,FluencyBank_985_98,-403.948334,-322.281982,-267.254852,-257.917694,-266.802399,-297.248413,-343.174957,-376.987091,-390.500305,...,-7.709820,-11.258406,-12.396635,-6.766040,-7.524735,-10.935984,-14.497244,-11.677958,-8.171494,0.0


In [44]:
df_sound = df_sound[df_sound['SoundRep'].notna()]
df_sound

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,1213,1214,1215,1216,1217,1218,1219,1220,1221,SoundRep
0,FluencyBank_010_0,-347.740692,-317.819641,-320.320618,-332.317169,-343.356598,-344.344421,-354.105988,-366.142670,-372.481995,...,1.272797,-5.230965,-14.970407,-21.465916,-18.692142,-21.989422,-28.344925,-29.615908,-14.021580,0.0
1,FluencyBank_010_1,-438.671997,-419.725128,-417.393494,-410.429810,-408.278900,-409.253845,-408.491180,-408.270386,-411.358856,...,-9.380219,-7.065189,-0.610028,-4.598778,-9.972732,-12.226573,-8.808855,-7.627140,-5.797943,0.0
2,FluencyBank_010_10,-161.647873,-123.057701,-139.113174,-158.738495,-191.542191,-225.345139,-251.198715,-286.218719,-315.354279,...,-10.462812,-4.572368,-3.187361,-10.327148,-14.540721,-14.137384,-15.553209,-20.055593,-17.260246,0.0
3,FluencyBank_010_11,-427.028564,-397.329529,-394.007721,-395.439880,-397.534607,-395.810181,-401.564575,-346.869751,-223.398605,...,-8.293388,-10.565519,-14.671823,-12.170221,-4.913915,0.071892,-3.458159,-7.450909,-3.462155,0.0
4,FluencyBank_010_12,-293.398438,-282.992462,-221.619446,-157.533371,-125.753029,-112.959358,-92.637047,-67.506477,-80.915756,...,-7.808949,-7.652052,7.907707,9.569012,6.291955,-0.370439,-23.622349,-23.704752,-22.841469,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,FluencyBank_985_95,-443.876617,-401.284729,-387.183258,-378.139984,-374.589996,-372.226715,-372.407257,-381.320831,-396.568726,...,-0.333659,-0.964053,3.226261,-1.507389,-7.928948,-2.500397,-7.246117,-13.018373,-11.126150,1.0
3949,FluencyBank_985_96,-248.297775,-236.968079,-256.489441,-179.804199,-138.778915,-140.303513,-169.274170,-187.856918,-204.508316,...,-17.644371,-16.403625,-8.473763,-7.692134,-9.149691,-11.734518,-14.262653,-16.372873,-16.159088,0.0
3950,FluencyBank_985_97,-451.358978,-411.656891,-409.293213,-412.606598,-408.201843,-409.166840,-412.488556,-409.688263,-407.622314,...,-22.120049,-20.163528,-16.951302,-18.500633,-15.504421,-14.256115,-10.287129,-3.862890,-5.044336,0.0
3951,FluencyBank_985_98,-403.948334,-322.281982,-267.254852,-257.917694,-266.802399,-297.248413,-343.174957,-376.987091,-390.500305,...,-7.709820,-11.258406,-12.396635,-6.766040,-7.524735,-10.935984,-14.497244,-11.677958,-8.171494,0.0


In [45]:
df_sound['SoundRep'].value_counts()

SoundRep
0.0    2784
1.0     543
2.0     343
3.0     175
Name: count, dtype: int64

In [46]:
df_sound.loc[df["SoundRep"] >= 1, "SoundRep"] = 1

In [47]:
df_sound['SoundRep'].value_counts()

SoundRep
0.0    2784
1.0    1061
Name: count, dtype: int64

In [48]:
X_sound = df_sound.drop(['index', 'SoundRep'], axis=1)
X_sound

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1212,1213,1214,1215,1216,1217,1218,1219,1220,1221
0,-347.740692,-317.819641,-320.320618,-332.317169,-343.356598,-344.344421,-354.105988,-366.142670,-372.481995,-374.783325,...,1.723361,1.272797,-5.230965,-14.970407,-21.465916,-18.692142,-21.989422,-28.344925,-29.615908,-14.021580
1,-438.671997,-419.725128,-417.393494,-410.429810,-408.278900,-409.253845,-408.491180,-408.270386,-411.358856,-412.211578,...,-8.160137,-9.380219,-7.065189,-0.610028,-4.598778,-9.972732,-12.226573,-8.808855,-7.627140,-5.797943
2,-161.647873,-123.057701,-139.113174,-158.738495,-191.542191,-225.345139,-251.198715,-286.218719,-315.354279,-334.004761,...,-21.150717,-10.462812,-4.572368,-3.187361,-10.327148,-14.540721,-14.137384,-15.553209,-20.055593,-17.260246
3,-427.028564,-397.329529,-394.007721,-395.439880,-397.534607,-395.810181,-401.564575,-346.869751,-223.398605,-167.180069,...,-7.202801,-8.293388,-10.565519,-14.671823,-12.170221,-4.913915,0.071892,-3.458159,-7.450909,-3.462155
4,-293.398438,-282.992462,-221.619446,-157.533371,-125.753029,-112.959358,-92.637047,-67.506477,-80.915756,-117.480995,...,-7.929729,-7.808949,-7.652052,7.907707,9.569012,6.291955,-0.370439,-23.622349,-23.704752,-22.841469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,-443.876617,-401.284729,-387.183258,-378.139984,-374.589996,-372.226715,-372.407257,-381.320831,-396.568726,-409.386047,...,2.072358,-0.333659,-0.964053,3.226261,-1.507389,-7.928948,-2.500397,-7.246117,-13.018373,-11.126150
3949,-248.297775,-236.968079,-256.489441,-179.804199,-138.778915,-140.303513,-169.274170,-187.856918,-204.508316,-223.737442,...,-13.776359,-17.644371,-16.403625,-8.473763,-7.692134,-9.149691,-11.734518,-14.262653,-16.372873,-16.159088
3950,-451.358978,-411.656891,-409.293213,-412.606598,-408.201843,-409.166840,-412.488556,-409.688263,-407.622314,-408.622650,...,-15.045193,-22.120049,-20.163528,-16.951302,-18.500633,-15.504421,-14.256115,-10.287129,-3.862890,-5.044336
3951,-403.948334,-322.281982,-267.254852,-257.917694,-266.802399,-297.248413,-343.174957,-376.987091,-390.500305,-302.544739,...,-8.378985,-7.709820,-11.258406,-12.396635,-6.766040,-7.524735,-10.935984,-14.497244,-11.677958,-8.171494


In [49]:
y_sound = df_sound['SoundRep']
y_sound

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
3948    1.0
3949    0.0
3950    0.0
3951    0.0
3952    0.0
Name: SoundRep, Length: 3845, dtype: float64

In [50]:
X_sound_train, X_sound_test, y_sound_train, y_sound_test =  train_test_split(X_sound, y_sound, 
                                                                     test_size=0.2, 
                                                                     random_state=42)

In [51]:
clf = DecisionTreeClassifier(criterion='entropy',random_state=5)
clf.fit(X_sound_train,y_sound_train)
clf.score(X_sound_test,y_sound_test) * 100


61.89856957087127

In [None]:
model = SVC(kernel="linear")
model = model.fit(X_sound_train, y_sound_train)
model.score(X_sound_test, y_sound_test) * 100

In [None]:
nn_model = MLPClassifier(random_state=1, max_iter=300).fit(X_sound_train, y_sound_train)
nn_model.score(X_sound_test, y_sound_test) * 100

In [None]:
y_pred = np.array(nn_model.predict(X_sound_test))
print(np.count_nonzero(y_pred == 0))
print(np.count_nonzero(y_pred == 1))
print(np.count_nonzero(y_pred == 2))
print(np.count_nonzero(y_pred == 3))

y_actual = np.array(y_sound_test)
print(np.count_nonzero(y_actual == 0))
print(np.count_nonzero(y_actual == 1))
print(np.count_nonzero(y_actual == 2))
print(np.count_nonzero(y_actual == 3))

In [None]:
df_pro = pd.DataFrame.from_dict(features)
df_pro = df_pro.transpose()
df_pro

In [None]:
df_pro = df_pro.reset_index()
df_pro = df_pro.sort_values(by='index')
df_pro

In [None]:
df_pro['Prolongation'] = df['Prolongation']
df_pro

In [None]:
df_pro = df_pro[df_pro['Prolongation'].notna()]
df_pro.info()

In [None]:
df_pro['Prolongation'].value_counts()

In [None]:
X_pro = df_pro.drop(['index', 'Prolongation'], axis=1)
X_pro

In [None]:
y_pro = df_pro['Prolongation']
y_pro

In [None]:
y_pro.value_counts()

In [None]:
X_pro_train, X_pro_test, y_pro_train, y_pro_test =  train_test_split(X_pro, y_pro, 
                                                                     test_size=0.3, 
                                                                     random_state=42)

In [None]:
clf = DecisionTreeClassifier(criterion='entropy',random_state=5)
clf.fit(X_pro_train,y_pro_train)
clf.score(X_pro_test,y_pro_test)*100

In [None]:
from keras.layers import Activation, Dense, Dropout, Conv2D, Flatten, MaxPooling2D, Reshape, LSTM, Embedding, SpatialDropout1D
from keras.layers.recurrent import GRU
from keras.models import Sequential
from keras.callbacks import EarlyStopping

model = Sequential()
model.add(Embedding(1222, 1, input_length=X_pro_train.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(4, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 3
batch_size = 64

history = model.fit(X_pro_train, y_pro_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

In [None]:
model = SVC(kernel="linear")
model = model.fit(X_pro_train, y_pro_train)

In [None]:
y_pred = np.array(model.predict(X_pro_test))
print(np.count_nonzero(y_pred == 0))
print(np.count_nonzero(y_pred == 1))
print(np.count_nonzero(y_pred == 2))
print(np.count_nonzero(y_pred == 3))

y_actual = np.array(y_pro_test)
print(np.count_nonzero(y_actual == 0))
print(np.count_nonzero(y_actual == 1))
print(np.count_nonzero(y_actual == 2))
print(np.count_nonzero(y_actual == 3))

In [None]:
dfs = pd.DataFrame.from_dict(features)
dfs = dfs.transpose()
dfs = dfs.reset_index()
dfs = dfs.sort_values(by='index')
dfs

In [None]:
dfs['Stutter'] = df['Prolongation'] + df['WordRep'] + df['SoundRep']

In [None]:
dfs

In [None]:
dfs = dfs[dfs['Stutter'].notna()]
dfs.describe()

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = MinMaxScaler()
dfs.iloc[:, 1:1223] = scaler.fit_transform(dfs.iloc[:, 1:1223])
dfs

In [None]:
X = dfs.drop(['index', 'Stutter'], axis=1)
X

In [None]:
y = dfs['Stutter']
y

In [None]:
y.value_counts()

In [None]:
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = SVC(kernel="linear")
model = model.fit(X_train, y_train)
model.score(X_test, y_test) * 100