In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,confusion_matrix,classification_report

# question

### State with reasons whether the following sentences are true or false?

1- The sentence "SVMs are not suitable for large datasets" is false. SVMs can handle large datasets efficiently due to their ability to work with subsets of training data called support vectors.

2- The sentence "SVMs perform poorly in imbalanced datasets" is true. SVMs aim to find a decision boundary that maximizes the margin between classes. In imbalanced datasets where one class has significantly fewer samples than the other, SVMs may misclassify the minority class due to the emphasis on maximizing the overall margin.

3- The sentence "SVMs perform poorly when there is just too much noise in the data" is true. SVMs are sensitive to noisy data and outliers. As noise interferes with the separation of classes, SVMs may struggle to find an optimal decision boundary, leading to decreased performance.

# lsvt-voice-rehabilitation

In [3]:
xls = pd.ExcelFile('LSVT_voice_rehabilitation.xlsx')
data = pd.read_excel(xls, 'Data')
response = pd.read_excel(xls, 'Binary response')
subject = pd.read_excel(xls, 'Subject demographics')

In [4]:
data

Unnamed: 0,Jitter->F0_abs_dif,Jitter->F0_dif_percent,Jitter->F0_PQ5_classical_Schoentgen,Jitter->F0_PQ5_classical_Baken,Jitter->F0_PQ5_generalised_Schoentgen,Jitter->F0_abs0th_perturb,Jitter->F0_CV,Jitter->F0_TKEO_mean,Jitter->F0_TKEO_std,Jitter->F0_TKEO_prc5,...,det_TKEO_std4_1_coef,det_TKEO_std4_2_coef,det_TKEO_std4_3_coef,det_TKEO_std4_4_coef,det_TKEO_std4_5_coef,det_TKEO_std4_6_coef,det_TKEO_std4_7_coef,det_TKEO_std4_8_coef,det_TKEO_std4_9_coef,det_TKEO_std4_10_coef
0,0.088112,0.041697,0.000480,-3.723304e-06,0.000422,2.458381,6.332164e-07,47.021079,1366.430390,-7.103323,...,2.527583,7.088978,19.753255,54.335046,145.528630,375.097397,921.296579,2137.079844,4697.131077,9931.208257
1,0.161798,0.057364,0.000677,5.466365e-06,0.000206,2.592066,7.228518e-07,93.557936,2582.922776,-23.284761,...,2.841881,7.977363,22.203504,60.993338,163.560972,421.010306,1036.092589,2404.072562,5284.082128,11165.095662
2,0.554508,0.642913,0.007576,-7.443871e-07,0.006488,12.691326,6.946246e-04,52.988422,466.682635,-45.308680,...,1.806103,5.078616,14.135923,38.641654,103.466808,264.654626,649.657090,1507.384591,3315.804236,6974.600636
3,0.031089,0.027108,0.000314,-2.214722e-07,0.000216,0.754288,1.868647e-07,13.982754,417.217249,-1.207741,...,1.999637,5.610448,15.626164,42.943275,115.014975,296.320795,728.284936,1689.586636,3713.818933,7851.139360
4,0.076177,0.039071,0.000302,2.732106e-05,0.001102,1.270034,4.918186e-05,56.373996,1608.317410,-3.491990,...,2.453087,6.902199,19.117609,52.715873,141.113865,363.511021,893.246151,2071.625622,4554.204815,9623.566242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,0.116214,0.070546,0.000837,2.765074e-06,0.000333,1.890006,9.844257e-07,34.083311,896.672947,-10.446397,...,2.322612,6.527285,18.210931,50.085484,134.123291,345.396264,850.942761,1973.383824,4336.099395,9158.984652
122,0.700258,0.334397,0.003959,8.297261e-06,0.001516,4.557797,1.581899e-05,104.648435,1583.166169,-97.281717,...,2.545596,7.160142,20.081389,55.178858,147.584708,379.897760,935.982559,2166.960428,4769.956102,10067.750435
123,0.072635,0.050743,0.000597,-5.277518e-06,0.000434,6.984651,4.993260e-07,21.859427,625.288493,-4.116001,...,2.189909,6.141876,17.095325,46.893015,125.687344,323.728298,795.715774,1845.609006,4056.256338,8583.121863
124,0.111362,0.054237,0.000646,-1.546671e-06,0.000277,1.935398,4.398078e-07,47.870508,1367.843467,-9.373059,...,2.539158,7.125849,19.849347,54.508453,146.094711,376.377835,926.435019,2147.499571,4717.270683,9966.759379


In [5]:
response

Unnamed: 0,"Binary class 1=acceptable, 2=unacceptable"
0,1
1,2
2,2
3,1
4,2
...,...
121,2
122,2
123,1
124,2


In [6]:
subject

Unnamed: 0,Subject_index,Age,"Gender, 0->Male, 1->Female"
0,1,68,1
1,1,68,1
2,1,68,1
3,2,68,0
4,2,68,0
...,...,...,...
121,15,63,0
122,15,63,0
123,16,69,0
124,16,69,0


In [7]:
df = pd.concat([data,subject,response],axis=1)

In [8]:
df

Unnamed: 0,Jitter->F0_abs_dif,Jitter->F0_dif_percent,Jitter->F0_PQ5_classical_Schoentgen,Jitter->F0_PQ5_classical_Baken,Jitter->F0_PQ5_generalised_Schoentgen,Jitter->F0_abs0th_perturb,Jitter->F0_CV,Jitter->F0_TKEO_mean,Jitter->F0_TKEO_std,Jitter->F0_TKEO_prc5,...,det_TKEO_std4_5_coef,det_TKEO_std4_6_coef,det_TKEO_std4_7_coef,det_TKEO_std4_8_coef,det_TKEO_std4_9_coef,det_TKEO_std4_10_coef,Subject_index,Age,"Gender, 0->Male, 1->Female","Binary class 1=acceptable, 2=unacceptable"
0,0.088112,0.041697,0.000480,-3.723304e-06,0.000422,2.458381,6.332164e-07,47.021079,1366.430390,-7.103323,...,145.528630,375.097397,921.296579,2137.079844,4697.131077,9931.208257,1,68,1,1
1,0.161798,0.057364,0.000677,5.466365e-06,0.000206,2.592066,7.228518e-07,93.557936,2582.922776,-23.284761,...,163.560972,421.010306,1036.092589,2404.072562,5284.082128,11165.095662,1,68,1,2
2,0.554508,0.642913,0.007576,-7.443871e-07,0.006488,12.691326,6.946246e-04,52.988422,466.682635,-45.308680,...,103.466808,264.654626,649.657090,1507.384591,3315.804236,6974.600636,1,68,1,2
3,0.031089,0.027108,0.000314,-2.214722e-07,0.000216,0.754288,1.868647e-07,13.982754,417.217249,-1.207741,...,115.014975,296.320795,728.284936,1689.586636,3713.818933,7851.139360,2,68,0,1
4,0.076177,0.039071,0.000302,2.732106e-05,0.001102,1.270034,4.918186e-05,56.373996,1608.317410,-3.491990,...,141.113865,363.511021,893.246151,2071.625622,4554.204815,9623.566242,2,68,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,0.116214,0.070546,0.000837,2.765074e-06,0.000333,1.890006,9.844257e-07,34.083311,896.672947,-10.446397,...,134.123291,345.396264,850.942761,1973.383824,4336.099395,9158.984652,15,63,0,2
122,0.700258,0.334397,0.003959,8.297261e-06,0.001516,4.557797,1.581899e-05,104.648435,1583.166169,-97.281717,...,147.584708,379.897760,935.982559,2166.960428,4769.956102,10067.750435,15,63,0,2
123,0.072635,0.050743,0.000597,-5.277518e-06,0.000434,6.984651,4.993260e-07,21.859427,625.288493,-4.116001,...,125.687344,323.728298,795.715774,1845.609006,4056.256338,8583.121863,16,69,0,1
124,0.111362,0.054237,0.000646,-1.546671e-06,0.000277,1.935398,4.398078e-07,47.870508,1367.843467,-9.373059,...,146.094711,376.377835,926.435019,2147.499571,4717.270683,9966.759379,16,69,0,2


In [9]:
df.describe()

Unnamed: 0,Jitter->F0_abs_dif,Jitter->F0_dif_percent,Jitter->F0_PQ5_classical_Schoentgen,Jitter->F0_PQ5_classical_Baken,Jitter->F0_PQ5_generalised_Schoentgen,Jitter->F0_abs0th_perturb,Jitter->F0_CV,Jitter->F0_TKEO_mean,Jitter->F0_TKEO_std,Jitter->F0_TKEO_prc5,...,det_TKEO_std4_5_coef,det_TKEO_std4_6_coef,det_TKEO_std4_7_coef,det_TKEO_std4_8_coef,det_TKEO_std4_9_coef,det_TKEO_std4_10_coef,Subject_index,Age,"Gender, 0->Male, 1->Female","Binary class 1=acceptable, 2=unacceptable"
count,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0,...,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0
mean,0.225683,0.119056,0.001385,-6.951801e-07,0.001061,6.937604,0.0001660453,82.190214,1796.30223,-39.41124,...,143.774055,370.528329,912.494576,2116.358203,4649.660947,9813.389954,8.357143,61.928571,0.428571,1.666667
std,0.345839,0.180097,0.002124,1.573954e-05,0.001836,24.275232,0.0008712998,106.894449,1631.244809,98.188655,...,20.468616,52.836331,129.246368,296.40001,653.497584,1365.084335,4.927416,6.252268,0.496847,0.473286
min,0.020443,0.01383,0.000159,-5.832686e-05,6e-05,0.548917,7.679846e-08,13.319091,269.922782,-624.951034,...,103.466808,264.654626,649.65709,1507.384591,3216.625092,6700.842161,1.0,51.0,0.0,1.0
25%,0.057713,0.028676,0.00032,-5.121154e-06,0.000248,1.15823,4.132032e-07,29.760987,807.453611,-23.075601,...,128.924436,331.900606,818.09773,1888.120153,4148.651943,8760.159506,4.0,58.0,0.0,1.0
50%,0.103926,0.048597,0.000521,-2.05057e-06,0.000389,2.012467,9.491368e-07,52.594076,1356.767272,-8.83389,...,141.675948,366.706743,902.870861,2096.309958,4597.088099,9741.107531,7.5,63.5,0.0,2.0
75%,0.192672,0.126339,0.001487,1.938132e-06,0.001029,3.251788,9.886929e-06,91.694761,2380.748968,-4.115575,...,155.552562,400.947223,988.90931,2292.902649,5037.68306,10633.458586,13.0,67.0,1.0,2.0
max,2.069748,0.933265,0.010925,9.082689e-05,0.010443,218.686963,0.007890252,906.320483,12299.065627,-0.603839,...,198.333548,510.827346,1257.01411,2916.248441,6406.074011,13537.402516,16.0,69.0,1.0,2.0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Columns: 314 entries, Jitter->F0_abs_dif to Binary class 1=acceptable, 2=unacceptable
dtypes: float64(309), int64(5)
memory usage: 309.2 KB


## Model

In [11]:
x_train,x_test,y_train,y_test = train_test_split(pd.concat([data,subject],axis=1),response,random_state=42)

In [12]:
x_train

Unnamed: 0,Jitter->F0_abs_dif,Jitter->F0_dif_percent,Jitter->F0_PQ5_classical_Schoentgen,Jitter->F0_PQ5_classical_Baken,Jitter->F0_PQ5_generalised_Schoentgen,Jitter->F0_abs0th_perturb,Jitter->F0_CV,Jitter->F0_TKEO_mean,Jitter->F0_TKEO_std,Jitter->F0_TKEO_prc5,...,det_TKEO_std4_4_coef,det_TKEO_std4_5_coef,det_TKEO_std4_6_coef,det_TKEO_std4_7_coef,det_TKEO_std4_8_coef,det_TKEO_std4_9_coef,det_TKEO_std4_10_coef,Subject_index,Age,"Gender, 0->Male, 1->Female"
47,0.054323,0.017857,0.000203,-3.843455e-06,0.000240,4.612988,1.781817e-07,91.697524,2753.299161,-7.130365,...,61.931458,165.908079,427.390937,1051.039031,2438.050611,5359.311281,11310.212172,2,68,0
109,0.071532,0.035777,0.000382,-1.487704e-05,0.000611,2.124373,2.974657e-06,38.980439,1086.227101,-3.896339,...,52.308884,140.229582,361.486146,885.130393,2050.902403,4514.808902,9549.912566,11,62,1
123,0.072635,0.050743,0.000597,-5.277518e-06,0.000434,6.984651,4.993260e-07,21.859427,625.288493,-4.116001,...,46.893015,125.687344,323.728298,795.715774,1845.609006,4056.256338,8583.121863,16,69,0
15,0.037592,0.025793,0.000306,7.725535e-07,0.000108,0.968243,1.300052e-07,21.916628,664.710687,-1.765231,...,47.249466,126.589594,326.072560,801.486390,1859.154995,4088.830256,8638.195110,6,58,0
96,0.075121,0.042746,0.000507,8.374407e-07,0.000227,1.776438,3.018278e-07,34.245446,1001.879061,-4.836478,...,51.346563,137.602442,354.293261,872.609610,2023.981089,4449.398057,9393.778399,5,62,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.053964,0.024475,0.000285,-5.018743e-06,0.000132,2.015948,2.107112e-07,52.199730,1556.504881,-4.279332,...,55.986395,150.026007,386.456106,951.385565,2207.212963,4854.184043,10249.319805,8,65,1
14,0.545266,0.215351,0.002537,-5.156799e-06,0.001999,16.726819,1.956102e-05,73.152923,640.669570,-121.354402,...,38.394547,106.478649,279.072641,725.208967,1723.983681,4049.117874,8615.614385,5,62,1
92,0.151367,0.095865,0.001087,-2.453946e-05,0.001299,1.198491,5.333326e-06,29.120045,661.924062,-16.034130,...,47.720302,128.033687,330.022568,806.834995,1871.996734,4119.765350,8718.331650,3,67,0
51,0.078691,0.021119,0.000248,-3.114055e-06,0.000095,1.806555,1.099312e-07,143.972136,4342.907854,-9.725394,...,67.004863,179.541255,462.526986,1138.003463,2638.819661,5800.546445,12256.176332,4,64,1


In [13]:
y_train

Unnamed: 0,"Binary class 1=acceptable, 2=unacceptable"
47,2
109,2
123,1
15,1
96,1
...,...
106,2
14,2
92,2
51,1


### svm_linear

In [14]:
svm_linear = Pipeline([('scaler', StandardScaler()), 
                       ('lin', SVC(C=1,kernel='linear',random_state=42))
                      ])
svm_linear.fit(x_train, y_train)

In [15]:
cross_val_score(svm_linear, x_train, y_train, cv=5, scoring="f1")

array([0.61538462, 0.76923077, 0.66666667, 0.70588235, 0.6       ])

In [16]:
y_pred_l = svm_linear.predict(x_test)

In [17]:
print(f1_score(y_test, y_pred_l))
print(precision_score(y_test, y_pred_l))
print(recall_score(y_test, y_pred_l))

0.72
0.5625
1.0


In [18]:
print(classification_report(y_test, y_pred_l))

              precision    recall  f1-score   support

           1       0.56      1.00      0.72         9
           2       1.00      0.70      0.82        23

    accuracy                           0.78        32
   macro avg       0.78      0.85      0.77        32
weighted avg       0.88      0.78      0.79        32



### svm_polynomial

In [19]:
svm_poly = Pipeline([('scaler', StandardScaler()), 
                   ('poly', SVC(kernel='poly', random_state=42, degree=3, coef0=1))
                      ])

svm_poly.fit(x_train, y_train)

In [20]:
y_pred_p = svm_poly.predict(x_test)
print(f1_score(y_test, y_pred_p))
print(precision_score(y_test, y_pred_p))
print(recall_score(y_test, y_pred_p))

0.761904761904762
0.6666666666666666
0.8888888888888888


In [21]:
print(classification_report(y_test, y_pred_p))

              precision    recall  f1-score   support

           1       0.67      0.89      0.76         9
           2       0.95      0.83      0.88        23

    accuracy                           0.84        32
   macro avg       0.81      0.86      0.82        32
weighted avg       0.87      0.84      0.85        32



### svm_rbf

In [22]:
svm_rbf = Pipeline([('scaler', StandardScaler()), 
                    ('poly', SVC(kernel='rbf', random_state=42, gamma='auto'))
                      ])
svm_rbf.fit(x_train, y_train)

In [23]:
y_pred_r = svm_rbf.predict(x_test)
print(f1_score(y_test, y_pred_r))
print(precision_score(y_test, y_pred_r))
print(recall_score(y_test, y_pred_r))

0.8750000000000001
1.0
0.7777777777777778


In [24]:
print(classification_report(y_test, y_pred_r))

              precision    recall  f1-score   support

           1       1.00      0.78      0.88         9
           2       0.92      1.00      0.96        23

    accuracy                           0.94        32
   macro avg       0.96      0.89      0.92        32
weighted avg       0.94      0.94      0.93        32



### svm_sigmoid

In [25]:
svm_sig = Pipeline([('scaler', StandardScaler()), 
                    ('poly', SVC(kernel='sigmoid', random_state=42, gamma='auto', coef0=0.1))
                      ])
svm_sig.fit(x_train, y_train)

In [26]:
y_pred_s = svm_sig.predict(x_test)
print(f1_score(y_test, y_pred_s))
print(precision_score(y_test, y_pred_s))
print(recall_score(y_test, y_pred_s))

0.8421052631578948
0.8
0.8888888888888888


In [27]:
print(classification_report(y_test, y_pred_s))

              precision    recall  f1-score   support

           1       0.80      0.89      0.84         9
           2       0.95      0.91      0.93        23

    accuracy                           0.91        32
   macro avg       0.88      0.90      0.89        32
weighted avg       0.91      0.91      0.91        32



## Fine Tune

In [28]:
best_p_pipeline = Pipeline([
                    ('scalar',StandardScaler()),
                    ('svc',SVC())
                ])

In [29]:
parameters = {'svc__C': [0.1,1,4],
          'svc__coef0':[0,1,2],
          'svc__degree':[2,3,4],
          'svc__gamma':['auto','scale',0.01],
          'svc__kernel':['poly','rbf','sigmoid']}
grids = GridSearchCV(best_p_pipeline, parameters, cv=3, scoring='f1_macro')

In [30]:
grids.fit(x_train,y_train)

In [32]:
new_pred = grids.predict(x_test)

In [33]:
print(f1_score(y_test, new_pred))
print(precision_score(y_test, new_pred))
print(recall_score(y_test, new_pred))

0.72
0.5625
1.0


In [34]:
print(classification_report(y_test, new_pred))

              precision    recall  f1-score   support

           1       0.56      1.00      0.72         9
           2       1.00      0.70      0.82        23

    accuracy                           0.78        32
   macro avg       0.78      0.85      0.77        32
weighted avg       0.88      0.78      0.79        32



In [35]:
grids.best_params_

{'svc__C': 4,
 'svc__coef0': 0,
 'svc__degree': 2,
 'svc__gamma': 'auto',
 'svc__kernel': 'sigmoid'}

In [25]:
poly_svm = GridSearchCV(poly_FT, param_grid=poly_hyper, cv=3, scoring='f1_micro')
poly_svm.fit(x_train, y_train)

In [26]:
poly_p = poly_svm.predict(x_test)

In [30]:
sig_FT = Pipeline([('scaler', StandardScaler()), 
                   ('poly', SVC(kernel='sigmoid', random_state=42))
                      ])
sig_hyper = {'poly__gamma': range(2,10,1) , 'poly__coef0': range(2,10,1)}

In [31]:
sig_svm = GridSearchCV(sig_FT, param_grid=sig_hyper, cv=3, scoring='f1_micro')
sig_svm.fit(x_train, y_train)

In [32]:
sig_p = sig_svm.predict(x_test)

in this set of data, after fine tuning, sigmoid has better function and f1_score