In [1]:
import pandas as pd
import numpy as np

In [2]:
df_data = pd.read_excel('LSVT_voice_rehabilitation.xlsx', sheet_name='Data')
df_response = pd.read_excel('LSVT_voice_rehabilitation.xlsx', sheet_name='Binary response')
df_subject = pd.read_excel('LSVT_voice_rehabilitation.xlsx', sheet_name='Subject demographics')

In [3]:
df = pd.concat([df_data, df_response, df_subject], axis=1)

In [4]:
df.shape

(126, 314)

In [5]:
###

# the no of features are more than the rows so we need to reduce the features
# both feature extraction and feature reduction will be applied

### feature extraction - PCA

In [100]:
X = df.drop('Binary class 1=acceptable, 2=unacceptable', axis=1)
y = df['Binary class 1=acceptable, 2=unacceptable'].copy()

In [132]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [8]:
from sklearn.decomposition import PCA

pca = PCA()
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [9]:
print('X_train_pca shape : ',X_train_pca.shape)
print('X_train shape : ',X_train.shape)
print('pca.components : ',pca.components_.shape)

X_train_pca shape :  (94, 94)
X_train shape :  (94, 313)
pca.components :  (94, 313)


In [10]:
###

# as the above shapes show pca reduced no of features from 313 to 94 (defaults is min of rank and samples)
# still seems so many features and keep reducing


In [11]:
# lets find features with 95% explained variance


cumsum = 0.99
pca_features=0
for i in range(0,X_train_pca.shape[1]):
    if pca.explained_variance_ratio_.cumsum()[i] < cumsum:
        pca_features = i
print('no of features explain', cumsum,'% of pca components is : ',pca_features)


no of features explain 0.99 % of pca components is :  0


In [44]:
# the class notebook:

# ### selection principle components, create projection matrix
# cumsum_variance = np.cumsum(pca.explained_variance_ratio_)
# d = np.argmax(cumsum_variance > .95) + 1
# d

# n_component = 1
# projection_matrix = pca.components_[:n_component].T
# projection_matrix.shape

(313, 1)

In [45]:
X_train_transformed = X_train  @ projection_matrix
X_train_transformed.shape

(94, 1)

In [12]:
###

# as shown above just the first pca feature explains 99% of the variance so pca[0] will be considered


In [13]:
first_principal_component = pca.components_[0]

X_train_1stpca = np.dot(X_train, first_principal_component).reshape(-1,1)
X_train_1stpca.shape

(94, 1)

In [14]:
first_principal_component = pca.components_[0]

X_test_1stpca = np.dot(X_test, first_principal_component).reshape(-1,1)
X_test_1stpca.shape

(32, 1)

In [15]:
###

# now given the transfored X_train and X_test we will make the models
# for a better comparison, we apply same models which made for the data without any feature reduction


#### SVC linear

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

In [23]:
svc_linear = SVC(kernel = 'linear')
model_svc_linear = make_pipeline(StandardScaler(), svc_linear)
model_svc_linear.fit(X_train_1stpca, y_train)
pred_1stpca = model_svc_linear.predict(X_test_1stpca)

In [21]:
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

In [24]:
print('accuracy for linear kernel : ', accuracy_score(y_test, pred_1stpca))
print('f_1 score for linear kernel : ', f1_score(y_test, pred_1stpca))
print(confusion_matrix(y_test, pred_1stpca))

accuracy for linear kernel :  0.65625
f_1 score for linear kernel :  0.0
[[ 0 11]
 [ 0 21]]


In [30]:
###

# the original data shows better accuracy score and f1 score

#### SVC polynomial

In [28]:
# degree:2

svc_poly_d2 = SVC(kernel='poly', degree=2)
model_svc_poly_d2 = make_pipeline(StandardScaler(), svc_poly_d2)
model_svc_poly_d2.fit(X_train_1stpca, y_train)
pred_1stpca = model_svc_poly_d2.predict(X_test_1stpca)

In [29]:
print('accuracy for linear kernel : ', accuracy_score(y_test, pred_1stpca))
print('f_1 score for linear kernel : ', f1_score(y_test, pred_1stpca))
print(confusion_matrix(y_test, pred_1stpca))

accuracy for linear kernel :  0.65625
f_1 score for linear kernel :  0.0
[[ 0 11]
 [ 0 21]]


### finding the best model using GridSearch

In [31]:
# as long as the data size has been reduced significantly, we expect faster fine tune

#### SVC -- PCA

In [40]:
from sklearn.model_selection import GridSearchCV

hyper_params = {
    'svc__kernel': ['linear', 'rbf', 'poly'],
    'svc__degree': [2,3],
    'svc__gamma': ['scale', 0.1, 1, 10],
    'svc__C': [0.1, 1, 10],
}
svc_model = make_pipeline(StandardScaler(), SVC())
svc_tuned = GridSearchCV(svc_model, param_grid=hyper_params, cv=3, scoring='accuracy')
svc_tuned.fit(X_train_1stpca, y_train)
pred_svc_tuned = svc_tuned.predict(X_test_1stpca)
print('accuracy : ',accuracy_score(y_test, pred_svc_tuned))
print('f_1 score : ', f1_score(y_test, pred_svc_tuned))
print('best params: ', svc_tuned.best_params_)

accuracy :  0.65625
f_1 score :  0.0
best params:  {'svc__C': 0.1, 'svc__degree': 2, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}


#### random forest -- PCA

In [61]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rf_model = make_pipeline(StandardScaler(), RandomForestClassifier())

hyper_params = {
    'randomforestclassifier__n_estimators': [50,100,200],
    'randomforestclassifier__criterion': ['gini', 'entropy'],
    'randomforestclassifier__max_leaf_nodes': [4,6,8]
}

rf_tuned = GridSearchCV(rf_model, param_grid=hyper_params, cv=3, scoring='accuracy')
rf_tuned.fit(X_train_1stpca, y_train)
pred_rf = rf_tuned.predict(X_test_1stpca)
print('accuracy : ',accuracy_score(y_test, pred_rf))
print('f_1 score : ', f1_score(y_test, pred_rf))
print('best params: ', rf_tuned.best_params_)

accuracy :  0.6875
f_1 score :  0.37499999999999994
best params:  {'randomforestclassifier__criterion': 'gini', 'randomforestclassifier__max_leaf_nodes': 4, 'randomforestclassifier__n_estimators': 50}


In [63]:
###

# random forest showed higher accuracy and f1 score compared to SVC


### feature extraction - LDA

#### SVC -- LDA

In [65]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [72]:
lda = LinearDiscriminantAnalysis(n_components=1)
lda_preprocess = make_pipeline(lda, StandardScaler())
lda_svc_model = make_pipeline(lda_preprocess, SVC())
hyper_params = {
    'svc__kernel': ['linear', 'rbf', 'poly'],
    'svc__degree': [2,3],
    'svc__gamma': ['scale', 0.1, 1, 10],
    'svc__C': [0.1, 1, 10],
}
lda_tuned = GridSearchCV(lda_svc_model, param_grid=hyper_params, scoring='accuracy', cv=3)
lda_tuned.fit(X_train, y_train)

In [73]:
pred_lda_sva = lda_tuned.predict(X_test)
print('accuracy : ',accuracy_score(y_test, pred_lda_sva))
print('f_1 score : ', f1_score(y_test, pred_lda_sva))
print('best params: ', lda_tuned.best_params_)

accuracy :  0.75
f_1 score :  0.6363636363636364
best params:  {'svc__C': 1, 'svc__degree': 3, 'svc__gamma': 'scale', 'svc__kernel': 'poly'}


In [74]:
###

# LDA provided higher accuracy and f1 score than PCA with SVC
# SVC with original data and PCA, had its best accuracy for linear kernel
# unlike original data and PCA, LDA had irs best accuracy in degree=3 of polynomial kernel

#### randomforest -- LDA

In [76]:
lda = LinearDiscriminantAnalysis(n_components=1)
lda_preprocess = make_pipeline(lda, StandardScaler())
lda_rf_model = make_pipeline(lda_preprocess, RandomForestClassifier())

hyper_params = {
    'randomforestclassifier__n_estimators': [50,100,200],
    'randomforestclassifier__criterion': ['gini', 'entropy'],
    'randomforestclassifier__max_leaf_nodes': [4,6,8]
}

lda_rf_tuned = GridSearchCV(rf_model, param_grid=hyper_params, cv=3, scoring='accuracy')
lda_rf_tuned.fit(X_train, y_train)
pred_lda_rf = lda_rf_tuned.predict(X_test)
print('accuracy : ',accuracy_score(y_test, pred_lda_rf))
print('f_1 score : ', f1_score(y_test, pred_lda_rf))
print('best params: ', lda_rf_tuned.best_params_)

accuracy :  0.8125
f_1 score :  0.7000000000000001
best params:  {'randomforestclassifier__criterion': 'gini', 'randomforestclassifier__max_leaf_nodes': 6, 'randomforestclassifier__n_estimators': 200}


In [106]:
####

# higher accurcay score and f1 score with LDA randomforest than all PCA models!


### feature selection - SelectKBest

In [107]:
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif

In [122]:
print(len(X_train.select_dtypes(include=['number'])))
print(len(X_train))

94
94


In [117]:
for col in X_train.select_dtypes(include=['number']):
    if X_train.select_dtypes(include=['number'])[col].nunique() < 12:
        print(col)

Data_length
Age
Gender, 0->Male, 1->Female


In [123]:
### 

# all columns are numeric and according to their unique values could be considered continues
# it could be acceptable to consider age as categorical but since its ordinal properties we keep it as continuous
# gender is obviouslly categorical and it is the only categorical featrure


In [134]:
X_train.columns

Index(['Jitter->F0_abs_dif', 'Jitter->F0_dif_percent',
       'Jitter->F0_PQ5_classical_Schoentgen', 'Jitter->F0_PQ5_classical_Baken',
       'Jitter->F0_PQ5_generalised_Schoentgen', 'Jitter->F0_abs0th_perturb',
       'Jitter->F0_CV', 'Jitter->F0_TKEO_mean', 'Jitter->F0_TKEO_std',
       'Jitter->F0_TKEO_prc5',
       ...
       'det_TKEO_std4_4_coef', 'det_TKEO_std4_5_coef', 'det_TKEO_std4_6_coef',
       'det_TKEO_std4_7_coef', 'det_TKEO_std4_8_coef', 'det_TKEO_std4_9_coef',
       'det_TKEO_std4_10_coef', 'Subject_index', 'Age',
       'Gender, 0->Male, 1->Female'],
      dtype='object', length=313)

In [151]:
from sklearn.compose import ColumnTransformer

# as long as categorical and numerical features have different methods in seleckbest, I seperated the
# categorical and numerical features

# there are two pipelines one for categorical and one for numerical with different selectkbest methods
# finally, pipelines will be applied to entire data by columntransfer

# k=2 for numerical features
# k=1 for categorical feature (there is just one categorical feature)

numeric_columns = pd.DataFrame(X_train).drop('Gender, 0->Male, 1->Female', axis=1).columns
cat_columns = pd.DataFrame(X_train['Gender, 0->Male, 1->Female']).columns

skb_num = make_pipeline(StandardScaler(), SelectKBest(f_classif, k=1))
skb_cat = make_pipeline(SelectKBest(chi2, k=1))

preprocessing = ColumnTransformer([
    ('num', skb_num, numeric_columns),
    ('cat', skb_num, cat_columns)
])

X_selected = preprocessing.fit_transform(X_train, y_train)
preprocessing.get_feature_names_out()

array(['num__MFCC_2nd coef', 'cat__Gender, 0->Male, 1->Female'],
      dtype=object)

In [152]:
X_selected.shape

(94, 2)

#### SVC

In [155]:
skb_svc = make_pipeline(preprocessing, SVC())
skb_svc.fit(X_train, y_train)
predict_skb_svc = skb_svc.predict(X_test)
print('accuracy : ',accuracy_score(y_test, predict_skb_svc))
print('f_1 score : ', f1_score(y_test, predict_skb_svc))
print('best params: ', preprocessing.get_feature_names_out())

accuracy :  0.8125
f_1 score :  0.7272727272727273
best params:  ['num__MFCC_2nd coef' 'cat__Gender, 0->Male, 1->Female']


In [None]:
# this time even greater accuracy and f1 score than all PCA and LDA models!