In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, f_classif

In [7]:
import warnings
warnings.filterwarnings('ignore') # Suppress all warnings

In [8]:
# Load the dataset; every column except 'label' is used as a feature.
data = pd.read_csv('data/voice.csv')

# Encode the target as integers 0..k-1 before splitting. Recent XGBoost releases
# reject class labels that are not already integer-coded, so passing the raw
# column can fail at fit time. NOTE(review): presumably 'label' holds class names
# as strings -- confirm; if it is already 0..k-1 this encoding leaves it unchanged.
label_encoder = LabelEncoder()
y_encoded = pd.Series(
    label_encoder.fit_transform(data['label']), index=data.index, name='label')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('label', axis=1), y_encoded, test_size=0.2, random_state=42)


### Model Selection

In [9]:
# Shared seed for the estimators that use randomness during fitting. Without it
# their cross-validation scores change from run to run, and near-tied candidates
# can swap places as "best model" between executions.
RANDOM_STATE = 42

# Candidate classifiers as (display name, estimator) pairs. The display names are
# also used as lookup keys elsewhere in the notebook, so keep them unchanged.
# The remaining estimators are left at their defaults and are not seeded here.
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Support Vector Machine', SVC()),
    ('Decision Tree', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('Random Forest', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ('XGBoost', XGBClassifier(random_state=RANDOM_STATE)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Naive Bayes', GaussianNB()),
    ('Neural Network', MLPClassifier(max_iter=1000, random_state=RANDOM_STATE))
]

# Scoring metric handed to every cross-validation call.
metric = 'accuracy'

In [10]:
# Score every candidate with 5-fold cross-validation on the training split
results, names = [], []

for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=5, scoring=metric)
    names.append(name)
    results.append(cv_results)
    print(f"{name}: {cv_results.mean():.4f} ({cv_results.std():.4f})")

# The winner is the candidate with the highest mean fold score
mean_scores = [fold_scores.mean() for fold_scores in results]
best_model_index = np.argmax(mean_scores)
best_model_name, best_model = models[best_model_index]

print(f"\nBest Model: {best_model_name}, with an accuracy of {mean_scores[best_model_index]:.4f}")

Logistic Regression: 0.8737 (0.0261)
Support Vector Machine: 0.7194 (0.0145)
Decision Tree: 0.9645 (0.0085)
Random Forest: 0.9791 (0.0057)
Gradient Boosting: 0.9759 (0.0042)
XGBoost: 0.9799 (0.0055)
K-Nearest Neighbors: 0.7861 (0.0143)
Naive Bayes: 0.9313 (0.0063)
Neural Network: 0.9716 (0.0046)

Best Model: XGBoost, with an accuracy of 0.9799


### Removing irrelevant features

In [11]:
# Fit a default XGBoost classifier on the training split and read its feature importances
model = XGBClassifier()
model.fit(X_train, y_train)
feature_importances = model.feature_importances_

# Pair each score with its column name, then order from most to least important
sorted_feature_importances = list(zip(feature_importances, X_train.columns))
sorted_feature_importances.sort(reverse=True)

print("\nFeature Importances:")
for score, column in sorted_feature_importances:
    print(f"{column}: {score:.4f}")


Feature Importances:
meanfun: 0.7186
IQR: 0.0750
sfm: 0.0343
minfun: 0.0336
maxdom: 0.0240
sd: 0.0231
sp.ent: 0.0159
meanfreq: 0.0142
skew: 0.0098
modindx: 0.0095
Q75: 0.0090
meandom: 0.0085
maxfun: 0.0084
mode: 0.0083
mindom: 0.0076


### Remove the least important features (<0.01)

In [12]:
# Keep only the columns whose importance reaches the 0.01 cut-off
selected_features = []
for importance, feature in sorted_feature_importances:
    if importance >= 0.01:
        selected_features.append(feature)

# Restrict both splits to the surviving columns
X_train, X_test = X_train[selected_features], X_test[selected_features]

### Run the models again with the selected features

In [13]:
# Repeat the 5-fold cross-validation for every candidate on the current training features
results = []
names = []

for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=5, scoring=metric)
    avg_score, score_spread = cv_results.mean(), cv_results.std()
    results += [cv_results]
    names += [name]
    print(f"{name}: {avg_score:.4f} ({score_spread:.4f})")

# Pick the candidate whose mean fold score is highest
best_model_index = np.argmax([fold_scores.mean() for fold_scores in results])
best_model_name, best_model = models[best_model_index]
best_mean_score = results[best_model_index].mean()

print(f"\nBest Model: {best_model_name}, with an accuracy of {best_mean_score:.4f}")

Logistic Regression: 0.8927 (0.0131)
Support Vector Machine: 0.6693 (0.0150)
Decision Tree: 0.9692 (0.0084)
Random Forest: 0.9799 (0.0047)
Gradient Boosting: 0.9767 (0.0049)
XGBoost: 0.9795 (0.0069)
K-Nearest Neighbors: 0.8816 (0.0138)
Naive Bayes: 0.9373 (0.0051)
Neural Network: 0.9736 (0.0041)

Best Model: Random Forest, with an accuracy of 0.9799


### Hyperparameter Tuning

In [14]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space per model, keyed by the display names used in `models`.
# An empty dict means "no tuning": the grid search then evaluates the estimator as-is.
param_grids = {
    'Logistic Regression': {'C': [0.01, 0.1, 1, 10, 100]},
    'Support Vector Machine': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'Decision Tree': {'max_depth': [10, 20, 30], 'min_samples_split': [2, 5, 10]},
    # 'auto' is intentionally absent: for classifiers it was an alias of 'sqrt' and
    # it was removed in scikit-learn 1.3, where it makes those candidates fail to fit.
    'Random Forest': {'n_estimators': [10, 50, 100], 'max_features': ['sqrt', 'log2']},
    'Gradient Boosting': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]},
    'XGBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]},
    'K-Nearest Neighbors': {'n_neighbors': [3, 5, 7, 9]},
    'Naive Bayes': {},
    'Neural Network': {'hidden_layer_sizes': [(50,), (100,), (50, 50)], 'alpha': [0.0001, 0.001, 0.01]}
}

# Tune each model with a grid search, then cross-validate the tuned estimator
results = []
names = []
tuned_models = []  # tuned estimators, index-aligned with `results` and `names`
best_params = []   # winning hyperparameters, same alignment

for name, model in models:
    param_grid = param_grids[name]
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=metric)
    grid_search.fit(X_train, y_train)
    tuned_model = grid_search.best_estimator_
    cv_results = cross_val_score(tuned_model, X_train, y_train, cv=5, scoring=metric)
    results.append(cv_results)
    names.append(name)
    tuned_models.append(tuned_model)
    best_params.append(grid_search.best_params_)
    print(f"{name}: {cv_results.mean():.4f} ({cv_results.std():.4f})")

# Select the best model based on cross-validation results. Take the estimator from
# `tuned_models`, not from `models`: the latter still holds the untuned defaults.
best_model_index = np.argmax([result.mean() for result in results])
best_model_name = names[best_model_index]
best_model = tuned_models[best_model_index]

print(f"\nBest Model: {best_model_name}, with an accuracy of {results[best_model_index].mean():.4f}")
print(f"Best Parameters: {best_params[best_model_index]}")

Logistic Regression: 0.9665 (0.0096)
Support Vector Machine: 0.9672 (0.0106)
Decision Tree: 0.9669 (0.0063)
Random Forest: 0.9787 (0.0064)
Gradient Boosting: 0.9791 (0.0042)
XGBoost: 0.9807 (0.0072)
K-Nearest Neighbors: 0.8931 (0.0129)
Naive Bayes: 0.9373 (0.0051)
Neural Network: 0.9696 (0.0090)

Best Model: XGBoost, with an accuracy of 0.9807
