In [27]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, log_loss

In [2]:
# Load the voice data set from a CSV file in the current working directory.
# NOTE(review): the path is relative, so the notebook must be run from the
# folder that contains 'voice-classification.csv'.
df = pd.read_csv('voice-classification.csv', low_memory=False)

In [3]:
# Summarise every column: dtype and non-null count (quick check for missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   meanfreq  3168 non-null   float64
 1   sd        3168 non-null   float64
 2   median    3168 non-null   float64
 3   Q25       3168 non-null   float64
 4   Q75       3168 non-null   float64
 5   IQR       3168 non-null   float64
 6   skew      3168 non-null   float64
 7   kurt      3168 non-null   float64
 8   sp.ent    3168 non-null   float64
 9   sfm       3168 non-null   float64
 10  mode      3168 non-null   float64
 11  centroid  3168 non-null   float64
 12  meanfun   3168 non-null   float64
 13  minfun    3168 non-null   float64
 14  maxfun    3168 non-null   float64
 15  meandom   3168 non-null   float64
 16  mindom    3168 non-null   float64
 17  maxdom    3168 non-null   float64
 18  dfrange   3168 non-null   float64
 19  modindx   3168 non-null   float64
 20  label     3168 non-null   object 

In [4]:
# List the column names; 'label' is the column later split off as the target.
df.columns

Index(['meanfreq', 'sd', 'median', 'Q25', 'Q75', 'IQR', 'skew', 'kurt',
       'sp.ent', 'sfm', 'mode', 'centroid', 'meanfun', 'minfun', 'maxfun',
       'meandom', 'mindom', 'maxdom', 'dfrange', 'modindx', 'label'],
      dtype='object')

In [5]:
# Preview the first five rows, flipped so each feature is a row (easier to
# read with this many columns).
df.head().T

Unnamed: 0,0,1,2,3,4
meanfreq,0.059781,0.0660087,0.0773155,0.151228,0.13512
sd,0.0642413,0.06731,0.0838294,0.0721106,0.0791461
median,0.0320269,0.0402287,0.0367185,0.158011,0.124656
Q25,0.0150715,0.0194139,0.00870106,0.0965817,0.0787202
Q75,0.0901934,0.0926662,0.131908,0.207955,0.206045
IQR,0.075122,0.0732523,0.123207,0.111374,0.127325
skew,12.8635,22.4233,30.7572,1.23283,1.10117
kurt,274.403,634.614,1024.93,4.1773,4.33371
sp.ent,0.893369,0.892193,0.846389,0.963322,0.971955
sfm,0.491918,0.513724,0.478905,0.727232,0.783568


In [8]:
# Split the 'label' column off as the prediction target and keep only the
# feature columns in `df`.
#
# The split is guarded and rebinds `df` instead of dropping in place, so the
# cell is safe to re-run: on a second run 'label' is already gone, and the
# previously extracted `target` is simply kept (the original version raised
# AttributeError on `df.label` when executed twice).
if 'label' in df.columns:
    target = df['label'].copy()
    df = df.drop(columns='label')

In [10]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split identical across runs.
trainX, testX, trainY, testY = train_test_split(
    df, target, test_size=0.3, random_state=0
)

In [11]:
# Shared settings.
# NOTE(review): `n_jobs` is not referenced by any cell visible here — confirm
# whether it is still needed.
n_jobs = -1

# NOTE(review): the name says 10, but this is a 5-fold splitter.
kfold_10 = KFold(n_splits=5)

# One default-configured instance of each candidate classifier.
model_Decision = DecisionTreeClassifier()
model_RandomForest = RandomForestClassifier()
model_SVC = SVC()
model_bayes = GaussianNB()
model_Gradient = GradientBoostingClassifier()

# The candidates collected in a fixed order.
models = [
    model_Decision,
    model_RandomForest,
    model_SVC,
    model_bayes,
    model_Gradient,
]

In [29]:
class Classifiers:
    """Fit a panel of classifiers and compare their accuracy on a test set.

    Typical use: ``fit(trainX, trainY)`` -> ``predict(testX)`` ->
    ``calculate_scores(testY)``.

    All result containers are per-instance attributes created in ``__init__``.
    (They used to be mutable class-level lists that were appended to, so
    predictions and scores leaked between instances and piled up every time
    the notebook cell was re-run.)

    Attributes:
        models: the estimators, in the order they are fitted and scored.
        preds: one prediction array per model, filled by ``predict``.
        accuracy_scores: one accuracy per model, filled by ``calculate_scores``.
        probs_pred: kept for interface compatibility; never populated, because
            probability / log-loss tracking is disabled in this class.
    """

    def __init__(self, models=None):
        """Create the panel.

        Args:
            models: optional iterable of estimators exposing ``fit`` and
                ``predict``. When omitted, a default-configured decision tree,
                random forest, SVC, Gaussian naive Bayes and gradient boosting
                classifier are used (the original fixed panel).
        """
        if models is None:
            models = [
                DecisionTreeClassifier(),
                RandomForestClassifier(),
                SVC(),
                GaussianNB(),
                GradientBoostingClassifier(),
            ]
        self.models = list(models)
        self.probs_pred = []
        self.preds = []
        self.accuracy_scores = []

    def get_accuracy_scores(self):
        """Return the accuracies from the latest ``calculate_scores`` call."""
        return self.accuracy_scores

    def calculate_scores(self, testY):
        """Compute, store and print the accuracy of every model.

        Args:
            testY: true labels for the samples last passed to ``predict``.

        Raises:
            ValueError: if ``predict`` has not produced one prediction set
                per model yet.
        """
        if len(self.preds) != len(self.models):
            raise ValueError(
                'predict() must be called before calculate_scores()'
            )

        # Start from a clean list so repeated calls do not accumulate scores.
        self.accuracy_scores = []
        for model, y_pred in zip(self.models, self.preds):
            accScore = accuracy_score(testY, y_pred)
            self.accuracy_scores.append(accScore)
            print(f'{model} : {round(accScore*100,2)} % ')

    def predict(self, testX):
        """Store each model's predictions for ``testX`` in ``self.preds``.

        Any predictions from an earlier call are replaced, so ``preds`` always
        lines up one-to-one with ``models``.
        """
        self.preds = [model.predict(testX) for model in self.models]

    def fit(self, trainX, trainY):
        """Fit every model in the panel on the same training data."""
        for model in self.models:
            model.fit(trainX, trainY)

In [30]:
# Baseline comparison: train every classifier in the panel on the training
# split, predict on the held-out split, then print each model's accuracy.
# The three calls must run in this order (scores are computed from the
# predictions stored by predict()).
models_class = Classifiers()
models_class.fit(trainX, trainY)
models_class.predict(testX)
models_class.calculate_scores(testY)

DecisionTreeClassifier() : 95.58 % 
RandomForestClassifier() : 98.63 % 
SVC() : 68.77 % 
GaussianNB() : 89.27 % 
GradientBoostingClassifier() : 98.11 % 


>## RandomForestClassifier

In [32]:
# Tune the random forest: exhaustive search over tree count and maximum
# depth, scored with 5-fold cross-validation on the training split.
RandomForest = RandomForestClassifier(n_jobs=-1)

# NOTE(review): the name says 10, but this is a 5-fold splitter.
kfold_10 = KFold(n_splits=5)

prm_grd = {
    'n_estimators': [1, 5, 10, 25, 50, 100, 200, 300, 400, 500],
    'max_depth': list(range(1, 17)),
}

gsCV = GridSearchCV(estimator=RandomForest, param_grid=prm_grd, cv=kfold_10)
gsCV.fit(trainX, trainY)



GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(n_jobs=-1),
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16],
                         'n_estimators': [1, 5, 10, 25, 50, 100, 200, 300, 400,
                                          500]})

In [34]:
# Mean cross-validated score of the best parameter combination found by the search.
gsCV.best_score_

0.9797032924572427

In [36]:
# Refit a random forest with fixed hyperparameters on the full training split
# and report its accuracy on the held-out test split.
# NOTE(review): max_depth=9 / n_estimators=300 are presumably taken from
# gsCV.best_params_, which is never displayed in the notebook — confirm.
RandomForest = RandomForestClassifier(
    n_estimators=300, max_depth=9, n_jobs=-1
).fit(trainX, trainY)
accuracy_score(testY, RandomForest.predict(testX))

0.9852786540483701