In [1]:
import pandas as pd
import numpy as np

In [2]:
data=pd.read_csv("voice-classification.csv")

In [3]:
data.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0,male
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632,male
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.00799,0.007812,0.015625,0.007812,0.046512,male
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.25,0.201497,0.007812,0.5625,0.554688,0.247119,male
4,0.13512,0.079146,0.124656,0.07872,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.13512,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,male


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   meanfreq  3168 non-null   float64
 1   sd        3168 non-null   float64
 2   median    3168 non-null   float64
 3   Q25       3168 non-null   float64
 4   Q75       3168 non-null   float64
 5   IQR       3168 non-null   float64
 6   skew      3168 non-null   float64
 7   kurt      3168 non-null   float64
 8   sp.ent    3168 non-null   float64
 9   sfm       3168 non-null   float64
 10  mode      3168 non-null   float64
 11  centroid  3168 non-null   float64
 12  meanfun   3168 non-null   float64
 13  minfun    3168 non-null   float64
 14  maxfun    3168 non-null   float64
 15  meandom   3168 non-null   float64
 16  mindom    3168 non-null   float64
 17  maxdom    3168 non-null   float64
 18  dfrange   3168 non-null   float64
 19  modindx   3168 non-null   float64
 20  label     3168 non-null   obje

In [5]:
data.isna().sum()

meanfreq    0
sd          0
median      0
Q25         0
Q75         0
IQR         0
skew        0
kurt        0
sp.ent      0
sfm         0
mode        0
centroid    0
meanfun     0
minfun      0
maxfun      0
meandom     0
mindom      0
maxdom      0
dfrange     0
modindx     0
label       0
dtype: int64

In [8]:
features=data.drop(['label'],axis=1)

In [11]:
target=data[['label']]

In [12]:
features.shape,target.shape

((3168, 20), (3168, 1))

In [13]:
features.dtypes

meanfreq    float64
sd          float64
median      float64
Q25         float64
Q75         float64
IQR         float64
skew        float64
kurt        float64
sp.ent      float64
sfm         float64
mode        float64
centroid    float64
meanfun     float64
minfun      float64
maxfun      float64
meandom     float64
mindom      float64
maxdom      float64
dfrange     float64
modindx     float64
dtype: object

In [None]:
# here no categorical data in features,so no need to encode into numerical(label encoder, ordinal encoder)

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [15]:
X_train , X_test, y_train, y_test = train_test_split(features, target, random_state = 10)

In [16]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2376, 20)
(792, 20)
(2376, 1)
(792, 1)


# decision tree classifier

In [17]:
my_DT_model = DecisionTreeClassifier()

# Using GridSearchCV to find best params

In [20]:
from sklearn.model_selection import GridSearchCV


In [19]:
params={'criterion':['gini', 'entropy'], 'max_depth':[1,2,3,10], 'splitter' :['best', 'random']}

In [21]:
grid_search = GridSearchCV(my_DT_model, params, cv = 3, n_jobs = -1)

In [22]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 10],
                         'splitter': ['best', 'random']})

In [23]:
grid_search.best_params_

{'criterion': 'gini', 'max_depth': 3, 'splitter': 'best'}

# now we apply the best params to DT model

In [24]:
my_DT_model = DecisionTreeClassifier(criterion='gini', random_state=2, max_depth = 3, splitter = 'best')

In [25]:
my_DT_model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=2)

In [26]:
my_preds = my_DT_model.predict(X_test)

In [27]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [28]:
accuracy_score(y_test, my_preds)

0.9671717171717171

In [29]:
print(confusion_matrix(y_test, my_preds, ))

[[369  15]
 [ 11 397]]


In [30]:
print(classification_report(y_test, my_preds))

              precision    recall  f1-score   support

      female       0.97      0.96      0.97       384
        male       0.96      0.97      0.97       408

    accuracy                           0.97       792
   macro avg       0.97      0.97      0.97       792
weighted avg       0.97      0.97      0.97       792



# random forest

In [31]:
my_rf_classifier = RandomForestClassifier()

In [32]:
my_rf_classifier.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier()

In [33]:
my_predictions = my_rf_classifier.predict(X_test)
print(accuracy_score(y_test, my_predictions))

0.9797979797979798


# voting classifier

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [35]:
rf_clf = RandomForestClassifier()
log_clf = LogisticRegression(max_iter=10000)
svm_clf = SVC()

In [36]:
voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rf_clf), ('svc', svm_clf)])

In [37]:
voting_clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


VotingClassifier(estimators=[('lr', LogisticRegression(max_iter=10000)),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [38]:
from sklearn.metrics import accuracy_score

In [39]:
for clf in (log_clf, rf_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

  y = column_or_1d(y, warn=True)


LogisticRegression 0.9305555555555556


  


RandomForestClassifier 0.9835858585858586


  y = column_or_1d(y, warn=True)


SVC 0.6792929292929293


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


VotingClassifier 0.9381313131313131


# now we see accuracy score of voting_clf directly

In [40]:
voting_clf_preds=voting_clf.predict(X_test)

In [41]:
accuracy_score(voting_clf_preds,y_test)

0.9381313131313131