In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'voice-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2557638%2F4344911%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240622%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240622T072505Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D8be8a11aedf3d53b690cc42935ee819fb435cac095043ade2d4d2d51e993fcd06e4765021a245ba6dbfea813a51938826c4bf4f254b6d30393f7496187b9ae47f8399b3c776edd0d420123d31b5c38bc132cb2613560519da072e0de3dea1ff5f6bbd73f375c0f555b939016209223817aa8a7bc4a754bccda8601ce0b9407476c66735db485093a1e62b6c1c61b6b9d82cf4a3f991d2c7606180993cc4d3d1cdd8152e3833f32e8777fd1429d2e03056af263dd28c2ef1385344257a5ed0ea1ff403f947a303964e35f3aad1066ed4354f105e9d3b29828e47cc6315bb0b91a542484f785dcbe8e954d3acbfa62a354efb0dfa196f05644de6e8276de6f46d3'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Start from a clean input directory, then (re)create the input and working directories.
shutil.rmtree('/kaggle/input', ignore_errors=True)
for kaggle_dir in (KAGGLE_INPUT_PATH, KAGGLE_WORKING_PATH):
    os.makedirs(kaggle_dir, 0o777, exist_ok=True)

# Expose both directories as ../input and ../working; an already existing link is left alone.
for link_target, link_name in ((KAGGLE_INPUT_PATH, 'input'), (KAGGLE_WORKING_PATH, 'working')):
    try:
        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING and unpack it
# into KAGGLE_INPUT_PATH/<directory>. Failures are reported and the loop moves on.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only, so any later colon stays part of the encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header may be missing; the progress bar then stays empty
            # instead of failing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            size_text = content_length if content_length else 'unknown'
            print(f'Downloading {directory}, {size_text} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Rewind so the archive readers see the complete download; seeking also
            # pushes any buffered writes out to the temporary file.
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Read through the open file object and use a name that does not
                # shadow the `tarfile` module for later iterations.
                with tarfile.open(fileobj=tfile) as archive:
                    archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Loading the dataset 'Voice'

In [None]:
df = pd.read_csv('../input/voice-dataset/voice.csv')
df.head()

In [None]:
df.shape

Our dataset has 3168 rows and 21 columns

In [None]:
df.isna().sum()

We don't have any null values to take care of.

# Pie Chart

In [None]:
df['label'].value_counts()

In [None]:
# Take slice sizes and names straight from the data so each count is always paired
# with the right label, instead of hard-coding numbers copied from the previous cell.
label_counts = df['label'].value_counts()
val = label_counts.tolist()
label = label_counts.index.tolist()
plt.figure(figsize=(6,8))
plt.pie(val,labels=label,autopct='%1.1f%%')
plt.title('Class distribution of label')
plt.legend()
plt.show()

# Exploratory Data Analysis

In [None]:
# Correlate the numeric features only: the string-valued 'label' column cannot be
# correlated, and pandas >= 2.0 raises on it instead of silently skipping it.
corr = df.select_dtypes(include='number').corr()
sns.set(font_scale=1.5)
plt.figure(figsize=(20,20))
sns.heatmap(corr,annot = True,cmap='coolwarm')
plt.show()

We will do EDA on groups of similar columns whose pairwise correlation is higher than 0.85, and remove the columns that do not reveal useful patterns.

In [None]:
# Correlate the numeric features only: the string-valued 'label' column cannot be
# correlated, and pandas >= 2.0 raises on it instead of silently skipping it.
corr = df.select_dtypes(include='number').corr()
# Keep only the strongly positively correlated pairs; every other cell becomes NaN
# and is left blank in the heatmap.
corr = corr[corr>0.85]
plt.figure(figsize=(15,15))
sns.heatmap(corr,annot = True,cmap='coolwarm')
plt.show()

In [None]:
# meanfreq and centroid are dropped because they were correlated with the most other columns.
df = df.drop(columns=['meanfreq', 'centroid'])

In [None]:
# One box plot per feature, split by label, each shown as its own figure.
for feature in ('maxdom', 'dfrange'):
    sns.boxplot(x=df[feature], y=df['label'])
    plt.show()

In [None]:
# Remove the two features compared in the box plots above.
df = df.drop(columns=['dfrange', 'maxdom'])

In [None]:
# One box plot per feature, split by label, each shown as its own figure.
for feature in ('skew', 'kurt'):
    sns.boxplot(x=df[feature], y=df['label'])
    plt.show()

In [None]:
# Remove both kurt and skew from the feature set.
df = df.drop(columns=['kurt', 'skew'])

We removed both kurt and skew because they had too many outliers, which is bad for training.

In [None]:
# One box plot per feature, split by label, each shown as its own figure.
for feature in ('sd', 'IQR'):
    sns.boxplot(x=df[feature], y=df['label'])
    plt.show()

In [None]:
# Of the sd / IQR pair, only sd is kept.
df = df.drop(columns=['IQR'])

In [None]:
# One box plot per feature, split by label, each shown as its own figure.
for feature in ('sfm', 'sp.ent'):
    sns.boxplot(x=df[feature], y=df['label'])
    plt.show()

In [None]:
# sp.ent is removed because it has more outliers than sfm.
df = df.drop(columns=['sp.ent'])

In [None]:
# `sns.pairplot` has no 'boxplot' kind (current seaborn rejects it), so draw one
# box plot per remaining numeric feature, split by label, to inspect the outliers.
for feature in df.select_dtypes(include='number').columns:
    sns.boxplot(x=df[feature], y=df['label'])
    plt.title(f'{feature} by label')
    plt.show()

In [None]:
# These three columns have too many outliers.
df = df.drop(columns=['maxfun', 'modindx', 'minfun'])

In [None]:
# Re-check the remaining features. Correlate the numeric columns only: the string-valued
# 'label' column cannot be correlated, and pandas >= 2.0 raises on it.
corr = df.select_dtypes(include='number').corr()
# Keep only the strongly positively correlated pairs; every other cell becomes NaN
# and is left blank in the heatmap.
corr = corr[corr>0.85]
plt.figure(figsize=(5,5))
sns.heatmap(corr,annot = True,cmap='coolwarm')
plt.show()

# Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Select the features by name rather than by position, so the split stays correct
# even if 'label' is not the last column of the frame.
X = df.drop(columns=['label'])
y = df['label']

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Metric Functions

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
def eval(y_pred,ytest):
    """Plot the confusion matrix and print the classification report for predictions.

    The name shadows the built-in ``eval``; it is kept because later cells call it.

    Parameters
    ----------
    y_pred : array-like
        Labels predicted by a model.
    ytest : array-like
        True labels for the same samples.
    """
    print("Confusion matrix:\n")
    # scikit-learn expects (y_true, y_pred); the reverse order transposes the matrix.
    # Rows are true labels and columns are predictions, with classes in sorted order.
    cm = confusion_matrix(ytest, y_pred)
    # fmt="d" prints the counts as plain integers rather than scientific notation.
    ax = sns.heatmap(cm, annot=True, fmt="d", xticklabels=["Female","Male"], yticklabels=["Female","Male"])
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    plt.show()
    # Same argument order here, so precision and recall are not swapped.
    print("Classification Report\n",classification_report(ytest, y_pred))

In [None]:
def score(model):
    """Print ``model.score`` for the training split and then for the test split.

    Reads the notebook-level X_train, y_train, x_test and y_test.
    """
    splits = (
        ("Training score: ", X_train, y_train),
        ("Test score: ", x_test, y_test),
    )
    for caption, features, targets in splits:
        print(caption, model.score(features, targets))

# 1) DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Depth-limited decision tree; the fixed random_state keeps the fit repeatable.
DTmodel = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=5,
    random_state=0,
)

In [None]:
DTmodel.fit(X_train,y_train)

In [None]:
ypred1 = DTmodel.predict(x_test)
ypred1[:5]

In [None]:
score(DTmodel)

In [None]:
eval(ypred1,y_test)

**Conclusion**: Slightly overfit model.

# 2) LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# class_weight must be a dict, 'balanced' or None; a bare float such as 0.001 is
# rejected by current scikit-learn. The default (None) is used instead.
LRmodel = LogisticRegression(n_jobs=3,max_iter=1000,random_state=0)

In [None]:
LRmodel.fit(X_train,y_train)

In [None]:
ypred2 = LRmodel.predict(x_test)

In [None]:
score(LRmodel)

In [None]:
eval(ypred2,y_test)

**Conclusion**: Slightly inaccurate and underfit model, with lower overall training and test scores.

# 3) Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
# RBF-kernel support vector classifier with regularisation strength C=2.0.
SVMmodel = SVC(
    C=2.0,
    kernel='rbf',
    degree=3,
    random_state=0,
)

In [None]:
SVMmodel.fit(X_train,y_train)

In [None]:
ypred3 = SVMmodel.predict(x_test)

In [None]:
score(SVMmodel)

In [None]:
eval(ypred3,y_test)

**Conclusion**: Properly fit, as the training and testing scores are approximately the same.

# 4) K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# 4-nearest-neighbours classifier; the Minkowski metric with p=1 is the Manhattan distance.
Kmodel = KNeighborsClassifier(
    n_neighbors=4,
    algorithm='ball_tree',
    metric='minkowski',
    p=1,
    n_jobs=5,
)

In [None]:
Kmodel.fit(X_train,y_train)

In [None]:
ypred4 = Kmodel.predict(x_test)

In [None]:
score(Kmodel)

In [None]:
eval(ypred4,y_test)

**Conclusion** : Good training and testing accuracy

# 5) Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# A fixed random_state makes the forest (and the scores reported below) reproducible.
# warm_start is left at its default so re-running the fit cell retrains from scratch
# instead of reusing the previously grown trees.
RFmodel = RandomForestClassifier(
    n_estimators=1000,
    max_depth=11,
    min_samples_split=4,
    criterion='gini',
    oob_score=True,
    n_jobs=5,
    random_state=0,
)

In [None]:
RFmodel.fit(X_train,y_train)

In [None]:
ypred5 = RFmodel.predict(x_test)

In [None]:
score(RFmodel)

In [None]:
eval(ypred5,y_test)

In [None]:
# Compute [train score, test score] from the fitted models instead of hand-copying
# rounded numbers, so the comparison always matches the models trained above.
fitted_models = {'Dtree': DTmodel, 'LogReg': LRmodel, 'SVM': SVMmodel, 'Knn': Kmodel, 'RF': RFmodel}
data = {
    model_name: [fitted.score(X_train, y_train), fitted.score(x_test, y_test)]
    for model_name, fitted in fitted_models.items()
}

In [None]:
# Keep the comparison table in its own variable so the voice dataset held in `df`
# is not overwritten. pandas, seaborn and matplotlib are already imported near the
# top of the notebook, so they are not imported again here.
scores_df = pd.DataFrame(data, index=['Accuracy on Train data', 'Accuracy on Test data'])

# One line per model, running from its train accuracy to its test accuracy.
ax = sns.lineplot(data=scores_df)
ax.set_title('Train vs. test accuracy by model')
ax.set_ylabel('Accuracy')
plt.show()

# Conclusion
The **Random Forest classifier** proved to be the best classifier, with only 13-15 misclassifications in the testing set.

> *If you have any query regarding any section of the code please do comment on the notebook! Thank you for viewing.*

# Thank You