# Noise Detector : Audio based using RandomForestClassifier

### 1. Imports

In [1]:
import sys
sys.path.append("..")
from scripts import file

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

### 2. Load and Train Data

In [3]:
df = pd.read_csv("../data/features.csv")

In [4]:
df

Unnamed: 0,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,label
0,-414.114319,82.087929,12.61022,32.129623,2.22103,11.476946,-1.837692,7.881042,-3.493526,5.470525,1.474488,3.019343,0.665442,1.0
1,-463.897888,78.660675,9.834409,35.102051,22.241463,24.459047,11.769912,15.829986,11.701596,13.975851,10.279265,10.457386,0.165733,1.0
2,-315.297119,93.139267,13.642873,33.382542,4.83163,12.353395,-8.381453,8.370358,-5.774446,15.57698,6.351563,10.063846,4.227615,1.0
3,-117.961357,170.17775,-61.46682,23.826248,-14.787617,16.183578,-4.95971,26.672409,-1.894688,0.065805,-7.571851,4.338713,-2.277861,1.0
4,-142.604919,141.129791,-98.17852,63.160038,-24.862801,29.915831,-37.757191,17.45882,-13.29934,16.262056,-14.990223,-1.429464,-5.765497,1.0
5,-177.625183,172.315079,-25.954384,35.637497,-0.178278,12.985711,-13.363649,9.972052,-3.060303,-7.207282,6.185294,5.774026,-3.053701,1.0
6,-283.136658,-13.925992,-64.456879,3.016428,-30.128635,-17.023546,-1.737343,-9.878433,-4.386919,5.712756,0.435114,-6.029335,-3.740821,1.0
7,-313.369873,67.950638,-38.628506,28.658379,-20.030783,-16.104122,-14.199511,-1.925687,1.748709,-15.824556,10.573709,-0.669855,-4.330415,1.0
8,-235.880112,17.692507,9.530772,27.011477,-5.056016,8.729738,-18.020399,-5.723011,-4.658276,-7.452484,-4.110888,6.895874,-10.276404,1.0
9,-543.653503,77.117821,-8.429994,44.824429,39.604782,-37.96529,-29.561563,23.505594,-6.381229,-25.797789,10.2344,9.897546,-9.006483,1.0


In [5]:
X = df.drop('label', axis=1)
y = df['label']
labels = ('silence', 'noise')

In [6]:
scaler = StandardScaler()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [8]:
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

### 3. Model Development

In [9]:
model = RandomForestClassifier()

In [10]:
model.fit(scaled_X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### 4. Prediction and Evaluation

In [11]:
y_pred = model.predict(scaled_X_test)

In [12]:
print("Accuracy = ", accuracy_score(y_test, y_pred))

Accuracy =  0.6


In [13]:
print("Classification Report : \n\n", classification_report(y_test, y_pred))

Classification Report : 

               precision    recall  f1-score   support

         0.0       0.50      1.00      0.67         2
         1.0       1.00      0.33      0.50         3

    accuracy                           0.60         5
   macro avg       0.75      0.67      0.58         5
weighted avg       0.80      0.60      0.57         5



In [14]:
print(df['label'].value_counts())

label
1.0    11
0.0    11
Name: count, dtype: int64


### 5. Trial with test input

In [19]:
import librosa 
import numpy as np
def extract_feature(file_path) :
    y, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfccs_mean = np.mean(mfccs.T, axis=0)
    return mfccs_mean.reshape(1,-1)

In [20]:
file_path = "../data/converted/sample-3.wav"

In [21]:
feature = extract_feature(file_path)

In [22]:
prediction = model.predict(feature)

In [24]:
print("Prediction : ", labels[int(prediction[0])])

Prediction :  noise


### 6. Saving model and scaler as pkl file

In [25]:
file.save_file("model.pkl", model)
file.save_file("scaler.pkl", scaler)