In [26]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
import pandas as pd
import os


In [None]:
# 'C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out'
sample_rate, data = wavfile.read('C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\\CAR01_NORMAL2.wav')
print(sample_rate)
print(data.shape)

16000
(1920768,)


In [28]:
# Scale to [-1,1] and convert to float32
data = data.astype(np.float32) / np.max(np.abs(data))
print(data.dtype)
print(data.shape)

float32
(1920768,)


In [29]:
# Windowing: 20 ms windows, 50% overlap
def create_windows(data, sample_rate):
    window_size = max(1, int(0.02 * sample_rate))  # samples per 20 ms
    hop_size = max(1, window_size // 2)

    # Make overlapping windows efficiently
    from numpy.lib.stride_tricks import sliding_window_view
    if len(data) < window_size:
        # Pad with zeros to create at least one window
        pad_width = window_size - len(data)
        data = np.pad(data, (0, pad_width), mode='constant')
    windows = sliding_window_view(data, window_shape=window_size)[::hop_size]  # shape: (num_windows, window_size)
    # Apply Hann window to reduce spectral leakage
    hann = np.hanning(window_size)
    win = windows * hann
    return win

# Feature extraction
def extract_features(signal):
    # Root Mean Square (RMS) energy
    rms = np.sqrt(np.mean(signal**2))
    # Zero Crossing Rate (ZCR)
    zero_crossings = np.where(np.diff(np.sign(signal)))[0]
    zcr = len(zero_crossings) / len(signal)
    # Mean and Standard Deviation of the signal
    mean = np.mean(signal)
    std = np.std(signal)
    # skewness
    skew = np.mean((signal - mean)**3) / (std**3) if std else 0
    # kurtosis
    kurt = np.mean((signal - mean)**4) / (std**4) if std else 0
    # Energy
    energy = np.sum(signal**2)
    # FFT mean
    fft = np.fft.rfft(signal)
    fft_mean = np.mean(np.abs(fft))
    # FFT energy
    fft_energy = np.sum(np.abs(fft)**2)
    # FFT min and max
    fft_min = np.min(np.abs(fft))
    fft_max = np.max(np.abs(fft))
    # FFT min max diff
    fft_diff = fft_max - fft_min
    # FFT magnitude
    mag = np.abs(fft)
    # FFT median
    fft_median = np.median(mag)
    # FFT median absolute deviation
    fft_mad = np.mean(np.abs(mag - fft_median))

    return rms, zcr, mean, std, skew, kurt, energy, fft_mean, fft_energy, fft_min, fft_max, fft_diff, fft_median, fft_mad

# Extract features from each window
windows = create_windows(data, sample_rate)
features = []
for win in windows:
    features.append(extract_features(win))
    
# Convert features to DataFrame    
df_car01 = pd.DataFrame(features, columns=["RMS Energy", "Zero Crossing Rate", "Mean", "Std Dev", "Skewness", "Kurtosis", "Energy",
                                     "FFT Mean", "FFT Energy", "FFT Min", "FFT Max", "FFT Min-Max Diff", "FFT Median", "FFT MAD"])
df_car01.head()


Unnamed: 0,RMS Energy,Zero Crossing Rate,Mean,Std Dev,Skewness,Kurtosis,Energy,FFT Mean,FFT Energy,FFT Min,FFT Max,FFT Min-Max Diff,FFT Median,FFT MAD
0,0.043529,0.425,0.004692,0.043276,0.539507,4.194774,0.606333,0.597833,99.609104,0.022447,2.750033,2.727587,0.455713,0.351919
1,0.036345,0.296875,0.002297,0.036272,-0.200042,4.031821,0.422698,0.475647,68.486716,0.057297,3.289081,3.231784,0.344774,0.26248
2,0.036902,0.328125,-0.003286,0.036756,-0.610448,5.387104,0.43577,0.509343,70.332126,0.006842,2.701417,2.694575,0.401037,0.256448
3,0.036571,0.38125,-6.5e-05,0.036571,0.096018,6.033674,0.42799,0.506454,68.614241,0.020676,2.343038,2.322362,0.391963,0.282975
4,0.046112,0.496875,0.004711,0.045871,-0.210194,5.134912,0.680431,0.657688,110.263147,0.045893,2.842715,2.796822,0.514889,0.354964


In [30]:
df_car01["label"] = "NORMAL"
print(df_car01.head())

   RMS Energy  Zero Crossing Rate      Mean   Std Dev  Skewness  Kurtosis  \
0    0.043529            0.425000  0.004692  0.043276  0.539507  4.194774   
1    0.036345            0.296875  0.002297  0.036272 -0.200042  4.031821   
2    0.036902            0.328125 -0.003286  0.036756 -0.610448  5.387104   
3    0.036571            0.381250 -0.000065  0.036571  0.096018  6.033674   
4    0.046112            0.496875  0.004711  0.045871 -0.210194  5.134912   

     Energy  FFT Mean  FFT Energy   FFT Min   FFT Max  FFT Min-Max Diff  \
0  0.606333  0.597833   99.609104  0.022447  2.750033          2.727587   
1  0.422698  0.475647   68.486716  0.057297  3.289081          3.231784   
2  0.435770  0.509343   70.332126  0.006842  2.701417          2.694575   
3  0.427990  0.506454   68.614241  0.020676  2.343038          2.322362   
4  0.680431  0.657688  110.263147  0.045893  2.842715          2.796822   

   FFT Median   FFT MAD   label  
0    0.455713  0.351919  NORMAL  
1    0.344774  0.2

In [31]:
df_car01["car_id"] = "CAR01"
df_car01.head()

Unnamed: 0,RMS Energy,Zero Crossing Rate,Mean,Std Dev,Skewness,Kurtosis,Energy,FFT Mean,FFT Energy,FFT Min,FFT Max,FFT Min-Max Diff,FFT Median,FFT MAD,label,car_id
0,0.043529,0.425,0.004692,0.043276,0.539507,4.194774,0.606333,0.597833,99.609104,0.022447,2.750033,2.727587,0.455713,0.351919,NORMAL,CAR01
1,0.036345,0.296875,0.002297,0.036272,-0.200042,4.031821,0.422698,0.475647,68.486716,0.057297,3.289081,3.231784,0.344774,0.26248,NORMAL,CAR01
2,0.036902,0.328125,-0.003286,0.036756,-0.610448,5.387104,0.43577,0.509343,70.332126,0.006842,2.701417,2.694575,0.401037,0.256448,NORMAL,CAR01
3,0.036571,0.38125,-6.5e-05,0.036571,0.096018,6.033674,0.42799,0.506454,68.614241,0.020676,2.343038,2.322362,0.391963,0.282975,NORMAL,CAR01
4,0.046112,0.496875,0.004711,0.045871,-0.210194,5.134912,0.680431,0.657688,110.263147,0.045893,2.842715,2.796822,0.514889,0.354964,NORMAL,CAR01


In [32]:
def process_wav(file_path, label, car_id):
    """
    Load a .wav file, extract features for each 20 ms window,
    and return a labeled DataFrame.
    """
    # --- Load WAV ---
    sr, data = wavfile.read(file_path)
    data = np.asarray(data)

    # --- Convert to float32 and scale to [-1, 1] ---
    data = data.astype(np.float32)
    max_val = np.max(np.abs(data))
    if max_val > 0:
        data /= max_val

    # --- Create windows ---
    win = create_windows(data, sr)

    # --- Extract features per window ---
    features = np.array([extract_features(w) for w in win])

    # --- Make DataFrame ---
    columns = [
        "RMS Energy", "Zero Crossing Rate", "Mean", "Std Dev", "Skewness", "Kurtosis",
        "Energy", "FFT Mean", "FFT Energy", "FFT Min", "FFT Max",
        "FFT Min-Max Diff", "FFT Median", "FFT MAD"
    ]
    df = pd.DataFrame(features, columns=columns)
    df["label"] = label
    df["car_id"] = car_id
    df["file"] = os.path.basename(file_path)

    return df

In [None]:
import glob
import os

# Find all WAVs for car01 that contain "normal" in their names
normal_files = glob.glob(r"C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\\*car01*normal*.wav")

dfs = []
for f in normal_files:
    print(f"Processing {f}...")
    df_temp = process_wav(f, label="NORMAL", car_id="CAR01")
    dfs.append(df_temp)

# Combine all NORMAL recordings for car01
df_car01_normal = pd.concat(dfs, ignore_index=True)
df_car01_normal.head()

Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\CAR01_NORMAL.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\CAR01_NORMAL2.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\CAR01_NORMAL3.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\CAR01_NORMAL4.wav...


Unnamed: 0,RMS Energy,Zero Crossing Rate,Mean,Std Dev,Skewness,Kurtosis,Energy,FFT Mean,FFT Energy,FFT Min,FFT Max,FFT Min-Max Diff,FFT Median,FFT MAD,label,car_id,file
0,0.590236,0.0125,-0.447317,0.38508,0.00705,1.813349,111.481097,3.117672,28081.904072,0.534739,143.141302,142.606563,0.846992,2.449278,NORMAL,CAR01,CAR01_NORMAL.wav
1,0.407952,0.00625,-0.307309,0.268302,-0.420757,1.711234,53.255957,1.114782,13356.217733,2.8e-05,98.33885,98.338822,0.0041,1.11235,NORMAL,CAR01,CAR01_NORMAL.wav
2,0.068964,0.00625,-0.05289,0.044256,-0.266504,1.530318,1.521915,0.18756,386.732643,5.3e-05,16.924906,16.924853,0.001572,0.186601,NORMAL,CAR01,CAR01_NORMAL.wav
3,0.008001,0.00625,-0.006048,0.005237,-0.312779,1.550727,0.020483,0.023188,5.150439,6.7e-05,1.935504,1.935437,0.001544,0.022253,NORMAL,CAR01,CAR01_NORMAL.wav
4,0.000638,0.04375,-0.000394,0.000501,-0.520898,1.746676,0.00013,0.002954,0.02879,3.3e-05,0.126163,0.12613,0.001125,0.00228,NORMAL,CAR01,CAR01_NORMAL.wav


In [None]:
# Find all WAVs for car02 that contain "normal" in their names
normal_files = glob.glob(r"C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\\*car02*normal*.wav")

dfs = []
for f in normal_files:
    print(f"Processing {f}...")
    df_temp = process_wav(f, label="NORMAL", car_id="CAR02")
    dfs.append(df_temp)

# Combine all NORMAL recordings for car02
df_car02_normal = pd.concat(dfs, ignore_index=True)
df_car02_normal.head()

Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\CAR02_NORMAL.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\CAR02_NORMAL2.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\CAR02_NORMAL3.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\CAR02_NORMAL4.wav...


Unnamed: 0,RMS Energy,Zero Crossing Rate,Mean,Std Dev,Skewness,Kurtosis,Energy,FFT Mean,FFT Energy,FFT Min,FFT Max,FFT Min-Max Diff,FFT Median,FFT MAD,label,car_id,file
0,0.07134,0.596875,-0.003776,0.07124,0.486736,5.557927,1.628613,1.009764,269.0488,0.026939,4.073558,4.046619,0.793402,0.589594,NORMAL,CAR02,CAR02_NORMAL.wav
1,0.059422,0.634375,-0.008917,0.058749,-0.423169,6.708136,1.129919,0.847585,196.795833,0.127128,4.886341,4.759212,0.673838,0.470621,NORMAL,CAR02,CAR02_NORMAL.wav
2,0.039376,0.571875,0.003006,0.039261,0.24845,5.486059,0.496146,0.580309,81.41466,0.039382,2.172662,2.13328,0.431319,0.314837,NORMAL,CAR02,CAR02_NORMAL.wav
3,0.026224,0.39375,0.004451,0.025844,0.309521,4.517991,0.220064,0.373087,36.235704,0.021751,1.861572,1.83982,0.300121,0.182511,NORMAL,CAR02,CAR02_NORMAL.wav
4,0.026348,0.35,0.000355,0.026346,-0.312434,5.293238,0.222156,0.332212,35.559574,0.024198,2.478034,2.453836,0.253357,0.173475,NORMAL,CAR02,CAR02_NORMAL.wav


In [35]:
# Find all WAVs for car03 that contain "normal" in their names
normal_files = glob.glob(r"C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\\*car03*normal*.wav")

dfs = []
for f in normal_files:
    print(f"Processing {f}...")
    df_temp = process_wav(f, label="NORMAL", car_id="CAR03")
    dfs.append(df_temp)

# Combine all NORMAL recordings for car03
df_car03_normal = pd.concat(dfs, ignore_index=True)
df_car03_normal.head()

Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\CAR03_NORMAL.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\CAR03_NORMAL2.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\CAR03_NORMAL3.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\CAR03_NORMAL4.wav...


Unnamed: 0,RMS Energy,Zero Crossing Rate,Mean,Std Dev,Skewness,Kurtosis,Energy,FFT Mean,FFT Energy,FFT Min,FFT Max,FFT Min-Max Diff,FFT Median,FFT MAD,label,car_id,file
0,0.577691,0.0125,-0.438131,0.376522,0.203558,2.451601,106.792688,3.895431,26915.519205,0.884874,140.20189,139.317016,1.353922,2.831341,NORMAL,CAR03,CAR03_NORMAL.wav
1,0.594992,0.0125,-0.466082,0.369841,-0.116235,1.45063,113.28484,1.601182,29247.896471,0.023199,149.146383,149.123184,0.036073,1.572689,NORMAL,CAR03,CAR03_NORMAL.wav
2,0.220609,0.00625,-0.173854,0.135805,-0.275515,1.55653,15.573855,0.77603,4039.347899,0.056074,55.633251,55.577177,0.103239,0.697505,NORMAL,CAR03,CAR03_NORMAL.wav
3,0.527421,0.0125,-0.338327,0.404608,-0.415521,2.079372,89.015377,3.741307,20103.204163,0.467479,108.264688,107.79721,1.355259,2.825106,NORMAL,CAR03,CAR03_NORMAL.wav
4,0.582857,0.01875,-0.438812,0.383623,0.25334,2.604575,108.711308,3.982209,27253.367914,0.807028,140.419928,139.6129,1.419347,2.883307,NORMAL,CAR03,CAR03_NORMAL.wav


In [36]:
# Find all WAVs for car04 that contain "normal" in their names
normal_files = glob.glob(r"C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\\*car04*normal*.wav")

dfs = []
for f in normal_files:
    print(f"Processing {f}...")
    df_temp = process_wav(f, label="NORMAL", car_id="CAR04")
    dfs.append(df_temp)

# Combine all NORMAL recordings for car04
df_car04_normal = pd.concat(dfs, ignore_index=True)
df_car04_normal.head()

Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\CAR04_NORMAL.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\CAR04_NORMAL2.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\CAR04_NORMAL3.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\NORMAL\CAR04_NORMAL4.wav...


Unnamed: 0,RMS Energy,Zero Crossing Rate,Mean,Std Dev,Skewness,Kurtosis,Energy,FFT Mean,FFT Energy,FFT Min,FFT Max,FFT Min-Max Diff,FFT Median,FFT MAD,label,car_id,file
0,0.099016,0.665625,-0.001417,0.099005,-0.009824,4.82835,3.137305,1.230692,502.26467,0.040754,8.400898,8.360145,0.773585,0.78261,NORMAL,CAR04,CAR04_NORMAL.wav
1,0.05763,0.503125,0.013035,0.056136,0.676319,5.838632,1.062783,0.768106,180.193494,0.026516,4.171311,4.144794,0.506283,0.469346,NORMAL,CAR04,CAR04_NORMAL.wav
2,0.056013,0.434375,-0.009543,0.055194,-1.229192,5.975926,1.00398,0.645848,166.015191,0.014344,6.124445,6.110101,0.394641,0.403516,NORMAL,CAR04,CAR04_NORMAL.wav
3,0.039329,0.3375,-0.005082,0.038999,-1.244912,7.197346,0.494963,0.51655,80.578637,0.009727,2.40234,2.392613,0.380493,0.30897,NORMAL,CAR04,CAR04_NORMAL.wav
4,0.043956,0.278125,0.001433,0.043933,0.684347,5.548546,0.618291,0.559724,99.121525,0.047925,3.87751,3.829584,0.404689,0.328691,NORMAL,CAR04,CAR04_NORMAL.wav


In [39]:
df_normal = pd.concat([df_car01_normal, df_car02_normal, df_car03_normal, df_car04_normal], ignore_index=True)
print(df_normal.head())
print(df_normal["label"].value_counts())
print(df_normal["car_id"].value_counts())

   RMS Energy  Zero Crossing Rate      Mean   Std Dev  Skewness  Kurtosis  \
0    0.590236             0.01250 -0.447317  0.385080  0.007050  1.813349   
1    0.407952             0.00625 -0.307309  0.268302 -0.420757  1.711234   
2    0.068964             0.00625 -0.052890  0.044256 -0.266504  1.530318   
3    0.008001             0.00625 -0.006048  0.005237 -0.312779  1.550727   
4    0.000638             0.04375 -0.000394  0.000501 -0.520898  1.746676   

       Energy  FFT Mean    FFT Energy   FFT Min     FFT Max  FFT Min-Max Diff  \
0  111.481097  3.117672  28081.904072  0.534739  143.141302        142.606563   
1   53.255957  1.114782  13356.217733  0.000028   98.338850         98.338822   
2    1.521915  0.187560    386.732643  0.000053   16.924906         16.924853   
3    0.020483  0.023188      5.150439  0.000067    1.935504          1.935437   
4    0.000130  0.002954      0.028790  0.000033    0.126163          0.126130   

   FFT Median   FFT MAD   label car_id            

In [40]:
# Find all WAVs for car01 that contain "faulty" in their names
faulty_files = glob.glob(r"C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\\*car01*faulty*.wav")

dfs = []
for f in faulty_files:
    print(f"Processing {f}...")
    df_temp = process_wav(f, label="FAULTY", car_id="CAR01")
    dfs.append(df_temp)

# Combine all FAULTY recordings for car01
df_car01_faulty = pd.concat(dfs, ignore_index=True)

Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR01_FAULTY.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR01_FAULTY2.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR01_FAULTY3.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR01_FAULTY4.wav...


In [41]:
# Find all WAVs for car02 that contain "faulty" in their names
faulty_files = glob.glob(r"C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\\*car02*faulty*.wav")

dfs = []
for f in faulty_files:
    print(f"Processing {f}...")
    df_temp = process_wav(f, label="FAULTY", car_id="CAR02")
    dfs.append(df_temp)

# Combine all FAULTY recordings for car02
df_car02_faulty = pd.concat(dfs, ignore_index=True)

Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR02_FAULTY.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR02_FAULTY2.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR02_FAULTY3.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR02_FAULTY4.wav...


In [42]:
# Find all WAVs for car03 that contain "faulty" in their names
faulty_files = glob.glob(r"C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\\*car03*faulty*.wav")

dfs = []
for f in faulty_files:
    print(f"Processing {f}...")
    df_temp = process_wav(f, label="FAULTY", car_id="CAR03")
    dfs.append(df_temp)

# Combine all FAULTY recordings for car03
df_car03_faulty = pd.concat(dfs, ignore_index=True)

Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR03_FAULTY.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR03_FAULTY2.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR03_FAULTY3.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR03_FAULTY4.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR03_FAULTY5.wav...


In [43]:
# Find all WAVs for car04 that contain "faulty" in their names
faulty_files = glob.glob(r"C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\\*car04*faulty*.wav")

dfs = []
for f in faulty_files:
    print(f"Processing {f}...")
    df_temp = process_wav(f, label="FAULTY", car_id="CAR04")
    dfs.append(df_temp)

# Combine all FAULTY recordings for car04
df_car04_faulty = pd.concat(dfs, ignore_index=True)

Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR04_FAULTY.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR04_FAULTY2.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR04_FAULTY3.wav...
Processing C:\\Users\\nusse\\Desktop\\EH5TinyML\\soundovertcp\\Server\\out\\FAULTY\CAR04_FAULTY4.wav...


In [44]:
df_faulty = pd.concat([df_car01_faulty, df_car02_faulty, df_car03_faulty, df_car04_faulty], ignore_index=True)
print(df_faulty.head())
print(df_faulty["label"].value_counts())
print(df_faulty["car_id"].value_counts())

   RMS Energy  Zero Crossing Rate      Mean   Std Dev  Skewness  Kurtosis  \
0    0.052342            0.250000  0.004459  0.052152 -0.003804  4.810695   
1    0.040002            0.209375 -0.000204  0.040002 -0.209156  4.884339   
2    0.041209            0.237500 -0.000240  0.041208  0.277924  6.338346   
3    0.037616            0.265625  0.000401  0.037614  0.055549  5.736441   
4    0.034497            0.368750  0.001210  0.034475 -0.020088  5.220019   

     Energy  FFT Mean  FFT Energy   FFT Min   FFT Max  FFT Min-Max Diff  \
0  0.876715  0.629304  142.922475  0.024039  4.567020          4.542981   
1  0.512060  0.475284   82.145389  0.035812  4.648804          4.612991   
2  0.543415  0.397534   87.219234  0.018808  4.709432          4.690624   
3  0.452780  0.421870   72.501334  0.016782  2.966286          2.949504   
4  0.380804  0.471702   61.067224  0.011676  2.917248          2.905572   

   FFT Median   FFT MAD   label car_id              file  
0    0.430344  0.353548  FA

In [45]:
# Save dataframes to CSV
df_normal.to_csv('car_normal_data.csv', index=False)
df_faulty.to_csv('car_faulty_data.csv', index=False)

In [46]:
# Align columns (keep intersection of feature cols)
common = df_normal.columns.intersection(df_faulty.columns)
df_normal = df_normal[common]
df_faulty = df_faulty[common]

df_all = pd.concat([df_normal, df_faulty], ignore_index=True)
print(df_all["label"].value_counts())

label
FAULTY    202856
NORMAL    191322
Name: count, dtype: int64


In [None]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

meta_cols = [c for c in ["label","car_id","file"] if c in df_all.columns]
feature_cols = [c for c in df_all.columns if c not in meta_cols]

X = df_all[feature_cols].values
y = df_all["label"].values
groups = df_all["car_id"].values if "car_id" in df_all.columns else df_all["file"].values

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups))

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("rf", RandomForestClassifier(
        n_estimators=200, max_depth=12, class_weight="balanced", random_state=42
    ))
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# plot confusion matrix
import seaborn as sns
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y), yticklabels=np.unique(y))
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
# Feature importances (from the RF step)
importances = pipe.named_steps["rf"].feature_importances_
feat_ranking = sorted(zip(feature_cols, importances), key=lambda x: x[1], reverse=True)
for name, score in feat_ranking[:15]:
    print(f"{name:20s}  {score:.4f}")

              precision    recall  f1-score   support

      FAULTY       0.65      0.55      0.60     47998
      NORMAL       0.61      0.70      0.65     48011

    accuracy                           0.63     96009
   macro avg       0.63      0.63      0.62     96009
weighted avg       0.63      0.63      0.62     96009

[[26602 21396]
 [14557 33454]]
Zero Crossing Rate    0.1567
Mean                  0.1412
FFT Median            0.0994
Std Dev               0.0862
FFT Mean              0.0813
RMS Energy            0.0702
Energy                0.0650
FFT Max               0.0608
FFT MAD               0.0580
FFT Min-Max Diff      0.0532
FFT Energy            0.0493
Kurtosis              0.0327
Skewness              0.0326
FFT Min               0.0135


[[26602 21396]
 [14557 33454]]
