In [1]:
import os
import numpy as np
import pandas as pd
from scipy.io import loadmat


In [3]:
def extract_features(signal):
    """Compute time-domain summary statistics of a signal.

    Parameters
    ----------
    signal : array-like
        Numeric samples. The input is converted to a float64 array first, so
        plain lists are accepted and integer-typed recordings cannot wrap
        around when squared.

    Returns
    -------
    dict
        Keys ``'mean'``, ``'std'`` (NumPy default, ddof=0), ``'rms'``,
        ``'peak_to_peak'`` and ``'energy'`` (sum of squared samples).

    Raises
    ------
    ValueError
        If ``signal`` contains no samples.
    """
    samples = np.asarray(signal, dtype=np.float64)
    if samples.size == 0:
        raise ValueError("signal must contain at least one sample")

    # Squared once and reused for both RMS and energy.
    squared = samples ** 2

    features = {}
    features['mean'] = np.mean(samples)
    features['std'] = np.std(samples)
    features['rms'] = np.sqrt(np.mean(squared))
    features['peak_to_peak'] = np.max(samples) - np.min(samples)
    features['energy'] = np.sum(squared)
    return features

In [5]:
def load_drive_end_signal(mat_file_path):
    """Load the drive-end time series from a MATLAB ``.mat`` file.

    Parameters
    ----------
    mat_file_path : str or path-like
        Path to the ``.mat`` file, passed straight to ``scipy.io.loadmat``.

    Returns
    -------
    numpy.ndarray
        The matched variable flattened to one dimension.

    Raises
    ------
    IndexError
        If the file holds no variable whose name ends in ``'_DE_time'``.
    """
    data = loadmat(mat_file_path)

    # The variable is located by suffix rather than by full name; when several
    # variables match, the first one in the loaded dict is used.
    de_keys = [key for key in data if key.endswith('_DE_time')]
    if not de_keys:
        # Same exception type as indexing the empty list, but with a message
        # that names the offending file.
        raise IndexError(
            f"no variable ending in '_DE_time' found in {mat_file_path!r}"
        )

    return data[de_keys[0]].flatten()


In [7]:
# Input folders (relative paths); each one is scanned for .mat files below.
normal_dir = "Dataset/Normal"
faulty_dir = "Dataset/Faulty"

In [9]:
dataset = []

# One pass per class folder: label 0 = Normal, label 1 = Faulty.
for directory, label in ((normal_dir, 0), (faulty_dir, 1)):
    # os.listdir() returns names in arbitrary order; sorting fixes the row
    # order so the seeded train/test split is reproducible across machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".mat"):
            continue
        file_path = os.path.join(directory, filename)

        # One feature row per recording, computed over the whole signal.
        signal = load_drive_end_signal(file_path)
        features = extract_features(signal)
        features['label'] = label

        dataset.append(features)


In [11]:
# Assemble the per-file feature dicts into a table (one row per .mat file).
# NOTE(review): the recorded output shows two identical normal rows (index 1
# and 2), which suggests the same recording exists twice in the folder — confirm.
df = pd.DataFrame(dataset)
df.head()

Unnamed: 0,mean,std,rms,peak_to_peak,energy,label
0,0.012558,0.072687,0.073764,0.597892,1327.296658,0
1,0.012564,0.065152,0.066352,0.663397,2130.42823,0
2,0.012564,0.065152,0.066352,0.663397,2130.42823,0
3,0.012459,0.064695,0.065884,0.590173,2108.022789,0
4,0.013444,0.291216,0.291526,3.118917,10306.001322,1


In [13]:
# Rows per class (0 = normal, 1 = faulty), to check the class balance.
df['label'].value_counts()

label
1    16
0     4
Name: count, dtype: int64

In [15]:
# Feature matrix: every column except the target.
X = df.drop(columns=['label'])
# Target vector: the integer class label assigned while building the dataset.
y = df['label']


In [17]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing, stratified on y, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [19]:
from sklearn.preprocessing import StandardScaler

# Scaling statistics are learned from the training split only; the same
# fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [21]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline, fit on the standardised training features.
# class_weight='balanced' is presumably there to offset the uneven class counts.
model = LogisticRegression(class_weight='balanced', random_state=42)
model.fit(X_train_scaled, y_train)

In [23]:
# NOTE(review): stray assignment — no visible cell reads this variable; the
# classifiers receive class_weight='balanced' as a literal keyword argument.
# Looks like a leftover that can be deleted — confirm.
class_weight='balanced'

In [25]:
# Predict on the held-out rows, using the scaled features the model was fit on.
y_pred = model.predict(X_test_scaled)

In [27]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class metrics, then the raw confusion matrix, each under its own heading.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      0.75      0.86         4

    accuracy                           0.80         5
   macro avg       0.75      0.88      0.76         5
weighted avg       0.90      0.80      0.82         5

Confusion Matrix:
[[1 0]
 [1 3]]


In [29]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
# Shallow random forest with a fixed seed, trained on the unscaled features.
rf_model = RandomForestClassifier(
    n_estimators=100, max_depth=4, class_weight='balanced', random_state=42
)

rf_model.fit(X_train, y_train)


In [33]:
# Predict on the unscaled test features (the forest was fit on unscaled X_train).
y_pred_rf = rf_model.predict(X_test)


In [35]:
from sklearn.metrics import classification_report, confusion_matrix

# Same two views as for the logistic-regression baseline, for the forest.
print(
    "Random Forest Classification Report:",
    classification_report(y_test, y_pred_rf),
    sep="\n",
)

print(
    "Random Forest Confusion Matrix:",
    confusion_matrix(y_test, y_pred_rf),
    sep="\n",
)

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         4

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5

Random Forest Confusion Matrix:
[[1 0]
 [0 4]]


In [37]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 3-fold stratified, shuffled cross-validation of the forest over all rows,
# scored with F1; the cell displays the per-fold scores and their mean.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf_model, X, y, cv=cv, scoring='f1')

cv_scores, cv_scores.mean()

(array([1., 1., 1.]), 1.0)

In [38]:
import pandas as pd

# Pair each feature name with rf_model.feature_importances_ and list the
# features from most to least important.
importance = pd.Series(
    rf_model.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

importance


peak_to_peak    0.272727
energy          0.272727
std             0.232323
rms             0.202020
mean            0.020202
dtype: float64

In [41]:
def get_severity_label(filename):
    """Derive an integer severity class from a recording's file name.

    Matching is case-insensitive substring search, checked in a fixed order:
    ``'normal'`` -> 0, ``'ir007'`` -> 1, ``'ir014'`` or ``'ir021'`` -> 2,
    ``'ir028'`` -> 3. The first rule that matches decides the result.

    Parameters
    ----------
    filename : str
        File name to classify.

    Returns
    -------
    int or None
        The severity class, or ``None`` when no marker occurs in the name.
    """
    lowered = filename.lower()

    # Ordered (markers, label) rules; earlier rules take precedence.
    # NOTE(review): the irNNN markers presumably encode fault size — confirm.
    rules = (
        (('normal',), 0),
        (('ir007',), 1),
        (('ir014', 'ir021'), 2),
        (('ir028',), 3),
    )
    for markers, label in rules:
        if any(marker in lowered for marker in markers):
            return label
    return None


In [43]:
# Same folder paths as assigned near the top of the notebook, re-assigned
# here unchanged before the severity dataset is built.
normal_dir = "Dataset/Normal"
faulty_dir = "Dataset/Faulty"


In [45]:
dataset = []

# os.listdir() returns names in arbitrary order; both listings are sorted so
# the row order, and with it the seeded split further down, is reproducible.

# NORMAL
for filename in sorted(os.listdir(normal_dir)):
    if filename.endswith(".mat"):
        file_path = os.path.join(normal_dir, filename)
        signal = load_drive_end_signal(file_path)
        features = extract_features(signal)
        features['label'] = 0
        dataset.append(features)

# FAULTY (severity-aware)
for filename in sorted(os.listdir(faulty_dir)):
    if filename.endswith(".mat"):
        severity = get_severity_label(filename)
        if severity is None:
            # File name carries no recognised severity marker: leave it out.
            continue
        file_path = os.path.join(faulty_dir, filename)
        signal = load_drive_end_signal(file_path)
        features = extract_features(signal)
        features['label'] = severity
        dataset.append(features)

# One row per recording; the cell displays the number of rows per severity class.
df = pd.DataFrame(dataset)
df['label'].value_counts()


label
2    8
0    4
1    4
3    4
Name: count, dtype: int64

In [47]:
# Mean of the amplitude-related features within each severity class.
df.groupby('label')[['rms','energy','peak_to_peak','std']].mean()

Unnamed: 0_level_0,rms,energy,peak_to_peak,std
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.068088,1924.043977,0.628715,0.066921
1,0.299384,10954.066028,3.093699,0.299274
2,0.326501,15806.273338,5.429425,0.326082
3,0.835103,84570.384381,8.240946,0.835064


In [49]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Target is the severity class; every other column is a feature.
y = df['label']
X = df.drop(columns=['label'])

# Stratified 75/25 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# fit() returns the estimator itself, so construction and training are chained.
rf_severity = RandomForestClassifier(
    n_estimators=300, max_depth=6, class_weight='balanced', random_state=42
).fit(X_train, y_train)

y_pred = rf_severity.predict(X_test)

# Per-class metrics followed by the confusion matrix.
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00         1

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5

[[1 0 0 0]
 [0 1 0 0]
 [0 0 2 0]
 [0 0 0 1]]


In [51]:
# Feature importances of the severity forest, labelled by column name and
# sorted from highest to lowest.
importance = pd.Series(
    rf_severity.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

importance

peak_to_peak    0.324475
energy          0.233360
rms             0.200632
std             0.189845
mean            0.051688
dtype: float64

In [53]:
def split_into_windows(signal, window_size=2048):
    """Cut a signal into consecutive, non-overlapping windows of equal length.

    Parameters
    ----------
    signal : sequence
        Any sliceable sequence with a length (list, NumPy array, ...).
    window_size : int, default 2048
        Number of samples per window; must be positive.

    Returns
    -------
    list
        Slices of ``signal``, each exactly ``window_size`` long, in order.
        Trailing samples that do not fill a whole window are dropped; a
        signal shorter than ``window_size`` yields an empty list.

    Raises
    ------
    ValueError
        If ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    # The last valid start index is len(signal) - window_size, so the
    # exclusive range bound needs +1; without it the final full window is
    # lost whenever the length is an exact multiple of window_size.
    last_start_exclusive = len(signal) - window_size + 1
    return [
        signal[start:start + window_size]
        for start in range(0, last_start_exclusive, window_size)
    ]


In [55]:
dataset = []

# os.listdir() returns names in arbitrary order; sorting keeps the row order
# (and therefore the seeded split further down) reproducible.
for filename in sorted(os.listdir(normal_dir)):
    if filename.endswith(".mat"):
        file_path = os.path.join(normal_dir, filename)
        signal = load_drive_end_signal(file_path)

        windows = split_into_windows(signal)

        # One feature row per window, all labelled 0 (normal).
        for w in windows:
            features = extract_features(w)
            features['label'] = 0
            dataset.append(features)


In [57]:
# Appends to the `dataset` list built by the preceding cell; this cell does
# not reset it, so re-running it on its own adds the faulty rows again.
# os.listdir() returns names in arbitrary order, hence the sort.
for filename in sorted(os.listdir(faulty_dir)):
    if filename.endswith(".mat"):
        severity = get_severity_label(filename)
        if severity is None:
            # File name carries no recognised severity marker: leave it out.
            continue

        file_path = os.path.join(faulty_dir, filename)
        signal = load_drive_end_signal(file_path)

        windows = split_into_windows(signal)

        # One feature row per window, labelled with the file's severity class.
        for w in windows:
            features = extract_features(w)
            features['label'] = severity
            dataset.append(features)


In [59]:
# Table of window-level rows (one per window); shows the rows per class.
df = pd.DataFrame(dataset)
df['label'].value_counts()

label
0    828
2    472
1    237
3    235
Name: count, dtype: int64

In [61]:
# Mean of the amplitude-related features within each class, now per window.
df.groupby('label')[['rms','energy','peak_to_peak','std']].mean()


Unnamed: 0_level_0,rms,energy,peak_to_peak,std
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.067247,9.286216,0.419512,0.066029
1,0.299372,183.753173,2.663609,0.299262
2,0.326087,265.329293,4.096518,0.32564
3,0.834628,1427.705438,6.127888,0.834588


In [63]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Target is the severity class; every other column is a feature.
y = df['label']
X = df.drop(columns=['label'])

# NOTE(review): rows are windows cut from the same recordings and this split
# is row-wise, so windows from one file can land in both train and test.
# The scores below may therefore be optimistic; a group-aware split keyed by
# source file (e.g. GroupShuffleSplit) would need the file recorded per row.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# fit() returns the estimator itself, so construction and training are chained.
rf_windowed = RandomForestClassifier(
    n_estimators=300, max_depth=6, class_weight='balanced', random_state=42
).fit(X_train, y_train)

y_pred = rf_windowed.predict(X_test)

# Per-class metrics followed by the confusion matrix.
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00       207
           1       1.00      1.00      1.00        59
           2       1.00      1.00      1.00       118
           3       1.00      1.00      1.00        59

    accuracy                           1.00       443
   macro avg       1.00      1.00      1.00       443
weighted avg       1.00      1.00      1.00       443

[[207   0   0   0]
 [  0  59   0   0]
 [  0   0 118   0]
 [  0   0   0  59]]


In [64]:
# Feature importances of the window-level forest, labelled by column name and
# sorted from highest to lowest.
importance = pd.Series(
    rf_windowed.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

importance


energy          0.262123
std             0.232991
rms             0.230160
peak_to_peak    0.223897
mean            0.050829
dtype: float64