Isolating the Transient to Speed up Training Time and hopefully Accuracy Too

In [1]:
import os
import pandas as pd
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt
from scipy.signal import hilbert
from scipy.stats import skew, kurtosis
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sktime.transformations.panel.rocket import MiniRocketMultivariateVariable
from sklearn.linear_model import RidgeClassifierCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from module import my_improved_bayesian_change_point_detection, load_data, preprocess_bluetooth_signals
from rf_classifier import RFClassifier

In [2]:
# Set the root directory
root_directory = os.path.join(os.path.join(os.getcwd(), 'Bluetooth Datasets'), 'Dataset 5 Gsps')

In [3]:
# Load the data
data = load_data(root_directory)

New Section: Preprocess data by removing spur signals, normalizing, applying hilbert transform, and isolating transient

In [4]:
preprocessed_data = preprocess_bluetooth_signals(data, signal_column='signal', dataset='A')

Preprocessing 2548 signals from dataset A...
Processing signal 1/2548...
Processing signal 1001/2548...
Processing signal 2001/2548...
Preprocessing complete!


In [5]:
start_time = time.perf_counter()

In [6]:
preprocessed_data['transient'] = None
for idx, row in preprocessed_data.iterrows():
    analytic_signal = row['analytic_signal']
    start_idx, end_idx, _, _ = my_improved_bayesian_change_point_detection(analytic_signal, window_size=600, overlap=0.65, start_threshold=10, end_threshold=2)
    preprocessed_data.at[idx, 'transient'] = analytic_signal[start_idx:end_idx]

In [7]:
def extract_transient_features(transient, fs):
    """Extract 9 HOS features + duration from a transient signal"""
    # Convert pandas Series to numpy array
    analytic = transient
    
    # Compute analytic signal
    # analytic = hilbert(signal)
    
    # Instantaneous characteristics
    amplitude = np.abs(analytic)
    amplitude_centered = amplitude - np.mean(amplitude)

    phase = np.unwrap(np.angle(analytic))
    time = 1 / fs
    mu_f = np.mean(np.diff(phase)/(2 * np.pi))  # Mean frequency
    phase_nonlinear = phase - 2 * np.pi * mu_f * time
    phase_centered = phase_nonlinear - np.mean(phase_nonlinear)

    frequency = np.diff(phase)/(2*np.pi)  # Handle length mismatch
    frequency_centered = frequency - np.mean(frequency)
    
    # Calculate HOS features
    features = {
        'amp_var': np.var(amplitude_centered),
        'amp_skew': skew(amplitude_centered, bias=False),
        'amp_kurt': kurtosis(amplitude_centered, fisher=True, bias=False),
        'phase_var': np.var(phase_centered),
        'phase_skew': skew(phase_centered, bias=False),
        'phase_kurt': kurtosis(phase_centered, fisher=True, bias=False),
        'freq_var': np.var(frequency_centered) if len(frequency) > 0 else 0,
        'freq_skew': skew(frequency_centered, bias=False) if len(frequency) > 0 else 0,
        'freq_kurt': kurtosis(frequency_centered, fisher=True, bias=False) if len(frequency) > 0 else 0,
        'duration': len(analytic)
    }
    
    return features

In [8]:
# Normalize and center features
def normalize_features(df):
    """Z-score normalization for all feature columns"""
    normalized_df = df.copy()
    for col in normalized_df.columns:
        if col != 'duration':
            mean = normalized_df[col].mean()
            std = normalized_df[col].std()
            normalized_df[col] = (normalized_df[col] - mean)/std
    return normalized_df

In [9]:
# Extract features for all transients
# feature_df = preprocessed_data['transient'].apply(extract_transient_features).apply(pd.Series)
feature_df = preprocessed_data['transient'].apply(
    lambda x: extract_transient_features(x, fs=5e9)
).apply(pd.Series)

In [10]:
# Apply normalization (excluding duration initially)
normalized_features = normalize_features(feature_df)

In [11]:
# Normalize duration separately
duration_mean = feature_df['duration'].mean()
duration_std = feature_df['duration'].std()
normalized_features['duration'] = (feature_df['duration'] - duration_mean)/duration_std

In [12]:
clipped_features = normalized_features.copy()

# Clip features between -3 and 3 (z-score units)
for col in clipped_features.columns:
    clipped_features[col] = np.clip(clipped_features[col], -3.5, 3.5)

# Add normalized features back to original dataframe
# complete_df = pd.concat([preprocessed_data, normalized_features], axis=1)
complete_df = pd.concat([preprocessed_data, clipped_features], axis=1)

In [13]:
device = torch.device('cuda')
torch.manual_seed(0)

<torch._C.Generator at 0x7c16e8ea4610>

In [14]:
def prepare_data(df, test_size=0.2, random_state=42):
    # Encode labels
    le = LabelEncoder()
    df['label_encoded'] = le.fit_transform(df['label'])
    
    # Extract features and labels
    feature_cols = [col for col in df.columns if col not in ['label', 'label_encoded']]
    X = df[feature_cols].values
    y = df['label_encoded'].values
    
    # Stratified split (120 train, 30 test per class)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=test_size, 
        stratify=y,
        random_state=random_state
    )

    # Normalize features
    # scaler = StandardScaler()
    # X_train = scaler.fit_transform(X_train)
    # X_test = scaler.transform(X_test)
    
    return X_train, X_test, y_train, y_test, le.classes_

In [15]:
def train_model(model, X_train, y_train, X_test, y_test, epochs=2500, lr=0.0001):
    train_dataset = TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.long)
    )
    
    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    
    # Initialize model, loss, optimizer
    criterion = nn.CrossEntropyLoss()
    # optimizer = optim.Adam(model.parameters(), lr=lr)
    optimizer = optim.AdamW(model.parameters(), lr=lr ,weight_decay=0.001)
    scheduler = StepLR(optimizer, step_size=500, gamma=0.9)
    
    # Training loop
    for _ in range(epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        scheduler.step()
    return model

In [16]:
def evaluate_model(model, X_test, y_test):
    test_dataset = TensorDataset(
        torch.tensor(X_test, dtype=torch.float32),
        torch.tensor(y_test, dtype=torch.long)
    )
    test_loader = DataLoader(test_dataset, batch_size=32)
    
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    final_acc = 100 * correct / total
    print(f'Final Test Accuracy: {final_acc:.1f}%')
    return final_acc

In [17]:
nn_df = complete_df[['label', 'amp_var',
       'amp_skew', 'amp_kurt', 'phase_var', 'phase_skew', 'phase_kurt',
       'freq_var', 'freq_skew', 'freq_kurt', 'duration']]

# Prepare data
X_train, X_test, y_train, y_test, classes = prepare_data(nn_df, random_state=42)

# Initialize model
model = RFClassifier(input_size=X_train.shape[1], hidden_size=128, num_classes=len(classes), dropout_rate=0.4) # .to(device)
print(model)

RFClassifier(
  (layers): Sequential(
    (0): Linear(in_features=10, out_features=128, bias=True)
    (1): Tanh()
    (2): Dropout(p=0.4, inplace=False)
    (3): Linear(in_features=128, out_features=128, bias=True)
    (4): Tanh()
    (5): Dropout(p=0.4, inplace=False)
    (6): Linear(in_features=128, out_features=128, bias=True)
    (7): Tanh()
    (8): Dropout(p=0.4, inplace=False)
    (9): Linear(in_features=128, out_features=17, bias=True)
  )
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label_encoded'] = le.fit_transform(df['label'])


In [None]:
trained_model = train_model(model, X_train, y_train, X_test, y_test, epochs=2500, lr=1e-3)
end_time = time.perf_counter()
final_acc = evaluate_model(trained_model, X_test, y_test)

Final Test Accuracy: 78.4%


In [19]:
print(f'Execution time: {end_time - start_time:.6f} seconds')

Execution time: 339.333641 seconds
