Isolating the Transient to Speed up Training Time and hopefully Accuracy Too

In [21]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from scipy.signal import hilbert
from scipy.stats import skew, kurtosis
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sktime.transformations.panel.rocket import MiniRocketMultivariateVariable
from sklearn.linear_model import RidgeClassifierCV
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from module import my_improved_bayesian_change_point_detection, load_data, preprocess_bluetooth_signals
from rf_classifier import RFClassifier

In [22]:
# Set the root directory
root_directory = os.path.join(os.path.join(os.getcwd(), 'Bluetooth Datasets'), 'Dataset 250 Msps')

In [23]:
# Load the data
data = load_data(root_directory)

In [24]:
print(data.head())

                                              signal  \
0  0       0.0031
1       0.0023
2       0.0023
3...   
1  0       0.0021
1       0.0031
2       0.0031
3...   
2  0       0.0027
1       0.0023
2       0.0025
3...   
3  0       0.0021
1       0.0023
2       0.0027
3...   
4  0       0.0029
1       0.0029
2       0.0033
3...   

                                  label  \
0  Iphone\4s\013004004984503_oguz_guler   
1  Iphone\4s\013004004984503_oguz_guler   
2  Iphone\4s\013004004984503_oguz_guler   
3  Iphone\4s\013004004984503_oguz_guler   
4  Iphone\4s\013004004984503_oguz_guler   

                                            filename  
0  013004004984503_oguz_guler_ro_iphone_4s_record...  
1  013004004984503_oguz_guler_ro_iphone_4s_record...  
2  013004004984503_oguz_guler_ro_iphone_4s_record...  
3  013004004984503_oguz_guler_ro_iphone_4s_record...  
4  013004004984503_oguz_guler_ro_iphone_4s_record...  


In [25]:
print(data['label'].value_counts())

label
Iphone\4s\013004004984503_oguz_guler               150
Iphone\4s\013051002641007_koray_oktem              150
Iphone\5\013409009258565_gamze_uyuk                150
Iphone\5\013737001703349_makbule_guclu             150
Iphone\5s\352053069423260_mustafa_gungor           150
Iphone\5s\359261061140526_melisa_topcu             150
Iphone\6\354427066558690_mustafa_guclu             150
Iphone\6\355401074766578_berat_serefoglu           150
Iphone\6S\353308076325778_tugce_ozkan              150
Iphone\6S\355396082974273_melisa_oktem             150
Iphone\6S\355694077593599_deniz_yilmaz             150
Iphone\7\356563081643675_cuneyt_buyukkilic         150
Iphone\7\359206078179249_ismet_buyukkilic          150
Iphone\7plus\355373083202269_akin_yavuz            150
Iphone\7plus\359179071432156_melisa_topcu          150
LG\G4\352334072148270_mertcan_yurtseven            150
LG\G4\352334073607175_mert_kilic                   150
LG\V20\3521162080037221_ozan_erbasan               150
LG\V

New Section: Preprocess data by removing spur signals, normalizing, applying hilbert transform, and isolating transient

In [26]:
preprocessed_data = preprocess_bluetooth_signals(data, signal_column='signal', dataset='D')

Preprocessing 4950 signals from dataset D...
Processing signal 1/4950...
Processing signal 1001/4950...
Processing signal 2001/4950...
Processing signal 3001/4950...
Processing signal 4001/4950...
Preprocessing complete!


In [27]:
preprocessed_data['transient'] = None
for idx, row in preprocessed_data.iterrows():
    analytic_signal = row['analytic_signal']
    start_idx, end_idx, _, _ = my_improved_bayesian_change_point_detection(analytic_signal, window_size=200, overlap=0.65, threshold=2, offset=0.75)
    if isinstance(analytic_signal, pd.Series):
        preprocessed_data.at[idx, 'transient'] = analytic_signal.iloc[start_idx:end_idx]
    else:
        preprocessed_data.at[idx, 'transient'] = analytic_signal[start_idx:end_idx]

In [28]:
for idx, row in preprocessed_data.iterrows():
    print(f'Length of original signal: {len(row['signal'])}')
    print(f'Type of original signal: {type(row['signal'])}')
    print(f'Length of filtered signal: {len(row['filtered_signal'])}')
    print(f'Type of filtered signal: {type(row['filtered_signal'])}')
    print(f'Length of normalized signal: {len(row['normalized_signal'])}')
    print(f'Type of normalized signal: {type(row['normalized_signal'])}')
    print(f'Length of analytic signal: {len(row['analytic_signal'])}')
    print(f'Type of analytic signal: {type(row['analytic_signal'])}')
    print(f'Length of I data: {len(row['I_data'])}')
    print(f'Type of I data: {type(row['I_data'])}')
    print(f'Length of Q data: {len(row['Q_data'])}')
    print(f'Type of Q data: {type(row['Q_data'])}')
    print(f'Length of transient: {len(row['transient'])}')
    print(f'Type of transient: {type(row['transient'])}')
    break

Length of original signal: 2151
Type of original signal: <class 'pandas.core.series.Series'>
Length of filtered signal: 2151
Type of filtered signal: <class 'numpy.ndarray'>
Length of normalized signal: 2151
Type of normalized signal: <class 'numpy.ndarray'>
Length of analytic signal: 2151
Type of analytic signal: <class 'numpy.ndarray'>
Length of I data: 2151
Type of I data: <class 'numpy.ndarray'>
Length of Q data: 2151
Type of Q data: <class 'numpy.ndarray'>
Length of transient: 280
Type of transient: <class 'numpy.ndarray'>


In [29]:
def extract_transient_features(transient):
    """Extract 9 HOS features + duration from a transient signal"""
    # Convert pandas Series to numpy array
    analytic = transient
    
    # Compute analytic signal
    # analytic = hilbert(signal)
    
    # Instantaneous characteristics
    amplitude = np.abs(analytic)
    phase = np.unwrap(np.angle(analytic))
    frequency = np.diff(phase)/(2*np.pi)  # Handle length mismatch
    
    # Calculate HOS features
    features = {
        'amp_var': np.var(amplitude),
        'amp_skew': skew(amplitude),
        'amp_kurt': kurtosis(amplitude),
        'phase_var': np.var(phase),
        'phase_skew': skew(phase),
        'phase_kurt': kurtosis(phase),
        'freq_var': np.var(frequency) if len(frequency) > 0 else 0,
        'freq_skew': skew(frequency) if len(frequency) > 0 else 0,
        'freq_kurt': kurtosis(frequency) if len(frequency) > 0 else 0,
        'duration': len(analytic)
    }
    
    return features

In [30]:
# Normalize and center features
def normalize_features(df):
    """Z-score normalization for all feature columns"""
    normalized_df = df.copy()
    for col in normalized_df.columns:
        if col != 'duration':
            mean = normalized_df[col].mean()
            std = normalized_df[col].std()
            normalized_df[col] = (normalized_df[col] - mean)/std
    return normalized_df

In [31]:
# Extract features for all transients
feature_df = preprocessed_data['transient'].apply(extract_transient_features).apply(pd.Series)

In [32]:
# Apply normalization (excluding duration initially)
normalized_features = normalize_features(feature_df)

In [33]:
# Normalize duration separately
duration_mean = feature_df['duration'].mean()
duration_std = feature_df['duration'].std()
normalized_features['duration'] = (feature_df['duration'] - duration_mean)/duration_std

# Add normalized features back to original dataframe
complete_df = pd.concat([preprocessed_data, normalized_features], axis=1)

In [34]:
preprocessed_data.head()

Unnamed: 0,signal,label,filename,filtered_signal,normalized_signal,analytic_signal,I_data,Q_data,transient
0,0 0.0031 1 0.0023 2 0.0023 3...,Iphone\4s\013004004984503_oguz_guler,013004004984503_oguz_guler_ro_iphone_4s_record...,"[0.0031, 0.0023, 0.0023, 0.0025, 0.0025, 0.002...","[0.26050420168067223, 0.19327731092436973, 0.1...","[(0.26050420168067223-0.35292475555603525j), (...","[0.26050420168067223, 0.19327731092436937, 0.1...","[-0.35292475555603525, 0.01825843451618641, -0...","[(0.24369747899159616+0.05629721951523308j), (..."
1,0 0.0021 1 0.0031 2 0.0031 3...,Iphone\4s\013004004984503_oguz_guler,013004004984503_oguz_guler_ro_iphone_4s_record...,"[0.0021, 0.0031, 0.0031, 0.0025, 0.0031, 0.002...","[0.17073170731707316, 0.25203252032520324, 0.2...","[(0.17073170731707296-0.005863582089700243j), ...","[0.17073170731707296, 0.2520325203252026, 0.25...","[-0.005863582089700243, 0.028305609054164897, ...","[(0.2520325203252032-0.06719198199521643j), (0..."
2,0 0.0027 1 0.0023 2 0.0025 3...,Iphone\4s\013004004984503_oguz_guler,013004004984503_oguz_guler_ro_iphone_4s_record...,"[0.0027, 0.0023, 0.0025, 0.0031, 0.0029, 0.003...","[0.24770642201834864, 0.2110091743119266, 0.22...","[(0.24770642201834828+0.34843513718714153j), (...","[0.24770642201834828, 0.2110091743119263, 0.22...","[0.34843513718714153, -0.021830316157262053, 0...","[(0.22935779816513804-0.01825142148219965j), (..."
3,0 0.0021 1 0.0023 2 0.0027 3...,Iphone\4s\013004004984503_oguz_guler,013004004984503_oguz_guler_ro_iphone_4s_record...,"[0.0021, 0.0023, 0.0027, 0.0029, 0.0025, 0.002...","[0.18584070796460178, 0.20353982300884957, 0.2...","[(0.18584070796460148+0.3423477836066878j), (0...","[0.18584070796460148, 0.20353982300884926, 0.2...","[0.3423477836066878, 0.005589855421496792, 0.0...","[(0.2743362831858411-0.024867227271992184j), (..."
4,0 0.0029 1 0.0029 2 0.0033 3...,Iphone\4s\013004004984503_oguz_guler,013004004984503_oguz_guler_ro_iphone_4s_record...,"[0.0029, 0.0029, 0.0033, 0.0031, 0.0025, 0.002...","[0.2436974789915966, 0.2436974789915966, 0.277...","[(0.24369747899159616+0.15210962403547493j), (...","[0.24369747899159616, 0.24369747899159663, 0.2...","[0.15210962403547493, 0.026726885947882698, 0....","[(0.15966386554621798+0.005647992675238671j), ..."


In [35]:
# Verify feature columns
print(complete_df[['amp_var', 'phase_skew', 'freq_kurt', 'duration']].head())
print(complete_df.columns)

    amp_var  phase_skew  freq_kurt  duration
0 -0.683438   -0.314074  -0.431197 -0.637020
1 -0.614454   -0.025522  -0.407892 -0.637020
2 -1.253123    2.430415  -0.832875  0.097123
3 -0.553153    0.445900  -0.523540 -0.637020
4 -0.978028    0.273011  -0.358390 -0.637020
Index(['signal', 'label', 'filename', 'filtered_signal', 'normalized_signal',
       'analytic_signal', 'I_data', 'Q_data', 'transient', 'amp_var',
       'amp_skew', 'amp_kurt', 'phase_var', 'phase_skew', 'phase_kurt',
       'freq_var', 'freq_skew', 'freq_kurt', 'duration'],
      dtype='object')


In [36]:
device = torch.device('cpu')

In [37]:
def prepare_data(df, test_size=0.2, random_state=42):
    # Encode labels
    le = LabelEncoder()
    df['label_encoded'] = le.fit_transform(df['label'])
    
    # Extract features and labels
    feature_cols = [col for col in df.columns if col not in ['label', 'label_encoded']]
    X = df[feature_cols].values
    y = df['label_encoded'].values
    
    # Stratified split (120 train, 30 test per class)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=test_size, 
        stratify=y,
        random_state=random_state
    )
    
    return X_train, X_test, y_train, y_test, le.classes_

In [None]:
def train_model(model, X_train, y_train, X_test, y_test, epochs=2500, lr=0.01):
    # Convert to tensors
    train_dataset = TensorDataset(
        torch.tensor(X_train, dtype=torch.float32, device=device),
        torch.tensor(y_train, dtype=torch.long, device=device)
    )
    
    test_dataset = TensorDataset(
        torch.tensor(X_test, dtype=torch.float32, device=device),
        torch.tensor(y_test, dtype=torch.long, device=device)
    )
    
    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=4)
    
    # Initialize model, loss, optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.1)
    
    # Training loop
    for epoch in range(epochs):
        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            optimizer.step()
        
        # Validate every 100 epochs
        if (epoch+1) % 100 == 0:
            model.eval()
            correct = 0
            total = 0
            with torch.no_grad():
                for inputs, labels in test_loader:
                    inputs, labels = inputs.to(device), labels.to(device)
                    outputs = model(inputs)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
            
            print(f'Epoch {epoch+1}/{epochs}, Test Acc: {100*correct/total:.2f}%')
    
    return model

In [42]:
def evaluate_model(model, X_test, y_test):
    test_dataset = TensorDataset(
        torch.tensor(X_test, dtype=torch.float32, device=device),
        torch.tensor(y_test, dtype=torch.long, device=device)
    )
    test_loader = DataLoader(test_dataset, batch_size=32)
    
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    final_acc = 100 * correct / total
    print(f'Final Test Accuracy: {final_acc:.2f}%')
    return final_acc

In [43]:
nn_df = complete_df[['label', 'amp_var',
       'amp_skew', 'amp_kurt', 'phase_var', 'phase_skew', 'phase_kurt',
       'freq_var', 'freq_skew', 'freq_kurt', 'duration']]

# Prepare data
X_train, X_test, y_train, y_test, classes = prepare_data(nn_df)

# Initialize model
model = RFClassifier(input_size=X_train.shape[1], num_classes=len(classes)).to(device)

with torch.device(device):
       trained_model = train_model(model, X_train, y_train, X_test, y_test)
       evaluate_model(trained_model, X_test, y_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label_encoded'] = le.fit_transform(df['label'])


Epoch 100/2500, Test Acc: 36.57%
Epoch 200/2500, Test Acc: 44.14%
Epoch 300/2500, Test Acc: 49.80%
Epoch 400/2500, Test Acc: 53.33%
Epoch 500/2500, Test Acc: 54.44%
Epoch 600/2500, Test Acc: 56.06%
Epoch 700/2500, Test Acc: 57.27%
Epoch 800/2500, Test Acc: 58.69%
Epoch 900/2500, Test Acc: 59.49%
Epoch 1000/2500, Test Acc: 60.91%
Epoch 1100/2500, Test Acc: 61.72%
Epoch 1200/2500, Test Acc: 62.22%
Epoch 1300/2500, Test Acc: 63.13%
Epoch 1400/2500, Test Acc: 63.54%
Epoch 1500/2500, Test Acc: 64.04%
Epoch 1600/2500, Test Acc: 64.04%
Epoch 1700/2500, Test Acc: 64.04%
Epoch 1800/2500, Test Acc: 64.24%
Epoch 1900/2500, Test Acc: 64.75%
Epoch 2000/2500, Test Acc: 64.95%
Epoch 2100/2500, Test Acc: 64.85%
Epoch 2200/2500, Test Acc: 65.35%
Epoch 2300/2500, Test Acc: 65.35%
Epoch 2400/2500, Test Acc: 65.66%
Epoch 2500/2500, Test Acc: 66.26%
Final Test Accuracy: 66.26%


In [None]:
# Split the data into training and test sets, ensuring 30 examples per label for the test set
train_data = pd.DataFrame()
test_data = pd.DataFrame()

In [None]:
for label, group in data.groupby('label'):
    train, test = train_test_split(group, test_size=30, random_state=42)
    train_data = pd.concat([train_data, train])
    test_data = pd.concat([test_data, test])

In [None]:
transient_lengths = []
for idx, row in data.iterrows():
    transient_lengths.append(len(row['transient']))
print(np.unique(transient_lengths, return_counts=True))

In [None]:
# Separate the features (signal data) and labels for training and test sets
# X_train = train_data['signal']
X_train = train_data['transient']
y_train = train_data['label']
# X_test = test_data['signal']
X_test = test_data['transient']
y_test = test_data['label']

In [None]:
print(y_train.value_counts())

In [None]:
print(y_test.value_counts())

In [None]:
# Ensure X_train is a DataFrame with one column, where each cell is a Series
X_train_df = pd.DataFrame({'signal': X_train})
X_test_df = pd.DataFrame({'signal': X_test})

In [None]:
# Setup pipeline
bluetooth_pipeline = make_pipeline(
    MiniRocketMultivariateVariable(
        pad_value_short_series=-10.0, 
        random_state=42,
        reference_length='max',
        max_dilations_per_kernel=32,
        n_jobs=-1
    ),
    RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
)

In [None]:
# Fit the pipeline
bluetooth_pipeline.fit(X_train_df, y_train)

In [None]:
# Evaluate the pipeline
accuracy = bluetooth_pipeline.score(X_test_df, y_test)
print(f"Test Accuracy: {accuracy:.3f}")

In [None]:
# Predict labels for the test set
y_pred = bluetooth_pipeline.predict(X_test_df)

# Create a DataFrame to compare true and predicted labels
results = pd.DataFrame({
    'True Label': y_test,
    'Predicted Label': y_pred
})

# Identify incorrect predictions
incorrect_predictions = results[results['True Label'] != results['Predicted Label']]

# Print incorrect predictions
print("Incorrect Predictions:")
print(incorrect_predictions)