IMPORT DEPENDENCIES AND LIBS

In [27]:
import os
import random
import itertools
import numpy as np
import pandas as pd
import knock_evaluator

from knock_evaluator import mimic_knock_detection, find_knock_event_ended, calculate_ZImpactReturn, calculate_Very_High_Impact, calculate_DeployFlag, reshape_sequence, calculate_Has_It_Knocked

import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer
from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events


In [2]:
def process_folder(folder_path, label):
    """
    Process all files in the given folder and compile data into a DataFrame.

    Parameters:
    - folder_path: Path to the folder containing the files.
    - label: The label to assign to all data from this folder (e.g., 1 for valid, 0 for invalid).
    
    Returns:
    - A DataFrame containing all processed data from the files.
    """
    all_data = []  # List to hold data from all files

    # Iterate over all files in the folder
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        
        # Ensure we're only processing .csv files
        if os.path.isfile(file_path) and file_path.endswith('.csv'):
            # Read the file
            data = pd.read_csv(file_path)
            
            # Filter rows for xBuffer, yBuffer, zBuffer and reset index
            x_data = data[data['Expression'] == 'xBuffer'].reset_index(drop=True)
            y_data = data[data['Expression'] == 'yBuffer'].reset_index(drop=True)
            z_data = data[data['Expression'] == 'zBuffer'].reset_index(drop=True)
            
            # Ensure data is aligned
            min_length = min(len(x_data), len(y_data), len(z_data))
            structured_df = pd.DataFrame({
                'x': x_data['Value'].head(min_length),
                'y': y_data['Value'].head(min_length),
                'z': z_data['Value'].head(min_length),
                'label': label
            })
            
            # Append to the list
            all_data.append(structured_df)
    
    # Combine all data into a single DataFrame
    combined_data = pd.concat(all_data, ignore_index=True)
    
    return combined_data

In [3]:

# Load the valid knock data
valid_data_path = 'data/valid/valid1.csv'
valid_data = pd.read_csv(valid_data_path)

# Display the first few rows of the dataset to understand its structure
valid_data.head()

Unnamed: 0,Level,Expression,Value,Location,Refresh,Access
0,0,extractionBuffer,,0x20005EDC,Off,private
1,1,[0],,0x20005EDC,Off,public
2,2,xBuffer,9.0,0x20005EDC,Off,public
3,2,yBuffer,116.0,0x20005EDE,Off,public
4,2,zBuffer,54.0,0x20005EE0,Off,public


In [4]:

# Extract only the rows that contain acceleration values and reset index to avoid grouping issues
x_data = valid_data[valid_data['Expression'] == 'xBuffer'].reset_index(drop=True)
y_data = valid_data[valid_data['Expression'] == 'yBuffer'].reset_index(drop=True)
z_data = valid_data[valid_data['Expression'] == 'zBuffer'].reset_index(drop=True)

# Ensure we only take as many rows as the shortest among x, y, z to keep data aligned
min_length = min(len(x_data), len(y_data), len(z_data))

# Reconstruct the DataFrame using the aligned data
structured_df_aligned = pd.DataFrame({
    'x': x_data['Value'].head(min_length),
    'y': y_data['Value'].head(min_length),
    'z': z_data['Value'].head(min_length),
    'label': 1  # Label for valid knock
})

structured_df_aligned.head()

Unnamed: 0,x,y,z,label
0,9.0,116.0,54.0,1
1,2.0,-20.0,99.0,1
2,100.0,111.0,-22.0,1
3,33.0,76.0,34.0,1
4,15.0,-25.0,-24.0,1


In [5]:
def structure_sequences(df, label):
    """
    Structures the DataFrame such that each 90-row sequence (representing 30 time points of x, y, z data)
    is treated as a single observation.

    Parameters:
    - df: DataFrame containing the sequences.
    - label: The label for these sequences (1 for valid, 0 for invalid).
    
    Returns:
    - A list of tuples, where each tuple is (sequence, label), and
      each sequence is a (90, ) shape array if flattening or a (30, 3) array if keeping x, y, z separate.
    """
    sequences = []
    num_sequences = len(df) // 90  # Assuming each sequence is exactly 90 rows
    
    for i in range(num_sequences):
        start_idx = i * 90
        sequence = df.iloc[start_idx:start_idx + 90][['x', 'y', 'z']].values.flatten()  # Flattened sequence
        # Alternatively, keep as a (30, 3) array for models that can handle sequence data
        # sequence = df.iloc[start_idx:start_idx + 90][['x', 'y', 'z']].values.reshape((30, 3))
        sequences.append((sequence, label))
    
    return sequences

Load all valid / invalid data into data frames, Label & Combine them to Sequences. Afterwards Combine them to global Sequence structure and Shuffle.

In [6]:
folder_path_valid = 'data/valid'  
folder_path_invalid = 'data/invalid'  
valid_data = process_folder(folder_path_valid, label=1)
invalid_data = process_folder(folder_path_invalid, label=0)

# Assuming valid_data and invalid_data are already loaded and structured with one file per sequence
valid_sequences = structure_sequences(valid_data, 1)
invalid_sequences = structure_sequences(invalid_data, 0)

all_sequences = valid_sequences + invalid_sequences
random.shuffle(all_sequences)

In [7]:
# Separate features and labels for model training
X = np.array([seq[0] for seq in all_sequences])
y = np.array([seq[1] for seq in all_sequences])

Train - Test Split of Data

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
sequence_df = reshape_sequence(all_sequences[0][0])

In [10]:
prebuffer_size = 8
knock_event_length_avg = 45
knock_event_ended = prebuffer_size + knock_event_length_avg
knock_algo_AngleXLatched = 8
knock_algo_Vel_Zlatched = 9000
knock_algo_Vel_Zlatched_Counter = 20
knock_algo_HFA_Zlatched = 3200
knock_algo_HFA_Zlatched_Counter = 15
ZImpactReturn = False
Very_High_Impact = False
DeployFlag = False

Has_It_Knocked = False

very_high_counter = 5
Acc_Latched_Counter = 25
Acc_Latched_Threshold = 8500
Vel_Latched_Counter = 30
Vel_Latched_Threshold = 9000


In [11]:
# Calculate the velocities as cumulative sum (integration) of each buffer
sequence_df['xBuffer_quasi_velocity'] = np.cumsum(sequence_df['xBuffer'])
sequence_df['yBuffer_quasi_velocity'] = np.cumsum(sequence_df['yBuffer'])
sequence_df['zBuffer_quasi_velocity'] = np.cumsum(sequence_df['zBuffer'])

# Calculate the energies of each buffer
sequence_df['xBuffer_quasi_energy'] = (sequence_df['xBuffer_quasi_velocity'])**2
sequence_df['yBuffer_quasi_energy'] = (sequence_df['yBuffer_quasi_velocity'])**2
sequence_df['zBuffer_quasi_energy'] = (sequence_df['zBuffer_quasi_velocity'])**2

sequence_df['xBuffer_quasi_work'] = np.cumsum(np.abs(sequence_df['xBuffer_quasi_velocity']))
sequence_df['yBuffer_quasi_work'] = np.cumsum(np.abs(sequence_df['yBuffer_quasi_velocity']))
sequence_df['zBuffer_quasi_work'] = np.cumsum(np.abs(sequence_df['zBuffer_quasi_velocity']))

knock_event_ended_dynamic = find_knock_event_ended(sequence_df['zBuffer'], knock_algo_HFA_Zlatched, knock_algo_HFA_Zlatched_Counter, prebuffer_size)

# Nulling values after the knock_event_ended parameter by setting them to 0
# Update based on the dynamically determined knock_event_ended value
if knock_event_ended_dynamic is not None and knock_event_ended_dynamic != 0:
    knock_event_ended = knock_event_ended_dynamic
    sequence_df.loc[knock_event_ended_dynamic+1:, 'xBuffer_quasi_velocity'] = 0
    sequence_df.loc[knock_event_ended_dynamic+1:, 'yBuffer_quasi_velocity'] = 0
    sequence_df.loc[knock_event_ended_dynamic+1:, 'zBuffer_quasi_velocity'] = 0
    sequence_df.loc[knock_event_ended_dynamic+1:, 'xBuffer_quasi_energy'] = 0
    sequence_df.loc[knock_event_ended_dynamic+1:, 'yBuffer_quasi_energy'] = 0
    sequence_df.loc[knock_event_ended_dynamic+1:, 'zBuffer_quasi_energy'] = 0
    sequence_df.loc[knock_event_ended_dynamic+1:, 'xBuffer_quasi_work'] = 0
    sequence_df.loc[knock_event_ended_dynamic+1:, 'yBuffer_quasi_work'] = 0
    sequence_df.loc[knock_event_ended_dynamic+1:, 'zBuffer_quasi_work'] = 0

else:
    sequence_df.loc[knock_event_ended+1:, 'xBuffer_quasi_velocity'] = 0
    sequence_df.loc[knock_event_ended+1:, 'yBuffer_quasi_velocity'] = 0
    sequence_df.loc[knock_event_ended+1:, 'zBuffer_quasi_velocity'] = 0
    sequence_df.loc[knock_event_ended+1:, 'xBuffer_quasi_energy'] = 0
    sequence_df.loc[knock_event_ended+1:, 'yBuffer_quasi_energy'] = 0
    sequence_df.loc[knock_event_ended+1:, 'zBuffer_quasi_energy'] = 0
    sequence_df.loc[knock_event_ended+1:, 'xBuffer_quasi_work'] = 0
    sequence_df.loc[knock_event_ended+1:, 'yBuffer_quasi_work'] = 0
    sequence_df.loc[knock_event_ended+1:, 'zBuffer_quasi_work'] = 0


#Calculate several keay values
maxZ_Acc = np.abs(sequence_df['zBuffer']).max()
maxX_Vel = np.abs(sequence_df['xBuffer_quasi_velocity']).max()
maxY_Vel = np.abs(sequence_df['yBuffer_quasi_velocity']).max()
maxZ_Vel = np.abs(sequence_df['zBuffer_quasi_velocity']).max()
minZ_Vel = sequence_df['zBuffer_quasi_velocity'].min()
divXZ = maxZ_Vel/maxX_Vel
divYZ = maxZ_Vel/maxY_Vel

ZImpactReturn = calculate_ZImpactReturn(maxZ_Vel, maxX_Vel, maxY_Vel, minZ_Vel, knock_algo_AngleXLatched)
Very_High_Impact = calculate_Very_High_Impact(sequence_df['zBuffer'], sequence_df['zBuffer_quasi_velocity'], very_high_counter)
DeployFlag = calculate_DeployFlag(sequence_df['zBuffer'], sequence_df['zBuffer_quasi_velocity'], prebuffer_size, knock_event_ended, Acc_Latched_Counter, Vel_Latched_Counter, Acc_Latched_Threshold, Vel_Latched_Threshold)
Has_It_Knocked = calculate_Has_It_Knocked(DeployFlag[0], Very_High_Impact, ZImpactReturn)


In [12]:
print(Has_It_Knocked)

False


In [13]:
# Example parameters - replace with actual parameter values as needed
params = {
    'KLOPFALGO_HfaLatchedCounterThr_u8': 25,
    'KLOPFALGO_VelLatchedCounterThr_u8': 30,
    'KLOPFALGO_VelVeryHighCounterThr_u8': 5,
    'KLOPFALGO_HfaThrZLatched_s16': 8500, 
    'KLOPFALGO_VelThrZLatched_s32': 9000,
    'KLOPFALGO_WinkelThrXLatched_s16': 8
    # Add other parameters as needed
}
# Test with the first sequence as an example
sequence, label = all_sequences[0]
result = mimic_knock_detection(sequence, params)

print(f"Sequence Label: {label}, Detection Result: {result}")

Sequence Label: 0, Detection Result: False


In [29]:
predictions = [mimic_knock_detection(seq, params) for seq, _ in all_sequences]
true_labels = [label for _, label in all_sequences]

f1 = f1_score(true_labels, predictions)
print(f"Default F1 Score: {f1}")

Default F1 Score: 0.7867298578199052


Perform K-Fold Cross-Validation

In [15]:
k = 5  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [16]:
f1_scores = []

for train_index, test_index in kf.split(all_sequences):  # Use all_sequences directly
    # Splitting data into training and test sets for this fold using list comprehension
    train = [all_sequences[i] for i in train_index]
    test = [all_sequences[i] for i in test_index]
    
    # Generate predictions for each sequence in the test set
    predictions = [mimic_knock_detection(sequence, params) for sequence, _ in test]
    
    # Extract true labels for the test set
    true_labels = [label for _, label in test]
    
    # Calculate F1 score and append to list
    f1 = f1_score(true_labels, predictions)
    f1_scores.append(f1)

# Calculate average F1 score across all folds
average_f1 = np.mean(f1_scores)
print(f"Average F1 Score across {k} folds: {average_f1}")

Average F1 Score across 5 folds: 0.8475101418497646


PARAMETER OPTIMIZATION - STRATEGY: GRID SEARCH

In [17]:
#Define parameter grid
parameter_grid = {
    'KLOPFALGO_HfaLatchedCounterThr_u8': [10, 15, 20, 25],
    'KLOPFALGO_VelLatchedCounterThr_u8': [10, 20, 30, 40],
    'KLOPFALGO_VelVeryHighCounterThr_u8': [3,5,8],
    'KLOPFALGO_HfaThrZLatched_s16': [6500, 7500, 8500, 9500], 
    'KLOPFALGO_VelThrZLatched_s32': [6000, 7000, 8000, 9000, 10000],
    'KLOPFALGO_WinkelThrXLatched_s16': [4,6,8,10]
    # Add other parameters as needed
}
# Generate all combinations of parameters
param_combinations = list(itertools.product(*(parameter_grid[param_name] for param_name in parameter_grid)))

# Placeholder for best score and corresponding parameters
best_score = 0
best_params = {}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for combination in param_combinations:
    params = dict(zip(parameter_grid.keys(), combination))
    
    # List to hold F1 scores for each fold
    f1_scores = []

    for train_index, test_index in kf.split(all_sequences):
        train, test = [all_sequences[i] for i in train_index], [all_sequences[i] for i in test_index]
        predictions = [mimic_knock_detection(sequence, params) for sequence, _ in test]
        true_labels = [label for _, label in test]
        f1 = f1_score(true_labels, predictions)
        f1_scores.append(f1)
    
    # Calculate average F1 score for this parameter combination
    avg_f1 = np.mean(f1_scores)
    
    if avg_f1 > best_score:
        best_score = avg_f1
        best_params = params

print("Best F1 Score:", best_score)
print("Best Parameters:", best_params)

Best F1 Score: 0.8857109891543178
Best Parameters: {'KLOPFALGO_HfaLatchedCounterThr_u8': 15, 'KLOPFALGO_VelLatchedCounterThr_u8': 10, 'KLOPFALGO_VelVeryHighCounterThr_u8': 3, 'KLOPFALGO_HfaThrZLatched_s16': 7500, 'KLOPFALGO_VelThrZLatched_s32': 8000, 'KLOPFALGO_WinkelThrXLatched_s16': 6}


PARAMETER OPTIMIZATION - STRATEGY: BAYESIAN OPTIMIZATION

In [28]:
def knock_detection_cv(KLOPFALGO_HfaLatchedCounterThr_u8, KLOPFALGO_VelLatchedCounterThr_u8, KLOPFALGO_VelVeryHighCounterThr_u8, KLOPFALGO_HfaThrZLatched_s16, KLOPFALGO_VelThrZLatched_s32, KLOPFALGO_WinkelThrXLatched_s16):
    # Convert continuous parameters to their appropriate format if necessary
    params = {
        'KLOPFALGO_HfaLatchedCounterThr_u8': int(KLOPFALGO_HfaLatchedCounterThr_u8),
        'KLOPFALGO_VelLatchedCounterThr_u8': int(KLOPFALGO_VelLatchedCounterThr_u8),
        'KLOPFALGO_VelVeryHighCounterThr_u8': int(KLOPFALGO_VelVeryHighCounterThr_u8),
        'KLOPFALGO_HfaThrZLatched_s16': int(KLOPFALGO_HfaThrZLatched_s16),
        'KLOPFALGO_VelThrZLatched_s32': int(KLOPFALGO_VelThrZLatched_s32),
        'KLOPFALGO_WinkelThrXLatched_s16': int(KLOPFALGO_WinkelThrXLatched_s16)
    }
    
    f1_scores = []
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for train_index, test_index in kf.split(all_sequences):
        train, test = [all_sequences[i] for i in train_index], [all_sequences[i] for i in test_index]
        predictions = [mimic_knock_detection(sequence, params) for sequence, _ in test]
        true_labels = [label for _, label in test]
        f1 = f1_score(true_labels, predictions)
        f1_scores.append(f1)

    # Return the average F1 score across folds
    return np.mean(f1_scores)


# Define the parameter bounds
pbounds = {
    'KLOPFALGO_HfaLatchedCounterThr_u8': (10, 25),
    'KLOPFALGO_VelLatchedCounterThr_u8': (10, 40),
    'KLOPFALGO_VelVeryHighCounterThr_u8': (3, 8),
    'KLOPFALGO_HfaThrZLatched_s16': (6000, 10000),
    'KLOPFALGO_VelThrZLatched_s32': (6000, 10000),
    'KLOPFALGO_WinkelThrXLatched_s16': (4, 12)

}

# Instantiate BayesianOptimization object
optimizer = BayesianOptimization(
    f=knock_detection_cv,
    pbounds=pbounds,
    random_state=1,
)

# Set up the logger
logger = JSONLogger(path="./bayes_opt_logs.json")
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

# Use the expected improvement acquisition function to handle exploration-exploitation trade-off
optimizer.maximize(init_points=10, n_iter=40)

# Output the best parameters and the corresponding score
best_score = optimizer.max['target']
best_params = optimizer.max['params']
print("Best F1 Score:", optimizer.max['target'])
print("Best Parameters:", optimizer.max['params'])

# Write the summary to a text file
with open("optimization_summary.txt", "w") as file:
    file.write(f"Best F1 Score: {best_score}\n")
    file.write("Best Parameters:\n")
    for param, value in best_params.items():
        file.write(f"{param}: {value}\n")

Best F1 Score: 0.8814685649118935
Best Parameters: {'KLOPFALGO_HfaLatchedCounterThr_u8': 14.206659880966079, 'KLOPFALGO_HfaThrZLatched_s16': 9157.117313805953, 'KLOPFALGO_VelLatchedCounterThr_u8': 13.096780197329261, 'KLOPFALGO_VelThrZLatched_s32': 7791.574104703621, 'KLOPFALGO_VelVeryHighCounterThr_u8': 7.542977515465478, 'KLOPFALGO_WinkelThrXLatched_s16': 6.348913186989436}
