In [1]:
from sktime.transformations.panel.rocket import MiniRocketMultivariate
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle
import numba
from sklearn.preprocessing import MinMaxScaler
import numpy as np



In [2]:
# Locations of the pickled sensor readings and of the target labels
df = 'D:/Python/Hydraulic Rig Dataset/Data/sensor_data_df.pkl'
target_df ='D:/Python/Hydraulic Rig Dataset/Data/profile_df.pkl'

# Read each pickle and discard rows containing missing values.
# Each frame is filtered on its own, so the two may end up with different row sets.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
target_data = pd.read_pickle(target_df).dropna()
data = pd.read_pickle(df).dropna()


# Quick look at the first sensor rows and at the label column names
print(data.head(5))
print(target_data.columns)


                              requirements_1 requirements_2  \
0                               ÿþa n y i o              @    
2                     a r g o n 2 - c f f i              @    
4   a r g o n 2 - c f f i - b i n d i n g s              @    
6                         a s t t o k e n s              @    
8                         a s y n c - l r u              @    

                                      requirements_3  
0   f i l e : / / / C : / b / a b s _ 8 4 7 u o b...  
2   f i l e : / / / o p t / c o n d a / c o n d a...  
4   f i l e : / / / C : / c i / a r g o n 2 - c f...  
6   f i l e : / / / o p t / c o n d a / c o n d a...  
8   f i l e : / / / C : / b / a b s _ e 0 h j k v...  
Index(['Target'], dtype='object')


In [4]:
# Combine sensor measurements with target labels.
# Both frames had incomplete rows dropped independently, so their indexes may differ.
# join='inner' keeps only rows present in BOTH frames; the default outer join would
# re-introduce NaN-filled rows for any index that survives in just one of them.
df_combined = pd.concat([data, target_data], axis=1, join='inner')
print(df_combined.head(5))

     CE_1    CE_2    CE_3    CE_4    CE_5    CE_6    CE_7    CE_8    CE_9  \
0  47.202  47.273  47.250  47.332  47.213  47.372  47.273  47.438  46.691   
1  29.208  28.822  28.805  28.922  28.591  28.643  28.216  27.812  27.514   
2  23.554  23.521  23.527  23.008  23.042  23.052  22.658  22.952  22.908   
3  21.540  21.419  21.565  20.857  21.052  21.039  20.926  20.912  20.989   
4  20.460  20.298  20.350  19.867  19.997  19.972  19.924  19.813  19.691   

    CE_10  ...  VS1_52  VS1_53  VS1_54  VS1_55  VS1_56  VS1_57  VS1_58  \
0  46.599  ...   0.552   0.545   0.553   0.553   0.539   0.544   0.545   
1  27.481  ...   0.547   0.548   0.544   0.536   0.542   0.540   0.533   
2  22.359  ...   0.544   0.543   0.554   0.544   0.544   0.545   0.544   
3  20.882  ...   0.538   0.553   0.543   0.553   0.555   0.544   0.543   
4  19.634  ...   0.546   0.544   0.552   0.539   0.540   0.549   0.542   

   VS1_59  VS1_60  Target  
0   0.535   0.543       1  
1   0.531   0.534       1  
2   0.53

In [5]:
# Separate the label column from the sensor columns
y = df_combined['Target']
X = df_combined.drop(columns=['Target'])


# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both parts. The seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)
print(X.dtypes)
print(y.dtypes)

CE_1      float64
CE_2      float64
CE_3      float64
CE_4      float64
CE_5      float64
           ...   
VS1_56    float64
VS1_57    float64
VS1_58    float64
VS1_59    float64
VS1_60    float64
Length: 43680, dtype: object
int64


In [None]:
# Collect the distinct sensor prefixes (the text before the first underscore of
# each column name), in sorted order, and record how many there are.
# Note: sensor_names is reassigned to an explicit list further down in this cell.
sensor_names = sorted({col.split('_')[0] for col in X.columns})
n_channels = len(sensor_names)
n_time_points = 60


# Function to reshape DataFrame to 3D array expected by MiniRocketMultivariate
def reshape_for_minirocket(df, sensor_names, n_time_points_dict, verbose=True):
    """
    Convert a wide-format sensor DataFrame into a 3D array of shape
    (n_samples, n_channels, max_time_points).

    Columns are matched to a sensor by the prefix "<sensor>_" and ordered by the
    integer that follows the first underscore. For each sensor:

    * if it has more columns than ``max_time_points``, only the first
      ``max_time_points`` (in time order) are kept;
    * if it has fewer, the remaining positions are NaN;
    * if it has none, the whole channel is NaN.

    Args:
        df: pandas DataFrame of sensor readings (wide format).
        sensor_names: list of sensor prefixes (e.g. ['PS1', 'TS2', ...]); the
            order defines the channel order of the result.
        n_time_points_dict: dict mapping sensor name to expected number of time
            points. Only the largest value is used; it sets the length of the
            time axis.
        verbose: when True (default) print per-sensor diagnostics about the
            matched columns.
    Returns:
        reshaped_array: float numpy array of shape
            (n_samples, n_channels, max_time_points).
    Raises:
        ValueError: if ``n_time_points_dict`` is empty, or if a matched column
            name does not carry an integer after its first underscore.
    """
    if not n_time_points_dict:
        raise ValueError("n_time_points_dict must contain at least one sensor length")

    n_samples = df.shape[0]
    n_channels = len(sensor_names)
    max_time_points = max(n_time_points_dict.values())

    # Start from NaN rather than np.empty: any position that is never written
    # (short or missing sensors) must not expose uninitialised memory.
    reshaped_array = np.full((n_samples, n_channels, max_time_points), np.nan)

    for i, sensor in enumerate(sensor_names):
        # Filter columns belonging to this sensor
        prefix = f'{sensor}_'
        sensor_cols = [col for col in df.columns if col.startswith(prefix)]
        if verbose:
            print(f"\n--- Sensor: {sensor} ---")
            print(f"Number of columns starting with '{sensor}_': {len(sensor_cols)}")
            print(f"First 10 matching columns: {sensor_cols[:10]}")
            print(f"Last 10 matching columns: {sensor_cols[-10:]}")

        # Sort columns by time index (assumes "Sensor_0", "Sensor_1", ... naming)
        sensor_cols_sorted = sorted(sensor_cols, key=lambda x: int(x.split('_')[1]))
        # Extract once; used for both the diagnostic print and the assignment
        sensor_values = df[sensor_cols_sorted].values
        if verbose:
            print(f"First 10 sorted columns: {sensor_cols_sorted[:10]}")
            print(f"Last 10 sorted columns: {sensor_cols_sorted[-10:]}")
            print(f"Shape of selected data: {sensor_values.shape}")

        # Copy at most max_time_points readings; the rest of the channel stays NaN
        n_used = min(len(sensor_cols_sorted), max_time_points)
        if n_used > 0:
            reshaped_array[:, i, :n_used] = sensor_values[:, :n_used]

    return reshaped_array

# Channel order of the 3D array: one entry per sensor prefix
sensor_names = [
    'PS1', 'PS2', 'PS3', 'PS4', 'PS5', 'PS6',
    'EPS1',
    'FS1', 'FS2',
    'TS1', 'TS2', 'TS3', 'TS4',
    'VS1',
    'CE', 'CP', 'SE',
]

# Every sensor is given the same expected length of 60 time points.
# NOTE(review): the recorded output of this cell reports 6000 columns for PS1 and
# PS2; with a 60-point cap only the first 60 of those readings are used.
# Confirm that this truncation is intended.
n_time_points_dict = dict.fromkeys(sensor_names, 60)


# Build the (samples, channels, time) arrays for the training and the test split
X_train_reshaped, X_test_reshaped = (
    reshape_for_minirocket(split, sensor_names, n_time_points_dict)
    for split in (X_train, X_test)
)



print("Shape of X_train_reshaped:", X_train_reshaped.shape)
print("Shape of X_test_reshaped:", X_test_reshaped.shape)



--- Sensor: PS1 ---
Number of columns starting with 'PS1_': 6000
First 10 matching columns: ['PS1_1', 'PS1_2', 'PS1_3', 'PS1_4', 'PS1_5', 'PS1_6', 'PS1_7', 'PS1_8', 'PS1_9', 'PS1_10']
Last 10 matching columns: ['PS1_5991', 'PS1_5992', 'PS1_5993', 'PS1_5994', 'PS1_5995', 'PS1_5996', 'PS1_5997', 'PS1_5998', 'PS1_5999', 'PS1_6000']
First 10 sorted columns: ['PS1_1', 'PS1_2', 'PS1_3', 'PS1_4', 'PS1_5', 'PS1_6', 'PS1_7', 'PS1_8', 'PS1_9', 'PS1_10']
Last 10 sorted columns: ['PS1_5991', 'PS1_5992', 'PS1_5993', 'PS1_5994', 'PS1_5995', 'PS1_5996', 'PS1_5997', 'PS1_5998', 'PS1_5999', 'PS1_6000']
Shape of selected data: (1764, 6000)

--- Sensor: PS2 ---
Number of columns starting with 'PS2_': 6000
First 10 matching columns: ['PS2_1', 'PS2_2', 'PS2_3', 'PS2_4', 'PS2_5', 'PS2_6', 'PS2_7', 'PS2_8', 'PS2_9', 'PS2_10']
Last 10 matching columns: ['PS2_5991', 'PS2_5992', 'PS2_5993', 'PS2_5994', 'PS2_5995', 'PS2_5996', 'PS2_5997', 'PS2_5998', 'PS2_5999', 'PS2_6000']
First 10 sorted columns: ['PS2_1', 'P

In [34]:
# Apply MiniROCKET (multivariate version).
# Fitting involves random choices, so the seed is pinned (same value as the
# train/test split) to make the extracted features reproducible across runs.
rocket = MiniRocketMultivariate(num_kernels=10000, random_state=1)
# Fit on the training split only, then apply the fitted transform to both splits
rocket.fit(X_train_reshaped)
X_train_transformed = rocket.transform(X_train_reshaped)
X_test_transformed = rocket.transform(X_test_reshaped)

In [None]:
# Learn each feature's min and max from the training features only, then apply
# that same mapping to both splits (training values land in [0, 1]; test values
# may fall slightly outside that range).
scaler = MinMaxScaler().fit(X_train_transformed)
X_train_scaled = scaler.transform(X_train_transformed)
X_test_scaled = scaler.transform(X_test_transformed)

In [None]:
# Destination files for the extracted features and the labels
X_train_pickled = 'D:/Python/Hydraulic Rig Dataset/Data/X_train_pickled.pkl'
X_test_pickled = 'D:/Python/Hydraulic Rig Dataset/Data/X_test_pickled.pkl'
y_train_pickled = 'D:/Python/Hydraulic Rig Dataset/Data/y_train_pickled.pkl'
y_test_pickled = 'D:/Python/Hydraulic Rig Dataset/Data/y_test_pickled.pkl'

# Write train features, test features, train labels and test labels, in that order.
# NOTE(review): the features written here are the MiniROCKET outputs
# (X_*_transformed), not the MinMax-scaled arrays (X_*_scaled) computed in the
# scaling cell; confirm which version downstream code expects.
for to_save, destination in (
    (X_train_transformed, X_train_pickled),
    (X_test_transformed, X_test_pickled),
    (y_train, y_train_pickled),
    (y_test, y_test_pickled),
):
    to_save.to_pickle(destination)
