In [None]:
! pip install skl2onnx
! pip install kagglehub
! pip install onnxmltools
! pip install onnxruntime
! pip install tensorflow
!pip install onnx-tf

**GET DATASET FROM KAGGLE**

In [16]:
import kagglehub
# Download the MobiFall v2.0 dataset via KaggleHub; the call returns the local
# directory of the downloaded copy. NOTE: the name `path` is rebound to a
# pathlib.Path in the processing cell below.
path = kagglehub.dataset_download("kmknation/mobifall-dataset-v20")

**PROCESS RAW DATA**

In [17]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import kagglehub
from scipy.stats import skew, kurtosis  # For skewness and kurtosis

# Updated FFT Function (no longer needed for dominant frequency)
def compute_fft(signal, sampling_rate):
    """Return the zero-centred frequency axis and normalised FFT magnitudes.

    Args:
        signal: Sequence of samples.
        sampling_rate: Sampling frequency; its reciprocal is used as the
            sample spacing of the frequency axis.

    Returns:
        A 2-tuple ``(freqs, magnitudes)``. Both arrays have ``len(signal)``
        entries and are reordered with ``fftshift`` so that negative
        frequencies come first. Magnitudes are ``|FFT| / len(signal)``.
    """
    n_samples = len(signal)
    sample_spacing = 1 / sampling_rate

    frequency_axis = np.fft.fftfreq(n_samples, d=sample_spacing)
    spectrum = np.fft.fft(signal)
    normalised_magnitudes = np.abs(spectrum) / n_samples

    # Shift both arrays the same way so entry i of one matches entry i of the other.
    return tuple(
        np.fft.fftshift(arr) for arr in (frequency_axis, normalised_magnitudes)
    )

def _window_features(window_x, window_y, window_z):
    """Compute the 13 statistical features of one window of tri-axial samples."""
    axes = (window_x, window_y, window_z)

    stds = [np.std(axis) for axis in axes]
    # Signal magnitude area: mean over the window of |x| + |y| + |z|.
    sma = np.sum(np.abs(window_x) + np.abs(window_y) + np.abs(window_z)) / len(window_x)
    rms = [np.sqrt(np.mean(np.square(axis))) for axis in axes]
    skews = [skew(axis) for axis in axes]
    kurts = [kurtosis(axis) for axis in axes]

    return [*stds, sma, *rms, *skews, *kurts]


def summarize_file(base_address, file_name, window_size_in_secs=4, stride_fraction=0.1, header_lines=16):
    """Summarize one tri-axial sensor recording into per-window features.

    The file is read as ``header_lines`` header lines followed by
    comma-separated rows ``timestamp, x, y, z``. Rows with fewer than four
    fields are ignored. Timestamps are treated as nanoseconds (the window
    length in seconds is multiplied by 1e9 before comparison).

    A window starts at a sample and contains every following sample whose
    timestamp is at most ``window_size_in_secs`` later. The next window starts
    at the first sample that is at least one stride after the current start.
    Windows near the end of the recording therefore hold fewer samples than
    the nominal length, and very short windows may give degenerate (e.g. NaN)
    skewness/kurtosis values.

    Args:
        base_address: Directory containing the file.
        file_name: Name of the file inside ``base_address``.
        window_size_in_secs: Window length in seconds.
        stride_fraction: Stride as a fraction of the window length.
        header_lines: Number of leading lines to skip. A file with fewer
            lines than this yields an empty result.

    Returns:
        A list with one 13-element list per window, ordered as
        ``[std_x, std_y, std_z, sma, rms_x, rms_y, rms_z,
        skew_x, skew_y, skew_z, kurt_x, kurt_y, kurt_z]``.

    Raises:
        ValueError: If the stride truncates to zero nanoseconds or less
            (the window start could never advance), or if a data field
            cannot be parsed as a float.
    """
    window_size_ns = window_size_in_secs * 1e9
    stride_ns = int(stride_fraction * window_size_ns)
    if stride_ns <= 0:
        raise ValueError(
            "stride_fraction * window_size_in_secs must be a positive duration; "
            f"got stride of {stride_ns} ns"
        )

    time_stamp, x, y, z = [], [], [], []
    with open(os.path.join(base_address, file_name), 'r') as file:
        for line_no, line in enumerate(file):
            if line_no < header_lines:
                continue  # Skip header
            values = line.strip().split(',')
            if len(values) >= 4:
                time_stamp.append(float(values[0].strip()))
                x.append(float(values[1].strip()))
                y.append(float(values[2].strip()))
                z.append(float(values[3].strip()))

    # Convert once so every window is a cheap array slice instead of a list
    # that each statistic would have to re-convert.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    z = np.asarray(z, dtype=float)
    n_samples = len(time_stamp)

    results = []
    start_idx = 0
    while start_idx < n_samples:
        start_time = time_stamp[start_idx]
        end_time = start_time + window_size_ns

        # Find the exclusive end of the window.
        end_idx = start_idx
        while end_idx < n_samples and time_stamp[end_idx] <= end_time:
            end_idx += 1

        if end_idx > start_idx:
            results.append(
                _window_features(
                    x[start_idx:end_idx], y[start_idx:end_idx], z[start_idx:end_idx]
                )
            )

        # Advance the start to the first sample at least one stride later.
        previous_start = start_idx
        next_start_time = start_time + stride_ns
        while start_idx < n_samples and time_stamp[start_idx] < next_start_time:
            start_idx += 1
        if start_idx == previous_start:
            # Only reachable for non-comparable timestamps (e.g. NaN); force
            # progress so the loop always terminates.
            start_idx += 1

    return results

def process_dataset(dataset_path):
    """Build one feature table from all accelerometer/gyroscope recordings.

    Every ``.txt`` file under ``dataset_path`` whose name contains ``'acc'`` is
    summarized with ``summarize_file``. Its gyroscope counterpart is looked up
    in the same directory by replacing the sensor field of the file name with
    ``gyro``. Accelerometer and gyroscope windows are paired by window index
    (not by timestamp); the shorter list is padded with NaN rows, and a
    missing gyroscope file yields all-NaN gyroscope features.

    File names are split on ``'_'`` and must have at least four fields
    (activity, sensor, and two trailing identifiers); other names are skipped
    with a message instead of aborting the whole run.

    Args:
        dataset_path: Root directory to search recursively.

    Returns:
        A pandas DataFrame with one row per window and the columns
        ``activity_name``, ``activity_no``, ``window_no`` followed by the
        accelerometer and gyroscope feature columns.
    """
    columns = [
        "activity_name", "activity_no", "window_no",
        "acc_std_x", "acc_std_y", "acc_std_z", "acc_sma",
        "acc_rms_x", "acc_rms_y", "acc_rms_z", "acc_skew_x", "acc_skew_y", "acc_skew_z", "acc_kurt_x", "acc_kurt_y", "acc_kurt_z",
        "gyro_std_x", "gyro_std_y", "gyro_std_z", "gyro_sma",
        "gyro_rms_x", "gyro_rms_y", "gyro_rms_z", "gyro_skew_x", "gyro_skew_y", "gyro_skew_z", "gyro_kurt_x", "gyro_kurt_y", "gyro_kurt_z"
    ]
    # Features per sensor, derived from the column layout (3 metadata columns,
    # then two equally sized sensor groups) so it cannot drift from `columns`.
    n_features = (len(columns) - 3) // 2

    def pad_results(results, max_length):
        """Right-pad `results` with all-NaN feature rows up to `max_length`."""
        missing = max_length - len(results)
        return results + [[np.nan] * n_features for _ in range(missing)]

    # Walk the tree once; the same list drives both the total and the loop.
    acc_files = [
        (root, file)
        for root, _, files in os.walk(dataset_path)
        for file in files
        if file.endswith(".txt") and 'acc' in file
    ]
    total_files = len(acc_files)
    print("Total files:", total_files)

    data = []
    processed_files = 0

    for root, file in acc_files:
        parsed_filename = file.split('_')
        if len(parsed_filename) < 4:
            # The gyroscope file name cannot be derived from such a name.
            sys.stdout.write(f"\nSkipping file with unexpected name: {file}\n")
            continue

        activity_name = parsed_filename[0]
        activity_no = parsed_filename[-2] + '.' + parsed_filename[-1].split('.')[0]

        # Build the associated gyroscope file name and path
        gyro_filename = f"{parsed_filename[0]}_gyro_{parsed_filename[2]}_{parsed_filename[3]}"
        gyro_path = os.path.join(root, gyro_filename)

        acc_results = summarize_file(root, file)
        gyro_results = summarize_file(root, gyro_filename) if os.path.exists(gyro_path) else []

        # Pad the shorter (or missing) result set with NaN rows
        max_windows = max(len(acc_results), len(gyro_results))
        acc_results = pad_results(acc_results, max_windows)
        gyro_results = pad_results(gyro_results, max_windows)

        # Combine results into rows
        for window_no in range(max_windows):
            row = [activity_name, activity_no, window_no] + acc_results[window_no] + gyro_results[window_no]
            data.append(row)

        processed_files += 1
        sys.stdout.write(f"\rProcessed {processed_files}/{total_files}: {file}")
        sys.stdout.flush()

    # Create a DataFrame with the summarized data
    return pd.DataFrame(data, columns=columns)

# Fetch the dataset through KaggleHub; the call returns its local directory
dataset_path = kagglehub.dataset_download("kmknation/mobifall-dataset-v20")

# Report whether the returned directory is present on disk
path = Path(dataset_path)
print(
    f"The path '{path}' exists."
    if path.exists()
    else f"The path '{path}' does not exist."
)

# Build the per-window feature table and persist it as CSV
df = process_dataset(dataset_path)
df.to_csv("MobiFall_summary_without_dominant_frequency.csv", index=False)

The path '/root/.cache/kagglehub/datasets/kmknation/mobifall-dataset-v20/versions/1' exists.
Total files: 630
Processed 310/630: FOL_acc_11_2.txt

  skew_x = skew(window_x)
  kurt_x = kurtosis(window_x)


Processed 497/630: FKL_acc_4_1.txt

  skew_z = skew(window_z)
  kurt_z = kurtosis(window_z)


Processed 630/630: FOL_acc_6_3.txt

**ENCODING**

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Assuming pre_df is your DataFrame
def preprocess(pre_df):
    """Encode the activity label and drop unusable rows.

    The input frame is left untouched: all work happens on a copy, so the
    caller's ``activity_name`` column keeps its original values and calling
    this function again on the same frame gives the same result.

    Steps, in order:
      1. Label-encode ``activity_name`` with a freshly fitted LabelEncoder.
      2. Drop the ``activity_no`` and ``window_no`` columns.
      3. Turn +/-inf into NaN and drop every row containing NaN.
      4. Coerce all columns to numeric and drop rows that fail conversion.

    The label-to-code mapping is printed. Row index labels are not reset, so
    the returned frame may have gaps in its index.

    Args:
        pre_df: DataFrame with ``activity_name``, ``activity_no``,
            ``window_no`` and the feature columns.

    Returns:
        ``(clean_df, label_encoder)``: the cleaned copy and the fitted encoder.
    """
    # Copy first: assigning the encoded labels below would otherwise overwrite
    # the caller's column in place.
    pre_df = pre_df.copy()

    # Step 1: Label encode the activity_name
    label_encoder = LabelEncoder()
    pre_df['activity_name'] = label_encoder.fit_transform(pre_df['activity_name'])

    # Step 2: Drop activity_no and window_no
    pre_df = pre_df.drop(columns=['activity_no', 'window_no'])

    # Step 3: Replace infinite values with NaN, then drop rows with NaN
    pre_df = pre_df.replace([np.inf, -np.inf], np.nan).dropna()

    # Step 4: Ensure all features are numeric; drop rows that could not be converted
    pre_df = pre_df.apply(pd.to_numeric, errors='coerce').dropna()

    # Print the mapping of activity names to their encoded values
    activity_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
    print("Activity Name to Encoded Value Mapping:")
    for activity, encoded_value in activity_mapping.items():
        print(f"{activity}: {encoded_value}")

    return pre_df, label_encoder

# Encode and clean the per-window summary frame `df` built by process_dataset;
# keep the fitted encoder so class codes can be mapped back to activity names.
preprocessed_df, label_encoder = preprocess(df)

# Show the first rows of the cleaned frame
print(preprocessed_df.head())

Activity Name to Encoded Value Mapping:
BSC: 0
CSI: 1
CSO: 2
FKL: 3
FOL: 4
JOG: 5
JUM: 6
SCH: 7
SDL: 8
STD: 9
STN: 10
STU: 11
WAL: 12
   activity_name  acc_std_x  acc_std_y  acc_std_z    acc_sma  acc_rms_x  \
0              3   4.499141   2.987564   5.127036  16.315239   5.069125   
1              3   4.247136   2.789493   4.843027  16.310487   4.961023   
2              3   4.021176   2.618092   4.579686  16.306017   4.873326   
3              3   3.819695   2.472054   4.315499  16.319557   4.805378   
4              3   3.113925   2.306072   3.145883  16.545363   4.811135   

   acc_rms_y  acc_rms_z  acc_skew_x  acc_skew_y  ...  gyro_sma  gyro_rms_x  \
0   5.935516   7.167293    0.893955    1.346694  ...  1.299361    0.933942   
1   5.763888   7.231539    1.086705    1.528225  ...  1.240739    0.912441   
2   5.619333   7.285372    1.269841    1.706692  ...  1.226710    0.912378   
3   5.498597   7.337092    1.450987    1.880339  ...  1.188325    0.909819   
4   5.187403   7.644553  

**STANDARDIZE**

In [22]:
import pandas as pd
import numpy as np

def standardize_dataframe(df):
    """Z-score standardize every numeric column except the first.

    The first column is skipped by position (in this notebook it holds the
    encoded target). Non-numeric and boolean columns are left unchanged. The
    input frame is not modified; a standardized copy is returned.

    A column whose standard deviation is zero or undefined (constant column,
    or fewer than two rows) is only centred: its scale is recorded as 1.0 so
    that neither the returned data nor a consumer of ``stats`` ever divides
    by zero.

    Args:
        df: DataFrame whose columns after the first are features.

    Returns:
        ``(standardized_df, stats)`` where ``stats`` maps each standardized
        column name to ``{'mean': ..., 'std': ...}``, i.e. exactly the values
        used in ``(x - mean) / std``. The standard deviation is the pandas
        default (sample standard deviation).
    """
    standardized = df.copy()
    stats = {}

    for column in standardized.columns[1:]:
        values = standardized[column]

        # Only standardize numerical columns (any int/float width, not bool)
        is_numeric = pd.api.types.is_numeric_dtype(values)
        if not is_numeric or pd.api.types.is_bool_dtype(values):
            continue

        mean = values.mean()
        std = values.std()
        if pd.isna(std) or std == 0:
            std = 1.0  # centre only; avoids NaN/inf from dividing by zero

        # Store the mean and std actually applied
        stats[column] = {'mean': mean, 'std': std}

        # Apply Z-score standardization
        standardized[column] = (values - mean) / std

    return standardized, stats

# NOTE: this rebinds `preprocessed_df` to its standardized version, so the cell
# is not idempotent: running it a second time standardizes already-standardized
# values and replaces `stats` with statistics of the standardized data.
# NOTE(review): the mean/std are computed over all rows, i.e. before the
# train/test split performed in the training cell.
preprocessed_df,stats=standardize_dataframe(preprocessed_df)


**SAVE MEAN/STD**

In [23]:
# Build a table with one row per feature: the feature name in a 'Feature'
# column, followed by the statistics stored for it in `stats`
stats_df = (
    pd.DataFrame.from_dict(stats, orient='index')
    .reset_index()
    .rename(columns={'index': 'Feature'})
)

# Write the table to disk and report where it went
csv_file_path = 'stats.csv'
stats_df.to_csv(csv_file_path, index=False)

print(f"Statistics saved to {csv_file_path}")

Statistics saved to stats.csv


**TRAIN MODEL**

In [24]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Step 1: Separate features (X) and the encoded activity label (y)
X = preprocessed_df.drop(columns=['activity_name'])
y = preprocessed_df['activity_name']

# Step 2: Hold out 20% of the rows for testing, stratified by label
# NOTE(review): rows are overlapping windows (summarize_file strides by 10% of
# the window by default) and the split is row-wise, so windows from the same
# recording can land in both train and test; the reported accuracy is
# presumably optimistic. Confirm, and consider splitting by recording.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 3: Initialize and train the LightGBM model
lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
lgb_model.fit(X_train, y_train)

# Step 4: Predict on the held-out rows and report the metrics
y_pred = lgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Step 5: Save the underlying booster in LightGBM's text format. As the last
# expression of the cell, the call's return value (the Booster) is displayed.
model_file_path = 'lightgbm_model.txt'
lgb_model.booster_.save_model(model_file_path)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012715 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6630
[LightGBM] [Info] Number of data points in the train set: 23627, number of used features: 26
[LightGBM] [Info] Start training from score -2.802620
[LightGBM] [Info] Start training from score -3.607116
[LightGBM] [Info] Start training from score -3.613376
[LightGBM] [Info] Start training from score -2.796359
[LightGBM] [Info] Start training from score -2.810326
[LightGBM] [Info] Start training from score -2.693012
[LightGBM] [Info] Start training from score -2.693637
[LightGBM] [Info] Start training from score -3.605557
[LightGBM] [Info] Start training from score -2.808218
[LightGBM] [Info] Start training from score -1.489790
[LightGBM] [Info] Start training from score -3.092864
[LightGBM] [Info] Start training from score -3.101295
[LightGBM] [Info] Start training from score -1.489602
Accuracy:

<lightgbm.basic.Booster at 0x783d8aa1a110>

**CONVERT TO ONNX**

In [25]:
import onnxmltools
import numpy as np
from sklearn.model_selection import train_test_split
from onnx import TensorProto
from onnx import helper
from onnx import numpy_helper
from skl2onnx.common.data_types import StringTensorType, FloatTensorType  # Correct import

# Declare the model input: one float tensor named 'X' whose second dimension
# is the number of feature columns; the first (batch) dimension is left as None.
initial_types = [
    ('X', FloatTensorType([None, X_train.shape[1]]))  # Features: acc_std_x, acc_std_y, ..., gyro_kurt_z
]

# Convert the trained LightGBM model to ONNX
onnx_model = onnxmltools.convert_lightgbm(lgb_model, initial_types=initial_types)

# Save the model as a .onnx file
onnxmltools.utils.save_model(onnx_model, 'lightgbm_model.onnx')




In [None]:
# Scratch cell: first two rows of X, all feature columns except the first.
# NOTE(review): the output recorded below lists columns (acc_df_*, ori_*) that
# the current feature extraction does not produce, so it appears to be stale.
X.iloc[0:2,1:]

Unnamed: 0,acc_std_y,acc_std_z,acc_sma,acc_df_x,acc_df_y,acc_df_z,gyro_std_x,gyro_std_y,gyro_std_z,gyro_sma,gyro_df_x,gyro_df_y,gyro_df_z,ori_std_x,ori_std_y,ori_std_z,ori_sma,ori_df_x,ori_df_y,ori_df_z
0,0.487331,2.976688,0.86234,0.211859,0.057242,0.368497,0.614769,0.545008,0.050875,-0.006331,0.024636,2.038729,-0.153325,4.391725,2.019205,0.111307,-0.485411,0.009877,0.053748,0.154739
1,0.427042,2.94707,0.892204,0.211859,0.057242,0.226294,0.494084,0.469881,-0.079226,-0.07086,0.178101,-1.307381,-0.153325,4.462546,2.062994,0.17716,-0.410278,0.009877,0.053748,0.154739


**TEST ONNX MODEL**

In [26]:
import onnxruntime as ort
import numpy as np


# Load the ONNX model
onnx_model_path = 'lightgbm_model.onnx'
session = ort.InferenceSession(onnx_model_path)

# Single-sample input: the first row of X as a (1, n_features) float32 array,
# matching the float tensor input declared when the model was converted
input_data = X.iloc[[0]].to_numpy(dtype=np.float32)

# Use the input name stored in the model rather than hard-coding it
input_name = session.get_inputs()[0].name  # 'X'

# Run inference; the first output holds the predicted class label
output = session.run(None, {input_name: input_data})

# Compare with the true label of the same row. The lookup is positional
# (`iloc`): preprocessing drops rows without resetting the index, so index
# label 0 is not guaranteed to exist and `y[0]` could raise KeyError.
print("Predicted label:", output[0], y.iloc[0])

# If you need probabilities (if they exist)
# print("Predicted probabilities:", output[1] if len(output) > 1 else "No probabilities output")


Predicted label: [3] 3
