# unified multi-modal recognition model

In [None]:
# Import relevant libraries

import numpy as np
import pandas as pd
import librosa
import os
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Load the data
#
# Builds one feature vector per recording by concatenating the flattened MFCCs
# of an audio clip with per-column mean/std/variance of the matching IMU CSV.
# Results are accumulated in X (features) and y (integer class labels 1..5).
audio_folder = "Audio/"
imu_folder = "Gesture/"

# Number of MFCC coefficients per frame. The original code referenced `n_mfcc`
# without ever defining it; 20 is librosa's own default for this parameter.
# NOTE(review): confirm this is the value the experiments were meant to use.
n_mfcc = 20

# Clips are loaded for at most this many seconds and zero-padded up to it, so
# every flattened MFCC vector has the same length (the MLP needs fixed-size
# inputs; shorter clips would otherwise yield ragged rows in X).
clip_duration = 3.0

X = []
y = []

for label in range(1, 6):
    audio_dir = os.path.join(audio_folder, str(label))
    imu_dir = os.path.join(imu_folder, str(label))

    # Sorted so that sample order (and therefore the seeded train/test split
    # performed later) does not depend on the filesystem's listing order.
    for file_name in sorted(os.listdir(audio_dir)):
        # Ignore hidden files such as .DS_Store
        if file_name.startswith('.'):
            continue

        # Load the audio clip, then pad/trim it to exactly clip_duration seconds
        signal, sr = librosa.load(os.path.join(audio_dir, file_name), duration=clip_duration)
        target_len = int(round(clip_duration * sr))
        if len(signal) < target_len:
            signal = np.pad(signal, (0, target_len - len(signal)))
        signal = signal[:target_len]

        # Extract MFCC features and flatten (frames x coefficients) into 1-D
        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc).T.flatten()

        # Load the IMU data and compute statistical features.
        # NOTE(review): assumes each audio file has an IMU recording with the
        # same base name, i.e. Audio/<label>/<name>.<ext> pairs with
        # Gesture/<label>/<name>.csv -- confirm against the dataset layout.
        # A missing CSV raises FileNotFoundError rather than being skipped.
        sample_id = os.path.splitext(file_name)[0]
        imu_file = os.path.join(imu_dir, f"{sample_id}.csv")
        imu_data = pd.read_csv(imu_file)
        imu_mean = imu_data.mean().values
        imu_std = imu_data.std().values
        imu_var = imu_data.var().values

        # Concatenate the audio and IMU features into a single vector
        features = np.concatenate([mfcc, imu_mean, imu_std, imu_var])

        # Add the features and label to the dataset
        X.append(features)
        y.append(label)

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build an MLP with two hidden layers (100 and 50 units) optimised with Adam,
# and fit it on the training portion. `fit` returns the estimator itself, so
# construction and training can be chained.
clf = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    max_iter=500,
    alpha=0.0001,
    solver='adam',
    verbose=10,
    random_state=42,
).fit(X_train, y_train)

# Predict labels for the held-out samples
y_pred = clf.predict(X_test)

# Report the fraction of held-out samples that were classified correctly
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)