In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import h5py

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the 'X_30' dataset back from the HDF5 file into memory as a NumPy array.
# The path is a raw string: a Windows path in a normal literal turns \H, \T, \D,
# \C and \X into invalid escape sequences (a SyntaxWarning on recent Python
# versions), and a segment starting with e.g. \t or \n would silently corrupt it.
with h5py.File(r'D:\Hanze_Master\Thesis\Data\Capture24_seq\X_30.h5', 'r') as f:
    X = f['X_30'][:]  # [:] reads the whole dataset before the file is closed
    print(X.shape)  # Check the shape of the loaded data

(1856443, 3, 150)


In [3]:
# Load the label array saved as a .npy file.
# NOTE(review): presumably one label per row of X (the shapes printed in this
# notebook agree on the first dimension) — nothing here enforces it.
Y = np.load(r'D:\Hanze_Master\Thesis\Data\Capture24_seq\Y_WillettsSpecific2018.npy')
Y.shape

(1856443,)

In [4]:
# Tally the distinct values in Y and how many times each one occurs,
# then report the labels, their counts, and the number of distinct labels.
unique_labels, counts = np.unique(Y, return_counts=True)
for _caption, _value in (
    ("Labels: ", unique_labels),
    ("Counts: ", counts),
    ("Amount of Labels is: ", len(unique_labels)),
):
    print(_caption, _value)

Labels:  ['bicycling' 'household-chores' 'manual-work' 'mixed-activity' 'sitting'
 'sleep' 'sports' 'standing' 'vehicle' 'walking']
Counts:  [ 18079 130622  20286  78060 664344 680313  10597  61249  69860 123033]
Amount of Labels is:  10


In [5]:
# Fit a LabelEncoder on Y and convert every label to its integer class id.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(Y)

# Class name -> integer id, following the order of label_encoder.classes_.
label_mapping = {name: idx for idx, name in enumerate(label_encoder.classes_)}
print("Label Mapping:", label_mapping)

Label Mapping: {'bicycling': 0, 'household-chores': 1, 'manual-work': 2, 'mixed-activity': 3, 'sitting': 4, 'sleep': 5, 'sports': 6, 'standing': 7, 'vehicle': 8, 'walking': 9}


In [6]:
# Hold out 20% of the samples for testing. The split is stratified on the
# encoded labels and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    encoded_labels,
    test_size=0.2,
    stratify=encoded_labels,
    random_state=42,
)

# Report the shape of each of the four resulting arrays.
for _caption, _split in (
    ("Training data shape:", X_train),
    ("Testing data shape:", X_test),
    ("Training labels shape:", y_train),
    ("Testing labels shape:", y_test),
):
    print(_caption, _split.shape)

Training data shape: (1485154, 3, 150)
Testing data shape: (371289, 3, 150)
Training labels shape: (1485154,)
Testing labels shape: (371289,)


In [13]:
# Number of samples in the training split (length of X_train's first axis).
len(X_train)

1485154

In [17]:
def reshape_and_create_df(X, y):
    """Flatten each sample of ``X`` into one DataFrame row and append ``y`` as ``label``.

    Every sample ``X[k]`` is flattened in C (row-major) order. Column names are
    built from the per-sample index of each value, so a 3-D input of shape
    ``(n_samples, n_channels, n_timesteps)`` yields ``feature_{channel}_{timestep}``
    columns and a 2-D input yields ``feature_{i}`` columns. The sample shape is
    read from ``X`` itself rather than being fixed at 3 x 150.

    Parameters
    ----------
    X : array-like with at least 2 dimensions
        Samples along the first axis.
    y : array-like
        One label per sample; stored in the final ``label`` column.

    Returns
    -------
    pandas.DataFrame
        ``len(X)`` rows, one column per flattened feature plus ``label``.

    Raises
    ------
    ValueError
        If ``X`` has fewer than 2 dimensions (pandas also raises ``ValueError``
        when ``y`` does not match the number of rows).
    """
    X = np.asarray(X)
    if X.ndim < 2:
        raise ValueError(
            f"X must have at least 2 dimensions (samples first), got shape {X.shape}"
        )

    sample_shape = X.shape[1:]
    # An explicit feature count (instead of -1) keeps reshape valid for zero samples.
    n_features = int(np.prod(sample_shape))
    reshaped_array = X.reshape(len(X), n_features)

    # np.ndindex walks indices in C order, matching the reshape above, so each
    # name lines up with the value stored in that column.
    column_names = [
        "feature_" + "_".join(map(str, index)) for index in np.ndindex(*sample_shape)
    ]

    df = pd.DataFrame(reshaped_array, columns=column_names)

    # Labels go in the last column.
    df['label'] = y

    return df

In [19]:
# Flatten the training split into a DataFrame: one row per sample, feature
# columns followed by a 'label' column holding y_train.
df_train = reshape_and_create_df(X_train, y_train)
df_train  # last expression, so the frame is rendered as the cell output

Unnamed: 0,feature_0_0,feature_0_1,feature_0_2,feature_0_3,feature_0_4,feature_0_5,feature_0_6,feature_0_7,feature_0_8,feature_0_9,...,feature_2_141,feature_2_142,feature_2_143,feature_2_144,feature_2_145,feature_2_146,feature_2_147,feature_2_148,feature_2_149,label
0,-0.101955,-0.101134,-0.106929,-0.107826,-0.094714,-0.099950,-0.097728,-0.101423,-0.103208,-0.099627,...,0.729929,0.732640,0.729418,0.731951,0.736047,0.733098,0.726919,0.730659,0.732669,5
1,-0.097537,-0.093784,-0.093862,-0.097478,-0.099378,-0.094756,-0.101868,-0.096065,-0.097842,-0.099613,...,-0.984755,-0.970524,-0.977068,-0.970952,-0.970347,-0.977036,-0.968033,-0.969061,-0.970864,4
2,-0.602585,-0.605491,-0.594191,-0.592931,-0.589778,-0.592316,-0.589992,-0.592310,-0.589784,-0.592923,...,-0.775346,-0.777163,-0.775460,-0.777114,-0.775455,-0.777188,-0.775243,-0.777939,-0.780173,4
3,-0.610814,-0.608250,-0.567706,-0.611779,-0.587914,-0.602664,-0.599889,-0.576318,-0.620388,-0.618238,...,0.749233,0.748322,0.749506,0.752662,0.753073,0.752410,0.753070,0.752678,0.750734,4
4,-0.908482,-0.919230,-0.881712,-0.877782,-0.879902,-0.893474,-0.879103,-0.878429,-0.884018,-0.892818,...,0.338309,0.347747,0.387590,0.394885,0.375805,0.402441,0.408714,0.394953,0.389488,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1485149,-0.510123,-0.758677,-0.748559,-0.798723,-0.840677,-1.080650,-1.151172,-0.913615,-0.887855,-1.199405,...,-0.825203,-0.787970,-0.721692,-0.715470,-0.700270,-0.701607,-0.744504,-0.674500,-0.763609,9
1485150,-0.069925,-0.077132,-0.068348,-0.066858,-0.073496,-0.059595,-0.072674,-0.070512,-0.071120,-0.065322,...,0.779542,0.779633,0.779543,0.779631,0.779545,0.779630,0.779547,0.779628,0.779548,5
1485151,0.134422,0.862039,0.477696,0.535497,0.487873,0.501593,0.309370,0.311790,0.171015,0.198196,...,0.733663,1.417508,0.827283,0.999216,0.689220,1.194405,1.030180,0.124267,1.620494,0
1485152,-0.548709,-0.463786,-0.488637,-0.467967,-0.499827,-0.481178,-0.484668,-0.487197,-0.455158,-0.455893,...,0.688617,0.705170,0.705894,0.709620,0.703610,0.714751,0.700283,0.705244,0.701792,4


In [20]:
# Flatten the test split into a DataFrame: one row per sample, feature
# columns followed by a 'label' column holding y_test.
df_test = reshape_and_create_df(X_test, y_test)
df_test  # last expression, so the frame is rendered as the cell output

Unnamed: 0,feature_0_0,feature_0_1,feature_0_2,feature_0_3,feature_0_4,feature_0_5,feature_0_6,feature_0_7,feature_0_8,feature_0_9,...,feature_2_141,feature_2_142,feature_2_143,feature_2_144,feature_2_145,feature_2_146,feature_2_147,feature_2_148,feature_2_149,label
0,-0.997880,-0.994092,-0.986562,-1.010824,-1.011868,-1.013387,-1.016559,-0.992806,-0.970249,-0.994051,...,-0.102893,-0.099884,-0.097786,-0.092979,-0.082135,-0.089010,-0.081001,-0.093575,-0.079810,4
1,1.392653,1.285826,-0.081144,0.574093,-0.372502,0.429589,2.033084,2.228379,1.996397,0.477339,...,0.937322,0.862782,1.053815,0.859562,0.618887,0.436643,0.163161,0.551026,1.014369,7
2,-0.362486,-0.364864,-0.357958,-0.349600,-0.362299,-0.365154,-0.355841,-0.351912,-0.349591,-0.355791,...,0.966563,0.967621,0.963175,0.964706,0.963735,0.964478,0.963839,0.964443,0.963817,5
3,0.849804,0.854854,0.850621,0.856554,0.851584,0.850629,0.850848,0.850246,0.852683,0.846476,...,0.516933,0.516683,0.516989,0.516618,0.517069,0.516512,0.517222,0.516262,0.517731,5
4,-0.420504,-0.184499,-0.301809,-0.265236,-0.363322,-0.369326,-0.317164,-0.292365,-0.456576,-0.498503,...,0.556762,0.570732,0.679725,0.587154,0.581445,0.475484,0.574472,0.484504,0.654837,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371284,0.079359,0.079052,0.079396,0.079021,0.079423,0.078998,0.079443,0.078980,0.079459,0.078966,...,-0.906181,-0.906008,-0.906191,-0.905990,-0.906221,-0.905941,-0.906298,-0.905833,-0.903199,5
371285,-0.550618,-0.545959,-0.545927,-0.545948,-0.545936,-0.545940,-0.545943,-0.545934,-0.545949,-0.545929,...,0.591437,0.591341,0.591444,0.591335,0.591450,0.591328,0.591457,0.591321,0.591464,5
371286,-0.669343,-0.292168,-0.468979,-0.577058,-0.481980,-0.379605,-0.400036,-0.506725,-0.467517,-0.515238,...,-0.735122,-0.737931,-0.734089,-0.738671,-0.743963,-0.752325,-0.729518,-0.753022,-0.714502,4
371287,-0.995012,-0.961877,-0.963440,-0.963077,-0.950118,-0.979284,-0.957066,-0.950535,-0.972500,-0.940765,...,-0.037228,-0.032837,-0.024643,-0.037774,-0.038646,-0.042250,-0.039746,-0.046373,-0.021400,4


In [22]:
# Write the training DataFrame to a gzip-compressed CSV in the current working
# directory, without the row index, then confirm the file name.
csv_filename = 'X_30_train.csv.gz'
df_train.to_csv(csv_filename, compression='gzip', index=False)
print(f"DataFrame saved to {csv_filename}")

DataFrame saved to X_30_train.csv.gz


In [23]:
# Write the test DataFrame to a gzip-compressed CSV in the current working
# directory, without the row index, then confirm the file name.
csv_filename = 'X_30_test.csv.gz'
df_test.to_csv(csv_filename, compression='gzip', index=False)
print(f"DataFrame saved to {csv_filename}")

DataFrame saved to X_30_test.csv.gz
