In [20]:
import os
from glob import glob

import pandas as pd

In [21]:
# --------------------------------------------------------------
# Load one accelerometer / gyroscope recording pair as a sanity check
# --------------------------------------------------------------

acc_sample_path = "../../data/raw/MetaMotion/A-bench-heavy2-rpe8_MetaWear_2019-01-11T16.10.08.270_C42732BE255C_Accelerometer_12.500Hz_1.4.4.csv"
gyr_sample_path = "../../data/raw/MetaMotion/A-bench-heavy2-rpe8_MetaWear_2019-01-11T16.10.08.270_C42732BE255C_Gyroscope_25.000Hz_1.4.4.csv"

# Both files belong to the same recording; read them in one pass.
single_file_acc, single_file_gyr = (
    pd.read_csv(sample_path) for sample_path in (acc_sample_path, gyr_sample_path)
)

# Preview the accelerometer columns only.
print(single_file_acc.head())

      epoch (ms)             time (01:00)  elapsed (s)  x-axis (g)  \
0  1547219408431  2019-01-11T16:10:08.431         0.00       0.010   
1  1547219408511  2019-01-11T16:10:08.511         0.08       0.000   
2  1547219408591  2019-01-11T16:10:08.591         0.16       0.001   
3  1547219408671  2019-01-11T16:10:08.671         0.24      -0.012   
4  1547219408751  2019-01-11T16:10:08.751         0.32      -0.013   

   y-axis (g)  z-axis (g)  
0       0.964      -0.087  
1       0.961      -0.069  
2       0.974      -0.087  
3       0.971      -0.084  
4       0.954      -0.094  


In [22]:
# --------------------------------------------------------------
# List all data in data/raw/MetaMotion
# --------------------------------------------------------------

# glob() returns matches in arbitrary, OS-dependent order. Sort them so that
# positional lookups such as files[0] and anything numbered from the file
# order are the same on every run and every machine.
files = sorted(glob("../../data/raw/MetaMotion/*.csv"))

print(files[0])

../../data/raw/MetaMotion\A-bench-heavy2-rpe8_MetaWear_2019-01-11T16.10.08.270_C42732BE255C_Accelerometer_12.500Hz_1.4.4.csv


In [23]:
# --------------------------------------------------------------
# Extract features from filename
# --------------------------------------------------------------

# Not used by the parsing below; kept so that any cell still reading
# `data_path` keeps working.
data_path = "../../data/raw/MetaMotion\\"

f = files[2]

# Parse the base name only (e.g. "A-bench-heavy2-rpe8_MetaWear_2019-..."), so
# the result does not depend on the directory or on the OS path separator.
name_parts = os.path.basename(f).split("-")
participant = name_parts[0]
label = name_parts[1]
# Remove the "_MetaWear..." tail as a suffix, then the trailing set counter.
# str.rstrip() strips a *set of characters*, not a suffix, so
# rstrip("_MetaWear_2019") would also eat category letters such as
# "e", "t", "a" or "r" and would leave counters like "3" in place.
category = name_parts[2].split("_MetaWear")[0].rstrip("0123456789")

print(f"Participant: {participant}")
print(f"Label: {label}")
print(f"Category: {category}")

# Attach the parsed metadata to every row of this recording.
df = pd.read_csv(f)
df["participant"] = participant
df["label"] = label
df["category"] = category

Participant: A
Label: bench
Category: heavy


In [24]:
# --------------------------------------------------------------
# Read all files
# --------------------------------------------------------------

# Collect per-file frames and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows on
# every iteration.
acc_frames = []
gyr_frames = []

# 1-based running set number per sensor, in the order the files are visited.
acc_set = 1
gyr_set = 1

for file in files:
    # Raw sensor columns: epoch, time, elapsed and the three axes.
    df = pd.read_csv(file)

    # Parse the base name only so the result does not depend on the
    # directory or on the OS path separator.
    name_parts = os.path.basename(file).split("-")
    participant = name_parts[0]
    label = name_parts[1]
    # Remove the "_MetaWear..." tail as a suffix, then the trailing set
    # counter. str.rstrip() strips a set of characters, not a suffix, so
    # rstrip("_MetaWear_2019") could also eat letters of the category.
    category = name_parts[2].split("_MetaWear")[0].rstrip("0123456789")

    # Tag every row of this recording with the metadata from its file name.
    df["participant"] = participant
    df["label"] = label
    df["category"] = category

    if "Accelerometer" in file:
        df["set"] = acc_set
        acc_set += 1
        acc_frames.append(df)

    if "Gyroscope" in file:
        df["set"] = gyr_set
        gyr_set += 1
        gyr_frames.append(df)

# pd.concat([]) raises ValueError, so fall back to an empty frame when a
# sensor has no files.
acc_df = pd.concat(acc_frames) if acc_frames else pd.DataFrame()
gyr_df = pd.concat(gyr_frames) if gyr_frames else pd.DataFrame()

print(len(acc_df))
print(len(gyr_df))
print(acc_df.head())

23578
47218
      epoch (ms)             time (01:00)  elapsed (s)  x-axis (g)  \
0  1547219408431  2019-01-11T16:10:08.431         0.00       0.010   
1  1547219408511  2019-01-11T16:10:08.511         0.08       0.000   
2  1547219408591  2019-01-11T16:10:08.591         0.16       0.001   
3  1547219408671  2019-01-11T16:10:08.671         0.24      -0.012   
4  1547219408751  2019-01-11T16:10:08.751         0.32      -0.013   

   y-axis (g)  z-axis (g) participant  label category  set  
0       0.964      -0.087           A  bench    heavy    1  
1       0.961      -0.069           A  bench    heavy    1  
2       0.974      -0.087           A  bench    heavy    1  
3       0.971      -0.084           A  bench    heavy    1  
4       0.954      -0.094           A  bench    heavy    1  


In [25]:
# --------------------------------------------------------------
# Working with datetimes
# --------------------------------------------------------------

RAW_TIME_COLUMNS = ["epoch (ms)", "time (01:00)", "elapsed (s)"]


def _index_by_epoch(frame):
    """Return a copy of `frame` indexed by its "epoch (ms)" column as datetimes.

    The three raw time columns are dropped from the result. A frame that no
    longer has an "epoch (ms)" column is returned unchanged, so re-running
    this cell is harmless instead of raising KeyError.
    """
    if "epoch (ms)" not in frame.columns:
        return frame
    indexed = frame.drop(columns=RAW_TIME_COLUMNS)
    # The epoch values are milliseconds since the Unix epoch.
    indexed.index = pd.to_datetime(frame["epoch (ms)"], unit="ms")
    return indexed


acc_df = _index_by_epoch(acc_df)
gyr_df = _index_by_epoch(gyr_df)

# Rows are left in file order; they are not sorted by timestamp here.
print(acc_df.head())


                         x-axis (g)  y-axis (g)  z-axis (g) participant  \
epoch (ms)                                                                
2019-01-11 15:10:08.431       0.010       0.964      -0.087           A   
2019-01-11 15:10:08.511       0.000       0.961      -0.069           A   
2019-01-11 15:10:08.591       0.001       0.974      -0.087           A   
2019-01-11 15:10:08.671      -0.012       0.971      -0.084           A   
2019-01-11 15:10:08.751      -0.013       0.954      -0.094           A   

                         label category  set  
epoch (ms)                                    
2019-01-11 15:10:08.431  bench    heavy    1  
2019-01-11 15:10:08.511  bench    heavy    1  
2019-01-11 15:10:08.591  bench    heavy    1  
2019-01-11 15:10:08.671  bench    heavy    1  
2019-01-11 15:10:08.751  bench    heavy    1  
                         x-axis (g)  y-axis (g)  z-axis (g) participant  \
epoch (ms)                                                            

In [26]:
# --------------------------------------------------------------
# Turn into function
# --------------------------------------------------------------

# glob() returns matches in arbitrary, OS-dependent order. Sort them so the
# per-sensor set numbers assigned from the file order are reproducible.
files = sorted(glob("../../data/raw/MetaMotion/*.csv"))

def _parse_recording_name(file):
    """Return (participant, label, category) parsed from a recording's file name.

    Only the base name is inspected, so the result does not depend on the
    directory or on the OS path separator. For example
    "A-bench-heavy2-rpe8_MetaWear_2019-01-11T16.10.08.270_..." yields
    ("A", "bench", "heavy").

    Raises:
        IndexError: if the base name has fewer than three "-" separated parts.
    """
    name_parts = os.path.basename(file).split("-")
    participant = name_parts[0]
    label = name_parts[1]
    # Remove the "_MetaWear..." tail as a suffix, then the trailing set
    # counter. str.rstrip() strips a *set of characters*, not a suffix, so
    # rstrip("_MetaWear_2019") would also eat category letters such as
    # "e", "t", "a" or "r" and would leave counters like "3" in place.
    category = name_parts[2].split("_MetaWear")[0].rstrip("0123456789")
    return participant, label, category


def _stack_by_epoch(frames):
    """Concatenate per-file frames and index the result by epoch as datetimes.

    The "epoch (ms)", "time (01:00)" and "elapsed (s)" columns are dropped
    from the result. An empty list yields an empty DataFrame.
    """
    if not frames:
        return pd.DataFrame()
    # One concat for all files instead of re-copying the growing frame per file.
    stacked = pd.concat(frames)
    # The epoch values are milliseconds since the Unix epoch.
    stacked.index = pd.to_datetime(stacked["epoch (ms)"], unit="ms")
    return stacked.drop(columns=["epoch (ms)", "time (01:00)", "elapsed (s)"])


def read_data_from_files(files):
    """Load sensor CSV recordings into one accelerometer and one gyroscope frame.

    Every row is tagged with the participant, label and category parsed from
    its file name, plus a 1-based `set` number counted separately per sensor
    in the order the files are given. Files whose path contains
    "Accelerometer" go to the first frame and files whose path contains
    "Gyroscope" to the second; other files are read but not kept.

    Args:
        files: iterable of CSV file paths.

    Returns:
        Tuple (acc_df, gyr_df), each indexed by the "epoch (ms)" column
        converted to datetimes, with the raw time columns removed. A sensor
        with no matching files yields an empty DataFrame.
    """
    acc_frames = []
    gyr_frames = []

    for file in files:
        df = pd.read_csv(file)

        participant, label, category = _parse_recording_name(file)
        df["participant"] = participant
        df["label"] = label
        df["category"] = category

        if "Accelerometer" in file:
            df["set"] = len(acc_frames) + 1
            acc_frames.append(df)

        if "Gyroscope" in file:
            df["set"] = len(gyr_frames) + 1
            gyr_frames.append(df)

    return _stack_by_epoch(acc_frames), _stack_by_epoch(gyr_frames)


# Build both sensor frames from the listed files, then preview each one.
acc_df, gyr_df = read_data_from_files(files)

for sensor_frame in (acc_df, gyr_df):
    print(sensor_frame.head())

                         x-axis (g)  y-axis (g)  z-axis (g) participant  \
epoch (ms)                                                                
2019-01-11 15:10:08.431       0.010       0.964      -0.087           A   
2019-01-11 15:10:08.511       0.000       0.961      -0.069           A   
2019-01-11 15:10:08.591       0.001       0.974      -0.087           A   
2019-01-11 15:10:08.671      -0.012       0.971      -0.084           A   
2019-01-11 15:10:08.751      -0.013       0.954      -0.094           A   

                         label category  set  
epoch (ms)                                    
2019-01-11 15:10:08.431  bench    heavy    1  
2019-01-11 15:10:08.511  bench    heavy    1  
2019-01-11 15:10:08.591  bench    heavy    1  
2019-01-11 15:10:08.671  bench    heavy    1  
2019-01-11 15:10:08.751  bench    heavy    1  
                         x-axis (g)  y-axis (g)  z-axis (g) participant  \
epoch (ms)                                                            

In [27]:
# --------------------------------------------------------------
# Merging datasets
# --------------------------------------------------------------

merged_columns = [
    "acc_x",
    "acc_y",
    "acc_z",
    "gyr_x",
    "gyr_y",
    "gyr_z",
    "participant",
    "label",
    "category",
    "set",
]

# Take only the first three accelerometer columns; the metadata columns come
# from the gyroscope frame so they are not duplicated in the merged result.
acc_axes = acc_df.iloc[:, :3]

# Column-wise concat aligns rows on the datetime index; timestamps present in
# only one of the two frames get NaN in the other frame's columns.
data_merged = pd.concat([acc_axes, gyr_df], axis=1)
data_merged.columns = merged_columns

print(len(data_merged))
print(data_merged.head())

69677
                         acc_x  acc_y  acc_z   gyr_x  gyr_y  gyr_z  \
epoch (ms)                                                           
2019-01-11 15:08:04.950    NaN    NaN    NaN -10.671 -1.524  5.976   
2019-01-11 15:08:04.990    NaN    NaN    NaN  -8.720 -2.073  3.171   
2019-01-11 15:08:05.030    NaN    NaN    NaN   0.488 -3.537 -4.146   
2019-01-11 15:08:05.070    NaN    NaN    NaN   0.244 -5.854  3.537   
2019-01-11 15:08:05.110    NaN    NaN    NaN  -0.915  0.061 -2.805   

                        participant  label category   set  
epoch (ms)                                                 
2019-01-11 15:08:04.950           B  bench    heavy  30.0  
2019-01-11 15:08:04.990           B  bench    heavy  30.0  
2019-01-11 15:08:05.030           B  bench    heavy  30.0  
2019-01-11 15:08:05.070           B  bench    heavy  30.0  
2019-01-11 15:08:05.110           B  bench    heavy  30.0  


In [28]:
# --------------------------------------------------------------
# Resample data (frequency conversion)
# --------------------------------------------------------------

# Sensor axes are averaged per bin; metadata columns keep the last value
# seen inside the bin.
sampling = {
    "acc_x": "mean",
    "acc_y": "mean",
    "acc_z": "mean",
    "gyr_x": "mean",
    "gyr_y": "mean",
    "gyr_z": "mean",
    "participant": "last",
    "label": "last",
    "category": "last",
    "set": "last",
}

# Resample one calendar day at a time: resample() creates a bin for every
# 200 ms between the first and last timestamp it is given, so feeding it the
# whole multi-day range would generate bins for all the idle time in between.
# Days without any rows are skipped.
days = [day for _, day in data_merged.groupby(pd.Grouper(freq="D")) if not day.empty]

resampled_days = []
for day in days:
    # dropna() removes bins in which any column has no value.
    resampled_day = day.resample(rule="200ms").agg(sampling).dropna()
    # Keep empty frames out of the concat below.
    if not resampled_day.empty:
        resampled_days.append(resampled_day)

data_resampled = pd.concat(resampled_days)

# `set` became float once NaN rows appeared in the merge; with those rows
# dropped it can be an integer again.
data_resampled["set"] = data_resampled["set"].astype(int)
data_resampled.info()

print(data_resampled.head())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9009 entries, 2019-01-11 15:08:05.200000 to 2019-01-20 17:33:27.800000
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   acc_x        9009 non-null   float64
 1   acc_y        9009 non-null   float64
 2   acc_z        9009 non-null   float64
 3   gyr_x        9009 non-null   float64
 4   gyr_y        9009 non-null   float64
 5   gyr_z        9009 non-null   float64
 6   participant  9009 non-null   object 
 7   label        9009 non-null   object 
 8   category     9009 non-null   object 
 9   set          9009 non-null   int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 774.2+ KB
                            acc_x     acc_y     acc_z   gyr_x   gyr_y   gyr_z  \
epoch (ms)                                                                      
2019-01-11 15:08:05.200  0.013500  0.977000 -0.071000 -1.8904  2.4392  0.9388   
2019-01-11 15:08:05.400 -0.00150

In [29]:
# --------------------------------------------------------------
# Export dataset
# --------------------------------------------------------------

# Persist the resampled frame as a pickle for later processing steps.
processed_data_path = "../../data/interim/01_data_processed.pkl"
data_resampled.to_pickle(processed_data_path)