In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hharraw/Phones_accelerometer.csv
/kaggle/input/hharraw/Phones_gyroscope.csv


In [2]:
import copy
import os
import warnings

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/sklearn warnings that could point at real problems.
warnings.filterwarnings('ignore')
# Module-level scaler instance; calc_normalization() fits this shared object.
scaler = StandardScaler()

In [3]:
# Directory of the attached dataset that holds the raw phone sensor CSV files.
data_dir = '/kaggle/input/hharraw'
# Load the complete accelerometer and gyroscope recordings into DataFrames.
HHAR_phones_acc = pd.read_csv(f'{data_dir}/Phones_accelerometer.csv')
HHAR_phones_gyro = pd.read_csv(f'{data_dir}/Phones_gyroscope.csv')

In [4]:
def split_Model_data(acc_data, gyro_data, models=None):
    """Split accelerometer and gyroscope recordings by phone model.

    Args:
        acc_data: DataFrame of accelerometer samples with a 'Model' column.
        gyro_data: DataFrame of gyroscope samples with a 'Model' column.
        models: Optional iterable of model names to extract. When omitted,
            the four models previously hard-coded here are used:
            'nexus4', 's3', 's3mini' and 'samsungold'.

    Returns:
        dict mapping each requested model name to a two-element list
        ``[acc_rows, gyro_rows]`` holding the rows of that model. A model
        with no matching rows yields empty frames.
    """
    if models is None:
        models = ['nexus4', 's3', 's3mini', 'samsungold']
    return {
        model: [
            acc_data[acc_data['Model'] == model],
            gyro_data[gyro_data['Model'] == model],
        ]
        for model in models
    }



In [5]:
def sensor_pairing(user_acc_data, user_gyro_data, label):
    '''Pair accelerometer and gyroscope samples whose timestamps nearly coincide.

    Both inputs are walked in row order with one cursor each. When the scaled
    timestamps under the two cursors differ by less than 0.1, the two samples
    are merged into one output row and both cursors advance; otherwise the
    cursor pointing at the earlier timestamp advances.

    Args:
        user_acc_data: DataFrame with columns 'Creation_Time', 'x', 'y', 'z'
            (extra columns are ignored). Assumed to be ordered by time --
            TODO confirm against the caller.
        user_gyro_data: DataFrame with the same columns for the gyroscope.
        label: value written into the 'labels' column of every output row.

    Returns:
        DataFrame with columns
        ['Time', 'acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z', 'labels'],
        one row per matched pair; 'Time' is the mean of the two scaled timestamps.
        The frame has zero rows when nothing could be paired.

    Notes:
        * Rows whose timestamp is NaN or infinite are skipped. Previously such
          a row made every comparison false, so no cursor moved and the loop
          never terminated.
        * Rows are collected as index arrays and the frame is built once.
          ``DataFrame.append`` was removed in pandas 2.0 and was quadratic when
          called per row.
        * The input frames are not modified.
    '''
    out_columns = ['Time', 'acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z', 'labels']

    # Timestamp scaling factor and matching tolerance (in scaled units).
    # NOTE(review): the original comment says this converts to seconds; that only
    # holds if Creation_Time is stored in units of 1e-8 s -- confirm with the dataset docs.
    timeScale = 100000000.
    tolerance = 0.1

    # Pull everything into NumPy once; per-row .iloc lookups are very slow.
    acc_time = user_acc_data['Creation_Time'].to_numpy(dtype=float) / timeScale
    gyro_time = user_gyro_data['Creation_Time'].to_numpy(dtype=float) / timeScale
    acc_xyz = user_acc_data[['x', 'y', 'z']].to_numpy()
    gyro_xyz = user_gyro_data[['x', 'y', 'z']].to_numpy()

    # Discard samples without a usable timestamp so the merge loop always progresses.
    acc_ok = np.isfinite(acc_time)
    gyro_ok = np.isfinite(gyro_time)
    acc_time, acc_xyz = acc_time[acc_ok], acc_xyz[acc_ok]
    gyro_time, gyro_xyz = gyro_time[gyro_ok], gyro_xyz[gyro_ok]

    len_acc = acc_time.shape[0]
    len_gyro = gyro_time.shape[0]

    acc_hits, gyro_hits = [], []
    acc_idx = 0
    gyro_idx = 0
    # The progress bar tracks how far the accelerometer cursor has moved.
    with tqdm(total=len_acc, position=0, desc="Pairing user's label data") as pbar:
        while acc_idx < len_acc and gyro_idx < len_gyro:
            delta = acc_time[acc_idx] - gyro_time[gyro_idx]
            if abs(delta) < tolerance:
                acc_hits.append(acc_idx)
                gyro_hits.append(gyro_idx)
                acc_idx += 1
                gyro_idx += 1
                pbar.update(1)
            elif delta > 0:
                # Gyroscope sample is too old: move the gyroscope cursor forward.
                gyro_idx += 1
            else:
                # Accelerometer sample is too old: move the accelerometer cursor forward.
                acc_idx += 1
                pbar.update(1)

    acc_hits = np.asarray(acc_hits, dtype=np.intp)
    gyro_hits = np.asarray(gyro_hits, dtype=np.intp)
    paired = pd.DataFrame(
        {
            'Time': 0.5 * (acc_time[acc_hits] + gyro_time[gyro_hits]),
            'acc_x': acc_xyz[acc_hits, 0],
            'acc_y': acc_xyz[acc_hits, 1],
            'acc_z': acc_xyz[acc_hits, 2],
            'gyro_x': gyro_xyz[gyro_hits, 0],
            'gyro_y': gyro_xyz[gyro_hits, 1],
            'gyro_z': gyro_xyz[gyro_hits, 2],
            'labels': np.full(acc_hits.shape[0], label),
        },
        columns=out_columns,
    )

    print(f'user_acc_{label}.shape: {user_acc_data.shape}')
    print(f'user_gyro_{label}.shape: {user_gyro_data.shape}')
    print(f'new_pd.shape: {paired.shape}')
    return paired

In [6]:
def sliding_window(time_series, width, step, order='F'):
    """Cut a 1-D series into windows of `width` samples taken every `step` samples.

    Args:
        time_series: sliceable 1-D sequence of samples.
        width: number of samples per window.
        step: distance between the starts of consecutive windows.
        order: 'F' returns the column-major reshaped array as is; any other
            value returns a C-contiguous copy of it.

    Returns:
        2-D array with one window per row and `width` columns.
    """
    # One strided slice per in-window offset: slice k holds the k-th sample of every window.
    per_offset = []
    for offset in range(width):
        stop = 1 + offset - width
        if stop == 0:
            # The last offset runs to the end of the series.
            stop = None
        per_offset.append(time_series[offset:stop:step])

    flat = np.hstack(per_offset)
    # Column-major reshape places slice k into column k.
    windows = flat.reshape((int(len(flat) / width), width), order='F')
    return windows if order == 'F' else np.ascontiguousarray(windows)

def calc_normalization(data):
    """Fit the module-level `scaler` on `data` and return a snapshot of it.

    Args:
        data: 3-D array; it is flattened to (data.shape[0], -1) before
            fitting, so every position of the flattened window is treated
            as its own feature column.

    Returns:
        An independent deep copy of the fitted scaler. Returning the shared
        module-level object itself meant that a scaler handed out by an
        earlier call was silently refitted by the next call.

    Raises:
        ValueError: if `data` is not 3-dimensional.
    """
    # Unpacking three values keeps the "input must be 3-D" check of the original.
    num_instances, _, _ = data.shape
    flat = np.reshape(data, (num_instances, -1))
    # The shared scaler is still fitted, so code reading the global sees the same state as before.
    scaler.fit(flat)
    return copy.deepcopy(scaler)
    
def apply_normalization(data, scaler):
    """Transform a 3-D array with a fitted scaler and restore its shape.

    Args:
        data: 3-D array; flattened to (data.shape[0], -1) for the transform.
        scaler: object exposing `transform` for the flattened 2-D array.

    Returns:
        Array with the same three dimensions as `data`, holding the
        transformed values with every NaN replaced by 0.
    """
    n_windows, n_steps, n_channels = data.shape
    flattened = np.reshape(data, (n_windows, -1))
    scaled = scaler.transform(flattened)
    # NaNs produced by the transform are zeroed.
    nan_mask = np.isnan(scaled)
    scaled[nan_mask] = 0
    return np.reshape(scaled, (n_windows, n_steps, n_channels))

def _drop_incomplete_rows(sensor_dat):
    """Return a new frame without rows missing any field used for pairing or labelling."""
    return sensor_dat.dropna(subset=['Creation_Time', 'x', 'y', 'z', 'User', 'gt'])


def _encode_activity_labels(sensor_dat, activity_dtype):
    """Add an integer `labels` column derived from `gt` and drop the columns not used downstream."""
    labels = sensor_dat['gt'].astype(activity_dtype).cat.codes
    return sensor_dat.assign(labels=labels).drop(
        columns=['Index', 'Arrival_Time', 'Model', 'Device', 'gt'])


def HHAR_data_grnerator(model_dats, model_name, output_dir='/kaggle/working/HHAR_data'):
    """Build per-user train/test window datasets for one phone model and save them.

    For every user and every activity label, accelerometer and gyroscope
    samples are paired with `sensor_pairing`, cut into non-overlapping windows
    of 128 paired samples with 6 channels, split 70/30 (stratified by label,
    random_state=1), normalised with a scaler fitted on the training windows
    only, and written with `torch.save`.

    Args:
        model_dats: two-element sequence ``[acc_frame, gyro_frame]`` for one
            phone model. Each frame needs the columns 'Index', 'Arrival_Time',
            'Creation_Time', 'x', 'y', 'z', 'User', 'Model', 'Device', 'gt'.
            The frames are not modified.
        model_name: name of the sub-directory the files are written to.
        output_dir: root output directory; defaults to the path that was
            previously hard-coded.

    Returns:
        None. Writes ``<output_dir>/<model_name>/train_<i>.pt`` and
        ``test_<i>.pt`` for the i-th user in groupby order; each file holds
        ``{'samples': array (N, 128, 6), 'labels': int array (N,)}``.

    Raises:
        ValueError: propagated from `train_test_split` when a user has too few
            windows for a stratified split.

    Notes:
        * Rows with missing values are now really dropped; the former bare
          ``dropna()`` calls discarded their result. Rows without a `gt`
          value were already skipped through the label code -1, so they
          are excluded exactly as before.
        * Activity codes come from one sorted category list shared by both
          sensors, so a given code always names the same activity in the
          accelerometer and the gyroscope frame.
          NOTE(review): codes are still local to the frames of one call;
          models with different activity sets get different codes.
    """
    seq_length = 128     # samples per window
    shifting_step = 128  # window stride; equal to seq_length, i.e. no overlap
    channel_names = ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']
    channel_nums = len(channel_names)  # values per paired sample

    # Data cleaning on copies, so the caller's frames stay untouched.
    acc_dat = _drop_incomplete_rows(model_dats[0])
    gyro_dat = _drop_incomplete_rows(model_dats[1])

    # One shared, sorted category list keeps label codes aligned between sensors.
    activity_names = sorted(set(acc_dat['gt']) | set(gyro_dat['gt']))
    activity_dtype = pd.CategoricalDtype(categories=activity_names)
    acc_dat_refined = _encode_activity_labels(acc_dat, activity_dtype)
    gyro_dat_refined = _encode_activity_labels(gyro_dat, activity_dtype)

    save_dir = os.path.join(output_dir, model_name)

    # Process one user at a time.
    user_groups = tqdm(acc_dat_refined.groupby('User'), desc="Processing User")
    for user_index, (user_name, user_acc_data) in enumerate(user_groups):
        data, labels = [], []
        user_gyro_data = gyro_dat_refined[gyro_dat_refined['User'] == user_name]

        # Handle the user's recordings label by label.
        for label_name, user_acc_label_data in tqdm(user_acc_data.groupby('labels'),
                                                    desc="Processing User label"):
            user_gyro_label_data = user_gyro_data[user_gyro_data['labels'] == label_name]
            # Columns of paired_data: Time, acc x/y/z, gyro x/y/z, labels.
            paired_data = sensor_pairing(user_acc_label_data, user_gyro_label_data, label=label_name)

            # (N, 128, 6); filled channel by channel below.
            sliced_data = np.empty((paired_data.shape[0] // seq_length, seq_length, channel_nums))
            for channl_idx, channel in enumerate(channel_names):
                sliced_data[:, :, channl_idx] = sliding_window(
                    paired_data[channel].values, seq_length, shifting_step, 'T')

            data.append(sliced_data)
            labels.append(np.full(sliced_data.shape[0], label_name, dtype=int))

        # All windows and labels of this user.
        array_user_data = np.concatenate(data, axis=0)
        array_user_labels = np.concatenate(labels, axis=0)

        # Stratified train/test split.
        X_train, X_test, y_train, y_test = train_test_split(
            array_user_data, array_user_labels,
            stratify=array_user_labels, test_size=0.3, random_state=1)

        # Fit the normalisation on the training windows only, then apply it to both splits.
        scaler = calc_normalization(X_train)
        X_train = apply_normalization(X_train, scaler)
        X_test = apply_normalization(X_test, scaler)

        train_data = {'samples': X_train, 'labels': y_train}
        test_data = {'samples': X_test, 'labels': y_test}

        os.makedirs(save_dir, exist_ok=True)
        torch.save(train_data, os.path.join(save_dir, f'train_{user_index}.pt'))
        torch.save(test_data, os.path.join(save_dir, f'test_{user_index}.pt'))


In [None]:
# Partition both sensor tables by phone model, then drop the references to the
# full tables. NOTE: because the globals are overwritten with None, this cell
# cannot be re-run without first re-running the cell that loads the CSV files.
model_dats = split_Model_data(HHAR_phones_acc, HHAR_phones_gyro)
HHAR_phones_acc = None
HHAR_phones_gyro = None
models = list(model_dats.keys())

# Only the first model of the list is processed by this cell.
model_name = models[0]
model_acc_gyro_dats = model_dats[model_name]
# Save memory: release the per-model dict; the selected pair stays referenced above.
model_dats = None

HHAR_data_grnerator(model_dats=model_acc_gyro_dats, model_name=model_name)