# スマートホームにおけるセンサデータを使った日常行動認識実験

In [1]:
import os
import pandas as pd
import numpy as np
from progressbar import ProgressBar

### データの読み込み

データは
http://ailab.wsu.edu/casas/datasets/
からダウンロード

今回は，このリストの17番 (http://ailab.wsu.edu/casas/datasets/aruba.zip)

を落として， "./wsu_dataset/aruba"に展開した中身を配置しております

In [2]:
# Column names for the header-less, whitespace-separated event log.
col_names = ['date', 'time', 'sensor_id', 'val', 'activity_name', 'state']

# The separator is a regular expression, so it is written as a raw string;
# "\s" inside a normal string literal is an invalid escape sequence.
aruba_df = pd.read_csv('./wsu_dataset/aruba/data', sep=r"\s+", names=col_names)

aruba_df.head()

Unnamed: 0,date,time,sensor_id,val,activity_name,state
0,2010-11-04,00:03:50.209589,M003,ON,Sleeping,begin
1,2010-11-04,00:03:57.399391,M003,OFF,,
2,2010-11-04,00:15:08.984841,T002,21.5,,
3,2010-11-04,00:30:19.185547,T003,21,,
4,2010-11-04,00:30:19.385336,T004,21,,


In [3]:
# Distinct sensor ids present in the log, in order of first appearance.
aruba_df['sensor_id'].unique()

array(['M003', 'T002', 'T003', 'T004', 'T005', 'T001', 'M002', 'M007',
       'M005', 'M004', 'M006', 'M008', 'M020', 'M010', 'M011', 'M012',
       'M013', 'M014', 'M009', 'M018', 'M019', 'M015', 'M016', 'M017',
       'M021', 'M022', 'M023', 'M001', 'M024', 'D002', 'M031', 'D004',
       'M030', 'M029', 'M028', 'D001', 'M026', 'M027', 'M025', 'c',
       'LEAVEHOME', 'ENTERHOME'], dtype=object)

In [4]:
# Distinct activity annotations (including NaN for un-annotated rows).
aruba_df['activity_name'].unique()

array(['Sleeping', nan, 'Bed_to_Toilet', 'Meal_Preparation', 'Relax',
       'Housekeeping', 'Eating', 'Wash_Dishes', 'Leave_Home', 'Enter_Home',
       'Work', 'Resperate'], dtype=object)

In [5]:
# Alphabetically sorted activity names; rows without an annotation (NaN)
# are excluded.
activity_name_list = sorted(aruba_df['activity_name'].dropna().unique().tolist())
print(activity_name_list)

['Bed_to_Toilet', 'Eating', 'Enter_Home', 'Housekeeping', 'Leave_Home', 'Meal_Preparation', 'Relax', 'Resperate', 'Sleeping', 'Wash_Dishes', 'Work']


In [6]:
%%time
## Build a label for every row from the activity annotations: each row that
## lies between an activity's start row and end row (inclusive) gets
## "<activity_name>-" appended, so overlapping activities produce
## concatenated labels such as "Eating-Relax-".

# One empty string per row, sharing the frame's index so the final
# assignment aligns row for row.
labels = pd.Series('', index=aruba_df.index, dtype=object)

for activity_name in activity_name_list:
    target_idxes = aruba_df.index[aruba_df.activity_name == activity_name]

    # Annotated rows are assumed to alternate start, end, start, end, ...
    # NOTE(review): the `state` column (begin/end) is not consulted here.
    target_start_idxes = target_idxes[::2]
    target_end_idxes = target_idxes[1::2]

    if len(target_start_idxes) != len(target_end_idxes):
        raise ValueError(
            "unpaired annotation rows for activity %r: %d start(s) vs %d end(s)"
            % (activity_name, len(target_start_idxes), len(target_end_idxes))
        )

    for start_idx, end_idx in zip(target_start_idxes, target_end_idxes):
        # .loc slices by index label and includes both endpoints.
        labels.loc[start_idx:end_idx] += activity_name + '-'

aruba_df['label'] = labels

CPU times: user 4.09 s, sys: 55 ms, total: 4.14 s
Wall time: 4.13 s


In [7]:
# Row count per label, ordered by label text (taken before the sensor
# filter below is applied).
activity_count_series = aruba_df['label'].value_counts().sort_index()

# Keep only the sensors whose id contains "M0" or "D0".
target_sensor_id_list = [
    sensor_id
    for sensor_id in aruba_df['sensor_id'].unique()
    if "M0" in sensor_id or "D0" in sensor_id
]

aruba_df = aruba_df[aruba_df['sensor_id'].isin(target_sensor_id_list)]

# Label distribution after the sensor filter.
aruba_df['label'].value_counts().sort_index()

                                871275
Bed_to_Toilet-                    1330
Eating-                          16037
Eating-Meal_Preparation-          2019
Eating-Relax-                      178
Eating-Wash_Dishes-                  4
Enter_Home-                       2018
Enter_Home-Housekeeping-             4
Enter_Home-Meal_Preparation-         4
Housekeeping-                    10579
Housekeeping-Leave_Home-             6
Leave_Home-                       1916
Leave_Home-Meal_Preparation-        10
Meal_Preparation-               283115
Meal_Preparation-Relax-           8185
Relax-                          346222
Resperate-                         542
Sleeping-                        32755
Wash_Dishes-                     10460
Work-                            16321
Name: label, dtype: int64

In [8]:
# Integer id for every distinct label string, numbered in order of first
# appearance in the frame.
label2id = {name: idx for idx, name in enumerate(aruba_df['label'].unique().tolist())}

# Sensor ids that remain at this point; used later as feature columns.
all_sensor_id_list = aruba_df['sensor_id'].unique()

# Only these readings are kept; noise values such as "O5cc" are dropped.
ok_values = ["ON", "OFF", "OPEN", "CLOSE"]

aruba_df = aruba_df.loc[aruba_df['val'].isin(ok_values)]

aruba_df['val'].value_counts()

OFF      798005
ON       797981
CLOSE      3419
OPEN       3415
Name: val, dtype: int64

In [9]:
# Encode sensor readings as integers: ON/OPEN -> 1, OFF/CLOSE -> 0.
# Only the `val` column is rewritten. The previous element-wise pass over
# the whole frame would also have replaced a matching string in any other
# column, and DataFrame.applymap is deprecated in recent pandas.
state_to_int = {'ON': 1, 'OPEN': 1, 'OFF': 0, 'CLOSE': 0}
aruba_df = aruba_df.assign(
    # Values outside the mapping are passed through unchanged.
    val=aruba_df['val'].map(lambda v: state_to_int.get(v, v))
)

In [10]:
# Preview the first five rows after encoding.
aruba_df.head(n=5)

Unnamed: 0,date,time,sensor_id,val,activity_name,state,label
0,2010-11-04,00:03:50.209589,M003,1,Sleeping,begin,Sleeping-
1,2010-11-04,00:03:57.399391,M003,0,,,Sleeping-
15,2010-11-04,02:32:33.351906,M003,1,,,Sleeping-
16,2010-11-04,02:32:38.895958,M003,0,,,Sleeping-
20,2010-11-04,03:42:21.82365,M003,1,,,Sleeping-


In [11]:
# Merge the `date` and `time` columns into a single `datetime` column.

aruba_df['datetime'] = pd.to_datetime(aruba_df['date'] + " " + aruba_df['time'])

# The two source columns are no longer needed once they are combined.
aruba_df.drop(['date', 'time'], axis=1, inplace=True)

In [12]:
# Build the training and test sets.
## Training: the first 6 weeks; test: the following 1 week.
import datetime as dt

time_span = dt.timedelta(weeks=6)

# Positional access: rows have been filtered out of the frame, so a row
# labelled 0 is not guaranteed to exist.
first_date = aruba_df.datetime.iloc[0]
# `first_date` keeps the start of the recording; only `end_date` is shifted.
end_date = first_date + time_span

train_aruba_df = aruba_df[aruba_df.datetime < end_date]
test_aruba_df = aruba_df[
    (aruba_df.datetime >= end_date)
    & (aruba_df.datetime <= end_date + dt.timedelta(weeks=1))
]
print(train_aruba_df.shape, test_aruba_df.shape)

(268697, 6) (54456, 6)


In [13]:
def extract_last_state_sensor_feature(df, window_size=10):
    """Build one feature row per sliding window with each sensor's last state.

    NOTE: a function with this same name and logic is defined again right
    after this one; the later definition is the one that stays bound.

    Args:
        df: event frame with `sensor_id`, `val` and `label` columns.
        window_size: number of consecutive rows in each window.

    Returns:
        DataFrame with one row per window and one column per id in the
        global `all_sensor_id_list`. Each cell holds the last `val` that
        sensor reported inside the window, or 0 when the sensor did not
        appear. A final `label` column holds `label2id` of the label of the
        window's last row.

    Raises:
        KeyError: if a window's label is missing from `label2id`.
    """
    nrow = df.shape[0]
    # NOTE(review): with this bound no window ends on the last row of `df`;
    # kept as in the original loop - confirm whether that is intended.
    n_windows = nrow - window_size

    # Pull the columns out once; slicing plain lists per window is much
    # cheaper than building a sub-frame and filtering it per sensor.
    sensor_ids = df['sensor_id'].tolist()
    vals = df['val'].tolist()
    row_labels = df['label'].tolist()

    rows = []
    labels = []

    p = ProgressBar(max_value=n_windows)
    for i in range(n_windows):
        stop = i + window_size
        # Later events overwrite earlier ones, which leaves the last state
        # of every sensor that fired inside the window.
        rows.append(dict(zip(sensor_ids[i:stop], vals[i:stop])))
        labels.append(row_labels[i:stop][-1])

        p.update(i + 1)

    # Build the frame in one step: inserting one column per window into a
    # DataFrame gets slower with every insertion.
    res_df = pd.DataFrame(rows, columns=list(all_sensor_id_list))
    res_df = res_df.fillna(0)

    res_df['label'] = [label2id[x] for x in labels]
    return res_df

In [14]:
def extract_last_state_sensor_feature(df, window_size=10):
    """Build one feature row per sliding window with each sensor's last state.

    NOTE: this redefines the function of the same name declared just before
    it; being the later definition, this is the one that stays bound.

    Args:
        df: event frame with `sensor_id`, `val` and `label` columns.
        window_size: number of consecutive rows in each window.

    Returns:
        DataFrame with one row per window and one column per id in the
        global `all_sensor_id_list`. Each cell holds the last `val` that
        sensor reported inside the window, or 0 when the sensor did not
        appear. A final `label` column holds `label2id` of the label of the
        window's last row.

    Raises:
        KeyError: if a window's label is missing from `label2id`.
    """
    nrow = df.shape[0]
    # NOTE(review): with this bound no window ends on the last row of `df`;
    # kept as in the original loop - confirm whether that is intended.
    n_windows = nrow - window_size

    # Extract the needed columns a single time; per-window list slices
    # replace the per-window sub-frame and per-sensor boolean filtering.
    sensor_ids = df['sensor_id'].tolist()
    vals = df['val'].tolist()
    row_labels = df['label'].tolist()

    rows = []
    labels = []

    p = ProgressBar(max_value=n_windows)
    for i in range(n_windows):
        stop = i + window_size
        # dict() keeps the last value seen for a repeated key, i.e. the
        # most recent state of each sensor inside the window.
        rows.append(dict(zip(sensor_ids[i:stop], vals[i:stop])))
        labels.append(row_labels[i:stop][-1])

        p.update(i + 1)

    # One constructor call instead of one column insertion per window,
    # which becomes slower as the frame grows.
    res_df = pd.DataFrame(rows, columns=list(all_sensor_id_list))
    res_df = res_df.fillna(0)

    res_df['label'] = [label2id[x] for x in labels]
    return res_df

In [15]:
def extract_baseline_feature(df, window_size=10):
    """Build one baseline feature row per sliding window of events.

    Args:
        df: event frame with `sensor_id`, `label` and `datetime` columns.
        window_size: number of consecutive rows in each window.

    Returns:
        DataFrame with one row per window and these columns:
          * one per id in the global `all_sensor_id_list`: how many rows of
            the window belong to that sensor (rows with a non-null label);
          * `S0`..`S23`: one-hot hour of the window's first row;
          * `E0`..`E23`: one-hot hour of the window's last row;
          * `duration`: time between first and last row, in minutes;
          * `label`: `label2id` of the label of the window's last row.
        Feature values are floats; absent sensors and hours are 0.

    Raises:
        KeyError: if a window's label is missing from `label2id`.
    """
    nrow = df.shape[0]
    # NOTE(review): with this bound no window ends on the last row of `df`;
    # kept as in the original loop - confirm whether that is intended.
    n_windows = nrow - window_size

    start_features = ['S' + str(hour) for hour in range(24)]
    end_features = ['E' + str(hour) for hour in range(24)]
    columns = list(all_sensor_id_list) + start_features + end_features + ['duration']

    rows = []
    labels = []
    p = ProgressBar(max_value=n_windows)
    for i in range(n_windows):
        target_df = df.iloc[i:i + window_size, :]
        start = target_df['datetime'].iloc[0]
        end = target_df['datetime'].iloc[-1]

        # Per-sensor event counts inside this window.
        row = target_df.groupby('sensor_id')['label'].count().to_dict()
        # One-hot hour of day; every other hour column is filled with 0 below.
        row['S' + str(start.hour)] = 1.0
        row['E' + str(end.hour)] = 1.0
        row['duration'] = (end - start).total_seconds() / 60

        rows.append(row)
        labels.append(target_df.label.values[-1])
        p.update(i + 1)

    # Build the frame in one step: inserting one column per window into a
    # DataFrame gets slower with every insertion.
    res_df = pd.DataFrame(rows, columns=columns)
    # Cast to float so every feature column has the same dtype regardless of
    # whether it happened to contain a missing value.
    res_df = res_df.fillna(0).astype(float)

    res_df['label'] = [label2id[x] for x in labels]
    return res_df

In [None]:
# Save the label -> id mapping as a CSV with columns `label` and `id`.
label2id_df = pd.Series(label2id, name="id").to_frame()
label2id_df = label2id_df.rename_axis('label')
label2id_df.to_csv("label2id.csv")

In [None]:
%%time
# Baseline features for the training split, written to disk as CSV.
train_fet1_df = extract_baseline_feature(df=train_aruba_df)

train_fet1_df.to_csv(path_or_buf='train_fet1_df.csv')

 95% (255888 of 268687) |################# | Elapsed Time: 0:51:28 ETA: 0:04:18  0% (26 of 268687) |                      | Elapsed Time: 0:00:00 ETA: 0:17:24

In [None]:
%%time
# Baseline features for the test split, written to disk as CSV.
test_fet1_df = extract_baseline_feature(df=test_aruba_df)

test_fet1_df.to_csv(path_or_buf='test_fet1_df.csv')