## Acquisizione e cleaning dei dati

In [5]:
import pandas as pd
import matplotlib as plt
import numpy as np

import os
import re

In [6]:
# Column labels of a raw segment file: 5 body units x 9 sensor channels.
# The channel names are generated kind-by-kind (acc, gyro, mag), each with
# its x/y/z axes, which yields xacc, yacc, zacc, xgyro, ... zmag.
column_names = pd.MultiIndex.from_product(
    [
        ['T', 'RA', 'LA', 'RL', 'LL'],
        [f'{axis}{kind}' for kind in ('acc', 'gyro', 'mag') for axis in 'xyz'],
    ],
    names=['Units', 'Sensors'],
)

# Row labels: 5 seconds x 25 sampling instants per second.
index = pd.MultiIndex.from_product(
    [list(range(1, 6)), list(range(1, 26))],
    names=['Seconds', 'Sampling instant'],
)

In [7]:
# Sanity check of the path pattern on one example path: the three groups are
# the activity, person and segment numbers (returned as strings).
# A raw string keeps `\d` from being read as a string escape, and `\.`
# matches the literal dot of the extension instead of any character.
a, p, s = re.findall(r'/data/a(\d+)/p(\d+)/s(\d+)\.txt', '../data/a01/p1/s01.txt')[0]
p

'1'

In [10]:
def data_description(filename: str, names=None) -> pd.Series:
    """Summarise one segment file as a single flat Series.

    The file is read as comma-separated values without a header row,
    ``describe()`` is computed for every column, and the resulting table is
    unstacked into one Series (one entry per column/statistic pair). The
    activity, person and segment numbers parsed from the path are appended
    as the trailing entries ``a``, ``p`` and ``s``.

    Parameters
    ----------
    filename : str
        Path containing ``data/a<N>/p<N>/s<N>.txt`` (forward slashes).
    names : optional
        Column labels passed to the reader. Defaults to the notebook-level
        ``column_names``; pass it explicitly to avoid depending on that
        global.

    Returns
    -------
    pd.Series
        The unstacked ``describe()`` values followed by ``a``, ``p``, ``s``.

    Raises
    ------
    ValueError
        If ``filename`` does not contain the expected path pattern.
    """
    # Raw string so `\d` is a regex class, `\.` so only a literal dot matches.
    match = re.search(r'data/a(\d+)/p(\d+)/s(\d+)\.txt', filename)
    if match is None:
        raise ValueError(
            f"{filename!r} does not look like 'data/a<N>/p<N>/s<N>.txt'"
        )
    a, p, s = match.groups()

    if names is None:
        # Fall back to the notebook-level definition of the column labels.
        names = column_names

    data = pd.read_csv(filename, names=names)
    # One entry per (column, statistic) pair, e.g. 45 columns x 8 stats = 360.
    descr = data.describe().unstack()
    descr['a'] = int(a)
    descr['p'] = int(p)
    descr['s'] = int(s)
    return descr


# Try the function on a single segment file and inspect the resulting Series.
descr = data_description('data/a08/p3/s45.txt')
descr

T   xacc  count    125.000000
          mean       9.359757
          std        0.293035
          min        7.965700
          25%        9.236300
                      ...    
LL  zmag  75%        0.255730
          max        0.262540
a                    8.000000
p                    3.000000
s                   45.000000
Length: 363, dtype: float64

In [12]:
#
# Read every file of the dataset and collect one summary Series per segment;
# they are concatenated into a single dataframe in a later cell.
#
# Hidden entries (names starting with '.', e.g. '.DS_Store') are skipped at
# every level, and only '.txt' files are treated as segments, so a stray
# file in a segment directory no longer aborts the whole load.
# NOTE: os.listdir() returns entries in arbitrary order, so the row order of
# the resulting frame is not guaranteed to be stable across machines.
#

data_segments = []

for activity_dir in os.listdir('data/'):
    if activity_dir.startswith('.'):
        continue
    for person_dir in os.listdir(f'data/{activity_dir}/'):
        if person_dir.startswith('.'):
            continue
        for segment_file in os.listdir(f'data/{activity_dir}/{person_dir}/'):
            if segment_file.startswith('.') or not segment_file.endswith('.txt'):
                continue
            descr = data_description(f'data/{activity_dir}/{person_dir}/{segment_file}')
            data_segments.append(descr)

In [13]:
# How many per-file summaries the loading loop collected.
len(data_segments)

9120

In [14]:
# Side-by-side concatenation puts each segment summary in its own column;
# flipping the result gives one row per segment.
main_dataframe = pd.concat(data_segments, axis='columns').T
main_dataframe

Unnamed: 0_level_0,T,T,T,T,T,T,T,T,T,T,...,LL,LL,LL,LL,LL,LL,LL,a,p,s
Unnamed: 0_level_1,xacc,xacc,xacc,xacc,xacc,xacc,xacc,xacc,yacc,yacc,...,zmag,zmag,zmag,zmag,zmag,zmag,zmag,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Unnamed: 0_level_2,count,mean,std,min,25%,50%,75%,max,count,mean,...,mean,std,min,25%,50%,75%,max,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,125.0,9.375416,1.340390,7.2316,8.40890,9.0251,10.0480,12.995,125.0,-2.220890,...,-0.321324,0.039035,-0.398260,-0.352810,-0.317460,-0.284520,-0.25302,5.0,4.0,6.0
1,125.0,9.305754,1.284732,7.4804,8.41810,9.0577,9.7066,13.283,125.0,-2.195375,...,-0.414925,0.041983,-0.491020,-0.445550,-0.416820,-0.374450,-0.34023,5.0,4.0,12.0
2,125.0,9.304878,1.106255,7.7173,8.41870,9.1712,9.7528,12.333,125.0,-2.062480,...,-0.390564,0.038960,-0.465550,-0.418900,-0.390840,-0.356970,-0.31562,5.0,4.0,13.0
3,125.0,9.362854,1.362537,7.2450,8.42340,8.9843,9.9633,13.152,125.0,-2.198233,...,-0.261211,0.038384,-0.357470,-0.281530,-0.256180,-0.244070,-0.19572,5.0,4.0,7.0
4,125.0,9.174813,1.237477,7.0816,8.28350,8.9143,9.8522,12.738,125.0,-2.587231,...,-0.208207,0.030420,-0.262260,-0.236630,-0.206820,-0.178340,-0.15940,5.0,4.0,39.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9115,125.0,9.640625,11.188618,-6.6005,0.15543,7.6783,15.9650,49.761,125.0,1.064580,...,0.117772,0.074734,-0.007036,0.057861,0.118400,0.173690,0.34812,12.0,1.0,34.0
9116,125.0,9.592879,11.048747,-4.7068,0.75087,7.6151,14.3700,42.925,125.0,0.922502,...,0.099326,0.073658,-0.016816,0.034445,0.099499,0.150310,0.29743,12.0,1.0,22.0
9117,125.0,8.994511,10.764011,-4.7699,-0.23418,7.1181,14.2460,44.027,125.0,0.349831,...,0.082468,0.060077,-0.043316,0.037624,0.083106,0.123430,0.24107,12.0,1.0,36.0
9118,125.0,9.802873,11.534305,-5.2692,1.42080,8.3953,15.1580,42.165,125.0,0.711324,...,0.057885,0.050787,-0.018097,0.018445,0.046401,0.083571,0.19256,12.0,1.0,37.0


In [15]:
# Column order with the last three columns (a, p, s) moved to the front.
# Built straight from the frame's columns so that the notebook-level
# `column_names` MultiIndex, which data_description() reads, is not
# overwritten with this list of 3-tuples.
new_index = [*main_dataframe.columns[-3:], *main_dataframe.columns[:-3]]
new_index

[('a', '', ''),
 ('p', '', ''),
 ('s', '', ''),
 ('T', 'xacc', 'count'),
 ('T', 'xacc', 'mean'),
 ('T', 'xacc', 'std'),
 ('T', 'xacc', 'min'),
 ('T', 'xacc', '25%'),
 ('T', 'xacc', '50%'),
 ('T', 'xacc', '75%'),
 ('T', 'xacc', 'max'),
 ('T', 'yacc', 'count'),
 ('T', 'yacc', 'mean'),
 ('T', 'yacc', 'std'),
 ('T', 'yacc', 'min'),
 ('T', 'yacc', '25%'),
 ('T', 'yacc', '50%'),
 ('T', 'yacc', '75%'),
 ('T', 'yacc', 'max'),
 ('T', 'zacc', 'count'),
 ('T', 'zacc', 'mean'),
 ('T', 'zacc', 'std'),
 ('T', 'zacc', 'min'),
 ('T', 'zacc', '25%'),
 ('T', 'zacc', '50%'),
 ('T', 'zacc', '75%'),
 ('T', 'zacc', 'max'),
 ('T', 'xgyro', 'count'),
 ('T', 'xgyro', 'mean'),
 ('T', 'xgyro', 'std'),
 ('T', 'xgyro', 'min'),
 ('T', 'xgyro', '25%'),
 ('T', 'xgyro', '50%'),
 ('T', 'xgyro', '75%'),
 ('T', 'xgyro', 'max'),
 ('T', 'ygyro', 'count'),
 ('T', 'ygyro', 'mean'),
 ('T', 'ygyro', 'std'),
 ('T', 'ygyro', 'min'),
 ('T', 'ygyro', '25%'),
 ('T', 'ygyro', '50%'),
 ('T', 'ygyro', '75%'),
 ('T', 'ygyro', 'max'),
 

In [16]:
# Reorder the columns following new_index, so a, p and s come first.
main_dataframe = main_dataframe[new_index]
main_dataframe

Unnamed: 0_level_0,a,p,s,T,T,T,T,T,T,T,...,LL,LL,LL,LL,LL,LL,LL,LL,LL,LL
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,xacc,xacc,xacc,xacc,xacc,xacc,xacc,...,ymag,ymag,zmag,zmag,zmag,zmag,zmag,zmag,zmag,zmag
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,count,mean,std,min,25%,50%,75%,...,75%,max,count,mean,std,min,25%,50%,75%,max
0,5.0,4.0,6.0,125.0,9.375416,1.340390,7.2316,8.40890,9.0251,10.0480,...,0.51501,0.59019,125.0,-0.321324,0.039035,-0.398260,-0.352810,-0.317460,-0.284520,-0.25302
1,5.0,4.0,12.0,125.0,9.305754,1.284732,7.4804,8.41810,9.0577,9.7066,...,0.44353,0.55931,125.0,-0.414925,0.041983,-0.491020,-0.445550,-0.416820,-0.374450,-0.34023
2,5.0,4.0,13.0,125.0,9.304878,1.106255,7.7173,8.41870,9.1712,9.7528,...,0.43510,0.53405,125.0,-0.390564,0.038960,-0.465550,-0.418900,-0.390840,-0.356970,-0.31562
3,5.0,4.0,7.0,125.0,9.362854,1.362537,7.2450,8.42340,8.9843,9.9633,...,0.45443,0.54418,125.0,-0.261211,0.038384,-0.357470,-0.281530,-0.256180,-0.244070,-0.19572
4,5.0,4.0,39.0,125.0,9.174813,1.237477,7.0816,8.28350,8.9143,9.8522,...,0.42768,0.52017,125.0,-0.208207,0.030420,-0.262260,-0.236630,-0.206820,-0.178340,-0.15940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9115,12.0,1.0,34.0,125.0,9.640625,11.188618,-6.6005,0.15543,7.6783,15.9650,...,0.59453,0.66941,125.0,0.117772,0.074734,-0.007036,0.057861,0.118400,0.173690,0.34812
9116,12.0,1.0,22.0,125.0,9.592879,11.048747,-4.7068,0.75087,7.6151,14.3700,...,0.64064,0.70769,125.0,0.099326,0.073658,-0.016816,0.034445,0.099499,0.150310,0.29743
9117,12.0,1.0,36.0,125.0,8.994511,10.764011,-4.7699,-0.23418,7.1181,14.2460,...,0.58671,0.63964,125.0,0.082468,0.060077,-0.043316,0.037624,0.083106,0.123430,0.24107
9118,12.0,1.0,37.0,125.0,9.802873,11.534305,-5.2692,1.42080,8.3953,15.1580,...,0.58876,0.62212,125.0,0.057885,0.050787,-0.018097,0.018445,0.046401,0.083571,0.19256


In [17]:
#
# Write several CSV files: the first ones keep the activity, person and segment
# information for every row, the last one holds only the cleaned data
#

main_dataframe.to_csv('main_dataframe_multiindex.csv')

In [18]:
# Replace the three-level column MultiIndex with its plain list of tuples;
# the frame displayed afterwards shows single-level labels such as
# (T, xacc, mean). This mutates main_dataframe in place.
main_dataframe.columns = main_dataframe.columns.to_list()

In [19]:
# Show the frame with its flattened (tuple) column labels.
main_dataframe

Unnamed: 0,"(a, , )","(p, , )","(s, , )","(T, xacc, count)","(T, xacc, mean)","(T, xacc, std)","(T, xacc, min)","(T, xacc, 25%)","(T, xacc, 50%)","(T, xacc, 75%)",...,"(LL, ymag, 75%)","(LL, ymag, max)","(LL, zmag, count)","(LL, zmag, mean)","(LL, zmag, std)","(LL, zmag, min)","(LL, zmag, 25%)","(LL, zmag, 50%)","(LL, zmag, 75%)","(LL, zmag, max)"
0,5.0,4.0,6.0,125.0,9.375416,1.340390,7.2316,8.40890,9.0251,10.0480,...,0.51501,0.59019,125.0,-0.321324,0.039035,-0.398260,-0.352810,-0.317460,-0.284520,-0.25302
1,5.0,4.0,12.0,125.0,9.305754,1.284732,7.4804,8.41810,9.0577,9.7066,...,0.44353,0.55931,125.0,-0.414925,0.041983,-0.491020,-0.445550,-0.416820,-0.374450,-0.34023
2,5.0,4.0,13.0,125.0,9.304878,1.106255,7.7173,8.41870,9.1712,9.7528,...,0.43510,0.53405,125.0,-0.390564,0.038960,-0.465550,-0.418900,-0.390840,-0.356970,-0.31562
3,5.0,4.0,7.0,125.0,9.362854,1.362537,7.2450,8.42340,8.9843,9.9633,...,0.45443,0.54418,125.0,-0.261211,0.038384,-0.357470,-0.281530,-0.256180,-0.244070,-0.19572
4,5.0,4.0,39.0,125.0,9.174813,1.237477,7.0816,8.28350,8.9143,9.8522,...,0.42768,0.52017,125.0,-0.208207,0.030420,-0.262260,-0.236630,-0.206820,-0.178340,-0.15940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9115,12.0,1.0,34.0,125.0,9.640625,11.188618,-6.6005,0.15543,7.6783,15.9650,...,0.59453,0.66941,125.0,0.117772,0.074734,-0.007036,0.057861,0.118400,0.173690,0.34812
9116,12.0,1.0,22.0,125.0,9.592879,11.048747,-4.7068,0.75087,7.6151,14.3700,...,0.64064,0.70769,125.0,0.099326,0.073658,-0.016816,0.034445,0.099499,0.150310,0.29743
9117,12.0,1.0,36.0,125.0,8.994511,10.764011,-4.7699,-0.23418,7.1181,14.2460,...,0.58671,0.63964,125.0,0.082468,0.060077,-0.043316,0.037624,0.083106,0.123430,0.24107
9118,12.0,1.0,37.0,125.0,9.802873,11.534305,-5.2692,1.42080,8.3953,15.1580,...,0.58876,0.62212,125.0,0.057885,0.050787,-0.018097,0.018445,0.046401,0.083571,0.19256


In [20]:
# Save the frame with flattened column labels; a, p and s are still included.
main_dataframe.to_csv('main_dataframe.csv')

In [21]:
# The activity / person / segment identifiers are not part of the cleaned data.
id_columns = [('a', '', ''), ('p', '', ''), ('s', '', '')]
main_dataframe = main_dataframe.drop(columns=id_columns)

In [22]:
# Drop every '<unit>, <sensor>, count' column. The list is derived from the
# frame's own column labels instead of spelling out all 45 tuples by hand,
# which also makes the cell safe to re-run (nothing left to drop -> no error).
# NOTE(review): count looks constant (125.0 in every displayed row) - confirm
# before relying on that as the reason for removal.
count_columns = [
    col for col in main_dataframe.columns
    if isinstance(col, tuple) and col[-1] == 'count'
]
main_dataframe = main_dataframe.drop(columns=count_columns)

In [23]:
# Show the frame after the identifier and count columns were dropped.
main_dataframe

Unnamed: 0,"(T, xacc, mean)","(T, xacc, std)","(T, xacc, min)","(T, xacc, 25%)","(T, xacc, 50%)","(T, xacc, 75%)","(T, xacc, max)","(T, yacc, mean)","(T, yacc, std)","(T, yacc, min)",...,"(LL, ymag, 50%)","(LL, ymag, 75%)","(LL, ymag, max)","(LL, zmag, mean)","(LL, zmag, std)","(LL, zmag, min)","(LL, zmag, 25%)","(LL, zmag, 50%)","(LL, zmag, 75%)","(LL, zmag, max)"
0,9.375416,1.340390,7.2316,8.40890,9.0251,10.0480,12.995,-2.220890,0.841599,-3.7353,...,0.41325,0.51501,0.59019,-0.321324,0.039035,-0.398260,-0.352810,-0.317460,-0.284520,-0.25302
1,9.305754,1.284732,7.4804,8.41810,9.0577,9.7066,13.283,-2.195375,0.841058,-3.8709,...,0.28580,0.44353,0.55931,-0.414925,0.041983,-0.491020,-0.445550,-0.416820,-0.374450,-0.34023
2,9.304878,1.106255,7.7173,8.41870,9.1712,9.7528,12.333,-2.062480,0.886773,-3.9881,...,0.28769,0.43510,0.53405,-0.390564,0.038960,-0.465550,-0.418900,-0.390840,-0.356970,-0.31562
3,9.362854,1.362537,7.2450,8.42340,8.9843,9.9633,13.152,-2.198233,0.794485,-3.8168,...,0.38984,0.45443,0.54418,-0.261211,0.038384,-0.357470,-0.281530,-0.256180,-0.244070,-0.19572
4,9.174813,1.237477,7.0816,8.28350,8.9143,9.8522,12.738,-2.587231,0.875247,-4.5112,...,0.32838,0.42768,0.52017,-0.208207,0.030420,-0.262260,-0.236630,-0.206820,-0.178340,-0.15940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9115,9.640625,11.188618,-6.6005,0.15543,7.6783,15.9650,49.761,1.064580,4.467311,-13.0280,...,0.51855,0.59453,0.66941,0.117772,0.074734,-0.007036,0.057861,0.118400,0.173690,0.34812
9116,9.592879,11.048747,-4.7068,0.75087,7.6151,14.3700,42.925,0.922502,4.221065,-13.3980,...,0.55002,0.64064,0.70769,0.099326,0.073658,-0.016816,0.034445,0.099499,0.150310,0.29743
9117,8.994511,10.764011,-4.7699,-0.23418,7.1181,14.2460,44.027,0.349831,3.843780,-16.2680,...,0.50997,0.58671,0.63964,0.082468,0.060077,-0.043316,0.037624,0.083106,0.123430,0.24107
9118,9.802873,11.534305,-5.2692,1.42080,8.3953,15.1580,42.165,0.711324,4.112416,-19.5470,...,0.50547,0.58876,0.62212,0.057885,0.050787,-0.018097,0.018445,0.046401,0.083571,0.19256


In [24]:
# Save the statistics-only frame (identifier and count columns removed).
main_dataframe.to_csv('clean_dataframe.csv')