### Imports

In [2]:
import sys
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from pandas_profiling import ProfileReport



### Configuração de plotagem de gráficos

In [3]:
sns.set_theme(style="whitegrid")

params = {'legend.fontsize': 'x-large',
          'figure.figsize': (15, 5),
         'axes.labelsize': 'x-large',
         'axes.titlesize':'x-large',
         'xtick.labelsize':'x-large',
         'ytick.labelsize':'x-large'}
plt.rcParams.update(params)

# Análise exploratória dos dados

In [4]:
user1 = pd.read_csv('../input/user1.features_labels.csv')
user1.head()

Unnamed: 0,timestamp,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,...,label:STAIRS_-_GOING_DOWN,label:ELEVATOR,label:OR_standing,label:AT_SCHOOL,label:PHONE_IN_HAND,label:PHONE_IN_BAG,label:PHONE_ON_TABLE,label:WITH_CO-WORKERS,label:WITH_FRIENDS,label_source
0,1464129912,1.011438,0.012573,0.023013,0.04124,1.006184,1.010868,1.016028,0.935099,6.684536,...,0.0,,0.0,0.0,,,,1.0,0.0,2
1,1464129950,1.011233,0.009356,-0.005622,0.016687,1.006338,1.010926,1.016657,1.732968,6.684569,...,0.0,,0.0,0.0,,,,1.0,0.0,2
2,1464130031,1.013422,0.018068,-0.008593,0.039286,1.004077,1.012983,1.021926,1.464639,6.684453,...,0.0,,0.0,0.0,,,,1.0,0.0,2
3,1464130109,1.014891,0.0164,0.021383,0.038825,1.005934,1.01467,1.023,1.440043,6.684483,...,0.0,,0.0,0.0,,,,1.0,0.0,2
4,1464130130,1.017487,0.022632,-0.012891,0.037226,1.00604,1.017587,1.028168,1.937362,6.684364,...,0.0,,0.0,0.0,,,,1.0,0.0,2


In [5]:
user1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2685 entries, 0 to 2684
Columns: 278 entries, timestamp to label_source
dtypes: float64(274), int64(4)
memory usage: 5.7 MB


In [6]:
list(user1.columns)

['timestamp',
 'raw_acc:magnitude_stats:mean',
 'raw_acc:magnitude_stats:std',
 'raw_acc:magnitude_stats:moment3',
 'raw_acc:magnitude_stats:moment4',
 'raw_acc:magnitude_stats:percentile25',
 'raw_acc:magnitude_stats:percentile50',
 'raw_acc:magnitude_stats:percentile75',
 'raw_acc:magnitude_stats:value_entropy',
 'raw_acc:magnitude_stats:time_entropy',
 'raw_acc:magnitude_spectrum:log_energy_band0',
 'raw_acc:magnitude_spectrum:log_energy_band1',
 'raw_acc:magnitude_spectrum:log_energy_band2',
 'raw_acc:magnitude_spectrum:log_energy_band3',
 'raw_acc:magnitude_spectrum:log_energy_band4',
 'raw_acc:magnitude_spectrum:spectral_entropy',
 'raw_acc:magnitude_autocorrelation:period',
 'raw_acc:magnitude_autocorrelation:normalized_ac',
 'raw_acc:3d:mean_x',
 'raw_acc:3d:mean_y',
 'raw_acc:3d:mean_z',
 'raw_acc:3d:std_x',
 'raw_acc:3d:std_y',
 'raw_acc:3d:std_z',
 'raw_acc:3d:ro_xy',
 'raw_acc:3d:ro_xz',
 'raw_acc:3d:ro_yz',
 'proc_gyro:magnitude_stats:mean',
 'proc_gyro:magnitude_stats:std

### Vamos analisar os atributos de saída (contextos)

In [7]:
labels = user1.loc[:, user1.columns.str.startswith('label:')].columns.values
labels

array(['label:LYING_DOWN', 'label:SITTING', 'label:FIX_walking',
       'label:FIX_running', 'label:BICYCLING', 'label:SLEEPING',
       'label:LAB_WORK', 'label:IN_CLASS', 'label:IN_A_MEETING',
       'label:LOC_main_workplace', 'label:OR_indoors', 'label:OR_outside',
       'label:IN_A_CAR', 'label:ON_A_BUS', 'label:DRIVE_-_I_M_THE_DRIVER',
       'label:DRIVE_-_I_M_A_PASSENGER', 'label:LOC_home',
       'label:FIX_restaurant', 'label:PHONE_IN_POCKET',
       'label:OR_exercise', 'label:COOKING', 'label:SHOPPING',
       'label:STROLLING', 'label:DRINKING__ALCOHOL_',
       'label:BATHING_-_SHOWER', 'label:CLEANING', 'label:DOING_LAUNDRY',
       'label:WASHING_DISHES', 'label:WATCHING_TV',
       'label:SURFING_THE_INTERNET', 'label:AT_A_PARTY', 'label:AT_A_BAR',
       'label:LOC_beach', 'label:SINGING', 'label:TALKING',
       'label:COMPUTER_WORK', 'label:EATING', 'label:TOILET',
       'label:GROOMING', 'label:DRESSING', 'label:AT_THE_GYM',
       'label:STAIRS_-_GOING_UP', 'lab

In [8]:
user1.loc[:, labels].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2685 entries, 0 to 2684
Data columns (total 51 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   label:LYING_DOWN               2681 non-null   float64
 1   label:SITTING                  2681 non-null   float64
 2   label:FIX_walking              2681 non-null   float64
 3   label:FIX_running              2681 non-null   float64
 4   label:BICYCLING                0 non-null      float64
 5   label:SLEEPING                 2681 non-null   float64
 6   label:LAB_WORK                 0 non-null      float64
 7   label:IN_CLASS                 0 non-null      float64
 8   label:IN_A_MEETING             2681 non-null   float64
 9   label:LOC_main_workplace       2685 non-null   int64  
 10  label:OR_indoors               1581 non-null   float64
 11  label:OR_outside               1581 non-null   float64
 12  label:IN_A_CAR                 2681 non-null   f

Segundo a descrição do dataset:

**1: o rótulo é relevante para a amostra**  
**0: o rótulo não é relevante para a amostra**  
**nulo: informação faltante**

Vamos copiar o dataset.

In [9]:
#user1_eda = user1.loc[:, labels].dropna(axis=1, how='all')
user1_eda = user1
labels = user1_eda.loc[:, user1_eda.columns.str.startswith('label:')].columns.values
user1_eda.loc[:, labels].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2685 entries, 0 to 2684
Data columns (total 51 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   label:LYING_DOWN               2681 non-null   float64
 1   label:SITTING                  2681 non-null   float64
 2   label:FIX_walking              2681 non-null   float64
 3   label:FIX_running              2681 non-null   float64
 4   label:BICYCLING                0 non-null      float64
 5   label:SLEEPING                 2681 non-null   float64
 6   label:LAB_WORK                 0 non-null      float64
 7   label:IN_CLASS                 0 non-null      float64
 8   label:IN_A_MEETING             2681 non-null   float64
 9   label:LOC_main_workplace       2685 non-null   int64  
 10  label:OR_indoors               1581 non-null   float64
 11  label:OR_outside               1581 non-null   float64
 12  label:IN_A_CAR                 2681 non-null   f

Vários labels possuem 4 linhas nulas. Suspeitamos que sejam as mesmas linhas em todas essas colunas. Vamos examinar.

In [10]:
user1_eda[user1_eda.loc[:,'label:LYING_DOWN':'label:FIX_running'].isnull().any(axis=1)]
#user1_eda[user1_eda.loc[:, user1_eda.columns.str.startswith('label:')].isnull().any(axis=1)]
#user1_eda.columns.str.startswith('label:')

Unnamed: 0,timestamp,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,...,label:STAIRS_-_GOING_DOWN,label:ELEVATOR,label:OR_standing,label:AT_SCHOOL,label:PHONE_IN_HAND,label:PHONE_IN_BAG,label:PHONE_ON_TABLE,label:WITH_CO-WORKERS,label:WITH_FRIENDS,label_source
771,1464331339,1.009321,0.011919,-0.009118,0.019204,1.002906,1.00944,1.015879,2.048544,6.684542,...,,,,,,,,,,-1
893,1464381738,1.010567,0.00439,0.005478,0.009831,1.008065,1.010495,1.012967,1.526642,6.684602,...,,,,,,,,,,-1
900,1464384138,1.041899,0.288034,0.313004,0.478115,0.944796,1.004593,1.079433,2.068323,6.648032,...,,,,,,,,,,-1
2004,1464677950,1.007211,0.001778,-0.000707,0.002424,1.005946,1.007274,1.008397,2.276544,6.68461,...,,,,,,,,,,-1


Vamos eliminar essas 4 linhas do dataset.

In [11]:
indexes = user1_eda.index[user1_eda.loc[:,'label:LYING_DOWN':'label:FIX_running'].isnull().any(axis=1)]
user1_eda.drop(indexes, inplace=True)
user1_eda.loc[:, labels].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2681 entries, 0 to 2684
Data columns (total 51 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   label:LYING_DOWN               2681 non-null   float64
 1   label:SITTING                  2681 non-null   float64
 2   label:FIX_walking              2681 non-null   float64
 3   label:FIX_running              2681 non-null   float64
 4   label:BICYCLING                0 non-null      float64
 5   label:SLEEPING                 2681 non-null   float64
 6   label:LAB_WORK                 0 non-null      float64
 7   label:IN_CLASS                 0 non-null      float64
 8   label:IN_A_MEETING             2681 non-null   float64
 9   label:LOC_main_workplace       2681 non-null   int64  
 10  label:OR_indoors               1581 non-null   float64
 11  label:OR_outside               1581 non-null   float64
 12  label:IN_A_CAR                 2681 non-null   f

### Vamos verificar os labels mais comuns

In [12]:
plt.figure()
plt.rcParams.update({'font.size': 9})
plt.rcParams['figure.figsize']=(500, 200)

labels_df = user1_eda.loc[:,labels]
labels_df[labels_df == 1].count().sort_values(ascending=False).plot.bar()

<AxesSubplot:>

In [13]:
labels_df[labels_df == 1].count().sort_values(ascending=False)

label:PHONE_ON_TABLE             2076
label:SITTING                    1543
label:OR_indoors                 1455
label:LOC_home                   1437
label:LYING_DOWN                  748
label:TALKING                     653
label:SLEEPING                    633
label:LOC_main_workplace          591
label:PHONE_IN_POCKET             394
label:EATING                      310
label:WATCHING_TV                 257
label:SURFING_THE_INTERNET        242
label:OR_standing                 231
label:FIX_walking                 158
label:OR_outside                  126
label:WITH_FRIENDS                109
label:PHONE_IN_HAND               106
label:COMPUTER_WORK                85
label:WITH_CO-WORKERS              82
label:DRESSING                     57
label:COOKING                      47
label:WASHING_DISHES               46
label:ON_A_BUS                     42
label:GROOMING                     36
label:DRIVE_-_I_M_THE_DRIVER       35
label:TOILET                       35
label:AT_SCH

Vamos verificar se há colunas com nenhum valor 1.

In [14]:
labels_df.loc[:, (labels_df == 0).all()]

0
1
2
3
4
...
2680
2681
2682
2683
2684


In [15]:
# forma alternativa de checar a mesma informação

column_values = labels_df[['label:FIX_running']].values.ravel()
unique_values = pd.unique(column_values)
print(unique_values)

[0. 1.]


In [16]:
labels_df['label:FIX_running'].value_counts()

0.0    2680
1.0       1
Name: label:FIX_running, dtype: int64

In [17]:
labels_df.apply(pd.value_counts)

Unnamed: 0,label:LYING_DOWN,label:SITTING,label:FIX_walking,label:FIX_running,label:BICYCLING,label:SLEEPING,label:LAB_WORK,label:IN_CLASS,label:IN_A_MEETING,label:LOC_main_workplace,...,label:STAIRS_-_GOING_UP,label:STAIRS_-_GOING_DOWN,label:ELEVATOR,label:OR_standing,label:AT_SCHOOL,label:PHONE_IN_HAND,label:PHONE_IN_BAG,label:PHONE_ON_TABLE,label:WITH_CO-WORKERS,label:WITH_FRIENDS
0.0,1933,1138,2523,2680,,2048,,,2660,2090,...,2680,2680,,2450,2649,2470,,500,2599,2572
1.0,748,1543,158,1,,633,,,21,591,...,1,1,,231,32,106,,2076,82,109


### Da descrição do dataset, sabemos que os contextos são divididos em dois grupos:
#### Atividade principal (classes mutualmente exclusivas): lying down, sitting, standing in place, standing and moving, walking, running, bicycling
#### Atividade secundária: 109 rótulos adicionais que descrevem um contexto mais específico em diferentes aspectos

##### esportes (por exemplo, jogar basquete, na academia), transporte (por exemplo, dirigir - eu sou o motorista, no ônibus), necessidades básicas (por exemplo, dormir, comer, ir ao banheiro), companhia (por exemplo, com a família, com colegas de trabalho), localização (por exemplo, em casa, no trabalho, fora) etc.

##### Vários rótulos secundários podem ser aplicados a uma amostra.
##### Algumas amostras podem não ter nenhuma atividade principal selecionada, mas têm rótulos secundários (por exemplo, quando o usuário não se lembra se estava sentado ou caminhando, mas lembra que está dentro de casa).
##### Em média (mais de 60 usuários), uma amostra tem mais de 3 rótulos atribuídos a ela.
##### Em média, a distribuição de uso de rótulo de um usuário tem uma entropia de 3,9 bits, o que significa que um usuário típico usou principalmente ~ 15 rótulos durante o período de participação.

### Vamos separar as duas categorias de contextos e criar uma nova coluna com o contexto principal codificado

In [18]:
main_label = ['label:LYING_DOWN', 'label:SITTING', 'label:OR_standing', 'label:FIX_walking', 'label:FIX_running', 'label:BICYCLING']

secondary_label = ['label:SLEEPING',
       'label:LAB_WORK', 'label:IN_CLASS', 'label:IN_A_MEETING',
       'label:LOC_main_workplace', 'label:OR_indoors', 'label:OR_outside',
       'label:IN_A_CAR', 'label:ON_A_BUS', 'label:DRIVE_-_I_M_THE_DRIVER',
       'label:DRIVE_-_I_M_A_PASSENGER', 'label:LOC_home',
       'label:FIX_restaurant', 'label:PHONE_IN_POCKET',
       'label:OR_exercise', 'label:COOKING', 'label:SHOPPING',
       'label:STROLLING', 'label:DRINKING__ALCOHOL_',
       'label:BATHING_-_SHOWER', 'label:CLEANING', 'label:DOING_LAUNDRY',
       'label:WASHING_DISHES', 'label:WATCHING_TV',
       'label:SURFING_THE_INTERNET', 'label:AT_A_PARTY', 'label:AT_A_BAR',
       'label:LOC_beach', 'label:SINGING', 'label:TALKING',
       'label:COMPUTER_WORK', 'label:EATING', 'label:TOILET',
       'label:GROOMING', 'label:DRESSING', 'label:AT_THE_GYM',
       'label:STAIRS_-_GOING_UP', 'label:STAIRS_-_GOING_DOWN',
       'label:ELEVATOR', 'label:AT_SCHOOL',
       'label:PHONE_IN_HAND', 'label:PHONE_IN_BAG',
       'label:PHONE_ON_TABLE', 'label:WITH_CO-WORKERS',
       'label:WITH_FRIENDS']

In [19]:
plt.figure()
plt.rcParams.update({'font.size': 9})

main_labels_df = user1_eda.loc[:,main_label]
main_labels_df[labels_df == 1].count().sort_values(ascending=False).plot.bar()

main_labels_df[labels_df == 1].count().sort_values(ascending=False)

label:SITTING        1543
label:LYING_DOWN      748
label:OR_standing     231
label:FIX_walking     158
label:FIX_running       1
label:BICYCLING         0
dtype: int64

In [20]:
def gen_multi_label (row):
    label = 'label:'
    if row['label:LYING_DOWN'] == 1 :
      label += ':LYING_DOWN'
    if row['label:SITTING'] == 1 :
      label += ':SITTING'
    if row['label:OR_standing'] == 1 :
      label += ':OR_standing'
    if row['label:FIX_walking'] == 1 :
      label += ':FIX_walking'
    if row['label:FIX_running'] == 1 :
      label += ':FIX_running'
    if row['label:BICYCLING'] == 1 :
      label += ':BICYCLING'
    if row['label:SLEEPING'] == 1 :
      label += ':SLEEPING'
    if row['label:LAB_WORK'] == 1 :
      label += ':LAB_WORK'
    if row['label:IN_CLASS'] == 1 :
      label += ':IN_CLASS'
    if row['label:IN_A_MEETING'] == 1 :
      label += ':IN_A_MEETING'
    if row['label:LOC_main_workplace'] == 1 :
      label += ':LOC_main_workplace'
    if row['label:OR_indoors'] == 1 :
      label += ':OR_indoors'
    if row['label:OR_outside'] == 1 :
      label += ':OR_outside'
    if row['label:IN_A_CAR'] == 1 :
      label += ':IN_A_CAR'
    if row['label:ON_A_BUS'] == 1 :
      label += ':ON_A_BUS'
    if row['label:DRIVE_-_I_M_THE_DRIVER'] == 1 :
      label += ':DRIVE_-_I_M_THE_DRIVER'
    if row['label:DRIVE_-_I_M_A_PASSENGER'] == 1 :
      label += ':DRIVE_-_I_M_A_PASSENGER'
    if row['label:LOC_home'] == 1 :
      label += ':LOC_home'
    if row['label:FIX_restaurant'] == 1 :
      label += ':FIX_restaurant'
    if row['label:PHONE_IN_POCKET'] == 1 :
      label += ':PHONE_IN_POCKET'
    if row['label:OR_exercise'] == 1 :
      label += ':OR_exercise'
    if row['label:COOKING'] == 1 :
      label += ':COOKING'
    if row['label:SHOPPING'] == 1 :
      label += ':SHOPPING'
    if row['label:STROLLING'] == 1 :
      label += ':STROLLING'
    if row['label:DRINKING__ALCOHOL_'] == 1 :
      label += ':DRINKING__ALCOHOL_'
    if row['label:BATHING_-_SHOWER'] == 1 :
      label += ':BATHING_-_SHOWER'
    if row['label:CLEANING'] == 1 :
      label += ':CLEANING'
    if row['label:DOING_LAUNDRY'] == 1 :
      label += ':DOING_LAUNDRY'
    if row['label:WASHING_DISHES'] == 1 :
      label += ':WASHING_DISHES'
    if row['label:WATCHING_TV'] == 1 :
      label += ':WATCHING_TV'
    if row['label:SURFING_THE_INTERNET'] == 1 :
      label += ':SURFING_THE_INTERNET'
    if row['label:AT_A_PARTY'] == 1 :
      label += ':AT_A_PARTY'
    if row['label:AT_A_BAR'] == 1 :
      label += ':AT_A_BAR'
    if row['label:LOC_beach'] == 1 :
      label += ':LOC_beach'
    if row['label:SINGING'] == 1 :
      label += ':SINGING'
    if row['label:TALKING'] == 1 :
      label += ':TALKING'
    if row['label:COMPUTER_WORK'] == 1 :
      label += ':COMPUTER_WORK'
    if row['label:EATING'] == 1 :
      label += ':EATING'
    if row['label:TOILET'] == 1 :
      label += ':TOILET'
    if row['label:GROOMING'] == 1 :
      label += ':GROOMING'
    if row['label:DRESSING'] == 1 :
      label += ':DRESSING'
    if row['label:AT_THE_GYM'] == 1 :
      label += ':AT_THE_GYM'
    if row['label:STAIRS_-_GOING_UP'] == 1 :
      label += ':STAIRS_-_GOING_UP'
    if row['label:STAIRS_-_GOING_DOWN'] == 1 :
      label += ':STAIRS_-_GOING_DOWN'
    if row['label:ELEVATOR'] == 1 :
      label += ':ELEVATOR'
    if row['label:AT_SCHOOL'] == 1 :
      label += ':AT_SCHOOL'
    if row['label:PHONE_IN_HAND'] == 1 :
      label += ':PHONE_IN_HAND'
    if row['label:PHONE_IN_BAG'] == 1 :
      label += ':PHONE_IN_BAG'
    if row['label:PHONE_ON_TABLE'] == 1 :
      label += ':PHONE_ON_TABLE'
    if row['label:WITH_CO-WORKERS'] == 1 :
      label += ':WITH_CO-WORKERS'
    if row['label:WITH_FRIENDS'] == 1 :
      label += ':WITH_FRIENDS'

    return label

In [21]:
user1_eda['multi_label'] = user1_eda.apply (lambda row: gen_multi_label(row), axis=1)
user1_eda['multi_label'].value_counts()

label::LYING_DOWN:SLEEPING:OR_indoors:LOC_home:PHONE_ON_TABLE           616
label::SITTING:LOC_main_workplace:PHONE_ON_TABLE                        420
label::SITTING:OR_indoors:LOC_home:WATCHING_TV:EATING:PHONE_ON_TABLE    143
label::SITTING:OR_indoors:LOC_home:PHONE_ON_TABLE                        94
label::FIX_walking:OR_outside:PHONE_IN_POCKET:TALKING                    68
                                                                       ... 
label::FIX_walking:LOC_main_workplace:PHONE_IN_HAND                       1
label::OR_standing:OR_indoors:LOC_home:TALKING:PHONE_ON_TABLE             1
label::SITTING:OR_indoors:LOC_home:TOILET                                 1
label::SITTING:LOC_main_workplace:SURFING_THE_INTERNET                    1
label::OR_standing:OR_outside:PHONE_IN_POCKET:TALKING                     1
Name: multi_label, Length: 141, dtype: int64

In [22]:
user1_eda['multi_label'].unique()

array(['label::SITTING:TALKING:WITH_CO-WORKERS', 'label::SITTING',
       'label::SITTING:LOC_main_workplace:PHONE_ON_TABLE',
       'label::SITTING:LOC_main_workplace:SURFING_THE_INTERNET:PHONE_ON_TABLE',
       'label::FIX_walking:OR_indoors:PHONE_IN_POCKET:TOILET',
       'label::OR_standing:LOC_main_workplace:PHONE_IN_POCKET:TALKING',
       'label::FIX_walking:OR_outside:PHONE_IN_POCKET:STROLLING:TALKING',
       'label::FIX_walking:OR_outside:TALKING:PHONE_IN_HAND',
       'label::SITTING:ON_A_BUS:PHONE_IN_POCKET:TALKING',
       'label::SITTING:OR_indoors:ON_A_BUS:LOC_home:PHONE_IN_POCKET:TALKING',
       'label::OR_standing:OR_indoors:LOC_home:TALKING:PHONE_ON_TABLE',
       'label::OR_standing:OR_indoors:LOC_home:COOKING:WATCHING_TV:TALKING',
       'label::SITTING:OR_indoors:LOC_home:WATCHING_TV:TALKING:EATING:PHONE_ON_TABLE',
       'label::SITTING:OR_indoors:LOC_home:WATCHING_TV:TALKING:PHONE_ON_TABLE',
       'label::OR_standing:OR_indoors:LOC_home:WASHING_DISHES:PHONE_ON_

#### Número de classes *multi-label* geradas

In [23]:
len(user1_eda['multi_label'].unique())

141

In [24]:
#%matplotlib inline
#user1_eda['context'].value_counts().plot.bar()

In [25]:
user1_eda['label_source'].value_counts()

2    1998
1     451
4     185
6      31
5      16
Name: label_source, dtype: int64

In [26]:
user1_eda['label_source'].unique()

array([2, 6, 4, 1, 5])

In [27]:
user1_eda

Unnamed: 0,timestamp,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,...,label:ELEVATOR,label:OR_standing,label:AT_SCHOOL,label:PHONE_IN_HAND,label:PHONE_IN_BAG,label:PHONE_ON_TABLE,label:WITH_CO-WORKERS,label:WITH_FRIENDS,label_source,multi_label
0,1464129912,1.011438,0.012573,0.023013,0.041240,1.006184,1.010868,1.016028,0.935099,6.684536,...,,0.0,0.0,,,,1.0,0.0,2,label::SITTING:TALKING:WITH_CO-WORKERS
1,1464129950,1.011233,0.009356,-0.005622,0.016687,1.006338,1.010926,1.016657,1.732968,6.684569,...,,0.0,0.0,,,,1.0,0.0,2,label::SITTING:TALKING:WITH_CO-WORKERS
2,1464130031,1.013422,0.018068,-0.008593,0.039286,1.004077,1.012983,1.021926,1.464639,6.684453,...,,0.0,0.0,,,,1.0,0.0,2,label::SITTING:TALKING:WITH_CO-WORKERS
3,1464130109,1.014891,0.016400,0.021383,0.038825,1.005934,1.014670,1.023000,1.440043,6.684483,...,,0.0,0.0,,,,1.0,0.0,2,label::SITTING:TALKING:WITH_CO-WORKERS
4,1464130130,1.017487,0.022632,-0.012891,0.037226,1.006040,1.017587,1.028168,1.937362,6.684364,...,,0.0,0.0,,,,1.0,0.0,2,label::SITTING:TALKING:WITH_CO-WORKERS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2680,1464886422,1.002114,0.043888,-0.023421,0.060480,0.975662,1.001421,1.028555,2.562570,6.683649,...,,0.0,0.0,0.0,,0.0,0.0,0.0,1,label::SITTING:ON_A_BUS:PHONE_IN_POCKET:TALKING
2681,1464886722,1.015957,0.068057,-0.030021,0.108892,0.980304,1.015067,1.050255,2.055226,6.682352,...,,0.0,0.0,0.0,,0.0,0.0,0.0,1,label::SITTING:ON_A_BUS:PHONE_IN_POCKET:TALKING
2682,1464887023,0.996011,0.039554,-0.034651,0.065742,0.976391,0.998637,1.018739,1.989673,6.683814,...,,0.0,0.0,0.0,,0.0,0.0,0.0,1,label::SITTING:ON_A_BUS:PHONE_IN_POCKET:TALKING
2683,1464887322,1.079283,0.404817,0.330815,0.569329,0.907679,1.001940,1.298900,2.453154,6.612619,...,,0.0,0.0,0.0,,0.0,0.0,0.0,4,label::FIX_walking:OR_outside:PHONE_IN_POCKET:...


### Vamos analisar alguns dados dos sensores

In [28]:
sensors = ['raw_acc:magnitude_stats:mean', 
 'raw_acc:magnitude_stats:std',
 'raw_acc:magnitude_stats:moment3',
 'raw_acc:magnitude_stats:moment4',
 'raw_acc:magnitude_stats:percentile25',
 'raw_acc:magnitude_stats:percentile50',
 'raw_acc:magnitude_stats:percentile75',
 'raw_acc:magnitude_stats:value_entropy',
 'raw_acc:magnitude_stats:time_entropy',
 'raw_acc:magnitude_spectrum:log_energy_band0',
 'raw_acc:magnitude_spectrum:log_energy_band1',
 'raw_acc:magnitude_spectrum:log_energy_band2',
 'raw_acc:magnitude_spectrum:log_energy_band3',
 'raw_acc:magnitude_spectrum:log_energy_band4',
 'raw_acc:magnitude_spectrum:spectral_entropy',
 'raw_acc:magnitude_autocorrelation:period',
 'raw_acc:magnitude_autocorrelation:normalized_ac',
 'raw_acc:3d:mean_x',
 'raw_acc:3d:mean_y',
 'raw_acc:3d:mean_z',
 'raw_acc:3d:std_x',
 'raw_acc:3d:std_y',
 'raw_acc:3d:std_z',
 'raw_acc:3d:ro_xy',
 'raw_acc:3d:ro_xz',
 'raw_acc:3d:ro_yz',
 'proc_gyro:magnitude_stats:mean',
 'proc_gyro:magnitude_stats:std',
 'proc_gyro:magnitude_stats:moment3',
 'proc_gyro:magnitude_stats:moment4',
 'proc_gyro:magnitude_stats:percentile25',
 'proc_gyro:magnitude_stats:percentile50',
 'proc_gyro:magnitude_stats:percentile75',
 'proc_gyro:magnitude_stats:value_entropy',
 'proc_gyro:magnitude_stats:time_entropy',
 'proc_gyro:magnitude_spectrum:log_energy_band0',
 'proc_gyro:magnitude_spectrum:log_energy_band1',
 'proc_gyro:magnitude_spectrum:log_energy_band2',
 'proc_gyro:magnitude_spectrum:log_energy_band3',
 'proc_gyro:magnitude_spectrum:log_energy_band4',
 'proc_gyro:magnitude_spectrum:spectral_entropy',
 'proc_gyro:magnitude_autocorrelation:period',
 'proc_gyro:magnitude_autocorrelation:normalized_ac',
 'proc_gyro:3d:mean_x',
 'proc_gyro:3d:mean_y',
 'proc_gyro:3d:mean_z',
 'proc_gyro:3d:std_x',
 'proc_gyro:3d:std_y',
 'proc_gyro:3d:std_z',
 'proc_gyro:3d:ro_xy',
 'proc_gyro:3d:ro_xz',
 'proc_gyro:3d:ro_yz',
 'raw_magnet:magnitude_stats:mean',
 'raw_magnet:magnitude_stats:std',
 'raw_magnet:magnitude_stats:moment3',
 'raw_magnet:magnitude_stats:moment4',
 'raw_magnet:magnitude_stats:percentile25',
 'raw_magnet:magnitude_stats:percentile50',
 'raw_magnet:magnitude_stats:percentile75',
 'raw_magnet:magnitude_stats:value_entropy',
 'raw_magnet:magnitude_stats:time_entropy',
 'raw_magnet:magnitude_spectrum:log_energy_band0',
 'raw_magnet:magnitude_spectrum:log_energy_band1',
 'raw_magnet:magnitude_spectrum:log_energy_band2',
 'raw_magnet:magnitude_spectrum:log_energy_band3',
 'raw_magnet:magnitude_spectrum:log_energy_band4',
 'raw_magnet:magnitude_spectrum:spectral_entropy',
 'raw_magnet:magnitude_autocorrelation:period',
 'raw_magnet:magnitude_autocorrelation:normalized_ac',
 'raw_magnet:3d:mean_x',
 'raw_magnet:3d:mean_y',
 'raw_magnet:3d:mean_z',
 'raw_magnet:3d:std_x',
 'raw_magnet:3d:std_y',
 'raw_magnet:3d:std_z',
 'raw_magnet:3d:ro_xy',
 'raw_magnet:3d:ro_xz',
 'raw_magnet:3d:ro_yz',
 'raw_magnet:avr_cosine_similarity_lag_range0',
 'raw_magnet:avr_cosine_similarity_lag_range1',
 'raw_magnet:avr_cosine_similarity_lag_range2',
 'raw_magnet:avr_cosine_similarity_lag_range3',
 'raw_magnet:avr_cosine_similarity_lag_range4',
 'watch_acceleration:magnitude_stats:mean',
 'watch_acceleration:magnitude_stats:std',
 'watch_acceleration:magnitude_stats:moment3',
 'watch_acceleration:magnitude_stats:moment4',
 'watch_acceleration:magnitude_stats:percentile25',
 'watch_acceleration:magnitude_stats:percentile50',
 'watch_acceleration:magnitude_stats:percentile75',
 'watch_acceleration:magnitude_stats:value_entropy',
 'watch_acceleration:magnitude_stats:time_entropy',
 'watch_acceleration:magnitude_spectrum:log_energy_band0',
 'watch_acceleration:magnitude_spectrum:log_energy_band1',
 'watch_acceleration:magnitude_spectrum:log_energy_band2',
 'watch_acceleration:magnitude_spectrum:log_energy_band3',
 'watch_acceleration:magnitude_spectrum:log_energy_band4',
 'watch_acceleration:magnitude_spectrum:spectral_entropy',
 'watch_acceleration:magnitude_autocorrelation:period',
 'watch_acceleration:magnitude_autocorrelation:normalized_ac',
 'watch_acceleration:3d:mean_x',
 'watch_acceleration:3d:mean_y',
 'watch_acceleration:3d:mean_z',
 'watch_acceleration:3d:std_x',
 'watch_acceleration:3d:std_y',
 'watch_acceleration:3d:std_z',
 'watch_acceleration:3d:ro_xy',
 'watch_acceleration:3d:ro_xz',
 'watch_acceleration:3d:ro_yz',
 'watch_acceleration:spectrum:x_log_energy_band0',
 'watch_acceleration:spectrum:x_log_energy_band1',
 'watch_acceleration:spectrum:x_log_energy_band2',
 'watch_acceleration:spectrum:x_log_energy_band3',
 'watch_acceleration:spectrum:x_log_energy_band4',
 'watch_acceleration:spectrum:y_log_energy_band0',
 'watch_acceleration:spectrum:y_log_energy_band1',
 'watch_acceleration:spectrum:y_log_energy_band2',
 'watch_acceleration:spectrum:y_log_energy_band3',
 'watch_acceleration:spectrum:y_log_energy_band4',
 'watch_acceleration:spectrum:z_log_energy_band0',
 'watch_acceleration:spectrum:z_log_energy_band1',
 'watch_acceleration:spectrum:z_log_energy_band2',
 'watch_acceleration:spectrum:z_log_energy_band3',
 'watch_acceleration:spectrum:z_log_energy_band4',
 'watch_acceleration:relative_directions:avr_cosine_similarity_lag_range0',
 'watch_acceleration:relative_directions:avr_cosine_similarity_lag_range1',
 'watch_acceleration:relative_directions:avr_cosine_similarity_lag_range2',
 'watch_acceleration:relative_directions:avr_cosine_similarity_lag_range3',
 'watch_acceleration:relative_directions:avr_cosine_similarity_lag_range4',
 'watch_heading:mean_cos',
 'watch_heading:std_cos',
 'watch_heading:mom3_cos',
 'watch_heading:mom4_cos',
 'watch_heading:mean_sin',
 'watch_heading:std_sin',
 'watch_heading:mom3_sin',
 'watch_heading:mom4_sin',
 'watch_heading:entropy_8bins']

sensors_acc = ['raw_acc:magnitude_stats:mean', 
 'raw_acc:3d:mean_x',
 'raw_acc:3d:mean_y',
 'raw_acc:3d:mean_z']
 
sensors_gyro = ['proc_gyro:magnitude_stats:mean',
 'proc_gyro:3d:mean_x',
 'proc_gyro:3d:mean_y',
 'proc_gyro:3d:mean_z']

sensors_magnet = ['raw_magnet:magnitude_stats:mean',
 'raw_magnet:3d:mean_x',
 'raw_magnet:3d:mean_y',
 'raw_magnet:3d:mean_z']

sensors_watch_acc = ['watch_acceleration:magnitude_stats:mean',
 'watch_acceleration:3d:mean_x',
 'watch_acceleration:3d:mean_y',
 'watch_acceleration:3d:mean_z']

sensors_watch_heading = ['watch_heading:mean_cos',
 'watch_heading:mean_sin']

# Projeto de um modelo de classificação

### Vamos iniciar com o projeto de um classificador multiclasses, considerando somente as classes principais, por ser mais simples. Vamos relembrar a distribuição dessas classes.

In [29]:
user1_eda.shape

(2681, 279)

In [30]:
user1_model1_pre = user1_eda
main_labels_df = user1_model1_pre.loc[:,main_label]
main_labels_df[labels_df == 1].count().sort_values(ascending=False)

label:SITTING        1543
label:LYING_DOWN      748
label:OR_standing     231
label:FIX_walking     158
label:FIX_running       1
label:BICYCLING         0
dtype: int64

### Vamos eliminar as colunas de labels que representam menos que duas amostras do dataset, que é o mínimo exigido por um classificador.

In [31]:
user1_model1_pre.drop(user1_model1_pre.loc[user1_model1_pre['label:FIX_running']==1].index, inplace=True)
user1_model1_pre.drop(columns=['label:FIX_running', 'label:BICYCLING'], inplace=True)
user1_model1_pre.shape

(2680, 277)

In [32]:
def mail_label_ID (row):
    if row['label:SITTING'] == 1 :
      return '0'
    if row['label:LYING_DOWN'] == 1 :
      return '1'
    if row['label:OR_standing'] == 1 :
      return '2'
    if row['label:FIX_walking'] == 1 :
      return '3'
        
    return '-1'

In [33]:
user1_model1_pre['main_label'] = user1_model1_pre.apply (lambda row: mail_label_ID(row), axis=1)
user1_model1_pre['main_label'].value_counts()

0    1543
1     748
2     231
3     158
Name: main_label, dtype: int64

## Seleção de atributos

#### Nesse primeiro experimento, vamos selecionar somente os sinais do acelerômetro e do giroscópio do smartphone.

## Vamos agora projetar um classificador multiclasses, considerando somente as classes principais, por ser mais simples. 

### Vamos separar os dados pertinentes.

In [34]:
user1_multiclass_acc_3d = pd.concat([user1_model1_pre.loc[:, user1.columns.str.startswith('raw_acc:3d:')], user1_model1_pre['main_label']], axis=1)

In [35]:
from sklearn.model_selection import train_test_split

X_train_multiclass_acc_3d, X_test__multiclass_acc_3d, y_train_multiclass_acc_3d, y_test_multiclass_acc_3d = train_test_split(user1_multiclass_acc_3d.drop(columns=['main_label']), user1_multiclass_acc_3d['main_label'], stratify=user1_multiclass_acc_3d['main_label'], test_size=0.25, random_state=42)

In [36]:
X_train_multiclass_acc_3d.shape

(2010, 9)

In [37]:
y_train_multiclass_acc_3d.shape

(2010,)

### Criação do conjunto de teste e do conjunto de treino
#### Separamos os dois conjuntos de forma estratificada, pelo atributo *main label*, uma vez que se trata de um dataset desbalanceado.

## Vamos agora projetar um classificador multiclasses, considerando somente as classes principais, por ser mais simples. 

### Vamos separar os dados pertinentes.

In [38]:
user1_multiclass = pd.concat([user1_model1_pre.loc[:, user1.columns.str.startswith('raw_acc')], user1_model1_pre['main_label']], axis=1)

In [39]:
from sklearn.model_selection import train_test_split

X_train_multiclass, X_test__multiclass, y_train_multiclass, y_test_multiclass = train_test_split(user1_multiclass.drop(columns=['main_label']), user1_multiclass['main_label'], stratify=user1_multiclass['main_label'], test_size=0.25, random_state=42)

In [40]:
X_train_multiclass.shape

(2010, 26)

In [41]:
y_train_multiclass.shape

(2010,)

### Vamos criar um pipeline para pré-processamento e treino dos modelos

### MLP

### Separando os dados (x, y, z) do acelerômetro, giroscópio e magnetômetro do smartphone

In [42]:
acc_3d = user1_model1_pre.loc[:, user1.columns.str.startswith('raw_acc:3d:')]
gyro_3d = user1_model1_pre.loc[:, user1.columns.str.startswith('proc_gyro:3d:')]
magnet_3d = user1_model1_pre.loc[:, user1.columns.str.startswith('raw_magnet:3d:')]

user1_multiclass_acc_gyro_magnet = pd.concat([acc_3d, gyro_3d], axis=1)
user1_multiclass_acc_gyro_magnet = pd.concat([user1_multiclass_acc_gyro_magnet, magnet_3d], axis=1)
user1_multiclass_acc_gyro_magnet = pd.concat([user1_multiclass_acc_gyro_magnet, user1_model1_pre['main_label']], axis=1)

user1_multiclass_acc_gyro_magnet.dropna(inplace=True) # removendo dados faltantes

In [43]:
from sklearn.model_selection import train_test_split

X_train_multiclass_acc_gyro_magnet, X_test_multiclass_acc_gyro_magnet, y_train_multiclass_acc_gyro_magnet, y_test_multiclass_acc_gyro_magnet = train_test_split(user1_multiclass_acc_gyro_magnet.drop(columns=['main_label']), user1_multiclass_acc_gyro_magnet['main_label'], stratify=user1_multiclass_acc_gyro_magnet['main_label'], test_size=0.25, random_state=42)

In [44]:
X_train_multiclass_acc_gyro_magnet.shape

(1788, 27)

In [45]:
X_test_multiclass_acc_gyro_magnet.shape

(597, 27)

In [46]:
y_train_multiclass_acc_gyro_magnet.shape

(1788,)

In [47]:
y_test_multiclass_acc_gyro_magnet.shape

(597,)

In [48]:
# vamos salvar os dois conjuntos de dados
X_train_multiclass_acc_gyro_magnet.to_csv('../input/X_train_multiclass_acc_gyro_magnet.csv', index=False)
X_test_multiclass_acc_gyro_magnet.to_csv('../input/X_test_multiclass_acc_gyro_magnet.csv', index=False)

y_train_multiclass_acc_gyro_magnet.to_csv('../input/y_train_multiclass_acc_gyro_magnet.csv', index=False)
y_test_multiclass_acc_gyro_magnet.to_csv('../input/y_test_multiclass_acc_gyro_magnet.csv', index=False)

## Vamos criar um pipeline para pré-processamento e treino do modelo multi-classes, usando os dados (x, y, z) do acelerômetro, giroscópio e magnetômetro do smartphone

#### Vamos usar os seguintes algoritmos de classificação:
**Gaussian Naive Bayes**  
**Logistic regression**  
**Logistic regression CV**  
**Máquina de vetor de suporte**  
**MLP**

## TF

In [49]:
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical


from sklearn import preprocessing
from sklearn.metrics import classification_report

In [50]:
def avg_multilabel_BA(y_true, y_pred):
    ba_array = []
    for i in range(y_pred.shape[1]):
        report = classification_report(y_true[:, i], (y_pred[:, i] > 0.5), output_dict=True, zero_division=0)
        #print(report)
        sensitivity = report['1.0']['recall'] # tp / (tp + fn)
        specificity = report['0.0']['recall'] #specificity = tn / (tn+fp)
        ba = 0.5*(specificity+sensitivity)
        ba_array.append(ba)
        #ba_array.append(balanced_accuracy_score((Y_pred[:, i] > 0.5), Y_test[:, i]))
    return np.mean(ba_array)

In [55]:

# Set the input shape
#input_shape = X_train_multiclass_acc_gyro_magnet.shape
#print(f'Feature shape: {input_shape}')
labels= ['label:SITTING', 'label:LYING_DOWN','label:OR_standing', 'label:FIX_walking']

raw = pd.read_csv('../input/user1.features_labels.csv')
raw = raw.dropna(subset=labels)

x = raw[raw.columns.drop(raw.filter(regex='label:'))]
y = raw.filter(regex='label:')
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.25, random_state=42)

#X_train = X_train_multiclass_acc_gyro_magnet.dropna()
input_shape = X_train.shape
print(f'Feature shape: {input_shape}')

X_train.to_numpy().reshape(input_shape[0], input_shape[1])

min_max_scaler = preprocessing.MinMaxScaler()
X_train_norm = pd.DataFrame(min_max_scaler.fit_transform(X_train))


#a = tf.strings.to_number(Y_train).numpy().astype(np.int32)
num_classes = 4
#Y_train = to_categorical(,num_classes = num_classes)


# Create the model
model = Sequential()
model.add(Dense(16, input_dim=input_shape[1], activation='relu', activity_regularizer=tf.keras.regularizers.l2(0.01)))
#model.add(Dense(16, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# Configure the model and start training
ada = tf.keras.optimizers.Adagrad(learning_rate=0.1)
sgd = tf.keras.optimizers.SGD(learning_rate=0.1, decay=1e-2, momentum=0.5)

model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['categorical_accuracy'])
model.fit(X_train_norm, Y_train, epochs=40, batch_size=10, verbose=1, validation_split=0.2)



Feature shape: (2010, 227)


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [None]:
X_test = X_test_multiclass_acc_gyro_magnet

X_test_norm = pd.DataFrame(min_max_scaler.transform(X_test))


b = tf.strings.to_number(y_test_multiclass_acc_gyro_magnet).numpy().astype(np.int32)
Y_test = to_categorical(b,num_classes = num_classes)

# Test the model after training
test_results = model.evaluate(X_test_norm, Y_test, verbose=1)
print(f'Test results - Loss: {test_results[0]} - Accuracy: {test_results[1]}%')

Test results - Loss: 0.3957729637622833 - Accuracy: 0.6649916172027588%


In [None]:
Y_pred = model.predict(X_test_norm)
#Y_pred = np.argmax(Y_pred_prob,axis=1)
#Y_pred

In [None]:
avg_multilabel_BA(Y_test, Y_pred)


0.5489454948623735

In [None]:
import es_utils as utils

In [None]:
mlp = utils.MLPMultilabel(input_shape[1], num_classes)

In [None]:
mlp.train(X_train_norm, Y_train)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [None]:
mlp.evaluate(X_test_norm, Y_test)

Test results - Loss: 0.4075752794742584 - Accuracy: 0.6046901345252991%
Averaged Balanced Accuracy: 0.509194


([0.4075752794742584, 0.6046901345252991], 0.5091941143770412)

## testando com todos os sensores

In [None]:
X = user1_model1_pre[user1_model1_pre.columns.drop(user1_model1_pre.filter(regex='label:'))]
X.drop(columns=['multi_label', 'main_label'], inplace=True)
Y = user1_model1_pre.filter(regex='label:')
Y = Y[['label:SITTING', 'label:LYING_DOWN','label:OR_standing', 'label:FIX_walking']]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [None]:
Y

Unnamed: 0,label:SITTING,label:LYING_DOWN,label:OR_standing,label:FIX_walking
0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0
...,...,...,...,...
2680,1.0,0.0,0.0,0.0
2681,1.0,0.0,0.0,0.0
2682,1.0,0.0,0.0,0.0
2683,0.0,0.0,0.0,1.0


In [None]:
X

Unnamed: 0,timestamp,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,...,lf_measurements:temperature_ambient,discrete:time_of_day:between0and6,discrete:time_of_day:between3and9,discrete:time_of_day:between6and12,discrete:time_of_day:between9and15,discrete:time_of_day:between12and18,discrete:time_of_day:between15and21,discrete:time_of_day:between18and24,discrete:time_of_day:between21and3,label_source
0,1464129912,1.011438,0.012573,0.023013,0.041240,1.006184,1.010868,1.016028,0.935099,6.684536,...,,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2
1,1464129950,1.011233,0.009356,-0.005622,0.016687,1.006338,1.010926,1.016657,1.732968,6.684569,...,,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2
2,1464130031,1.013422,0.018068,-0.008593,0.039286,1.004077,1.012983,1.021926,1.464639,6.684453,...,,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2
3,1464130109,1.014891,0.016400,0.021383,0.038825,1.005934,1.014670,1.023000,1.440043,6.684483,...,,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2
4,1464130130,1.017487,0.022632,-0.012891,0.037226,1.006040,1.017587,1.028168,1.937362,6.684364,...,,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2680,1464886422,1.002114,0.043888,-0.023421,0.060480,0.975662,1.001421,1.028555,2.562570,6.683649,...,,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1
2681,1464886722,1.015957,0.068057,-0.030021,0.108892,0.980304,1.015067,1.050255,2.055226,6.682352,...,,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1
2682,1464887023,0.996011,0.039554,-0.034651,0.065742,0.976391,0.998637,1.018739,1.989673,6.683814,...,,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1
2683,1464887322,1.079283,0.404817,0.330815,0.569329,0.907679,1.001940,1.298900,2.453154,6.612619,...,,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,4


In [None]:
dfeeder = utils.DataProcessingExtrasensory(X, Y)

x_train, x_test, y_train, y_test = dfeeder.split_train_test()
mlp2 = utils.MLPMultilabel(input_shape[1], num_classes)
mlp2.train(X_train_norm, Y_train)

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [None]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=0.2, random_state=42)


In [None]:
type(y_train['label:OR_standing'].iloc[0])


numpy.float64

In [None]:
def get_x_y_from_raw(raw):
    x = raw[raw.columns.drop(raw.filter(regex='label:'))]
    #x.drop(columns=['multi_label', 'main_label'], inplace=True)
    y = raw.filter(regex='label:')
    #y = y[['label:SITTING', 'label:LYING_DOWN','label:OR_standing', 'label:FIX_walking']]
    return x, y
    
def select_labels(y, labels : list):
    return y[labels]

In [None]:
x, y = get_x_y_from_raw(user1_model1_pre)
y = select_labels(y, ['label:SITTING', 'label:LYING_DOWN','label:OR_standing', 'label:FIX_walking'])
y

Unnamed: 0,label:SITTING,label:LYING_DOWN,label:OR_standing,label:FIX_walking
0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0
...,...,...,...,...
2680,1.0,0.0,0.0,0.0
2681,1.0,0.0,0.0,0.0
2682,1.0,0.0,0.0,0.0
2683,0.0,0.0,0.0,1.0


In [None]:
user1_eda


Unnamed: 0,timestamp,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,...,label:OR_standing,label:AT_SCHOOL,label:PHONE_IN_HAND,label:PHONE_IN_BAG,label:PHONE_ON_TABLE,label:WITH_CO-WORKERS,label:WITH_FRIENDS,label_source,multi_label,main_label
0,1464129912,1.011438,0.012573,0.023013,0.041240,1.006184,1.010868,1.016028,0.935099,6.684536,...,0.0,0.0,,,,1.0,0.0,2,label::SITTING:TALKING:WITH_CO-WORKERS,0
1,1464129950,1.011233,0.009356,-0.005622,0.016687,1.006338,1.010926,1.016657,1.732968,6.684569,...,0.0,0.0,,,,1.0,0.0,2,label::SITTING:TALKING:WITH_CO-WORKERS,0
2,1464130031,1.013422,0.018068,-0.008593,0.039286,1.004077,1.012983,1.021926,1.464639,6.684453,...,0.0,0.0,,,,1.0,0.0,2,label::SITTING:TALKING:WITH_CO-WORKERS,0
3,1464130109,1.014891,0.016400,0.021383,0.038825,1.005934,1.014670,1.023000,1.440043,6.684483,...,0.0,0.0,,,,1.0,0.0,2,label::SITTING:TALKING:WITH_CO-WORKERS,0
4,1464130130,1.017487,0.022632,-0.012891,0.037226,1.006040,1.017587,1.028168,1.937362,6.684364,...,0.0,0.0,,,,1.0,0.0,2,label::SITTING:TALKING:WITH_CO-WORKERS,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2680,1464886422,1.002114,0.043888,-0.023421,0.060480,0.975662,1.001421,1.028555,2.562570,6.683649,...,0.0,0.0,0.0,,0.0,0.0,0.0,1,label::SITTING:ON_A_BUS:PHONE_IN_POCKET:TALKING,0
2681,1464886722,1.015957,0.068057,-0.030021,0.108892,0.980304,1.015067,1.050255,2.055226,6.682352,...,0.0,0.0,0.0,,0.0,0.0,0.0,1,label::SITTING:ON_A_BUS:PHONE_IN_POCKET:TALKING,0
2682,1464887023,0.996011,0.039554,-0.034651,0.065742,0.976391,0.998637,1.018739,1.989673,6.683814,...,0.0,0.0,0.0,,0.0,0.0,0.0,1,label::SITTING:ON_A_BUS:PHONE_IN_POCKET:TALKING,0
2683,1464887322,1.079283,0.404817,0.330815,0.569329,0.907679,1.001940,1.298900,2.453154,6.612619,...,0.0,0.0,0.0,,0.0,0.0,0.0,4,label::FIX_walking:OR_outside:PHONE_IN_POCKET:...,3
