# Process Files

Este notebook é responsável por realizar pré-processamento e limpeza nos dados.

In [1]:
import os
import re
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
'''Read data about control group'''
def read_control():
    #Path to data
    data_path = os.path.dirname(os.getcwd()) + '/Data/handwrite' + '/control/'

    #List files in data_patabsh
    filenames = os.listdir(data_path)

    #Dataframe with all control's people
    df_control = pd.DataFrame()

    for file in filenames:
        #Read data
        df = pd.read_csv(data_path + file,sep=';',header=None,names=['X','Y','Z','pressure','gripAngle','timestamp','test_id'])

        #Set an id 
        df['id'] = file.split('.')[0]

        #Set that the person does not have parkinson
        df['parkinson'] = False

        #Concat results
        df_control = pd.concat([df_control,df],axis=0)
        
    return df_control
        
'''Read data about parkinson group'''
def read_parkinson():
    #Path to data
    data_path = os.path.dirname(os.getcwd()) + '/Data/handwrite' + '/parkinson/'

    #List files in data_patabsh
    filenames = os.listdir(data_path)

    #Dataframe with all control's people
    df_parkinson = pd.DataFrame()

    for file in filenames:
        #Read data
        df = pd.read_csv(data_path + file,sep=';',header=None,names=['X','Y','Z','pressure','gripAngle','timestamp','test_id'])

        #Set an id 
        df['id'] = file.split('.')[0]

        #Set that the person does not have parkinson
        df['parkinson'] = True

        #Concat results
        df_parkinson = pd.concat([df_parkinson,df],axis=0)
    
    return df_parkinson

'''Process files from dataset handwrite'''
def process_hw():
    #Read two datasets
    df_control = read_control()
    df_parkinson = read_parkinson()

    #Concat two datasets
    df_parkinson = pd.concat([df_parkinson,df_control])

    #Select just the columns that are necessary
    df_parkinson = df_parkinson.loc[:,['id','test_id','X','Y','timestamp','parkinson']]

    #Sort values
    df_parkinson.sort_values(['id','test_id','timestamp'],inplace=True)

    #Save result
    df_parkinson.to_csv(os.path.dirname(os.getcwd()) + '/Data/handwrite/' + 'parkinson_hw.csv',index=False)
    

'''Process each user's file'''
def process_user():
    #Path to user's files
    path = os.path.dirname(os.getcwd()) + '/Data/tappy-keystroke' + '/Archived users/'

    #List files in path
    filename =  os.listdir(path)
    filename.sort()

    #Create dataframe to save information
    df_parkinson = pd.DataFrame(columns=['birthyear','gender','parkinson','tremor','diagnosisYear','sided','UPDRS','impact','levadopa','DA','MAOB','other'])

    #Identifier of user
    id_user = []

    for file in filename:

        #Read file
        with open(path + file) as f:
            user = f.readlines()

        #Extract information
        user = list(map(lambda x: x.split(': ')[1].replace('\n',''),user))
        id_user.append(file.split('_')[1].split('.')[0])

        #Fix dataframe
        user = pd.DataFrame(user).transpose()
        user.columns = df_parkinson.columns

        #Concat result
        df_parkinson = pd.concat([df_parkinson,user],axis=0)

    #ID like index
    df_parkinson['id'] = id_user
    df_parkinson.set_index('id',inplace=True)

    #Save data
    df_parkinson.to_csv(os.path.dirname(os.getcwd()) + '/Data/tappy-keystroke/user_information.csv')
    
'''Process each tappy's file'''
def process_tappy():
    #Path to tappy data's files
    path = os.path.dirname(os.getcwd()) + '/Data/tappy-keystroke' + '/Tappy Data/'

    #List files in path
    filename =  os.listdir(path)
    filename.sort()

    #Create dataframe with result
    df_parkinson = pd.DataFrame(columns=['id','date','timestamp','hand','hold_time','direction','latency','flight'])

    for index,file in enumerate(filename):
        print(index)

        try:
            #Read data
            df_tmp = pd.read_csv(path+file,sep='\t',usecols=[0,1,2,3,4,5,6,7],low_memory=False)    
            df_tmp.columns = df_parkinson.columns

            #Concat result
            df_parkinson = pd.concat([df_parkinson,df_tmp],axis=0)
        except:
            pass

    #ID like index
    df_parkinson.set_index('id',inplace=True)
        
    #Save data
    df_parkinson.to_csv(os.path.dirname(os.getcwd()) + '/Data/tappy-keystroke/tappy_data.csv')
    
'''Perform a clean in dataset tappy'''
def clean_tappy():
    
    #Path to data
    data_path = os.path.dirname(os.getcwd()) + '/Data/tappy-keystroke/'
    
    #Read dataset
    df_tappy = pd.read_csv(data_path + 'tappy_data.csv',usecols=[0,3,4,5,6,7])

    #Filter rows with problem
    df_tappy = df_tappy.loc[df_tappy['hand'].isin(['L','R','S'])]
    df_tappy = df_tappy.loc[df_tappy['direction'].isin(['LL', 'LS', 'SL', 'LR', 'RR', 'RL', 'RS', 'SR', 'SS'])]

    #Create regular expression
    prog =re.compile('^[0-9]+\.[0-9]*$')

    #Fix columns
    df_tappy['latency'] = df_tappy['latency'].map(lambda x: x if type(x) == float else (float(x) if prog.match(x) is not None else np.nan))
    df_tappy['flight'] = df_tappy['flight'].map(lambda x: x if type(x) == float else (float(x) if prog.match(x) is not None else np.nan))

    #Change type
    df_tappy.loc[:,'hold_time'] = df_tappy.loc[:,'hold_time'].astype(float)

    #Exclude negative numbers in hold_time
    df_tappy = df_tappy[df_tappy['hold_time'] > 0]
    
    #Filter data to exclude incorrect information
    df_tappy = df_tappy.loc[(df_tappy['latency']-df_tappy['hold_time']>0),:]
    df_tappy = df_tappy.loc[df_tappy['latency'] < 2000,:]

    #Have one row in dataset that doesn't have id but we infer that is QEYMRM1ZSM because of sequence
    df_tappy['id'] = df_tappy['id'].fillna('QEYMRM1ZSM')

    #Save new data
    df_tappy.to_csv(data_path + 'tappy_data_cleaned.csv',index=False)
    
'''Perform clean in dataset user information'''    
def clean_user():
    
    #Path to data
    data_path = os.path.dirname(os.getcwd()) + '/Data/tappy-keystroke/'
    
    #Read data
    df_user = pd.read_csv(data_path + 'user_information.csv',usecols=[0,3])
    
    #Excludes people who do not have reliable information
    df_user = df_user.loc[~df_user['id'].isin(['LA6KW35OXK','UH6FQWXIZI','VCFUOTMSKT']),:]

    #Save result
    df_user.to_csv(data_path + 'user_information_cleaned.csv',index=False)
    
'''Compute new metrics for col_pivot'''
def metric(df,col_pivot):
    
    #Creation of new metrics
    df_metrics = df.groupby(['id',col_pivot]).apply(lambda x: x.loc[:,['hold_time','latency','flight']].agg([np.mean,np.std]))
    df_metrics.reset_index(level=[1,2],inplace=True)
    df_metrics.rename(columns={'level_2':'metric'},inplace=True)

    #Apply pivot in table to get new columns
    df_metrics = pd.pivot_table(df_metrics,values=['hold_time','latency','flight'],columns=[col_pivot,'metric'],index=['id'])

    #Fix columns
    cols = []
    for col in df_metrics.columns:
        cols.append(col[0] + '_' + col[1] + '_' + col[2])
    df_metrics.columns = cols
    df_metrics.reset_index(inplace=True)
    
    return df_metrics

#Fill na in dataframe with mean
def fill_na(df):
    return df.fillna(df.iloc[:,2:].mean())

'''Perform merge in two datasets'''
def merge_data(col_pivot):
    
    #Path to data
    data_path = os.path.dirname(os.getcwd()) + '/Data/tappy-keystroke/'
    
    #Read data
    df_user = pd.read_csv(data_path+'user_information_cleaned.csv')
    df_tappy = pd.read_csv(data_path+'tappy_data_cleaned.csv')

    #Compute new metrics
    df_metrics = metric(df_tappy,col_pivot)

    #Merge two files
    df_merge = pd.merge(df_user,df_metrics,on='id')
    
    #Fill na values
    df_merge = df_merge.groupby('parkinson').apply(fill_na)

    #Fix somethings
    del df_merge['parkinson']
    df_merge = df_merge.reset_index(level=0).set_index('id')

    #Save result
    df_merge.to_csv(data_path+'parkinson_tappy_' + col_pivot + '.csv')
    
'''Criation of new metrics for dataset handwrite'''
def new_metric_hw():
    #Path to data
    data_path = os.path.dirname(os.getcwd()) + '/Data/'

    #Read data
    df_parkinson = pd.read_csv(data_path + '/handwrite/parkinson_hw.csv')

    #We drop equal positions because we want to know just the changes along time
    df_parkinson = df_parkinson.drop_duplicates(['id','test_id','X','Y'],keep='first')

    #Compute velocity for each id and test
    df_velocity = df_parkinson.groupby(['id','test_id']).apply(lambda x: (np.sqrt(x['X'].diff()**2 + x['Y'].diff()**2) / x['timestamp'].diff()).agg([np.mean,np.std]))

    #Fix dataframe
    df_velocity.reset_index(inplace=True)
    df_velocity.dropna(inplace=True)

    #Insert column that indicates if person has or not parkinson
    map_parkinson = df_parkinson.set_index('id')['parkinson'].to_dict()
    df_velocity['parkinson'] = df_velocity['id'].map(map_parkinson)

    #Save data
    df_velocity.to_csv(data_path + '/handwrite/parkinson_hw_velocity.csv',index=False)

In [3]:
#Process dataset
process_hw()
process_user()
process_tappy()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

As funções acima transformam os dados no formato csv para facilitar a manipulação.

In [4]:
#Clean two files
clean_user()
clean_tappy()

  if (yield from self.run_code(code, result)):


As funções acima realizam as seguintes atividades de limpeza:

- tappy:
    - Arquivo bruto possui linhas concatenadas (provalmente falha do sistema na gravação) causando falha na leitura das colunas. A solução foi remover essas linhas dada a dificuldade de tratá-las.
    - Exclusão de linhas com informações inconsistentes com a fórmula latency-hold_time > 0.
- user_information:
    - Exclusão de colunas: Excluímos informações que não relacionadas ao teclado por nosso objetivo ser utilizar apenas os dados de tecla para predição.
    - Usuários com informações estranhas: Existem pessoas que afirmam não possuir parkinson apesar de colocar um ano como diagnóstico. Como não é possível saber a veracidade disso nós removemos esses usuários.

In [5]:
#Merge files and creation of new metrics
merge_data('hand')
merge_data('direction')
new_metric_hw()

Por alguma razão os dois datasets tappy-keystroke não possuem identificadores iguais, por causa disso nós verificamos quais seriam os valores similares em ambos realizando um merge.

Além disso, na função acima também criamos novas métricas que são resumos (médias) dos atributos numéricos.