# Analysis

We will perform some analyses on the dataset.

# 1. Reading dataset 

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import boxcox

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split

In [None]:
#Load 'parkinson_hw.csv' (path relative to the current working directory) into a DataFrame
df_parkinson = pd.read_csv('parkinson_hw.csv')

# 2. New metrics

## 2.1 Creation of new metrics

In [None]:
#Keep only the first sample recorded at each (id, test_id, X, Y) position,
#so every remaining row of a recording is a change of position
df_parkinson = df_parkinson.drop_duplicates(['id','test_id','X','Y'],keep='first')


def _velocity_stats(group):
    '''Return the mean and std of the point-to-point velocity of one (id, test_id) group.

    Velocity is the Euclidean distance between consecutive rows divided by the
    difference of their timestamps. The first row has no predecessor, so its
    velocity is NaN and is skipped by the aggregation.
    '''
    distance = np.sqrt(group['X'].diff()**2 + group['Y'].diff()**2)
    elapsed = group['timestamp'].diff()
    #String names instead of np.mean/np.std: same pandas aggregations, but
    #passing the numpy callables to agg is deprecated
    return (distance / elapsed).agg(['mean','std'])


#Compute velocity statistics for each id and test; only the columns the
#helper reads are handed to apply, so the grouping keys are not passed in
df_velocity = df_parkinson.groupby(['id','test_id'])[['X','Y','timestamp']].apply(_velocity_stats)

#Turn the (id, test_id) index back into columns and discard groups whose
#statistics are undefined (e.g. too few samples to compute a std)
df_velocity.reset_index(inplace=True)
df_velocity.dropna(inplace=True)

#Insert column that indicates if person has or not parkinson
map_parkinson = df_parkinson.set_index('id')['parkinson'].to_dict()
df_velocity['parkinson'] = df_velocity['id'].map(map_parkinson)

#Apply boxcox; the second return value (the fitted lambda) is not needed
#NOTE(review): boxcox requires strictly positive input - confirm timestamps
#are strictly increasing within each recording
df_velocity['mean'],_ = boxcox(df_velocity['mean'])
df_velocity['std'],_ = boxcox(df_velocity['std'])

In [None]:
#Boxplots of the transformed mean velocity, split by the 'parkinson' column
plt.figure(figsize=(13,9))

#Top-left panel: all tests pooled together
plt.subplot(221)
sns.boxplot(data=df_velocity,x='parkinson',y='mean')

#Remaining panels: one per test, so that no test is plotted twice or skipped
#assumes test_id takes the values 0, 1 and 2 - TODO confirm
for position,test_id in zip((222,223,224),(0,1,2)):
    plt.subplot(position)
    sns.boxplot(data=df_velocity[df_velocity['test_id'] == test_id],x='parkinson',y='mean')

plt.show()

In [None]:
#Boxplots of the transformed velocity std, split by the 'parkinson' column
plt.figure(figsize=(13,9))

#Top-left panel: all tests pooled together
plt.subplot(221)
sns.boxplot(data=df_velocity,x='parkinson',y='std')

#Remaining panels: one per test, so that no test is plotted twice or skipped
#assumes test_id takes the values 0, 1 and 2 - TODO confirm
for position,test_id in zip((222,223,224),(0,1,2)):
    plt.subplot(position)
    sns.boxplot(data=df_velocity[df_velocity['test_id'] == test_id],x='parkinson',y='std')

plt.show()

# 3. Modeling

In [None]:
def classifier(df,test_size=0.4):
    '''Fit three classifiers on a random train/test split of df.

    Parameters
    ----------
    df : DataFrame with a 'parkinson' column used as the target. The features
        are every column after the first one, excluding 'parkinson' itself.
    test_size : forwarded to train_test_split.

    Returns
    -------
    (clfs, X_test, y_test) : a dict mapping 'dt', 'svm' and 'lr' to the fitted
        decision tree, SVM and logistic regression, followed by the held-out
        features and labels.
    '''
    #Classifiers
    clfs = {
        'dt': DecisionTreeClassifier(),
        'svm': SVC(gamma='scale',probability=True),
        'lr': LogisticRegression(solver='liblinear'),
    }

    #Divide dataset in dependent and independent variables.
    #The target is dropped explicitly: when 'parkinson' is not the first
    #column, the positional slice alone would leak the label into the features
    X = df.iloc[:,1:].drop(columns='parkinson',errors='ignore')
    Y = df['parkinson']

    #Split data
    X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=test_size)

    #Training (fit trains each estimator in place)
    for clf in clfs.values():
        clf.fit(X_train,y_train)

    return clfs,X_test,y_test

def metrics(clfs,X,Y):
    '''Compute accuracy, sensitivity, specificity and AUC for each classifier.

    Parameters
    ----------
    clfs : dict mapping a classifier name to a fitted classifier.
    X : features to predict on.
    Y : true labels; must be binary, because the confusion matrix is
        unpacked into exactly four cells.

    Returns
    -------
    list of dicts with keys 'classifier', 'acc', 'sens', 'spec' and 'auc',
    one per classifier, every metric rounded to two decimals.
    '''
    result = []
    for name,clf in clfs.items():
        #Predict once and reuse the labels for every count-based metric
        y_pred = clf.predict(X)
        tn, fp, fn, tp = confusion_matrix(Y,y_pred).ravel()

        #AUC ranks samples by a continuous score; hard labels would collapse
        #the ROC curve to a single operating point
        if hasattr(clf,'predict_proba'):
            y_score = clf.predict_proba(X)[:,1]
        elif hasattr(clf,'decision_function'):
            y_score = clf.decision_function(X)
        else:
            y_score = y_pred

        result.append({
            'classifier':name,
            'acc':round((tn+tp)/(tn+fp+fn+tp),2),
            'sens':round(tp/(tp+fn),2),
            'spec':round(tn/(tn+fp),2),
            'auc':round(roc_auc_score(Y,y_score),2),
        })

    return result

def evaluate_classifiers(df,test_size=0.4,n_runs=100):
    '''Evaluate the classifiers over repeated random train/test splits.

    Parameters
    ----------
    df : DataFrame forwarded to classifier().
    test_size : forwarded to classifier().
    n_runs : number of times the split/train/score cycle is repeated
        (default 100).

    Returns
    -------
    DataFrame indexed by classifier name with columns 'acc', 'auc', 'sens'
    and 'spec', aggregated over all runs by pivot_table's default (the mean).

    Raises
    ------
    ValueError if n_runs is smaller than 1.
    '''
    if n_runs < 1:
        raise ValueError('n_runs must be at least 1')

    #Collect the rows in a plain list and build the DataFrame once; growing
    #a DataFrame with concat inside the loop recopies it on every iteration
    rows = []
    for _ in range(n_runs):
        #Classification
        clfs,X,Y = classifier(df,test_size)
        rows.extend(metrics(clfs,X,Y))

    #Summarizes the metrics
    df_result = pd.DataFrame(rows)
    return df_result.pivot_table(index='classifier',values=['acc','auc','sens','spec'])

In [None]:
#Rows belonging to test 0; the same mask selects both features and target
is_first_test = df_velocity['test_id'] == 0

#Features: the two velocity statistics
X = df_velocity.loc[is_first_test,['mean','std']]
#Target: the 'parkinson' column of the same rows
Y = df_velocity.loc[is_first_test,'parkinson']