In [None]:
import pandas as pd
import numpy as np
import scipy.io as sio
import os
import re
import matplotlib.pyplot as plt
import source.transform_data as trs
from pathlib import Path
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
from sklearn.cluster import AgglomerativeClustering 
from sklearn.cluster import KMeans 

In [None]:
# Folder holding the input tables, given relative to the working directory.
path_to_data = Path("Data")

In [None]:
# Read the wide-format table; index_col=None means no column is explicitly
# designated as the row index.
wide_csv = path_to_data / 'data_table_wide.csv'
data_wide = pd.read_csv(wide_csv, index_col=None)

In [None]:
# Preview the first five rows of the loaded table.
data_wide.head(n=5)

### Number of walking bouts before vs after rehab:

In [None]:
# Row count for each distinct exp_phase_id value.
data_wide.value_counts(subset='exp_phase_id')

In [None]:
# Row count for each distinct exp_phase_descr value.
data_wide.value_counts(subset='exp_phase_descr')

### It looks like most people stopped recording after 8 hours:

In [None]:
# speed_mean against time_stamps_hours, coloured by the morning_afternoon column.
# NOTE(review): c= resolved from a data column must hold numbers or valid
# colour specs — confirm the dtype of morning_afternoon.
plt.scatter(
    data=data_wide,
    x='time_stamps_hours',
    y='speed_mean',
    c='morning_afternoon',
)
plt.xlabel("hours after beginning of recording")
plt.ylabel("mean speed during walking bout")

#### Is it appropriate to only consider walking bouts within 8 hours and disregard later walking bouts? If so, we could subdivide the 8 hours of recording into two phases of 4 hours each (morning and afternoon).

### Number of walking bouts morning vs afternoon:

In [None]:
# Row count for each distinct morning_afternoon value.
data_wide.value_counts(subset='morning_afternoon')

### Correlation matrix:

In [None]:
# Independent copy used as the starting point for the correlation input, so
# preparing it never modifies data_wide.
# (A leading `data_wide.head()` was removed: it was not the last expression of
# the cell, so its result was never displayed.)
data_wide_for_corr = data_wide.copy()

In [None]:
# Remove the columns that are excluded from the correlation input.
# The frame is derived from data_wide rather than from data_wide_for_corr
# itself, so re-running this cell does not raise a KeyError on columns that
# were already dropped by a previous run.
data_wide_for_corr = data_wide.drop(columns=['time_stamps', 'morning_afternoon'])
data_wide_for_corr.head()

In [None]:
# Spearman rank correlation between all columns from position 5 onwards.
# NOTE(review): the offset 5 presumably skips identifier/metadata columns —
# confirm against the column layout of data_wide_for_corr.
data_wide_num = data_wide_for_corr.iloc[:, 5:].copy()
corr_matrix = data_wide_num.corr(method='spearman')

n_vars = len(corr_matrix.columns)

# Create the figure and axes explicitly. Calling plt.figure() followed by
# plt.matshow() left an empty figure behind, because plt.matshow() opens a
# new figure of its own; the requested size was therefore never applied.
# The size scales with the number of variables so the annotations stay legible.
fig_side = max(6, n_vars)
fig, ax = plt.subplots(figsize=(fig_side, fig_side))

# Plot the matrix
heatmap = ax.matshow(corr_matrix, cmap="viridis")

# Set ticks: one per variable, labelled with the column name
ax.set_xticks(range(n_vars))
ax.set_xticklabels(corr_matrix.columns, rotation="vertical")
ax.set_yticks(range(n_vars))
ax.set_yticklabels(corr_matrix.columns)

# Add color bar
fig.colorbar(heatmap, ax=ax)

# Add text annotations (x is the column index j, y is the row index i)
for i in range(n_vars):
    for j in range(n_vars):
        ax.text(j, i, f"{corr_matrix.iloc[i, j]:.2f}", ha="center", va="center", color="w")
plt.show()

In [None]:
# Show the Spearman correlation table itself as the cell output.
corr_matrix

In [None]:
# speed_mean against time_stamps_hours, with each point coloured by its
# WB_time value.
# A colorbar is used instead of a legend: the colour encodes a per-point
# value, which a single legend entry cannot explain.
fig, ax = plt.subplots()
points = ax.scatter(x='time_stamps_hours', y='speed_mean', c='WB_time', data=data_wide)

# Colour scale for WB_time
fig.colorbar(points, ax=ax, label='WB_time')

# Label axes and title
ax.set_xlabel('Time Stamps (Hours)')
ax.set_ylabel('Speed Mean')
ax.set_title('Speed Mean vs Time Stamps')

# Show the plot
plt.show()

### Frequency of walking bouts over time:

In [None]:
# Histogram of time_stamps_hours over all rows; the axis labels are applied to
# the axes that the histogram call leaves current.
data_wide.hist(column='time_stamps_hours')
hist_ax = plt.gca()
hist_ax.set_xlabel("hours after beginning of recording")
hist_ax.set_ylabel("frequency of walking bouts")

### Frequency of walking bouts over time before vs after rehab:

In [None]:
# Split the rows by experimental phase: 'T2' is used as the before-rehab
# subset and 'T3' as the after-rehab subset. Copies keep both subsets
# independent of data_wide.
is_before_rehab = data_wide['exp_phase_id'] == 'T2'
is_after_rehab = data_wide['exp_phase_id'] == 'T3'
data_wide_before_rh = data_wide.loc[is_before_rehab].copy()
data_wide_after_rh = data_wide.loc[is_after_rehab].copy()

In [None]:
# Histogram of time_stamps_hours for the before-rehab subset; the explicit
# title replaces the column-name title set by DataFrame.hist.
data_wide_before_rh.hist(column='time_stamps_hours')
hist_ax = plt.gca()
hist_ax.set_xlabel("hours after beginning of recording")
hist_ax.set_ylabel("frequency of walking bouts")
hist_ax.set_title('Frequency of walking bouts before rehab:')

In [None]:
# Histogram of time_stamps_hours for the after-rehab subset; the explicit
# title replaces the column-name title set by DataFrame.hist.
data_wide_after_rh.hist(column='time_stamps_hours')
hist_ax = plt.gca()
hist_ax.set_xlabel("hours after beginning of recording")
hist_ax.set_ylabel("frequency of walking bouts")
hist_ax.set_title('Frequency of walking bouts after rehab:')

#### There seems to be a slight decrease in frequency of walking bouts over time during recording.

### More walking bouts before than after rehab: Is this possible?

In [None]:
# (rows, columns) of the before-rehab and after-rehab subsets, one per line.
print(data_wide_before_rh.shape, data_wide_after_rh.shape, sep='\n')

In [None]:
# Number of distinct file_name values in each subset, one per line.
n_files_before = len(set(data_wide_before_rh['file_name']))
n_files_after = len(set(data_wide_after_rh['file_name']))
print(n_files_before, n_files_after, sep='\n')

In [None]:
# Look at the first five rows again before selecting the clustering features.
data_wide.head(n=5)

### K-means clustering to check if I stumble over anything interesting:

In [None]:
# Names of every column from position 8 onwards.
# NOTE(review): the offset 8 presumably marks where the numeric measurements
# start — confirm against the column layout of data_wide.
numerical_var_names = data_wide.columns[8:].tolist()

In [None]:
# Sub-frame restricted to the columns listed in numerical_var_names,
# shown as the cell output.
numerical_variables = data_wide.loc[:, numerical_var_names]
numerical_variables

In [None]:
# Pull the five clustering features out of data_wide as plain Python lists.
ngait_cycles = data_wide['Ngait_cycles'].tolist()
cadence_mean = data_wide['cadence_mean'].tolist()
speed_mean = data_wide['speed_mean'].tolist()
speed_std = data_wide['speed_std'].tolist()
slength_mean = data_wide['slength_mean'].tolist()

# One tuple per row of data_wide, in the feature order listed here.
feature_columns = (ngait_cycles, cadence_mean, speed_mean, speed_std, slength_mean)
num_data = list(zip(*feature_columns))

In [None]:
# Elbow method: fit k-means for k = 1..10 and record the inertia of each fit.
#
# - Fitted on num_data, the same feature set that the 3-cluster model is
#   trained on via kmeans.fit(num_data); previously the curve was computed on
#   numerical_variables, a different set of columns, so it did not describe
#   the model actually used.
# - random_state pins the centroid initialisation so the curve is reproducible;
#   n_init is set explicitly because its default differs between scikit-learn
#   versions.
# - The loop uses its own variable name so it does not overwrite the global
#   `kmeans` with a 10-cluster model.
# NOTE(review): the features are not standardised, so columns with larger
# numeric ranges dominate the distance — confirm whether that is intended.
cluster_counts = range(1, 11)
inertias = []

for n_clusters in cluster_counts:
    elbow_model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    elbow_model.fit(num_data)
    inertias.append(elbow_model.inertia_)

plt.plot(cluster_counts, inertias, marker='o')
plt.title('Elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# 3-cluster k-means model fitted on the feature tuples in num_data.
# random_state pins the centroid initialisation so the cluster labels (and
# every plot coloured by them) are reproducible across runs; n_init is set
# explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(num_data)

In [None]:
# WB_time per experimental phase, coloured by k-means cluster label.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(data_wide.exp_phase_id, data_wide.WB_time, c=kmeans.labels_)
ax.set_xlabel('exp_phase_id')
ax.set_ylabel('WB_time')
ax.set_title('K-means clusters by experimental phase')
plt.show()

In [None]:
# Copy of data_wide with the cluster label of each row attached, then split
# into the 'T2' and 'T3' experimental phases.
data_wide_l = data_wide.assign(kmeans_labels=kmeans.labels_)
data_wide_l_T2 = data_wide_l[data_wide_l['exp_phase_id'] == 'T2']
data_wide_l_T3 = data_wide_l[data_wide_l['exp_phase_id'] == 'T3']

In [None]:
# Phase T2 only: WB_time against speed_mean, coloured by k-means cluster label.
# The title names the phase so this figure can be told apart from the T3 one.
fig, ax = plt.subplots()
ax.scatter(data_wide_l_T2.speed_mean, data_wide_l_T2.WB_time, c=data_wide_l_T2.kmeans_labels)
ax.set_xlabel('speed_mean')
ax.set_ylabel('WB_time')
ax.set_title('K-means clusters, phase T2: WB_time vs speed_mean')
plt.show()

In [None]:
# Phase T3 only: WB_time against speed_mean, coloured by k-means cluster label.
# The title names the phase so this figure can be told apart from the T2 one.
fig, ax = plt.subplots()
ax.scatter(data_wide_l_T3.speed_mean, data_wide_l_T3.WB_time, c=data_wide_l_T3.kmeans_labels)
ax.set_xlabel('speed_mean')
ax.set_ylabel('WB_time')
ax.set_title('K-means clusters, phase T3: WB_time vs speed_mean')
plt.show()

In [None]:
# Phase T2 only: WB_time against time_stamps, coloured by k-means cluster label.
# The title names the phase so this figure can be told apart from the T3 one.
# NOTE(review): other plots use time_stamps_hours on the x axis — confirm that
# the raw time_stamps column is the intended one here.
fig, ax = plt.subplots()
ax.scatter(data_wide_l_T2.time_stamps, data_wide_l_T2.WB_time, c=data_wide_l_T2.kmeans_labels)
ax.set_xlabel('time_stamps')
ax.set_ylabel('WB_time')
ax.set_title('K-means clusters, phase T2: WB_time vs time_stamps')
plt.show()

In [None]:
# Phase T3 only: WB_time against time_stamps, coloured by k-means cluster label.
# The title names the phase so this figure can be told apart from the T2 one.
# NOTE(review): other plots use time_stamps_hours on the x axis — confirm that
# the raw time_stamps column is the intended one here.
fig, ax = plt.subplots()
ax.scatter(data_wide_l_T3.time_stamps, data_wide_l_T3.WB_time, c=data_wide_l_T3.kmeans_labels)
ax.set_xlabel('time_stamps')
ax.set_ylabel('WB_time')
ax.set_title('K-means clusters, phase T3: WB_time vs time_stamps')
plt.show()

#### All clusters seem to be present both before and after rehab. 

In [None]:
# WB_time per morning_afternoon value, coloured by k-means cluster label.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(data_wide.morning_afternoon, data_wide.WB_time, c=kmeans.labels_)
ax.set_xlabel('morning_afternoon')
ax.set_ylabel('WB_time')
ax.set_title('K-means clusters by morning_afternoon')
plt.show()

#### All clusters seem to be present both in the morning and in the afternoon.

In [None]:
# All phases: WB_time against time_stamps, coloured by k-means cluster label.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(data_wide.time_stamps, data_wide.WB_time, c=kmeans.labels_)
ax.set_xlabel('time_stamps')
ax.set_ylabel('WB_time')
ax.set_title('K-means clusters: WB_time vs time_stamps')
plt.show()

In [None]:
# All phases: WB_time against speed_mean, coloured by k-means cluster label.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(data_wide.speed_mean, data_wide.WB_time, c=kmeans.labels_)
ax.set_xlabel('speed_mean')
ax.set_ylabel('WB_time')
ax.set_title('K-means clusters: WB_time vs speed_mean')
plt.show()