In [None]:
import pandas as pd
import numpy as np
import scipy.io as sio
import os
import re
import matplotlib.pyplot as plt
import source.transform_data as trs
from pathlib import Path
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
from sklearn.cluster import AgglomerativeClustering 
from sklearn.cluster import KMeans 

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
# Directory (relative to the notebook's working directory) that holds the input tables.
path_to_data = Path("Data")

In [None]:
# Load the wide-format table; index_col=None keeps every CSV column as a regular data column.
data_wide = pd.read_csv(path_to_data.joinpath('data_table_wide.csv'), index_col=None)

In [None]:
# Preview the first five rows of the loaded table.
data_wide.head(n=5)

### Number of walking bouts before vs after rehab:

In [None]:
# Count the rows (walking bouts) per experiment-phase identifier.
data_wide.value_counts(subset='exp_phase_id')

In [None]:
# Same count, keyed by the phase description column instead of the identifier.
data_wide.value_counts(subset='exp_phase_descr')

### It looks like most people stopped recording after 8 hours:

In [None]:
# Mean speed of every walking bout against the time elapsed since recording start.
# No legend is drawn: the points are not colour-mapped (no `c=` argument), so
# `scatter.legend_elements()` would return empty handle/label lists (plus a
# warning) and `ax.legend` would only add an empty box to the plot.
fig, ax = plt.subplots()
scatter = ax.scatter(data_wide["time_stamps_hours"], data_wide["speed_mean"], alpha=0.4)
ax.set_xlabel("hours after beginning of recording")
ax.set_ylabel("mean speed during walking bout")
plt.show()

#### Is it appropriate to only consider walking bouts within 8 hours and disregard later walking bouts? If so, we could subdivide the 8 hours of recording into two phases of 4 hours each (morning and afternoon).

In [None]:
# Display the table itself (not just the head).
# NOTE(review): presumably shown here to inspect the `morning_afternoon` column used in the next section — confirm.
data_wide

### Number of walking bouts morning vs afternoon:

In [None]:
# Count the rows (walking bouts) per value of the morning/afternoon column.
data_wide.value_counts(subset='morning_afternoon')

### Correlation matrix:

In [None]:
# Independent copy of the table to be trimmed for the correlation analysis,
# so that `data_wide` itself stays untouched.
# (The former leading `data_wide.head()` was removed: it was not the last
# expression of the cell, so it never displayed anything.)
data_wide_for_corr = data_wide.copy()

In [None]:
# Remove the columns that must not enter the correlation matrix.
# The frame is derived from `data_wide` rather than from itself, so the cell is
# idempotent: re-running it no longer raises a KeyError for already-dropped columns.
data_wide_for_corr = data_wide.drop(columns=['time_stamps', 'morning_afternoon'])
data_wide_for_corr.head()

In [None]:
# Keep the columns from position 5 onwards of the trimmed table.
# NOTE(review): assumes every column from position 5 on is numeric — confirm against the CSV layout.
data_wide_num = data_wide_for_corr.iloc[:, 5:].copy()

# Rank-based (Spearman) correlation between all remaining variables.
corr_matrix = data_wide_num.corr(method='spearman')

# A single figure/axes pair; the former extra `plt.figure(figsize=(10, 10))`
# call only produced an additional empty figure.
fig, ax = plt.subplots(figsize=(10, 10))  # Adjust the figsize to make squares larger

# Plot the matrix using matshow on the created axes
cax = ax.matshow(corr_matrix, cmap="viridis")

# One tick per variable, labelled with the variable name
n_vars = len(corr_matrix.columns)
ax.set_xticks(range(n_vars))
ax.set_xticklabels(corr_matrix.columns, rotation="vertical")
ax.set_yticks(range(n_vars))
ax.set_yticklabels(corr_matrix.columns)

# Add color bar
fig.colorbar(cax)

# Write each coefficient into its cell (text x = column j, text y = row i)
for i in range(n_vars):
    for j in range(n_vars):
        ax.text(j, i, f"{corr_matrix.iloc[i, j]:.2f}", ha="center", va="center", color="w")

# Display the plot
plt.show()


### Frequency of walking bouts over time:

In [None]:
# Histogram of walking-bout time stamps over all rows of the table.
data_wide.hist(column='time_stamps_hours')
plt.xlabel("hours after beginning of recording")
plt.ylabel("frequency of walking bouts")

### Frequency of walking bouts over time before vs after rehab:

In [None]:
# Split the bouts by experiment phase: rows labelled 'T2' are used as "before rehab",
# rows labelled 'T3' as "after rehab". Each subset is an independent copy.
data_wide_before_rh = data_wide.loc[data_wide['exp_phase_id'].eq('T2')].copy()
data_wide_after_rh = data_wide.loc[data_wide['exp_phase_id'].eq('T3')].copy()

In [None]:
# Histogram of walking-bout time stamps for the T2 ("before rehab") subset.
data_wide_before_rh.hist(column='time_stamps_hours')
plt.xlabel("hours after beginning of recording")
plt.ylabel("frequency of walking bouts")
plt.title('Frequency of walking bouts before rehab:')

In [None]:
# Histogram of walking-bout time stamps for the T3 ("after rehab") subset.
data_wide_after_rh.hist(column='time_stamps_hours')
plt.xlabel("hours after beginning of recording")
plt.ylabel("frequency of walking bouts")
plt.title('Frequency of walking bouts after rehab:')

#### There seems to be a slight decrease in frequency of walking bouts over time during recording.

### More walking bouts before than after rehab: Is this possible?

In [None]:
# (rows, columns) of the before-rehab subset, then of the after-rehab subset, one per line.
print(data_wide_before_rh.shape, data_wide_after_rh.shape, sep="\n")

In [None]:
# Number of distinct source files contributing to each phase (before first, then after).
print(
    len(set(data_wide_before_rh['file_name'])),
    len(set(data_wide_after_rh['file_name'])),
    sep="\n",
)

In [None]:
# Re-display the first five rows to check the column layout before clustering.
data_wide.head(n=5)

### K-means clustering to check if I stumble over anything interesting:

In [None]:
# Labels of all columns from position 8 onwards; these are the clustering inputs.
# NOTE(review): assumes the columns from position 8 on are all numeric — confirm.
numerical_var_names = data_wide.columns[8:].tolist()

In [None]:
# Get the numerical variables:
numerical_variables = data_wide[numerical_var_names]
numerical_variables

#### Scale variables and put them into a numpy array:

In [None]:
# Single-step pipeline that standardises each numerical variable with StandardScaler.
num_pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])

In [None]:
# Column labels the numeric pipeline is applied to (all columns of numerical_variables).
num_attribs = numerical_variables.columns.tolist()

# Only a numeric branch is configured; there is no categorical branch.
full_pipeline = ColumnTransformer(transformers=[("num", num_pipeline, num_attribs)])

# Fit the scaler on the selected variables and transform them in one step.
num_vars_prepared = full_pipeline.fit_transform(numerical_variables)

#### Optimal number of clusters not obvious as there is no inertia inflection point:

In [None]:
# Elbow method: fit k-means for k = 1..10 on the scaled variables and record
# the inertia (within-cluster sum of squared distances) of each fit.
inertias = []

for i in range(1, 11):
    # random_state fixes the centroid initialisation so the curve is identical
    # on every run; n_init is set explicitly so the number of restarts does not
    # depend on the installed scikit-learn version's default.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(num_vars_prepared)
    inertias.append(kmeans.inertia_)

plt.plot(range(1, 11), inertias, marker='o')
plt.title('Elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Final model with three clusters, fitted on the scaled variables.
# A fixed random_state (and explicit n_init) keeps the cluster assignment and
# the numbering of the labels stable across kernel restarts, so the notes
# below that refer to specific clusters stay valid.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(num_vars_prepared)

In [None]:
# Sanity check: the scaled array, the selected variables and the full table
# should all have the same number of rows. Shapes are printed one per line.
print(num_vars_prepared.shape, numerical_variables.shape, data_wide.shape, sep="\n")

In [None]:
# Labelled copy of the table: one k-means cluster label per row.
data_wide_l = data_wide.copy()
data_wide_l['kmeans_labels'] = kmeans.labels_

# Per-phase subsets of the labelled table. They are taken as explicit copies
# (like data_wide_before_rh / data_wide_after_rh) so that later modifications
# neither warn about nor write through to data_wide_l.
data_wide_l_T2 = data_wide_l.loc[data_wide_l['exp_phase_id'] == 'T2', :].copy()
data_wide_l_T3 = data_wide_l.loc[data_wide_l['exp_phase_id'] == 'T3', :].copy()

#### Clusters do not coincide with phase of experiment (exp_phase_one_hot) and seem to be equally distributed over time:

In [None]:
# Column positions in data_wide_l shown on the x, y and z axes respectively.
index_1, index_2, index_3 = 6, 4, 10

# Print the names of the three selected columns, one per line.
print(*data_wide_l.columns[[index_1, index_2, index_3]], sep="\n")

# 9x9 inch figure with one 3D axes, viewed from elevation 10 and azimuth -70.
ax = plt.figure(figsize=(9, 9)).add_subplot(111, projection='3d')
ax.view_init(10, -70)

# One point per row, coloured by its k-means cluster label.
scatter = ax.scatter(
    data_wide_l.iloc[:, index_1],
    data_wide_l.iloc[:, index_2],
    data_wide_l.iloc[:, index_3],
    c=kmeans.labels_,
    cmap=plt.get_cmap("jet"),
    marker="o",
)
ax.set_xlabel(data_wide_l.columns[index_1])
ax.set_ylabel(data_wide_l.columns[index_2])
ax.set_zlabel(data_wide_l.columns[index_3])

# Legend entries are derived from the colour-mapped values, i.e. the cluster labels.
handles, labels = scatter.legend_elements()
legend2 = ax.legend(handles, labels, loc="upper right", title="Clusters")

plt.show()

In [None]:
# Show the column labels of the labelled table (which now includes 'kmeans_labels').
# NOTE(review): presumably used to look up the positions passed as index_1/2/3 in the 3D plots — confirm.
data_wide_l.columns

#### Clusters seem to be based on a somewhat triangular-shaped data distribution:

In [None]:
# Column positions in data_wide_l shown on the x, y and z axes respectively.
index_1, index_2, index_3 = 11, 8, 10

# Print the names of the three selected columns, one per line.
print(*data_wide_l.columns[[index_1, index_2, index_3]], sep="\n")

# 9x9 inch figure with one 3D axes, viewed from elevation 10 and azimuth -70.
ax = plt.figure(figsize=(9, 9)).add_subplot(111, projection='3d')
ax.view_init(10, -70)

# One point per row, coloured by its k-means cluster label.
scatter = ax.scatter(
    data_wide_l.iloc[:, index_1],
    data_wide_l.iloc[:, index_2],
    data_wide_l.iloc[:, index_3],
    c=kmeans.labels_,
    cmap=plt.get_cmap("jet"),
    marker="o",
)
ax.set_xlabel(data_wide_l.columns[index_1])
ax.set_ylabel(data_wide_l.columns[index_2])
ax.set_zlabel(data_wide_l.columns[index_3])

# Legend entries are derived from the colour-mapped values, i.e. the cluster labels.
handles, labels = scatter.legend_elements()
legend2 = ax.legend(handles, labels, loc="upper right", title="Clusters")

plt.show()

In [None]:
# Same three columns as the previous plot, with the x and z axes swapped.
index_1, index_2, index_3 = 10, 8, 11

# Print the names of the three selected columns, one per line.
print(*data_wide_l.columns[[index_1, index_2, index_3]], sep="\n")

# 9x9 inch figure with one 3D axes, viewed from elevation 10 and azimuth -70.
ax = plt.figure(figsize=(9, 9)).add_subplot(111, projection='3d')
ax.view_init(10, -70)

# One point per row, coloured by its k-means cluster label.
scatter = ax.scatter(
    data_wide_l.iloc[:, index_1],
    data_wide_l.iloc[:, index_2],
    data_wide_l.iloc[:, index_3],
    c=kmeans.labels_,
    cmap=plt.get_cmap("jet"),
    marker="o",
)
ax.set_xlabel(data_wide_l.columns[index_1])
ax.set_ylabel(data_wide_l.columns[index_2])
ax.set_zlabel(data_wide_l.columns[index_3])

# Legend entries are derived from the colour-mapped values, i.e. the cluster labels.
handles, labels = scatter.legend_elements()
legend2 = ax.legend(handles, labels, loc="upper right", title="Clusters")

plt.show()

#### The data shape reflects the fact that walking time varies most at a particular mean cadence and stride length standard deviation:

In [None]:
# Walking-bout time against mean cadence, coloured by k-means cluster.
ax = plt.subplot()
scatter = ax.scatter(
    data_wide_l["cadence_mean"],
    data_wide_l["WB_time"],
    c=data_wide_l["kmeans_labels"],
    cmap=plt.get_cmap("jet"),
    alpha=0.4,
)
# Legend entries are derived from the colour-mapped values, i.e. the cluster labels.
handles, labels = scatter.legend_elements()
legend2 = ax.legend(handles, labels, title="Clusters", loc="upper right")
ax.set_xlabel("cadence_mean")
ax.set_ylabel("WB_time")

In [None]:
# Walking-bout time against the slength_std column, coloured by k-means cluster.
ax = plt.subplot()
scatter = ax.scatter(
    data_wide_l["slength_std"],
    data_wide_l["WB_time"],
    c=data_wide_l["kmeans_labels"],
    cmap=plt.get_cmap("jet"),
    alpha=0.4,
)
# Legend entries are derived from the colour-mapped values, i.e. the cluster labels.
handles, labels = scatter.legend_elements()
legend2 = ax.legend(handles, labels, title="Clusters", loc="upper right")
ax.set_xlabel("slength_std")
ax.set_ylabel("WB_time")

#### The cluster on the right has higher mean speed values and, contrary to the other two clusters, seems to show a correlation between mean speed and walking bout time:

In [None]:
# Walking-bout time against mean speed, coloured by k-means cluster.
ax = plt.subplot()
scatter = ax.scatter(
    data_wide_l["speed_mean"],
    data_wide_l["WB_time"],
    c=data_wide_l["kmeans_labels"],
    cmap=plt.get_cmap("jet"),
    alpha=0.4,
)
# Legend entries are derived from the colour-mapped values, i.e. the cluster labels.
handles, labels = scatter.legend_elements()
legend2 = ax.legend(handles, labels, title="Clusters", loc="upper right")
ax.set_xlabel("speed_mean")
ax.set_ylabel("WB_time")

In [None]:
# Mean speed against mean cadence via the pandas plotting API,
# coloured by the 'kmeans_labels' column and drawn without a colour bar.
data_wide_l.plot(
    x="cadence_mean",
    y="speed_mean",
    kind="scatter",
    c="kmeans_labels",
    cmap=plt.get_cmap("jet"),
    colorbar=False,
    alpha=0.4,
    sharex=True,
)

plt.show()

In [None]:
# Mean speed against mean cadence, coloured by k-means cluster, this time with a cluster legend.
ax = plt.subplot()
scatter = ax.scatter(
    data_wide_l["cadence_mean"],
    data_wide_l["speed_mean"],
    c=data_wide_l["kmeans_labels"],
    cmap=plt.get_cmap("jet"),
    alpha=0.4,
)
# Legend entries are derived from the colour-mapped values, i.e. the cluster labels.
handles, labels = scatter.legend_elements()
legend2 = ax.legend(handles, labels, title="Clusters", loc="upper right")
ax.set_xlabel("cadence_mean")
ax.set_ylabel("speed_mean")