# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.features.DataTransformation import LowPassFilter, PrincipalComponentAnalysis
from src.features.TemporalAbstraction import NumericalAbstraction


# --------------------------------------------------------------
# Load data
# --------------------------------------------------------------
# Read the interim pickle (its file name suggests outliers were already
# removed by an earlier step -- confirm against the pipeline).
df = pd.read_pickle("data/interim/02_outliers_removed_chavnets.pkl")

# Shorten the long sensor column names to the acc_* / gyr_* form that the
# rest of this script refers to.
df = df.rename(
    columns={
        "acceleration_x": "acc_x",
        "acceleration_y": "acc_y",
        "acceleration_z": "acc_z",
        'gyroscope_x': 'gyr_x',
        'gyroscope_y': 'gyr_y',
        'gyroscope_z': 'gyr_z',
    }
)
df
# --------------------------------------------------------------
# Dealing with missing values (imputation)
# --------------------------------------------------------------
# The first six columns are used as the sensor channels for every later
# step (assumes they are acc_x..gyr_z after the rename -- TODO confirm).
predicted_columns = list(df.columns[:6])
predicted_columns

# Fill the gaps in each sensor channel with pandas' default interpolation.
for col in predicted_columns:
    df[col] = df[col].interpolate()

# --------------------------------------------------------------
# Calculating set duration
# --------------------------------------------------------------
# For every set, store the time between its first and last row in a
# "duration" column on all rows of that set.
for s in df['Set'].unique():
    # Build the row mask once and reuse it for both the lookup and the write.
    set_mask = df['Set'] == s
    set_index = df[set_mask].index
    start = set_index[0]
    stop = set_index[-1]
    duration = stop - start
    # NOTE: `.seconds` is only the whole-seconds component of the time
    # difference; fractions of a second are dropped.
    df.loc[set_mask, "duration"] = duration.seconds

# Mean set duration per category (inspection only; the result is not stored).
df.groupby(by='category')['duration'].mean()
# --------------------------------------------------------------
# Butterworth lowpass filter
# --------------------------------------------------------------
Lowpass = LowPassFilter()
df_lowpass = df.copy()

# Sampling frequency and cutoff handed to the filter.
# NOTE(review): 1000/200 presumably means one sample every 200 ms, i.e. 5 Hz,
# with the cutoff in the same unit -- confirm.
fs = 1000 / 200
cutoff = 1

# Trial run on a single channel. The "acc_y_lowpass" column it creates is
# recomputed and removed again by the loop below.
df_lowpass = Lowpass.low_pass_filter(df_lowpass, "acc_y", fs, cutoff)
df_lowpass

# Filter every sensor channel and replace the raw signal with the filtered
# one, so no "<col>_lowpass" helper columns remain afterwards.
for col in predicted_columns:
    filtered_name = col + "_lowpass"
    df_lowpass = Lowpass.low_pass_filter(df_lowpass, col, fs, cutoff)
    df_lowpass[col] = df_lowpass.pop(filtered_name)

df_lowpass
# --------------------------------------------------------------
# Principal component analysis PCA
# --------------------------------------------------------------
pca = PrincipalComponentAnalysis()
df_pca = df_lowpass.copy()

# One value per sensor channel, plotted below against the component number.
pc_values = pca.determine_pc_explained_variance(df_pca, predicted_columns)
pc_values

component_numbers = range(1, len(predicted_columns) + 1)

plt.rcParams['figure.figsize'] = [20, 10]
plt.plot(component_numbers, pc_values)
plt.xlabel("no of components")
plt.ylabel("pca values")
# plt.show() is intentionally left disabled here.

# NOTE(review): the last argument is presumably the number of components to
# keep -- confirm against PrincipalComponentAnalysis.apply_pca.
df_pca = pca.apply_pca(df_pca, predicted_columns, 3)
df_pca
# --------------------------------------------------------------
# Sum of squares attributes
# --------------------------------------------------------------
df_squared = df_pca.copy()

# Square the three axes of each sensor, then add them up per row.
acc_sq = df_squared[['acc_x', 'acc_y', 'acc_z']] ** 2
gyr_sq = df_squared[['gyr_x', 'gyr_y', 'gyr_z']] ** 2
acc_r = acc_sq['acc_x'] + acc_sq['acc_y'] + acc_sq['acc_z']
gyr_r = gyr_sq['gyr_x'] + gyr_sq['gyr_y'] + gyr_sq['gyr_z']

# The stored attribute is the square root of that sum (vector magnitude).
df_squared['acc_r'] = np.sqrt(acc_r)
df_squared['gyr_r'] = np.sqrt(gyr_r)

# Spot check of the new accelerometer magnitude on the rows of set 14.
subset = df_squared.loc[df_squared['Set'] == 14]
subset[['acc_r']]
# --------------------------------------------------------------
# Temporal abstraction
# --------------------------------------------------------------
num_abs = NumericalAbstraction()
df_temporal = df_squared.copy()

# The two magnitude columns are abstracted together with the raw channels.
predicted_columns = predicted_columns + ["acc_r", "gyr_r"]
ws = int(1000 / 200)  # evaluates to 5, matching the "_ws_5" column names below

# The mean and std abstractions are computed set by set, not on the whole
# frame.
# NOTE(review): presumably so the windowed statistics never mix samples from
# two different sets -- confirm against NumericalAbstraction.
df_temporal_list = []

for s in df_temporal['Set'].unique():
    subset = df_temporal.loc[df_temporal['Set'] == s].copy()
    for col in predicted_columns:
        for aggregation in ("mean", "std"):
            subset = num_abs.abstract_numerical(subset, [col], ws, aggregation)
    df_temporal_list.append(subset)

# Stitch the per-set frames back together and discard the helper column.
df_temporal = pd.concat(df_temporal_list).drop(columns="duration")

# `subset` still holds the last set processed by the loop above.
subset[["acc_r", "acc_x_temp_mean_ws_5", "acc_x_temp_std_ws_5"]].plot()
# --------------------------------------------------------------
# Frequency features
# --------------------------------------------------------------
from src.features.FrequencyAbstraction import FourierTransformation

freqabs = FourierTransformation()

# reset_index() turns the index into a regular column; it is restored as the
# index ("epoch (ms)") after the per-set frames are concatenated below.
df_freq = df_temporal.copy().reset_index()

fs = int(1000 / 200)   # evaluates to 5
ws = int(2800 / 200)   # evaluates to 14

# Single-column run over the whole frame before the per-set pass.
df_freq = freqabs.abstract_frequency(df_freq, ["acc_y"], ws, fs)

# Apply the transformation to every column, one set at a time; each set is
# re-indexed from 0 before it is handed to abstract_frequency.
df_freq_list = []
for s in df_freq['Set'].unique():
    print(f"the fourier transformation is applied in {s}")
    subset = df_freq.loc[df_freq['Set'] == s].reset_index(drop=True).copy()
    subset = freqabs.abstract_frequency(subset, predicted_columns, ws, fs)
    df_freq_list.append(subset)

# Recombine the sets, restore the index, drop every row that contains a
# missing value, then keep every second remaining row.
df_freq = (
    pd.concat(df_freq_list)
    .set_index("epoch (ms)", drop=True)
    .dropna()
    .iloc[::2]
)
df_freq

# --------------------------------------------------------------
# Clustering
# --------------------------------------------------------------
from sklearn.cluster import KMeans

df_cluster = df_freq.copy()

cluster_columns = ["acc_x", "acc_y", "acc_z"]
k_values = range(2, 10)

# The feature matrix does not depend on k, so it is selected once and
# reused for every fit below.
subset = df_cluster[cluster_columns]

# Inertia of the fitted model for each candidate number of clusters.
# Only the inertia is needed here, so the model is fitted without also
# producing labels.
inertias = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=20, random_state=0)
    kmeans.fit(subset)
    inertias.append(kmeans.inertia_)

# Final model; its labels become the new "cluster" column.
# NOTE(review): k = 5 is hard-coded -- presumably chosen from the inertia
# values above; confirm.
kmeans = KMeans(n_clusters=5, n_init=20, random_state=0)
df_cluster["cluster"] = kmeans.fit_predict(subset)

# Persist the finished feature frame.
df_cluster.to_pickle("data/interim/03_data_features.pkl")