# Preprocessing notebook

_______

In [1]:
DF_PATH = '../data/interim/1_df_resampled_datetime.pkl'
FIG_DIR = '../reports/figures/outliers_graps/'
EXPORT_PATH = '../data/processed/4_cleaned_outliers_data.pkl'

In [2]:
# Libraries 
import pandas as pd 
import numpy as np 
import math 
import scipy
import logging 
import pickle 
import os 

# Visualization Libraries 
import plotly.graph_objects as go 
from plotly.subplots import make_subplots
import plotly.express as px 
import plotly.figure_factory as ff 
from matplotlib import pyplot as plt 
import matplotlib as mpl
import seaborn as sns
from IPython.display import display
from cycler import cycler

# Analysis Libraries 
from sklearn.neighbors import LocalOutlierFactor
from sklearn.manifold import TSNE 
from sklearn.preprocessing import StandardScaler , MultiLabelBinarizer
from sklearn.cluster  import  AgglomerativeClustering
from sklearn.pipeline import make_pipeline
from scipy.cluster.hierarchy import fcluster , linkage


In [3]:
# Ajust matplotlib style Function
class CustomMatplotlibStyle:
    def __init__(self, custom_colors=None, legend_fontsize=10):
        if custom_colors is None:
            custom_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
        self.set_colors(custom_colors)
        self.set_style()
        self.set_figure_size()
        self.set_grid()
        self.set_line_width()
        self.set_tick_colors()
        self.set_font_size()
        self.set_title_size()
        self.set_legend_fontsize(legend_fontsize)
        self.set_dpi()

    def set_colors(self, custom_colors):
        plt.rcParams["axes.prop_cycle"] = cycler(color=custom_colors)

    def set_style(self):
        plt.style.use("bmh")

    def set_figure_size(self):
        plt.rcParams["figure.figsize"] = (20, 10)

    def set_grid(self):
        plt.rcParams["axes.grid"] = True
        plt.rcParams["grid.color"] = "lightgray"

    def set_line_width(self):
        plt.rcParams["axes.linewidth"] = 1.5

    def set_tick_colors(self):
        plt.rcParams["xtick.color"] = "black"
        plt.rcParams["ytick.color"] = "black"

    def set_font_size(self):
        plt.rcParams["font.size"] = 15

    def set_title_size(self):
        plt.rcParams["figure.titlesize"] = 20

    def set_legend_fontsize(self, legend_fontsize):
        plt.rcParams["legend.fontsize"] = legend_fontsize

    def set_dpi(self):
        plt.rcParams["figure.dpi"] = 100

custom_style = CustomMatplotlibStyle(legend_fontsize=8)


-----

## Reading Data

In [4]:
raw_df = pd.read_pickle(DF_PATH)

In [5]:
df = raw_df.copy()
df

Unnamed: 0_level_0,mean_xc,mean_yc,mean_zc,mean_xg,mean_yg,mean_zg,posture,types,specimen,set
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-02-11 17:08:05.200,0.013500,0.977000,-0.071000,-1.8904,2.4392,0.9388,bench,heavy,B,64.0
2024-02-11 17:08:05.400,-0.001500,0.970500,-0.079500,-1.6826,-0.8904,2.1708,bench,heavy,B,64.0
2024-02-11 17:08:05.600,0.001333,0.971667,-0.064333,2.5608,-0.2560,-1.4146,bench,heavy,B,64.0
2024-02-11 17:08:05.800,-0.024000,0.957000,-0.073500,8.0610,-4.5244,-2.0730,bench,heavy,B,64.0
2024-02-11 17:08:06.000,-0.028000,0.957667,-0.115000,2.4390,-1.5486,-3.6098,bench,heavy,B,64.0
...,...,...,...,...,...,...,...,...,...,...
2024-02-20 19:33:27.000,-0.048000,-1.041500,-0.076500,1.4146,-5.6218,0.2926,row,medium,E,71.0
2024-02-20 19:33:27.200,-0.037000,-1.030333,-0.053333,-2.7684,-0.5854,2.2440,row,medium,E,71.0
2024-02-20 19:33:27.400,-0.060000,-1.031000,-0.082000,2.8416,-5.1342,-0.1220,row,medium,E,71.0
2024-02-20 19:33:27.600,-0.038667,-1.025667,-0.044667,-0.2318,0.2562,1.1220,row,medium,E,71.0


## Detecting Outliers wz (IQR):

------

- Plotting outliers using boxplots

- **Accelerometer Data**  

`first with x axis [Left and Right]`

In [6]:
fig = go.Figure()
for posture in df['posture'].unique():
    fig.add_trace(go.Box(y=df[df['posture'] == posture]['mean_xc'], name=posture))

# Update layout for better readability
fig.update_layout(
    title='Box Plot Grouped by Posture',
    xaxis_title='Posture',
    yaxis_title='Mean XC',
    template='plotly_dark'
)

# Show the plot
fig.show()


`Second with y axis [Up and Down]`

In [7]:
fig = go.Figure()
for posture in df['posture'].unique():
    fig.add_trace(go.Box(y=df[df['posture'] == posture]['mean_yc'], name=posture))

# Update layout for better readability
fig.update_layout(
    title='Box Plot Grouped by Posture',
    xaxis_title='Posture',
    yaxis_title='Mean YC',
    template='plotly_dark'
)
# Show the plot
fig.show()

-----

- **Gyroscope Data**  

`first with x axis`

In [8]:
fig = go.Figure()
for posture in df['posture'].unique():
    fig.add_trace(go.Box(y=df[df['posture'] == posture]['mean_xg'], name=posture))

# Update layout for better readability
fig.update_layout(
    title='Box Plot Grouped by Posture',
    xaxis_title='Posture',
    yaxis_title='Mean XG',
    template='plotly_dark'
)

# Show the plot
fig.show()


`Second with y axis`

In [9]:
fig = go.Figure()
for posture in df['posture'].unique():
    fig.add_trace(go.Box(y=df[df['posture'] == posture]['mean_yg'], name=posture))

# Update layout for better readability
fig.update_layout(
    title='Box Plot Grouped by Posture',
    xaxis_title='Posture',
    yaxis_title='Mean YG',
    template='plotly_dark'
)

# Show the plot
fig.show()


-  **we see some outliers in the data we need to deal with them i will use 3 method to deal with them**

-------

In [10]:
outlier_df_col = ['mean_xc', 'mean_yc', 'mean_zc', 'mean_xg', 'mean_yg', 'mean_zg']

### Accelerometer Data 

In [11]:
acc_df = df[outlier_df_col[:3] + ['posture']]
acc_df

Unnamed: 0_level_0,mean_xc,mean_yc,mean_zc,posture
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-02-11 17:08:05.200,0.013500,0.977000,-0.071000,bench
2024-02-11 17:08:05.400,-0.001500,0.970500,-0.079500,bench
2024-02-11 17:08:05.600,0.001333,0.971667,-0.064333,bench
2024-02-11 17:08:05.800,-0.024000,0.957000,-0.073500,bench
2024-02-11 17:08:06.000,-0.028000,0.957667,-0.115000,bench
...,...,...,...,...
2024-02-20 19:33:27.000,-0.048000,-1.041500,-0.076500,row
2024-02-20 19:33:27.200,-0.037000,-1.030333,-0.053333,row
2024-02-20 19:33:27.400,-0.060000,-1.031000,-0.082000,row
2024-02-20 19:33:27.600,-0.038667,-1.025667,-0.044667,row


- **Plotting outliers of acc**

In [12]:
grouped_df = acc_df.groupby('posture')

# Create subplots
fig = make_subplots(rows=1, cols=3, subplot_titles=['mean_xc', 'mean_yc', 'mean_zc'])

# Iterate over each column and add box trace to the subplot
for i, col in enumerate(['mean_xc', 'mean_yc', 'mean_zc'], start=1):
    # Iterate over each group
    for group_name, group_data in grouped_df:
        # Add box trace for the current group and column to the subplot
        fig.add_trace(go.Box(y=group_data[col], name=group_name ,showlegend=False), row=1, col=i)

# Update layout for better readability
fig.update_layout(
    title='Box Plot Grouped by Posture',
    xaxis_title='Posture',
    yaxis_title='Values',
    template='plotly_dark',
)
fig.show()


### Gyroscope Data 

- **Plotting outliers of gyr**

In [13]:
gyr_df = df[outlier_df_col[3:] + ['posture']]
gyr_df

Unnamed: 0_level_0,mean_xg,mean_yg,mean_zg,posture
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-02-11 17:08:05.200,-1.8904,2.4392,0.9388,bench
2024-02-11 17:08:05.400,-1.6826,-0.8904,2.1708,bench
2024-02-11 17:08:05.600,2.5608,-0.2560,-1.4146,bench
2024-02-11 17:08:05.800,8.0610,-4.5244,-2.0730,bench
2024-02-11 17:08:06.000,2.4390,-1.5486,-3.6098,bench
...,...,...,...,...
2024-02-20 19:33:27.000,1.4146,-5.6218,0.2926,row
2024-02-20 19:33:27.200,-2.7684,-0.5854,2.2440,row
2024-02-20 19:33:27.400,2.8416,-5.1342,-0.1220,row
2024-02-20 19:33:27.600,-0.2318,0.2562,1.1220,row


In [14]:
grouped_df = gyr_df.groupby('posture')
fig = make_subplots(rows=1, cols=3, subplot_titles=['mean_xg', 'mean_yg', 'mean_zg'])

# Iterate over each column and add box trace to the subplot
for i, col in enumerate(['mean_xg', 'mean_yg', 'mean_zg'], start=1):
    # Iterate over each group
    for group_name, group_data in grouped_df:
        # Add box trace for the current group and column to the subplot
        fig.add_trace(go.Box(y=group_data[col], name=group_name, showlegend=False), row=1, col=i)

# Update layout for better readability
fig.update_layout(
    title='Box Plot Grouped by Posture',
    xaxis_title='Posture',
    yaxis_title='Values',
    template='plotly_dark',
)
# Show the plot
fig.show()

## Plotting outliers wz time :

- **Too look to data in deeper level i use a function by ploting outliers in case of a binary outlier score (github.com/mhoogen/ML4QS)**
- **by using this finction help us to know if the outliers is realy exreme value or something pretty normal** 

In [16]:
def plot_binary_outliers_plotly(dataset, col, outlier_col, reset_index=False):
    dataset = dataset.dropna(subset=[col, outlier_col])
    dataset[outlier_col] = dataset[outlier_col].astype(bool)

    if reset_index:
        dataset = dataset.reset_index(drop=True)

    non_outliers = dataset[~dataset[outlier_col]]
    outliers = dataset[dataset[outlier_col]]

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=non_outliers.index, y=non_outliers[col], mode='markers', marker=dict(color='blue'), name=f"No Outlier {col}"))
    fig.add_trace(go.Scatter(x=outliers.index, y=outliers[col], mode='markers', marker=dict(color='red'), name=f"Outlier {col}"))

    fig.update_layout(
        xaxis_title="Samples",
        yaxis_title="Value",
        legend=dict(
            x=0.5,
            y=1.15,
            orientation="h",
            bgcolor='rgba(255, 255, 255, 0)',
            bordercolor='rgba(255, 255, 255, 0)'
        )
    )
    fig.show()

## Function to marking outliers using IQR


- Since the data is non-normal, you were better off removing outliers using the IQR method. If you think it removes too much data, then use percentiles like <5% and 95% instead of quartiles.

In [17]:
def mark_outliers_percentile(dataset, col, lower_percentile=5, upper_percentile=95):
    dataset = dataset.copy()
    lower_bound = dataset[col].quantile(lower_percentile / 100)
    upper_bound = dataset[col].quantile(upper_percentile / 100)
    dataset[col + "_outlier"] = (dataset[col] < lower_bound) | (dataset[col] > upper_bound)
    return dataset

## Viz with accelerometer data :

In [18]:
new_df = mark_outliers_percentile(dataset = df, col = 'mean_xc')
new_df

Unnamed: 0_level_0,mean_xc,mean_yc,mean_zc,mean_xg,mean_yg,mean_zg,posture,types,specimen,set,mean_xc_outlier
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-02-11 17:08:05.200,0.013500,0.977000,-0.071000,-1.8904,2.4392,0.9388,bench,heavy,B,64.0,False
2024-02-11 17:08:05.400,-0.001500,0.970500,-0.079500,-1.6826,-0.8904,2.1708,bench,heavy,B,64.0,False
2024-02-11 17:08:05.600,0.001333,0.971667,-0.064333,2.5608,-0.2560,-1.4146,bench,heavy,B,64.0,False
2024-02-11 17:08:05.800,-0.024000,0.957000,-0.073500,8.0610,-4.5244,-2.0730,bench,heavy,B,64.0,False
2024-02-11 17:08:06.000,-0.028000,0.957667,-0.115000,2.4390,-1.5486,-3.6098,bench,heavy,B,64.0,False
...,...,...,...,...,...,...,...,...,...,...,...
2024-02-20 19:33:27.000,-0.048000,-1.041500,-0.076500,1.4146,-5.6218,0.2926,row,medium,E,71.0,False
2024-02-20 19:33:27.200,-0.037000,-1.030333,-0.053333,-2.7684,-0.5854,2.2440,row,medium,E,71.0,False
2024-02-20 19:33:27.400,-0.060000,-1.031000,-0.082000,2.8416,-5.1342,-0.1220,row,medium,E,71.0,False
2024-02-20 19:33:27.600,-0.038667,-1.025667,-0.044667,-0.2318,0.2562,1.1220,row,medium,E,71.0,False


- `IQR outliers wz Time`

In [19]:
col = 'mean_xc'
plot_binary_outliers_plotly(dataset=new_df, col=col, outlier_col=col+"_outlier", reset_index=False)

- it's about two weeks of collected data so it's not ideal to visualize  this data so i try it with samples count

- `IQR outliers wz samples count`

In [20]:
plot_binary_outliers_plotly(dataset=new_df, col=col, outlier_col=col+"_outlier", reset_index=True)

- we clear see that red dots are outliers appears to be extrme values at least not in the middle of the data (outliers in top or bottom)

## Plotting all outliers columns:

In [21]:
for col in outlier_df_col:
    marked_dataset = mark_outliers_percentile(dataset=df, col=col)
    plot_binary_outliers_plotly(dataset=marked_dataset, col=col, outlier_col=col+"_outlier", reset_index=True)
    #if not os.path.exists(FIG_DIR):
        #os.makedirs(FIG_DIR)
    #plt.savefig(f"{FIG_DIR}{col}_outliers.png" , dpi=200, format='png', bbox_inches='tight')
    plt.show()


- *After plotting all outliers we clear see that red dots are outliers appears more in Gyroscope data than Accelerometer data*

## Dealing with outliers:

- here we start with using IQR to deal with outliers with lower bound  = 5 and upper bound = 95

In [22]:
# single column first to show difference 
posture = "bench"
col = "mean_xc"
dataset = mark_outliers_percentile(df.loc[df["posture"] == posture], col)
# here i don't need to drop outlier values so make if the value is outlier so make it as nan i will deal with it with IQR
dataset.loc[dataset["mean_xc" + "_outlier"], "mean_xc"] = np.nan

In [23]:
# Showing ressult of True values 
dataset[dataset['mean_xc_outlier']]

Unnamed: 0_level_0,mean_xc,mean_yc,mean_zc,mean_xg,mean_yg,mean_zg,posture,types,specimen,set,mean_xc_outlier
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-02-11 17:08:18.400,,1.393333,-0.085333,26.2194,5.9266,-20.8172,bench,heavy,B,64.0,True
2024-02-11 17:10:23.600,,0.975000,-0.117000,1.3778,-0.1950,1.2196,bench,heavy,A,72.0,True
2024-02-11 17:10:23.800,,0.978000,-0.119000,-1.7440,-1.4514,-1.9148,bench,heavy,A,72.0,True
2024-02-11 17:10:24.000,,0.967667,-0.100000,-0.2194,-4.9756,-3.8658,bench,heavy,A,72.0,True
2024-02-11 17:12:08.600,,1.216000,0.025500,-26.2682,-11.7928,9.4636,bench,heavy,B,40.0,True
...,...,...,...,...,...,...,...,...,...,...,...
2024-02-19 19:21:54.200,,0.863500,-0.060500,3.3902,-8.3536,3.6708,bench,medium,E,70.0,True
2024-02-19 19:21:54.400,,0.972667,-0.049333,1.2806,-2.6098,-1.3172,bench,medium,E,70.0,True
2024-02-19 19:21:54.600,,0.972000,-0.040000,1.1952,-2.7804,-1.4878,bench,medium,E,70.0,True
2024-02-19 19:21:55.000,,0.969000,-0.044000,2.7804,-1.3416,0.0976,bench,medium,E,70.0,True


In [24]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1665 entries, 2024-02-11 17:08:05.200000 to 2024-02-19 19:24:42.600000
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   mean_xc          1499 non-null   float64
 1   mean_yc          1665 non-null   float64
 2   mean_zc          1665 non-null   float64
 3   mean_xg          1665 non-null   float64
 4   mean_yg          1665 non-null   float64
 5   mean_zg          1665 non-null   float64
 6   posture          1665 non-null   object 
 7   types            1665 non-null   object 
 8   specimen         1665 non-null   object 
 9   set              1665 non-null   float64
 10  mean_xc_outlier  1665 non-null   bool   
dtypes: bool(1), float64(7), object(3)
memory usage: 144.7+ KB


In [25]:
outlier_df_col

['mean_xc', 'mean_yc', 'mean_zc', 'mean_xg', 'mean_yg', 'mean_zg']

In [26]:
df.posture.unique()

array(['bench', 'ohp', 'squat', 'dead', 'row', 'rest'], dtype=object)

In [27]:
cleaned_outliers_data = df.copy()
for col in outlier_df_col:
    for pos in df.posture.unique():
        subset_df = cleaned_outliers_data.loc[df["posture"] == pos]
        dataset = mark_outliers_percentile(subset_df, col)
        dataset.loc[dataset[col + "_outlier"], col] = np.nan
        # Updating the column in the original dataframe with the new values
        cleaned_outliers_data.loc[subset_df.index, col] = dataset[col]
        num_outliers_removed = len(cleaned_outliers_data) - len(cleaned_outliers_data[col].dropna())
        print(f"Removed index of {num_outliers_removed} from {col} for {pos}")

Removed index of 166 from mean_xc for bench
Removed index of 334 from mean_xc for ohp
Removed index of 496 from mean_xc for squat
Removed index of 650 from mean_xc for dead
Removed index of 780 from mean_xc for row
Removed index of 888 from mean_xc for rest
Removed index of 168 from mean_yc for bench
Removed index of 336 from mean_yc for ohp
Removed index of 498 from mean_yc for squat
Removed index of 652 from mean_yc for dead
Removed index of 792 from mean_yc for row
Removed index of 900 from mean_yc for rest
Removed index of 168 from mean_zc for bench
Removed index of 336 from mean_zc for ohp
Removed index of 496 from mean_zc for squat
Removed index of 649 from mean_zc for dead
Removed index of 789 from mean_zc for row
Removed index of 897 from mean_zc for rest
Removed index of 168 from mean_xg for bench
Removed index of 336 from mean_xg for ohp
Removed index of 498 from mean_xg for squat
Removed index of 651 from mean_xg for dead
Removed index of 793 from mean_xg for row
Removed ind

In [28]:
cleaned_outliers_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9009 entries, 2024-02-11 17:08:05.200000 to 2024-02-20 19:33:27.800000
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   mean_xc    8121 non-null   float64
 1   mean_yc    8109 non-null   float64
 2   mean_zc    8112 non-null   float64
 3   mean_xg    8108 non-null   float64
 4   mean_yg    8111 non-null   float64
 5   mean_zg    8114 non-null   float64
 6   posture    9009 non-null   object 
 7   types      9009 non-null   object 
 8   specimen   9009 non-null   object 
 9   set        9009 non-null   float64
dtypes: float64(7), object(3)
memory usage: 1.0+ MB


-----

# Export Data 

In [29]:
cleaned_outliers_data.to_pickle(EXPORT_PATH)

-----