# Preprocessing

In [None]:
# Prediction margin: the only parameter to set. Margin in {4, 7, 10, 13} (aka 0.3, 0.6, 0.9, 1.2 seconds)
# It is compared against the "label" countdown when the differencing baseline is built
# (only rows with label >= margin contribute) and it is embedded in every output file name.
margin = 13

## Import libraries and define utility functions

In [None]:
import pandas as pd
import numpy as np
import sys
import random
import pickle
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from math import floor

In [None]:
def mean(l):
    """Return the arithmetic mean of the values in *l*."""
    return sum(l) / len(l)


def describe(l, n_quantiles=10, list_quantiles=None):
    """Print and return summary statistics of a sequence of numbers.

    Prints min, max, average and size, followed by one line per quantile.

    Args:
        l: non-empty sequence of numbers.
        n_quantiles: number of equally sized bins used to derive the
            quantiles (1/n, 2/n, ...) when ``list_quantiles`` is not given.
        list_quantiles: explicit quantiles in [0, 1]; overrides ``n_quantiles``.

    Returns:
        The tuple ``(min, max, mean, size)``.
    """
    lowest, highest, average, total = min(l), max(l), mean(l), len(l)
    print(f'Min={lowest:.4}, Max={highest:.4}, Avg={average:.4}, Tot={total}')
    quantiles = ([step / n_quantiles for step in range(1, n_quantiles)]
                 if list_quantiles is None else list_quantiles)
    for q in quantiles:
        print(q, ":", round(np.quantile(l, q), 3))
    return lowest, highest, average, total

#### Set up global variables

In [None]:
pd.set_option('display.float_format', lambda x: '%.4f' % x)
plt.rcParams['figure.figsize'] = [18, 12]

#### Paths

In [None]:
f_headers = r'data/headers.txt'
f_acc = r'data/raw/accelerations.csv'
f_pos = r'data/raw/positions.csv'
f_labels = r'data/raw/labels_train.csv'

## Raw data import

In [None]:
# The headers file holds one comma-separated list of column names per line,
# in this order: accelerations, positions, labels.
with open(f_headers, "r") as file:
    # Strip only the line terminator: slicing with [:-1] would eat a real
    # character when a line has no trailing newline, and leaving the last line
    # unstripped would put a "\n" inside the last column name.
    col_acc = file.readline().rstrip("\r\n").split(",")
    col_pos = file.readline().rstrip("\r\n").split(",")
    col_lab = file.readline().rstrip("\r\n").split(",")

In [None]:
acc_df = pd.read_csv(f_acc, names=col_acc)
pos_df = pd.read_csv(f_pos, names=col_pos)
lab_df = pd.read_csv(f_labels, names=col_lab)

#### Get timestamps

In [None]:
# Parse the position timestamps (text in the '%Y-%m-%d %H:%M:%S.%f' format) and replace
# them with POSIX seconds, so that merge_data can compare them numerically with the
# label and acceleration timestamps.
pos_df["time_server"] = pd.to_datetime(pos_df["time_server"], format='%Y-%m-%d %H:%M:%S.%f').apply(lambda x: x.timestamp())

#### Raw data inspection

In [None]:
print(f'The dataset contains {len(acc_df)} samples')
acc_df.head()

In [None]:
print(f'The dataset contains {len(pos_df)} samples')
pos_df.head()

In [None]:
print(f'The dataset contains {len(lab_df)} samples')
lab_df.head()

## Data merge

In [None]:
def find_nearest(array, value, return_index=True):
    """Find the element of *array* closest to *value*.

    Args:
        array: array-like of numbers (converted with ``np.asarray``).
        value: reference number.
        return_index: when True also return the position of the match.

    Returns:
        ``(element, index)`` if ``return_index`` is True, otherwise just the
        element. On ties the first closest element wins.
    """
    candidates = np.asarray(array)
    closest = np.argmin(np.abs(candidates - value))
    match = candidates[closest]
    return (match, closest) if return_index else match

def merge_data(acc, pos, lab):
    """Join every acceleration sample with its nearest-in-time position sample.

    For each row of *lab* labelled "s", the lap spans from that row's
    ``time_server`` to the ``time_server`` of the following row (widened by
    ``delta`` on both sides). Every acceleration sample in that span is paired
    with the position sample of the same span whose timestamp is closest.

    Args:
        acc: DataFrame of acceleration samples with a "time_server" column.
        pos: DataFrame of position samples with a numeric "time_server" column.
        lab: DataFrame with "label" and "time_server" columns, indexed 0..n-1.

    Returns:
        ``(merged_data, disalignments)`` where each entry of ``merged_data`` is
        the acceleration row values, followed by the matched position row
        values, followed by the number of acceleration samples left until the
        end of the lap (0 for the last one); ``disalignments`` holds the
        absolute time distance of every pairing.

    Raises:
        ValueError: if a lap has acceleration samples but no position sample.
    """
    delta = 0.05  # data sampling from sensors every 0.1s on average
    merged_data = []
    disalignments = []

    for i_lab in range(len(lab) - 1):
        if lab.loc[i_lab, "label"] != "s":
            continue

        start = lab.loc[i_lab, "time_server"]
        end = lab.loc[i_lab + 1, "time_server"]
        curr_acc = acc[(acc["time_server"] > start - delta) & (acc["time_server"] < end + delta)]
        curr_pos = pos[(pos["time_server"] > start - delta) & (pos["time_server"] < end + delta)]

        n_acc = len(curr_acc)
        if n_acc and curr_pos.empty:
            # Fail with an explicit message instead of the opaque
            # "attempt to get argmin of an empty sequence".
            raise ValueError(
                f"No position samples between {start - delta} and {end + delta} "
                f"for the lap starting at label row {i_lab}"
            )

        # The candidate timestamps do not change within a lap: extract them once.
        pos_times = curr_pos["time_server"].to_numpy()

        for i_acc in range(n_acc):
            acc_row = curr_acc.iloc[i_acc]
            curr_time = acc_row["time_server"]
            i_nearest = np.abs(pos_times - curr_time).argmin()
            disalignments.append(np.abs(curr_time - pos_times[i_nearest]))
            merged_data.append(
                acc_row.tolist() + curr_pos.iloc[i_nearest].tolist() + [n_acc - 1 - i_acc]
            )

        print(f'The {int(i_lab / 2 + 1)}° lap lasts for approx. {len(curr_acc) / 10} seconds')

    return merged_data, disalignments

In [None]:
merged_data, time_disalignments = merge_data(acc_df, pos_df, lab_df)

In [None]:
# Each merged row is: the acceleration columns, the matched position row (its first column
# is named "time_server_pos" here to avoid a duplicate name; assumes time_server is the
# first position column - TODO confirm against data/headers.txt), and finally "label",
# the number of acceleration samples left until the end of the lap.
raw_df = pd.DataFrame(merged_data, columns=col_acc+["time_server_pos"]+col_pos[1:]+["label"])
raw_df

## Data processing

In [None]:
useful_columns_raw = ["Gz", "Ax", "Ay", 'POSx', 'POSy', 'orient', "label"]

In [None]:
raw_df_reduced = raw_df[useful_columns_raw]

In [None]:
# Row indexes where the countdown label reaches 0, i.e. where a fault occurs.
fault_indexes = raw_df_reduced.index[raw_df_reduced["label"] == 0].tolist()

# Cut the frame into consecutive segments, each one ending on (and including) a fault row.
df_split = []
previous = 0
for fi in fault_indexes:
    segment_end = fi + 1
    df_split.append(raw_df_reduced.iloc[previous:segment_end, :])
    previous = segment_end

In [None]:
print(f'There are {len(df_split)} faults, hence {len(df_split)} datasets.')
for i, df_tmp in enumerate(df_split):
    print(i, df_tmp.shape)

## Features creation

In [None]:
w_lens = [5, 10, 15, 20]

In [None]:
# For every fault-delimited segment build, per window length, the rolling
# mean/std/min/max of each column, plus the first difference of each column.
# Rows made incomplete by the rolling windows or the shift are dropped, so
# no window ever spans two different segments.
new_dfs = []
lag_features = []
rolling_stats = ("mean", "std", "min", "max")

for i_split, temps in enumerate(df_split):
    curr_w_len_data = []
    for w_len in w_lens:
        window = temps.rolling(w_len)
        for stat in rolling_stats:
            stat_df = getattr(window, stat)()
            cols = [t + "_" + stat + "_w" + str(w_len) for t in temps.columns]
            stat_df.columns = cols
            curr_w_len_data.append(stat_df)
            # Column names are the same for every segment: record them once.
            if i_split == 0:
                lag_features.append(cols)

    temps_diff = temps - temps.shift(1)
    temps_diff.columns = [t + "_diff" for t in temps.columns]

    df_with_nan = pd.concat([temps, temps_diff] + curr_w_len_data, axis=1)
    new_dfs.append(df_with_nan.dropna())

# A single concat avoids re-copying the accumulated frame at every iteration.
df_new_features = pd.concat(new_dfs)

# Flatten the per-statistic name lists, keeping only the sensor signals
# (rolling stats of position, orientation and label are not used as features).
lag_features = [e for nested_lag_features in lag_features for e in nested_lag_features
                if not e.startswith(("POS", "orient", "label"))]

In [None]:
# Instantaneous sensor signals and their first differences.
point_features = ['Gz', 'Ax', 'Ay', 'Gz_diff', 
                  'Ax_diff', 'Ay_diff']
# Features that later get a position/orientation baseline subtracted from them.
differencing_features = point_features + lag_features
# Final column order: differencing features, then position, orientation and label.
new_features = differencing_features + ['POSx', 'POSy', 'orient', 'label']
df_new_features = df_new_features[new_features]
# Segments were concatenated with their original row labels: renumber from 0 so that
# later column assignments built from plain lists/Series align row by row.
df_new_features = df_new_features.reset_index(drop=True)

In [None]:
df_new_features.head()

In [None]:
df_new_features.describe()

In [None]:
df_new_features.columns.tolist()

## Handling seasonality with differencing over position and orientation

### WARNING: Currently using POSx because of new trajectories. Also fixed_orient changes.

In [None]:
df_new_features["POSx"].min(), df_new_features["POSx"].max()

In [None]:
fixed_pos = np.linspace(4.6, 15.9, 114)
fixed_pos

### When the blue line is in the white area the AGV is moving forward or backward

In [None]:
# Orientation and (rescaled) x position over time, with the orientation bands shaded.
(df_new_features["orient"]).plot()
(df_new_features["POSx"] * 100 - 1000).plot()

x = np.linspace(0, 100000, 100)
shaded_bands = (
    (-45, 45, 'lightblue'),
    (-135, -180, 'orange'),
    (135, 180, 'orange'),
)
for band_from, band_to, band_color in shaded_bands:
    plt.fill_between(x, np.full_like(x, band_from), np.full_like(x, band_to),
                     color=band_color, alpha=0.5)

plt.xlim((11000, 12000))
plt.ylim((-600, 600))
plt.show()

In [None]:
df_new_features["orient"].max(), df_new_features["orient"].min()

In [None]:
fixed_orient = [[-135, -45], [45, 135]]

In [None]:
# Discretize the orientation: 1 when it lies strictly inside the first fixed_orient
# range, -1 when it lies strictly inside the second one, 0 otherwise.
# NOTE: the second test used to join its two bounds with `or`, which is true for every
# number, so the 0 class was never produced. Artefacts generated with the old condition
# (training CSV, differencing_dict) must be regenerated.
df_new_features["orient_discr"] = pd.Series([1 if fixed_orient[0][0] < orient < fixed_orient[0][1]
                                             else -1 if fixed_orient[1][0] < orient < fixed_orient[1][1]
                                             else 0 for orient in df_new_features["orient"]])
# Snap every x position to the nearest value of the fixed_pos grid.
df_new_features["POSx_discr"] = pd.Series([find_nearest(fixed_pos, curr_pos, return_index=False)
                                           for curr_pos in df_new_features["POSx"]])

In [None]:
df_new_features.head()

In [None]:
# Subtract from every differencing feature its average value at the same
# (discretized orientation, discretized x position).
#
# differencing_dict[feature][key] -> running sum, later the mean, of `feature` for `key`
# differencing_list[feature]      -> per-row value of `feature` minus that mean
# count_dict[key]                 -> number of rows that contributed to `key`
differencing_dict = {feature: {} for feature in differencing_features}
differencing_list = {feature: [] for feature in differencing_features}
count_dict = {}

# Pass 1: accumulate sums and counts, using only rows whose label is at least `margin`.
for _, row in df_new_features.iterrows():
    if row["label"] >= margin:
        keys = (row["orient_discr"], row["POSx_discr"])
        # All features share the same key set, so "Ax" is enough for the membership test.
        if keys not in differencing_dict["Ax"]:
            for feature in differencing_features:
                differencing_dict[feature][keys] = row[feature]
            count_dict[keys] = 1
        else:
            for feature in differencing_features:
                differencing_dict[feature][keys] += row[feature]
            count_dict[keys] += 1

# Turn the accumulated sums into means.
for feature in differencing_features:
    for keys in differencing_dict["Ax"]:
        differencing_dict[feature][keys] = differencing_dict[feature][keys] / count_dict[keys]
    
# Pass 2: subtract the mean of its key from every row (this time regardless of the label).
# NOTE(review): a key that only occurs in rows with label < margin was never added in
# pass 1, so this lookup would raise KeyError - confirm this cannot happen with the data.
for _, row in df_new_features.iterrows():
    for feature in differencing_features:
        differencing_list[feature].append(row[feature] - differencing_dict[feature][(row["orient_discr"], row["POSx_discr"])])

# Store the results as new "differencing_<feature>" columns.
for feature in differencing_features:
    df_new_features["differencing_" + feature] = pd.Series(differencing_list[feature])

df_new_features.head()

In [None]:
list(df_new_features.columns)

## Features scaling

In [None]:
# Standardize every column, then put back the raw (unscaled) versions of the label and
# of the two discretized columns at the end of the frame.
# NOTE(review): the scaler is fitted on ALL columns, including the three whose scaled
# values are discarded here, so the pickled scaler expects those columns as input too.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_new_features)
df = pd.DataFrame(scaled_values, columns=df_new_features.columns.tolist())
df = df.drop(columns=["label", 'orient_discr', 'POSx_discr'])
for raw_column in ("orient_discr", "POSx_discr", "label"):
    df[raw_column] = df_new_features[raw_column]
df

In [None]:
df.describe()

## Temporal delay analysis

#### Time disalignments intra-datasets

Collect the time differences between all pairs of consecutive observations.

In [None]:
def get_gaps(df, are_labels=False):
    """Return the time differences between consecutive rows of *df*.

    Args:
        df: DataFrame with a numeric "time_server" column (and a "label"
            column when ``are_labels`` is True), rows in chronological order.
        are_labels: when True, keep only the gaps that begin at a row labelled
            "s", i.e. the distance between an "s" row and the row after it.
            Previously both branches appended the gap, so this flag had no
            effect.

    Returns:
        A list with one gap per pair of consecutive rows (filtered as
        described above); empty when *df* has fewer than two rows.
    """
    times = df["time_server"].to_numpy()
    gaps = times[1:] - times[:-1]
    if are_labels:
        # Gap i goes from row i to row i + 1: keep it only if row i is an "s" row.
        starts_at_s = (df["label"] == "s").to_numpy()[:-1]
        gaps = gaps[starts_at_s]
    return list(gaps)

#### Accelerations:

In [None]:
acc_gaps = get_gaps(acc_df)

In [None]:
min_acc, max_acc, _, _ = describe(acc_gaps, list_quantiles=[0.5, 0.9, 0.99, 0.999, 0.9999])

In [None]:
plt.hist(acc_gaps, bins=100)
plt.xticks(np.arange(0, max_acc, round(max_acc / 25, 2)))
plt.show()

In [None]:
plt.hist([g for g in acc_gaps if g < 1], bins=100)
plt.xticks(np.arange(0, 1, round(1 / 25, 2)))
plt.show()

#### Positions:

In [None]:
pos_gaps = get_gaps(pos_df)

In [None]:
min_pos, max_pos, _, _ = describe(pos_gaps, list_quantiles=[0.5, 0.9, 0.99, 0.999, 0.9999])

In [None]:
plt.hist(pos_gaps, bins=100)
plt.xticks(np.arange(0, max_pos, round(max_pos / 25, 2)))
plt.show()

In [None]:
plt.hist([g for g in pos_gaps if g < 1], bins=100)
plt.xticks(np.arange(0, 1, round(1 / 25, 2)))
plt.show()

#### Labels:

In [None]:
lab_gaps = get_gaps(lab_df, are_labels=True)

In [None]:
min_lab, max_lab, _, _ = describe(lab_gaps)

In [None]:
plt.hist(lab_gaps, bins=50)
plt.xticks(np.arange(0, max_lab, round(max_lab / 25, 2)))
plt.show()

#### Time disalignments inter-datasets

In [None]:
min_td, max_td, _, _ = describe(time_disalignments, list_quantiles=[0.5, 0.9, 0.99, 0.999, 0.9999])

In [None]:
plt.hist(time_disalignments, bins=100)
plt.xticks(np.arange(0, max_td, round(max_td / 25, 2)))
plt.show()

In [None]:
plt.hist([t for t in time_disalignments if t < 1], bins=100)
plt.xticks(np.arange(0, 1, round(1 / 25, 2)))
plt.show()

## Remaining useful life (RUL)

The following plot shows the Remaining Useful Life (RUL), namely the number of time steps remaining before a failure occurs.

In [None]:
plt.plot(df["label"])
plt.show()

## Data trends visualization

In [None]:
def plot_all_signal(df, feature):
    """Plot *feature* and "POSx" over the whole frame, marking the fault rows.

    Fault rows (label == 0) get a red dot at y=0; all other rows are drawn at
    y=30, which falls outside the (-9, 5) y-range and is therefore not visible.
    """
    fault_markers = (df["label"] == 0).map({True: 0, False: 30})
    plt.plot(df[feature], alpha=0.6)
    plt.plot(df["POSx"])
    plt.scatter(range(len(df)), fault_markers, color="red", s=20)
    plt.ylim(-9, 5)
    plt.show()

In [None]:
def plot_all_signal_zoomed(df, feature, window=2000):
    """Plot *feature* and "POSx" in consecutive windows, one figure per window.

    Fault rows (label == 0) get a red dot at y=0; all other rows are drawn at
    y=30, outside the visible (-9, 5) y-range.

    Args:
        df: frame with *feature*, "POSx" and "label" columns, indexed 0..n-1.
        feature: name of the column to plot.
        window: number of rows shown in each figure.

    The last window is clamped to the end of the frame, so the tail is plotted
    too (it used to be skipped, and frames of at most ``window`` rows produced
    no figure at all).
    """
    # The marker series does not depend on the window: build it once.
    fault_markers = (df["label"] == 0).map({True: 0, False: 30})
    n_rows = len(df)
    for start in range(0, n_rows, window):
        # Clamp so that range(start, end) and the slices have the same length.
        end = min(start + window, n_rows)
        plt.plot(df[feature][start:end], alpha=0.6)
        plt.plot(df["POSx"][start:end])
        plt.scatter(range(start, end), fault_markers[start:end], color="red", s=20)
        plt.ylim(-9, 5)
        plt.show()

In [None]:
def plot_only_faults(df, feature):
    """Plot *feature* and "POSx" over the 20 rows preceding each fault row.

    One figure is shown per fault (label == 0). The fault row gets a red dot
    at y=0; the other rows are drawn at y=30, outside the visible (-9, 5)
    y-range.

    Args:
        df: frame with *feature*, "POSx" and "label" columns, indexed 0..n-1.
        feature: name of the column to plot.
    """
    fault_mask = df["label"] == 0
    # The marker series is the same for every figure: build it once.
    fault_markers = fault_mask.map({True: 0, False: 30})
    for i, is_fault in enumerate(fault_mask.tolist()):
        if not is_fault:
            continue
        # Clamp at 0: a negative start would slice from the end of the frame
        # and make the scatter x/y sizes disagree.
        start = max(i - 20, 0)
        end = i + 1
        plt.plot(df[feature][start:end], alpha=0.6)
        plt.plot(df["POSx"][start:end])
        plt.scatter(range(start, end), fault_markers[start:end], color="red", s=20)
        plt.ylim(-9, 5)
        plt.show()

In [None]:
list(df.columns)  # select among these features

In [None]:
feature = "differencing_Ax_min_w5"  # select feature to analyze here

#### Analysis

In [None]:
plot_all_signal(df, feature)

In [None]:
plot_all_signal_zoomed(df, feature)

In [None]:
plot_only_faults(df, feature)

## Data and constants storage

In [None]:
# Training set for the chosen margin.
df.to_csv(f"data/train/training_{margin}.csv")

# Objects needed to reproduce the same preprocessing later, one pickle file each.
artefacts = (
    ("scaler", scaler),
    ("fixed_orient", fixed_orient),
    ("fixed_pos", fixed_pos),
    ("differencing_dict", differencing_dict),
)
for stem, artefact in artefacts:
    with open(f"data/utils/{stem}_{margin}.bin", "wb") as handle:
        pickle.dump(artefact, handle)