### Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from scipy import stats
from scipy.signal import find_peaks

%matplotlib inline

### Data Analysis

In [2]:
# NOTE(review): absolute, machine-specific Windows path; the notebook only runs
# on another machine after this is edited (consider a configurable data directory).
path = r"C:\Users\arockias\Desktop\SmartRadL\SmartRadL\Combined_Dataset\Overall_Dataset.csv"
# Load the combined dataset CSV into a DataFrame.
df = pd.read_csv(path)

In [3]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,time,gFx,gFy,gFz,ax,ay,az,wx,wy,wz,Bx,By,Bz,Azimuth,Pitch,Roll,Latitude,Longitude,Speed_(km/hr),Labels
0,2022-05-17 12:02:40.821,0.266,0.6544,0.7067,-0.1974,0.018,0.0424,0.015,0.0457,0.0929,48.0924,-9.638,-20.374,277.1417,-42.9618,8.5736,48.775411,9.171374,0.0,Stop
1,2022-05-17 12:02:40.823,0.266,0.6544,0.7067,0.3672,-0.1263,0.0487,0.015,0.0457,0.0929,48.0924,-9.638,-20.374,277.1417,-42.9618,8.5736,48.775411,9.171374,0.0,Stop
2,2022-05-17 12:02:40.834,0.3034,0.6422,0.6948,0.3672,-0.1263,0.0487,0.015,0.0457,0.0929,48.0924,-9.638,-20.374,277.1417,-42.9618,8.5736,48.775411,9.171374,0.0,Stop
3,2022-05-17 12:02:40.868,0.2816,0.6297,0.7473,0.3672,-0.1263,0.0487,0.015,0.0457,0.0929,48.0924,-9.638,-20.374,277.1417,-42.9618,8.5736,48.775411,9.171374,0.0,Stop
4,2022-05-17 12:02:40.869,0.2816,0.6297,0.7473,0.3672,-0.1263,0.0487,0.015,0.0457,0.0929,48.3974,-9.6136,-20.1666,277.1417,-42.9618,8.5736,48.775411,9.171374,0.0,Stop


In [4]:
# Summarise the columns: dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393567 entries, 0 to 393566
Data columns (total 20 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   time           393567 non-null  object 
 1   gFx            393567 non-null  float64
 2   gFy            393567 non-null  float64
 3   gFz            393567 non-null  float64
 4   ax             393567 non-null  float64
 5   ay             393567 non-null  float64
 6   az             393567 non-null  float64
 7   wx             393567 non-null  float64
 8   wy             393567 non-null  float64
 9   wz             393567 non-null  float64
 10  Bx             393567 non-null  float64
 11  By             393567 non-null  float64
 12  Bz             393567 non-null  float64
 13  Azimuth        393567 non-null  float64
 14  Pitch          393567 non-null  float64
 15  Roll           393567 non-null  float64
 16  Latitude       393567 non-null  float64
 17  Longitude      393567 non-nul

In [5]:
# Number of rows carrying each value of the Labels column.
df.Labels.value_counts()

Asphalt_Ride        256169
Stop                 71055
Cobblestone_Ride     47537
Asphalt_Manhole      13892
Asphalt_Kurb          3054
Asphalt_Bump          1860
Name: Labels, dtype: int64

In [6]:
window_size = 10
step_size = 5

# Raw accelerometer samples per window (one array of window_size values each).
x_list = []
y_list = []
z_list = []
# Per-window means of the remaining numeric columns.
gFx, gFy, gFz, wx, wy, wz, Azimuth, Pitch, Roll, Speed = ([] for _ in range(10))
train_labels = []
remaining_data = []  # never filled in this cell; kept so the name stays defined

# Output column name -> list collecting that column's per-window mean.
# The insertion order here is the column order of df_rem.
mean_targets = {
    "gFx": gFx, "gFy": gFy, "gFz": gFz,
    "wx": wx, "wy": wy, "wz": wz,
    "Azimuth": Azimuth, "Pitch": Pitch, "Roll": Roll,
    "Speed_(km/hr)": Speed,
}

# Pull every column out of the DataFrame once instead of once per window.
ax_values = df['ax'].to_numpy()
ay_values = df['ay'].to_numpy()
az_values = df['az'].to_numpy()
mean_sources = {name: df[name].to_numpy() for name in mean_targets}
label_values = df['Labels'].to_numpy()

# Create overlapping windows of window_size samples, advancing step_size samples
# per window. The "+ 1" keeps the final window when it ends exactly on the last
# row (the previous bound silently dropped it in that case).
for start in range(0, len(df) - window_size + 1, step_size):
    window = slice(start, start + window_size)

    x_list.append(ax_values[window])
    y_list.append(ay_values[window])
    z_list.append(az_values[window])

    for name, collected in mean_targets.items():
        collected.append(mean_sources[name][window].mean())

    # Majority label of the window. Series.mode() returns the tied modes sorted,
    # so .iloc[0] picks the smallest one -- the same tie-break scipy.stats.mode
    # used. stats.mode(...)[0][0] is avoided because newer SciPy releases reject
    # non-numeric input and no longer return an indexable array by default.
    train_labels.append(pd.Series(label_values[window]).mode().iloc[0])

df_remaining = {**mean_targets, "Labels": train_labels}
df_rem = pd.DataFrame(df_remaining)

# FFT extraction: magnitude spectrum of each window with the DC bin (index 0) dropped.
# NOTE(review): with window_size = 10 the slice [1:10] keeps all nine non-DC bins.
# For real-valued input bins 6-9 mirror bins 4-1, and the FFT feature code divides
# by 5 rather than 9 -- confirm whether [1:window_size // 2 + 1] was intended.
# Left unchanged here so the exported feature values stay the same.
def _fft_magnitudes(samples):
    return np.abs(np.fft.fft(samples))[1:10]

x_list_fft = pd.Series(x_list).apply(_fft_magnitudes)
y_list_fft = pd.Series(y_list).apply(_fft_magnitudes)
z_list_fft = pd.Series(z_list).apply(_fft_magnitudes)

In [7]:
def statistical_feature_from_raw_data(feature_list, column_name):
    """Compute per-window time-domain statistics.

    Parameters
    ----------
    feature_list : sequence of 1-D numpy arrays
        One array of samples per window.
    column_name : str
        Prefix for the generated column names (e.g. 'x' -> 'x_mean').

    Returns
    -------
    pandas.DataFrame
        One row per window with the columns ``<column_name>_mean``, ``_std``,
        ``_aad``, ``_min``, ``_max``, ``_maxmin_diff``, ``_median``, ``_mad``,
        ``_IQR``, ``_neg_count``, ``_pos_count``, ``_above_mean``,
        ``_peak_count``, ``_skewness``, ``_kurtosis`` and ``_energy``,
        in that order.

    Notes
    -----
    The energy feature is normalised by each window's own length rather than
    by a notebook-level ``window_size`` variable, so the function no longer
    depends on state defined in another cell. For windows of ``window_size``
    samples the value is identical.
    """
    # Build the Series of windows once instead of once per feature.
    windows = pd.Series(feature_list)
    prefix = column_name + '_'

    features = {}

    # mean
    features[prefix + 'mean'] = windows.apply(lambda x: x.mean())

    # std dev (numpy default, i.e. population standard deviation)
    features[prefix + 'std'] = windows.apply(lambda x: x.std())

    # avg absolute diff from the mean
    features[prefix + 'aad'] = windows.apply(lambda x: np.mean(np.absolute(x - np.mean(x))))

    # min
    features[prefix + 'min'] = windows.apply(lambda x: x.min())

    # max
    features[prefix + 'max'] = windows.apply(lambda x: x.max())

    # max-min diff
    features[prefix + 'maxmin_diff'] = features[prefix + 'max'] - features[prefix + 'min']

    # median
    features[prefix + 'median'] = windows.apply(lambda x: np.median(x))

    # median abs dev
    features[prefix + 'mad'] = windows.apply(lambda x: np.median(np.absolute(x - np.median(x))))

    # interquartile range
    features[prefix + 'IQR'] = windows.apply(lambda x: np.percentile(x, 75) - np.percentile(x, 25))

    # negative count
    features[prefix + 'neg_count'] = windows.apply(lambda x: np.sum(x < 0))

    # positive count
    features[prefix + 'pos_count'] = windows.apply(lambda x: np.sum(x > 0))

    # values above mean
    features[prefix + 'above_mean'] = windows.apply(lambda x: np.sum(x > x.mean()))

    # number of peaks (strict local maxima; window edges never count)
    features[prefix + 'peak_count'] = windows.apply(lambda x: len(find_peaks(x)[0]))

    # skewness
    features[prefix + 'skewness'] = windows.apply(lambda x: stats.skew(x))

    # kurtosis
    features[prefix + 'kurtosis'] = windows.apply(lambda x: stats.kurtosis(x))

    # energy: mean of squared samples, normalised by the window's own length
    features[prefix + 'energy'] = windows.apply(lambda x: np.sum(x**2) / len(x))

    # dict insertion order fixes the column order
    return pd.DataFrame(features)

In [8]:
# Per-axis time-domain statistics, computed in x, y, z order.
X_data_x_axis, X_data_y_axis, X_data_z_axis = (
    statistical_feature_from_raw_data(samples, axis_name)
    for samples, axis_name in ((x_list, 'x'), (y_list, 'y'), (z_list, 'z'))
)

# Features that combine all three axes.
X_data_all = pd.DataFrame()

# Average resultant: mean over the window of sqrt(x^2 + y^2 + z^2).
X_data_all['avg_result_accl'] = [
    ((xs**2 + ys**2 + zs**2) ** 0.5).mean()
    for xs, ys, zs in zip(x_list, y_list, z_list)
]

# Signal magnitude area: per-axis sum of |sample| / window_size, added over the axes.
sma_terms = [
    pd.Series(samples).apply(lambda x: np.sum(abs(x)/window_size))
    for samples in (x_list, y_list, z_list)
]
X_data_all['sma'] = sma_terms[0] + sma_terms[1] + sma_terms[2]

In [9]:
def FFT_features(fft_feature_list, column_name, energy_divisor=5):
    """Compute per-window statistics of FFT magnitude spectra.

    Parameters
    ----------
    fft_feature_list : sequence or pandas.Series of 1-D numpy arrays
        One array of FFT magnitudes per window.
    column_name : str
        Prefix for the generated column names (e.g. 'x' -> 'x_mean_fft').
    energy_divisor : int or float, optional
        Value the summed squared magnitudes are divided by for the energy
        feature. Defaults to 5, the constant that was previously hard-coded,
        so existing two-argument calls produce the same values.

    Returns
    -------
    pandas.DataFrame
        One row per window with the columns ``<column_name>_mean_fft``,
        ``_std_fft``, ``_aad_fft``, ``_min_fft``, ``_max_fft``,
        ``_maxmin_diff_fft``, ``_median_fft``, ``_mad_fft``, ``_IQR_fft``,
        ``_above_mean_fft``, ``_peak_count_fft``, ``_skewness_fft``,
        ``_kurtosis_fft`` and ``_energy_fft``, in that order.
    """
    # Build the Series of spectra once instead of once per feature.
    spectra = pd.Series(fft_feature_list)
    prefix = column_name + '_'

    features = {}

    # FFT mean
    features[prefix + 'mean_fft'] = spectra.apply(lambda x: x.mean())

    # FFT std dev (numpy default, i.e. population standard deviation)
    features[prefix + 'std_fft'] = spectra.apply(lambda x: x.std())

    # FFT avg absolute diff from the mean
    features[prefix + 'aad_fft'] = spectra.apply(lambda x: np.mean(np.absolute(x - np.mean(x))))

    # FFT min
    features[prefix + 'min_fft'] = spectra.apply(lambda x: x.min())

    # FFT max
    features[prefix + 'max_fft'] = spectra.apply(lambda x: x.max())

    # FFT max-min diff
    features[prefix + 'maxmin_diff_fft'] = features[prefix + 'max_fft'] - features[prefix + 'min_fft']

    # FFT median
    features[prefix + 'median_fft'] = spectra.apply(lambda x: np.median(x))

    # FFT median abs dev
    features[prefix + 'mad_fft'] = spectra.apply(lambda x: np.median(np.absolute(x - np.median(x))))

    # FFT interquartile range
    features[prefix + 'IQR_fft'] = spectra.apply(lambda x: np.percentile(x, 75) - np.percentile(x, 25))

    # FFT values above mean
    features[prefix + 'above_mean_fft'] = spectra.apply(lambda x: np.sum(x > x.mean()))

    # FFT number of peaks (strict local maxima; edges never count)
    features[prefix + 'peak_count_fft'] = spectra.apply(lambda x: len(find_peaks(x)[0]))

    # FFT skewness
    features[prefix + 'skewness_fft'] = spectra.apply(lambda x: stats.skew(x))

    # FFT kurtosis
    features[prefix + 'kurtosis_fft'] = spectra.apply(lambda x: stats.kurtosis(x))

    # FFT energy: summed squared magnitudes scaled by energy_divisor
    features[prefix + 'energy_fft'] = spectra.apply(lambda x: np.sum(x**2) / energy_divisor)

    # dict insertion order fixes the column order
    return pd.DataFrame(features)

In [10]:
# Per-axis frequency-domain statistics, computed in x, y, z order.
X_fft_data_x_axis, X_fft_data_y_axis, X_fft_data_z_axis = (
    FFT_features(spectra, axis_name)
    for spectra, axis_name in ((x_list_fft, 'x'), (y_list_fft, 'y'), (z_list_fft, 'z'))
)

# Features that combine the spectra of all three axes.
X_fft_data_all = pd.DataFrame()

# FFT average resultant: mean over the bins of sqrt(X^2 + Y^2 + Z^2).
X_fft_data_all['avg_result_accl_fft'] = [
    ((fx**2 + fy**2 + fz**2) ** 0.5).mean()
    for fx, fy, fz in zip(x_list_fft, y_list_fft, z_list_fft)
]

# FFT signal magnitude area: per-axis sum of |magnitude| / 5, added over the axes.
sma_fft_terms = [
    pd.Series(spectra).apply(lambda x: np.sum(abs(x)/5))
    for spectra in (x_list_fft, y_list_fft, z_list_fft)
]
X_fft_data_all['sma_fft'] = sma_fft_terms[0] + sma_fft_terms[1] + sma_fft_terms[2]

In [11]:
# Join all feature tables side by side (one row per window); the order of this
# list is the column order of the resulting frame.
feature_blocks = [
    X_data_x_axis,
    X_data_y_axis,
    X_data_z_axis,
    X_data_all,
    X_fft_data_x_axis,
    X_fft_data_y_axis,
    X_fft_data_z_axis,
    X_fft_data_all,
    df_rem,
]
df_afterDA = pd.concat(feature_blocks, axis=1)

In [12]:
# Preview the first rows of the combined feature table.
# NOTE(review): the saved output of this cell ends with "... wz, Speed_(km/hr), Labels"
# and shows no Azimuth/Pitch/Roll columns, although the windowing cell adds them
# between wz and Speed_(km/hr) -- the output looks stale; re-run the notebook
# from a fresh kernel to confirm.
df_afterDA.head()

Unnamed: 0,x_mean,x_std,x_aad,x_min,x_max,x_maxmin_diff,x_median,x_mad,x_IQR,x_neg_count,...,avg_result_accl_fft,sma_fft,gFx,gFy,gFz,wx,wy,wz,Speed_(km/hr),Labels
0,0.31222,0.169898,0.101924,-0.1974,0.3746,0.572,0.3672,0.0,0.0,1,...,0.587719,1.379724,0.23539,0.65775,0.71273,0.02024,0.09686,0.0615,0.0,Stop
1,0.37238,0.003391,0.003108,0.3672,0.3746,0.0074,0.3746,0.0,0.00555,0,...,0.06781,0.167264,0.2106,0.67945,0.70368,0.02133,0.14655,0.02035,0.0,Stop
2,0.4589,0.0843,0.0843,0.3746,0.5432,0.1686,0.4589,0.0843,0.1686,0,...,0.198421,0.465989,0.24008,0.67795,0.70208,0.00899,0.12369,0.00775,0.0,Stop
3,0.55686,0.02732,0.021856,0.5432,0.6115,0.0683,0.5432,0.0,0.0,0,...,0.312503,0.85934,0.17463,0.67616,0.72256,-0.00036,0.13622,-0.08986,0.0,Stop
4,0.59101,0.031299,0.028686,0.5432,0.6115,0.0683,0.6115,0.0,0.051225,0,...,0.337339,0.927637,0.08578,0.68216,0.75019,-0.00281,0.18354,-0.17353,0.0,Stop


In [13]:
# Number of windows per value of the Labels column in the feature table.
df_afterDA.Labels.value_counts()

Asphalt_Ride        51190
Stop                14193
Cobblestone_Ride     9508
Asphalt_Manhole      2824
Asphalt_Kurb          621
Asphalt_Bump          376
Name: Labels, dtype: int64

In [15]:
# Write the feature table to CSV without the index column.
# NOTE(review): absolute, machine-specific Windows path; adjust before running elsewhere.
df_afterDA.to_csv(r"C:\Users\arockias\Desktop\SmartRadL\SmartRadL\Programs\Data Analysis\Overall_DA_Dataset_with_APR.csv", index=False)