In [1]:
import pandas as pd
import numpy as np
import pacmap
from sklearn.decomposition import PCA
from pyod.models.iforest import IForest
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import plotly.express as px

In [2]:
# Read the FFT feature table from CSV; the first CSV column becomes the row index.
df = pd.read_csv(filepath_or_buffer="fft_dataset.csv", index_col=0)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3390,3391,3392,3393,3394,3395,3396,3397,3398,3399
6_01_0,0.131477,0.040664,0.021781,0.016047,0.241540,0.049441,0.065172,0.349460,0.476330,0.386294,...,0.000092,0.000070,0.000042,0.000073,0.000092,0.000071,0.000042,0.000072,0.000092,0.000072
6_01_1,0.115790,0.033437,0.045090,0.037052,0.060328,0.065047,0.090258,0.186483,0.435931,0.238401,...,0.000018,0.000021,0.000046,0.000041,0.000009,0.000029,0.000047,0.000035,0.000002,0.000035
6_01_10,0.005463,0.078892,0.009493,0.136006,0.058788,0.058395,0.300883,0.224023,1.000000,0.677277,...,0.000072,0.000157,0.000122,0.000031,0.000136,0.000151,0.000053,0.000099,0.000160,0.000099
6_01_11,0.110545,0.009117,0.026015,0.037528,0.062674,0.072264,0.082788,0.136266,0.472449,0.432665,...,0.000222,0.000229,0.000231,0.000225,0.000222,0.000228,0.000232,0.000226,0.000221,0.000226
6_01_12,0.125418,0.031603,0.048844,0.026526,0.128466,0.074944,0.107386,0.383622,0.633578,0.299405,...,0.000052,0.000108,0.000110,0.000057,0.000027,0.000096,0.000116,0.000078,0.000008,0.000078
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6_60_5,0.071865,0.004768,0.003300,0.011508,0.008962,0.010774,0.001901,0.100205,0.240953,0.056392,...,0.000005,0.000005,0.000005,0.000005,0.000005,0.000005,0.000005,0.000005,0.000005,0.000005
6_60_6,0.298776,0.158232,0.082080,0.036483,0.101728,0.085802,0.072058,0.126860,0.250605,0.041331,...,0.000017,0.000016,0.000008,0.000012,0.000018,0.000015,0.000007,0.000014,0.000018,0.000014
6_60_7,0.178895,0.040466,0.052830,0.055122,0.031937,0.033674,0.073049,0.090692,0.503542,0.300539,...,0.000165,0.000141,0.000103,0.000065,0.000060,0.000096,0.000135,0.000162,0.000171,0.000162
6_60_8,0.074978,0.008684,0.010270,0.021518,0.033130,0.030203,0.027108,0.061079,0.156846,0.074700,...,0.000050,0.000034,0.000026,0.000044,0.000051,0.000038,0.000025,0.000041,0.000051,0.000041


In [3]:
# Split features / sample ids into train and test partitions.
# shuffle=False keeps the file order, so the test partition is the trailing 10% of rows.
X, y = df.to_numpy(), df.index.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=False,
    random_state=42,
)

In [4]:
# NOTE: standardization is currently DISABLED. The "*_scaled" names below are
# plain aliases of the raw arrays; the scaler is instantiated but never fitted.
# The original scaler calls are kept in the trailing comments so that scaling
# can be switched back on in one place.
scaler = StandardScaler()
X_train_scaled = X_train#scaler.fit_transform(X_train)
X_test_scaled = X_test#scaler.transform(X_test)
X_scaled = X#scaler.transform(X)

In [5]:
# Isolation Forest anomaly detector, fitted on the training partition only.
model = IForest(contamination=0.1, random_state=42).fit(X_train_scaled)

# Training partition: predicted labels and the scores stored on the fitted model.
y_train_pred = model.predict(X_train_scaled)
y_train_scores = model.decision_scores_

# Test partition: scores from the fitted model, then predicted labels.
y_test_scores = model.decision_function(X_test_scaled)
y_test_pred = model.predict(X_test_scaled)

In [6]:
# One score vector for the whole dataset: training scores first, test scores after.
all_scores = np.concatenate([y_train_scores, y_test_scores], axis=0)
all_scores

array([-0.09231   , -0.0783146 , -0.08698021, ..., -0.06178318,
       -0.09948243, -0.08303663], shape=(3000,))

In [7]:
# Results table keyed by the same sample ids (row index) as the input data.
df_results = pd.DataFrame(index=df.index)

# Tag every row as 'train' or 'test' by POSITION. The split above uses
# shuffle=False, so the first len(y_train) rows are the training partition and
# the remaining rows are the test partition. This replaces a per-row
# `i in y_train` scan (quadratic in the number of samples) and stays correct
# even if a sample id appears more than once.
n_train = len(y_train)
df_results['set'] = np.where(np.arange(len(df_results)) < n_train, 'train', 'test')

# all_scores is ordered train-then-test, i.e. in the same row order as df.
df_results['anomaly_score'] = all_scores
df_results

Unnamed: 0,set,anomaly_score
6_01_0,train,-0.092310
6_01_1,train,-0.078315
6_01_10,train,-0.086980
6_01_11,train,-0.090074
6_01_12,train,-0.065836
...,...,...
6_60_5,test,-0.091232
6_60_6,test,0.114529
6_60_7,test,-0.061783
6_60_8,test,-0.099482


In [8]:
# Fit a PCA on the training partition, keeping as many components as are
# needed to explain 99% of the variance, and report how many that is.
pca = PCA(n_components=0.99).fit(X_train_scaled)
n_components = pca.n_components_
print(f"Number of components to keep for 99% variance explained: {n_components}")

Number of components to keep for 99% variance explained: 131


In [9]:
# Fraction of the total variance captured by each retained component.
explained_variance = pca.explained_variance_ratio_
print("Explained variance by each component: {}".format(explained_variance))

Explained variance by each component: [1.54861977e-01 1.30404416e-01 1.05764252e-01 7.93315124e-02
 6.27198722e-02 4.62583337e-02 3.75289711e-02 3.18259696e-02
 2.80071700e-02 2.60812084e-02 2.48309016e-02 2.06785031e-02
 1.68202537e-02 1.54163535e-02 1.51988842e-02 1.22532049e-02
 1.14167353e-02 1.01334310e-02 9.10799329e-03 8.42460880e-03
 7.45918550e-03 6.19598142e-03 5.52894316e-03 5.03855827e-03
 4.89286363e-03 4.60619161e-03 4.51174436e-03 4.12428414e-03
 3.88801290e-03 3.57033871e-03 3.27835642e-03 3.16781132e-03
 3.01899871e-03 2.88117303e-03 2.84413542e-03 2.83039707e-03
 2.66958682e-03 2.60164698e-03 2.42656584e-03 2.35843340e-03
 2.21242762e-03 2.10772102e-03 2.01067979e-03 1.90289397e-03
 1.84855227e-03 1.73104929e-03 1.67796219e-03 1.63057420e-03
 1.52641062e-03 1.47785880e-03 1.47206559e-03 1.40787696e-03
 1.38392792e-03 1.34706897e-03 1.25939609e-03 1.25384908e-03
 1.20070265e-03 1.19328729e-03 1.17105051e-03 1.13982335e-03
 1.10820359e-03 1.07672435e-03 1.02577258e-03 9

In [10]:
# Project the training partition onto the PCA components.
X_train_pca = pca.transform(X_train_scaled)

# Table of component loadings: one row per component, one column per input feature.
df_pca_comp = pd.DataFrame(
    pca.components_,
    index=[f"Component {k}" for k in range(1, pca.n_components_ + 1)],
    columns=df.columns.values,
)
df_pca_comp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3390,3391,3392,3393,3394,3395,3396,3397,3398,3399
Component 1,0.008720,0.002546,0.003441,0.003889,0.005173,0.010463,0.014383,0.016872,0.030923,0.028608,...,0.000003,0.000002,0.000003,2.690584e-06,0.000003,3.152733e-06,2.806550e-06,0.000003,0.000003,0.000003
Component 2,-0.001259,-0.001382,-0.001920,-0.002805,-0.004076,-0.011251,-0.018535,-0.010136,0.000335,0.004076,...,-0.000004,-0.000004,-0.000004,-3.413602e-06,-0.000004,-4.411192e-06,-3.981014e-06,-0.000004,-0.000003,-0.000004
Component 3,0.005015,0.001599,0.001263,0.001050,0.001011,0.000046,0.000032,0.000054,0.006537,0.023501,...,0.000004,0.000003,0.000003,3.310508e-06,0.000003,3.398779e-06,3.481072e-06,0.000003,0.000003,0.000003
Component 4,-0.002281,-0.001767,-0.002334,-0.002647,-0.003462,-0.006167,-0.014148,-0.014031,-0.013808,-0.000072,...,0.000001,0.000001,0.000002,9.638361e-07,0.000002,5.495352e-07,7.184168e-07,0.000002,0.000001,0.000002
Component 5,0.000860,-0.000338,-0.000319,-0.001162,-0.000635,-0.002613,-0.009858,-0.007624,-0.001146,0.012434,...,0.000004,0.000003,0.000004,4.457010e-06,0.000004,3.478994e-06,3.167695e-06,0.000004,0.000004,0.000004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Component 127,-0.037314,0.009620,0.005917,-0.004671,0.006395,-0.033674,-0.010227,-0.001273,0.006078,0.000747,...,-0.000027,-0.000019,-0.000013,-1.286708e-05,-0.000027,-2.498072e-05,2.274277e-06,-0.000012,-0.000027,-0.000012
Component 128,0.050063,0.026765,0.021698,0.018839,0.050133,0.029294,-0.038714,0.027425,0.007203,-0.013350,...,0.000024,0.000036,0.000027,2.118912e-05,0.000037,1.896756e-05,3.493506e-05,0.000028,0.000006,0.000028
Component 129,0.056833,-0.014740,0.004617,0.009263,-0.002595,-0.012272,-0.004657,0.012903,-0.022036,0.017660,...,0.000028,0.000044,0.000035,3.254280e-05,0.000017,5.647608e-05,3.837967e-05,0.000037,0.000032,0.000037
Component 130,0.045947,0.020962,0.018353,0.009633,0.012868,0.058235,-0.071555,0.052214,-0.007538,-0.014260,...,0.000027,0.000021,0.000021,8.299629e-06,0.000028,2.467764e-05,1.493023e-05,0.000021,0.000038,0.000021


In [None]:
# Line plot of the loadings of the first two PCA components across all features.
fig = px.line(
    df_pca_comp.iloc[:2].T,
    title='PCA Components',
    labels={'value': 'Component Value', 'index': 'Feature'},
    template='plotly_white',
)
fig.update_layout(
    xaxis_title='Feature',
    yaxis_title='Component Value',
)
fig.show()

In [12]:
# PCA reconstruction error on the training partition:
# map the projections back to feature space and take the per-row mean squared error.
X_train_pca_inv = pca.inverse_transform(X_train_pca)
reconstruction_error_pca_train = np.mean((X_train_scaled - X_train_pca_inv) ** 2, axis=1)
print(f"Reconstruction error for PCA: {np.mean(reconstruction_error_pca_train)}")

Reconstruction error for PCA: 5.407241429923077e-05


In [14]:
# PCA reconstruction error on the held-out test partition.
# X_test_scaled is used exactly as defined in the scaling cell. It is
# deliberately NOT re-assigned here: re-aliasing it to the raw X_test would
# silently undo scaling for the test rows (but not the training rows) if
# scaling were ever re-enabled in the scaling cell.
X_test_pca = pca.transform(X_test_scaled)
X_test_pca_inv = pca.inverse_transform(X_test_pca)
reconstruction_error_pca_test = ((X_test_scaled - X_test_pca_inv) ** 2).mean(axis=1)  # per-row MSE
print(f"Reconstruction error for PCA on test set: {reconstruction_error_pca_test.mean()}")

Reconstruction error for PCA on test set: 5.5658096817994724e-05


In [15]:
# Stack the reconstructed rows of both partitions, training rows first.
all_recostruction = np.concatenate([X_train_pca_inv, X_test_pca_inv], axis=0)
all_recostruction.shape

(3000, 3400)

In [16]:
# Element-wise absolute reconstruction error, labelled like the input table.
all_absolute_reconstruction_error = np.absolute(X_scaled - all_recostruction)
df_errors = pd.DataFrame(
    data=all_absolute_reconstruction_error,
    columns=df.columns,
    index=df.index,
)
df_errors


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3390,3391,3392,3393,3394,3395,3396,3397,3398,3399
6_01_0,0.014790,0.001416,0.022860,0.033189,0.164342,0.022550,0.038045,0.004257,0.002465,0.001508,...,0.000046,6.672786e-05,0.000103,0.000063,0.000060,0.000074,0.000094,0.000071,0.000051,0.000071
6_01_1,0.004220,0.016162,0.004211,0.015267,0.015560,0.015245,0.004124,0.003060,0.003122,0.000962,...,0.000115,1.164994e-04,0.000088,0.000097,0.000133,0.000105,0.000084,0.000105,0.000133,0.000105
6_01_10,0.106817,0.015537,0.054044,0.056002,0.038953,0.098687,0.081234,0.032280,0.006890,0.004808,...,0.000037,3.519401e-05,0.000001,0.000086,0.000018,0.000031,0.000070,0.000028,0.000046,0.000028
6_01_11,0.014441,0.032865,0.017155,0.009737,0.005275,0.006152,0.005015,0.001475,0.000281,0.003417,...,0.000153,1.593950e-04,0.000172,0.000153,0.000154,0.000166,0.000161,0.000152,0.000165,0.000152
6_01_12,0.026249,0.023277,0.001921,0.030432,0.030695,0.021742,0.012976,0.002079,0.001276,0.000685,...,0.000042,8.370780e-07,0.000008,0.000047,0.000071,0.000006,0.000011,0.000026,0.000076,0.000026
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6_60_5,0.029150,0.014397,0.001194,0.002358,0.008880,0.019109,0.009993,0.000400,0.001845,0.005014,...,0.000043,4.500048e-05,0.000041,0.000051,0.000028,0.000025,0.000038,0.000060,0.000038,0.000060
6_60_6,0.145400,0.120923,0.030300,0.005665,0.028767,0.002229,0.033187,0.015143,0.011163,0.006083,...,0.000087,1.202126e-04,0.000128,0.000112,0.000105,0.000115,0.000108,0.000126,0.000097,0.000126
6_60_7,0.050206,0.015064,0.010934,0.014968,0.006929,0.021962,0.021272,0.014128,0.007551,0.004632,...,0.000036,5.070246e-05,0.000098,0.000142,0.000132,0.000116,0.000065,0.000037,0.000020,0.000037
6_60_8,0.014142,0.008707,0.003656,0.007464,0.017497,0.003294,0.004378,0.007624,0.001937,0.000765,...,0.000008,3.167977e-05,0.000032,0.000018,0.000001,0.000008,0.000029,0.000033,0.000006,0.000033


In [17]:
# Per-row mean squared reconstruction error over the whole dataset.
all_reconstruction_error = np.mean((X_scaled - all_recostruction) ** 2, axis=1)
all_reconstruction_error.shape

(3000,)

In [18]:
# Pair every sample id (train and test) with its reconstruction error,
# then show the ten samples with the largest error.
reconstruction_error_pca_df = pd.DataFrame(
    {'Sample': y, 'Reconstruction Error': all_reconstruction_error}
)
reconstruction_error_pca_df.sort_values(by='Reconstruction Error', ascending=False).iloc[:10]

Unnamed: 0,Sample,Reconstruction Error
2997,6_60_7,0.00056
2466,6_50_23,0.000557
2996,6_60_6,0.000502
2973,6_60_3,0.000469
2342,6_47_47,0.000424
2061,6_42_19,0.000416
775,6_16_31,0.0004
1139,6_23_44,0.000383
2987,6_60_42,0.000339
2043,6_41_48,0.000328


In [19]:
def get_reconstruction_analysis_df(sample_name, window=34):
    """Compare one sample's original spectrum with its PCA reconstruction.

    Reads the notebook-level objects ``y`` (sample ids), ``X_scaled``
    (original rows), ``all_recostruction`` (reconstructed rows, same row
    order) and ``df`` (only its column labels are used).

    Parameters
    ----------
    sample_name
        Sample id to look up in ``y``. If the id occurs more than once, the
        first occurrence is used.
    window : int, default 34
        Number of points of the centered moving average applied to the
        absolute error. The default reproduces the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'Frequency (Hz)'`` (the column labels of ``df``), with the
        columns ``'Original Spectrum'``, ``'Reconstructed Spectrum'``,
        ``'Absolute Error'`` and ``'Centered Moving Average absolute error'``.
        The moving-average column is NaN wherever the centered window does not
        fit entirely inside the data.

    Raises
    ------
    ValueError
        If ``sample_name`` is not present in ``y``.
    """
    try:
        sample_index = y.tolist().index(sample_name)
    except ValueError:
        # Same exception type as before, but with a message naming the sample.
        raise ValueError(f"Sample {sample_name!r} not found in y") from None

    original_spectrum = X_scaled[sample_index]
    reconstructed_spectrum = all_recostruction[sample_index]
    absolute_error = np.abs(original_spectrum - reconstructed_spectrum)

    # Build the frame directly on the named index instead of set_index(inplace=True).
    reconstruction_df = pd.DataFrame(
        {
            'Original Spectrum': original_spectrum,
            'Reconstructed Spectrum': reconstructed_spectrum,
            'Absolute Error': absolute_error,
        },
        index=pd.Index(df.columns.values, name='Frequency (Hz)'),
    )
    # Smooth the absolute error with a centered moving average.
    reconstruction_df['Centered Moving Average absolute error'] = (
        reconstruction_df['Absolute Error'].rolling(window=window, center=True).mean()
    )
    return reconstruction_df

In [20]:
def _show_reconstruction_plot(reconstruction_df, title):
    """Draw and display one line chart of a reconstruction-analysis frame.

    Every column of ``reconstruction_df`` is drawn as its own line against the
    frame's index. Returns the Plotly figure after showing it.
    """
    fig = px.line(
        reconstruction_df,
        labels={'x': 'Frequency (Hz)', 'value': 'Amplitude'},
        title=title,
        width=1200,
        height=500,
        template="plotly_white",
    )
    fig.update_layout(
        xaxis_title="Frequency (Hz)",
        yaxis_title="Amplitude",
    )
    fig.show()
    return fig


# Sample with the HIGHEST reconstruction error: original vs reconstructed spectrum.
sample_with_highest_error = reconstruction_error_pca_df.sort_values(by='Reconstruction Error', ascending=False).iloc[0]
reconstruction_df_highest = get_reconstruction_analysis_df(sample_with_highest_error['Sample'])
title = "Original Spectrum vs Reconstructed Spectrum for Sample: " + sample_with_highest_error['Sample']
fig = _show_reconstruction_plot(reconstruction_df_highest, title)

# Sample with the LOWEST reconstruction error, plotted the same way for comparison.
sample_with_lowest_error = reconstruction_error_pca_df.sort_values(by='Reconstruction Error', ascending=True).iloc[0]
reconstruction_df_lowest = get_reconstruction_analysis_df(sample_with_lowest_error['Sample'])
title = "Original Spectrum vs Reconstructed Spectrum for Sample: " + sample_with_lowest_error['Sample']
fig = _show_reconstruction_plot(reconstruction_df_lowest, title)

In [21]:
# Join the per-row PCA reconstruction errors of both partitions (training rows
# first, test rows after) and store them as a column of the results table.
reconstruction_error_pca_all = np.concatenate(
    [reconstruction_error_pca_train, reconstruction_error_pca_test],
    axis=0,
)
df_results['reconstruction_error_pca'] = reconstruction_error_pca_all
df_results

Unnamed: 0,set,anomaly_score,reconstruction_error_pca
6_01_0,train,-0.092310,0.000040
6_01_1,train,-0.078315,0.000057
6_01_10,train,-0.086980,0.000091
6_01_11,train,-0.090074,0.000028
6_01_12,train,-0.065836,0.000042
...,...,...,...
6_60_5,test,-0.091232,0.000041
6_60_6,test,0.114529,0.000502
6_60_7,test,-0.061783,0.000560
6_60_8,test,-0.099482,0.000042


In [40]:
# Spearman rank correlation between the Isolation Forest score and the PCA reconstruction error.
df_results.loc[:, ['anomaly_score', 'reconstruction_error_pca']].corr(method='spearman')

Unnamed: 0,anomaly_score,reconstruction_error_pca
anomaly_score,1.0,0.608262
reconstruction_error_pca,0.608262,1.0


In [23]:
#X_scaled = np.concatenate((X_train_scaled, X_test_scaled), axis=0)
#X_scaled.shape

In [24]:
# Embed all samples (train + test) into 2 dimensions with PaCMAP.
# random_state is fixed so that Restart & Run All reproduces the same embedding;
# without it the layout changes from run to run.
embedding = pacmap.PaCMAP(n_components=2, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0, random_state=42)
X_pacmap = embedding.fit_transform(X_scaled)
# NOTE: the "PC1"/"PC2" column names denote the two PaCMAP axes, not PCA components.
df_pacmap = pd.DataFrame(X_pacmap, index=df.index, columns=[f"PC{i+1}" for i in range(X_pacmap.shape[1])])
df_results["PC1"] = df_pacmap["PC1"]
df_results["PC2"] = df_pacmap["PC2"]
df_results

Unnamed: 0,set,anomaly_score,reconstruction_error_pca,PC1,PC2
6_01_0,train,-0.092310,0.000040,8.394159,-2.773104
6_01_1,train,-0.078315,0.000057,8.759748,-2.855465
6_01_10,train,-0.086980,0.000091,8.022003,1.313037
6_01_11,train,-0.090074,0.000028,8.990813,4.055992
6_01_12,train,-0.065836,0.000042,7.435779,-5.331356
...,...,...,...,...,...
6_60_5,test,-0.091232,0.000041,-4.692497,-10.316690
6_60_6,test,0.114529,0.000502,-1.290030,-7.582787
6_60_7,test,-0.061783,0.000560,0.871884,-9.023148
6_60_8,test,-0.099482,0.000042,-5.155885,-9.903257


In [25]:
# 2-D scatter plot of the PaCMAP embedding of the spectra (train and test rows):
#   colour = Isolation Forest anomaly score
#   size   = PCA reconstruction error
#   symbol = train / test membership
fig = px.scatter(
    df_results,
    x='PC1',
    y='PC2',
    color='anomaly_score',
    size='reconstruction_error_pca',
    symbol='set',
    # Plotly Express draws continuous colour through a shared colour axis, so
    # the colour scale is chosen here rather than on the individual traces.
    color_continuous_scale='Viridis',
    hover_name=df_results.index,
    hover_data=['anomaly_score', 'reconstruction_error_pca', 'set'],
    title="PaCMAP embedding of the spectra (train + test)",
    height=1000,
    width=1200,
    template="plotly_white",
)
# Move the symbol legend to the top right so it does not overlap the colour bar.
fig.update_layout(legend=dict(title='Set', orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1))
# Title of the colour bar attached to the shared colour axis.
fig.update_layout(coloraxis_colorbar=dict(title='Anomaly Score'))
fig.show()

In [26]:
# Embed the per-feature absolute reconstruction errors (train + test) into
# 2 dimensions with PaCMAP.
# random_state is fixed so that Restart & Run All reproduces the same embedding;
# without it the layout changes from run to run.
embedding = pacmap.PaCMAP(n_components=2, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0, random_state=42)
X_pacmap_errors = embedding.fit_transform(df_errors.to_numpy())
# NOTE: the "PC1"/"PC2" column names denote the two PaCMAP axes, not PCA components.
df_pacmap_errors = pd.DataFrame(X_pacmap_errors, index=df.index, columns=[f"PC{i+1}" for i in range(X_pacmap_errors.shape[1])])
df_results["PC1_errors"] = df_pacmap_errors["PC1"]
df_results["PC2_errors"] = df_pacmap_errors["PC2"]
df_results

Unnamed: 0,set,anomaly_score,reconstruction_error_pca,PC1,PC2,PC1_errors,PC2_errors
6_01_0,train,-0.092310,0.000040,8.394159,-2.773104,-0.587626,4.066014
6_01_1,train,-0.078315,0.000057,8.759748,-2.855465,3.635004,2.881106
6_01_10,train,-0.086980,0.000091,8.022003,1.313037,4.669347,3.838667
6_01_11,train,-0.090074,0.000028,8.990813,4.055992,-1.718205,2.739026
6_01_12,train,-0.065836,0.000042,7.435779,-5.331356,1.136597,-2.307651
...,...,...,...,...,...,...,...
6_60_5,test,-0.091232,0.000041,-4.692497,-10.316690,-3.458192,4.346852
6_60_6,test,0.114529,0.000502,-1.290030,-7.582787,8.641584,1.952866
6_60_7,test,-0.061783,0.000560,0.871884,-9.023148,6.820626,3.382800
6_60_8,test,-0.099482,0.000042,-5.155885,-9.903257,-0.526651,-1.344941


In [27]:
# 2-D scatter plot of the PaCMAP embedding of the PCA reconstruction errors
# (train and test rows):
#   colour = Isolation Forest anomaly score
#   size   = PCA reconstruction error
#   symbol = train / test membership
fig = px.scatter(
    df_results,
    x='PC1_errors',
    y='PC2_errors',
    color='anomaly_score',
    size='reconstruction_error_pca',
    symbol='set',
    # Plotly Express draws continuous colour through a shared colour axis, so
    # the colour scale is chosen here rather than on the individual traces.
    color_continuous_scale='Viridis',
    hover_name=df_results.index,
    hover_data=['anomaly_score', 'reconstruction_error_pca', 'set'],
    title="PaCMAP embedding of the PCA reconstruction errors (train + test)",
    height=1000,
    width=1200,
    template="plotly_white",
)
# Move the symbol legend to the top right so it does not overlap the colour bar.
fig.update_layout(legend=dict(title='Set', orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1))
# Title of the colour bar attached to the shared colour axis.
fig.update_layout(coloraxis_colorbar=dict(title='Anomaly Score'))
fig.show()