In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset from CSV; the file's first column is used as the row index.
DATASET_PATH = "fft_dataset.csv"
df = pd.read_csv(DATASET_PATH, index_col=0)
# Bare last expression -> rich (truncated) display of the frame.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5990,5991,5992,5993,5994,5995,5996,5997,5998,5999
6_01_0,0.112358,0.034818,0.018651,0.013124,0.206282,0.041696,0.054382,0.297927,0.405547,0.330600,...,0.000041,0.000033,0.000023,0.000033,0.000040,0.000033,0.000023,0.000033,0.000040,0.000033
6_01_1,0.113601,0.032898,0.044340,0.036356,0.059043,0.064000,0.088885,0.181754,0.427581,0.235320,...,0.000055,0.000055,0.000052,0.000052,0.000056,0.000055,0.000051,0.000053,0.000056,0.000054
6_01_10,0.005531,0.078903,0.009256,0.136242,0.058258,0.058613,0.301000,0.221344,1.000000,0.683145,...,0.000230,0.000238,0.000238,0.000230,0.000235,0.000240,0.000233,0.000231,0.000239,0.000236
6_01_11,0.110023,0.009812,0.025336,0.036760,0.062735,0.070672,0.080056,0.135541,0.464421,0.429974,...,0.000018,0.000045,0.000049,0.000028,0.000013,0.000041,0.000050,0.000033,0.000010,0.000037
6_01_12,0.105340,0.026534,0.041305,0.022346,0.107876,0.062634,0.090992,0.321599,0.534074,0.254085,...,0.000074,0.000087,0.000085,0.000072,0.000071,0.000084,0.000088,0.000076,0.000068,0.000080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6_60_5,0.069576,0.004602,0.003180,0.011101,0.008695,0.010618,0.002082,0.096498,0.233499,0.054914,...,0.000024,0.000024,0.000025,0.000025,0.000024,0.000023,0.000022,0.000021,0.000019,0.000019
6_60_6,0.279638,0.148183,0.076664,0.034406,0.095101,0.080251,0.067548,0.118046,0.234762,0.038431,...,0.000239,0.000251,0.000141,0.000150,0.000254,0.000232,0.000118,0.000180,0.000262,0.000208
6_60_7,0.160999,0.037529,0.048340,0.050212,0.028655,0.030842,0.065344,0.083618,0.454500,0.274905,...,0.000484,0.000469,0.000473,0.000496,0.000524,0.000543,0.000545,0.000530,0.000504,0.000478
6_60_8,0.072625,0.008465,0.010114,0.020994,0.032169,0.029160,0.025781,0.059348,0.151535,0.072892,...,0.000020,0.000017,0.000016,0.000019,0.000020,0.000018,0.000016,0.000018,0.000020,0.000018


In [3]:
# Hold out 20% of the rows for testing.
# Features are the frame's values; the "labels" are simply the row index
# entries (e.g. `6_01_0`), carried along so each row can be identified later.
X, y = df.to_numpy(), df.index.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is the same on every run
)

In [4]:
# Standardize the features. The scaler is fitted on the training split only,
# then applied to that same split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
# Bare last expression -> display the scaled training array.
X_train_scaled

array([[-0.3619921 , -0.7657903 , -0.64247805, ..., -0.45802243,
        -0.46334919, -0.42117727],
       [ 0.84150189,  0.51990414,  1.01891485, ...,  0.59876514,
         0.71873613,  0.60037672],
       [-0.13986019,  0.86499921,  0.7211492 , ...,  1.02554825,
         0.62362689,  0.77460008],
       ...,
       [ 0.12882798,  1.13256386,  0.16163532, ..., -0.34918633,
        -0.36614319, -0.38774265],
       [ 1.32281306, -0.29404717,  0.39258625, ...,  0.87968584,
         1.1343874 ,  1.01226944],
       [-0.00227569, -0.04213929,  0.10735878, ..., -0.35944589,
        -0.35857669, -0.35631623]], shape=(2400, 6000))

In [5]:
# Scale the test split with the scaler that was fitted on the training split
# (transform only, no refit), so test data does not influence the statistics.
X_test_scaled = scaler.transform(X_test)

In [6]:
# Determine how many principal components are needed to explain 99% of the
# variance. A float `n_components` in (0, 1) asks scikit-learn to pick the
# number of components that reaches that explained-variance fraction.
pca = PCA(n_components=0.99).fit(X_train_scaled)

# The number actually selected is exposed on the fitted estimator.
n_components = pca.n_components_
print(f"Number of components to keep for 99% variance explained: {n_components}")

Number of components to keep for 99% variance explained: 1438


In [7]:
# Project the standardized training data onto the components of the fitted PCA.
# NOTE(review): the PaCMAP cell below embeds `X_train_scaled`, not this
# projection — confirm whether `X_train_pca` is meant to be used downstream.
X_train_pca = pca.transform(X_train_scaled)
# Bare last expression -> display the projected array.
X_train_pca

array([[ 4.08460372e+01,  2.52764991e+01, -1.62781202e+01, ...,
        -1.36679058e-01, -3.30165195e-01, -2.93992734e-02],
       [-1.55202410e+01,  3.04548859e-01, -8.03185213e+00, ...,
        -5.62807961e-01,  4.83565417e-01,  2.97535456e-01],
       [ 6.23995155e+01, -6.11291265e+01,  1.05685065e+01, ...,
        -9.46885254e-02, -7.44909720e-02,  6.42890769e-02],
       ...,
       [ 4.52304813e+01, -9.22903442e+00,  3.39858630e+01, ...,
         1.47005017e-01, -2.30861014e-01,  3.44826789e-01],
       [ 1.53435482e+01, -2.06360422e+01, -3.58334003e+00, ...,
         3.68090726e-01,  5.30046839e-01, -3.00786180e-01],
       [-9.90640866e+00,  2.42912622e+01,  9.05916524e+00, ...,
         2.78004158e-02, -4.07485823e-01,  4.94695809e-01]],
      shape=(2400, 1438))

In [8]:
# Reduce the standardized training data to 2 dimensions using PaCMAP.
import pacmap

# PaCMAP is stochastic; pin `random_state` so that re-running the notebook
# reproduces the same embedding (same seed as the train/test split).
embedding = pacmap.PaCMAP(
    n_components=2,
    n_neighbors=10,
    MN_ratio=0.5,
    FP_ratio=2.0,
    random_state=42,
)
X_train_pacmap = embedding.fit_transform(X_train_scaled)

# Wrap the embedding in a DataFrame indexed by the training-row labels.
# The "PC<i>" column names are kept unchanged because the plotting cell selects
# 'PC1' / 'PC2' by name; note these are PaCMAP dimensions, not principal
# components.
df_X_train_pacmap = pd.DataFrame(
    X_train_pacmap,
    index=y_train,
    columns=[f"PC{i+1}" for i in range(X_train_pacmap.shape[1])],
)
df_X_train_pacmap

Unnamed: 0,PC1,PC2
6_13_47,7.524396,9.179776
6_15_0,0.525205,3.452718
6_05_32,11.437427,-2.073351
6_34_7,-0.368975,10.872567
6_21_18,-2.147217,-1.157230
...,...,...
6_33_43,-1.556255,9.761810
6_22_5,1.426750,-9.361582
6_23_36,7.211289,3.196271
6_26_49,-8.224277,-6.952924


In [15]:
# Visualize the 2-D PaCMAP embedding as an interactive scatter plot with plotly.
import plotly.express as px

fig = px.scatter(
    df_X_train_pacmap,
    x="PC1",
    y="PC2",
    # Show each point's label in the hover box only. Passing it as `text=`
    # would draw every label on the canvas, and a later
    # `update_traces(hoverinfo=...)` has no effect because plotly express
    # sets a hovertemplate, which takes precedence over hoverinfo.
    hover_name=df_X_train_pacmap.index,
    # The plotted frame is the PaCMAP embedding, not a PCA projection.
    title="PaCMAP Reduced Data",
    height=1000,
    width=1000,
)
fig.show()