In [1]:
import pandas as pd
import numpy as np
import numpy as np
from scipy.signal import fftconvolve
from concurrent.futures import ThreadPoolExecutor
import numpy.fft as fft
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from scipy.spatial.distance import pdist, squareform

In a future release, Dask DataFrame will use new implementation that
contains several improvements including a logical query planning.
The user-facing DataFrame API will remain unchanged.

The new implementation is already available and can be enabled by
installing the dask-expr library:

    $ pip install dask-expr

and turning the query planning option on:

    >>> import dask
    >>> dask.config.set({'dataframe.query-planning': True})
    >>> import dask.dataframe as dd

API documentation for the new implementation is available at
https://docs.dask.org/en/stable/dask-expr-api.html

Any feedback can be reported on the Dask issue tracker
https://github.com/dask/dask/issues 

  import dask.dataframe as dd


In [2]:
# Load the parquet file with the pyarrow engine ('fastparquet' is the
# alternative engine); the bare name on the last line renders the frame
# as the cell output.
data = pd.read_parquet(path='../../data/features/dsp_real.parquet', engine='pyarrow')
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29743,29744,29745,29746,29747,29748,29749,29750,Target,Test
0,841,366,872,283,272,324,502,492,399,197,...,399,492,502,324,272,283,872,366,XBB.1.5,0
1,819,387,880,268,281,310,529,490,413,194,...,413,490,529,310,281,268,880,387,AY.116,0
2,810,405,875,239,293,319,526,503,403,198,...,403,503,526,319,293,239,875,405,B.1.525,1
3,850,405,866,261,274,316,524,490,419,210,...,419,490,524,316,274,261,866,405,B.1.1.57,0
4,812,402,884,265,277,336,507,488,381,185,...,381,488,507,336,277,265,884,402,BQ.1.1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22706,841,369,871,273,281,317,507,491,407,188,...,407,491,507,317,281,273,871,369,CP.5,1
22707,821,392,852,246,325,277,533,478,494,185,...,494,478,533,277,325,246,852,392,BE.7,1
22708,804,392,858,239,326,282,532,467,488,188,...,488,467,532,282,326,239,858,392,BE.7,1
22709,843,389,782,272,313,352,553,519,396,191,...,396,519,553,352,313,272,782,389,BA.1,1


In [3]:
def random_undersample(data_df, max_samples_per_class=40, random_state=42):
    """Cap the number of rows per ``Target`` class by random undersampling.

    Parameters
    ----------
    data_df : pandas.DataFrame or DataFrame-like
        Input rows; must contain a ``Target`` column holding the class label.
    max_samples_per_class : int, default 40
        Classes with more rows than this are randomly reduced to exactly
        this many rows; smaller classes are kept whole.
    random_state : int, default 42
        Seed used both for the per-class sampling and for the final shuffle,
        so repeated calls on the same input give the same output.

    Returns
    -------
    pandas.DataFrame
        The retained rows in shuffled order with a fresh 0..n-1 index.
        An input that yields no class groups (e.g. an empty frame) returns
        an empty frame with the same columns.
    """
    # Work on the frame that was actually passed in. The earlier version
    # rebuilt the frame from the notebook-level `data` variable, which
    # silently ignored any filtering the caller had applied.
    data_df = pd.DataFrame(data_df)

    undersampled_groups = [
        group.sample(n=max_samples_per_class, random_state=random_state)
        if len(group) > max_samples_per_class
        else group
        for _, group in data_df.groupby('Target')
    ]

    # pd.concat raises on an empty list, so handle "no groups" explicitly.
    if not undersampled_groups:
        return data_df.iloc[0:0].reset_index(drop=True)

    undersampled_data_df = pd.concat(undersampled_groups)

    # Shuffle so the classes are interleaved, then discard the old index.
    return undersampled_data_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [4]:
# Build the reference ("pair") set by passing only the rows flagged
# Test == 0 to random_undersample, using its default cap and seed.
# NOTE(review): presumably Test == 0 marks the training split - confirm.
pair_data = random_undersample(data.loc[data["Test"] == 0])

In [5]:
# Drop the label/split columns and convert what remains to NumPy arrays
# (one row per sample).
data_array = data.drop(["Test", "Target"], axis="columns").to_numpy()
pair_data_array = pair_data.drop(["Test", "Target"], axis="columns").to_numpy()

# Z-score each row against its own mean and its own population standard
# deviation (NumPy's default, ddof=0).
data_std = (data_array - data_array.mean(axis=1, keepdims=True)) / data_array.std(axis=1, keepdims=True)
pair_data_std = (pair_data_array - pair_data_array.mean(axis=1, keepdims=True)) / pair_data_array.std(axis=1, keepdims=True)

# Inner product of every standardized row of `data` with every standardized
# row of `pair_data`; the result has one row per `data` sample and one
# column per `pair_data` sample.
dot_product = data_std.dot(pair_data_std.T)

# The std above uses ddof=0, so dividing the summed products by n (rather
# than n - 1) yields the Pearson correlation coefficient.
n = data_array.shape[-1]
correlation_matrix = dot_product / n

In [6]:
# Turn correlation into a dissimilarity: r = 1 maps to 0, r = 0 to 0.5,
# and r = -1 to 1.
distance_matrix = 0.5 * (1.0 - correlation_matrix)

In [7]:
# Wrap the distances in a DataFrame so label columns can be attached and
# the result written to parquet; no index or column labels are supplied,
# so pandas assigns its defaults.
distance_matrix = pd.DataFrame(data=distance_matrix)

In [8]:
# Copy the class label and split flag over from `data` row by row.
# Plain Python lists are assigned positionally, so no index alignment
# between `data` and `distance_matrix` takes place.
distance_matrix["Target"] = data["Target"].tolist()
distance_matrix["Test"] = data["Test"].tolist()

In [9]:
# Parquet column names must be strings. `distance_matrix` carries pandas'
# default integer labels for the distance columns alongside the string
# labels "Target" and "Test"; writing that mix makes pyarrow warn about
# mixed-type column names, and some pandas versions refuse it outright.
# Stringify the labels on a renamed copy so the write is clean and the
# in-memory frame keeps its original labels.
distance_matrix.rename(columns=str).to_parquet('../../data/features/dsp_real_dist.parquet', engine='pyarrow')

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
