In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from scipy.fftpack import fft, ifft
from data_repository import DataRepository

# Chuẩn bị dữ liệu

In [2]:
data_repo = DataRepository("../.env")
#Clean data
X_Train, y_train, X_validation, y_validation, X_test, y_test = data_repo.load_current_data(clean_data=False)

In [17]:
train_files, validation_files, test_files = data_repo.load_unclean_file_names()
file_names = np.concatenate([train_files, validation_files, test_files])
file_names.shape

(12000,)

In [14]:
data = np.concatenate([X_Train, X_validation, X_test])

In [15]:
data.shape

(12000, 9000)

## Performing DFT

In [10]:
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
def equalize_column_lengths(data_dict):
    # Find the maximum length among all columns
    max_length = max(len(data) for data in data_dict.values())
    
    # Check and fill columns to ensure equality of lengths
    for key, value in data_dict.items():
        current_length = len(value)
        if current_length < max_length:
            # Determine the fill value based on data type
            fill_value = 0 if isinstance(value[0], (int, float)) else np.nan
            
            # Pad the array to match the maximum length
            data_dict[key] = np.pad(value, (0, max_length - current_length), 'constant', constant_values=fill_value)

    return data_dict

def my_cal_fft(signal, sample_rate):
    pnts   = len(signal) # number of time points
    # prepare the Fourier transform
    fourTime = np.array(range(pnts))/pnts #Normalize time vector!
    fCoefs   = np.zeros((pnts,),dtype=complex) #Init output vector of the fourier coefficient
    for k in range(pnts):
        # create complex sine wave
        csw = np.exp( -1j*2*np.pi*k*fourTime )
        # compute dot product between sine wave and signal 
        # these are called the Fourier coefficients
        # using vectorization for fast calculation
        # Normalize the Fourier coefficient by divine it to 1/N -> reduce the computational cost.
        fCoefs[k] = np.sum( np.multiply(signal,csw) ) / pnts
    # extract amplitudes only in the first half
    hz = np.linspace(0,sample_rate/2,int(math.floor(pnts/2.)+1))
    ampls = 2*np.abs(fCoefs)[0:len(hz)]
    # compute frequencies vector
    return ampls, hz

def cal_and_store_fft(signal, file_name):
    file_path = f"../../data/original_fft_data/{file_name}"
    if os.path.exists(file_path):
        # File already exists, skip the iteration
        return False
    ampls, hz = my_cal_fft(signal=signal, sample_rate=1000)
    # Determine the maximum length among signal, ampls, and hz
    data_dict = { 
        "Amplitude": ampls,
        "Frequency": hz
    }
    data_dict = equalize_column_lengths(data_dict=data_dict)
    df = pd.DataFrame(data=data_dict)
    df.to_csv(file_path, index=False)
    return True

# Single proccess
# for i in tqdm(range(0, len(data), 1), desc="Processing signals"):
#     cal_and_store_fft(signal=data[i], file_name=file_names[i])

# Function to be parallelized
def process_data(args):
    signal, file_name = args
    cal_and_store_fft(signal, file_name)
    
def plot_frequency_domain(ampls, hz):
    srate  = 1000 # hz
    dc_ampl = ampls[0]
    print(f"Dc element: {dc_ampl}")
    #Don't plot DC element since it's too big
    ampls = ampls[1:len(ampls)]
    hz = hz[1:len(hz)]
    plt.subplot(212)  # 2 rows, 1 column, 2nd subplot
    plt.stem(hz,ampls[range(len(hz))])
    plt.xlabel('Frequency (Hz)'), plt.ylabel('Amplitude (a.u.)')
    plt.xlim(0, srate//2)
    plt.show()

In [6]:
signal = X_Train[1]
ampls, hz = my_cal_fft(signal=signal, sample_rate=1000)

In [8]:
hz.shape

(4501,)

In [31]:
data_dict = { 
    "Amplitude": ampls,
    "Frequency": hz
}
data_dict = equalize_column_lengths(data_dict=data_dict)
df = pd.DataFrame(data=data_dict)

In [33]:
# Use ProcessPoolExecutor for parallel processing
with ProcessPoolExecutor() as executor:
    args_list = zip(data, file_names)
    results = list(tqdm(executor.map(process_data, args_list), total=len(data), desc="Processing signals"))

Processing signals: 100%|██████████| 12000/12000 [56:06<00:00,  3.56it/s] 


In [35]:
directory_path = "../../data/original_fft_data/"  # Replace this with the path to your directory

# Get the list of files in the directory
files = os.listdir(directory_path)

# Count the number of files
num_files = len(files)

print(f"There are {num_files} files in the directory {directory_path}.")

There are 12000 files in the directory ../../data/original_fft_data/.
