In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import torch
from chronos import ChronosPipeline
from tqdm import tqdm
import pickle

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained "amazon/chronos-t5-small" checkpoint in bfloat16.
# On a CPU-only machine the model is pinned to "cpu"; otherwise placement is
# delegated to device_map="auto".
pipeline = ChronosPipeline.from_pretrained(
    "amazon/chronos-t5-small",
    device_map="auto" if device.type != "cpu" else "cpu",
    torch_dtype=torch.bfloat16,
)

print(f"Using device: {device}")


2024-11-04 00:02:15.525415: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-04 00:02:15.570519: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-04 00:02:15.583838: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-04 00:02:15.612840: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using device: cuda


In [2]:
import warnings
warnings.filterwarnings(action="ignore")

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image 
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [3]:
def chunkify(df: pd.DataFrame, chunk_size: int, stride: int = 1):
    """Cut ``df`` into row windows of ``chunk_size`` rows, ``stride`` rows apart.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to slice along its rows (positional slicing, ``df[i:j]``).
    chunk_size : int
        Number of rows in every returned window.
    stride : int, default 1
        Row offset between the starts of consecutive windows. Use
        ``stride == chunk_size`` for non-overlapping windows.

    Returns
    -------
    list of pd.DataFrame, or pd.DataFrame
        A list of full-size windows in row order. Only windows that fit
        entirely inside ``df`` are produced; a shorter trailing remainder is
        dropped.

        NOTE: when ``df`` has ``chunk_size`` rows or fewer, the frame itself
        (``df[:]``) is returned instead of a list. This pre-existing behaviour
        is kept for backward compatibility; callers that iterate over the
        result must handle that case themselves.
    """
    length = df.shape[0]

    # If DF is not larger than one chunk, return the DF itself (not a list).
    if length <= chunk_size:
        return df[:]

    # The last valid window starts at ``length - chunk_size``; the ``+ 1``
    # makes that start inclusive so the final full window is not dropped.
    return [
        df[begin:begin + chunk_size]
        for begin in range(0, length - chunk_size + 1, stride)
    ]

In [4]:
# Raw REDD table as stored on disk.
df_value_value = pd.read_csv('../data/redd_active_value_f1hz.csv')

# Column roles and date ranges used by the following cells.
index_name = 'timestamp'
predictors = ['mains']
training_start, training_end = '2011-04-16', '2011-05-16'
test_start, test_end = '2011-05-17', '2011-05-31'

# Every column except the timestamp.
attributes = [col for col in df_value_value.columns.values if col != 'timestamp']
# Every column that is neither the timestamp, the mains signal, nor one of
# the two spectrum columns.
labels = [
    col
    for col in df_value_value.columns.values
    if col not in ('timestamp', 'mains', 'amplitude_spectrum', 'phase_spectrum')
]
# Alternative: restrict the labels to a hand-picked subset.
#labels = ['Fridge01','Dish washer01','Microwave01','Washer dryer01','Washer dryer02']

# Parse the timestamp column to datetimes, then make it the index so the
# frame can be sliced by date strings with .loc.
df_value_value[index_name] = pd.to_datetime(df_value_value[index_name])
df_value_value.set_index(index_name, inplace=True)

In [5]:
# Rows per window. The chunking cell passes this to chunkify() as both
# chunk_size and stride, so consecutive windows do not overlap.
windows_size_opt = 28

In [6]:
scaler = StandardScaler()


def _chunk_by_label(frame):
    """Return ``{label: chunkify(frame[predictors + [label]], ...)}`` for every label.

    Each value is whatever ``chunkify`` returns for the sub-frame holding the
    predictor column(s) plus that single label column, using
    ``windows_size_opt`` as both window length and stride.
    """
    return {
        label: chunkify(frame[[*predictors, label]], windows_size_opt, windows_size_opt)
        for label in labels
    }


# Partition the DataFrame into training and test sets.
# .copy() makes each partition an independent frame: assigning the scaled
# predictors into a bare .loc slice is chained assignment, which may either
# write through to df_value_value or be silently lost depending on the pandas
# version (the resulting SettingWithCopyWarning is hidden by the global
# warnings filter).
training_active_value_set = df_value_value.loc[training_start:training_end].copy()
# The scaler is fitted on the training partition only ...
training_active_value_set[predictors] = scaler.fit_transform(training_active_value_set[predictors])
data_train = _chunk_by_label(training_active_value_set)

test_active_value_set = df_value_value.loc[test_start:test_end].copy()
# ... and merely applied to the test partition.
test_active_value_set[predictors] = scaler.transform(test_active_value_set[predictors])
data_test = _chunk_by_label(test_active_value_set)

In [7]:
import os
import torch
import numpy as np
from tqdm import tqdm

result_path = '../data/v9'
# exist_ok=True replaces the racy "isdir() then makedirs()" pair.
os.makedirs(result_path, exist_ok=True)

# Append-only checkpoint files:
#   progress_file - one comma-separated row of values per embedded window
#   indices_file  - one "<predictor>,<window index>" line per embedded window
progress_file = os.path.join(result_path, 'progress.txt')
indices_file = os.path.join(result_path, 'indices.txt')

# NOTE: saved_result is never filled in this cell; the rows live only in
# progress_file. The name is kept so it stays defined for any later use.
saved_result = {p: [] for p in predictors}
processed_indices = {p: set() for p in predictors}

# Resume support: reload the indices that a previous run already wrote.
if os.path.exists(progress_file) and os.path.exists(indices_file):
    with open(indices_file, 'r') as f:
        for line in f:
            predictor, sep, idx = line.strip().rpartition(',')
            if not sep or not idx.isdigit():
                # Blank or truncated line (e.g. the previous run was killed
                # mid-write): skip it, so that window is embedded again
                # instead of crashing the resume.
                continue
            processed_indices.setdefault(predictor, set()).add(int(idx))

# Process the data. Both files are opened once and flushed after every window
# rather than being reopened on each iteration; a killed run still leaves
# every completed window on disk.
with open(progress_file, 'a') as progress_fh, open(indices_file, 'a') as indices_fh:
    for p in predictors:
        # Windows are taken from the first label's chunk list; only the
        # predictor column p of each window is embedded.
        for idx, ds in enumerate(tqdm(data_train[labels[0]])):
            if idx in processed_indices[p]:
                continue  # Skip already processed data

            context = torch.tensor(np.array(ds[p]).tolist())
            embeddings, tokenizer_state = pipeline.embed(context)

            # Upcast to float32 before converting to NumPy (presumably needed
            # because the pipeline is loaded with torch_dtype=bfloat16).
            embedding_float32 = embeddings.to(torch.float32)
            tokenizer_state_float32 = tokenizer_state.to(torch.float32)

            # One flat row: flattened embedding followed by the flattened
            # tokenizer state.
            flattened_embedding = torch.flatten(embedding_float32).numpy().tolist()
            tokenizer_state_list = tokenizer_state_float32.flatten().numpy().tolist()
            combined_list = flattened_embedding + tokenizer_state_list

            # Write the row first, then record the index as done.
            progress_fh.write(','.join(map(str, combined_list)) + '\n')
            progress_fh.flush()
            indices_fh.write(f'{p},{idx}\n')
            indices_fh.flush()

            processed_indices[p].add(idx)


100%|██████████| 95504/95504 [12:39<00:00, 125.74it/s]   


In [8]:
# Append-only checkpoint files for the test partition:
#   progress_file - one comma-separated row of values per embedded window
#   indices_file  - one "<predictor>,<window index>" line per embedded window
progress_file = os.path.join(result_path, 'progress2.txt')
indices_file = os.path.join(result_path, 'indices2.txt')

# NOTE: saved_result2 is never filled in this cell; the rows live only in
# progress_file. The name is kept so it stays defined for any later use.
saved_result2 = {p: [] for p in predictors}
processed_indices2 = {p: set() for p in predictors}

# Resume support: reload the indices that a previous run already wrote.
if os.path.exists(progress_file) and os.path.exists(indices_file):
    with open(indices_file, 'r') as f:
        for line in f:
            predictor, sep, idx = line.strip().rpartition(',')
            if not sep or not idx.isdigit():
                # Blank or truncated line (e.g. the previous run was killed
                # mid-write): skip it, so that window is embedded again
                # instead of crashing the resume.
                continue
            processed_indices2.setdefault(predictor, set()).add(int(idx))

# Process the data. Both files are opened once and flushed after every window
# rather than being reopened on each iteration; a killed run still leaves
# every completed window on disk.
with open(progress_file, 'a') as progress_fh, open(indices_file, 'a') as indices_fh:
    for p in predictors:
        # Windows are taken from the first label's chunk list; only the
        # predictor column p of each window is embedded.
        for idx, ds in enumerate(tqdm(data_test[labels[0]])):
            if idx in processed_indices2[p]:
                continue  # Skip already processed data

            context = torch.tensor(np.array(ds[p]).tolist())
            embeddings, tokenizer_state = pipeline.embed(context)

            # Upcast to float32 before converting to NumPy (presumably needed
            # because the pipeline is loaded with torch_dtype=bfloat16).
            embedding_float32 = embeddings.to(torch.float32)
            tokenizer_state_float32 = tokenizer_state.to(torch.float32)

            # One flat row: flattened embedding followed by the flattened
            # tokenizer state.
            flattened_embedding = torch.flatten(embedding_float32).numpy().tolist()
            tokenizer_state_list = tokenizer_state_float32.flatten().numpy().tolist()
            combined_list = flattened_embedding + tokenizer_state_list

            # Write the row first, then record the index as done.
            progress_fh.write(','.join(map(str, combined_list)) + '\n')
            progress_fh.flush()
            indices_fh.write(f'{p},{idx}\n')
            indices_fh.flush()

            processed_indices2[p].add(idx)

100%|██████████| 42728/42728 [13:55<00:00, 51.11it/s]
