In [1]:
import pandas as pd
import numpy as np
import json
import glob
import os
from tqdm import tqdm

In [2]:
# Directory whose entries are listed by the glob call in the next cell.
directory = '/workspace/data/radar/24'

In [3]:
# Every entry directly inside `directory`; the bare '*' pattern applies no
# extension filter, so anything present in the folder is matched.
file_paths = glob.glob(os.path.join(directory, '*'))

In [4]:
# Display how many paths the glob matched.
len(file_paths)

11520

In [5]:
import numpy as np
import pandas as pd
import glob
import os
from tqdm import tqdm
from scipy import sparse

# Input directory that is scanned for files, and the directory that receives
# one SciPy ``.npz`` sparse matrix per successfully converted input file.
directory = '/workspace/data/radar/24'
output_directory = "/workspace/data/radar_quantized/24_quantized"

# Number of leading lines skipped in every input file ("Excel rows 1-10").
SKIP_ROWS = 10

# Create output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Keep regular files only (the bare '*' pattern also matches sub-directories,
# which read_csv cannot open) and sort so the processing order is reproducible.
file_paths = sorted(
    path
    for path in glob.glob(os.path.join(directory, '*'))
    if os.path.isfile(path)
)

# Process each file; a failure is reported and the loop moves on to the next one.
for file_path in tqdm(file_paths):
    # Output stem = base name minus its final extension. Cutting at the first
    # dot instead would map "a.1.csv" and "a.2.csv" to the same output file and
    # silently overwrite one of them.
    file_name = os.path.splitext(os.path.basename(file_path))[0].strip()

    try:
        # Probe the first retained row to learn how many columns it has.
        sample_df = pd.read_csv(file_path, header=None, skiprows=SKIP_ROWS, nrows=1)
        num_columns = len(sample_df.columns)

        # Read the whole file, pinned to the width of that first row.
        # NOTE(review): presumably the explicit usecols is there to tolerate
        # rows that are wider than the first one - confirm before simplifying
        # this into a single read_csv call.
        matrix_df = pd.read_csv(
            file_path,
            header=None,
            skiprows=SKIP_ROWS,
            usecols=range(num_columns),
        )

        # Dense array -> CSR sparse matrix -> compressed NPZ on disk.
        matrix = matrix_df.to_numpy()
        sparse_matrix = sparse.csr_matrix(matrix)

        output_path = os.path.join(output_directory, f"{file_name}.npz")
        sparse.save_npz(output_path, sparse_matrix)

    except Exception as e:
        # Best effort: log the failing path and continue with the remaining files.
        print(f"Error processing {file_path}: {e}")

100%|██████████| 11520/11520 [08:15<00:00, 23.26it/s]


In [1]:
import numpy as np
import pandas as pd
import glob
import os
import json
from tqdm import tqdm

# Input directory that is scanned for files, and the directory that receives
# one JSON file (CSR-style lists) per successfully converted input file.
directory = '/workspace/data/radar/24'
output_directory = "/workspace/data/radar_quantized/24_quantized"

# Number of leading lines skipped in every input file ("Excel rows 1-10").
SKIP_ROWS = 10

# Create output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Keep regular files only (the bare '*' pattern also matches sub-directories,
# which read_csv cannot open) and sort so the processing order is reproducible.
file_paths = sorted(
    path
    for path in glob.glob(os.path.join(directory, '*'))
    if os.path.isfile(path)
)

# Process each file; a failure is reported and the loop moves on to the next one.
for file_path in tqdm(file_paths):
    # Output stem = base name minus its final extension. Cutting at the first
    # dot instead would map "a.1.csv" and "a.2.csv" to the same output file and
    # silently overwrite one of them.
    file_name = os.path.splitext(os.path.basename(file_path))[0].strip()

    try:
        # Probe the first retained row to learn how many columns it has.
        sample_df = pd.read_csv(file_path, header=None, skiprows=SKIP_ROWS, nrows=1)
        num_columns = len(sample_df.columns)

        # Read the whole file, pinned to the width of that first row.
        # NOTE(review): presumably the explicit usecols is there to tolerate
        # rows that are wider than the first one - confirm before simplifying
        # this into a single read_csv call.
        matrix_df = pd.read_csv(
            file_path,
            header=None,
            skiprows=SKIP_ROWS,
            usecols=range(num_columns),
        )

        # Convert to numpy array
        matrix = matrix_df.to_numpy()

        # Build the CSR lists in one pass. np.nonzero scans the 2-D array in
        # row-major order, so the entries come out grouped by row with
        # ascending column indices - the same order a row-by-row scan yields.
        row_idx, col_idx = np.nonzero(matrix)
        data = matrix[row_idx, col_idx].tolist()   # non-zero values
        indices = col_idx.tolist()                 # their column indices

        # indptr[r]:indptr[r + 1] is the slice of data/indices owned by row r.
        per_row_counts = np.bincount(row_idx, minlength=matrix.shape[0])
        indptr = [0] + np.cumsum(per_row_counts).tolist()

        sparse_json = {
            "data": data,
            "indices": indices,
            "indptr": indptr,
            "shape": list(matrix.shape)
        }

        # NOTE(review): cells that pandas loads as NaN count as non-zero above
        # and json.dump writes them as the non-standard token NaN - confirm the
        # inputs contain no gaps, or that every consumer accepts that token.
        output_path = os.path.join(output_directory, f"{file_name}.json")
        with open(output_path, "w") as f:
            json.dump(sparse_json, f)

    except Exception as e:
        # Best effort: log the failing path and continue with the remaining files.
        print(f"Error processing {file_path}: {e}")

100%|██████████| 11520/11520 [11:24<00:00, 16.84it/s]


In [1]:
import torch
import numpy as np
import os
from tqdm import tqdm
import pandas as pd
import glob

# Input directory that is scanned for files, and the directory that receives
# one torch-saved sparse COO tensor (``.pt``) per successfully converted file.
directory = '/workspace/data/radar/24'
output_directory = "/workspace/data/radar_quantized/24_quantized"
os.makedirs(output_directory, exist_ok=True)

# Number of leading lines skipped in every input file.
SKIP_ROWS = 10

# Keep regular files only (the bare '*' pattern also matches sub-directories,
# which read_csv cannot open) and sort so the processing order is reproducible.
file_paths = sorted(
    path
    for path in glob.glob(os.path.join(directory, '*'))
    if os.path.isfile(path)
)

# Process each file; a failure is reported and the loop moves on to the next one.
for file_path in tqdm(file_paths):
    # Output stem = base name minus its final extension. Cutting at the first
    # dot instead would map "a.1.csv" and "a.2.csv" to the same output file and
    # silently overwrite one of them.
    file_name = os.path.splitext(os.path.basename(file_path))[0].strip()

    try:
        # Probe the first retained row to learn how many columns it has.
        sample_df = pd.read_csv(file_path, header=None, skiprows=SKIP_ROWS, nrows=1)
        num_columns = len(sample_df.columns)

        # Read the whole file, pinned to the width of that first row.
        # NOTE(review): presumably the explicit usecols is there to tolerate
        # rows that are wider than the first one - confirm before simplifying
        # this into a single read_csv call.
        matrix_df = pd.read_csv(
            file_path,
            header=None,
            skiprows=SKIP_ROWS,
            usecols=range(num_columns),
        )

        # Convert to numpy array
        matrix = matrix_df.to_numpy()

        # (row, column) coordinates of every non-zero cell and the matching values.
        indices = np.nonzero(matrix)
        values = matrix[indices[0], indices[1]]

        # COO layout expects a 2 x nnz int64 index tensor; values are stored as
        # float32. Explicit dtypes replace the legacy LongTensor/FloatTensor
        # constructors and produce the same tensor dtypes.
        i = torch.as_tensor(np.vstack(indices), dtype=torch.long)
        v = torch.as_tensor(values, dtype=torch.float32)

        # Create sparse tensor with the dense matrix's shape.
        sparse_tensor = torch.sparse_coo_tensor(
            i, v, torch.Size(matrix.shape)
        )

        # Optional: Convert to CSR format (more efficient for some operations)
        # sparse_tensor = sparse_tensor.to_sparse_csr()

        # Serialise the sparse tensor with torch.save.
        output_path = os.path.join(output_directory, f"{file_name}.pt")
        torch.save(sparse_tensor, output_path)

    except Exception as e:
        # Best effort: log the failing path and continue with the remaining files.
        print(f"Error processing {file_path}: {e}")

100%|██████████| 11520/11520 [08:00<00:00, 23.96it/s]
