In [None]:
# !pip3 install scipy

In [1]:
import pandas as pd
import numpy as np
import json
import glob
import os
from tqdm import tqdm

In [11]:
# Directory whose entries are read as CSV by the conversion loop below.
directory = '/workspace/data/radar/23'

In [12]:
# The '*' pattern matches every entry directly inside `directory`
# (any sub-directories included), not only CSV files.
file_paths = glob.glob(os.path.join(directory, '*'))

In [13]:
# Number of entries matched by the glob.
len(file_paths)

11520

In [15]:
# Destination for the converted files. It is created up front so the first
# open(..., "w") below cannot fail because the directory is missing.
output_directory = "/workspace/data/radar_quantized/23_quantized"
os.makedirs(output_directory, exist_ok=True)

for file_path in tqdm(file_paths):
    # Output stem: last path component, cut at its first "."
    file_name = file_path.split("/")[-1].strip().split(".")[0].strip()

    # Peek at the first data row (after the 10 skipped lines) to learn the
    # column count, then restrict the full read to exactly those columns.
    sample_df = pd.read_csv(file_path, header=None, skiprows=10, nrows=1)
    num_columns = len(sample_df.columns)

    matrix_df = pd.read_csv(
        file_path,
        header=None,
        skiprows=10,                  # skip the first 10 lines of the file
        usecols=range(num_columns),   # columns present in the first data row
    )

    matrix = matrix_df.to_numpy()

    # COO-style description: one [row, col] pair per non-zero cell, plus the
    # value stored at that position.
    indices = np.argwhere(matrix != 0)
    values = matrix[indices[:, 0], indices[:, 1]]
    shape = matrix.shape

    sparse_tensor = {
        "indices": indices.tolist(),
        "values": values.tolist(),
        "shape": list(shape)
    }

    with open(os.path.join(output_directory, f"{file_name}.json"), "w") as f:
        json.dump(sparse_tensor, f, indent=2)

100%|██████████| 11520/11520 [25:55<00:00,  7.40it/s]


In [1]:
import numpy as np
import pandas as pd
import glob
import os
from tqdm import tqdm
from scipy import sparse

# Source folder with the input files and destination folder for the .npz output
directory = '/workspace/data/radar/23'
output_directory = "/workspace/data/radar_quantized/23_quantized"

# Make sure the destination exists before anything is written
os.makedirs(output_directory, exist_ok=True)

# Every entry directly inside the source folder
file_paths = glob.glob(os.path.join(directory, '*'))

# Convert the files one at a time
for file_path in tqdm(file_paths):
    # Output stem: last path component, cut at its first "."
    file_name = file_path.rsplit("/", 1)[-1].strip().partition(".")[0].strip()

    try:
        # Peek at the first data row to learn how many columns there are
        sample_df = pd.read_csv(file_path, nrows=1, skiprows=10, header=None)
        num_columns = sample_df.shape[1]

        # Full read, limited to the columns seen in that first row
        matrix_df = pd.read_csv(
            file_path,
            usecols=range(num_columns),
            skiprows=10,  # skip the first 10 lines of each file
            header=None,
        )
        matrix = matrix_df.to_numpy()

        # Compressed-sparse-row form, written out as a SciPy .npz archive
        sparse_matrix = sparse.csr_matrix(matrix)
        output_path = os.path.join(output_directory, f"{file_name}.npz")
        sparse.save_npz(output_path, sparse_matrix)

    except Exception as e:
        # Report the failure and carry on with the remaining files
        print(f"Error processing {file_path}: {e}")

100%|██████████| 11520/11520 [08:15<00:00, 23.24it/s]


In [2]:
# Read one .npz file from the output directory back into a SciPy sparse matrix.
loaded_sparse_matrix = sparse.load_npz("/workspace/data/radar_quantized/23_quantized/20240523_194300_Rain_001.npz")

In [None]:
# Densify the matrix that was just re-loaded from disk. `sparse_matrix` is
# only the leftover loop variable (the last file converted), so using it
# here would not exercise the saved .npz file at all.
full_matrix = loaded_sparse_matrix.toarray()

In [5]:
# Show the dense array as the cell output.
full_matrix

array([[0.1, 0.1, 0.1, ..., 0.1, 0.1, 0.1],
       [0.8, 0.7, 0.8, ..., 1. , 1.1, 1.1],
       [0.9, 0.9, 0.9, ..., 1.1, 1.1, 1.2],
       ...,
       [4. , 4.1, 4. , ..., 2.7, 2.9, 4.6],
       [4.7, 6. , 7. , ..., 2.5, 3.1, 5.1],
       [4.8, 6.2, 7. , ..., 2.6, 2.8, 4.4]], shape=(300, 723))

In [1]:
import numpy as np
import pandas as pd
import glob
import os
import json
from tqdm import tqdm

# Input directory and destination for the converted files
directory = '/workspace/data/radar/23'
output_directory = "/workspace/data/radar_quantized/23_quantized"

# Create output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Get all entries in the input directory
file_paths = glob.glob(os.path.join(directory, '*'))


def _dense_to_csr_json(matrix):
    """Describe a 2-D array in CSR form using plain Python lists.

    Args:
        matrix: 2-D NumPy array.

    Returns:
        dict with the keys
        "data"    - the non-zero values, row by row,
        "indices" - the column index of each value in "data",
        "indptr"  - row r occupies data[indptr[r]:indptr[r + 1]],
        "shape"   - [n_rows, n_cols] of the dense array.
    """
    # np.nonzero reports positions in row-major order, so the values come out
    # already grouped by row, which is the ordering CSR needs.
    row_ids, col_ids = np.nonzero(matrix)

    # Count the non-zeros in each row (rows without any still get a 0), then
    # turn the counts into running offsets that start at 0.
    per_row = np.bincount(row_ids, minlength=matrix.shape[0])
    indptr = np.concatenate(([0], np.cumsum(per_row)))

    return {
        "data": matrix[row_ids, col_ids].tolist(),
        "indices": col_ids.tolist(),
        "indptr": indptr.tolist(),
        "shape": list(matrix.shape)
    }


# Process each file
for file_path in tqdm(file_paths):
    # Output stem: last path component, cut at its first "."
    file_name = file_path.split("/")[-1].strip().split(".")[0].strip()

    try:
        # Read the first data row to determine the column count
        sample_df = pd.read_csv(file_path, header=None, skiprows=10, nrows=1)
        num_columns = len(sample_df.columns)

        # Read the full dataset, limited to those columns
        matrix_df = pd.read_csv(
            file_path,
            header=None,
            skiprows=10,                # skip the first 10 lines of the file
            usecols=range(num_columns)  # columns present in the first data row
        )

        # Convert to numpy array
        matrix = matrix_df.to_numpy()

        # Build the CSR description and save it as JSON
        sparse_json = _dense_to_csr_json(matrix)
        output_path = os.path.join(output_directory, f"{file_name}.json")
        with open(output_path, "w") as f:
            json.dump(sparse_json, f)

    except Exception as e:
        # Best effort: report the failure and continue with the next file
        print(f"Error processing {file_path}: {e}")

100%|██████████| 11520/11520 [12:38<00:00, 15.18it/s]


In [2]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

Looking in indexes: https://download.pytorch.org/whl/cu128
Collecting torch
  Downloading https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl (1097.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 GB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting torchvision
  Downloading https://download.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl (8.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchaudio
  Downloading https://download.pytorch.org/whl/cu128/torchaudio-2.7.0%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl (3.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting networkx
  Downloading https://download.pytorch.org/whl/networkx

In [1]:
import torch
import numpy as np
import os
from tqdm import tqdm
import pandas as pd
import glob

# Input directory and destination for the converted files
directory = '/workspace/data/radar/23'
output_directory = "/workspace/data/radar_quantized/23_quantized"
os.makedirs(output_directory, exist_ok=True)

# Get all entries in the input directory
file_paths = glob.glob(os.path.join(directory, '*'))

for file_path in tqdm(file_paths):
    # Output stem: last path component, cut at its first "."
    file_name = file_path.split("/")[-1].strip().split(".")[0].strip()

    try:
        # Read the first data row to determine the column count
        sample_df = pd.read_csv(file_path, header=None, skiprows=10, nrows=1)
        num_columns = len(sample_df.columns)

        # Read the full dataset, limited to those columns
        matrix_df = pd.read_csv(
            file_path,
            header=None,
            skiprows=10,
            usecols=range(num_columns)
        )

        # Convert to numpy array
        matrix = matrix_df.to_numpy()

        # (row array, column array) of the non-zero cells and their values
        indices = np.nonzero(matrix)
        values = matrix[indices[0], indices[1]]

        # Index tensor in the 2 x nnz layout expected by sparse_coo_tensor.
        # torch.as_tensor with an explicit dtype replaces the legacy
        # torch.LongTensor / torch.FloatTensor constructors.
        i = torch.as_tensor(np.vstack(indices), dtype=torch.long)

        # Values are stored as float32, as before; this is a downcast whenever
        # the parsed matrix is float64.
        v = torch.as_tensor(values, dtype=torch.float32)

        # Create sparse COO tensor with the dense matrix's shape
        sparse_tensor = torch.sparse_coo_tensor(
            i, v, torch.Size(matrix.shape)
        )

        # Optional: sparse_tensor.to_sparse_csr() converts to CSR layout
        # before saving; not applied here.

        # Save to file
        output_path = os.path.join(output_directory, f"{file_name}.pt")
        torch.save(sparse_tensor, output_path)

    except Exception as e:
        # Best effort: report the failure and continue with the next file
        print(f"Error processing {file_path}: {e}")

100%|██████████| 11520/11520 [08:08<00:00, 23.60it/s]
