In [1]:
import os, sys
from graphnet.data.sqlite.sqlite_utilities import create_table
import pandas as pd
import sqlite3
import pyarrow.parquet as pq
import sqlalchemy
from tqdm import tqdm
from typing import Any, Dict, List, Optional
import numpy as np
import gc

[1;34mgraphnet[0m: [32mINFO    [0m 2023-02-07 19:48:39 - get_logger - Writing log to [1mlogs/graphnet_20230207-194839.log[0m


In [2]:
# Inputs: the metadata parquet and the folder holding the batch_<id>.parquet files.
meta_data_path = './data/train_meta.parquet'
input_data_folder = './data/train'

# Sensor geometry table; load_input takes the x/y/z columns from it.
geometry_table = pd.read_csv('./data/sensor_geometry.csv')

# Output SQLite database and the SQLAlchemy engine used for the inserts.
database_path = '/media/mo/SSD_4TB/database_all.db'
engine = sqlalchemy.create_engine(f"sqlite:///{database_path}")

In [3]:
def load_input(meta_batch: pd.DataFrame,
               input_data_folder: str,
               geometry: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    """Load the pulse readings of one batch and attach sensor coordinates.

    Args:
        meta_batch: Metadata rows of the batch to load; its ``batch_id``
            column must hold exactly one distinct value.
        input_data_folder: Folder holding the ``batch_<id>.parquet`` files.
        geometry: Sensor geometry with ``x``, ``y`` and ``z`` columns. Rows
            are looked up by the ``sensor_id`` column when it exists and by
            the frame's index otherwise. Defaults to the notebook-level
            ``geometry_table``.

    Returns:
        The readings of the batch with ``x``/``y``/``z`` added (a column that
        already exists in the readings is left untouched), ``auxiliary`` cast
        to ``int64`` and the readings' index moved into a regular column.

    Raises:
        AssertionError: If ``meta_batch`` does not reference exactly one
            batch id.
    """
    if geometry is None:
        # Backward-compatible fallback to the table loaded in the config cell.
        geometry = geometry_table

    batch_id = meta_batch['batch_id'].unique()
    assert len(batch_id) == 1, "contains multiple batch_ids. Did you set the batch_size correctly?"

    detector_readings = pd.read_parquet(path = f'{input_data_folder}/batch_{batch_id[0]}.parquet')

    # Key the lookup on sensor_id explicitly instead of relying on the row
    # position of the geometry table happening to equal the sensor id.
    if 'sensor_id' in geometry.columns:
        geometry = geometry.set_index('sensor_id')
    sensor_positions = geometry.loc[detector_readings['sensor_id'].to_numpy(), ['x', 'y', 'z']]

    for column in sensor_positions.columns:
        if column not in detector_readings.columns:
            # Positional assignment: the rows are already in reading order,
            # so no index alignment is needed (the readings' index may repeat).
            detector_readings[column] = sensor_positions[column].to_numpy()

    # Explicit cast rather than relying on the dtype that replace() infers.
    detector_readings['auxiliary'] = detector_readings['auxiliary'].astype('int64')
    return detector_readings.reset_index()


In [4]:
def add_to_table(database_path: str,
                 df: pd.DataFrame,
                 table_name: str,
                 is_primary_key: bool,
                 engine: sqlalchemy.engine.base.Engine) -> None:
    """Append ``df`` to ``table_name``, asking graphnet to create the table first.

    Args:
        database_path: Path of the SQLite database file, forwarded to
            ``create_table``.
        df: Rows to append; its columns are forwarded to ``create_table``.
        table_name: Name of the target table.
        is_primary_key: Forwarded as ``integer_primary_key`` to
            ``create_table``.
        engine: SQLAlchemy engine used for the insert; it is disposed before
            returning.

    Raises:
        sqlite3.OperationalError: If ``create_table`` fails for any reason
            other than the table already existing.
    """
    try:
        create_table(
            columns=df.columns,
            database_path=database_path,
            table_name=table_name,
            integer_primary_key=is_primary_key,
            index_column='event_id',
        )
    except sqlite3.OperationalError as error:
        # The table is expected to exist from the second batch onwards;
        # every other operational error is re-raised.
        table_exists = 'already exists' in str(error)
        if not table_exists:
            raise error

    df.to_sql(table_name, con=engine, index=False, if_exists="append", chunksize=200000)
    engine.dispose()

In [5]:
def convert_to_sqlite(meta_data_path: str,
                      database_path: str,
                      input_data_folder: str,
                      batch_size: int = 200000,
                      batch_ids: Optional[List[int]] = None,
                      engine: Optional[sqlalchemy.engine.base.Engine] = None
                      ) -> None:
    """Stream the metadata parquet and write the selected batches to SQLite.

    The metadata file is read in chunks of ``batch_size`` rows. Chunks are
    numbered from 1 in reading order and a chunk is converted when its number
    is in ``batch_ids``. Each converted chunk is appended to ``meta_table``
    and the matching pulses returned by ``load_input`` are appended to
    ``pulse_table``.

    Note that the pulse file opened by ``load_input`` is chosen from the
    ``batch_id`` column of the chunk, not from the chunk number, so
    ``batch_size`` has to match the number of metadata rows per batch.

    Args:
        meta_data_path: Path of the metadata parquet file.
        database_path: Path of the SQLite database file to write to.
        input_data_folder: Folder holding the ``batch_<id>.parquet`` files.
        batch_size: Number of metadata rows per chunk.
        batch_ids: Chunk numbers to convert. Defaults to 1 through 660.
        engine: SQLAlchemy engine used for the inserts. When omitted, one is
            created for ``database_path``.
    """
    # Resolved per call so that no default list is shared between calls.
    if batch_ids is None:
        batch_ids = range(1, 661)
    # A set gives constant-time membership tests and ignores duplicate ids,
    # which would otherwise keep the early exit below from ever triggering.
    wanted_ids = set(batch_ids)

    # The previous default of None failed inside add_to_table; build a usable
    # engine from the database path instead.
    if engine is None:
        engine = sqlalchemy.create_engine("sqlite:///" + database_path)

    meta_data_iter = pq.ParquetFile(meta_data_path).iter_batches(batch_size = batch_size)
    converted_batches = []
    for batch_id, meta_data_batch in enumerate(tqdm(meta_data_iter), start=1):
        if batch_id in wanted_ids:
            meta_data_batch = meta_data_batch.to_pandas()
            add_to_table(database_path = database_path,
                         df = meta_data_batch,
                         table_name = 'meta_table',
                         is_primary_key = True,
                         engine = engine)
            pulses = load_input(meta_batch = meta_data_batch, input_data_folder = input_data_folder)
            # Drop the large frames as soon as they are written so they can be
            # reclaimed before the next chunk is loaded.
            del meta_data_batch
            add_to_table(database_path = database_path,
                         df = pulses,
                         table_name = 'pulse_table',
                         is_primary_key = False,
                         engine = engine)
            del pulses
            converted_batches.append(batch_id)
        # Stop reading metadata once every requested chunk has been written.
        if len(converted_batches) == len(wanted_ids):
            break
        gc.collect()
    print(f'Conversion Complete! Database available at\n {database_path}')

In [6]:
# Convert with the default batch size and batch-id selection.
# Long-running: the recorded run below took more than 30 hours.
convert_to_sqlite(
    meta_data_path=meta_data_path,
    database_path=database_path,
    input_data_folder=input_data_folder,
    engine=engine,
)

659it [30:40:04, 167.53s/it]

Conversion Complete! Database available at
 /media/mo/SSD_4TB/database_all.db





In [7]:
# Sanity check: count the rows written to pulse_table.
database_path = '/media/mo/SSD_4TB/database_all.db'
meta_query = 'SELECT COUNT(1) FROM pulse_table'

# sqlite3's connection context manager only commits or rolls back the
# transaction and leaves the connection open, so close it explicitly.
conn = sqlite3.connect(database_path)
try:
    meta_data = pd.read_sql(meta_query, conn)
finally:
    conn.close()

In [8]:
# Display the frame returned by the query in the previous cell.
# NOTE(review): the saved output below lists pulse rows rather than a single
# COUNT value, so it appears to come from an earlier query - re-run to refresh.
meta_data

Unnamed: 0,event_id,sensor_id,time,charge,auxiliary,x,y,z
0,24,3918,5928,1.325,1,303.41,335.64,206.58
1,24,4157,6115,1.175,1,-145.45,374.24,212.73
2,24,3520,6492,0.925,1,505.27,257.88,-174.6
3,24,5041,6665,0.225,1,-9.68,-79.5,181.0
4,24,2948,8054,1.575,1,576.37,170.92,357.88
5,24,860,8124,0.675,1,-290.66,-307.38,163.61
6,24,2440,8284,1.625,1,-526.63,-15.6,-178.17
7,24,1743,8478,0.775,1,500.43,-58.45,450.79
8,24,3609,8572,1.025,1,-313.6,237.44,348.01
9,24,5057,8680,3.975,1,-9.68,-79.5,-205.47
