In [13]:
import sqlite3 as sql
import pandas as pd
import sys
import os
import matplotlib.pyplot as plt
import numpy as np

from collections import defaultdict
from typing import List
from tqdm import tqdm

import pyarrow as pa
import pyarrow.parquet as pq
# 2 min 22 sec in HEP04

In [14]:
sys.path.append('/groups/icecube/cyan/Utils')
from PlotUtils import setMplParam, getColour, getHistoParam 
# getHistoParam:
# Nbins, binwidth, bins, counts, bin_centers  = 
from DB_lister import list_content, list_tables
from ExternalFunctions import nice_string_output, add_text_to_ax
setMplParam()

In [15]:
def convertDFtoDB(file:str, table:str, df: pd.DataFrame) -> None:
    con = sql.connect(file)
    df.to_sql(table, con, if_exists='replace', index=False)
    con.close()

In [16]:
def convertDBtoDF(file:str, table:str, N_events_total:int, N_events:int = None) -> pd.DataFrame:
    con = sql.connect(file)
    if N_events is None or N_events > N_events_total:
        N_events = N_events_total
    # Query to fetch the first `N_events` unique event_no values
    event_no_query = f'SELECT DISTINCT event_no FROM {table} LIMIT {N_events}'
    event_nos = pd.read_sql_query(event_no_query, con)['event_no'].tolist()
    
    # Use the selected event_no values to filter the main data
    event_filter = ','.join(map(str, event_nos))  # Convert to comma-separated string for SQL IN clause
    query = f'SELECT * FROM {table} WHERE event_no IN ({event_filter})'
    
    # Read data and close the connection
    df = pd.read_sql_query(query, con)
    con.close()
    
    return df

In [17]:
def get_table_row_count(file: str, table: str) -> int:
    conn = sql.connect(file)
    cursor = conn.cursor()
    cursor.execute(f"SELECT COUNT(*) FROM {table}")
    row_count = cursor.fetchone()[0]
    conn.close()
    return row_count

In [18]:
def convertDBtoDF(file:str, table:str, Nlines_model:int = None) -> pd.DataFrame:
    if Nlines_model is None:
        Nlines_model = get_table_row_count(file, table)
    print(f'Loading {Nlines_model} rows from {table} in {file}')
    con = sql.connect(file)
    query = f'SELECT * FROM {table} LIMIT {Nlines_model}'
    df = pd.read_sql_query(query, con)
    con.close()
    return df

In [19]:
def get_table_event_count(conn: sql.Connection, table: str) -> int:
    cursor = conn.cursor()
    cursor.execute(f"SELECT COUNT(DISTINCT event_no) FROM {table}")
    event_count = cursor.fetchone()[0]
    return event_count

In [20]:
def load_reference_data(filepath: str) -> np.ndarray:
    df = pd.read_csv(filepath)
    return df.values  # Convert the DataFrame to a NumPy array
dom_ref_pos = load_reference_data('/groups/icecube/cyan/factory/DOMification/dom_ref_pos/unique_string_dom_completed.csv')

In [54]:
def addStringAndDOMtoDB(con_source: sql.Connection, 
                        source_table: str,
                        reference_data: np.ndarray,
                        event_no_subset: List[int],
                        tolerance_xy: float = 10,
                        tolerance_z: float = 2) -> None:
        cur_source = con_source.cursor()
        cur_source.execute(f"PRAGMA table_info({source_table})")
        existing_columns = [col[1] for col in cur_source.fetchall()]
        
        # Add `string` and `dom_number` columns if they don’t exist
        if 'string' not in existing_columns:
            cur_source.execute(f"ALTER TABLE {source_table} ADD COLUMN string INTEGER")
        if 'dom_number' not in existing_columns:
            cur_source.execute(f"ALTER TABLE {source_table} ADD COLUMN dom_number INTEGER")
        
        if event_no_subset:
            event_filter = ','.join(map(str, event_no_subset))
            query = f"""
                SELECT rowid, dom_x, dom_y, dom_z 
                FROM {source_table}
                WHERE event_no IN ({event_filter}) 
                AND (string IS NULL OR dom_number IS NULL)
            """
        else:
            query = f"""
                SELECT rowid, dom_x, dom_y, dom_z 
                FROM {source_table}
                WHERE string IS NULL OR dom_number IS NULL
            """
        
        # Select rows where `string` or `dom_number` is NULL
        cur_source.execute(query)
        rows_to_update = cur_source.fetchall()
        
        for row in rows_to_update:
            row_id, dom_x, dom_y, dom_z = row
            
            matches_xy = reference_data[
                (np.abs(reference_data[:, 2] - dom_x) <= tolerance_xy) &
                (np.abs(reference_data[:, 3] - dom_y) <= tolerance_xy)
            ]
            
            if len(matches_xy) > 0:
                match_z = matches_xy[np.abs(matches_xy[:, 4] - dom_z) <= tolerance_z]
                
                if len(match_z) > 0:
                    string_val = int(match_z[0, 0])
                    dom_number_val = int(match_z[0, 1])
                    
                    cur_source.execute(
                        f"UPDATE {source_table} SET string = ?, dom_number = ? WHERE rowid = ?", (string_val, dom_number_val, row_id))
        # Commit all updates
        con_source.commit()

In [23]:
def getTruthTableNameDB(con_source: sql.Connection) -> str:
    cur_source = con_source.cursor()
    cur_source.execute("SELECT name FROM sqlite_master WHERE type='table'")
    tables = [row[0] for row in cur_source.fetchall()]
    if 'truth' in tables:
        truth_table = 'truth'
    elif 'Truth' in tables:
        truth_table = 'Truth'
    else:
        raise ValueError("Neither 'truth' nor 'Truth' table exists in the source database.")
    return truth_table

In [25]:
def getTruthPA(con_source: sql.Connection, 
            source_table: str,
            event_no_subset: List[int],
            subdirectory_no: int, 
            db_file_no: int, 
            shard_index: int) -> pd.DataFrame:
        """
        Fetches selected data for the truth table with additional metadata columns, including calculated offsets and reordered columns.
        
        Parameters:
            con_source (sql.Connection): Connection to the SQLite database.
            source_table (str): The table name in the database.
            event_nos (list): List of event numbers for the current shard.
            subdirectory_no (int): Numeric identifier for the source subdirectory.
            db_file_no (int): Numeric identifier for the database file.
            shard_index (int): Index of the current shard.
        
        Returns:
            pd.DataFrame: DataFrame with receipt data, offsets, and selected truth data.
        """
        truth_table_name = getTruthTableNameDB(con_source)
        
        receipt_data = {
            'event_no': event_no_subset,
            'subdirectory_no': [subdirectory_no] * len(event_no_subset),
            'db_file_no': [db_file_no] * len(event_no_subset),
            'shard_index': [shard_index] * len(event_no_subset),
            'file_no': [shard_index] * len(event_no_subset)  # file number is the shard index in target
        }
        df_receipt = pd.DataFrame(receipt_data)

        selected_truth_features = ['energy', 'azimuth', 'zenith', 'pid']
        columns = ['event_no'] + selected_truth_features  # Ensure event_no is included for merging
        
        event_filter = ','.join(map(str, event_no_subset))
        query = f"SELECT {', '.join(columns)} FROM {truth_table_name} WHERE event_no IN ({event_filter})"
        df_truth = pd.read_sql_query(query, con_source)
        
        # N unique (string, dom_number) combination
        activated_doms = []
        for event_no in event_no_subset:
            query = f"""
                SELECT COUNT(DISTINCT string || '-' || dom_number) AS N_doms
                FROM {source_table}
                WHERE event_no = ?
            """
            unique_count = pd.read_sql_query(query, con_source, params=(event_no,))['N_doms'][0]
            activated_doms.append(unique_count)
        
        df_receipt['N_doms'] = activated_doms
        df_receipt['offset'] = df_receipt['N_doms'].cumsum().shift(fill_value=0)

        df_combined = pd.merge(df_receipt, df_truth, on='event_no', how='inner')
        df_combined = df_combined[['event_no', 'subdirectory_no', 'db_file_no', 'shard_index', 'offset', 'file_no', 'N_doms'] + selected_truth_features]
        
        return df_combined

* this conversion from db to parquet inevitably requires use of pandas dataframe.
* it would be much desirable if the data is directly converted from I3 to PMTfied parquet.

In [None]:
def getPMTfiedPA(con_source: sql.Connection, 
                source_table: str,
                event_no_subset: List[int]) -> pa.Table:
    addStringAndDOMtoDB(
        con_source=con_source,
        source_table=source_table,
        reference_data=dom_ref_pos,
        event_no_subset=event_no_subset
    )
    event_filter = ','.join(map(str, event_no_subset))
    query = f"""SELECT * 
            
            FROM {source_table} 
            WHERE event_no IN ({event_filter})
            """
    cur_source = con_source.cursor()
    cur_source.execute(query)
    rows = cur_source.fetchall()

    # Get column names for indexing
    columns = [description[0] for description in cur_source.description]
    event_no_idx = columns.index('event_no')
    
    event_no_idx = columns.index('event_no')
    dom_string_idx = columns.index('string')
    dom_number_idx = columns.index('dom_number')
    dom_x_idx = columns.index('dom_x')
    dom_y_idx = columns.index('dom_y')
    dom_z_idx = columns.index('dom_z')
    dom_time_idx = columns.index('dom_time')
    dom_hlc_idx = columns.index('hlc')
    dom_charge_idx = columns.index('charge')
    pmt_area_idx = columns.index('pmt_area')
    rde_idx = columns.index('rde')
    saturation_status_idx = columns.index('is_saturated_dom')
    
    def getMaxQtotal(all_pulses_event: List[List[List[float]]]) -> float:
        Qsums = [sum([pulse[dom_charge_idx] for pulse in pulses]) for pulses in all_pulses_event]
        return max(Qsums)
    
    def getQweightedAverageDOMposition(all_pulses_event: List[List[List[float]]], maxQtotal: float) -> List[float]:
        dom_x = [pulse[dom_x_idx] for pulses_dom in all_pulses_event for pulse in pulses_dom]
        dom_y = [pulse[dom_y_idx] for pulses_dom in all_pulses_event for pulse in pulses_dom]
        dom_z = [pulse[dom_z_idx] for pulses_dom in all_pulses_event for pulse in pulses_dom]
        charge_sums = [pulse[dom_charge_idx] for pulses_dom in all_pulses_event for pulse in pulses_dom]

        weighted_x = np.mean([x * charge / maxQtotal for x, charge in zip(dom_x, charge_sums)])
        weighted_y = np.mean([y * charge / maxQtotal for y, charge in zip(dom_y, charge_sums)])
        weighted_z = np.mean([z * charge / maxQtotal for z, charge in zip(dom_z, charge_sums)])

        return [weighted_x, weighted_y, weighted_z]
        
    def getRelativeDOMposition(dom_x: float, dom_y: float, dom_z: float, avg_dom_position: List[float]) -> List[float]:
        return [dom_x - avg_dom_position[0], dom_y - avg_dom_position[1], dom_z - avg_dom_position[2]]
    
    # NOTE pulses_dom: [pulse, ...]
    def getDOMposition(pulses_dom: List[List[float]]) -> List[float]:
        return [pulses_dom[0][dom_x_idx], pulses_dom[0][dom_y_idx], pulses_dom[0][dom_z_idx]]
    
    def getDOMstring(pulses_dom: List[List[float]]) -> int:
        return pulses_dom[0][dom_string_idx]
    
    def getDOMnumber(pulses_dom: List[List[float]]) -> int:
        return pulses_dom[0][dom_number_idx]
    
    def getPmtArea(pulses_dom: List[List[float]]) -> float:
        return pulses_dom[0][pmt_area_idx]
    
    def getRDE(pulses_dom: List[List[float]]) -> float:
        return pulses_dom[0][rde_idx]
    
    def getSaturationStatus(pulses_dom: List[List[float]]) -> int:
        return pulses_dom[0][saturation_status_idx]
    
    def getFirstHlc(pulses_dom: List[List[float]]) -> List[int]:
        n = 3
        _fillIncomplete = -1
        if len(pulses_dom) < n:
            hlc = [pulse[dom_hlc_idx] for pulse in pulses_dom]
            hlc.extend([_fillIncomplete] * (n - len(hlc)))
        else:
            hlc = [pulse[dom_hlc_idx] for pulse in pulses_dom[:n]]
        return hlc
    
    def getFirstPulseTime(pulses_dom: List[List[float]], saturationStatus: int) -> List[float]:
        n = 3
        # HACK consider changing the fill values
        _fillSaturated = -1
        _fillIncomplete = -1
        
        if saturationStatus == 1:
            pulse_times = [_fillSaturated] * n
        elif len(pulses_dom) < n:
            pulse_times = [pulse[dom_time_idx] for pulse in pulses_dom]
            pulse_times.extend([_fillIncomplete] * (n - len(pulse_times)))
        else:
            pulse_times = [pulse[dom_time_idx] for pulse in pulses_dom[:n]]
        return pulse_times
    
    # HACK necessary?
    def getFirstHlcPulseTime(pulses_dom: List[List[float]], saturationStatus: int) -> List[float]:
        n = 3
        _fillSaturated = -1
        _fillIncomplete = -1
        if saturationStatus == 1:
            pulse_times = [_fillSaturated] * n
        elif len(pulses_dom) < n:
            pulse_times = [pulse[dom_time_idx] for pulse in pulses_dom if pulse[dom_hlc_idx] == 1]
            pulse_times.extend([_fillIncomplete] * (n - len(pulse_times)))
        else:
            pulse_times = [pulse[dom_time_idx] for pulse in pulses_dom if pulse[dom_hlc_idx] == 1][:n]
        return pulse_times
        
    def getElapsedTimeUntilChargeFraction(pulses_dom: List[List[float]], saturationStatus: int, percentile1 = 10, percentile2 = 50) -> List[float]:
        # HACK consider changing the fill values
        _fillSaturated = -1
        _fillIncomplete = -1
        if saturationStatus == 1:
            times = [_fillSaturated] * 2
        elif len(pulses_dom) < 2:
            times = [_fillIncomplete] * 2
        else:
            Qtotal = sum([pulse[dom_charge_idx] for pulse in pulses_dom])
            t_0 = pulses_dom[0][dom_time_idx]
            Qcum = 0
            T_first, T_second = -1, -1 # if these are not -1, then they are assigned
            for pulse in pulses_dom:
                Qcum += pulse[dom_charge_idx]
                if Qcum > percentile1 / 100 * Qtotal and T_first == -1:
                    T_first = pulse[dom_time_idx] - t_0
                if Qcum > percentile2 / 100 * Qtotal:
                    T_second = pulse[dom_time_idx] - t_0
                    break
            times = [T_first, T_second]
        return times
    
    def getStandardDeviation(pulse_times: List[float], saturationStatus: int) -> float:
        # HACK consider changing the fill values
        _fillSaturated = 0
        _fillIncomplete = 0
        if saturationStatus == 1:
            sigmaT = _fillSaturated
        elif len(pulse_times) < 2:
            sigmaT = _fillIncomplete
        else:
            sigmaT = np.std(pulse_times)
        return sigmaT
    
    def getFirstChargeReadout(pulses: List[List[float]], saturationStatus: int) -> List[float]:
        # HACK consider changing the fill values
        _fillSaturated = -1
        _fillIncomplete = -1
        n = 3
        if saturationStatus == 1:
            charge_readouts = [_fillSaturated] * n
        elif len(pulses) < n:
            charge_readouts = [pulse[dom_charge_idx] for pulse in pulses]
            charge_readouts.extend([_fillIncomplete] * (n - len(charge_readouts)))
        else:
            charge_readouts = [pulse[dom_charge_idx] for pulse in pulses[:n]]
        return charge_readouts
    
    def getAccumulatedChargeAfterNanoSec(pulses: List[List[float]], saturationStatus: int, interval1 = 25, interval2 = 75) -> List[float]:
        # HACK consider changing the fill values
        _fillSaturated = -1
        _fillIncomplete = -1
        if saturationStatus == 1:
            Qs = [_fillSaturated] * 3
        elif len(pulses) < 1:
            Qs = [_fillIncomplete] * 3
        else:
            Qtotal = sum([pulse[dom_charge_idx] for pulse in pulses])
            t_0 = pulses[0][dom_time_idx]
            Qinterval1 = sum([pulse[dom_charge_idx] for pulse in pulses if pulse[dom_time_idx] - t_0 < interval1])
            Qinterval2 = sum([pulse[dom_charge_idx] for pulse in pulses if pulse[dom_time_idx] - t_0 < interval2])
            Qs = [Qinterval1, Qinterval2, Qtotal]
        return Qs
    
    def processDOM(pulses: List[List[float]], avg_dom_position: List[float]):
        dom_string = getDOMstring(pulses)
        dom_number = getDOMnumber(pulses)
        dom_x, dom_y, dom_z = getDOMposition(pulses)
        dom_x_rel, dom_y_rel, dom_z_rel = getRelativeDOMposition(dom_x, dom_y, dom_z, avg_dom_position)
        pmt_area = getPmtArea(pulses)
        rde = getRDE(pulses)
        saturation_status = getSaturationStatus(pulses)
        
        # Get remaining features
        first_three_charge_readout = getFirstChargeReadout(pulses, saturation_status)
        accumulated_charge_after_nano_sec = getAccumulatedChargeAfterNanoSec(pulses, saturation_status)
        first_three_pulse_time = getFirstPulseTime(pulses, saturation_status)
        # first_three_hlc_pulse_time = getFirstHlcPulseTime(pulses, saturation_status)
        first_three_hlc = getFirstHlc(pulses)
        elapsed_time_until_charge_fraction = getElapsedTimeUntilChargeFraction(pulses, saturation_status)
        standard_deviation = getStandardDeviation([pulse[dom_time_idx] for pulse in pulses], saturation_status)
        
        data_dom = (
                    # [dom_string, dom_number]            # dom_number
                    [dom_x, dom_y, dom_z]             # dom_x, dom_y, dom_z
                    + [dom_x_rel, dom_y_rel, dom_z_rel] # dom_x_rel, dom_y_rel, dom_z_rel
                    + [pmt_area, rde, saturation_status]# pmt_area, rde, saturationStatus
                    + first_three_charge_readout        # q1, q2, q3
                    + accumulated_charge_after_nano_sec # Q25, Q75, Qtotal
                    + first_three_hlc                   # hlc1, hlc2, hlc3
                    + first_three_pulse_time            # t1, t2, t3
                    # + first_three_hlc_pulse_time        # t1_hlc, t2_hlc, t3_hlc
                    + elapsed_time_until_charge_fraction# T10, T50
                    + [standard_deviation]              # sigmaT
                    )
        return data_dom            
    # original data
    events_doms_pulses = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    # new data
    processed_data = []
    for row in rows:
        event_no = row[event_no_idx]
        string = row[dom_string_idx]
        dom_number = row[dom_number_idx]
        events_doms_pulses[event_no][string][dom_number].append(row)
    
    # NOTE data structure
    # events_doms_pulses  : {event_no: {string: {dom_number: [pulse, ...], ...}, ...}, ...}
    # strings_doms_pulses :            {string: {dom_number: [pulse, ...], ...}, ...}
    # doms_pulses         :                     {dom_number: [pulse, ...], ...}
    # pulses              :                                  [pulse, ...]
    for event_no, strings_doms_pulses in events_doms_pulses.items():
        for doms_pulses in strings_doms_pulses.values():
            # Convert the values to a list of pulses (rows)
            all_pulses_event = list(doms_pulses.values())
            maxQtotal = getMaxQtotal(all_pulses_event)
            avg_dom_position = getQweightedAverageDOMposition(all_pulses_event, maxQtotal)
            for pulses in doms_pulses.values():
                # dom_data = [event_no] + processDOM(pulses, avg_dom_position)
                dom_data = processDOM(pulses, avg_dom_position)
                processed_data.append(dom_data)

    # Convert the processed data into a DataFrame for easier handling
    df_processed = pd.DataFrame(processed_data, columns=[
        # 'event_no', 'dom_string', 'dom_number', # indices
        'dom_x', 'dom_y', 'dom_z',  
        'dom_x_rel', 'dom_y_rel', 'dom_z_rel', 
        'pmt_area', 'rde', 'saturation_status', 
        'q1', 'q2', 'q3', 
        'Q25', 'Q75', 'Qtotal',
        'hlc1', 'hlc2', 'hlc3', 
        't1', 't2', 't3', 
        'T10', 'T50', 'sigmaT'
    ])
    pa_processed = pa.Table.from_pandas(df_processed)
    return pa_processed

* `runPMTfication_DB_Parquet` layer is intended to intervene the process of sqlite connection and close

In [28]:
def get_subdirectory_no(subdirectory: str) -> int:
    """
    subdirectory (str): Name of the subdirectory (e.g., "22010").
    """
    try:
        # Convert the subdirectory name directly to an integer
        subdirectory_no = int(subdirectory)
    except ValueError:
        raise ValueError(f"Invalid subdirectory name: '{subdirectory}' is not a numeric value.")
    
    return subdirectory_no

In [29]:
def get_db_file_no(file: str) -> int:
    """
    file (str): Name of the file (e.g., "merged_part_1.db").
    """
    try:
        # Assuming format like "merged_part_X.db" where X is the file number
        file_no = int(file.split('_')[-1].split('.')[0])
    except (IndexError, ValueError):
        raise ValueError(f"Invalid file name format: '{file}' does not contain a numeric identifier.")
    
    return file_no

The initial SRTInIcePulses and Truth table may have more than 30,000 to 42,000,000 events within a db file and there can be some 30 db files in a subdirectory so it can be quite messy when the conversion finishes so I want to make a subsubdirectory under the same name of the subdirectory

In [30]:
def print_table_event_count(file: str, table: str):
    conn = sql.connect(file)
    event_count = get_table_event_count(conn, table)
    print(f"Table {table} has {event_count} unique events")
    conn.close()

In [123]:
def pmtfy_shard(
                con_source: sql.Connection,
                source_table: str,
                dest_root: str,
                source_subdirectory: str,
                db_file_no: int,
                shard_index: int,
                offset: int,
                limit: int) -> pd.DataFrame:
        """
        Processes a shard of events from the database, saves PMTfied data to a file, and returns the truth+receipt data.
        """
        # Query the subset of event_no for this shard
        event_no_query = f"""
            SELECT DISTINCT event_no 
            FROM {source_table}
            ORDER BY event_no ASC
            LIMIT {limit} OFFSET {offset}
        """
        event_nos = pd.read_sql_query(event_no_query, con_source)['event_no'].tolist()
        
        if not event_nos:
            return pd.DataFrame()  # Return an empty DataFrame if no events in this shard

        pa_pmtfied = getPMTfiedPA(
            con_source=con_source,
            source_table=source_table,
            event_no_subset=event_nos
        )
        dest_dir = os.path.join(dest_root, source_subdirectory, str(db_file_no))
        os.makedirs(dest_dir, exist_ok=True)

        pmtfied_file = os.path.join(dest_dir, f"PMTfied_{shard_index}.parquet")        
        print(f"Saving shard {shard_index} to {pmtfied_file}")
        pq.write_table(pa_pmtfied, pmtfied_file)  # Save PMTfied data to a file

        
        truth_df = getTruthPA(
            con_source,
            source_table=source_table,
            event_no_subset=event_nos,
            subdirectory_no=int(source_subdirectory),
            db_file_no=db_file_no,
            shard_index=shard_index
        )

        return truth_df

In [124]:
def divide_and_conquer_db(
                        con_source: sql.Connection, 
                        source_table: str, 
                        dest_root: str, 
                        source_subdirectory: str, 
                        db_file_no: int, 
                        N_events_per_shard: int) -> pd.DataFrame:
    """
    Divides the database events into shards and processes each shard, consolidating the truth+receipt data.
    """
    all_shards_df = []
    N_events_total = get_table_event_count(con_source, source_table)
    num_shards = (N_events_total + N_events_per_shard - 1) // N_events_per_shard
    
    for shard_index in range(num_shards):
        offset = shard_index * N_events_per_shard
        limit = min(N_events_per_shard, N_events_total - offset)

        # Process the shard and retrieve the truth+receipt data
        shard_df = pmtfy_shard(
            con_source=con_source,
            source_table=source_table,
            dest_root=dest_root,
            source_subdirectory=source_subdirectory,
            db_file_no=db_file_no,
            shard_index=shard_index + 1,
            offset=offset,
            limit=limit
        )
        
        # Append shard data to the collection
        if not shard_df.empty:
            all_shards_df.append(shard_df)

    consolidated_df = pd.concat(all_shards_df, ignore_index=True)
    return consolidated_df
    

In [None]:

def pmtfy_db(
            source_subdirectory: str, 
            source_file: str, 
            dest_root: str, 
            source_table: str, 
            N_events_per_shard: int = 2000) -> None:
    """
    Splits the database file into shards, processes each shard, and consolidates the truth+receipt data.
    """
    # Establish a connection to the database
    con_source = sql.connect(source_file)
    
    # Divide and conquer: split events into shards and process
    db_file_no = get_db_file_no(source_file)
    consolidated_df = divide_and_conquer_db(
        con_source=con_source,
        source_table=source_table,
        dest_root=dest_root,
        source_subdirectory=source_subdirectory,
        db_file_no=db_file_no,
        N_events_per_shard=N_events_per_shard,
    )

    # Save the consolidated DataFrame
    dest_subdirectory_path = os.path.join(dest_root, source_subdirectory)
    os.makedirs(dest_subdirectory_path, exist_ok=True)
    consolidated_file = os.path.join(dest_subdirectory_path, f"truth_{db_file_no}.parquet")
    consolidated_pa = pa.Table.from_pandas(consolidated_df)
    pq.write_table(consolidated_pa, consolidated_file)
    

    con_source.close()

In [126]:
def pmtfy_subdir(
                    source_root: str, 
                    dest_root: str, 
                    subdirectory_name: str, 
                    source_table: str, 
                    N_events_per_shard: int = 2000) -> None:
    """
    Processes each database file in a specific subdirectory and saves results in a mirrored directory structure.
    """
    subdirectory_path = os.path.join(source_root, subdirectory_name)
    if os.path.isdir(subdirectory_path) and subdirectory_name.isdigit():
        # List all files to process in the directory
        files = [f for f in os.listdir(subdirectory_path) if f.endswith('.db')]
        
        for filename in tqdm(files, desc=f"Processing {subdirectory_name}"):
            source_file = os.path.join(subdirectory_path, filename)
            
            # Process each database file within the subdirectory
            pmtfy_db(
                source_subdirectory=subdirectory_name,
                source_file=source_file,
                dest_root=dest_root,
                source_table=source_table,
                N_events_per_shard=N_events_per_shard,
            )
            

In [127]:
root_dir = "/lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/sqlite_pulses/Snowstorm/"
target_dir = "/lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/"

In [128]:
pmtfy_subdir(root_dir, 
            target_dir, 
            "99999", 
            "SRTInIcePulses", 
            N_events_per_shard=10,)
# 17 sec

Processing 99999:   0%|          | 0/2 [00:00<?, ?it/s]

Saving shard 1 to /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/99999/98/PMTfied_1.parquet
Saving shard 2 to /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/99999/98/PMTfied_2.parquet
Saving shard 3 to /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/99999/98/PMTfied_3.parquet
Saving shard 4 to /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/99999/98/PMTfied_4.parquet
Saving shard 5 to /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/99999/98/PMTfied_5.parquet
Saving shard 6 to /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/99999/98/PMTfied_6.parquet
Saving shard 7 to /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/99999/98/PMTfied_7.parquet


Processing 99999:  50%|█████     | 1/2 [00:07<00:07,  7.82s/it]

Saving shard 8 to /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/99999/98/PMTfied_8.parquet
Saving shard 1 to /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/99999/99/PMTfied_1.parquet
Saving shard 2 to /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/99999/99/PMTfied_2.parquet
Saving shard 3 to /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/99999/99/PMTfied_3.parquet
Saving shard 4 to /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/99999/99/PMTfied_4.parquet
Saving shard 5 to /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/99999/99/PMTfied_5.parquet
Saving shard 6 to /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/99999/99/PMTfied_6.parquet
Saving shard 7 to /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/99999/99/PMTfied_7.parquet


Processing 99999: 100%|██████████| 2/2 [00:15<00:00,  7.95s/it]

Saving shard 8 to /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/99999/99/PMTfied_8.parquet





In [129]:
def convertParquetToDF(file:str) -> pd.DataFrame:
    table = pq.read_table(file)
    df = table.to_pandas()
    return df

In [159]:
# hello means 99, hej means 98
HelloFromDBside = root_dir + "99999/merged_part_99.db"
HejFromDBside = root_dir + "99999/merged_part_98.db"

HelloPMTfied1 = target_dir + "99999/99/PMTfied_1.parquet"
HelloPMTfied2 = target_dir + "99999/99/PMTfied_2.parquet"
HelloPMTfied3 = target_dir + "99999/99/PMTfied_3.parquet"

HejPMTfied1 = target_dir + "99999/98/PMTfied_1.parquet"
HejPMTfied2 = target_dir + "99999/98/PMTfied_2.parquet"

HelloTruth = target_dir + "99999/truth_99.parquet"
HejTruth = target_dir + "99999/truth_98.parquet"

In [153]:
def convertDBtoDF(file:str, table:str, Nlines_model:int = None) -> pd.DataFrame:
    if Nlines_model is None:
        Nlines_model = get_table_row_count(file, table)
    print(f'Loading {Nlines_model} rows from {table} in {file}')
    con = sql.connect(file)
    query = f'SELECT * FROM {table} LIMIT {Nlines_model}'
    df = pd.read_sql_query(query, con)
    con.close()
    return df

In [154]:
df_hello = convertDBtoDF(HelloFromDBside, "SRTInIcePulses")
df_hej = convertDBtoDF(HejFromDBside, "SRTInIcePulses")# 15 sec

Loading 782144 rows from SRTInIcePulses in /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/sqlite_pulses/Snowstorm/99999/merged_part_99.db
Loading 732719 rows from SRTInIcePulses in /lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/sqlite_pulses/Snowstorm/99999/merged_part_98.db


In [155]:
df_hello

Unnamed: 0,charge,dom_time,width,dom_x,dom_y,dom_z,pmt_area,rde,is_bright_dom,is_bad_dom,is_saturated_dom,is_errata_dom,event_time,hlc,awtd,string,pmt_number,dom_number,dom_type,event_no
0,0.475,17873.0,1.0,-256.14,-521.08,-474.15,0.0444,1.00,-1.0,-1.0,0.0,0.0,59000.171844,1.0,1.0,1.0,0.0,58.0,20.0,366
1,1.475,17456.0,1.0,-256.14,-521.08,-491.17,0.0444,1.00,-1.0,-1.0,0.0,0.0,59000.171844,1.0,1.0,1.0,0.0,59.0,20.0,366
2,0.575,16567.0,8.0,-9.13,-481.74,-385.84,0.0444,1.00,-1.0,-1.0,0.0,0.0,59000.171844,0.0,0.0,3.0,0.0,53.0,20.0,366
3,1.275,19182.0,1.0,-9.13,-481.74,-419.89,0.0444,1.00,-1.0,-1.0,0.0,0.0,59000.171844,1.0,1.0,3.0,0.0,55.0,20.0,366
4,0.475,19014.0,1.0,-9.13,-481.74,-453.93,0.0444,1.00,-1.0,-1.0,0.0,0.0,59000.171844,1.0,1.0,3.0,0.0,57.0,20.0,366
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782139,0.925,12593.0,8.0,-10.97,6.72,-500.73,0.0444,1.35,-1.0,-1.0,0.0,0.0,59000.171844,1.0,0.0,86.0,0.0,60.0,20.0,442
782140,1.175,12662.0,8.0,-10.97,6.72,-500.73,0.0444,1.35,-1.0,-1.0,0.0,0.0,59000.171844,1.0,0.0,86.0,0.0,60.0,20.0,442
782141,0.625,13062.0,8.0,-10.97,6.72,-500.73,0.0444,1.35,-1.0,-1.0,0.0,0.0,59000.171844,1.0,0.0,86.0,0.0,60.0,20.0,442
782142,0.775,13768.0,8.0,-10.97,6.72,-500.73,0.0444,1.35,-1.0,-1.0,0.0,0.0,59000.171844,1.0,0.0,86.0,0.0,60.0,20.0,442


In [156]:
df_hej

Unnamed: 0,charge,dom_time,width,dom_x,dom_y,dom_z,pmt_area,rde,is_bright_dom,is_bad_dom,is_saturated_dom,is_errata_dom,event_time,hlc,awtd,string,pmt_number,dom_number,dom_type,event_no
0,0.925,12716.0,8.0,124.97,-131.25,476.63,0.0444,1.0,-1.0,-1.0,-1.0,-1.0,59000.171844,0.0,0.0,27.0,0.0,2.0,20.0,1419
1,1.175,12156.0,8.0,248.15,-111.87,412.93,0.0444,1.0,-1.0,-1.0,-1.0,-1.0,59000.171844,0.0,0.0,28.0,0.0,6.0,20.0,1419
2,0.975,10795.0,1.0,194.34,-30.92,470.05,0.0444,1.0,-1.0,-1.0,-1.0,-1.0,59000.171844,1.0,1.0,37.0,0.0,3.0,20.0,1419
3,0.375,11159.0,1.0,194.34,-30.92,453.03,0.0444,1.0,-1.0,-1.0,-1.0,-1.0,59000.171844,1.0,1.0,37.0,0.0,4.0,20.0,1419
4,0.725,11237.0,1.0,194.34,-30.92,453.03,0.0444,1.0,-1.0,-1.0,-1.0,-1.0,59000.171844,1.0,1.0,37.0,0.0,4.0,20.0,1419
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
732714,1.625,13639.0,8.0,429.76,351.02,216.79,0.0444,1.0,-1.0,-1.0,-1.0,-1.0,59000.171844,1.0,0.0,67.0,0.0,18.0,20.0,1492
732715,1.125,12392.0,1.0,429.76,351.02,199.77,0.0444,1.0,-1.0,-1.0,-1.0,-1.0,59000.171844,1.0,1.0,67.0,0.0,19.0,20.0,1492
732716,0.825,12129.0,1.0,429.76,351.02,165.73,0.0444,1.0,-1.0,-1.0,-1.0,-1.0,59000.171844,1.0,1.0,67.0,0.0,21.0,20.0,1492
732717,0.925,12689.0,1.0,338.44,463.72,301.48,0.0444,1.0,-1.0,-1.0,-1.0,-1.0,59000.171844,1.0,1.0,74.0,0.0,13.0,20.0,1492


In [169]:
df_truth = convertParquetToDF(HelloTruth)
df_1 = convertParquetToDF(HelloPMTfied1)
df_2 = convertParquetToDF(HelloPMTfied2)

In [170]:
df_truth

Unnamed: 0,event_no,subdirectory_no,db_file_no,shard_index,file_no,N_doms,offset,energy,azimuth,zenith,pid
0,366,99999,99,1,1,2283,0,7.404683e+07,3.322910,0.529491,14.0
1,367,99999,99,1,1,342,2283,1.818134e+06,3.094503,1.259288,14.0
2,368,99999,99,1,1,346,2625,1.055851e+07,1.736027,0.780361,-14.0
3,369,99999,99,1,1,120,2971,8.177518e+06,3.964585,0.818611,14.0
4,370,99999,99,1,1,15,3091,1.613618e+07,1.667725,0.773946,14.0
...,...,...,...,...,...,...,...,...,...,...,...
72,438,99999,99,8,8,23,1562,6.708934e+06,2.313756,1.401864,14.0
73,439,99999,99,8,8,106,1585,1.877443e+06,0.983877,1.740229,-14.0
74,440,99999,99,8,8,709,1691,1.890673e+06,0.515295,0.568060,14.0
75,441,99999,99,8,8,28,2400,2.904429e+06,5.990257,1.023090,-14.0


In [171]:
df_1[2280:2290]

Unnamed: 0,dom_x,dom_y,dom_z,dom_x_rel,dom_y_rel,dom_z_rel,pmt_area,rde,saturation_status,q1,...,Qtotal,hlc1,hlc2,hlc3,t1,t2,t3,T10,T50,sigmaT
2280,-10.97,6.72,-486.399994,-10.940788,6.702106,-485.284302,0.0444,1.35,0,7.175,...,6234.25,1,1,1,13542.0,13548.0,13550.0,54.0,153.0,3.399346
2281,-10.97,6.72,-493.410004,-10.940788,6.702106,-492.294312,0.0444,1.35,0,2.875,...,5971.75,1,1,1,13556.0,13561.0,13569.0,49.0,151.0,5.354126
2282,-10.97,6.72,-500.730011,-10.940788,6.702106,-499.614319,0.0444,1.35,0,4.475,...,5770.225098,1,1,1,13570.0,13577.0,13583.0,47.0,154.0,5.312459
2283,114.389999,-461.98999,39.099998,0.0,0.0,0.0,0.0444,1.0,0,0.725,...,0.725,0,-1,-1,15235.0,-1.0,-1.0,-1.0,-1.0,7182.319336
2284,-88.050003,-384.299988,74.5,0.0,0.0,0.0,0.0444,1.0,0,1.075,...,1.075,0,-1,-1,13207.0,-1.0,-1.0,-1.0,-1.0,6226.311035
2285,35.540001,-364.829987,242.399994,17.469542,-179.330688,205.3423,0.0444,1.0,0,1.925,...,1.925,0,-1,-1,14782.0,-1.0,-1.0,-1.0,-1.0,6968.772949
2286,35.540001,-364.829987,106.239998,17.469542,-179.330688,69.182297,0.0444,1.0,0,1.125,...,1.125,1,-1,-1,14284.0,-1.0,-1.0,-1.0,-1.0,6734.013672
2287,35.540001,-364.829987,72.199997,17.469542,-179.330688,35.142296,0.0444,1.0,0,0.775,...,0.775,0,-1,-1,12636.0,-1.0,-1.0,-1.0,-1.0,5957.13916
2288,35.540001,-364.829987,55.18,17.469542,-179.330688,18.122295,0.0444,1.0,0,1.225,...,2.3,1,1,-1,13746.0,17851.0,-1.0,0.0,0.0,7634.169922
2289,35.540001,-364.829987,21.129999,17.469542,-179.330688,-15.927704,0.0444,1.0,0,0.975,...,0.975,1,-1,-1,12647.0,-1.0,-1.0,-1.0,-1.0,5962.324219


In [172]:
df_1[2620:2630]

Unnamed: 0,dom_x,dom_y,dom_z,dom_x_rel,dom_y_rel,dom_z_rel,pmt_area,rde,saturation_status,q1,...,Qtotal,hlc1,hlc2,hlc3,t1,t2,t3,T10,T50,sigmaT
2620,-9.68,-79.5,150.960007,-6.077904,-49.916668,99.181648,0.0444,1.35,0,0.575,...,1.15,1,1,-1,13583.0,14098.0,-1.0,0.0,515.0,6528.332031
2621,-9.68,-79.5,140.949997,-6.077904,-49.916668,89.171646,0.0444,1.35,0,1.225,...,2.65,1,1,-1,12826.0,13510.0,-1.0,0.0,684.0,6214.203125
2622,-9.68,-79.5,120.93,-6.077904,-49.916668,69.151642,0.0444,1.35,0,0.825,...,0.825,1,-1,-1,12449.0,-1.0,-1.0,-1.0,-1.0,5868.986328
2623,-9.68,-79.5,110.910004,-6.077904,-49.916668,59.131645,0.0444,1.35,0,1.125,...,1.125,1,-1,-1,12926.0,-1.0,-1.0,-1.0,-1.0,6093.846191
2624,-9.68,-79.5,100.900002,-6.077904,-49.916668,49.121647,0.0444,1.35,0,1.075,...,1.075,1,-1,-1,12727.0,-1.0,-1.0,-1.0,-1.0,6000.036621
2625,-9.13,-481.73999,-487.970001,-1.061628,-56.016289,-50.209778,0.0444,1.0,-1,1.075,...,1.075,0,-1,-1,12922.0,-1.0,-1.0,-1.0,-1.0,6091.960449
2626,-9.13,-481.73999,-504.98999,-1.061628,-56.016289,-67.229782,0.0444,1.0,-1,0.825,...,0.825,0,-1,-1,14694.0,-1.0,-1.0,-1.0,-1.0,6927.289551
2627,114.389999,-461.98999,-216.210007,87.509537,-353.427124,-117.876808,0.0444,1.0,-1,0.275,...,0.275,0,-1,-1,12922.0,-1.0,-1.0,-1.0,-1.0,6091.960449
2628,114.389999,-461.98999,-284.290009,87.509537,-353.427124,-185.956802,0.0444,1.0,-1,0.875,...,1.5,1,1,-1,12841.0,13046.0,-1.0,0.0,0.0,6102.669922
2629,114.389999,-461.98999,-318.329987,87.509537,-353.427124,-219.996811,0.0444,1.0,-1,1.075,...,1.075,1,-1,-1,12544.0,-1.0,-1.0,-1.0,-1.0,5913.769531


In [173]:
df_1[2965:2975]

Unnamed: 0,dom_x,dom_y,dom_z,dom_x_rel,dom_y_rel,dom_z_rel,pmt_area,rde,saturation_status,q1,...,Qtotal,hlc1,hlc2,hlc3,t1,t2,t3,T10,T50,sigmaT
2965,57.200001,-105.519997,-483.790009,29.46667,-54.358788,-284.060699,0.0444,1.35,-1,1.225,...,1.65,1,1,-1,12707.0,13126.0,-1.0,0.0,0.0,6091.77002
2966,57.200001,-105.519997,-497.809998,29.46667,-54.358788,-298.080688,0.0444,1.35,-1,0.875,...,0.875,1,-1,-1,13887.0,-1.0,-1.0,-1.0,-1.0,6546.866211
2967,-9.68,-79.5,-394.700012,-4.506207,-37.008621,-165.412933,0.0444,1.35,-1,0.625,...,1.45,0,0,-1,13659.0,13672.0,-1.0,0.0,13.0,6442.452148
2968,-9.68,-79.5,-485.809998,-4.506207,-37.008621,-256.522919,0.0444,1.35,-1,0.875,...,0.875,0,-1,-1,13673.0,-1.0,-1.0,-1.0,-1.0,6445.985352
2969,-10.97,6.72,-283.149994,-1.776357e-15,0.0,42.055,0.0444,1.35,-1,0.825,...,0.825,0,-1,-1,14060.0,-1.0,-1.0,-1.0,-1.0,6628.418945
2970,-10.97,6.72,-367.26001,-1.776357e-15,0.0,-42.055,0.0444,1.35,-1,0.825,...,0.825,0,-1,-1,13567.0,-1.0,-1.0,-1.0,-1.0,6396.016602
2971,114.389999,-461.98999,158.25,0.0,0.0,0.0,0.0444,1.0,-1,1.925,...,1.925,0,-1,-1,13427.0,-1.0,-1.0,-1.0,-1.0,6330.02002
2972,237.779999,-442.420013,399.339996,213.6086,-397.446014,388.366058,0.0444,1.0,-1,0.675,...,0.675,0,-1,-1,10647.0,-1.0,-1.0,-1.0,-1.0,5019.515137
2973,237.779999,-442.420013,314.23999,213.6086,-397.446014,303.266052,0.0444,1.0,-1,0.875,...,0.875,0,-1,-1,10027.0,-1.0,-1.0,-1.0,-1.0,4727.244629
2974,237.779999,-442.420013,297.220001,213.6086,-397.446014,286.246063,0.0444,1.0,-1,0.225,...,2.175,1,1,1,11246.0,12160.0,13004.0,0.0,1758.0,717.890137


In [174]:
if convertParquetToDF(HelloPMTfied1).equals(convertParquetToDF(HelloPMTfied2)):
    print("They are equal")
else:
    print("fiat lux")

fiat lux


In [141]:
convertParquetToDF(HelloPMTfied3)

Unnamed: 0,dom_x,dom_y,dom_z,dom_x_rel,dom_y_rel,dom_z_rel,pmt_area,rde,saturation_status,q1,...,Qtotal,hlc1,hlc2,hlc3,t1,t2,t3,T10,T50,sigmaT
0,-256.14,-521.08,-355.01,-100.157305,-203.755636,-101.286726,0.0444,1.00,-1.0,0.975,...,1.300,0.0,0.0,-1.0,12137.0,12175.0,-1.0,0.0,0.0,19.000000
1,-256.14,-521.08,-491.17,-100.157305,-203.755636,-237.446726,0.0444,1.00,-1.0,1.075,...,1.075,0.0,-1.0,-1.0,11999.0,-1.0,-1.0,-1.0,-1.0,0.000000
2,-132.80,-501.45,-487.13,0.000000,0.000000,0.000000,0.0444,1.00,-1.0,0.225,...,0.225,0.0,-1.0,-1.0,12265.0,-1.0,-1.0,-1.0,-1.0,0.000000
3,-334.80,-424.50,-452.57,-59.882923,-75.926824,-60.329750,0.0444,1.00,-1.0,1.025,...,1.025,0.0,-1.0,-1.0,9764.0,-1.0,-1.0,-1.0,-1.0,0.000000
4,-334.80,-424.50,-486.61,-59.882923,-75.926824,-94.369750,0.0444,1.00,-1.0,0.775,...,0.775,0.0,-1.0,-1.0,10884.0,-1.0,-1.0,-1.0,-1.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5354,-10.97,6.72,-472.39,-10.551550,6.463666,-455.637309,0.0444,1.35,0.0,0.375,...,9.025,1.0,1.0,1.0,13148.0,13575.0,14244.0,427.0,2102.0,909.582349
5355,-10.97,6.72,-479.39,-10.551550,6.463666,-462.637309,0.0444,1.35,0.0,1.025,...,26.325,1.0,1.0,1.0,13125.0,13494.0,13684.0,559.0,1734.0,866.757960
5356,-10.97,6.72,-486.40,-10.551550,6.463666,-469.647309,0.0444,1.35,0.0,0.925,...,4.525,1.0,1.0,1.0,13764.0,13811.0,13905.0,0.0,183.0,982.365512
5357,-10.97,6.72,-493.41,-10.551550,6.463666,-476.657309,0.0444,1.35,0.0,1.275,...,13.600,1.0,1.0,1.0,13315.0,13475.0,13565.0,160.0,880.0,1993.882331


In [72]:
convertParquetToDF(HejTruth)

Unnamed: 0,event_no,subdirectory_no,db_file_no,shard_index,offset,file_no,N_doms,energy,azimuth,zenith,pid
0,1419,99999,98,1,0,1,213,1.228535e+06,4.320621,0.735934,14.0
1,1420,99999,98,1,213,1,317,7.616068e+06,1.721122,1.112967,-14.0
2,1421,99999,98,1,530,1,189,4.883525e+07,4.324323,1.747017,-14.0
3,1422,99999,98,1,719,1,40,7.368295e+07,5.271961,2.011045,-14.0
4,1423,99999,98,1,759,1,309,2.270513e+07,5.554580,1.739628,-14.0
...,...,...,...,...,...,...,...,...,...,...,...
69,1488,99999,98,7,3774,7,243,7.586169e+06,3.289613,0.991989,14.0
70,1489,99999,98,8,0,8,24,2.344431e+07,2.390422,0.236707,-14.0
71,1490,99999,98,8,24,8,393,1.908850e+07,5.173692,0.628225,-14.0
72,1491,99999,98,8,417,8,961,4.262259e+07,4.707403,1.459073,-14.0


In [4]:
convertParquetToDF("/lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/22011/6/PMTfied_1.parquet")

Unnamed: 0,dom_x,dom_y,dom_z,dom_x_rel,dom_y_rel,dom_z_rel,pmt_area,rde,saturation_status,q1,...,Qtotal,hlc1,hlc2,hlc3,t1,t2,t3,T10,T50,sigmaT
0,210.47,-209.77,-473.24,128.170833,-127.744551,-284.972500,0.0444,1.0,-1.0,0.975,...,1.950,1.0,1.0,-1.0,10439.0,11555.0,-1.0,0.0,1116.0,558.000000
1,210.47,-209.77,-490.26,128.170833,-127.744551,-301.992500,0.0444,1.0,-1.0,0.725,...,0.725,1.0,-1.0,-1.0,10773.0,-1.0,-1.0,-1.0,-1.0,0.000000
2,210.47,-209.77,-507.28,128.170833,-127.744551,-319.012500,0.0444,1.0,-1.0,0.375,...,0.375,1.0,-1.0,-1.0,10421.0,-1.0,-1.0,-1.0,-1.0,0.000000
3,-121.77,-171.03,-467.52,0.000000,0.000000,0.000000,0.0444,1.0,-1.0,1.175,...,1.175,0.0,-1.0,-1.0,10991.0,-1.0,-1.0,-1.0,-1.0,0.000000
4,1.71,-150.63,-438.89,0.347797,-30.636611,-72.245765,0.0444,1.0,-1.0,0.875,...,0.875,1.0,-1.0,-1.0,10780.0,-1.0,-1.0,-1.0,-1.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219446,-279.53,23.17,502.61,-198.783414,16.476985,360.927406,0.0444,1.0,-1.0,0.825,...,3.400,1.0,1.0,1.0,9873.0,9910.0,9986.0,0.0,113.0,67.743542
219447,-279.53,23.17,485.59,-198.783414,16.476985,343.907406,0.0444,1.0,-1.0,0.875,...,2.050,1.0,1.0,-1.0,10263.0,10557.0,-1.0,0.0,294.0,147.000000
219448,-279.53,23.17,468.57,-198.783414,16.476985,326.887406,0.0444,1.0,-1.0,1.425,...,1.425,1.0,-1.0,-1.0,10301.0,-1.0,-1.0,-1.0,-1.0,0.000000
219449,-481.60,101.39,450.99,-253.816219,53.435271,240.060588,0.0444,1.0,-1.0,0.875,...,1.850,0.0,0.0,-1.0,11422.0,11435.0,-1.0,0.0,13.0,6.500000
