# IceCube - EDA

## Load Data

In [1]:
# Step up one directory so that relative paths used below ("./config/...",
# the `src` package) resolve; the recorded output shows the resulting cwd.
# NOTE(review): not idempotent — re-running this cell climbs one more level.
%cd ..

/home/jovyan/ice-cube


In [2]:
import sys

# Put the current working directory on the import path so the project-local
# `src` package can be imported. Guarded so that re-running this cell does not
# keep appending duplicate "." entries to sys.path.
if "." not in sys.path:
    sys.path.append(".")

In [3]:
import logging

# Configure the root logger for this notebook: INFO and above, each record
# rendered as "timestamp [LEVEL] [module] message".
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] [%(module)s] %(message)s",
    level=logging.INFO,
    # filename=__file__.replace('.py', '.log'),
)

# Logger used for messages emitted by the notebook itself.
log = logging.getLogger(__name__)

In [4]:
import glob
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import scipy.stats as stats
import seaborn as sns
from omegaconf import OmegaConf
from src.load_data import LoadData
from src.utils import choice_seed, df_stats, fix_seed

# from src.get_score import get_score

# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_colwidth", None)

In [5]:
# Competition specific packages
import os
import sqlite3
from typing import Any, Dict, List, Optional

import pyarrow.parquet as pq
import sqlalchemy
from tqdm import tqdm

import graphnet
from graphnet.data.sqlite.sqlite_utilities import create_table

[1;34mgraphnet[0m: [32mINFO    [0m 2023-02-22 00:47:29 - get_logger - Writing log to [1mlogs/graphnet_20230222-004729.log[0m


2023-02-22 00:47:29,286 [INFO] [logging] Writing log to [1mlogs/graphnet_20230222-004729.log[0m




  warn(f"Failed to load image Python extension: {e}")


In [6]:
# Read the two YAML config files (main first, then data) and combine them
# into the single config object `c` used by the rest of the notebook.
c_main, c_data = (
    OmegaConf.load(config_path)
    for config_path in ("./config/main.yaml", "./config/data.yaml")
)
c = OmegaConf.merge(c_main, c_data)

# Seed selection and seeding are delegated to the project helpers in
# src.utils; the value applied appears in the log line below the cell.
fix_seed(choice_seed(c))

2023-02-22 00:47:30,506 [INFO] [utils] Fix seed: 442


In [7]:
# Load the competition tables through the project's LoadData helper; the log
# below lists the files read (train/test meta, sample submission, geometry).
# Presumably use_fold/do_preprocess=False skip fold assignment and
# preprocessing — confirm in src.load_data.
# NOTE(review): the name `input` shadows Python's built-in input() for the
# rest of the notebook; it is kept because later cells reference it.
input = LoadData(c, use_fold=False, do_preprocess=False)

2023-02-22 00:47:30,511 [INFO] [load_data] Load original file. path: ../input/icecube-neutrinos-in-deep-ice/train_meta.parquet
2023-02-22 00:47:32,551 [INFO] [load_data] Load original file. path: ../input/icecube-neutrinos-in-deep-ice/test_meta.parquet
2023-02-22 00:47:32,552 [INFO] [load_data] Load original file. path: ../input/icecube-neutrinos-in-deep-ice/sample_submission.parquet
2023-02-22 00:47:32,553 [INFO] [load_data] Load original file. path: ../input/icecube-neutrinos-in-deep-ice/sensor_geometry.csv


In [8]:
# List the attributes exposed by the loaded data object, hiding dunder names.
[attr for attr in dir(input) if not attr.startswith("__")]

['c', 'sample_submission', 'sensor_geometry', 'test_meta', 'train_meta']

## Check Data

In [9]:
# (rows, columns) of the sample submission table.
input.sample_submission.shape

(3, 3)

In [10]:
# Show the sample submission to see the expected output layout.
input.sample_submission

event_id,azimuth,zenith
i64,i64,i64
2092,1,1
7344,1,1
9482,1,1


In [11]:
# (rows, columns) of the sensor geometry table.
input.sensor_geometry.shape

(5160, 4)

In [12]:
# Preview the sensor geometry: each row pairs a sensor_id with x, y, z values.
input.sensor_geometry

sensor_id,x,y,z
i64,f64,f64,f64
0,-256.14,-521.08,496.03
1,-256.14,-521.08,479.01
2,-256.14,-521.08,461.99
3,-256.14,-521.08,444.97
4,-256.14,-521.08,427.95
5,-256.14,-521.08,410.93
6,-256.14,-521.08,393.91
7,-256.14,-521.08,376.88
8,-256.14,-521.08,359.86
9,-256.14,-521.08,342.84


In [13]:
# (rows, columns) of the test metadata table.
input.test_meta.shape

(3, 4)

In [14]:
# Preview the test metadata (batch/event ids plus first/last pulse indices).
input.test_meta

batch_id,event_id,first_pulse_index,last_pulse_index
i64,i64,i64,i64
661,2092,0,298
661,7344,299,334
661,9482,335,377


In [15]:
# Size check before displaying: train_meta is by far the largest table here.
input.train_meta.shape

(131953924, 6)

In [16]:
# Preview the training metadata; on top of the test_meta columns it carries
# the azimuth and zenith columns that the submission format asks for.
input.train_meta

batch_id,event_id,first_pulse_index,last_pulse_index,azimuth,zenith
i64,i64,i64,i64,f64,f64
1,24,0,60,5.029555,2.087498
1,41,61,111,0.417742,1.549686
1,59,112,147,1.160466,2.401942
1,67,148,289,5.845952,0.759054
1,72,290,351,0.653719,0.939117
1,77,352,401,0.011372,1.295371
1,79,402,717,3.533397,2.479947
1,82,718,762,5.252108,0.672366
1,121,763,803,3.084929,2.06588
1,127,804,846,6.154335,1.371721


## Read single batch

In [17]:
# Read the parquet file of a single training batch from the configured
# training-input directory into a polars DataFrame.
batch_num = 1
batch_path = os.path.join(c.data.dir.input_train, f"batch_{batch_num}.parquet")
batch_1 = pl.read_parquet(batch_path)

In [18]:
# (rows, columns) of the batch read above.
batch_1.shape

(32792416, 5)

In [19]:
# Preview the batch rows (sensor_id, time, charge, auxiliary, event_id).
batch_1

sensor_id,time,charge,auxiliary,event_id
i16,i64,f64,bool,i64
3918,5928,1.325,true,24
4157,6115,1.175,true,24
3520,6492,0.925,true,24
5041,6665,0.225,true,24
2948,8054,1.575,true,24
860,8124,0.675,true,24
2440,8284,1.625,true,24
1743,8478,0.775,true,24
3609,8572,1.025,true,24
5057,8680,3.975,true,24


In [20]:
# Keep one example row per distinct value of the boolean "auxiliary" column.
# NOTE(review): which row is kept per value depends on the polars defaults for
# `keep`/`maintain_order` — confirm for the installed version.
batch_1.unique(subset=["auxiliary"])

sensor_id,time,charge,auxiliary,event_id
i16,i64,f64,bool,i64
3918,5928,1.325,True,24
5059,9868,1.375,False,24


In [21]:
# Count how many rows carry each "auxiliary" value.
batch_1["auxiliary"].value_counts()

auxiliary,counts
bool,u32
False,23551893
True,9240523
