<a href="https://colab.research.google.com/github/JakobSchauser/BachelorProject-IceCube-ML/blob/main/Interpreting_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install spektral

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf


import sqlite3
import os
from pandas import read_sql

from sklearn.neighbors import kneighbors_graph as knn
from sklearn.preprocessing import RobustScaler

from spektral.data import Dataset, Graph

from tqdm.notebook import tqdm


#### Use the GPU when TensorFlow can see one.
# Memory growth is switched on for the first listed GPU only.
physical_devices = tf.config.list_physical_devices("GPU")
if physical_devices:
    print("Running on GPU")
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [None]:
class ParticleDataset(Dataset):
    """Per-event k-nearest-neighbour graphs built from an SQLite database.

    Every graph has one node per row of the ``sequential`` table that belongs
    to the event. Node features are the ``dom_x``, ``dom_y``, ``dom_z``,
    ``dom_time`` and ``dom_charge`` columns; the label is the event's
    ``true_primary_energy`` taken from the ``scalar`` table.

    NOTE(review): ``Dataset.__init__`` is never called, so any state the
    spektral base class would normally initialise is missing -- confirm that
    this is intended before using base-class functionality.
    """

    def __init__(self, data=()):
        """Store ``data`` (a sequence of graphs, empty by default) as an array."""
        self.data = np.array(data)

    def read(self):
        """Return the stored array of graphs."""
        return self.data

    def generate_data(
        self,
        path="/content/drive/MyDrive/Bachelor Project - IceCube ML/139008_00.db",
        limit=1e3,
        n_neighbors=5,
    ):
        """Build one graph per event and store the result in ``self.data``.

        Args:
            path: SQLite file containing the ``sequential`` and ``scalar``
                tables. The argument is honoured as given; it is no longer
                replaced by a hard-coded location inside the method.
            limit: Only rows with ``event_no < limit`` are loaded.
            n_neighbors: Neighbours per node in the k-NN graph. For events with
                fewer than ``n_neighbors + 1`` hits the value is reduced to
                ``n_nodes - 1`` so that the neighbour search cannot fail.

        Events with fewer than two hits are skipped because no neighbour graph
        can be built for them. Errors raised by sqlite3 or pandas while opening
        or querying the database propagate to the caller.
        """
        features = ["dom_x", "dom_y", "dom_z", "dom_time", "dom_charge"]
        pos = ["dom_x", "dom_y", "dom_z"]
        # target_features = ['true_primary_direction_x','true_primary_direction_y', 'true_primary_direction_z']
        target_features = ["true_primary_energy"]

        # ``with sqlite3.connect(...)`` only wraps a transaction and leaves the
        # connection open, so the connection is closed explicitly.
        con = sqlite3.connect(path)
        try:
            params = (float(limit),)
            sequential = pd.read_sql(
                "select * from sequential where event_no < ?", con, params=params
            )
            scalar = pd.read_sql(
                "select * from scalar where event_no < ?", con, params=params
            )
        finally:
            con.close()

        # Make sure the same events are used:
        events = set(sequential.event_no) & set(scalar.event_no)

        # Sorting both tables by event_no makes every event's hits contiguous
        # and lines row i of ``scalar`` up with the i-th event of
        # ``sequential``. The stable sort keeps the hit order within an event.
        sequential = sequential[sequential["event_no"].isin(events)].sort_values(
            "event_no", kind="mergesort"
        )
        # One label row per event is required for the positional pairing below.
        scalar = (
            scalar[scalar["event_no"].isin(events)]
            .drop_duplicates("event_no")
            .sort_values("event_no")
        )

        node_features = sequential[features].to_numpy()
        positions = sequential[pos].to_numpy()
        labels = scalar[target_features].to_numpy()

        # Row index at which each event starts, plus the matching end indices.
        event_ids = sequential["event_no"].to_numpy()
        _, starts = np.unique(event_ids, return_index=True)
        stops = np.append(starts[1:], len(event_ids))

        graphs = []
        for start, stop, target in tqdm(zip(starts, stops, labels), total=len(starts)):
            n_nodes = int(stop - start)
            if n_nodes < 2:
                continue
            # A node cannot have more neighbours than there are other nodes.
            k = min(int(n_neighbors), n_nodes - 1)
            adjacency = knn(positions[start:stop], k)
            graphs.append(
                Graph(x=node_features[start:stop], a=adjacency.T, y=target)
            )

        self.data = np.array(graphs, dtype=object)

# Start from an empty dataset and fill it with every event whose event_no
# is below six million.
dataset = ParticleDataset()
dataset.generate_data(limit=6e6)

OperationalError: ignored

In [None]:
# Show the array of graphs currently stored on the dataset.
dataset.data

array([Graph(n_nodes=45, n_node_features=5, n_edge_features=None, n_labels=3),
       Graph(n_nodes=56, n_node_features=5, n_edge_features=None, n_labels=3),
       Graph(n_nodes=323, n_node_features=5, n_edge_features=None, n_labels=3),
       ...,
       Graph(n_nodes=46, n_node_features=5, n_edge_features=None, n_labels=3),
       Graph(n_nodes=66, n_node_features=5, n_edge_features=None, n_labels=3),
       Graph(n_nodes=66, n_node_features=5, n_edge_features=None, n_labels=3)],
      dtype=object)

In [None]:
# Save the generated graphs to Drive in chunks of at most ``lim_per_file``.
lim_per_file = int(5*1e6)
for i in range(1):
    # 1-based chunk number, so each chunk gets its own file name and the first
    # chunk keeps the "1" suffix that was previously hard-coded.
    chunk_no = i + 1
    p = ("/content/drive/MyDrive/Bachelor Project - IceCube ML/generatedDataDirection"
         + str(lim_per_file) + " " + str(chunk_no))

    # np.savez appends ".npz" to the name and overwrites silently, so an
    # existing file has to be detected explicitly.
    if os.path.exists(p + ".npz"):
        print("File already exists")
        continue

    try:
        np.savez(p, dataset.data[i*lim_per_file:(i+1)*lim_per_file])
    except OSError as err:
        # Saving stays best-effort, but the real cause is reported.
        print("Could not save {}: {}".format(p, err))

In [None]:
# read() returns the stored array of graphs (the same object as dataset.data).
dataset.read()

array([Graph(n_nodes=45, n_node_features=5, n_edge_features=None, n_labels=3),
       Graph(n_nodes=56, n_node_features=5, n_edge_features=None, n_labels=3),
       Graph(n_nodes=323, n_node_features=5, n_edge_features=None, n_labels=3),
       ...,
       Graph(n_nodes=46, n_node_features=5, n_edge_features=None, n_labels=3),
       Graph(n_nodes=66, n_node_features=5, n_edge_features=None, n_labels=3),
       Graph(n_nodes=66, n_node_features=5, n_edge_features=None, n_labels=3)],
      dtype=object)