In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line in the order os.walk yields them.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# imports
import random
import shutil
import math

from sklearn.cluster import KMeans

import matplotlib.pyplot as plt

import tensorflow as tf 
from tensorflow import keras

In [None]:
# Load training batch 1 and preview its first 20 rows.
data = '/kaggle/input/icecube-neutrinos-in-deep-ice/train/batch_1.parquet'
df = pd.read_parquet(data)
df.head(20)

In [None]:
# Summary statistics of the batch's columns.
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the batch.
df.info()

In [None]:
# Column names of the batch.
df.columns

In [None]:
# Inspect the index; the preprocessing further down counts pulses per event via this index.
df.index

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Distinct sensor ids that appear in this batch.
df['sensor_id'].unique()

In [None]:
# Load the sensor geometry table and preview its first 20 rows.
sensor_data = '/kaggle/input/icecube-neutrinos-in-deep-ice/sensor_geometry.csv'
df_sensor = pd.read_csv(sensor_data)
df_sensor.head(20)

In [None]:
# Distinct sensor ids listed in the geometry table.
df_sensor['sensor_id'].unique()

# Preprocessing

One thing that really piqued my interest when I first read the paper detailing the 1st place solution was the minimal amount of preprocessing used by the winning team. Their results come entirely from the efficiency and sophistication of their architecture, and I found that fascinating. 

In [None]:
# os.mkdir('/kaggle/working/train_raw')

In [None]:
# Scale the geometry: work on a copy so df_sensor itself stays untouched,
# and divide the x, y, z coordinates by 500.
coord_cols = ['x', 'y', 'z']
df_sensor_working = df_sensor.copy()
df_sensor_working[coord_cols] = df_sensor_working[coord_cols] / 500
df_sensor_working

In [None]:
# os.mkdir('/kaggle/working/train_subset/')

In [None]:
# Define paths
dest = '/kaggle/working/train_subset/'
curr = '/kaggle/input/icecube-neutrinos-in-deep-ice/train/'

# Tunables for the subset that gets preprocessed.
N_BATCHES_TO_SAMPLE = 50      # number of distinct batches written to `dest`
N_TRAIN_BATCHES = 660         # batch ids are drawn from 1..660 (inclusive)
MAX_PULSES_PER_EVENT = 200    # events with more pulses than this are dropped
REFERENCE_BATCH = 419         # later cells load batch_419.parquet from `dest`
SAMPLING_SEED = 0             # fixed seed so Restart & Run All picks the same batches

# The mkdir cell for this directory is commented out, so make sure it exists;
# exist_ok keeps the cell re-runnable.
os.makedirs(dest, exist_ok=True)

# Draw batch ids WITHOUT replacement from a dedicated, seeded generator.
# (randrange in a loop could repeat ids, silently producing fewer files, and
# gave a different subset on every run.) The reference batch is always
# included because downstream cells read it by name.
other_batch_ids = [b for b in range(1, N_TRAIN_BATCHES + 1) if b != REFERENCE_BATCH]
batch_ids = [REFERENCE_BATCH] + random.Random(SAMPLING_SEED).sample(
    other_batch_ids, N_BATCHES_TO_SAMPLE - 1
)

# Loop-invariant: scaled coordinates keyed by sensor_id. The left merge below
# only picks up sensors present in the batch, so no per-batch filtering is needed.
coords = df_sensor_working[['sensor_id', 'x', 'y', 'z']]

for o in batch_ids:
    processing = f"{curr}batch_{o}.parquet"

    # load data (kept in its own name so the `df` shown by earlier cells is not clobbered)
    batch_df = pd.read_parquet(processing)

    # filter events with max MAX_PULSES_PER_EVENT pulses; rows sharing an index
    # value are counted as one event's pulses
    pulse_counts = batch_df.index.value_counts()
    valid_events = pulse_counts[pulse_counts <= MAX_PULSES_PER_EVENT].index
    df_working = batch_df[batch_df.index.isin(valid_events)].copy()  # copy: columns are overwritten below

    # scale time and charge; non-positive charges map to 0 instead of log10's -inf/NaN
    df_working['time'] = (df_working['time'] - 1e4) / 115
    df_working['charge'] = np.where(df_working['charge'] > 0, np.log10(df_working['charge']) / 3, 0)

    # reset index, wont need that
    # NOTE(review): this discards the per-event index, so pulses in the saved
    # file can no longer be grouped by event -- confirm that is intended.
    df_working.reset_index(drop=True, inplace=True)

    # attach the coordinates of each pulse's sensor, drop the id, and save
    processed_df = pd.merge(df_working, coords, on='sensor_id', how='left')
    processed_df.drop('sensor_id', axis=1, inplace=True)
    processed_df.to_parquet(f"{dest}batch_{o}.parquet")

In [None]:
# Sanity check: reload one processed batch and preview its first 5 rows.
# Assumes the preprocessing cell above wrote batch_419.parquet to train_subset.
test = pd.read_parquet('/kaggle/working/train_subset/batch_419.parquet')
test.head(5)

Excellent. Elementary features have been extracted and preprocessed for 50 batches. Next, we must obtain the node homophily of (x, y, z, t). We will do this for each batch, using a kNN graph.

In [None]:
# Choose an appropriate number of clusters with the elbow method, i.e. the point
# at which the within-cluster sum of squares (WCSS) stops decreasing rapidly.

# man this took years
df = pd.read_parquet('/kaggle/working/train_subset/batch_419.parquet')
feature_cols = ['x', 'y', 'z', 'time']
features = df[feature_cols]

# Fit one KMeans model per candidate cluster count (1..10) and record its inertia.
wcss = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(features)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow plot: WCSS recorded by the sweep above against the cluster counts 1..10.
cluster_counts = range(1, 11)
plt.plot(cluster_counts, wcss)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('WCSS vs Number of Clusters')
plt.grid()
# Let's go with 7

In [None]:
# def calc_homophily_ratio(nodes:pd.DataFrame, group_column:str) -> float:
#     # create edges 
#     edges = pd.merge(nodes, nodes, on=group_column)

#     # calculate homophily 
#     total_edges = len(edges)
#     same_group_edges = len(edges[edges['group_x'] == edges['group_y']])
#     homophily_ratio = same_group_edges / total_edges

#     return homophily_ratio

In [None]:
# def process_file(file):
#     df = pd.read_parquet(f"{dest}{file}")
#     features = df[['x', 'y', 'z', 'time']]
    
#     kmeans = KMeans(n_clusters=7)
#     df['group'] = kmeans.fit_predict(features)
    
#     ratio = calc_homophily_ratio(df, 'group')
#     return file, ratio

# global_stats = {}
# for file in os.listdir(dest):
#     file, ratio = process_file(file)
#     global_stats[file] = ratio

I will skip the global statistics extraction, as I am still trying to understand exactly what they mean. I will jump to building some of the components of the model. We start with the feed-forward network (Feed Forward -> Add & Norm).

In [5]:
# Feed Forward followed by Add & Norm

class FeedForward(tf.keras.layers.Layer):
    """Feed-forward sub-layer followed by a residual add and layer normalisation.

    Args:
        d_model: Units of the output Dense layer. The result is added back to
            the input, so it has to match the input's last dimension.
        dff: Units of the hidden ReLU Dense layer.
        dropout_rate: Rate of the Dropout applied after the output Dense layer.
    """

    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        # Dense(relu) -> Dense -> Dropout, wrapped so it can be called as one unit.
        hidden = tf.keras.layers.Dense(dff, activation='relu')
        projection = tf.keras.layers.Dense(d_model)
        dropout = tf.keras.layers.Dropout(dropout_rate)
        self.seq = tf.keras.Sequential([hidden, projection, dropout])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        """Return layer_norm(x + seq(x))."""
        residual_sum = self.add([x, self.seq(x)])
        return self.layer_norm(residual_sum)

In [None]:
# Multi-Head Attention followed by Add & Norm

class SelfAttention(tf.keras.layers.Layer):
    """Multi-head self-attention followed by a residual add and layer normalisation.

    Derives directly from ``tf.keras.layers.Layer``: no ``BaseAttention`` class
    is defined before this cell, so naming it as the base raised ``NameError``,
    and this class creates every sub-layer it needs itself.

    Args:
        **kwargs: Forwarded unchanged to ``tf.keras.layers.MultiHeadAttention``
            (e.g. ``num_heads``, ``key_dim``).
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

    def call(self, x):
        """Attend from ``x`` to itself and return layernorm(x + attention(x))."""
        # Query, key and value are all the same tensor: plain self-attention.
        attn_output = self.mha(
            query=x,
            value=x,
            key=x)
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x