In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Concepts

This notebook attempts to build a model that predicts (estimates) the direction of a neutrino from the set of coordinates measured by several sensors during one event.

A neural network was defined with this regression problem in mind:
 - 5 linear layers
 - ReLU as the activation function
 - L1Loss as the training loss (its mean is also reported as the model score)
 - RMSprop as the optimizer
 
A LightGBM model was also attempted, but it performed poorly, especially in computation time. Its score on the test set was 1.558, and since the model could not use more trees (computation time grows directly with the number of batches trained), this solution was abandoned.

# Data

In [1]:
def get_train_df_from_a_batch (train_batch_df, sensors_df, train_meta_df, batch_number):
    """
    Build the per-pulse training table for a single batch.

    The pulses in 'train_batch_df' are joined with the sensor geometry
    (on 'sensor_id') and with the metadata rows of 'batch_number'
    (on 'event_id'), so every remaining pulse row carries the features and
    the targets of its event.

    Only pulses with 'auxiliary' == False are kept. The challenge description
    says about auxiliary pulses: 'If True, the pulse was not fully digitized,
    is of lower quality, and was more likely to originate from noise.'
    (The original author reports that this filter removes about 27% of rows.)

    All inputs are polars dataframes; the result is printed and returned.
    """
    # Keep only the fully digitized pulses.
    reliable_pulses = train_batch_df.filter(pl.col("auxiliary") == False)
    # Cast the join key to Int16 before joining the geometry on 'sensor_id'.
    sensors_i16 = sensors_df.with_columns(pl.col('sensor_id').cast(pl.Int16, strict=False))
    # Metadata rows (including the targets) of this batch only.
    batch_meta = train_meta_df.filter(pl.col("batch_id") == batch_number)

    train_df = (
        reliable_pulses
        .join(sensors_i16, how='left', on='sensor_id')
        .join(batch_meta, how='left', on='event_id')
        # Constant after the filters above, hence useless as features.
        .drop(columns=['batch_id', 'auxiliary'])
        .drop(columns=['first_pulse_index'])
    )

    print ('Train dataframe:\n')
    print (train_df)
    return train_df

# 3D Plotting

In [2]:
def plot_3D (trn_df, event_num):
    """
    Plot, for one event, the sensor positions (x, y, z) together with the
    direction derived from the event's azimuth and zenith.

    Parameters
    ----------
    trn_df : polars LazyFrame (or DataFrame)
        Table with the columns 'event_id', 'x', 'y', 'z', 'azimuth' and 'zenith'.
    event_num : int
        Value of 'event_id' selecting the rows to plot.

    The figure is shown with plt.show(); nothing is returned.
    """
    # Select the event and materialise it ONCE. The previous version called
    # .collect() six times, re-running the same lazy query for every column.
    event_df = trn_df.filter(pl.col("event_id") == event_num)
    if isinstance(event_df, pl.LazyFrame):
        event_df = event_df.collect()

    # Sensor coordinates and angles of every row of the event
    xs = event_df['x']
    ys = event_df['y']
    zs = event_df['z']
    azim = event_df['azimuth']
    zen = event_df['zenith']

    # Convert the spherical angles into Cartesian unit-vector components
    xp = np.sin(zen) * np.cos(azim)
    yp = np.sin(zen) * np.sin(azim)
    zp = np.cos(zen)

    # Set figure
    fig = plt.figure(figsize = (12,20))
    ax = fig.add_subplot(111, projection='3d')

    # Direction points in green, sensor positions in blue (both scatter plots)
    ax.scatter(xp, yp, zp, color='g')
    ax.scatter(xs, ys, zs, color='b')
    ax.view_init(-160, 30)

    # Add labels for the x, y, and z axes
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')

    plt.title(f"Event {event_num}")

    # Show the plot
    plt.show()

# Score Function

In [3]:
def angular_dist_score(az_true, zen_true, az_pred, zen_pred, batch_size=1):
    '''
    calculate the MAE of the angular distance between two directions.
    The two vectors are first converted to cartesian unit vectors,
    and then their scalar product is computed, which is equal to
    the cosine of the angle between the two vectors. The inverse 
    cosine (arccos) thereof is then the angle between the two input vectors
    
    Parameters:
    -----------
    
    az_true : float (or array thereof)
        true azimuth value(s) in radian
    zen_true : float (or array thereof)
        true zenith value(s) in radian
    az_pred : float (or array thereof)
        predicted azimuth value(s) in radian
    zen_pred : float (or array thereof)
        predicted zenith value(s) in radian
    batch_size : int
        number of samples processed per chunk; it only bounds the size of
        the temporary arrays and does not change the result. Larger values
        mean fewer Python-level iterations.
    
    Returns:
    --------
    
    dist : float
        mean over the angular distance(s) in radian

    Raises:
    -------

    ValueError
        if any value is not finite, if the inputs are empty or differ in
        size, or if batch_size is smaller than 1
    '''

    # Accept scalars, sequences and arrays alike, as the docstring promises.
    az_true, zen_true, az_pred, zen_pred = (
        np.asarray(values).ravel()
        for values in (az_true, zen_true, az_pred, zen_pred)
    )

    if not (np.all(np.isfinite(az_true)) and
            np.all(np.isfinite(zen_true)) and
            np.all(np.isfinite(az_pred)) and
            np.all(np.isfinite(zen_pred))):
        raise ValueError("All arguments must be finite")

    n = len(az_true)
    if not (len(zen_true) == len(az_pred) == len(zen_pred) == n):
        raise ValueError("All arguments must have the same number of elements")
    if n == 0:
        raise ValueError("At least one direction is required")
    batch_size = int(batch_size)
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    angle_sum = 0.0
    for start in range(0, n, batch_size):
        chunk = slice(start, start + batch_size)
        # The trigonometric terms are kept in float32, as before.
        sa1 = np.sin(az_true[chunk]).astype(np.float32)
        ca1 = np.cos(az_true[chunk]).astype(np.float32)
        sz1 = np.sin(zen_true[chunk]).astype(np.float32)
        cz1 = np.cos(zen_true[chunk]).astype(np.float32)
        sa2 = np.sin(az_pred[chunk]).astype(np.float32)
        ca2 = np.cos(az_pred[chunk]).astype(np.float32)
        sz2 = np.sin(zen_pred[chunk]).astype(np.float32)
        cz2 = np.cos(zen_pred[chunk]).astype(np.float32)
        # Dot product of the two unit vectors; plain numpy replaces the
        # former numexpr call.
        scalar_prod = sz1*sz2*(ca1*ca2 + sa1*sa2) + cz1*cz2
        # Rounding can push the product slightly outside arccos' domain.
        scalar_prod = np.clip(scalar_prod, -1, 1)
        # Accumulate in float64 so long inputs do not lose precision.
        angle_sum += float(np.sum(np.arccos(scalar_prod), dtype=np.float64))

    # Mean over the n samples. The previous version divided by n * batch_size,
    # which was only correct for batch_size == 1.
    return angle_sum / n

# Correlations

In [4]:
# Utility functions from Tutorial
def make_mi_scores(X, y):
    """
    Compute the mutual information between every feature in X and the target y.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is not modified; the encoding below works on a copy.
    y : array-like
        Regression target aligned with the rows of X.

    Returns
    -------
    pandas.Series
        Series named "MI Scores", indexed by column name and sorted in
        descending order.
    """
    from sklearn.feature_selection import mutual_info_regression

    # Work on a copy so the caller's frame keeps its original dtypes.
    X = X.copy()
    # Label-encode object/category columns. The previous check
    # (`colname in X.dtypes`) looked for COLUMNS named "object"/"category"
    # and therefore never encoded anything.
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

#X = train_analysis.to_pandas()
#y_az = X['azimuth']
#X = X.drop (columns = ['azimuth'])
#mi_scores = make_mi_scores(X, y_az)
#del train_analysis #memory
#mi_scores

# Train and score a regression neural network

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

class CustomDataset(Dataset):
    """
    Torch dataset over a dataframe holding the features and the two targets.

    Every column except 'azimuth' and 'zenith' is used as a feature; the
    features are standardised once, in the constructor.

    Parameters
    ----------
    df : dataframe
        Must provide `drop(columns=...)`, column selection by a list of names
        and `to_numpy()`, and contain the columns 'azimuth' and 'zenith'.
    scaler : fitted scaler, optional
        Object with a `transform` method. When given, it is applied as is
        (nothing is fitted), so a hold-out set can reuse the scaling learnt on
        the training set, e.g. `CustomDataset(test_df, scaler=train_ds.scaler)`.
        When omitted, a new StandardScaler is fitted on `df` (previous behaviour).
    """
    def __init__(self, df, scaler=None):
        self.df = df
        raw_features = self.df.drop(columns=['azimuth', 'zenith']).to_numpy()
        self.targets = self.df[['azimuth', 'zenith']].to_numpy()
        if scaler is None:
            self.scaler = StandardScaler()
            self.features = self.scaler.fit_transform(raw_features)
        else:
            self.scaler = scaler
            self.features = self.scaler.transform(raw_features)
    
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # The features were already scaled in __init__; no re-scaling needed here.
        features = self.features[idx]
        target = self.targets[idx]
        return torch.tensor(features, dtype=torch.float32), torch.tensor(target, dtype=torch.float32)

    
class CustomModel(torch.nn.Module):
    """
    Fully connected regression network: input -> 64 -> 32 -> 16 -> 8 -> output.

    ReLU follows each of the first four linear layers; the last layer is
    linear. The layer attributes are named fc1..fc5, which fixes the keys of
    the state dict.
    """
    def __init__(self, input_size, output_size):
        super().__init__()
        Linear = torch.nn.Linear
        self.fc1 = Linear(input_size, 64)
        self.fc2 = Linear(64, 32)
        self.fc3 = Linear(32, 16)
        self.fc4 = Linear(16, 8)
        self.fc5 = Linear(8, output_size)

    def forward(self, x):
        hidden = x
        # Hidden layers share the same pattern: linear map followed by ReLU.
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = torch.relu(layer(hidden))
        # Output layer has no activation (regression).
        return self.fc5(hidden)

def train_model(train_dataset, batch_size, num_epochs, learning_rate, device, model_path=None):
    """
    Train a CustomModel on `train_dataset` with RMSprop and L1 loss.

    Parameters
    ----------
    train_dataset : dataset exposing a 2-D `features` array
        Source of (features, targets) pairs; `features.shape[1]` sets the
        input size of the network.
    batch_size : int
        Mini-batch size of the (shuffled) data loader.
    num_epochs : int
        Number of passes over the dataset.
    learning_rate : float
        Learning rate of the optimizer.
    device : str or torch.device
        Device the model and the batches are moved to.
    model_path : str, optional
        Checkpoint file. If it exists, its weights are loaded before
        training; the state dict is written back after every epoch.
        With None, nothing is loaded or saved.

    Returns
    -------
    (model, score)
        The trained model and the mean, over epochs, of the average batch loss.
    """
    scores = []
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    model = CustomModel(input_size=train_dataset.features.shape[1], output_size=2)
    criterion = torch.nn.L1Loss()

    # Guard against the default model_path=None (os.path.exists(None) raises
    # TypeError), and map the checkpoint onto the requested device so weights
    # saved on a GPU can be resumed on a CPU-only machine.
    if model_path is not None and os.path.exists(model_path):
        model.load_state_dict(torch.load(model_path, map_location=device))

    model.to(device)
    model.train()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)

    for epoch in tqdm(range(num_epochs)):
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Average batch loss of this epoch
        scores.append (running_loss / len(train_loader))

        # Checkpoint after every epoch
        if model_path is not None:
            torch.save(model.state_dict(), model_path)
    score = np.mean(scores)

    return model, score

def predict_model(test_dataset, batch_size, device, model_path):
    """
    Load the checkpoint at `model_path` into a CustomModel and predict the
    two targets for every sample of `test_dataset`, in dataset order.

    Parameters
    ----------
    test_dataset : dataset exposing a 2-D `features` array
        Yields (features, targets) pairs; the targets are ignored.
    batch_size : int
        Number of samples per forward pass.
    device : str or torch.device
        Device used for inference.
    model_path : str
        File holding the state dict written by train_model.

    Returns
    -------
    numpy.ndarray
        Array of shape (number of samples, 2) with the model outputs.
    """
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    model = CustomModel(input_size=test_dataset.features.shape[1], output_size=2)
    # map_location lets a checkpoint written on a GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    predictions = []
    with torch.no_grad():
        for inputs, _ in test_loader:
            outputs = model(inputs.to(device))
            predictions.append(outputs.cpu().numpy())

    if not predictions:
        # Empty dataset: keep the two-column layout callers reshape to.
        return np.empty((0, 2), dtype=np.float32)
    return np.concatenate(predictions, axis=0)

# Model train

## Imports Data

In [6]:
%%time
%matplotlib inline
import matplotlib.pyplot as plt
import polars as pl
import numpy as np
import pandas as pd
import os, gc

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import SCORERS, mean_absolute_error
import optuna
from tqdm import tqdm
from torch import cuda

print ('\nFinished loading imports.\n')

# LightGBM calls its GPU backend 'gpu' while PyTorch calls it 'cuda'.
device_lgbm, device_nn = ('gpu', 'cuda') if cuda.is_available() else ('cpu', 'cpu')

print (f"Device for training is {device_lgbm}.\n")

# Locations of the inputs and of the files written by this notebook.
input_path = '/kaggle/input/'
work_path = '/kaggle/working/'
scores_nn_path = f'{work_path}scores_nn.csv'
scores_nn_df = pd.DataFrame([])
model_path = f'{work_path}model.pt'
saved_scores_nn_path = f'{input_path}scores_nn.csv'
saved_model_path = f'{input_path}model.pt'

# Scan the input tree once: load the sensor geometry and any previously saved
# scores, and remember where the training metadata lives.
for dirname, _, filenames in os.walk(input_path):
    for filename in filenames:
        filepath = os.path.join(dirname, filename)
        if 'sensor' in filepath:
            sensors_df = pl.read_csv (filepath).lazy()
            print ("'sensor_geometry' file loaded.")
            continue
        if 'score' in filepath:
            scores_nn_df = pd.read_csv (filepath)
            continue
        if 'train_meta' in filepath:
            # Only the path is stored here; the file itself is read later.
            train_meta_filepath = filepath
            print ("'train_meta' file path found and loaded.")
print ('\nAll paths are set.\n')


Finished loading imports.

Device for training is cpu.

'train_meta' file path found and loaded.
'sensor_geometry' file loaded.

All paths are set.

CPU times: user 1.39 s, sys: 222 ms, total: 1.62 s
Wall time: 2.88 s


## Train model

In [7]:
%%time
import shutil

# Train on up to `max_batches` batch files found below `input_path`, score
# each batch on a hold-out split and collect the hold-out predictions in
# `submission_df` (a polars LazyFrame).
submission_df = pl.DataFrame([]).lazy()
submission_frames = []  # one LazyFrame of hold-out predictions per trained batch
counts = 1
max_batches = 5  # formerly named `max`, which shadowed the builtin
reached_limit = False

# The input directory is read-only, so a checkpoint shipped as input cannot be
# overwritten by train_model. Copy it to the working directory once and always
# train from / save to the working copy.
if os.path.exists (saved_model_path) and not os.path.exists (model_path):
    shutil.copy (saved_model_path, model_path)
model_filepath = model_path

for dirname, _, filenames in os.walk(input_path):
    if reached_limit:
        # The inner `break` alone would let the walk continue into other folders.
        break
    for filename in filenames:
        filepath = os.path.join(dirname, filename)
        if not (('batch' in filepath) and ('train' in dirname)):
            continue
        batch_number = int (filename.split('_')[1].split('.')[0])
        print (f"TRAINING BATCH ID {batch_number} - {counts} BATCHES OF {max_batches}\n\ntrain_batch_{batch_number}' file loaded.\n\n")
        if len(scores_nn_df) != 0 and (batch_number in scores_nn_df.batch_id.values):
            print ('\nBatch already trained. Skipping to next batch.\n')
            continue

        # --- Load and assemble the training table of this batch ---
        # The files are read eagerly; the former lazy()/collect() round trips
        # added nothing.
        train_meta_df = pl.read_parquet (train_meta_filepath)
        print ("'train_meta' file loaded.")
        train_batch_df = pl.read_parquet (filepath)
        print (f"\nLoading 'train_batch' file.\n")
        print (train_batch_df)
        train_df = get_train_df_from_a_batch (train_batch_df,
                                              sensors_df.collect(),
                                              train_meta_df,
                                              batch_number)
        del train_meta_df #memory
        del train_batch_df
        gc.collect()

        # --- Train on a 5% sample, holding out 20% of it for scoring ---
        train_df = train_df.sample(frac=0.05)
        trn_df, tst_df = train_test_split(train_df, test_size=0.2, random_state=42)
        print ('\nTraining model...\n')
        print (trn_df)
        train_dataset = CustomDataset(trn_df)
        model, m_score = train_model(train_dataset,
                                     batch_size=128,
                                     num_epochs=10,
                                     learning_rate=1e-3,
                                     device=device_nn,
                                     model_path=model_filepath)
        del train_df #memory
        del trn_df
        del train_dataset #memory
        gc.collect()
        print ('\nModel score:', m_score)

        # --- Predict the hold-out rows and score them ---
        print ('\nPredicting values for model score...\n')
        print (tst_df)
        # NOTE(review): this dataset standardises the hold-out rows with its
        # own statistics rather than those of the training split.
        test_dataset = CustomDataset(tst_df)
        y_pred = predict_model(test_dataset, batch_size=128, device=device_nn, model_path=model_filepath).reshape(-1, 2)
        torch.cuda.empty_cache()
        az_pred = y_pred [:,0]
        print ('\nAzimuth preds:\n', az_pred)
        ze_pred = y_pred [:,1]
        print ('\nZenith preds:\n', ze_pred)
        # The score is a mean, so the former random permutation of all the
        # indices was unnecessary.
        score = angular_dist_score(tst_df['azimuth'].to_numpy(), tst_df['zenith'].to_numpy(), az_pred, ze_pred)
        print ('\nScore:', score)
        print ('\nCache cleaned.\n')

        # --- Persist the per-batch score (DataFrame.append was removed in pandas 2.0) ---
        new_score_row = pd.DataFrame ([{'batch_id': batch_number, 'score': score}])
        if len(scores_nn_df) != 0:
            scores_nn_df = pd.concat ([scores_nn_df, new_score_row], ignore_index=True)
        else:
            scores_nn_df = new_score_row
        scores_nn_df.to_csv (scores_nn_path, index=False)
        print (scores_nn_df)

        # --- Collect the hold-out predictions next to the true angles ---
        batch_results = {'event_id': tst_df['event_id'],
                         'azimuth_pred': az_pred,
                         'zenith_pred': ze_pred,
                         'azimuth_true': tst_df['azimuth'],
                         'zenith_true': tst_df['zenith']}
        del test_dataset #memory
        del tst_df
        gc.collect()
        submission_frames.append (pl.DataFrame (batch_results).lazy())
        submission_df = pl.concat (submission_frames)

        if counts == max_batches:
            reached_limit = True
            break
        counts += 1

TRAINING BATCH ID 240 - 1 BATCHES OF 5

train_batch_240' file loaded.


'train_meta' file loaded.

Loading 'train_batch' file.

shape: (32_801_416, 5)
┌───────────┬───────┬────────┬───────────┬───────────┐
│ sensor_id ┆ time  ┆ charge ┆ auxiliary ┆ event_id  │
│ ---       ┆ ---   ┆ ---    ┆ ---       ┆ ---       │
│ i16       ┆ i64   ┆ f64    ┆ bool      ┆ i64       │
╞═══════════╪═══════╪════════╪═══════════╪═══════════╡
│ 2024      ┆ 6029  ┆ 1.425  ┆ true      ┆ 777938857 │
│ 2371      ┆ 6078  ┆ 1.475  ┆ true      ┆ 777938857 │
│ 920       ┆ 6508  ┆ 0.425  ┆ true      ┆ 777938857 │
│ 4872      ┆ 7886  ┆ 0.475  ┆ true      ┆ 777938857 │
│ …         ┆ …     ┆ …      ┆ …         ┆ …         │
│ 1626      ┆ 27878 ┆ 0.275  ┆ true      ┆ 781200050 │
│ 4818      ┆ 28174 ┆ 1.425  ┆ true      ┆ 781200050 │
│ 1145      ┆ 28206 ┆ 0.975  ┆ true      ┆ 781200050 │
│ 2927      ┆ 28652 ┆ 0.675  ┆ true      ┆ 781200050 │
└───────────┴───────┴────────┴───────────┴───────────┘
Train dataframe:

shape:

100%|██████████| 10/10 [03:58<00:00, 23.89s/it]



Model score: 0.9480662024644305

Predicting values for model score...

shape: (235_675, 10)
┌───────────┬───────┬────────────┬───────────┬───┬─────────┬─────────────────┬──────────┬──────────┐
│ sensor_id ┆ time  ┆ charge     ┆ event_id  ┆ … ┆ z       ┆ last_pulse_inde ┆ azimuth  ┆ zenith   │
│ ---       ┆ ---   ┆ ---        ┆ ---       ┆   ┆ ---     ┆ x               ┆ ---      ┆ ---      │
│ i16       ┆ i64   ┆ f64        ┆ i64       ┆   ┆ f64     ┆ ---             ┆ f64      ┆ f64      │
│           ┆       ┆            ┆           ┆   ┆         ┆ i64             ┆          ┆          │
╞═══════════╪═══════╪════════════╪═══════════╪═══╪═════════╪═════════════════╪══════════╪══════════╡
│ 3747      ┆ 10111 ┆ 118.275002 ┆ 778636554 ┆ … ┆ 38.26   ┆ 6990725         ┆ 3.178607 ┆ 0.616856 │
│ 3607      ┆ 11721 ┆ 0.575      ┆ 779563094 ┆ … ┆ 382.05  ┆ 16267298        ┆ 1.114273 ┆ 0.662445 │
│ 3326      ┆ 12697 ┆ 2.175      ┆ 779093632 ┆ … ┆ 60.2    ┆ 11555180        ┆ 4.50245  ┆ 1.516744 

100%|██████████| 10/10 [04:12<00:00, 25.29s/it]



Model score: 0.9114377994557108

Predicting values for model score...

shape: (251_020, 10)
┌───────────┬───────┬────────┬───────────┬───┬─────────┬──────────────────┬──────────┬──────────┐
│ sensor_id ┆ time  ┆ charge ┆ event_id  ┆ … ┆ z       ┆ last_pulse_index ┆ azimuth  ┆ zenith   │
│ ---       ┆ ---   ┆ ---    ┆ ---       ┆   ┆ ---     ┆ ---              ┆ ---      ┆ ---      │
│ i16       ┆ i64   ┆ f64    ┆ i64       ┆   ┆ f64     ┆ i64              ┆ f64      ┆ f64      │
╞═══════════╪═══════╪════════╪═══════════╪═══╪═════════╪══════════════════╪══════════╪══════════╡
│ 5098      ┆ 11995 ┆ 4.025  ┆ 957925662 ┆ … ┆ -492.82 ┆ 9238429          ┆ 0.818278 ┆ 2.56353  │
│ 4352      ┆ 14671 ┆ 0.625  ┆ 960205147 ┆ … ┆ -47.6   ┆ 33406583         ┆ 3.823304 ┆ 1.469211 │
│ 2804      ┆ 17111 ┆ 0.725  ┆ 958840812 ┆ … ┆ -247.08 ┆ 19119725         ┆ 4.075646 ┆ 0.747256 │
│ 4251      ┆ 11727 ┆ 0.825  ┆ 957144588 ┆ … ┆ -368.24 ┆ 1178632          ┆ 0.669247 ┆ 2.117587 │
│ …         ┆ …     ┆ …  

100%|██████████| 10/10 [04:02<00:00, 24.30s/it]



Model score: 0.9278565861257473

Predicting values for model score...

shape: (227_366, 10)
┌───────────┬───────┬────────┬───────────┬───┬─────────┬──────────────────┬──────────┬──────────┐
│ sensor_id ┆ time  ┆ charge ┆ event_id  ┆ … ┆ z       ┆ last_pulse_index ┆ azimuth  ┆ zenith   │
│ ---       ┆ ---   ┆ ---    ┆ ---       ┆   ┆ ---     ┆ ---              ┆ ---      ┆ ---      │
│ i16       ┆ i64   ┆ f64    ┆ i64       ┆   ┆ f64     ┆ i64              ┆ f64      ┆ f64      │
╞═══════════╪═══════╪════════╪═══════════╪═══╪═════════╪══════════════════╪══════════╪══════════╡
│ 4732      ┆ 15512 ┆ 1.325  ┆ 513880622 ┆ … ┆ -453.49 ┆ 28341245         ┆ 3.958845 ┆ 0.494093 │
│ 3410      ┆ 10241 ┆ 0.675  ┆ 511101211 ┆ … ┆ -346.61 ┆ 964054           ┆ 0.8374   ┆ 0.858254 │
│ 58        ┆ 9956  ┆ 1.575  ┆ 511161197 ┆ … ┆ -491.17 ┆ 1569866          ┆ 5.71125  ┆ 1.640495 │
│ 4496      ┆ 10022 ┆ 0.875  ┆ 514044793 ┆ … ┆ -451.16 ┆ 29909459         ┆ 1.166667 ┆ 0.60928  │
│ …         ┆ …     ┆ …  

100%|██████████| 10/10 [03:55<00:00, 23.53s/it]



Model score: 0.904984551120348

Predicting values for model score...

shape: (235_687, 10)
┌───────────┬───────┬────────────┬───────────┬───┬─────────┬─────────────────┬──────────┬──────────┐
│ sensor_id ┆ time  ┆ charge     ┆ event_id  ┆ … ┆ z       ┆ last_pulse_inde ┆ azimuth  ┆ zenith   │
│ ---       ┆ ---   ┆ ---        ┆ ---       ┆   ┆ ---     ┆ x               ┆ ---      ┆ ---      │
│ i16       ┆ i64   ┆ f64        ┆ i64       ┆   ┆ f64     ┆ ---             ┆ f64      ┆ f64      │
│           ┆       ┆            ┆           ┆   ┆         ┆ i64             ┆          ┆          │
╞═══════════╪═══════╪════════════╪═══════════╪═══╪═════════╪═════════════════╪══════════╪══════════╡
│ 416       ┆ 10283 ┆ 2.025      ┆ 113756460 ┆ … ┆ -452.57 ┆ 31140829        ┆ 3.229128 ┆ 0.918351 │
│ 5050      ┆ 18191 ┆ 0.825      ┆ 112709241 ┆ … ┆ -156.41 ┆ 21040578        ┆ 0.562353 ┆ 0.344161 │
│ 921       ┆ 29358 ┆ 0.575      ┆ 111111019 ┆ … ┆ 140.22  ┆ 4589168         ┆ 5.308351 ┆ 1.123064 │

100%|██████████| 10/10 [04:20<00:00, 26.08s/it]



Model score: 0.907048113450406

Predicting values for model score...

shape: (243_907, 10)
┌───────────┬───────┬────────┬───────────┬───┬─────────┬──────────────────┬──────────┬──────────┐
│ sensor_id ┆ time  ┆ charge ┆ event_id  ┆ … ┆ z       ┆ last_pulse_index ┆ azimuth  ┆ zenith   │
│ ---       ┆ ---   ┆ ---    ┆ ---       ┆   ┆ ---     ┆ ---              ┆ ---      ┆ ---      │
│ i16       ┆ i64   ┆ f64    ┆ i64       ┆   ┆ f64     ┆ i64              ┆ f64      ┆ f64      │
╞═══════════╪═══════╪════════╪═══════════╪═══╪═════════╪══════════════════╪══════════╪══════════╡
│ 2776      ┆ 9981  ┆ 0.875  ┆ 471008939 ┆ … ┆ 229.5   ┆ 24420712         ┆ 5.629429 ┆ 2.578089 │
│ 1016      ┆ 12188 ┆ 6.375  ┆ 469257673 ┆ … ┆ -453.04 ┆ 6109158          ┆ 0.520818 ┆ 0.309921 │
│ 1900      ┆ 16362 ┆ 0.675  ┆ 468779794 ┆ … ┆ -180.16 ┆ 1188415          ┆ 5.856308 ┆ 0.882152 │
│ 2744      ┆ 10540 ┆ 1.875  ┆ 470867359 ┆ … ┆ -247.25 ┆ 22807279         ┆ 2.560566 ┆ 1.397903 │
│ …         ┆ …     ┆ …   

In [8]:
# Show every collected prediction row ordered by event. An event_id can appear
# several times because there is one row per hold-out row of the training table.
print (submission_df.collect().sort('event_id'))
# Average the predicted and true angles per event; as the last expression of
# the cell, the resulting frame is displayed as the cell output.
submission_df.groupby('event_id').mean().collect()

shape: (1_193_655, 5)
┌───────────┬──────────────┬─────────────┬──────────────┬─────────────┐
│ event_id  ┆ azimuth_pred ┆ zenith_pred ┆ azimuth_true ┆ zenith_true │
│ ---       ┆ ---          ┆ ---         ┆ ---          ┆ ---         │
│ i64       ┆ f32          ┆ f32         ┆ f64          ┆ f64         │
╞═══════════╪══════════════╪═════════════╪══════════════╪═════════════╡
│ 110630201 ┆ 4.022473     ┆ 1.52762     ┆ 0.499148     ┆ 1.755306    │
│ 110630219 ┆ 3.233817     ┆ 1.353588    ┆ 0.139054     ┆ 1.632285    │
│ 110630278 ┆ 3.462491     ┆ 1.477876    ┆ 5.762015     ┆ 2.033112    │
│ 110630325 ┆ 3.449636     ┆ 1.346851    ┆ 6.221193     ┆ 1.924065    │
│ …         ┆ …            ┆ …           ┆ …            ┆ …           │
│ 960289995 ┆ 3.225251     ┆ 1.136564    ┆ 6.002078     ┆ 0.603835    │
│ 960289995 ┆ 3.229476     ┆ 1.215797    ┆ 6.002078     ┆ 0.603835    │
│ 960290060 ┆ 1.901494     ┆ 0.897813    ┆ 6.061678     ┆ 0.847598    │
│ 960290060 ┆ 1.559483     ┆ 0.82427     ┆

event_id,azimuth_pred,zenith_pred,azimuth_true,zenith_true
i64,f32,f32,f64,f64
471679648,1.990932,0.959356,5.795802,0.986101
778263216,3.303907,1.414963,2.830851,0.666966
113068396,3.174665,1.375689,2.33177,1.603507
512146596,4.080689,1.258112,4.050699,0.630621
779595348,2.15779,1.554056,0.251988,0.834611
959174260,2.827002,1.39302,5.042101,1.403492
779090788,3.253269,1.319611,3.004322,2.580138
111155796,2.320233,1.235481,0.44211,2.268895
111113440,3.592745,1.27651,1.070524,0.583556
470433708,4.103955,1.099584,0.03072,1.410625
