In [2]:
import numpy as np
import pandas as pd
import polars
import seaborn as sns
import math
from pathlib import Path

data_dir = '/kaggle/input/icecube-neutrinos-in-deep-ice/'
metadata_file = f'{data_dir}/train_meta.parquet'
batch_files = [f'{data_dir}train/batch_{i}.parquet' for i in range(1, 2)] # You can set up to 660 files

sensor_geometry = pd.read_csv((data_dir + 'sensor_geometry.csv'))
metadata = pd.read_parquet(metadata_file)

In [3]:
batch_file = batch_files[0] #delete this line in the final code

train_batch = polars.scan_parquet(batch_file).lazy()
df_train_meta = polars.DataFrame(metadata).lazy()
df_sensor_geometry = polars.DataFrame(sensor_geometry).with_columns(polars.col('sensor_id').cast(polars.Int16)).lazy()

In [9]:
def generating_some_features(train_batch, df_train_meta, df_sensor_geometry):
    
    
    def join_tables_all(df_meta, df_batch, df_sensor):
        return df_meta.join(df_batch, on='event_id').join(df_sensor, on='sensor_id').with_columns([
            (polars.col('time') - polars.col('time').min()).over('event_id')
        ])

    def generate_features_grouped(dataf):
        return dataf.groupby('event_id').agg([
        polars.col('x').mean().alias('x_mean'),
        polars.col('x').median().alias('x_median'),
        polars.col('y').mean().alias('y_mean'),
        polars.col('y').median().alias('y_median'),
        polars.col('z').mean().alias('z_mean'),
        polars.col('z').median().alias('z_median'),    
        polars.col('time').mean().alias('event_mean_time'),
        polars.col('time').max().alias('event_max_time'),
        polars.col('charge').min().alias('event_min_charge'),
        polars.col('charge').mean().alias('event_mean_charge'),
        polars.col('charge').max().alias('event_max_charge'),
        polars.col('charge').count().alias('overall_count'),
        polars.col('auxiliary').sum().alias('overall_aux_sum'),
        polars.col('charge').sum().alias('sum_charge'),
        (polars.col('auxiliary').sum() / polars.col('auxiliary').count()).alias('aux_ratio'),
        polars.col('sensor_id').n_unique().alias('sensor_count'),
    ])
    
    def add_ranks(dataf):
        return dataf.with_columns(
[
    polars.col('time').rank('ordinal').over('event_id').alias('time_rank_asc'),
    polars.col('time').rank('ordinal', descending=True).over('event_id').alias('time_rank_des'),
    polars.col('charge').rank('ordinal').over('event_id').alias('charge_rank_asc'),
    polars.col('charge').rank('ordinal').over('event_id').alias('charge_rank_des')
])

    def make_geometrical_features(dataf):
        geometrical_features = dataf.select('event_id').unique()
        for direction in ['time_rank_asc','time_rank_des', 'charge_rank_asc', 'charge_rank_des']:
            for direction_axis in ['x', 'y', 'z']:    
                temp_col_1 = dataf.filter(polars.col(direction) == 1).select([
                    polars.col('event_id'),
                    polars.col(direction_axis).over('event_id')
                ]).with_columns([
                    polars.col(direction_axis).alias(direction_axis+'_'+direction+'_1')]
                ).select(polars.col('event_id'), polars.col(direction_axis+'_'+direction+'_1'))

                temp_col_2 = dataf.filter(polars.col("time_rank_asc") == 2).select([
                    polars.col('event_id'),
                    polars.col(direction_axis).over('event_id')
                ]).with_columns([
                    polars.col(direction_axis).alias(direction_axis+'_'+direction+'_2')]
                ).select(polars.col('event_id'), polars.col(direction_axis+'_'+direction+'_2'))

                temp_col_3 = dataf.filter(polars.col("time_rank_asc") == 3).select([
                    polars.col('event_id'),
                    polars.col(direction_axis).over('event_id')
                ]).with_columns([
                    polars.col(direction_axis).alias(direction_axis+'_'+direction+'_3')]
                ).select(polars.col('event_id'), polars.col(direction_axis+'_'+direction+'_3'))

                geometrical_features = geometrical_features.join(temp_col_1, on='event_id', how='left'
                               ).join(temp_col_2, on='event_id', how='left'
                               ).join(temp_col_3, on='event_id', how='left'
                               )
        return geometrical_features.fill_null(1000)
    
        #Not accounting for aux
    features_grouped_metrics = df_train_meta.pipe(join_tables_all, train_batch, df_sensor_geometry
                      ).pipe(generate_features_grouped).collect()

    geometrical_features = df_train_meta.pipe(join_tables_all, train_batch, df_sensor_geometry
                      ).pipe(add_ranks
                      ).collect().pipe(make_geometrical_features)

    temp_1 = features_grouped_metrics.join(geometrical_features, on='event_id', how='left')


    #AUX = FALSE

    features_grouped_metrics = df_train_meta.pipe(join_tables_all, train_batch, df_sensor_geometry
                      ).filter(polars.col('auxiliary') == False).pipe(generate_features_grouped).collect()

    geometrical_features = df_train_meta.pipe(join_tables_all, train_batch, df_sensor_geometry
                      ).filter(polars.col('auxiliary') == False).pipe(add_ranks
                      ).collect().pipe(make_geometrical_features)

    temp_2 = features_grouped_metrics.join(geometrical_features, on='event_id', how='left')
    
    temp_3 = temp_1.join(temp_2, on = 'event_id', how='left').fill_null(0)
    del temp_1, temp_2, features_grouped_metrics, geometrical_features
    
    temp_3 = temp_3.to_pandas().set_index('event_id')
    
    temp_3 = (temp_3-temp_3.mean())/temp_3.std()
    
    return temp_3

In [10]:
generating_some_features(train_batch, df_train_meta, df_sensor_geometry)

Unnamed: 0_level_0,x_mean,x_median,y_mean,y_median,z_mean,z_median,event_mean_time,event_max_time,event_min_charge,event_mean_charge,...,z_charge_rank_asc_3_right,x_charge_rank_des_1_right,x_charge_rank_des_2_right,x_charge_rank_des_3_right,y_charge_rank_des_1_right,y_charge_rank_des_2_right,y_charge_rank_des_3_right,z_charge_rank_des_1_right,z_charge_rank_des_2_right,z_charge_rank_des_3_right
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
424600,-0.584657,-0.122773,1.320849,1.601495,0.899139,1.048516,-0.671032,-0.441513,-0.445031,-0.228967,...,0.285292,-0.073614,-0.076539,-0.078171,1.373029,1.323660,1.329385,0.507776,0.303956,0.285292
2396384,-2.132895,-2.607910,0.050862,-0.083942,1.645888,1.803381,-0.550030,-0.434765,-1.219215,0.183970,...,1.039896,-1.746589,-1.297691,-1.305026,-0.057589,0.003836,0.003211,1.065141,0.942078,1.039896
2529288,1.057954,0.981537,1.130412,1.306160,0.370954,0.157932,0.174548,0.275346,-0.445031,-0.283119,...,1.029658,1.121164,1.218779,1.223196,1.619653,0.801013,0.804224,-0.898454,0.984652,1.029658
1061128,-0.513352,0.049713,-0.628495,-0.682259,0.236073,0.081376,-1.073487,-0.678841,-0.445031,-0.143592,...,1.032987,-1.893346,-1.835143,-1.844987,-0.440893,-0.431748,-0.434468,1.329363,1.093254,1.032987
2537448,1.353283,1.201197,-1.955461,-1.777954,-1.649325,-1.508053,-0.039162,0.196236,-1.219215,-0.255748,...,-1.533070,0.934660,1.150380,1.154478,-1.142835,-1.439826,-1.447396,-1.360424,-1.382189,-1.533070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1449349,0.796320,0.566150,-0.434115,-0.329026,-2.680006,-2.153562,-0.708995,-0.210184,-1.219215,-0.208044,...,-1.872215,0.374449,0.394212,0.394779,-0.214599,-0.452439,-0.455258,-1.601425,-1.874301,-1.872215
1774405,1.578960,1.714270,1.244334,1.601495,-1.542832,-1.765149,-0.541509,-0.654846,0.329152,-0.051793,...,-1.832990,1.423895,1.370667,1.375792,1.225292,1.180689,1.185726,-1.664549,-1.782995,-1.832990
992229,0.677833,0.983296,-1.237672,-1.144516,-0.226103,0.150748,0.586456,0.089007,-1.219215,-0.192768,...,-0.083688,1.344566,0.755621,0.757875,-1.075090,-1.506165,-1.514054,-1.425899,-0.112227,-0.083688
1491833,0.661866,0.712768,-1.296225,-1.247667,-0.642777,-0.815376,-0.109659,-0.435514,-0.445031,-0.195708,...,-0.948430,0.934660,0.897865,0.900783,-1.142835,-1.111047,-1.117035,-0.812270,-1.016872,-0.948430
