In [69]:
%%capture
# `%%capture` (which must stay on the first line of the cell) swallows this
# cell's output. The autoreload extension in mode 2 re-imports modules before
# each cell is executed, so edits to imported .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# The same magic commands can instead be configured once in
# `.vscode/settings.json`, so they run at kernel start-up:
# "jupyter.runStartupCommands": [
#     "%load_ext autoreload",
#     "%autoreload 2"
# ]

In [70]:
import random
from pathlib import Path

import polars as pl
import altair as alt

# Switch Altair to its "vegafusion" data transformer.
# NOTE(review): presumably enabled so charts can be built from the full-size
# series frames — confirm against the plotting cells, which are not visible here.
alt.data_transformers.enable("vegafusion")

# Plain-text confirmation that the import cell ran to completion.
print("All imports loaded successfully")

All imports loaded successfully


In [71]:
DATA_DIR = Path("../input/child-mind-institute-problematic-internet-use/")
SERIES_TRAIN_DIR = DATA_DIR / "series_train.parquet"

# Seed for picking the example series, so Restart & Run All shows the same one.
# Set to None to draw a different series on every run.
EXAMPLE_SEED = 42

# Every series is stored under an entry named "id=<series id>". Anything else in
# the directory (stray metadata or hidden files) is skipped instead of crashing
# on the split, and the ids are sorted because iterdir() yields entries in
# arbitrary, filesystem-dependent order.
filenames = sorted(
    entry.name.split('=', 1)[1]
    for entry in SERIES_TRAIN_DIR.iterdir()
    if entry.name.startswith('id=')
)
if not filenames:
    raise FileNotFoundError(f"No 'id=...' entries found in '{SERIES_TRAIN_DIR}'")

# A dedicated Random instance keeps the choice independent of the global
# random state that other cells may consume.
train_example_id = random.Random(EXAMPLE_SEED).choice(filenames)
train_example_path = SERIES_TRAIN_DIR / f"id={train_example_id}"

# Lazy scan: nothing is read until the frame is collected or described.
train_example = pl.scan_parquet(train_example_path)

print(f"Reading '{train_example_path}'")
train_example.describe()

Reading '../input/child-mind-institute-problematic-internet-use/series_train.parquet/id=380029ef'


statistic,step,X,Y,Z,enmo,anglez,non-wear_flag,light,battery_voltage,time_of_day,weekday,quarter,relative_date_PCIAT
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",112531.0,112531.0,112531.0,112531.0,112531.0,112531.0,112531.0,112531.0,112531.0,112531.0,112531.0,112531.0,112531.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",56265.0,-0.033463,0.084492,-0.211385,0.10616,-15.027328,0.0,118.360664,4012.934082,52507000000000.0,3.969937,3.230656,20.415174
"""std""",32485.045908,0.563326,0.541149,0.471214,0.188283,33.114815,0.0,396.872681,94.68264,14629000000000.0,2.246566,0.421255,6.75058
"""min""",0.0,-3.820558,-2.393664,-1.00955,0.0,-89.823158,0.0,0.0,3797.75,25000000000.0,1.0,3.0,10.0
"""25%""",28133.0,-0.526424,-0.275272,-0.581833,0.019046,-38.218155,0.0,2.941176,3935.0,41220000000000.0,2.0,3.0,14.0
"""50%""",56265.0,-0.035275,0.068645,-0.241524,0.049121,-15.455792,0.0,9.0,4005.0,53715000000000.0,5.0,3.0,20.0
"""75%""",84398.0,0.45499,0.50843,0.070569,0.114776,3.581227,0.0,26.324091,4093.333252,64330000000000.0,6.0,3.0,26.0
"""max""",112530.0,1.486039,2.592262,1.784669,3.808876,89.398621,0.0,2648.0,4184.0,86395000000000.0,7.0,4.0,49.0


In [72]:
def relative_days(df):
    """Add continuous-time helper columns to a series frame.

    ``time_of_day`` is treated as nanoseconds since midnight and
    ``relative_date_PCIAT`` as a whole-day offset.

    Columns added:
        day:       ``relative_date_PCIAT`` plus the elapsed fraction of the day.
        time_diff: seconds elapsed since the previous row; the first row, which
                   has no predecessor, is filled with 5.
        hour:      hour of the day, rounded to two decimals.

    Returns the frame produced by ``df.with_columns`` (same kind as ``df``).
    """
    date_offset = pl.col('relative_date_PCIAT')
    time_of_day = pl.col('time_of_day')

    fractional_day = date_offset + time_of_day / 86400e9
    elapsed_seconds = date_offset * 86400 + time_of_day / 1e9
    hour_of_day = (time_of_day / 3600e9).round(2)

    return df.with_columns(
        day=fractional_day,
        time_diff=elapsed_seconds.diff().fill_null(5),
        hour=hour_of_day,
    )


def anglez_features(df):
    """Add the absolute z-angle and its rolling standard deviation.

    Columns added:
        anglez_abs:         absolute value of ``anglez``.
        rolling_std_anglez: standard deviation of ``anglez_abs`` over a
                            725-row rolling window, with nulls replaced by 0.

    Returns the frame produced by ``df.with_columns`` (same kind as ``df``).
    """
    abs_angle = pl.col('anglez').abs()
    rolling_spread = abs_angle.rolling_std(725).fill_null(0)

    return df.with_columns(
        anglez_abs=abs_angle,
        rolling_std_anglez=rolling_spread,
    )

# Compose the feature steps on the lazy frame first, then materialise the
# result as an in-memory frame in a single collect().
example_features_lazy = train_example.pipe(relative_days).pipe(anglez_features)
transformed_example = example_features_lazy.collect()

transformed_example.head()

step,X,Y,Z,enmo,anglez,non-wear_flag,light,battery_voltage,time_of_day,weekday,quarter,relative_date_PCIAT,day,time_diff,hour,anglez_abs,rolling_std_anglez
u32,f32,f32,f32,f32,f32,f32,f32,f32,i64,i8,i8,f32,f64,f64,f64,f32,f32
0,-0.097059,0.006258,-0.808757,0.062556,-78.320946,0.0,28.0,4181.0,45960000000000,5,3,10.0,10.531944,5.0,12.77,78.320946,0.0
1,0.058793,0.211882,-0.777389,0.065514,-69.617111,0.0,27.5,4181.0,45965000000000,5,3,10.0,10.532002,5.0,12.77,69.617111,0.0
2,0.117507,-0.174196,-0.799013,0.136439,-66.093666,0.0,27.0,4181.0,45970000000000,5,3,10.0,10.53206,5.0,12.77,66.093666,0.0
3,0.117533,-0.550463,-0.300613,0.128115,-25.511307,0.0,32.799999,4181.0,45975000000000,5,3,10.0,10.532118,5.0,12.77,25.511307,0.0
4,-0.166517,-0.272352,-0.823503,0.054886,-64.700668,0.0,12.0,4181.0,45980000000000,5,3,10.0,10.532176,5.0,12.77,64.700668,0.0


In [73]:
# Mean enmo and light per relative day. The constant 'id' column is a
# placeholder key so the per-day rows can later be aggregated and joined by id.
daily_avg_df = (
    transformed_example
    .with_columns(pl.lit('randomid').alias('id'))
    .group_by('id', 'relative_date_PCIAT')
    .agg(
        daily_avg_enmo=pl.col('enmo').mean(),
        daily_avg_light=pl.col('light').mean(),
    )
)

In [74]:
# Mean enmo and light per relative day, restricted to rows whose 'day' value is
# less than 7 days after the earliest 'day' in the series.
within_first_week = (pl.col('day') - pl.col('day').min()) < 7

first7days_avg_df = (
    transformed_example
    .filter(within_first_week)
    .with_columns(pl.lit('randomid').alias('id'))
    .group_by('id', 'relative_date_PCIAT')
    .agg(
        daily_avg_enmo=pl.col('enmo').mean(),
        daily_avg_light=pl.col('light').mean(),
    )
)

In [75]:
# One summary row per id: the recording window plus mean/std of the raw axes.
relative_date = pl.col('relative_date_PCIAT')

result_df = (
    transformed_example
    .with_columns(pl.lit('randomid').alias('id'))
    .group_by('id')
    .agg(
        relative_start_date_PCIAT=relative_date.min(),
        # Span between the first and last relative date (max - min).
        total_days=relative_date.max() - relative_date.min(),
        rolling_std_anglez_abs_std=pl.col('rolling_std_anglez').std(),
        X_mean=pl.col('X').mean(),
        X_std=pl.col('X').std(),
        Y_mean=pl.col('Y').mean(),
        Y_std=pl.col('Y').std(),
        angleZ_mean=pl.col('anglez').mean(),
        angleZ_std=pl.col('anglez').std(),
    )
)

In [76]:
# Every per-day average is summarised with the same four statistics; the
# nesting order (signal first, then statistic) fixes the output column order.
summary_signals = ('enmo', 'light')
summary_stats = ('min', 'mean', 'std', 'max')

# Whole-recording summaries -> columns named daily_avg_<signal>_<stat>.
daily_summary = daily_avg_df.group_by('id').agg([
    getattr(pl.col(f'daily_avg_{signal}'), stat)().alias(f'daily_avg_{signal}_{stat}')
    for signal in summary_signals
    for stat in summary_stats
])

# First-seven-days summaries -> columns named first7_avg_<signal>_<stat>.
first7_summary = first7days_avg_df.group_by('id').agg([
    getattr(pl.col(f'daily_avg_{signal}'), stat)().alias(f'first7_avg_{signal}_{stat}')
    for signal in summary_signals
    for stat in summary_stats
])

# Left joins keep every row of result_df even when a summary frame has no
# matching id; the joined frame is the cell's displayed value.
(
    result_df
    .join(daily_summary, on='id', how='left')
    .join(first7_summary, on='id', how='left')
)

id,relative_start_date_PCIAT,total_days,rolling_std_anglez_abs_std,X_mean,X_std,Y_mean,Y_std,angleZ_mean,angleZ_std,daily_avg_enmo_min,daily_avg_enmo_mean,daily_avg_enmo_std,daily_avg_enmo_max,daily_avg_light_min,daily_avg_light_mean,daily_avg_light_std,daily_avg_light_max,first7_avg_enmo_min,first7_avg_enmo_mean,first7_avg_enmo_std,first7_avg_enmo_max,first7_avg_light_min,first7_avg_light_mean,first7_avg_light_std,first7_avg_light_max
str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""randomid""",10.0,39.0,3.663863,-0.033463,0.563326,0.084493,0.541149,-15.027365,33.114815,5e-05,0.063853,0.054756,0.187817,0.0,52.966232,111.824821,449.51889,0.073315,0.12696,0.042029,0.187817,13.444325,124.981255,162.851868,402.4599
