In [44]:
import polars as pl
from lets_plot import *
from lets_plot.mapping import as_discrete
import pmdarima as pm
from pmdarima.model_selection import train_test_split
from statsmodels.tsa.stattools import adfuller
import numpy as np
from sklearn import metrics
from datetime import datetime

LetsPlot.setup_html()

In [45]:
train_data = pl.read_parquet("data/pog-sleep-data/train_dt_fixed.parquet")
train_data.head()

date,sleep_hours
datetime[μs],f64
2015-02-19 00:00:00,6.4
2015-02-20 00:00:00,7.583333
2015-02-21 00:00:00,6.35
2015-02-22 00:00:00,6.5
2015-02-23 00:00:00,8.916667


In [46]:
train_data_year = train_data.with_columns([
    pl.col('date').dt.year().alias('year'),
    pl.col("date").dt.day().alias('day'),
    pl.col("date").dt.week().alias('week'),
]).with_columns([
    pl.when(pl.col("date").is_between(datetime(2017, 9, 27), datetime(2018, 6, 12)))
    .then(pl.col("sleep_hours")/2)
    .otherwise(pl.col("sleep_hours")).alias("sleep_hours_normalized")
]
)

date_breaks = train_data_year.filter(pl.col("day") == 10).select("date").rows(named=True)
date_breaks = [item.get("date") for item in date_breaks]

(
    ggplot(train_data_year, aes('date', 'sleep_hours_normalized'))
    + geom_line(aes(group="year", color=as_discrete("year")))
    + scale_x_datetime(breaks=date_breaks, format="%b %d")
    + facet_grid(x="year", scales="free")
)

In [47]:
train_data_interpolated = train_data_year.interpolate().select(pl.col("date"), pl.col("sleep_hours_normalized").alias("sleep_hours"))

In [48]:
train_data_interpolated.null_count()

date,sleep_hours
u32,u32
0,0


In [49]:
train, test = train_test_split(train_data_interpolated["sleep_hours"], train_size=1)


In [50]:
train_data_exogeneous = train_data_year.with_columns(
    [pl.col("date").dt.month().alias("month")]
)


In [51]:
week_index = train_data_exogeneous["week"]
week_index

week
u32
8
8
8
8
9
9
9
9
9
9


In [64]:
ARIMA_model = pm.auto_arima(
    train_data_interpolated["sleep_hours"],
    start_p=1,
    start_q=1,
    start_P=1,
    start_Q=1,
    max_p=5,
    max_q=5,
    max_P=5,
    max_Q=5,
    seasonal=True,
    stepwise=True,
    suppress_warnings=True,
    D=10,
    max_D=10,
    error_action="ignore",
)

# SARIMAX_model = pm.auto_arima(
#     train_data_interpolated["sleep_hours"],
#     exogenous=week_index,
#     start_p=1,
#     start_q=1,
#     test="adf",
#     max_p=3,
#     max_q=3,
#     m=7,
#     start_P=0,
#     seasonal=True,
#     d=None,
#     D=1,
#     trace=False,
#     error_action="ignore",
#     suppress_warnings=True,
#     stepwise=True,
# )

# ARIMA_model = pm.auto_arima(
#     train,
#     start_p=1,
#     start_q=1,
#     test="adf",
#     max_p=3,
#     max_q=3,
#     m=7,  # 12 is the frequncy of the cycle
#     start_P=0,
#     seasonal=True,  # set to seasonal
#     d=None,
#     D=1,  # order of the seasonal differencing
#     trace=False,
#     error_action="ignore",
#     suppress_warnings=True,
#     stepwise=True,
# )

# Create predictions for the future, evaluate on test
preds, conf_int = ARIMA_model.predict(n_periods=train_data_interpolated["sleep_hours"].shape[0], return_conf_int=True)

#print("Test RMSE: %.3f" % np.sqrt(metrics.mean_squared_error(train_data_interpolated["sleep_hours"], preds)))


In [65]:
print("Test RMSE: %.3f" % np.sqrt(metrics.mean_squared_error(train_data_interpolated["sleep_hours"], preds)))

Test RMSE: 0.968


In [56]:
# submission
periods_to_predict = pl.date_range(
    datetime(2022, 1, 1), datetime(2023, 3, 16), "1d", name="date"
)

preds, conf_int = SARIMAX_model.predict(
    n_periods=periods_to_predict.shape[0], return_conf_int=True, exogenous=week_index
)

preds


array([7.84646191, 5.31332349, 6.03936067, 5.76955043, 6.28093638,
       6.23506918, 6.30254827, 7.96434738, 5.01533119, 5.90231943,
       6.05690079, 6.35972444, 6.41300562, 5.96046024, 8.06824922,
       4.87468695, 5.64796707, 5.83749554, 6.35554168, 5.98706989,
       6.11088697, 7.97564088, 5.07572442, 5.85669024, 5.86208377,
       6.28614663, 6.17233958, 6.05873415, 7.9402858 , 4.91500885,
       5.73078834, 5.85829357, 6.28146661, 6.1379167 , 6.0047128 ,
       7.96557192, 4.93823931, 5.73565734, 5.84774127, 6.3111092 ,
       6.10898216, 6.07585787, 7.9795425 , 4.99951908, 5.80023951,
       5.88278644, 6.31797242, 6.16627297, 6.06723636, 7.98189724,
       4.96613173, 5.76739341, 5.87250698, 6.31041161, 6.13949968,
       6.0498529 , 7.97323358, 4.96414743, 5.76182252, 5.85939885,
       6.30380997, 6.12871021, 6.05457228, 7.96826625, 4.96713661,
       5.76801477, 5.86435189, 6.30450548, 6.14024833, 6.05338347,
       7.97236621, 4.9646917 , 5.76576046, 5.86662754, 6.30851

In [57]:
len(preds)

440

In [58]:
submission_df = pl.DataFrame({"date": periods_to_predict, "sleep_hours": preds})
submission_df.head()

date,sleep_hours
datetime[μs],f64
2022-01-01 00:00:00,7.846462
2022-01-02 00:00:00,5.313323
2022-01-03 00:00:00,6.039361
2022-01-04 00:00:00,5.76955
2022-01-05 00:00:00,6.280936


In [59]:
test_df = pl.read_csv("data/pog-sleep-data/test.csv")

test_dt_df = test_df.with_columns([pl.col("date").str.strptime(pl.Datetime, "%Y-%m-%d").alias("date")])

In [60]:
test_df_sub = test_dt_df.join(
    submission_df,
    on="date",
    how="left",
)


In [61]:
submission_final = test_df_sub.drop("sleep_hours").rename({"sleep_hours_right": "sleep_hours"})

In [62]:
submission_final.with_columns([pl.col("date").cast(pl.Date)]).write_csv("data/pog-sleep-data/subs/submission_baseline.csv")