In [81]:
from lets_plot import *
from lets_plot.mapping import as_discrete
import polars as pl
import calendar
from datetime import date, datetime

# Initialise Lets-Plot's HTML output once, before any plot cell is executed,
# so that ggplot objects can be rendered in the notebook.
LetsPlot.setup_html()

In [6]:
# Load the raw training data; the "date" column is parsed from "YYYY-MM-DD" text
# into a Datetime so that the .dt accessors and date joins work later on.
train_data = pl.read_csv("data/pog-sleep-data/train.csv")

parsed_date = pl.col("date").str.strptime(pl.Datetime, fmt="%Y-%m-%d")
train_data_dt = train_data.with_columns([parsed_date])

In [9]:
# Split the parsed date into calendar components used for grouping and plotting.
date_expr = pl.col("date")
calendar_parts = [
    date_expr.dt.year().alias("year"),
    date_expr.dt.month().alias("month"),
    date_expr.dt.day().alias("day"),
]
train_data_dt_plot = train_data_dt.with_columns(calendar_parts)

train_data_dt_plot.head()

date,sleep_hours,year,month,day
datetime[μs],f64,i32,u32,u32
2015-02-19 00:00:00,6.4,2015,2,19
2015-02-20 00:00:00,7.583333,2015,2,20
2015-02-21 00:00:00,6.35,2015,2,21
2015-02-22 00:00:00,6.5,2015,2,22
2015-02-23 00:00:00,8.916667,2015,2,23


In [11]:
# Compact overview of the frame: row/column counts plus dtype and first values per column.
train_data_dt_plot.glimpse()

Rows: 2354
Columns: 5
$ date        <datetime[μs]> 2015-02-19 00:00:00, 2015-02-20 00:00:00, 2015-02-21 00:00:00, 2015-02-22 00:00:00, 2015-02-23 00:00:00, 2015-02-24 00:00:00, 2015-02-25 00:00:00, 2015-02-26 00:00:00, 2015-02-27 00:00:00, 2015-02-28 00:00:00
$ sleep_hours          <f64> 6.4, 7.583333333333333, 6.35, 6.5, 8.916666666666666, 6.6, 6.216666666666667, 5.283333333333333, 9.866666666666667, 5.966666666666667
$ year                 <i32> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015
$ month                <u32> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
$ day                  <u32> 19, 20, 21, 22, 23, 24, 25, 26, 27, 28



In [14]:
# Count nulls in every column to confirm that the recorded rows themselves are complete.
train_data_dt_plot.select(pl.all().null_count())

date,sleep_hours,year,month,day
u32,u32,u32,u32,u32
0,0,0,0,0


In [16]:
# Number of recorded days in each year, in chronological order.
rows_per_year = train_data_dt_plot.groupby("year").count()
rows_per_year.sort("year")

year,count
i32,u32
2015,214
2016,361
2017,353
2018,352
2019,360
2020,358
2021,356


A year has 365 or 366 days, but none of the yearly counts above reaches that, so the dataset is missing some days in every year.

In [22]:
# Number of recorded days for every (year, month) pair, in chronological order.
year_month_keys = ["year", "month"]
train_data_grouped_year_month = (
    train_data_dt_plot.groupby(year_month_keys).count().sort(year_month_keys)
)

# One facet per year; each bar is the number of rows recorded in that month.
monthly_counts_base = ggplot(train_data_grouped_year_month, aes("month", "count"))
monthly_counts_base + geom_bar(stat="identity") + facet_grid("year", scales="free")

- No missing data

In [54]:
def get_max_days_by_month(year, month=1):
    """Return the number of days in the given month of the given year.

    Args:
        year: Calendar year, e.g. 2016 (leap years are taken into account).
        month: Month number from 1 (January) to 12 (December). Defaults to 1.

    Returns:
        The last day number of that month as an int (28, 29, 30 or 31).

    Raises:
        calendar.IllegalMonthError: If ``month`` is outside 1..12.
    """
    # monthrange returns (weekday_of_first_day, number_of_days); only the
    # second item is needed, so there is no reason to build the week matrix.
    _, days_in_month = calendar.monthrange(year, month)
    return days_in_month

# Calendar length of every (year, month) present in the grouped frame, in row order.
# Each row is a tuple whose first two items are the year and the month.
max_days = [
    get_max_days_by_month(row[0], row[1])
    for row in train_data_grouped_year_month.rows()
]


In [61]:
# Attach the calendar length of each month next to the number of recorded days.
# The Series is already named "max_days", so no extra alias is required.
max_days_column = pl.Series("max_days", max_days)
train_data_grouped_max_days = train_data_grouped_year_month.with_columns(
    [max_days_column]
)


In [67]:
# Red bars show how many days each month has; the default-coloured bars drawn
# on top show how many days were recorded, so any visible red is missing days.
expected_days_layer = geom_bar(stat="identity", fill="red")
recorded_days_layer = geom_bar(aes(y="count"), stat="identity")

(
    ggplot(train_data_grouped_max_days, aes("month", "max_days"))
    + expected_days_layer
    + recorded_days_layer
    + facet_grid("year", scales="free")
)

This way we can see every lack of days in the dataset (the red part of each bar), and we will probably need to impute some values and create the dates that are missing.

In [82]:
# Build a gap-free daily calendar from the first recorded day to the end of 2021.
first_day = datetime(2015, 2, 19)
last_day = datetime(2021, 12, 31)

correct_date_range = pl.date_range(first_day, last_day, "1d", name="date")
correct_date_range_df = pl.DataFrame({"date": correct_date_range})
correct_date_range_df.head()

date
datetime[μs]
2015-02-19 00:00:00
2015-02-20 00:00:00
2015-02-21 00:00:00
2015-02-22 00:00:00
2015-02-23 00:00:00


In [84]:
# Left-join the recorded data onto the full calendar: every date in the range is
# kept, and dates with no recording get a null in the joined columns.
train_data_fixed_df = correct_date_range_df.join(train_data_dt, on="date", how="left")

In [86]:
# Sanity check: after the left join there should be one row per date in the full range.
train_data_fixed_df.shape

(2508, 2)

In [87]:
# Persist the gap-free frame as Parquet so later steps can reuse it.
train_data_fixed_df.write_parquet("data/pog-sleep-data/train_dt_fixed.parquet")