In [37]:
import polars as pl
from datetime import date, timedelta
import random
import string

# Build the list of every calendar date from Jan 1 of start_year through Dec 31 of end_year
def generate_date_range(start_year, end_year):
    """Return every calendar date from Jan 1 of start_year to Dec 31 of end_year.

    Args:
        start_year: First year of the range (inclusive).
        end_year: Last year of the range (inclusive).

    Returns:
        A list of ``datetime.date`` objects in ascending order, one per day.
        The list is empty when ``end_year`` is earlier than ``start_year``.
    """
    # Proleptic Gregorian ordinals are consecutive integers, one per day,
    # so walking the ordinal range visits each date exactly once.
    first_ordinal = date(start_year, 1, 1).toordinal()
    last_ordinal = date(end_year, 12, 31).toordinal()
    return [date.fromordinal(n) for n in range(first_ordinal, last_ordinal + 1)]

# Build one row per calendar day from year 2 through year 9999 (about 3.65M rows)
# so the rolling-window queries run against a large frame.
dates = generate_date_range(2, 9999)

# Seed the RNG so the random columns are identical on every Restart & Run All.
SEED = 42
random.seed(SEED)

# One random uppercase-letter group and one random integer value per date.
# ctr is a constant 1, so a rolling sum over ctr counts the rows in the window.
groups = [random.choice(string.ascii_uppercase) for _ in dates]
values = [random.randint(1, 5000000) for _ in dates]
ctrs = [1] * len(dates)

# Create the DataFrame
df = pl.DataFrame({
    "group": groups,
    "values": values,
    "date": dates,
    "ctr": ctrs
})

# Summary statistics as a quick sanity check of row count and value ranges
df.describe()

statistic,group,values,date,ctr
str,str,f64,str,f64
"""count""","""3651694""",3651694.0,"""3651694""",3651694.0
"""null_count""","""0""",0.0,"""0""",0.0
"""mean""",,2499800.0,"""5000-12-31""",1.0
"""std""",,1443300.0,,0.0
"""min""","""A""",1.0,"""0002-01-01""",1.0
"""25%""",,1250474.0,"""2501-07-02""",1.0
"""50%""",,2500447.0,"""5001-01-01""",1.0
"""75%""",,3749534.0,"""7500-07-02""",1.0
"""max""","""Z""",5000000.0,"""9999-12-31""",1.0


In [38]:
# Define a custom function for rolling sum operation on a DataFrame grouped by 'group'
# df = pl.read_parquet("huge file for testing polars.parquet")
def rolling_sum(group_df):
    """Compute a 30-day rolling sum of 'ctr' for a single group's rows.

    Intended to be passed to ``map_groups``, which calls it once per group.

    Args:
        group_df: Frame holding the rows of one group; must contain the
            'group', 'date' and 'ctr' columns.

    Returns:
        A frame with 'group', 'date' and 'ctr', where 'ctr' has been replaced
        by its rolling sum over a "30d" window keyed on 'date'.
    """
    # The window is time-based ("30d" measured on 'date'), not a fixed row count.
    # min_periods=1 yields a value as soon as one row falls inside the window.
    windowed_ctr = pl.col("ctr").rolling_sum(window_size="30d", by="date", min_periods=1)
    kept_columns = [pl.col("group"), pl.col("date"), windowed_ctr]
    return group_df.select(kept_columns)

# Run rolling_sum once per 'group' partition, then order the combined rows
# by group and date (ascending, the default sort direction)
result = (
    df.group_by("group")
    .map_groups(rolling_sum)
    .sort(["group", "date"])
)

# Preview the first 30 rows of the rolling-sum result
result.head(30)

group,date,ctr
str,date,i64
"""A""",0002-02-17,1
"""A""",0002-03-17,2
"""A""",0002-04-15,2
"""A""",0002-05-11,2
"""A""",0002-06-01,2
"""A""",0002-06-08,3
"""A""",0002-06-21,3
"""A""",0002-07-10,2
"""A""",0002-07-30,2
"""A""",0002-08-03,3


In [39]:
## Same 30-day rolling sum per group, expressed as a lazy query instead of map_groups

# lazy_df = pl.scan_parquet("huge file for testing polars.parquet")
lazy_df = df.lazy()

# Build the query: add the per-group 30-day rolling sum of 'ctr' as a new
# 'group_ctr_rolling' column, then order rows by group and date (ascending).
q = lazy_df.with_columns(
    group_ctr_rolling=pl.col("ctr")
    .rolling_sum(window_size="30d", min_periods=1, by="date")
    .over("group")
).sort(["group", "date"])

# NOTE: from here on `lazy_df` is rebound to the collected (eager) result,
# not the LazyFrame created above.
lazy_df = q.collect(streaming=True)

# Preview the first 30 rows
lazy_df.head(30)

group,values,date,ctr,group_ctr_rolling
str,i64,date,i64,i64
"""A""",3851930,0002-02-17,1,1
"""A""",744878,0002-03-17,1,2
"""A""",3538740,0002-04-15,1,2
"""A""",587206,0002-05-11,1,2
"""A""",2493932,0002-06-01,1,2
"""A""",3520584,0002-06-08,1,3
"""A""",3210145,0002-06-21,1,3
"""A""",981206,0002-07-10,1,2
"""A""",2167480,0002-07-30,1,2
"""A""",877124,0002-08-03,1,3
