In [1]:
import polars as pl
from datetime import date, timedelta
import random
import string

# Function to generate a date range from 2000 to 2024
def generate_date_range(start_year, end_year):
    start_date = date(start_year, 1, 1)
    end_date = date(end_year, 12, 31)
    delta = end_date - start_date
    return [start_date + timedelta(days=i) for i in range(delta.days + 1)]

# Generate a list of dates from 2000 to 2024
dates = generate_date_range(1961, 2023)

# Generate random groups, values, and set ctr to 1 for each entry
groups = [random.choice(string.ascii_uppercase) for _ in dates]
values = [random.randint(1, 5000000) for _ in dates]
ctrs = [1 for _ in dates]

# Create the DataFrame
df = pl.DataFrame({
    "group": groups,
    "values": values,
    "date": dates,
    "ctr": ctrs
})

df.describe()


statistic,group,values,date,ctr
str,str,f64,str,f64
"""count""","""23010""",23010.0,"""23010""",23010.0
"""null_count""","""0""",0.0,"""0""",0.0
"""mean""",,2505700.0,"""1992-07-01""",1.0
"""std""",,1445300.0,,0.0
"""min""","""A""",220.0,"""1961-01-01""",1.0
"""25%""",,1245988.0,"""1976-10-01""",1.0
"""50%""",,2517033.0,"""1992-07-02""",1.0
"""75%""",,3758319.0,"""2008-04-01""",1.0
"""max""","""Z""",4999620.0,"""2023-12-31""",1.0


In [2]:
# Define a custom function for rolling sum operation on a DataFrame grouped by 'group'
# df = pl.read_parquet("little file for testing polars.parquet")
def rolling_sum(group_df):
    # Selects specific columns and applies a rolling sum on the 'ctr' column with a window of 3
    return group_df.select([
        pl.col("group"),                     # Include the 'group' column
        pl.col("date"),                      # Include the 'date' column
        pl.col("ctr").rolling_sum(window_size="30d",by="date", min_periods=1)  # Apply rolling sum on 'ctr'
    ])

# Group the DataFrame by 'group', apply the custom rolling_sum function, and sort the result
result = df.group_by("group").map_groups(rolling_sum).sort(by=["group","date"], descending=False)

# Display the first 6 rows of the resulting DataFrame to showcase the rolling sum operation
result.head(30)


- sorting your data by your `by` column beforehand;
- setting `.set_sorted()` if you already know your data is sorted;
  (this is known to happen when combining rolling aggregations with `over`);

before passing calling the rolling aggregation function.



group,date,ctr
str,date,i64
"""A""",1961-01-12,1
"""A""",1961-01-14,2
"""A""",1961-01-26,3
"""A""",1961-02-16,2
"""A""",1961-02-23,3
"""A""",1961-04-24,1
"""A""",1961-05-10,2
"""A""",1961-05-13,3
"""A""",1961-05-16,4
"""A""",1961-08-12,1


In [4]:
##trying a lazy frame method of doing this

# lazy_df = pl.scan_parquet("huge file for testing polars.parquet")
lazy_df = df.lazy()
#create a query
q = (
    #invoke the lazy df
    lazy_df
    #lets CRUD some columns
    .with_columns(
        #do a 30 day backward looking sum of the counter column by the date column
        pl.col("ctr").rolling_sum(window_size="30d", min_periods=1, by="date")
        #do the above over the group column, so it will calculate each rolling sum by group
        .over("group")
        #use an alias to ensure no confusion
        .alias("group_ctr_rolling")
    )
    #sort so we can interpret the results easily
    .sort(by=["group","date"], descending=False)
    
)
#invoke the query and put the result into lazy_df with streaming on for best performance
lazy_df = q.collect(streaming=True)

#view results
lazy_df.head(30)

group,values,date,ctr,group_ctr_rolling
str,i64,date,i64,i64
"""A""",3011570,1961-01-12,1,1
"""A""",231490,1961-01-14,1,2
"""A""",4473502,1961-01-26,1,3
"""A""",3168188,1961-02-16,1,2
"""A""",4866426,1961-02-23,1,3
"""A""",4181646,1961-04-24,1,1
"""A""",2626628,1961-05-10,1,2
"""A""",3761687,1961-05-13,1,3
"""A""",3431929,1961-05-16,1,4
"""A""",535262,1961-08-12,1,1


In [12]:
#import interest rates
df_prime = pl.read_csv("canada_prime_Interest.csv")

#change rate column name
df_prime = df_prime.rename({"IRSTPI01CAM156N": "rate"})

df_prime = (
    df_prime
    .with_columns(
        pl.col("DATE").str.slice(0,7).alias("YYYY-MM"),
        pl.col("DATE").str.strptime(pl.Date, "%Y-%m-%d")
        
    )
)



DATE,rate,YYYY-MM
date,f64,str
1960-01-01,5.75,"""1960-01"""
1960-02-01,5.75,"""1960-02"""
1960-03-01,5.75,"""1960-03"""
1960-04-01,5.75,"""1960-04"""
1960-05-01,5.75,"""1960-05"""


In [13]:
df.head()

group,values,date,ctr
str,i64,date,i64
"""E""",2451350,1961-01-01,1
"""T""",2602489,1961-01-02,1
"""U""",66163,1961-01-03,1
"""T""",2405258,1961-01-04,1
"""V""",4350964,1961-01-05,1


In [15]:
#convert ddate to string
df = (
    df
    .with_columns(
        pl.col("date").dt.strftime("%Y-%m").alias("YYYY-MM")
    )
    
)

df.head()

group,values,date,ctr,YYYY-MM
str,i64,date,i64,str
"""E""",2451350,1961-01-01,1,"""1961-01"""
"""T""",2602489,1961-01-02,1,"""1961-01"""
"""U""",66163,1961-01-03,1,"""1961-01"""
"""T""",2405258,1961-01-04,1,"""1961-01"""
"""V""",4350964,1961-01-05,1,"""1961-01"""
