In [8]:
import time as t

In [4]:
!python --version

Python 3.13.5


In [None]:
# %pip install polars

In [1]:
import functools
import html

import numpy as np
import polars as pl
# from Typing import Optional
# import glob

In [None]:
# Decorator that reports a function's execution time.
def measure_time(func):
    """
    Decorate ``func`` so that every call prints its execution time.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints the elapsed time in seconds, and returns whatever
        ``func`` returned.
    """

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        # The notebook imports the time module under the alias ``t``.
        # perf_counter() is monotonic, so clock adjustments cannot skew the result.
        start = t.perf_counter()
        result = func(*args, **kwargs)
        elapsed = t.perf_counter() - start
        # Message text is kept unchanged so existing output comparisons still match.
        print(f"Exection time: {elapsed} seconds")
        return result

    return wrapper

> if __name__ == "__main__":
>   measure_time(main)()

# Test Class

In [None]:
class Helper:
    """Holds a ``func_select`` string and renders its attributes via ``str()``."""

    def __init__(self, func_select: str) -> None:
        """Store ``func_select`` on the instance."""
        self.func_select = func_select

    def __str__(self) -> str:
        """Return ``ClassName(attr=value,...)`` covering every instance attribute."""
        rendered = []
        for name, value in vars(self).items():
            rendered.append(f"{name}={value!r}")
        joined = ",".join(rendered)
        return f"{type(self).__name__}({joined})"

# Helper functions

## Date helper function

In [4]:
def add_flag_if_date2_in_date1(
    lf: pl.LazyFrame,
    date1_col: str = "date1",
    date2_col: str = "date2",
    flag_col: str = "flag",
) -> pl.LazyFrame:
    """
    Flag rows whose ``date2_col`` value occurs among the ``date1_col`` values.

    The distinct, non-null values of ``date1_col`` are materialised first
    (this runs a ``collect()`` on a selection of ``lf``); the boolean flag
    column itself is then added lazily.

    Args:
        lf (pl.LazyFrame): Input LazyFrame.
        date1_col (str): Column whose distinct non-null values form the lookup set.
        date2_col (str): Column whose values are tested for membership.
        flag_col (str): Name of the boolean column to add.

    Returns:
        pl.LazyFrame: ``lf`` with ``flag_col`` added. The flag is False when
        the ``date2_col`` value is null or the lookup set is empty.
    """
    # Materialise the lookup values: distinct, non-null entries of date1_col.
    lookup_frame = lf.select(
        pl.col(date1_col).drop_nulls().unique().alias("date1_set")
    ).collect()
    lookup_values = lookup_frame["date1_set"]

    # A null date2 value, or an empty lookup set, is never a match.
    date2 = pl.col(date2_col)
    never_matches = date2.is_null() | lookup_values.is_empty()

    flag_expr = (
        pl.when(never_matches)
        .then(False)
        .otherwise(date2.is_in(lookup_values))
        .alias(flag_col)
    )
    return lf.with_columns(flag_expr)

# Example usage:
# new_lf = add_flag_if_date2_in_date1(lf)


## HTML decoder function

In [3]:
def html_decode(lf: pl.DataFrame | pl.LazyFrame, strcols_name: list[str]) -> pl.DataFrame | pl.LazyFrame:
    """
    Decode HTML entities (e.g. ``&amp;`` -> ``&``) in the given columns.

    Each listed column is replaced, under its original name, by a string
    column whose values have been passed through ``html.unescape``. Null
    values stay null; non-null values are converted with ``str()`` before
    decoding.

    Args:
        lf (pl.DataFrame | pl.LazyFrame): Input frame. Only ``with_columns``
            is called on it, which both frame types provide.
        strcols_name (list[str]): Names of the columns holding HTML-encoded
            string values.

    Returns:
        pl.DataFrame | pl.LazyFrame: A frame of the same kind as ``lf`` with
        the listed columns decoded; all other columns are untouched.
    """

    def _unescape_or_none(value):
        # skip_nulls=False below means nulls reach this function, so guard them here.
        if value is None:
            return None
        return html.unescape(str(value))

    decoded_columns = [
        pl.col(col)
        .map_elements(
            _unescape_or_none,
            skip_nulls=False,
            return_dtype=pl.Utf8,
        )
        .alias(col)
        for col in strcols_name
    ]
    return lf.with_columns(decoded_columns)

## Extract JSON tag

In [2]:
from records_tag import extract_json_field

In [3]:
# Sample DataFrame: each "data" value is a JSON document stored as a string.
df = pl.DataFrame({
    "id": [1, 2],
    "data": [
        '{"user": {"name": "Alice", "age": 30}}',
        '{"user": {"name": "Bob", "age": 25}}'
    ]
})

# Extract the nested "user" -> "name" field from the JSON in the "data" column.
# In the recorded output below, the result appears as a new "data_name" column.
df_extracted = extract_json_field(df, column="data", json_path=["user", "name"])
print(df_extracted)

shape: (2, 3)
┌─────┬─────────────────────────────────┬───────────┐
│ id  ┆ data                            ┆ data_name │
│ --- ┆ ---                             ┆ ---       │
│ i64 ┆ str                             ┆ str       │
╞═════╪═════════════════════════════════╪═══════════╡
│ 1   ┆ {"user": {"name": "Alice", "ag… ┆ Alice     │
│ 2   ┆ {"user": {"name": "Bob", "age"… ┆ Bob       │
└─────┴─────────────────────────────────┴───────────┘


  return df.with_columns(
