In [1]:
import polars as pl
from src.main import GetTanksReq, fetch_tank_data

# Request tank readings for a single property id.
req = GetTanksReq(property_ids={"11009"})

df = await fetch_tank_data(req)
# Guard against a None result so the failure is reported here with a clear
# message rather than on the `.lazy()` call below.
if df is None:
    raise ValueError("df is None")
lf = df.lazy()
# Materialise the lazy frame so the notebook displays the raw rows.
lf.collect()

unique_id,property_id,source_key,metric_nice_name,uom,timestamp,value,tanksize
object,str,str,str,str,datetime[μs],f64,f64
0046e74d-807d-4dc3-832f-50e81fc585a1,"""11009""","""11009FAC""","""ESD-OilTankInchesUntilAlarm""","""in""",2024-08-29 15:49:58.875497,270.015925,
00df10ed-9159-4b71-b564-0cb0b14dfa07,"""11009""","""11009W1""","""WaterTank1Volume""","""bbl""",2024-08-29 15:02:32.480785,345.259648,832.309683
0284e85b-a66d-4deb-b26b-898e73525f34,"""11009""","""1100905""","""OilTank5Volume""","""bbl""",2024-08-29 11:56:12.578042,351.646441,594.738209
028aa180-e62d-4079-9b7b-8020a093c4cb,"""11009""","""11009W2""","""WaterTank2Volume""","""bbl""",2024-08-29 08:09:43.832419,346.863257,653.644903
12129608-bf60-4cf9-a056-872d4494001f,"""11009""","""1100903""","""OilTank3Volume""","""bbl""",2024-08-29 16:06:31.381841,341.268066,455.738209
…,…,…,…,…,…,…,…
b50f29bd-b5a2-4194-a465-5cc630910860,"""11009""","""1100906""","""OilTank6Level""","""in""",2024-08-29 17:03:44.224613,82.921132,594.483209
c7105e6f-8a8e-4b3d-872d-ed2072f4a655,"""11009""","""1100902""","""OilTank2Level""","""in""",2024-08-29 05:11:34.923631,82.691424,531.081137
d6589a9f-f49b-4304-afcb-909c6cf38ba2,"""11009""","""1100901""","""OilTank1Volume""","""bbl""",2024-08-29 11:41:36.209056,351.760979,479.54315
e427e0f4-55fc-4af9-93e7-b77c206a2b11,"""11009""","""1100903""","""OilTank3Level""","""in""",2024-08-28 22:20:34.568712,79.202608,455.738209


In [3]:
# Restart from the fetched frame so this cell does not depend on what other
# cells have done to `lf`.
lf = df.lazy()

# Metric suffixes and tank kinds that may appear in `metric_nice_name`.
tank_metrics = [
    "Level",
    "Volume",
    "InchesUntilAlarm",
    "InchesToESD",
    "TimeUntilESD",
    "Capacity",
    "ID",
]
tank_types = ["Water", "Oil"]

# Regex alternations built from the two vocabularies.
tank_metrics_str = "|".join(tank_metrics)
tank_types_str = "|".join(tank_types)

# Named groups: optional "ESD-" prefix, tank type, optional digits, metric.
pattern = f"^(?<is_ESD>ESD-)?(?<tank_type>{tank_types_str})Tank(?<tank_number>[0-9]*)(?<tank_metric>{tank_metrics_str})"
lf = (
    lf.with_columns(
        separated_metrics=pl.col("metric_nice_name").str.extract_groups(pattern)
    )
    .unnest("separated_metrics")
    # Non-strict cast: values that cannot be parsed become null, not an error.
    .with_columns(pl.col("tank_number").cast(pl.UInt8, strict=False))
)


# Pivot long -> wide: one column per metric, holding the first matching value
# found in each (property, tank type, tank number, source) group.
values = pl.col("value")
columns = pl.col("tank_metric")
pivoted_lf = lf.group_by(
    "property_id", "tank_type", "tank_number", "source_key"
).agg(
    [
        values.filter(columns == metric).first().alias(metric)
        for metric in tank_metrics
    ]
)

# Rows without a parsed tank number carry their tank number in the `ID`
# metric instead; split them off and promote `ID` to `tank_number`.
null_condition = pl.col("tank_number").is_null()
not_null_condition = pl.col("tank_number").is_not_null()

numbered_tanks = pivoted_lf.filter(not_null_condition)
null_tanks = (
    pivoted_lf.filter(null_condition)
    .with_columns(pl.col("ID").alias("tank_number"))
    .drop("ID")
    .with_columns(pl.col("tank_number").cast(pl.UInt8, strict=False))
)

# Left join keeps every numbered tank; metrics from the un-numbered rows
# arrive with a `_right` suffix.
joined_lf = numbered_tanks.join(
    null_tanks,
    on=["property_id", "tank_type", "tank_number"],
    how="left",
)

# Re-attach the raw reading columns (e.g. `tanksize`) by source key, then
# collapse back to one row per tank by taking the first value of each column.
final_lf = (
    joined_lf.join(lf, on=["source_key"], how="left")
    .group_by("property_id", "tank_type", "tank_number")
    .agg(pl.all().first())
)

# Prefer the tank's own metric and fall back to the `_right` value.
final_lf = final_lf.with_columns(
    pl.col("property_id", "tank_type", "tank_number"),
    level=pl.coalesce(pl.col("Level"), pl.col("Level_right")),
    volume=pl.coalesce(pl.col("Volume"), pl.col("Volume_right")),
    inches_to_esd=pl.coalesce(
        pl.col("InchesToESD"), pl.col("InchesUntilAlarm_right")
    ),
    time_until_esd=pl.coalesce(
        pl.col("TimeUntilESD"), pl.col("TimeUntilESD_right")
    ),
    capacity=pl.coalesce(pl.col("Capacity"), pl.col("tanksize")),
    unique_id=pl.coalesce(pl.col("unique_id")),
)

required_columns = [
    "property_id", "source_key", "tank_type", "tank_number",
    "level", "volume", "inches_to_esd", "time_until_esd", "capacity",
]
final_lf = final_lf.select(required_columns).sort(
    "property_id", "tank_type", "tank_number"
)

# All three expressions read the un-rounded inputs, so percent_full is derived
# from the raw volume and capacity, exactly as when applied one at a time.
percent_tank_full = (
    (pl.col("volume") / pl.col("capacity") * 100).round().cast(pl.UInt8)
)
capacity_rounded = pl.col("capacity").round()
volume_to_feet = pl.col("volume").round().cast(pl.UInt64)
final_lf = final_lf.with_columns(
    percent_tank_full.alias("percent_full"),
    capacity_rounded.alias("capacity"),
    volume_to_feet.alias("volume"),
)

result = final_lf.collect()
result

property_id,source_key,tank_type,tank_number,level,volume,inches_to_esd,time_until_esd,capacity,percent_full
str,str,str,u8,f64,u64,f64,f64,f64,u8
"""11009""","""1100901""","""Oil""",1,83.291167,352,,,480.0,73
"""11009""","""1100902""","""Oil""",2,82.691424,350,,,531.0,66
"""11009""","""1100903""","""Oil""",3,79.202608,341,,,456.0,75
"""11009""","""1100904""","""Oil""",4,84.438585,349,270.015925,,535.0,65
"""11009""","""1100905""","""Oil""",5,83.172086,352,,,595.0,59
"""11009""","""1100906""","""Oil""",6,82.921132,352,,,594.0,59
"""11009""","""11009W1""","""Water""",1,84.156545,345,265.538665,,832.0,41
"""11009""","""11009W2""","""Water""",2,84.370506,347,,,654.0,53
"""11009""","""11009W3""","""Water""",3,82.710045,347,,,683.0,51


In [None]:
# Single-cell variant of the tank pipeline that also carries a reading's
# `unique_id` through to the result as `identifier`.
lf = df.lazy()

# Metric suffixes and tank kinds that may appear in the metric label.
tank_metrics = ["Level", "Volume", "InchesUntilAlarm", "InchesToESD", "TimeUntilESD", "Capacity", "ID"]
tank_types = ["Water", "Oil"]

tank_metrics_str = "|".join(tank_metrics)
tank_types_str = "|".join(tank_types)

# Named groups: optional "ESD-" prefix, tank type, optional digits, metric.
pattern = f'^(?<is_esd>ESD-)?(?<tank_type>{tank_types_str})Tank(?<tank_number>[0-9]*)(?<tank_metric>{tank_metrics_str})'
# The metric label lives in `metric_nice_name`; the fetched frame has no
# `tank_name` column, so referencing it raised ColumnNotFoundError on collect.
lf = lf.with_columns(separated_metrics=pl.col("metric_nice_name").str.extract_groups(pattern))

lf = lf.unnest("separated_metrics")

# Non-strict cast: values that cannot be parsed become null, not an error.
lf = lf.with_columns(pl.col("tank_number").cast(pl.UInt8, strict=False))

# Pivot long -> wide, one column per metric. `unique_id` is part of the key
# here, so rows are only merged when they share the same id.
values = pl.col("value")
columns = pl.col("tank_metric")
pivoted_lf = lf.group_by("property_id", "tank_type", "tank_number", "source_key", "unique_id").agg(
    values.filter(columns == metric).first().alias(metric) for metric in tank_metrics
)

null_condition = pl.col("tank_number").is_null()
not_null_condition = pl.col("tank_number").is_not_null()

null_tanks = pivoted_lf.filter(null_condition)
numbered_tanks = pivoted_lf.filter(not_null_condition)

# Rows without a parsed tank number take their number from the `ID` metric.
null_tanks = null_tanks.with_columns(pl.col("ID").alias("tank_number"))
null_tanks = null_tanks.drop("ID")
null_tanks = null_tanks.with_columns(pl.col("tank_number").cast(pl.UInt8, strict=False))

# Rename rather than duplicate, so the joins below do not see two `unique_id`
# columns coming from the left-hand side.
numbered_tanks = numbered_tanks.with_columns(pl.col("unique_id").alias("identifier"))
numbered_tanks = numbered_tanks.drop("unique_id")

# Merge the un-numbered rows per (property, tank type, source): `max` picks up
# the populated value for each column across the rows being merged.
null_tanks_merged = null_tanks.group_by(["property_id", "tank_type", "source_key"]).agg(
    [
        pl.col("tank_number").max(),
        pl.col("unique_id").last(),  # non-numeric column: keep one representative value
        pl.col("Level").max(),
        pl.col("Volume").max(),
        pl.col("InchesUntilAlarm").max(),
        pl.col("InchesToESD").max(),
        pl.col("TimeUntilESD").max(),
        pl.col("Capacity").max(),
    ]
)

# Left join keeps every numbered tank; columns present on both sides arrive
# from the right with a `_right` suffix.
joined_lf = numbered_tanks.join(null_tanks_merged, on=["property_id", "tank_type", "tank_number"], how="left")

# Re-attach the raw reading columns (e.g. `tanksize`) by source key.
final_lf = joined_lf.join(lf, on=["source_key"], how="left")

# NOTE(review): at this point `unique_id` is the column that came from
# `null_tanks_merged` (the one from `lf` would be suffixed `_right`), so the
# first expression replaces the `identifier` set on `numbered_tanks` above.
# Confirm that is intended.
final_lf = final_lf.with_columns(
    pl.col("unique_id").alias("identifier"),
    pl.col("property_id"),
    pl.col("tank_type"),
    pl.col("tank_number"),
    pl.coalesce(pl.col("Level"), pl.col("Level_right")).alias("level"),
    pl.coalesce(pl.col("Volume"), pl.col("Volume_right")).alias("volume"),
    pl.coalesce(pl.col("InchesToESD"), pl.col("InchesUntilAlarm_right")).alias("inches_to_esd"),
    pl.coalesce(pl.col("TimeUntilESD"), pl.col("TimeUntilESD_right")).alias("time_until_esd"),
    pl.coalesce(pl.col("Capacity"), pl.col("tanksize")).alias("capacity")
)

required_columns = ["identifier", "property_id", "source_key", "tank_type", "tank_number", "level", "volume", "inches_to_esd", "time_until_esd", "capacity"]
final_lf = final_lf.select(required_columns)

final_lf = final_lf.sort("property_id", "tank_type", "tank_number")

# Fill percentage is computed from the un-rounded volume and capacity.
percent_tank_full = (pl.col("volume") / pl.col("capacity") * 100).round().cast(pl.UInt8)
final_lf = final_lf.with_columns(percent_tank_full.alias("percent_full"))

capacity_rounded = pl.col("capacity").round()
final_lf = final_lf.with_columns(capacity_rounded.alias("capacity"))

# Despite the name, this only rounds the volume and stores it as UInt64.
volume_to_feet = pl.col("volume").round().cast(pl.UInt64)

final_lf = final_lf.with_columns(volume_to_feet.alias("volume"))

# Collapse the remaining duplicate rows to one row per tank.
result = final_lf.group_by(["property_id", "source_key", "tank_type", "tank_number"]).agg(
    [
        pl.col("identifier").first(),  # non-numeric column: keep one representative value
        pl.col("level").max(),  # max picks up the populated value across the merged rows
        pl.col("volume").max(),
        pl.col("inches_to_esd").max(),
        pl.col("time_until_esd").max(),
        pl.col("capacity").max(),
        pl.col("percent_full").max(),
    ]
)

result = result.sort("property_id", "tank_type", "tank_number")
result = result.select(["identifier", "property_id", "source_key", "tank_type", "tank_number", "level", "volume", "inches_to_esd", "time_until_esd", "capacity", "percent_full"])
# Render the identifier as text, leaving missing values as null.
result = result.with_columns(pl.col("identifier").map_elements(lambda x: str(x) if x is not None else None, return_dtype=pl.Utf8).alias("identifier"))
result.collect()


In [2]:
# Fresh lazy view of the fetched frame; the step-by-step cells below build on it.
lf = df.lazy()


In [3]:
# Remove the per-row id column from the working frame.
# NOTE(review): `lf` is reassigned from itself, so re-running this cell on its
# own acts on a frame that has already lost `unique_id`; re-create `lf` first.
lf = lf.drop("unique_id")
lf.collect()

property_id,source_key,metric_nice_name,uom,timestamp,value,tanksize
str,str,str,str,datetime[μs],f64,f64
"""11009""","""11009FAC""","""ESD-OilTankInchesUntilAlarm""","""in""",2024-08-29 15:49:58.875497,270.015925,
"""11009""","""11009W1""","""WaterTank1Volume""","""bbl""",2024-08-29 15:02:32.480785,345.259648,832.309683
"""11009""","""1100905""","""OilTank5Volume""","""bbl""",2024-08-29 11:56:12.578042,351.646441,594.738209
"""11009""","""11009W2""","""WaterTank2Volume""","""bbl""",2024-08-29 08:09:43.832419,346.863257,653.644903
"""11009""","""1100903""","""OilTank3Volume""","""bbl""",2024-08-29 16:06:31.381841,341.268066,455.738209
…,…,…,…,…,…,…
"""11009""","""1100906""","""OilTank6Level""","""in""",2024-08-29 17:03:44.224613,82.921132,594.483209
"""11009""","""1100902""","""OilTank2Level""","""in""",2024-08-29 05:11:34.923631,82.691424,531.081137
"""11009""","""1100901""","""OilTank1Volume""","""bbl""",2024-08-29 11:41:36.209056,351.760979,479.54315
"""11009""","""1100903""","""OilTank3Level""","""in""",2024-08-28 22:20:34.568712,79.202608,455.738209


In [4]:
# Metric suffixes and tank kinds that may appear in `metric_nice_name`.
tank_metrics = [
    "Level",
    "Volume",
    "InchesUntilAlarm",
    "InchesToESD",
    "TimeUntilESD",
    "Capacity",
    "ID",
]
tank_types = ["Water", "Oil"]

# Regex alternations built from the two vocabularies.
tank_metrics_str = "|".join(tank_metrics)
tank_types_str = "|".join(tank_types)

# Named groups: optional "ESD-" prefix, tank type, optional digits, metric.
pattern = f'^(?<is_esd>ESD-)?(?<tank_type>{tank_types_str})Tank(?<tank_number>[0-9]*)(?<tank_metric>{tank_metrics_str})'
# Capture the groups into a single struct column; a later cell unnests it.
lf = lf.with_columns(
    pl.col("metric_nice_name").str.extract_groups(pattern).alias("separated_metrics")
)
lf.collect()

property_id,source_key,metric_nice_name,uom,timestamp,value,tanksize,separated_metrics
str,str,str,str,datetime[μs],f64,f64,struct[4]
"""11009""","""11009FAC""","""ESD-OilTankInchesUntilAlarm""","""in""",2024-08-29 15:49:58.875497,270.015925,,"{""ESD-"",""Oil"","""",""InchesUntilAlarm""}"
"""11009""","""11009W1""","""WaterTank1Volume""","""bbl""",2024-08-29 15:02:32.480785,345.259648,832.309683,"{null,""Water"",""1"",""Volume""}"
"""11009""","""1100905""","""OilTank5Volume""","""bbl""",2024-08-29 11:56:12.578042,351.646441,594.738209,"{null,""Oil"",""5"",""Volume""}"
"""11009""","""11009W2""","""WaterTank2Volume""","""bbl""",2024-08-29 08:09:43.832419,346.863257,653.644903,"{null,""Water"",""2"",""Volume""}"
"""11009""","""1100903""","""OilTank3Volume""","""bbl""",2024-08-29 16:06:31.381841,341.268066,455.738209,"{null,""Oil"",""3"",""Volume""}"
…,…,…,…,…,…,…,…
"""11009""","""1100906""","""OilTank6Level""","""in""",2024-08-29 17:03:44.224613,82.921132,594.483209,"{null,""Oil"",""6"",""Level""}"
"""11009""","""1100902""","""OilTank2Level""","""in""",2024-08-29 05:11:34.923631,82.691424,531.081137,"{null,""Oil"",""2"",""Level""}"
"""11009""","""1100901""","""OilTank1Volume""","""bbl""",2024-08-29 11:41:36.209056,351.760979,479.54315,"{null,""Oil"",""1"",""Volume""}"
"""11009""","""1100903""","""OilTank3Level""","""in""",2024-08-28 22:20:34.568712,79.202608,455.738209,"{null,""Oil"",""3"",""Level""}"


In [5]:
# Expand the struct's fields into top-level columns (one per regex group).
lf = lf.unnest("separated_metrics")
lf.collect()

property_id,source_key,metric_nice_name,uom,timestamp,value,tanksize,is_esd,tank_type,tank_number,tank_metric
str,str,str,str,datetime[μs],f64,f64,str,str,str,str
"""11009""","""11009FAC""","""ESD-OilTankInchesUntilAlarm""","""in""",2024-08-29 15:49:58.875497,270.015925,,"""ESD-""","""Oil""","""""","""InchesUntilAlarm"""
"""11009""","""11009W1""","""WaterTank1Volume""","""bbl""",2024-08-29 15:02:32.480785,345.259648,832.309683,,"""Water""","""1""","""Volume"""
"""11009""","""1100905""","""OilTank5Volume""","""bbl""",2024-08-29 11:56:12.578042,351.646441,594.738209,,"""Oil""","""5""","""Volume"""
"""11009""","""11009W2""","""WaterTank2Volume""","""bbl""",2024-08-29 08:09:43.832419,346.863257,653.644903,,"""Water""","""2""","""Volume"""
"""11009""","""1100903""","""OilTank3Volume""","""bbl""",2024-08-29 16:06:31.381841,341.268066,455.738209,,"""Oil""","""3""","""Volume"""
…,…,…,…,…,…,…,…,…,…,…
"""11009""","""1100906""","""OilTank6Level""","""in""",2024-08-29 17:03:44.224613,82.921132,594.483209,,"""Oil""","""6""","""Level"""
"""11009""","""1100902""","""OilTank2Level""","""in""",2024-08-29 05:11:34.923631,82.691424,531.081137,,"""Oil""","""2""","""Level"""
"""11009""","""1100901""","""OilTank1Volume""","""bbl""",2024-08-29 11:41:36.209056,351.760979,479.54315,,"""Oil""","""1""","""Volume"""
"""11009""","""1100903""","""OilTank3Level""","""in""",2024-08-28 22:20:34.568712,79.202608,455.738209,,"""Oil""","""3""","""Level"""


In [6]:
# The captured digits are strings; convert them to UInt8. With strict=False a
# value that cannot be parsed (such as an empty capture) becomes null instead
# of raising.
lf = lf.with_columns(pl.col("tank_number").cast(pl.UInt8, strict=False))
lf.collect()

property_id,source_key,metric_nice_name,uom,timestamp,value,tanksize,is_esd,tank_type,tank_number,tank_metric
str,str,str,str,datetime[μs],f64,f64,str,str,u8,str
"""11009""","""11009FAC""","""ESD-OilTankInchesUntilAlarm""","""in""",2024-08-29 15:49:58.875497,270.015925,,"""ESD-""","""Oil""",,"""InchesUntilAlarm"""
"""11009""","""11009W1""","""WaterTank1Volume""","""bbl""",2024-08-29 15:02:32.480785,345.259648,832.309683,,"""Water""",1,"""Volume"""
"""11009""","""1100905""","""OilTank5Volume""","""bbl""",2024-08-29 11:56:12.578042,351.646441,594.738209,,"""Oil""",5,"""Volume"""
"""11009""","""11009W2""","""WaterTank2Volume""","""bbl""",2024-08-29 08:09:43.832419,346.863257,653.644903,,"""Water""",2,"""Volume"""
"""11009""","""1100903""","""OilTank3Volume""","""bbl""",2024-08-29 16:06:31.381841,341.268066,455.738209,,"""Oil""",3,"""Volume"""
…,…,…,…,…,…,…,…,…,…,…
"""11009""","""1100906""","""OilTank6Level""","""in""",2024-08-29 17:03:44.224613,82.921132,594.483209,,"""Oil""",6,"""Level"""
"""11009""","""1100902""","""OilTank2Level""","""in""",2024-08-29 05:11:34.923631,82.691424,531.081137,,"""Oil""",2,"""Level"""
"""11009""","""1100901""","""OilTank1Volume""","""bbl""",2024-08-29 11:41:36.209056,351.760979,479.54315,,"""Oil""",1,"""Volume"""
"""11009""","""1100903""","""OilTank3Level""","""in""",2024-08-28 22:20:34.568712,79.202608,455.738209,,"""Oil""",3,"""Level"""


In [7]:
# Pivot long -> wide: for every (property, tank type, tank number, source)
# group, emit one column per metric holding the first matching reading.
values = pl.col("value")
columns = pl.col("tank_metric")
pivoted_lf = lf.group_by(
    ["property_id", "tank_type", "tank_number", "source_key"]
).agg(
    [
        values.filter(columns == metric).first().alias(metric)
        for metric in tank_metrics
    ]
)

pivoted_lf.collect()

property_id,tank_type,tank_number,source_key,Level,Volume,InchesUntilAlarm,InchesToESD,TimeUntilESD,Capacity,ID
str,str,u8,str,f64,f64,f64,f64,f64,f64,f64
"""11009""","""Oil""",3,"""1100903""",79.202608,341.268066,,,,,
"""11009""","""Oil""",,"""11009FAC""",,,270.015925,,,,4.0
"""11009""","""Water""",2,"""11009W2""",84.370506,346.863257,,,,,
"""11009""","""Water""",3,"""11009W3""",82.710045,347.302192,,,,,
"""11009""","""Water""",,"""11009FAC""",,,265.538665,,,,1.0
…,…,…,…,…,…,…,…,…,…,…
"""11009""","""Oil""",6,"""1100906""",82.921132,351.742534,,,,,
"""11009""","""Oil""",5,"""1100905""",83.172086,351.646441,,,,,
"""11009""","""Oil""",4,"""1100904""",84.438585,348.998913,,,,,
"""11009""","""Oil""",1,"""1100901""",83.291167,351.760979,,,,,


In [8]:
# Partition the pivoted rows by whether a tank number was parsed.
null_condition = pl.col("tank_number").is_null()
not_null_condition = pl.col("tank_number").is_not_null()

numbered_tanks = pivoted_lf.filter(not_null_condition)

# Un-numbered rows carry their tank number in the `ID` metric: write it over
# `tank_number` as UInt8 (non-strict, so bad values become null) and drop `ID`.
null_tanks = (
    pivoted_lf.filter(null_condition)
    .with_columns(pl.col("ID").cast(pl.UInt8, strict=False).alias("tank_number"))
    .drop("ID")
)

null_tanks.collect()

property_id,tank_type,tank_number,source_key,Level,Volume,InchesUntilAlarm,InchesToESD,TimeUntilESD,Capacity
str,str,u8,str,f64,f64,f64,f64,f64,f64
"""11009""","""Water""",1,"""11009FAC""",,,265.538665,,,
"""11009""","""Oil""",4,"""11009FAC""",,,270.015925,,,


In [9]:
# Display the rows whose tank number was parsed from the metric name.
numbered_tanks.collect()

property_id,tank_type,tank_number,source_key,Level,Volume,InchesUntilAlarm,InchesToESD,TimeUntilESD,Capacity,ID
str,str,u8,str,f64,f64,f64,f64,f64,f64,f64
"""11009""","""Oil""",5,"""1100905""",83.172086,351.646441,,,,,
"""11009""","""Oil""",3,"""1100903""",79.202608,341.268066,,,,,
"""11009""","""Oil""",2,"""1100902""",82.691424,350.138454,,,,,
"""11009""","""Oil""",4,"""1100904""",84.438585,348.998913,,,,,
"""11009""","""Water""",2,"""11009W2""",84.370506,346.863257,,,,,
"""11009""","""Oil""",1,"""1100901""",83.291167,351.760979,,,,,
"""11009""","""Water""",1,"""11009W1""",84.156545,345.259648,,,,,
"""11009""","""Water""",3,"""11009W3""",82.710045,347.302192,,,,,
"""11009""","""Oil""",6,"""1100906""",82.921132,351.742534,,,,,


In [13]:
# Merge the un-numbered rows per (property, tank type, source). Every column
# is reduced with `max`, which picks up the populated value across the rows
# being merged.
pp = null_tanks.group_by("property_id", "tank_type", "source_key").agg(
    [
        pl.col(column_name).max()
        for column_name in (
            "tank_number",
            "Level",
            "Volume",
            "InchesUntilAlarm",
            "InchesToESD",
            "TimeUntilESD",
            "Capacity",
        )
    ]
)

pp.collect()

property_id,tank_type,source_key,tank_number,Level,Volume,InchesUntilAlarm,InchesToESD,TimeUntilESD,Capacity
str,str,str,u8,f64,f64,f64,f64,f64,f64
"""11009""","""Oil""","""11009FAC""",4,,,270.015925,,,
"""11009""","""Water""","""11009FAC""",1,,,265.538665,,,


In [14]:
# Attach the merged un-numbered readings in `pp` to their numbered tank. The
# left join keeps every numbered tank; later cells read the right-hand metric
# columns under a `_right` suffix.
# NOTE(review): the recorded ColumnNotFoundError mentions `with_columns`/`drop`
# steps on `unique_id` that the cells above never apply to `numbered_tanks` —
# presumably stale kernel state; restart the kernel and run all to confirm.
joined_lf = numbered_tanks.join(pp, on=["property_id", "tank_type", "tank_number"], how="left")
joined_lf.collect()

ColumnNotFoundError: unique_id

This error occurred with the following context stack:
	[1] 'with_columns' failed
	[2] 'drop' input failed to resolve
	[3] 'join left' input failed to resolve


In [None]:
# Re-attach the raw reading columns (e.g. `tanksize`) to each tank row.
# The key column on both frames is `source_key`; the data has no `scada_id`
# column, so joining on that name fails with ColumnNotFoundError.
final_lf = joined_lf.join(lf, on=["source_key"], how="left")
final_lf.collect()

In [None]:
# Build the final snake_case fields, preferring the tank's own metric and
# falling back to the `_right` value contributed by the un-numbered rows.
# NOTE(review): `identifier` needs a `unique_id` column on the joined frame,
# but an earlier step-by-step cell drops `unique_id` from `lf` — confirm where
# the identifier should come from in this walkthrough.
final_lf = final_lf.with_columns(
    pl.col("unique_id").alias("identifier"),
    pl.col("property_id"),
    pl.col("tank_type"),
    pl.col("tank_number"),
    pl.coalesce(pl.col("Level"), pl.col("Level_right")).alias("level"),
    pl.coalesce(pl.col("Volume"), pl.col("Volume_right")).alias("volume"),
    pl.coalesce(pl.col("InchesToESD"), pl.col("InchesUntilAlarm_right")).alias("inches_to_esd"),
    pl.coalesce(pl.col("TimeUntilESD"), pl.col("TimeUntilESD_right")).alias("time_until_esd"),
    pl.coalesce(pl.col("Capacity"), pl.col("tanksize")).alias("capacity")
)

# The source key column is `source_key`; there is no `scada_id` column in the
# fetched data, so selecting it would raise ColumnNotFoundError.
required_columns = ["identifier", "property_id", "source_key", "tank_type", "tank_number", "level", "volume", "inches_to_esd", "time_until_esd", "capacity"]
final_lf = final_lf.select(required_columns)
final_lf.collect()

In [None]:
# Order rows by property, then tank type, then tank number for display.
final_lf = final_lf.sort("property_id", "tank_type", "tank_number")
final_lf.collect()

In [None]:
# Fill percentage: volume / capacity * 100, rounded and stored as UInt8.
# NOTE(review): UInt8 tops out at 255 — confirm volume can never exceed
# roughly 2.5x capacity, otherwise this cast cannot represent the value.
percent_tank_full = (pl.col("volume") / pl.col("capacity") * 100).round().cast(pl.UInt8)
final_lf = final_lf.with_columns(percent_tank_full.alias("percent_full"))

final_lf.collect()

In [None]:
# Round capacity in place; this runs after percent_full was derived, so the
# percentage is based on the un-rounded capacity.
capacity_rounded = pl.col("capacity").round()
final_lf = final_lf.with_columns(capacity_rounded.alias("capacity"))
final_lf.collect()

In [None]:
# Despite the name, this expression performs no unit conversion: it rounds
# the volume and stores it as UInt64.
volume_to_feet = pl.col("volume").round().cast(pl.UInt64)

final_lf = final_lf.with_columns(volume_to_feet.alias("volume"))

final_lf.collect()

In [None]:
# Collapse the remaining duplicate rows to one row per tank.
# The grouping key uses `source_key`: the fetched data has no `scada_id`
# column, so grouping on that name fails with ColumnNotFoundError.
new_pp = final_lf.group_by(["property_id", "source_key", "tank_type", "tank_number"]).agg(
    [
        pl.col("identifier").first(),  # non-numeric column: keep one representative value
        pl.col("level").max(),  # max picks up the populated value across the merged rows
        pl.col("volume").max(),
        pl.col("inches_to_esd").max(),
        pl.col("time_until_esd").max(),
        pl.col("capacity").max(),
        pl.col("percent_full").max(),
    ]
)

new_pp.collect()

In [None]:
# group_by gives no row-order guarantee by default, so sort again before display.
new_pp = new_pp.sort("property_id", "tank_type", "tank_number")
new_pp.collect()