## Shelter Animal Analytics Dashboard


In [1]:
import polars as pl
import polars.selectors as cs
import os

In [2]:
# Enum dtypes used when casting the categorical CSV columns (age, sex, size).
# Each list is the closed set of values accepted for that column.
# NOTE(review): AGE is listed alphabetically rather than by life stage
# (Baby < Young < Adult < Senior); confirm whether the category order
# matters for any later sorting or comparison before relying on it.
enum_AGE = pl.Enum(
    ['Adult', 'Baby', 'Senior', 'Young'],
)
enum_SEX = pl.Enum(
    ['Male', 'Female', 'Unknown'],
)
enum_SIZE = pl.Enum(
    ['Small', 'Medium', 'Large', 'Extra Large'],
)

In [3]:
# Load the dog-description data set into `df`.
# A parquet copy next to the CSV acts as a cache: if it exists it is read
# directly; otherwise the CSV is parsed, cleaned, and written out as parquet
# so that later runs take the fast path.
root_file = 'allDogDescriptions'
if os.path.exists(root_file + '.parquet'):
    print(f'{"*"*20} Reading {root_file}.parquet  {"*"*20}')
    df = pl.read_parquet(root_file + '.parquet')

else:
    print(f'{"*"*20} Reading {root_file}.csv  {"*"*20}')
    df = (
        # NOTE(review): ignore_errors=True lets the reader continue past
        # values it cannot parse instead of raising, so malformed rows are
        # not reported here -- confirm this best-effort behaviour is intended.
        pl.read_csv(root_file + '.csv', ignore_errors=True)
        # Keep only the columns of interest, renamed to upper case and cast
        # to their final dtypes.
        .select(
            ID = pl.col('id').cast(pl.UInt32),
            ORG_ID = pl.col('org_id'),
            BREED_PRIMARY = pl.col('breed_primary'),
            BREED_MIXED = pl.col('breed_mixed'),
            AGE = pl.col('age').cast(enum_AGE),
            SEX = pl.col('sex').cast(enum_SEX),
            SIZE = pl.col('size').cast(enum_SIZE),
            FIXED = pl.col('fixed'),
            HOUSE_TRAINED = pl.col('house_trained'),
            SHOTS_CURRENT = pl.col('shots_current'),
            NAME = pl.col('name').str.strip_chars().str.to_titlecase(),
            # 'posted' holds both a date and a time; each part is pulled out
            # with its own regex and parsed separately.
            DATE = pl.col('posted')   # regex for dates formatted as YYYY-MM-DD
                .str.extract(r'(\d{4}-\d{2}-\d{2})', group_index=1)
                .str.to_date('%Y-%m-%d'),
            TIME = pl.col('posted')   # regex for time formatted as HH:MM:SS
                .str.extract(r'(\d{2}:\d{2}:\d{2})', group_index=1)
                .str.to_time('%H:%M:%S'),
            CONTACT_CITY = pl.col('contact_city'),
            CONTACT_STATE = pl.col('contact_state'),
            # NOTE(review): an integer ZIP drops leading zeros
            # (e.g. '06002' -> 6002); kept as UInt32 to preserve the
            # existing schema.
            CONTACT_ZIP = pl.col('contact_zip').cast(pl.UInt32),
        )
        .filter(  # regex to accept states comprised of 2 uppercase letters
            pl.col('CONTACT_STATE').str.contains(r'^[A-Z]{2}$')
        )
    )
    # Cache the cleaned frame so the next run skips the CSV parsing.
    df.write_parquet(root_file + '.parquet')

# Quick summary: shape, then the column names split over two lines.
print(df.shape)
print(df.columns[:8])
print(df.columns[8:])

# glimpse() prints its summary itself and returns None by default, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.sample(10).glimpse()

******************** Reading allDogDescriptions.parquet  ********************
(58147, 16)
['ID', 'ORG_ID', 'BREED_PRIMARY', 'BREED_MIXED', 'AGE', 'SEX', 'SIZE', 'FIXED']
['HOUSE_TRAINED', 'SHOTS_CURRENT', 'NAME', 'DATE', 'TIME', 'CONTACT_CITY', 'CONTACT_STATE', 'CONTACT_ZIP']
Rows: 10
Columns: 16
$ ID             <u32> 45910484, 43419415, 45541863, 45880818, 45510835, 45746283, 46036509, 44240879, 42321698, 45227194
$ ORG_ID         <str> 'PA142', 'NM160', 'GA644', 'AL211', 'MI921', 'IN32', 'NM13', 'OK474', 'NC871', 'CT258'
$ BREED_PRIMARY  <str> 'Pit Bull Terrier', 'Chihuahua', 'Chihuahua', 'Dachshund', 'Miniature Pinscher', 'Pit Bull Terrier', 'Australian Cattle Dog / Blue Heeler', 'Shepherd', 'Pit Bull Terrier', 'Hound'
$ BREED_MIXED   <bool> False, False, True, True, True, False, True, True, True, True
$ AGE           <enum> Young, Baby, Young, Baby, Baby, Adult, Young, Adult, Adult, Young
$ SEX           <enum> Female, Female, Female, Male, Female, Female, Male, Female, Male, Male

In [12]:
# Preview a few random rows. glimpse() prints its summary itself and returns
# None by default, so it is called directly; wrapping it in print() would
# emit a stray "None" line.
df.sample(5).glimpse()

# Count dogs per calendar month of the DATE column.
df_time = (
    df
    .sort('DATE')             # sorted by the index column before windowing
    .group_by_dynamic(
        index_column='DATE',  # specify the datetime column
        every='1mo',          # interval size
        period='1mo',         # window size
        closed='left'         # interval includes the left endpoint
    )
    # count() tallies the non-null ID values that fall in each window.
    .agg(pl.col('ID').count().alias('Dog Count'))
)
print(f'{type(df_time) = }')
print(df_time)

Rows: 5
Columns: 16
$ ID             <u32> 45947366, 45619199, 35672471, 45801517, 45905429
$ ORG_ID         <str> 'WV28', 'FL1026', 'NY816', 'SC170', 'CT523'
$ BREED_PRIMARY  <str> 'Black and Tan Coonhound', 'Miniature Pinscher', 'Pit Bull Terrier', 'American Staffordshire Terrier', 'Beagle'
$ BREED_MIXED   <bool> False, True, False, False, True
$ AGE           <enum> Young, Young, Senior, Adult, Adult
$ SEX           <enum> Male, Male, Female, Male, Female
$ SIZE          <enum> Large, Small, Large, Medium, Medium
$ FIXED         <bool> False, True, True, True, False
$ HOUSE_TRAINED <bool> False, False, True, False, False
$ SHOTS_CURRENT <bool> True, True, True, True, False
$ NAME           <str> 'Mally', 'Zeus', 'Rosie', 'Chase', 'Abigail'
$ DATE          <date> 2019-09-12, 2019-08-15, 2016-07-15, 2019-08-30, 2019-09-08
$ TIME          <time> 06:30:46, 01:19:09, 05:22:01, 21:28:34, 11:09:06
$ CONTACT_CITY   <str> 'Parkersburg', 'Tavares', 'Lake Ronkonkoma', 'Darlington', 'Bloomfield